Getting started

Add any of these lines to your build.sbt:

// The core library
libraryDependencies += "io.github.pityka" %% "saddle-core" % "0.0.0+1-e4694aaf-SNAPSHOT"
// Inlined binary operation instances for Mat and Vec
libraryDependencies += "io.github.pityka" %% "saddle-ops-inlined" % "0.0.0+1-e4694aaf-SNAPSHOT"
// Linear algebra interface to BLAS
libraryDependencies += "io.github.pityka" %% "saddle-linalg" % "0.0.0+1-e4694aaf-SNAPSHOT"
// Binary representation of data frames and matrices
libraryDependencies += "io.github.pityka" %% "saddle-binary" % "0.0.0+1-e4694aaf-SNAPSHOT"
// Circe Encoder and Decoder instances
libraryDependencies += "io.github.pityka" %% "saddle-circe" % "0.0.0+1-e4694aaf-SNAPSHOT"
// Jsoniter-scala codec instances
libraryDependencies += "io.github.pityka" %% "saddle-jsoniter" % "0.0.0+1-e4694aaf-SNAPSHOT"
// Interface to Joda-Time (not maintained)
libraryDependencies += "io.github.pityka" %% "saddle-time" % "0.0.0+1-e4694aaf-SNAPSHOT"
// Interface to EJML (not maintained)
libraryDependencies += "io.github.pityka" %% "saddle-stats" % "0.0.0+1-e4694aaf-SNAPSHOT"

Dependencies

The actively maintained artifacts have minimal dependencies:

  • saddle-io has no dependencies (not even saddle-core)
  • saddle-core depends on cats-kernel
  • saddle-linalg depends on a fork of netlib-java
  • saddle-binary depends on ujson
  • saddle-circe depends on circe
  • saddle-jsoniter depends on jsoniter-scala

Example: SVD on the Iris dataset

import scala.io.Source
import org.saddle._
val irisURL = "https://gist.githubusercontent.com/pityka/d05bb892541d71c2a06a0efb6933b323/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv"
// irisURL: String = "https://gist.githubusercontent.com/pityka/d05bb892541d71c2a06a0efb6933b323/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv"
// Download the Iris CSV and parse its first four columns as Double, keeping the header
// row as the column index. The stream is closed as soon as parsing finishes, and a
// parse failure is raised with the parser's own message rather than surfacing as an
// opaque NoSuchElementException from `.toOption.get`.
val iris : Frame[Int,String,Double] = {
  // URI.create(...).toURL avoids the java.net.URL(String) constructor deprecated on recent JDKs.
  val in = java.net.URI.create(irisURL).toURL.openStream()
  try csv.CsvParser.parseInputStreamWithHeader[Double](
      inputStream = in, 
      cols = List(0,1,2,3), 
      recordSeparator = "\n"
    ).fold(err => sys.error(s"Could not parse $irisURL: $err"), identity)
  finally in.close()
}
// iris: Frame[Int, String, Double] = [150 x 4]
//        sepal_length sepal_width petal_length petal_width 
//        ------------ ----------- ------------ ----------- 
//   0 ->       5.1000      3.5000       1.4000      0.2000 
//   1 ->       4.9000      3.0000       1.4000      0.2000 
//   2 ->       4.7000      3.2000       1.3000      0.2000 
//   3 ->       4.6000      3.1000       1.5000      0.2000 
//   4 ->       5.0000      3.6000       1.4000      0.2000 
// ...
// 145 ->       6.7000      3.0000       5.2000      2.3000 
// 146 ->       6.3000      2.5000       5.0000      1.9000 
// 147 ->       6.5000      3.0000       5.2000      2.0000 
// 148 ->       6.2000      3.4000       5.4000      2.3000 
// 149 ->       5.9000      3.0000       5.1000      1.8000 
// 

import org.saddle.linalg._
// Center every column by subtracting its mean (e.g. 5.1 - 5.8433 = -0.7433 for the first
// sepal_length value), so the decomposition below operates on zero-mean data.
val centered = iris.mapVec(_.demeaned)
// centered: Frame[Int, String, Double] = [150 x 4]
//        sepal_length sepal_width petal_length petal_width 
//        ------------ ----------- ------------ ----------- 
//   0 ->      -0.7433      0.4460      -2.3587     -0.9987 
//   1 ->      -0.9433     -0.0540      -2.3587     -0.9987 
//   2 ->      -1.1433      0.1460      -2.4587     -0.9987 
//   3 ->      -1.2433      0.0460      -2.2587     -0.9987 
//   4 ->      -0.8433      0.5460      -2.3587     -0.9987 
// ...
// 145 ->       0.8567     -0.0540       1.4413      1.1013 
// 146 ->       0.4567     -0.5540       1.2413      0.7013 
// 147 ->       0.6567     -0.0540       1.4413      0.8013 
// 148 ->       0.3567      0.3460       1.6413      1.1013 
// 149 ->       0.0567     -0.0540       1.3413      0.6013 
// 
// Truncated SVD keeping 2 components, destructured into its three factors:
// `u` is 150 x 2 (one row per sample), `s` holds the 2 retained singular values,
// and `vt` is 2 x 4 (one row per component, one column per original feature).
val SVDResult(u, s, vt) = centered.toMat.svd(2)
// u: Mat[Double] = [150 x 2]
//  0.1070  0.0544 
//  0.1082 -0.0282 
//  0.1152 -0.0229 
//  0.1095 -0.0518 
// ...
// -0.0608 -0.0624 
// -0.0703  0.0131 
// -0.0758  0.0193 
// -0.0554 -0.0471 
// 
// s: Vec[Double] = [2 x 1]
// 25.0899
//  6.0079
// 
// vt: Mat[Double] = [2 x 4]
// -0.3616 0.0823 -0.8566 -0.3588 
//  0.6565 0.7297 -0.1758 -0.0747 
// 
// Principal-component scores: scale each column of `u` by the matching singular value
// (i.e. u * diag(s)), then wrap the matrix in a Frame with default Int row/column indices.
val pca = u.mDiagFromRight(s).toFrame
// pca: Frame[Int, Int, Double] = [150 x 2]
//              0       1 
//        ------- ------- 
//   0 ->  2.6842  0.3266 
//   1 ->  2.7154 -0.1696 
//   2 ->  2.8898 -0.1373 
//   3 ->  2.7464 -0.3111 
//   4 ->  2.7286  0.3339 
// ...
// 145 -> -1.9440  0.1874 
// 146 -> -1.5257 -0.3750 
// 147 -> -1.7640  0.0785 
// 148 -> -1.9016  0.1159 
// 149 -> -1.3897 -0.2829 
// 

// Relabel the score columns 0/1 as "PCA0"/"PCA1" so the column index becomes String,
// then append them to the original measurements as two extra columns.
val joined = iris.rconcat(pca.mapColIndex(i => s"PCA$i"))
// joined: Frame[Int, String, Double] = [150 x 6]
//        sepal_length sepal_width petal_length petal_width    PCA0    PCA1 
//        ------------ ----------- ------------ ----------- ------- ------- 
//   0 ->       5.1000      3.5000       1.4000      0.2000  2.6842  0.3266 
//   1 ->       4.9000      3.0000       1.4000      0.2000  2.7154 -0.1696 
//   2 ->       4.7000      3.2000       1.3000      0.2000  2.8898 -0.1373 
//   3 ->       4.6000      3.1000       1.5000      0.2000  2.7464 -0.3111 
//   4 ->       5.0000      3.6000       1.4000      0.2000  2.7286  0.3339 
// ...
// 145 ->       6.7000      3.0000       5.2000      2.3000 -1.9440  0.1874 
// 146 ->       6.3000      2.5000       5.0000      1.9000 -1.5257 -0.3750 
// 147 ->       6.5000      3.0000       5.2000      2.0000 -1.7640  0.0785 
// 148 ->       6.2000      3.4000       5.4000      2.3000 -1.9016  0.1159 
// 149 ->       5.9000      3.0000       5.1000      1.8000 -1.3897 -0.2829 
// 
// Column-wise reduction: collapse each column to its mean, giving a Series keyed by column name.
val reducedByColumn = joined.reduce(_.values.mean)
// reducedByColumn: Series[String, Double] = [6 x 1]
// sepal_length ->  5.8433
//  sepal_width ->  3.0540
// petal_length ->  3.7587
//  petal_width ->  1.1987
//         PCA0 -> -0.0000
//         PCA1 ->  0.0000
// 
// Row-wise reduction: collapse each row to its maximum, giving a Series keyed by row index.
// Note the element type here is Scalar[Double], not a bare Double.
val reducedByRow = joined.rreduce(_.values.max)
// reducedByRow: Series[Int, scalar.Scalar[Double]] = [150 x 1]
//   0 -> 5.1
//   1 -> 4.9
//   2 -> 4.7
//   3 -> 4.6
//   4 -> 5.0
//  ... 
// 145 -> 6.7
// 146 -> 6.3
// 147 -> 6.5
// 148 -> 6.2
// 149 -> 5.9
//