units passed with updated API

apache · dorx · Jul 25, 2014 · Jul 25, 2014 · Jul 25, 2014 · Jul 25, 2014
commit c39eeb5d4b885f32f6defa06976dccbd06c33c0b
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -98,6 +98,8 @@ object Statistics {
    * expected distribution.
    *
    * Note: the two input Vectors need to have the same size.
+   *       `observed` cannot contain negative values.
+   *       `expected` cannot contain nonpositive values.
    *
    * @param observed Vector containing the observed categorical counts/relative frequencies.
    * @param expected Vector containing the expected categorical counts/relative frequencies.
@@ -114,6 +116,8 @@ object Statistics {
    * Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform
    * distribution, with each category having an expected frequency of `1 / observed.size`.
    *
+   * Note: `observed` cannot contain negative values.
+   *
    * @param observed Vector containing the observed categorical counts/relative frequencies.
    * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
    *         the method used, and the null hypothesis.
@@ -123,14 +127,25 @@ object Statistics {
 
   /**
    * :: Experimental ::
-   * TODO
+   * Conduct Pearson's independence test on the input contingency matrix, which cannot contain
+   * negative entries or columns or rows that sum up to 0.
+   *
+   * @param counts The contingency matrix.
+   * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
+   *         the method used, and the null hypothesis.
    */
   @Experimental
   def chiSqTest(counts: Matrix): ChiSquaredTestResult = ChiSquaredTest.chiSquaredMatrix(counts)
 
   /**
    * :: Experimental ::
-   * TODO
+   * Conduct Pearson's independence test for every feature against the label across the input RDD.
+   * For each feature, the (feature, label) pairs are converted into a contingency matrix for which
+   * the chi-squared statistic is computed.
+   *
+   * @param data an `RDD[LabeledPoint]` containing the labeled dataset.
+   * @return an array containing the ChiSquaredTestResult for every feature against the label.
+   *         The order of the elements in the returned array reflects the order of input features.
    */
   @Experimental
   def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSquaredTestResult] = {

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSquaredTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSquaredTest.scala
@@ -27,17 +27,16 @@ import org.apache.spark.rdd.RDD
 
 /**
  * Conduct the Chi-squared test for the input RDDs using the specified method.
- * Goodness-of-fit test is conducted on two RDD[Double]s, whereas test of independence is conducted
- * on an input of type RDD[Vector] or RDD[LabeledPoint] in which independence between columns is
- * assessed.
+ * Goodness-of-fit test is conducted on two `Vectors`, whereas test of independence is conducted
+ * on an input of type `Matrix` in which independence between columns is assessed.
+ * We also provide a method for computing the chi-squared statistic between each feature and the
+ * label for an input `RDD[LabeledPoint]`, returning an `Array[ChiSquaredTestResult]` of size =
+ * number of features in the input RDD.
  *
  * Supported methods for goodness of fit: `pearson` (default)
  * Supported methods for independence: `pearson` (default)
  *
  * More information on Chi-squared test: http://en.wikipedia.org/wiki/Chi-squared_test
- * More information on Pearson's chi-squared test:
- *   http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
- *
  */
 private[stat] object ChiSquaredTest extends Logging {
 
@@ -47,17 +46,20 @@ private[stat] object ChiSquaredTest extends Logging {
    */
   case class Method(name: String, chiSqFunc: (Double, Double) => Double)
 
+  // Pearson's chi-squared test: http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
   val PEARSON = new Method("pearson", (observed: Double, expected: Double) => {
     val dev = observed - expected
     dev * dev / expected
   })
 
+  // Null hypothesis for the two different types of chi-squared tests to be included in the result.
   object NullHypothesis extends Enumeration {
     type NullHypothesis = Value
     val goodnessOfFit = Value("observed follows the same distribution as expected.")
     val independence = Value("observations in each column are statistically independent.")
   }
 
+  // Method identification based on input methodName string
   private def methodFromString(methodName: String): Method = {
     methodName match {
       case PEARSON.name => PEARSON
@@ -67,9 +69,9 @@ private[stat] object ChiSquaredTest extends Logging {
 
   /**
    * Conduct Pearson's independence test for each feature against the label across the input RDD.
-   *
-   * @param data RDD of LabeledPoints.
-   * @return Array[ChiSquareTestResult] containing
+   * The contingency table is constructed from the raw (feature, label) pairs and used to conduct
+   * the independence test.
+   * Returns an array containing the ChiSquaredTestResult for every feature against the label.
    */
   def chiSquaredFeatures(data: RDD[LabeledPoint],
       methodName: String = PEARSON.name): Array[ChiSquaredTestResult] = {
@@ -102,7 +104,8 @@ private[stat] object ChiSquaredTest extends Logging {
   }
 
   /*
-   * Pearon's goodness of fit test. This can be easily made abstract to support other methods.
+   * Pearson's goodness of fit test on the input observed and expected counts/relative frequencies.
+   * Uniform distribution is assumed when `expected` is not passed in.
    */
   def chiSquared(observed: Vector,
       expected: Vector = Vectors.dense(Array[Double]()),
@@ -147,20 +150,23 @@ private[stat] object ChiSquaredTest extends Logging {
 
     // compute chi-squared statistic
     var statistic = 0.0
-    i = 0
-    while (i < observed.size) {
-      val obs = observed(i)
+    var j = 0
+    while (j < observed.size) {
+      val obs = observed(j)
       if (obs != 0.0) {
-        statistic += method.chiSqFunc(obs, getExpected(i))
+        statistic += method.chiSqFunc(obs, getExpected(j))
       }
+      j += 1
     }
     val df = size - 1
     val pValue = chiSquareComplemented(df, statistic)
-    new ChiSquaredTestResult(pValue, df, statistic, PEARSON.name, NullHypothesis.goodnessOfFit.toString)
+    new ChiSquaredTestResult(pValue, df, statistic, PEARSON.name,
+      NullHypothesis.goodnessOfFit.toString)
   }
 
   /*
-   * Pearon's independence test. This can be easily made abstract to support other methods.
+   * Pearson's independence test on the input contingency matrix.
+   * TODO: optimize for SparseMatrix when it becomes supported.
    */
   def chiSquaredMatrix(counts: Matrix, methodName:String = PEARSON.name): ChiSquaredTestResult = {
     val method = methodFromString(methodName)
@@ -182,22 +188,24 @@ private[stat] object ChiSquaredTest extends Logging {
       i += 1
     }
     if (!colSums.forall(_ > 0.0) || !rowSums.forall(_ > 0.0)) {
-      throw new IllegalArgumentException("Chi square statistic cannot be computed for input matrix due to "
-        + "0.0 entries in the expected contingency table.")
+      throw new IllegalArgumentException("Chi square statistic cannot be computed for input matrix "
+        + "due to 0.0 entries in the expected contingency table.")
     }
     val total = colSums.sum
 
     // second pass to collect statistic
     var statistic = 0.0
-    i = 0
-    while (i < colMajorArr.size) {
-      val expected = colSums(i / numRows) * rowSums(i % numRows) / total
-      statistic += method.chiSqFunc(colMajorArr(i), expected)
+    var j = 0
+    while (j < colMajorArr.size) {
+      val expected = colSums(j / numRows) * rowSums(j % numRows) / total
+      statistic += method.chiSqFunc(colMajorArr(j), expected)
+      j += 1
     }
 
     // Second pass to compute chi-squared statistic
     val df = (numCols - 1) * (numRows - 1)
     val pValue = chiSquareComplemented(df, statistic)
-    new ChiSquaredTestResult(pValue, df, statistic, methodName, NullHypothesis.independence.toString)
+    new ChiSquaredTestResult(pValue, df, statistic, methodName,
+      NullHypothesis.independence.toString)
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.stat
 
 import org.scalatest.FunSuite
 
-import org.apache.spark.SparkException
-import org.apache.spark.mllib.linalg.{Matrices, DenseVector, Vectors}
+import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Vectors}
+import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.stat.test.ChiSquaredTest
 import org.apache.spark.mllib.util.LocalSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
@@ -34,23 +34,28 @@ class HypothesisTestSuite extends FunSuite with LocalSparkContext {
 
     // Results validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
     assert(pearson.statistic === 0.4)
-    assert(pearson.degreesOfFreedom === Array(2))
-    assert(pearson.pValue ~= 0.8187 absTol 1e-3)
-    assert(pearson.method === ChiSquaredTest.PEARSON)
+    assert(pearson.degreesOfFreedom === 2)
+    assert(pearson.pValue ~= 0.8187 relTol 1e-4)
+    assert(pearson.method === ChiSquaredTest.PEARSON.name)
     assert(pearson.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString)
 
     // different expected and observed sum
     val observed1 = new DenseVector(Array[Double](21, 38, 43, 80))
     val expected1 = new DenseVector(Array[Double](3, 5, 7, 20))
-    val c1 = Statistics.chiSqTest(observed1, expected1)
+    val pearson1 = Statistics.chiSqTest(observed1, expected1)
 
     // Results validated against the R command
     // `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
-    assert(c1.statistic ~= 14.1429 absTol 1e-3)
-    assert(c1.degreesOfFreedom === Array(3))
-    assert(c1.pValue ~= 0.002717 absTol 1e-6)
-    assert(c1.method === ChiSquaredTest.PEARSON)
-    assert(c1.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString)
+    assert(pearson1.statistic ~= 14.1429 relTol 1e-4)
+    assert(pearson1.degreesOfFreedom === 3)
+    assert(pearson1.pValue ~= 0.002717 relTol 1e-4)
+    assert(pearson1.method === ChiSquaredTest.PEARSON.name)
+    assert(pearson1.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString)
+
+    // SparseVector representation to make sure memory doesn't blow up
+    // Commented out because it takes too long for unit tests. Should be run as part of perf test.
+    // val observed2 = new SparseVector(Int.MaxValue, Array(1000005), Array[Double](10.0))
+    // val pearson2 = Statistics.chiSqTest(observed2)
 
     // Vectors with different sizes
     val observed3 = new DenseVector(Array(1.0, 2.0, 3.0))
@@ -66,44 +71,58 @@ class HypothesisTestSuite extends FunSuite with LocalSparkContext {
     intercept[IllegalArgumentException](Statistics.chiSqTest(observed, zeroExpected))
   }
 
-  test("chi squared pearson independence") {
-
-    val data = Array(
-      40.0, 56.0, 31.0, 30.0,
-      24.0, 32.0, 10.0, 15.0,
-      29.0, 42.0, 0.0, 12.0)
+  test("chi squared pearson matrix independence") {
+    val data = Array(40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0)
+    // [[40.0, 56.0, 31.0, 30.0],
+    //  [24.0, 32.0, 10.0, 15.0],
+    //  [29.0, 42.0, 0.0,  12.0]]
     val chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
-    assert(chi.statistic ~= 21.9958 absTol 1e-3)
-    assert(chi.degreesOfFreedom === Array(6))
-    assert(chi.pValue ~= 0.001213 absTol 1e-6)
-    assert(chi.method === ChiSquaredTest.PEARSON)
+    // Results validated against R command
+    // `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
+    assert(chi.statistic ~= 21.9958 relTol 1e-4)
+    assert(chi.degreesOfFreedom === 6)
+    assert(chi.pValue ~= 0.001213 relTol 1e-4)
+    assert(chi.method === ChiSquaredTest.PEARSON.name)
     assert(chi.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString)
 
     // Negative counts
-    val negCounts = Array(
-      4.0, 5.0, 3.0, 3.0,
-      0.0, -3.0, 0.0, 5.0,
-      9.0, 0.0, 0.0, 1.0)
-    intercept[SparkException](Statistics.chiSqTest(Matrices.dense(3, 4, negCounts)))
+    val negCounts = Array(4.0, 5.0, 3.0, -3.0)
+    intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, negCounts)))
 
     // Row sum = 0.0
-    val rowZero = Array(
-      4.0, 5.0, 3.0, 3.0,
-      0.0, 0.0, 0.0, 0.0,
-      9.0, 0.0, 0.0, 1.0)
-    intercept[SparkException](Statistics.chiSqTest(Matrices.dense(3, 4, rowZero)))
+    val rowZero = Array(0.0, 1.0, 0.0, 2.0)
+    intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, rowZero)))
 
     // Column sum  = 0.0
-    val colZero = Array(
-      1.0, 0.0, 0.0, 2.0,
-      4.0, 5.0, 0.0, 3.0,
-      9.0, 0.0, 0.0, 1.0)
+    val colZero = Array(0.0, 0.0, 2.0, 2.0)
     // IllegalArgumentException thrown here since it's thrown on driver, not inside a task
-    intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(3, 4, colZero)))
+    intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, colZero)))
   }
 
-  test("chi squared pearson features") {
-
+  test("chi squared pearson RDD[LabeledPoint]") {
+    // labels: 1.0 (2 / 6), 0.0 (4 / 6)
+    // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
+    // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
+    val data = Array(new LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
+                     new LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
+                     new LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
+                     new LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
+                     new LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
+                     new LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))
+    for (numParts <- List(2, 4, 6, 8)) {
+      val chi = Statistics.chiSqTest(sc.parallelize(data, numParts))
+      val feature1 = chi(0)
+      assert(feature1.statistic === 0.75)
+      assert(feature1.degreesOfFreedom === 2)
+      assert(feature1.pValue ~= 0.6873 relTol 1e-4)
+      assert(feature1.method === ChiSquaredTest.PEARSON.name)
+      assert(feature1.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString)
+      val feature2 = chi(1)
+      assert(feature2.statistic === 1.5)
+      assert(feature2.degreesOfFreedom === 3)
+      assert(feature2.pValue ~= 0.6823 relTol 1e-4)
+      assert(feature2.method === ChiSquaredTest.PEARSON.name)
+      assert(feature2.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString)
+    }
   }
-
 }