Changes from 1 commit
Commits (27)
13dfe4d
created test result class for ks test
Jun 24, 2015
c659ea1
created KS test class
Jun 24, 2015
4da189b
added user facing ks test functions
Jun 24, 2015
ce8e9a1
added kstest testing in HypothesisTestSuite
Jun 24, 2015
b9cff3a
made small changes to pass style check
Jun 24, 2015
f6951b6
changed style and some comments based on feedback from pull request
Jun 24, 2015
c18dc66
removed ksTestOpt from API and changed comments in HypothesisTestSuit…
Jun 24, 2015
16b5c4c
renamed dat to data and eliminated recalc of RDD size by sharing as a…
Jun 25, 2015
0b5e8ec
changed KS one sample test to perform just 1 distributed pass (in add…
Jun 25, 2015
4b8ba61
fixed off by 1/N in cases when post-constant adjustment ecdf is above…
Jun 25, 2015
6a4784f
specified what distributions are available for the convenience method…
Jun 26, 2015
992293b
Style changes as per comments and added implementation note explainin…
Jun 26, 2015
3f81ad2
renamed ks1 sample test for clarity
Jun 26, 2015
9c0f1af
additional style changes incorporated and added documentation to mlli…
Jun 26, 2015
1226b30
reindent multi-line lambdas, prior intepretation of style guide was w…
Jun 29, 2015
9026895
addressed style changes, correctness change to simpler approach, and …
Jul 7, 2015
3288e42
addressed style changes, correctness change to simpler approach, and …
Jul 7, 2015
e760ebd
line length changes to fit style check
Jul 7, 2015
7e66f57
copied implementation note to public api docs, and added @see for lin…
Jul 7, 2015
a4bc0c7
changed ksTest(data, distName) to ksTest(data, distName, params*) aft…
Jul 8, 2015
2ec2aa6
initialize to stdnormal when no params passed (and log). Change unit …
Jul 9, 2015
1bb44bd
style and doc changes. Factored out ks test into 2 separate tests
Jul 9, 2015
a48ae7b
refactor code to account for serializable RealDistribution. Reuse tes…
Jul 9, 2015
1f56371
changed ksTest in public API to kolmogorovSmirnovTest for clarity
Jul 9, 2015
0d0c201
kstTest -> kolmogorovSmirnovTest in statistics.md
Jul 9, 2015
bbb30b1
renamed KSTestResult to KolmogorovSmirnovTestResult, to stay consiste…
Jul 11, 2015
08834f4
Merge remote-tracking branch 'upstream/master'
Jul 13, 2015
additional style changes incorporated and added documentation to mllib statistics docs
jose.cambronero committed Jun 26, 2015
commit 9c0f1af882c930cafe55fe828c0c2d0fbe2d23f1
34 changes: 33 additions & 1 deletion docs/mllib-statistics.md
@@ -283,7 +283,7 @@ approxSample = data.sampleByKey(False, fractions);

Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically
significant, i.e. whether this result occurred by chance or not. MLlib currently supports Pearson's
chi-squared ( $\chi^2$) tests for goodness of fit and independence. The input data types determine
whether the goodness of fit or the independence test is conducted. The goodness of fit test requires
an input type of `Vector`, whereas the independence test requires a `Matrix` as input.

@@ -422,6 +422,38 @@ for i, result in enumerate(featureTestResults):

</div>

Additionally, MLlib provides a 1-sample, 2-sided implementation of the Kolmogorov-Smirnov test
for equality of probability distributions. By providing the name of a theoretical distribution
(currently only the standard normal distribution is supported), or a function to calculate
the cumulative distribution according to a given theoretical distribution, the user can
test the null hypothesis that their sample is drawn from that distribution.

Contributor: Kolmogorov-Smirnov -> Kolmogorov-Smirnov (KS)
Otherwise, we use KS without definition.

<div class="codetabs">
<div data-lang="scala" markdown="1">
[`Statistics`](api/scala/index.html#org.apache.spark.mllib.stat.Statistics$) provides methods to
run a 1-sample, 2-sided Kolmogorov-Smirnov test. The following example demonstrates how to run
and interpret the hypothesis tests.

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

val data: RDD[Double] = ... // an RDD of sample data

// run a KS test for the sample versus a standard normal distribution
val ksTestResult = Statistics.ksTest(data, "stdnorm")
println(ksTestResult) // summary of the test including the p-value, test statistic,
// and null hypothesis
// if our p-value indicates significance, we can reject the null hypothesis

// perform a KS test using a cumulative distribution function of our own making
val myCDF: Double => Double = ...
val ksTestResult2 = Statistics.ksTest(data, myCDF)
{% endhighlight %}
</div>
</div>
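
The printed result is a human-readable summary. As a hedged aside, assuming `KSTestResult` follows the convention of MLlib's other `TestResult` classes and exposes `pValue`, `statistic`, and `nullHypothesis` fields (an assumption, not confirmed by this patch), the values can also be read off individually:

{% highlight scala %}
// Field names here assume KSTestResult follows MLlib's TestResult convention.
if (ksTestResult.pValue < 0.05) {
  println(s"Rejecting null hypothesis: ${ksTestResult.nullHypothesis}")
  println(s"KS statistic: ${ksTestResult.statistic}")
}
{% endhighlight %}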


## Random data generation

Random data generation is useful for randomized algorithms, prototyping, and performance testing.
54 changes: 27 additions & 27 deletions mllib/src/main/scala/org/apache/spark/mllib/stat/test/KSTest.scala
@@ -60,9 +60,9 @@ private[stat] object KSTest {
  def testOneSample(data: RDD[Double], cdf: Double => Double): KSTestResult = {
    val n = data.count().toDouble
    val localData = data.sortBy(x => x).mapPartitions { part =>
      val partDiffs = oneSampleDifferences(part, n, cdf) // local distances
Contributor: Indent these two spaces. The general rule is:
  • when you have an open curly brace, all following lines are indented two spaces until the ending curly brace
  • the ending curly brace is at the same level of indentation as the first non-space character on the line that has the starting curly brace

Author: Done. Sorry, I misunderstood the comment before and thought you meant to un-indent the entire thing, not just the last line. Fixed now.

      searchOneSampleCandidates(partDiffs) // candidates: local extrema
    }.collect()
    val ksStat = searchOneSampleStatistic(localData, n) // result: global extreme
    evalOneSampleP(ksStat, n.toLong)
  }
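
As a minimal local sketch of what the distributed code computes (plain Scala, no Spark; ksStatLocal is a hypothetical helper, not part of this patch): for a sorted sample, the KS statistic is the largest gap between the empirical CDF and the theoretical CDF, checked just below and just above each sample point.

  // Single-machine reference version of the 1-sample, 2-sided KS statistic.
  def ksStatLocal(sorted: Array[Double], cdf: Double => Double): Double = {
    val n = sorted.length.toDouble
    sorted.zipWithIndex.map { case (x, i) =>
      val cdfVal = cdf(x)
      // the ECDF jumps from i/n to (i + 1)/n at x, so both sides are checked
      math.max((i + 1) / n - cdfVal, cdfVal - i / n)
    }.max
  }

The distributed version above reproduces this while avoiding extra passes: each partition ships only its extreme (signed, unadjusted) differences and its element count to the driver.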
@@ -76,9 +76,9 @@ private[stat] object KSTest {
  def testOneSample(data: RDD[Double], createDist: () => RealDistribution): KSTestResult = {
    val n = data.count().toDouble
    val localData = data.sortBy(x => x).mapPartitions { part =>
      val partDiffs = oneSampleDifferences(part, n, createDist) // local distances
      searchOneSampleCandidates(partDiffs) // candidates: local extrema
    }.collect()
    val ksStat = searchOneSampleStatistic(localData, n) // result: global extreme
    evalOneSampleP(ksStat, n.toLong)
Contributor: Is it almost the same as testOneSample(..., cdf)? Should we reuse the code?

Author: @mengxr I've refactored the code, accounting for the fact that RealDistribution in math3 3.4.1 is serializable (which Sean had already pointed out to me), so that we now indeed use testOneSample(..., cdf) and avoid duplication.

  }
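
A brief sketch of the factory pattern this overload relies on (the setup lines are illustrative only; of these names, only createDist and NormalDistribution appear in the patch): passing a () => RealDistribution rather than a distribution instance means each executor builds its own object, so only the function itself needs to be serialized into the closure.

  import org.apache.commons.math3.distribution.{NormalDistribution, RealDistribution}

  // The factory is trivially serializable; the distribution object itself
  // is constructed fresh wherever the closure runs.
  val createDist: () => RealDistribution = () => new NormalDistribution(0, 1)
  val dist = createDist()                  // done once per partition in the real code
  val p = dist.cumulativeProbability(0.0)  // 0.5 for the standard normal

As the author notes above, RealDistribution turned out to be serializable in commons-math3 3.4.1, which later allowed this overload to delegate to the CDF-based testOneSample.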
@@ -101,14 +101,14 @@ private[stat] object KSTest {
    // zip data with index (within that partition)
    // calculate local (unadjusted) ECDF and subtract CDF
    partData.zipWithIndex.map { case (v, ix) =>
      // dp and dl are later adjusted by a constant, once global info is available
      val dp = (ix + 1) / n
      val dl = ix / n
      val cdfVal = cdf(v)
      // if dp > cdfVal, the adjusted dp is still above cdfVal; if dp < cdfVal,
      // we want a negative distance so that the constant adjustment gives the correct distance
      if (dp > cdfVal) dp - cdfVal else dl - cdfVal
    }
  }
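
To make the sign convention concrete, here is a standalone version of the per-element rule (unadjustedDiff is a hypothetical helper, not in the patch), followed by a worked value:

  // Returns a signed, partition-local difference; adding the cross-partition
  // constant (count of preceding elements / n) later recovers the true
  // ECDF - CDF distance.
  def unadjustedDiff(localIx: Int, n: Double, cdfVal: Double): Double = {
    val dp = (localIx + 1) / n  // local ECDF just after the point
    val dl = localIx / n        // local ECDF just before the point
    if (dp > cdfVal) dp - cdfVal else dl - cdfVal
  }

For example, with localIx = 0, n = 4 and cdfVal = 0.3: dp = 0.25, which is below 0.3, so the value kept is dl - cdfVal = -0.3, a negative distance that the later constant adjustment can shift to its correct global value.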

private def oneSampleDifferences(
@@ -132,8 +132,8 @@
      : Iterator[(Double, Double, Double)] = {
    val initAcc = (Double.MaxValue, Double.MinValue, 0.0)
    val partResults = partDiffs.foldLeft(initAcc) { case ((pMin, pMax, pCt), currDiff) =>
      (Math.min(pMin, currDiff), Math.max(pMax, currDiff), pCt + 1)
    }
    Array(partResults).iterator
  }
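
A standalone sketch of this per-partition reduction with illustrative values: only the (min, max, count) triple survives, which is all the driver-side pass needs.

  val diffs = Iterator(-0.05, 0.12, 0.03)
  val (mn, mx, ct) = diffs.foldLeft((Double.MaxValue, Double.MinValue, 0.0)) {
    case ((pMin, pMax, pCt), d) => (Math.min(pMin, d), Math.max(pMax, d), pCt + 1)
  }
  // mn = -0.05, mx = 0.12, ct = 3.0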

@@ -152,16 +152,16 @@
    // adjust differences based on the # of elements preceding them, which should provide
    // the correct distance between the ECDF and the CDF
    val results = localData.foldLeft(initAcc) { case ((prevMax, prevCt), (minCand, maxCand, ct)) =>
      val adjConst = prevCt / n
      val pdist1 = minCand + adjConst
      val pdist2 = maxCand + adjConst
      // adjust by 1 / N if the pre-constant value is less than the CDF and the
      // post-constant value is greater than or equal to the CDF
      val dist1 = if (pdist1 >= 0 && minCand < 0) pdist1 + 1 / n else Math.abs(pdist1)
      val dist2 = if (pdist2 >= 0 && maxCand < 0) pdist2 + 1 / n else Math.abs(pdist2)
      val maxVal = Array(prevMax, dist1, dist2).max
      (maxVal, prevCt + ct)
    }
    results._1
  }
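
In effect, this driver-side fold evaluates the standard form of the 1-sample statistic (a reconstruction from the comments above, in conventional KS notation):

$$
D_n = \sup_x |F_n(x) - F(x)| = \max_{1 \le i \le n} \max\left( \frac{i}{n} - F(x_{(i)}),\; F(x_{(i)}) - \frac{i - 1}{n} \right)
$$

where $x_{(i)}$ is the $i$-th order statistic. The constant prevCt / n converts each partition's local index into the global $i$, and the extra 1 / n term covers the case where adding that constant flips a difference's sign, i.e. where the maximizing branch switches from $(i-1)/n$ to $i/n$.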

@@ -177,7 +177,7 @@ private[stat] object KSTest {
    distName match {
      case "stdnorm" => () => new NormalDistribution(0, 1)
      case _ => throw new UnsupportedOperationException(s"$distName not yet supported through" +
-       s"convenience method. Current options are:[stdnorm].")
+       s" convenience method. Current options are:[stdnorm].")
Contributor: are:[stdnorm] -> are: [norm]

    }

    testOneSample(data, distanceCalc)