Skip to content

Commit bbb30b1

Browse files
author
jose.cambronero
committed
renamed KSTestResult to KolmogorovSmirnovTestResult, to stay consistent with method name
1 parent 0d0c201 commit bbb30b1

File tree

4 files changed

+30
-26
lines changed

4 files changed

+30
-26
lines changed

docs/mllib-statistics.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -444,14 +444,14 @@ import org.apache.spark.mllib.stat.Statistics._
444444
val data: RDD[Double] = ... // an RDD of sample data
445445

446446
// run a KS test for the sample versus a standard normal distribution
447-
val ksTestResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
448-
println(ksTestResult) // summary of the test including the p-value, test statistic,
447+
val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
448+
println(testResult) // summary of the test including the p-value, test statistic,
449449
// and null hypothesis
450450
// if our p-value indicates significance, we can reject the null hypothesis
451451

452452
// perform a KS test using a cumulative distribution function of our making
453453
val myCDF: Double => Double = ...
454-
val ksTestResult = Statistics.kolmogorovSmirnovTest(data, myCDF)
454+
val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
455455
{% endhighlight %}
456456
</div>
457457
</div>

mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ import org.apache.spark.mllib.linalg.distributed.RowMatrix
2525
import org.apache.spark.mllib.linalg.{Matrix, Vector}
2626
import org.apache.spark.mllib.regression.LabeledPoint
2727
import org.apache.spark.mllib.stat.correlation.Correlations
28-
import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult, KSTest, KSTestResult}
28+
import org.apache.spark.mllib.stat.test.{ChiSqTest, ChiSqTestResult, KolmogorovSmirnovTest,
29+
KolmogorovSmirnovTestResult}
2930
import org.apache.spark.rdd.RDD
3031

3132
/**
@@ -171,11 +172,12 @@ object Statistics {
171172
*
172173
* @param data an `RDD[Double]` containing the sample of data to test
173174
* @param cdf a `Double => Double` function to calculate the theoretical CDF at a given value
174-
* @return [[org.apache.spark.mllib.stat.test.KSTestResult]] object containing test statistic,
175-
* p-value, and null hypothesis.
175+
* @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] object containing test
176+
* statistic, p-value, and null hypothesis.
176177
*/
177-
def kolmogorovSmirnovTest(data: RDD[Double], cdf: Double => Double): KSTestResult = {
178-
KSTest.testOneSample(data, cdf)
178+
def kolmogorovSmirnovTest(data: RDD[Double], cdf: Double => Double)
179+
: KolmogorovSmirnovTestResult = {
180+
KolmogorovSmirnovTest.testOneSample(data, cdf)
179181
}
180182

181183
/**
@@ -186,11 +188,12 @@ object Statistics {
186188
* @param data an `RDD[Double]` containing the sample of data to test
187189
* @param distName a `String` name for a theoretical distribution
188190
* @param params `Double*` specifying the parameters to be used for the theoretical distribution
189-
* @return [[org.apache.spark.mllib.stat.test.KSTestResult]] object containing test statistic,
190-
* p-value, and null hypothesis.
191+
* @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] object containing test
192+
* statistic, p-value, and null hypothesis.
191193
*/
192194
@varargs
193-
def kolmogorovSmirnovTest(data: RDD[Double], distName: String, params: Double*): KSTestResult = {
194-
KSTest.testOneSample(data, distName, params: _*)
195+
def kolmogorovSmirnovTest(data: RDD[Double], distName: String, params: Double*)
196+
: KolmogorovSmirnovTestResult = {
197+
KolmogorovSmirnovTest.testOneSample(data, distName, params: _*)
195198
}
196199
}

mllib/src/main/scala/org/apache/spark/mllib/stat/test/KSTest.scala renamed to mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ import org.apache.spark.Logging
2626
import org.apache.spark.rdd.RDD
2727

2828
/**
29-
* Conduct the two-sided Kolmogorov Smirnov test for data sampled from a
29+
* Conduct the two-sided Kolmogorov Smirnov (KS) test for data sampled from a
3030
* continuous distribution. By comparing the largest difference between the empirical cumulative
3131
* distribution of the sample data and the theoretical distribution we can provide a test for the
3232
* null hypothesis that the sample data comes from that theoretical distribution.
@@ -47,7 +47,7 @@ import org.apache.spark.rdd.RDD
4747
* appropriate constant (the cumulative sum of number of elements in the prior partitions divided by
4848
* the data set size). Finally, we take the maximum absolute value, and this is the statistic.
4949
*/
50-
private[stat] object KSTest extends Logging {
50+
private[stat] object KolmogorovSmirnovTest extends Logging {
5151

5252
// Null hypothesis for the type of KS test to be included in the result.
5353
object NullHypothesis extends Enumeration {
@@ -59,10 +59,10 @@ private[stat] object KSTest extends Logging {
5959
* Runs a KS test for 1 set of sample data, comparing it to a theoretical distribution
6060
* @param data `RDD[Double]` data on which to run test
6161
* @param cdf `Double => Double` function to calculate the theoretical CDF
62-
* @return [[org.apache.spark.mllib.stat.test.KSTestResult]] summarizing the test results
63-
* (p-value, statistic, and null hypothesis)
62+
* @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] summarizing the test
63+
* results (p-value, statistic, and null hypothesis)
6464
*/
65-
def testOneSample(data: RDD[Double], cdf: Double => Double): KSTestResult = {
65+
def testOneSample(data: RDD[Double], cdf: Double => Double): KolmogorovSmirnovTestResult = {
6666
val n = data.count().toDouble
6767
val localData = data.sortBy(x => x).mapPartitions { part =>
6868
val partDiffs = oneSampleDifferences(part, n, cdf) // local distances
@@ -76,10 +76,10 @@ private[stat] object KSTest extends Logging {
7676
* Runs a KS test for 1 set of sample data, comparing it to a theoretical distribution
7777
* @param data `RDD[Double]` data on which to run test
7878
* @param distObj `RealDistribution` a theoretical distribution
79-
* @return [[org.apache.spark.mllib.stat.test.KSTestResult]] summarizing the test results
80-
* (p-value, statistic, and null hypothesis)
79+
* @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] summarizing the test
80+
* results (p-value, statistic, and null hypothesis)
8181
*/
82-
def testOneSample(data: RDD[Double], distObj: RealDistribution): KSTestResult = {
82+
def testOneSample(data: RDD[Double], distObj: RealDistribution): KolmogorovSmirnovTestResult = {
8383
val cdf = (x: Double) => distObj.cumulativeProbability(x)
8484
testOneSample(data, cdf)
8585
}
@@ -158,11 +158,12 @@ private[stat] object KSTest extends Logging {
158158
* @param data the sample data that we wish to evaluate
159159
* @param distName the name of the theoretical distribution
160160
* @param params Variable length parameter for distribution's parameters
161-
* @return [[org.apache.spark.mllib.stat.test.KSTestResult]] summarizing the test results
162-
* (p-value, statistic, and null hypothesis)
161+
* @return [[org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult]] summarizing the
162+
* test results (p-value, statistic, and null hypothesis)
163163
*/
164164
@varargs
165-
def testOneSample(data: RDD[Double], distName: String, params: Double*): KSTestResult = {
165+
def testOneSample(data: RDD[Double], distName: String, params: Double*)
166+
: KolmogorovSmirnovTestResult = {
166167
val distObj =
167168
distName match {
168169
case "norm" => {
@@ -185,9 +186,9 @@ private[stat] object KSTest extends Logging {
185186
testOneSample(data, distObj)
186187
}
187188

188-
private def evalOneSampleP(ksStat: Double, n: Long): KSTestResult = {
189+
private def evalOneSampleP(ksStat: Double, n: Long): KolmogorovSmirnovTestResult = {
189190
val pval = 1 - new KolmogorovSmirnovTest().cdf(ksStat, n.toInt)
190-
new KSTestResult(pval, ksStat, NullHypothesis.OneSampleTwoSided.toString)
191+
new KolmogorovSmirnovTestResult(pval, ksStat, NullHypothesis.OneSampleTwoSided.toString)
191192
}
192193
}
193194

mllib/src/main/scala/org/apache/spark/mllib/stat/test/TestResult.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class ChiSqTestResult private[stat] (override val pValue: Double,
9696
* Object containing the test results for the Kolmogorov-Smirnov test.
9797
*/
9898
@Experimental
99-
class KSTestResult private[stat] (
99+
class KolmogorovSmirnovTestResult private[stat] (
100100
override val pValue: Double,
101101
override val statistic: Double,
102102
override val nullHypothesis: String) extends TestResult[Int] {

0 commit comments

Comments
 (0)