[SPARK-8598] [MLlib] Implementation of 1-sample, two-sided, Kolmogorov Smirnov Test for RDDs #6994
Changes from 1 commit: …b statistics docs
@@ -60,9 +60,9 @@ private[stat] object KSTest {
   def testOneSample(data: RDD[Double], cdf: Double => Double): KSTestResult = {
     val n = data.count().toDouble
     val localData = data.sortBy(x => x).mapPartitions { part =>
-        val partDiffs = oneSampleDifferences(part, n, cdf) // local distances
-        searchOneSampleCandidates(partDiffs) // candidates: local extrema
-      }.collect()
+      val partDiffs = oneSampleDifferences(part, n, cdf) // local distances
+      searchOneSampleCandidates(partDiffs) // candidates: local extrema
+    }.collect()
     val ksStat = searchOneSampleStatistic(localData, n) // result: global extreme
     evalOneSampleP(ksStat, n.toLong)
   }
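For context on the shape of this computation: `sortBy` gives a total order across partitions, each partition reduces its local ECDF-vs-CDF differences to a single (min, max, count) triple, and only those triples are collected, so the driver handles O(#partitions) values rather than n. A minimal local sketch of that two-phase pattern, emulating partitions with grouped collections (illustrative only, not the PR's API; the uniform CDF and the data are invented, and the +1/n sign-crossing correction from later in the file is omitted):

    // Emulate sort -> per-partition reduce -> driver-side combine, without Spark.
    val data = Vector(0.91, 0.05, 0.48, 0.27, 0.73, 0.62)
    val cdf = (x: Double) => math.max(0.0, math.min(1.0, x)) // Uniform(0,1) CDF
    val n = data.length.toDouble
    // Phase 1: global sort, then contiguous "partitions".
    val partitions = data.sorted.grouped(3).toVector
    // Phase 2 (executor side): local extrema of unadjusted differences, plus count.
    val triples = partitions.map { part =>
      val diffs = part.zipWithIndex.map { case (v, ix) =>
        val dp = (ix + 1) / n
        if (dp > cdf(v)) dp - cdf(v) else ix / n - cdf(v)
      }
      (diffs.min, diffs.max, part.length.toDouble)
    }
    // Phase 3 (driver side): shift each partition by (#preceding elements) / n.
    val (ksStat, _) = triples.foldLeft((0.0, 0.0)) {
      case ((best, prevCt), (mn, mx, ct)) =>
        val adj = prevCt / n
        (Vector(best, (mn + adj).abs, (mx + adj).abs).max, prevCt + ct)
    }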
@@ -76,9 +76,9 @@ private[stat] object KSTest {
   def testOneSample(data: RDD[Double], createDist: () => RealDistribution): KSTestResult = {
     val n = data.count().toDouble
     val localData = data.sortBy(x => x).mapPartitions { part =>
-        val partDiffs = oneSampleDifferences(part, n, createDist) // local distances
-        searchOneSampleCandidates(partDiffs) // candidates: local extrema
-      }.collect()
+      val partDiffs = oneSampleDifferences(part, n, createDist) // local distances
+      searchOneSampleCandidates(partDiffs) // candidates: local extrema
+    }.collect()
     val ksStat = searchOneSampleStatistic(localData, n) // result: global extreme
     evalOneSampleP(ksStat, n.toLong)
   }
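The only difference from the previous overload is where the CDF comes from: a commons-math `RealDistribution` built lazily through a factory. Passing `() => RealDistribution` rather than an instance lets each task construct its own distribution object inside the partition, which would avoid shipping a potentially non-serializable commons-math object through the task closure (a plausible reading of the design, not stated in the diff). A sketch of how such a factory would be consumed (the helper name and body below are hypothetical):

    import org.apache.commons.math3.distribution.{NormalDistribution, RealDistribution}

    // Factory invoked on the executor, so no distribution instance is serialized.
    val stdNormal: () => RealDistribution = () => new NormalDistribution(0, 1)

    def diffsSketch(part: Iterator[Double], n: Double,
        createDist: () => RealDistribution): Iterator[Double] = {
      val dist = createDist() // built once per partition
      part.zipWithIndex.map { case (v, ix) =>
        val cdfVal = dist.cumulativeProbability(v)
        if ((ix + 1) / n > cdfVal) (ix + 1) / n - cdfVal else ix / n - cdfVal
      }
    }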
@@ -101,14 +101,14 @@ private[stat] object KSTest {
     // zip data with index (within that partition)
     // calculate local (unadjusted) ECDF and subtract CDF
     partData.zipWithIndex.map { case (v, ix) =>
-        // dp and dl are later adjusted by constant, when global info is available
-        val dp = (ix + 1) / n
-        val dl = ix / n
-        val cdfVal = cdf(v)
-        // if dp > cdfVal the adjusted dp is still above cdfVal, if dp < cdfVal
-        // we want negative distance so that constant adjusted gives correct distance
-        if (dp > cdfVal) dp - cdfVal else dl - cdfVal
-      }
+      // dp and dl are later adjusted by constant, when global info is available
+      val dp = (ix + 1) / n
+      val dl = ix / n
+      val cdfVal = cdf(v)
+      // if dp > cdfVal the adjusted dp is still above cdfVal, if dp < cdfVal
+      // we want negative distance so that constant adjusted gives correct distance
+      if (dp > cdfVal) dp - cdfVal else dl - cdfVal
+    }
   }

   private def oneSampleDifferences(
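Why both `dp` and `dl`: the ECDF is a step function, so the supremum of |ECDF - CDF| is attained at a sample point, where the CDF must be compared against the ECDF value just after the jump (i/n) and just before it ((i-1)/n). For the globally sorted sample x_(1) <= ... <= x_(n), the one-sample statistic is

    D_n = \sup_x \lvert F_n(x) - F(x) \rvert
        = \max_{1 \le i \le n} \max\left( \tfrac{i}{n} - F(x_{(i)}),\; F(x_{(i)}) - \tfrac{i-1}{n} \right)

Here `dp` and `dl` are i/n and (i-1)/n computed with partition-local indices, which is why they must later be shifted by (count of elements in preceding partitions) / n.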
@@ -132,8 +132,8 @@ private[stat] object KSTest {
     : Iterator[(Double, Double, Double)] = {
     val initAcc = (Double.MaxValue, Double.MinValue, 0.0)
     val partResults = partDiffs.foldLeft(initAcc) { case ((pMin, pMax, pCt), currDiff) =>
-        (Math.min(pMin, currDiff), Math.max(pMax, currDiff), pCt + 1)
-      }
+      (Math.min(pMin, currDiff), Math.max(pMax, currDiff), pCt + 1)
+    }
     Array(partResults).iterator
   }
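Why collecting only extrema is safe: the driver-side adjustment turns an unadjusted difference c into a distance that, as a function of c, first decreases and then increases, so its maximum over a partition can only occur at the partition's smallest or largest difference. The (min, max, count) triple is therefore all the driver needs. A tiny standalone check of the fold itself (invented values):

    val partDiffs = Iterator(-0.12, 0.08, -0.03, 0.15)
    val initAcc = (Double.MaxValue, Double.MinValue, 0.0)
    val (pMin, pMax, pCt) = partDiffs.foldLeft(initAcc) {
      case ((mn, mx, ct), d) => (math.min(mn, d), math.max(mx, d), ct + 1)
    }
    assert((pMin, pMax, pCt) == (-0.12, 0.15, 4.0))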
@@ -152,16 +152,16 @@ private[stat] object KSTest {
     // adjust differences based on the # of elements preceding it, which should provide
     // the correct distance between ECDF and CDF
     val results = localData.foldLeft(initAcc) { case ((prevMax, prevCt), (minCand, maxCand, ct)) =>
-        val adjConst = prevCt / n
-        val pdist1 = minCand + adjConst
-        val pdist2 = maxCand + adjConst
-        // adjust by 1 / N if pre-constant the value is less than cdf and post-constant
-        // it is greater than or equal to the cdf
-        val dist1 = if (pdist1 >= 0 && minCand < 0) pdist1 + 1 / n else Math.abs(pdist1)
-        val dist2 = if (pdist2 >= 0 && maxCand < 0) pdist2 + 1 / n else Math.abs(pdist2)
-        val maxVal = Array(prevMax, dist1, dist2).max
-        (maxVal, prevCt + ct)
-      }
+      val adjConst = prevCt / n
+      val pdist1 = minCand + adjConst
+      val pdist2 = maxCand + adjConst
+      // adjust by 1 / N if pre-constant the value is less than cdf and post-constant
+      // it is greater than or equal to the cdf
+      val dist1 = if (pdist1 >= 0 && minCand < 0) pdist1 + 1 / n else Math.abs(pdist1)
+      val dist2 = if (pdist2 >= 0 && maxCand < 0) pdist2 + 1 / n else Math.abs(pdist2)
+      val maxVal = Array(prevMax, dist1, dist2).max
+      (maxVal, prevCt + ct)
+    }
     results._1
   }
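The subtle branch is the `+ 1 / n`: a negative local difference was computed from `dl = ix / n` (the ECDF step value just below the CDF). If adding the global shift makes it non-negative, the ECDF at that point actually sits at or above the CDF, and the correct comparison uses `dp = dl + 1/n`, hence the extra `1 / n`. A worked check with invented numbers (n = 10, first partition held 5 elements):

    val n = 10.0
    val adjConst = 5.0 / n                  // 5 elements precede this partition
    val (minCand, maxCand) = (-0.45, -0.05) // local, dl-based, both negative
    val pdist1 = minCand + adjConst         // 0.05: crossed zero after the shift
    val dist1 = if (pdist1 >= 0 && minCand < 0) pdist1 + 1 / n else pdist1.abs
    val pdist2 = maxCand + adjConst         // 0.45: also crossed zero
    val dist2 = if (pdist2 >= 0 && maxCand < 0) pdist2 + 1 / n else pdist2.abs
    assert(math.abs(dist1 - 0.15) < 1e-9)   // 0.05 + 1/10
    assert(math.abs(dist2 - 0.55) < 1e-9)   // 0.45 + 1/10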
@@ -177,7 +177,7 @@ private[stat] object KSTest {
     distName match {
       case "stdnorm" => () => new NormalDistribution(0, 1)
       case _ => throw new UnsupportedOperationException(s"$distName not yet supported through" +
-        s"convenience method. Current options are:[stdnorm].")
+        s" convenience method. Current options are:[stdnorm].")
     }

     testOneSample(data, distanceCalc)
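This hunk fixes the error message: the two adjacent string literals previously rendered as "...supported throughconvenience method..." with no space between them. A plain-Scala illustration of the bug and the fix:

    val distName = "myDist"
    val before = s"$distName not yet supported through" + s"convenience method."
    val after  = s"$distName not yet supported through" + s" convenience method."
    assert(before == "myDist not yet supported throughconvenience method.")
    assert(after == "myDist not yet supported through convenience method.")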
Review comment: change "Kolmogorov-Smirnov" to "Kolmogorov-Smirnov (KS)" on first mention. Otherwise, we use "KS" without definition.