-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-2515][mllib] Chi Squared test #1733
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
ff17423
6598379
706d436
3d61582
e6b83f3
4e4e361
50703a5
bc7eb2e
5686082
d64c2fb
7eea80b
e90d90a
c39eeb5
80d03e2
7dde711
e95e485
d286783
cafb3a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -98,6 +98,8 @@ object Statistics { | |
| * expected distribution. | ||
| * | ||
| * Note: the two input Vectors need to have the same size. | ||
| * `observed` cannot contain negative values. | ||
| * `expected` cannot contain nonpositive values. | ||
| * | ||
| * @param observed Vector containing the observed categorical counts/relative frequencies. | ||
| * @param expected Vector containing the expected categorical counts/relative frequencies. | ||
|
|
@@ -114,6 +116,8 @@ object Statistics { | |
| * Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform | ||
| * distribution, with each category having an expected frequency of `1 / observed.size`. | ||
| * | ||
| * Note: `observed` cannot contain negative values. | ||
| * | ||
| * @param observed Vector containing the observed categorical counts/relative frequencies. | ||
| * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, | ||
| * the method used, and the null hypothesis. | ||
|
|
@@ -123,14 +127,25 @@ object Statistics { | |
|
|
||
| /** | ||
| * :: Experimental :: | ||
| * TODO | ||
| * Conduct Pearson's independence test on the input contingency matrix, which cannot contain | ||
| * negative entries or columns or rows that sum up to 0. | ||
| * | ||
| * @param counts The contingency matrix. | ||
| * @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value, | ||
| * the method used, and the null hypothesis. | ||
| */ | ||
| @Experimental | ||
| def chiSqTest(counts: Matrix): ChiSquaredTestResult = ChiSquaredTest.chiSquaredMatrix(counts) | ||
|
|
||
| /** | ||
| * :: Experimental :: | ||
| * TODO | ||
| * Conduct Pearson's independence test for every feature against the label across the input RDD. | ||
| * For each feature, the (feature, label) pairs are converted into a contingency matrix for which | ||
| * the chi-squared statistic is computed. | ||
| * | ||
| * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mention categorical here? |
||
| * @return an array containing the ChiSquaredTestResult for every feature against the label. | ||
| * The order of the elements in the returned array reflects the order of input features. | ||
| */ | ||
| @Experimental | ||
| def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSquaredTestResult] = { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,17 +27,16 @@ import org.apache.spark.rdd.RDD | |
|
|
||
| /** | ||
| * Conduct the Chi-squared test for the input RDDs using the specified method. | ||
|
||
| * Goodness-of-fit test is conducted on two RDD[Double]s, whereas test of independence is conducted | ||
| * on an input of type RDD[Vector] or RDD[LabeledPoint] in which independence between columns is | ||
| * assessed. | ||
| * Goodness-of-fit test is conducted on two `Vectors`, whereas test of independence is conducted | ||
| * on an input of type `Matrix` in which independence between columns is assessed. | ||
| * We also provide a method for computing the chi-squared statistic between each feature and the | ||
| * label for an input `RDD[LabeledPoint]`, return an `Array[ChiSquaredTestResult]` of size = | ||
| * number of features in the input RDD. | ||
| * | ||
| * Supported methods for goodness of fit: `pearson` (default) | ||
| * Supported methods for independence: `pearson` (default) | ||
| * | ||
| * More information on Chi-squared test: http://en.wikipedia.org/wiki/Chi-squared_test | ||
| * More information on Pearson's chi-squared test: | ||
| * http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test | ||
| * | ||
| */ | ||
| private[stat] object ChiSquaredTest extends Logging { | ||
|
||
|
|
||
|
|
@@ -47,17 +46,20 @@ private[stat] object ChiSquaredTest extends Logging { | |
| */ | ||
| case class Method(name: String, chiSqFunc: (Double, Double) => Double) | ||
|
|
||
| // Pearson's chi-squared test: http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test | ||
| val PEARSON = new Method("pearson", (observed: Double, expected: Double) => { | ||
| val dev = observed - expected | ||
| dev * dev / expected | ||
| }) | ||
|
|
||
| // Null hypothesis for the two different types of chi-squared tests to be included in the result. | ||
| object NullHypothesis extends Enumeration { | ||
| type NullHypothesis = Value | ||
| val goodnessOfFit = Value("observed follows the same distribution as expected.") | ||
| val independence = Value("observations in each column are statistically independent.") | ||
| } | ||
|
|
||
| // Method identification based on input methodName string | ||
| private def methodFromString(methodName: String): Method = { | ||
| methodName match { | ||
| case PEARSON.name => PEARSON | ||
|
|
@@ -67,9 +69,9 @@ private[stat] object ChiSquaredTest extends Logging { | |
|
|
||
| /** | ||
| * Conduct Pearson's independence test for each feature against the label across the input RDD. | ||
| * | ||
| * @param data RDD of LabeledPoints. | ||
| * @return Array[ChiSquareTestResult] containing | ||
| * The contingency table is constructed from the raw (feature, label) pairs and used to conduct | ||
| * the independence test. | ||
| * Returns an array containing the ChiSquaredTestResult for every feature against the label. | ||
| */ | ||
| def chiSquaredFeatures(data: RDD[LabeledPoint], | ||
| methodName: String = PEARSON.name): Array[ChiSquaredTestResult] = { | ||
|
|
@@ -102,7 +104,8 @@ private[stat] object ChiSquaredTest extends Logging { | |
| } | ||
|
|
||
| /* | ||
| * Pearson's goodness of fit test. This can be easily made abstract to support other methods. | ||
| * Pearson's goodness of fit test on the input observed and expected counts/relative frequencies. | ||
| * Uniform distribution is assumed when `expected` is not passed in. | ||
| */ | ||
| def chiSquared(observed: Vector, | ||
| expected: Vector = Vectors.dense(Array[Double]()), | ||
|
|
@@ -147,20 +150,23 @@ private[stat] object ChiSquaredTest extends Logging { | |
|
|
||
| // compute chi-squared statistic | ||
| var statistic = 0.0 | ||
| i = 0 | ||
| while (i < observed.size) { | ||
| val obs = observed(i) | ||
| var j = 0 | ||
| while (j < observed.size) { | ||
| val obs = observed(j) | ||
| if (obs != 0.0) { | ||
| statistic += method.chiSqFunc(obs, getExpected(i)) | ||
| statistic += method.chiSqFunc(obs, getExpected(j)) | ||
| } | ||
| j += 1 | ||
| } | ||
| val df = size - 1 | ||
| val pValue = chiSquareComplemented(df, statistic) | ||
| new ChiSquaredTestResult(pValue, df, statistic, PEARSON.name, NullHypothesis.goodnessOfFit.toString) | ||
| new ChiSquaredTestResult(pValue, df, statistic, PEARSON.name, | ||
| NullHypothesis.goodnessOfFit.toString) | ||
| } | ||
|
|
||
| /* | ||
| * Pearson's independence test. This can be easily made abstract to support other methods. | ||
| * Pearson's independence test on the input contingency matrix. | ||
| * TODO: optimize for SparseMatrix when it becomes supported. | ||
| */ | ||
| def chiSquaredMatrix(counts: Matrix, methodName:String = PEARSON.name): ChiSquaredTestResult = { | ||
| val method = methodFromString(methodName) | ||
|
|
@@ -182,22 +188,24 @@ private[stat] object ChiSquaredTest extends Logging { | |
| i += 1 | ||
| } | ||
| if (!colSums.forall(_ > 0.0) || !rowSums.forall(_ > 0.0)) { | ||
| throw new IllegalArgumentException("Chi square statistic cannot be computed for input matrix due to " | ||
| + "0.0 entries in the expected contingency table.") | ||
| throw new IllegalArgumentException("Chi square statistic cannot be computed for input matrix " | ||
|
||
| + "due to 0.0 entries in the expected contingency table.") | ||
| } | ||
| val total = colSums.sum | ||
|
|
||
| // second pass to collect statistic | ||
| var statistic = 0.0 | ||
| i = 0 | ||
| while (i < colMajorArr.size) { | ||
| val expected = colSums(i / numRows) * rowSums(i % numRows) / total | ||
| statistic += method.chiSqFunc(colMajorArr(i), expected) | ||
| var j = 0 | ||
| while (j < colMajorArr.size) { | ||
| val expected = colSums(j / numRows) * rowSums(j % numRows) / total | ||
| statistic += method.chiSqFunc(colMajorArr(j), expected) | ||
| j += 1 | ||
| } | ||
|
|
||
| // Second pass to compute chi-squared statistic | ||
| val df = (numCols - 1) * (numRows - 1) | ||
| val pValue = chiSquareComplemented(df, statistic) | ||
| new ChiSquaredTestResult(pValue, df, statistic, methodName, NullHypothesis.independence.toString) | ||
| new ChiSquaredTestResult(pValue, df, statistic, methodName, | ||
| NullHypothesis.independence.toString) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.stat | |
|
|
||
| import org.scalatest.FunSuite | ||
|
|
||
| import org.apache.spark.SparkException | ||
| import org.apache.spark.mllib.linalg.{Matrices, DenseVector, Vectors} | ||
| import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Vectors} | ||
| import org.apache.spark.mllib.regression.LabeledPoint | ||
| import org.apache.spark.mllib.stat.test.ChiSquaredTest | ||
| import org.apache.spark.mllib.util.LocalSparkContext | ||
| import org.apache.spark.mllib.util.TestingUtils._ | ||
|
|
@@ -34,23 +34,28 @@ class HypothesisTestSuite extends FunSuite with LocalSparkContext { | |
|
|
||
| // Results validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))` | ||
| assert(pearson.statistic === 0.4) | ||
| assert(pearson.degreesOfFreedom === Array(2)) | ||
| assert(pearson.pValue ~= 0.8187 absTol 1e-3) | ||
| assert(pearson.method === ChiSquaredTest.PEARSON) | ||
| assert(pearson.degreesOfFreedom === 2) | ||
| assert(pearson.pValue ~= 0.8187 relTol 1e-4) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| assert(pearson.method === ChiSquaredTest.PEARSON.name) | ||
| assert(pearson.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString) | ||
|
|
||
| // different expected and observed sum | ||
| val observed1 = new DenseVector(Array[Double](21, 38, 43, 80)) | ||
| val expected1 = new DenseVector(Array[Double](3, 5, 7, 20)) | ||
| val c1 = Statistics.chiSqTest(observed1, expected1) | ||
| val pearson1 = Statistics.chiSqTest(observed1, expected1) | ||
|
|
||
| // Results validated against the R command | ||
| // `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))` | ||
| assert(c1.statistic ~= 14.1429 absTol 1e-3) | ||
| assert(c1.degreesOfFreedom === Array(3)) | ||
| assert(c1.pValue ~= 0.002717 absTol 1e-6) | ||
| assert(c1.method === ChiSquaredTest.PEARSON) | ||
| assert(c1.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString) | ||
| assert(pearson1.statistic ~= 14.1429 relTol 1e-4) | ||
| assert(pearson1.degreesOfFreedom === 3) | ||
| assert(pearson1.pValue ~= 0.002717 relTol 1e-4) | ||
| assert(pearson1.method === ChiSquaredTest.PEARSON.name) | ||
| assert(pearson1.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString) | ||
|
|
||
| // SparseVector representation to make sure memory doesn't blow up | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove commented blocks.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's actually meant as a note to perf testers, but okay. |
||
| // Commented out because it takes too long for unit tests. Should be run as part of perf test. | ||
| // val observed2 = new SparseVector(Int.MaxValue, Array(1000005), Array[Double](10.0)) | ||
| // val pearson2 = Statistics.chiSqTest(observed2) | ||
|
|
||
| // Vectors with different sizes | ||
| val observed3 = new DenseVector(Array(1.0, 2.0, 3.0)) | ||
|
|
@@ -66,44 +71,58 @@ class HypothesisTestSuite extends FunSuite with LocalSparkContext { | |
| intercept[IllegalArgumentException](Statistics.chiSqTest(observed, zeroExpected)) | ||
| } | ||
|
|
||
| test("chi squared pearson independence") { | ||
|
|
||
| val data = Array( | ||
| 40.0, 56.0, 31.0, 30.0, | ||
| 24.0, 32.0, 10.0, 15.0, | ||
| 29.0, 42.0, 0.0, 12.0) | ||
| test("chi squared pearson matrix independence") { | ||
| val data = Array(40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0) | ||
| // [[40.0, 56.0, 31.0, 30.0], | ||
| // [24.0, 32.0, 10.0, 15.0], | ||
| // [29.0, 42.0, 0.0, 12.0]] | ||
| val chi = Statistics.chiSqTest(Matrices.dense(3, 4, data)) | ||
| assert(chi.statistic ~= 21.9958 absTol 1e-3) | ||
| assert(chi.degreesOfFreedom === Array(6)) | ||
| assert(chi.pValue ~= 0.001213 absTol 1e-6) | ||
| assert(chi.method === ChiSquaredTest.PEARSON) | ||
| // Results validated against R command | ||
| // `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))` | ||
| assert(chi.statistic ~= 21.9958 relTol 1e-4) | ||
| assert(chi.degreesOfFreedom === 6) | ||
| assert(chi.pValue ~= 0.001213 relTol 1e-4) | ||
| assert(chi.method === ChiSquaredTest.PEARSON.name) | ||
| assert(chi.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString) | ||
|
|
||
| // Negative counts | ||
| val negCounts = Array( | ||
| 4.0, 5.0, 3.0, 3.0, | ||
| 0.0, -3.0, 0.0, 5.0, | ||
| 9.0, 0.0, 0.0, 1.0) | ||
| intercept[SparkException](Statistics.chiSqTest(Matrices.dense(3, 4, negCounts))) | ||
| val negCounts = Array(4.0, 5.0, 3.0, -3.0) | ||
| intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, negCounts))) | ||
|
|
||
| // Row sum = 0.0 | ||
| val rowZero = Array( | ||
| 4.0, 5.0, 3.0, 3.0, | ||
| 0.0, 0.0, 0.0, 0.0, | ||
| 9.0, 0.0, 0.0, 1.0) | ||
| intercept[SparkException](Statistics.chiSqTest(Matrices.dense(3, 4, rowZero))) | ||
| val rowZero = Array(0.0, 1.0, 0.0, 2.0) | ||
| intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, rowZero))) | ||
|
|
||
| // Column sum = 0.0 | ||
| val colZero = Array( | ||
| 1.0, 0.0, 0.0, 2.0, | ||
| 4.0, 5.0, 0.0, 3.0, | ||
| 9.0, 0.0, 0.0, 1.0) | ||
| val colZero = Array(0.0, 0.0, 2.0, 2.0) | ||
| // IllegalArgumentException thrown here since it's thrown on driver, not inside a task | ||
| intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(3, 4, colZero))) | ||
| intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, colZero))) | ||
| } | ||
|
|
||
| test("chi squared pearson features") { | ||
|
|
||
| test("chi squared pearson RDD[LabeledPoint]") { | ||
| // labels: 1.0 (2 / 6), 0.0 (4 / 6) | ||
| // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6) | ||
| // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6) | ||
| val data = Array(new LabeledPoint(0.0, Vectors.dense(0.5, 10.0)), | ||
| new LabeledPoint(0.0, Vectors.dense(1.5, 20.0)), | ||
| new LabeledPoint(1.0, Vectors.dense(1.5, 30.0)), | ||
| new LabeledPoint(0.0, Vectors.dense(3.5, 30.0)), | ||
| new LabeledPoint(0.0, Vectors.dense(3.5, 40.0)), | ||
| new LabeledPoint(1.0, Vectors.dense(3.5, 40.0))) | ||
| for (numParts <- List(2, 4, 6, 8)) { | ||
| val chi = Statistics.chiSqTest(sc.parallelize(data, numParts)) | ||
| val feature1 = chi(0) | ||
| assert(feature1.statistic === 0.75) | ||
| assert(feature1.degreesOfFreedom === 2) | ||
| assert(feature1.pValue ~= 0.6873 relTol 1e-4) | ||
| assert(feature1.method === ChiSquaredTest.PEARSON.name) | ||
| assert(feature1.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString) | ||
| val feature2 = chi(1) | ||
| assert(feature2.statistic === 1.5) | ||
| assert(feature2.degreesOfFreedom === 3) | ||
| assert(feature2.pValue ~= 0.6823 relTol 1e-4) | ||
| assert(feature2.method === ChiSquaredTest.PEARSON.name) | ||
| assert(feature2.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString) | ||
| } | ||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
counts->observed? This table could also be probabilities.