Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
units passed with updated API
  • Loading branch information
dorx committed Aug 7, 2014
commit c39eeb5d4b885f32f6defa06976dccbd06c33c0b
19 changes: 17 additions & 2 deletions mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ object Statistics {
* expected distribution.
*
* Note: the two input Vectors need to have the same size.
* `observed` cannot contain negative values.
* `expected` cannot contain nonpositive values.
*
* @param observed Vector containing the observed categorical counts/relative frequencies.
* @param expected Vector containing the expected categorical counts/relative frequencies.
Expand All @@ -114,6 +116,8 @@ object Statistics {
* Conduct Pearson's chi-squared goodness of fit test of the observed data against the uniform
* distribution, with each category having an expected frequency of `1 / observed.size`.
*
* Note: `observed` cannot contain negative values.
*
* @param observed Vector containing the observed categorical counts/relative frequencies.
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
Expand All @@ -123,14 +127,25 @@ object Statistics {

/**
* :: Experimental ::
* TODO
* Conduct Pearson's independence test on the input contingency matrix, which cannot contain
* negative entries or columns or rows that sum up to 0.
*
* @param counts The contingency matrix.
* @return ChiSquaredTest object containing the test statistic, degrees of freedom, p-value,
* the method used, and the null hypothesis.
*/
@Experimental
def chiSqTest(counts: Matrix): ChiSquaredTestResult = {
  // Thin public entry point: hands the matrix to ChiSquaredTest.chiSquaredMatrix and
  // leaves its `methodName` argument at the default.
  ChiSquaredTest.chiSquaredMatrix(counts)
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

counts -> observed? This table could also be probabilities.


/**
* :: Experimental ::
* TODO
* Conduct Pearson's independence test for every feature against the label across the input RDD.
* For each feature, the (feature, label) pairs are converted into a contingency matrix for which
* the chi-squared statistic is computed.
*
* @param data an `RDD[LabeledPoint]` containing the Labeled dataset.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mention categorical here?

* @return an array containing the ChiSquaredTestResult for every feature against the label.
* The order of the elements in the returned array reflects the order of input features.
*/
@Experimental
def chiSqTest(data: RDD[LabeledPoint]): Array[ChiSquaredTestResult] = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,16 @@ import org.apache.spark.rdd.RDD

/**
* Conduct the chi-squared test for the input RDDs using the specified method.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Chi -> chi

* Goodness-of-fit test is conducted on two RDD[Double]s, whereas test of independence is conducted
* on an input of type RDD[Vector] or RDD[LabeledPoint] in which independence between columns is
* assessed.
* Goodness-of-fit test is conducted on two `Vectors`, whereas test of independence is conducted
* on an input of type `Matrix` in which independence between columns is assessed.
* We also provide a method for computing the chi-squared statistic between each feature and the
* label for an input `RDD[LabeledPoint]`, returning an `Array[ChiSquaredTestResult]` of size =
* number of features in the input RDD.
*
* Supported methods for goodness of fit: `pearson` (default)
* Supported methods for independence: `pearson` (default)
*
* More information on Chi-squared test: http://en.wikipedia.org/wiki/Chi-squared_test
* More information on Pearson's chi-squared test:
* http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
*
*/
private[stat] object ChiSquaredTest extends Logging {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: ChiSquaredTest -> ChiSqTest (to match the public method names)


Expand All @@ -47,17 +46,20 @@ private[stat] object ChiSquaredTest extends Logging {
*/
case class Method(name: String, chiSqFunc: (Double, Double) => Double)

// Pearson's chi-squared test: http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
// The function below is the per-cell contribution: the squared deviation of the observed
// value from the expected value, divided by the expected value.
val PEARSON = new Method("pearson", (obs: Double, exp: Double) => {
  val deviation = obs - exp
  (deviation * deviation) / exp
})

// Null hypothesis for the two different types of chi-squared tests to be included in the result.
// Each value carries a human-readable description; callers in this object pass its `toString`
// to the ChiSquaredTestResult constructor.
object NullHypothesis extends Enumeration {
// Type alias so the enumeration's values can be referred to as `NullHypothesis`.
type NullHypothesis = Value
// Reported by the goodness-of-fit test (observed vs. expected vectors).
val goodnessOfFit = Value("observed follows the same distribution as expected.")
// Reported by the independence test on a contingency matrix.
val independence = Value("observations in each column are statistically independent.")
}

// Method identification based on input methodName string
private def methodFromString(methodName: String): Method = {
methodName match {
case PEARSON.name => PEARSON
Expand All @@ -67,9 +69,9 @@ private[stat] object ChiSquaredTest extends Logging {

/**
* Conduct Pearson's independence test for each feature against the label across the input RDD.
*
* @param data RDD of LabeledPoints.
* @return Array[ChiSquareTestResult] containing
* The contingency table is constructed from the raw (feature, label) pairs and used to conduct
* the independence test.
* Returns an array containing the ChiSquaredTestResult for every feature against the label.
*/
def chiSquaredFeatures(data: RDD[LabeledPoint],
methodName: String = PEARSON.name): Array[ChiSquaredTestResult] = {
Expand Down Expand Up @@ -102,7 +104,8 @@ private[stat] object ChiSquaredTest extends Logging {
}

/*
* Pearson's goodness of fit test. This can be easily made abstract to support other methods.
* Pearson's goodness of fit test on the input observed and expected counts/relative frequencies.
* Uniform distribution is assumed when `expected` is not passed in.
*/
def chiSquared(observed: Vector,
expected: Vector = Vectors.dense(Array[Double]()),
Expand Down Expand Up @@ -147,20 +150,23 @@ private[stat] object ChiSquaredTest extends Logging {

// compute chi-squared statistic
var statistic = 0.0
i = 0
while (i < observed.size) {
val obs = observed(i)
var j = 0
while (j < observed.size) {
val obs = observed(j)
if (obs != 0.0) {
statistic += method.chiSqFunc(obs, getExpected(i))
statistic += method.chiSqFunc(obs, getExpected(j))
}
j += 1
}
val df = size - 1
val pValue = chiSquareComplemented(df, statistic)
new ChiSquaredTestResult(pValue, df, statistic, PEARSON.name, NullHypothesis.goodnessOfFit.toString)
new ChiSquaredTestResult(pValue, df, statistic, PEARSON.name,
NullHypothesis.goodnessOfFit.toString)
}

/*
* Pearson's independence test. This can be easily made abstract to support other methods.
* Pearson's independence test on the input contingency matrix.
* TODO: optimize for SparseMatrix when it becomes supported.
*/
def chiSquaredMatrix(counts: Matrix, methodName:String = PEARSON.name): ChiSquaredTestResult = {
val method = methodFromString(methodName)
Expand All @@ -182,22 +188,24 @@ private[stat] object ChiSquaredTest extends Logging {
i += 1
}
if (!colSums.forall(_ > 0.0) || !rowSums.forall(_ > 0.0)) {
throw new IllegalArgumentException("Chi square statistic cannot be computed for input matrix due to "
+ "0.0 entries in the expected contingency table.")
throw new IllegalArgumentException("Chi square statistic cannot be computed for input matrix "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it may be nice to output the column index or row index here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're returning statistic = Double.NaN for when expected = 0.0 for the GOF test, do we also want to do the same thing here instead of throwing an exception?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this case, if there are empty rows or columns, both observed and expected are 0.0, so we should throw an exception.

Btw, for the case when expected = 0 and observed > 0, the result should be statistics = Inf and pValue = 0.0.

+ "due to 0.0 entries in the expected contingency table.")
}
val total = colSums.sum

// second pass to collect statistic
var statistic = 0.0
i = 0
while (i < colMajorArr.size) {
val expected = colSums(i / numRows) * rowSums(i % numRows) / total
statistic += method.chiSqFunc(colMajorArr(i), expected)
var j = 0
while (j < colMajorArr.size) {
val expected = colSums(j / numRows) * rowSums(j % numRows) / total
statistic += method.chiSqFunc(colMajorArr(j), expected)
j += 1
}

// Second pass to compute chi-squared statistic
val df = (numCols - 1) * (numRows - 1)
val pValue = chiSquareComplemented(df, statistic)
new ChiSquaredTestResult(pValue, df, statistic, methodName, NullHypothesis.independence.toString)
new ChiSquaredTestResult(pValue, df, statistic, methodName,
NullHypothesis.independence.toString)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ package org.apache.spark.mllib.stat

import org.scalatest.FunSuite

import org.apache.spark.SparkException
import org.apache.spark.mllib.linalg.{Matrices, DenseVector, Vectors}
import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.test.ChiSquaredTest
import org.apache.spark.mllib.util.LocalSparkContext
import org.apache.spark.mllib.util.TestingUtils._
Expand All @@ -34,23 +34,28 @@ class HypothesisTestSuite extends FunSuite with LocalSparkContext {

// Results validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
assert(pearson.statistic === 0.4)
assert(pearson.degreesOfFreedom === Array(2))
assert(pearson.pValue ~= 0.8187 absTol 1e-3)
assert(pearson.method === ChiSquaredTest.PEARSON)
assert(pearson.degreesOfFreedom === 2)
assert(pearson.pValue ~= 0.8187 relTol 1e-4)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

~= -> ~==. The latter tells more when something is wrong. (and please also update other places)

assert(pearson.method === ChiSquaredTest.PEARSON.name)
assert(pearson.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString)

// different expected and observed sum
val observed1 = new DenseVector(Array[Double](21, 38, 43, 80))
val expected1 = new DenseVector(Array[Double](3, 5, 7, 20))
val c1 = Statistics.chiSqTest(observed1, expected1)
val pearson1 = Statistics.chiSqTest(observed1, expected1)

// Results validated against the R command
// `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
assert(c1.statistic ~= 14.1429 absTol 1e-3)
assert(c1.degreesOfFreedom === Array(3))
assert(c1.pValue ~= 0.002717 absTol 1e-6)
assert(c1.method === ChiSquaredTest.PEARSON)
assert(c1.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString)
assert(pearson1.statistic ~= 14.1429 relTol 1e-4)
assert(pearson1.degreesOfFreedom === 3)
assert(pearson1.pValue ~= 0.002717 relTol 1e-4)
assert(pearson1.method === ChiSquaredTest.PEARSON.name)
assert(pearson1.nullHypothesis === ChiSquaredTest.NullHypothesis.goodnessOfFit.toString)

// SparseVector representation to make sure memory doesn't blow up
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove commented blocks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's actually meant as a note to perf testers, but okay.

// Commented out because it takes too long for unit tests. Should be run as part of perf test.
// val observed2 = new SparseVector(Int.MaxValue, Array(1000005), Array[Double](10.0))
// val pearson2 = Statistics.chiSqTest(observed2)

// Vectors with different sizes
val observed3 = new DenseVector(Array(1.0, 2.0, 3.0))
Expand All @@ -66,44 +71,58 @@ class HypothesisTestSuite extends FunSuite with LocalSparkContext {
intercept[IllegalArgumentException](Statistics.chiSqTest(observed, zeroExpected))
}

test("chi squared pearson independence") {

val data = Array(
40.0, 56.0, 31.0, 30.0,
24.0, 32.0, 10.0, 15.0,
29.0, 42.0, 0.0, 12.0)
test("chi squared pearson matrix independence") {
val data = Array(40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0)
// [[40.0, 56.0, 31.0, 30.0],
// [24.0, 32.0, 10.0, 15.0],
// [29.0, 42.0, 0.0, 12.0]]
val chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
assert(chi.statistic ~= 21.9958 absTol 1e-3)
assert(chi.degreesOfFreedom === Array(6))
assert(chi.pValue ~= 0.001213 absTol 1e-6)
assert(chi.method === ChiSquaredTest.PEARSON)
// Results validated against R command
// `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
assert(chi.statistic ~= 21.9958 relTol 1e-4)
assert(chi.degreesOfFreedom === 6)
assert(chi.pValue ~= 0.001213 relTol 1e-4)
assert(chi.method === ChiSquaredTest.PEARSON.name)
assert(chi.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString)

// Negative counts
val negCounts = Array(
4.0, 5.0, 3.0, 3.0,
0.0, -3.0, 0.0, 5.0,
9.0, 0.0, 0.0, 1.0)
intercept[SparkException](Statistics.chiSqTest(Matrices.dense(3, 4, negCounts)))
val negCounts = Array(4.0, 5.0, 3.0, -3.0)
intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, negCounts)))

// Row sum = 0.0
val rowZero = Array(
4.0, 5.0, 3.0, 3.0,
0.0, 0.0, 0.0, 0.0,
9.0, 0.0, 0.0, 1.0)
intercept[SparkException](Statistics.chiSqTest(Matrices.dense(3, 4, rowZero)))
val rowZero = Array(0.0, 1.0, 0.0, 2.0)
intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, rowZero)))

// Column sum = 0.0
val colZero = Array(
1.0, 0.0, 0.0, 2.0,
4.0, 5.0, 0.0, 3.0,
9.0, 0.0, 0.0, 1.0)
val colZero = Array(0.0, 0.0, 2.0, 2.0)
// IllegalArgumentException thrown here since it's thrown on driver, not inside a task
intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(3, 4, colZero)))
intercept[IllegalArgumentException](Statistics.chiSqTest(Matrices.dense(2, 2, colZero)))
}

test("chi squared pearson features") {

test("chi squared pearson RDD[LabeledPoint]") {
  // Six labeled points, each with two features; value frequencies are:
  // labels: 1.0 (2 / 6), 0.0 (4 / 6)
  // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
  // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
  val data = Array(
    new LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
    new LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
    new LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
    new LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
    new LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
    new LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))

  // The same expectations are asserted for every partition count, so the per-feature
  // results must not depend on how the input RDD is split.
  for (numParts <- List(2, 4, 6, 8)) {
    val chi = Statistics.chiSqTest(sc.parallelize(data, numParts))

    // Results are returned in feature order: index 0 is the first feature column.
    val feature1 = chi(0)
    assert(feature1.statistic === 0.75)
    assert(feature1.degreesOfFreedom === 2)
    // `~==` (rather than `~=`) so a mismatch reports the compared values instead of a
    // bare assertion failure.
    assert(feature1.pValue ~== 0.6873 relTol 1e-4)
    assert(feature1.method === ChiSquaredTest.PEARSON.name)
    assert(feature1.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString)

    val feature2 = chi(1)
    assert(feature2.statistic === 1.5)
    assert(feature2.degreesOfFreedom === 3)
    assert(feature2.pValue ~== 0.6823 relTol 1e-4)
    assert(feature2.method === ChiSquaredTest.PEARSON.name)
    assert(feature2.nullHypothesis === ChiSquaredTest.NullHypothesis.independence.toString)
  }
}

}