Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix typos and make the unit tests simpler
  • Loading branch information
lu-wang-dl committed May 7, 2018
commit d065634ae70e5f0582eebcba84c90cc06e27e890
Original file line number Diff line number Diff line change
Expand Up @@ -187,22 +187,17 @@ class BisectingKMeansSuite
}

test("BisectingKMeans with Array input") {
def trainTransfromAndComputeCost(dataset: Dataset[_]): (DataFrame, Double) = {
def trainAndComputeCost(dataset: Dataset[_]): Double = {
val model = new BisectingKMeans().setK(k).setMaxIter(1).setSeed(1).fit(dataset)
(model.transform(dataset), model.computeCost(dataset))
model.computeCost(dataset)
}

val (newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
val (transformed, trueCost) = trainTransfromAndComputeCost(dataset)
val (transformedD, doubleArrayCost) = trainTransfromAndComputeCost(newDatasetD)
val (transformedF, floatArrayCost) = trainTransfromAndComputeCost(newDatasetF)

val predictDifferenceD = transformed.select("prediction")
.except(transformedD.select("prediction"))
assert(predictDifferenceD.count() == 0)
val predictDifferenceF = transformed.select("prediction")
.except(transformedF.select("prediction"))
assert(predictDifferenceF.count() == 0)
val (newDataset, newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
val trueCost = trainAndComputeCost(newDataset)
val doubleArrayCost = trainAndComputeCost(newDatasetD)
val floatArrayCost = trainAndComputeCost(newDatasetF)

// checking the cost is fine enough as a sanity check
assert(trueCost ~== doubleArrayCost absTol 1e-6)
assert(trueCost ~== floatArrayCost absTol 1e-6)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,29 +259,19 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
}

test("GaussianMixture with Array input") {
def trainAndTransfrom(dataset: Dataset[_]): DataFrame = {
def trainAndComputlogLikelihood(dataset: Dataset[_]): Double = {
val model = new GaussianMixture().setK(k).setMaxIter(1).setSeed(1).fit(dataset)
model.transform(dataset)
model.summary.logLikelihood
}

val (newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
val transformed = trainAndTransfrom(dataset)
val transformedD = trainAndTransfrom(newDatasetD)
val transformedF = trainAndTransfrom(newDatasetF)

val predictDifferenceD = transformed.select("prediction")
.except(transformedD.select("prediction"))
assert(predictDifferenceD.count() == 0)
val predictDifferenceF = transformed.select("prediction")
.except(transformedF.select("prediction"))
assert(predictDifferenceF.count() == 0)

val probabilityDifferenceD = transformed.select("probability")
.except(transformedD.select("probability"))
assert(probabilityDifferenceD.count() == 0)
val probabilityDifferenceF = transformed.select("probability")
.except(transformedF.select("probability"))
assert(probabilityDifferenceF.count() == 0)
val (newDataset, newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
val trueLikelihood = trainAndComputlogLikelihood(newDataset)
val doubleLikelihood = trainAndComputlogLikelihood(newDatasetD)
val floatLikelihood = trainAndComputlogLikelihood(newDatasetF)

// checking the log-likelihood is fine enough as a sanity check
assert(trueLikelihood == doubleLikelihood)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: assertions should use === instead of ==; the former gives a better error message. (Not necessary to update in this PR.)

assert(trueLikelihood == floatLikelihood)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,22 +201,17 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
}

test("KMean with Array input") {
def trainTransfromAndComputeCost(dataset: Dataset[_]): (DataFrame, Double) = {
def trainAndComputeCost(dataset: Dataset[_]): Double = {
val model = new KMeans().setK(k).setMaxIter(1).setSeed(1).fit(dataset)
(model.transform(dataset), model.computeCost(dataset))
model.computeCost(dataset)
}

val (newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
val (transformed, trueCost) = trainTransfromAndComputeCost(dataset)
val (transformedD, doubleArrayCost) = trainTransfromAndComputeCost(newDatasetD)
val (transformedF, floatArrayCost) = trainTransfromAndComputeCost(newDatasetF)

val predictDifferenceD = transformed.select("prediction")
.except(transformedD.select("prediction"))
assert(predictDifferenceD.count() == 0)
val predictDifferenceF = transformed.select("prediction")
.except(transformedF.select("prediction"))
assert(predictDifferenceF.count() == 0)
val (newDataset, newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
val trueCost = trainAndComputeCost(newDataset)
val doubleArrayCost = trainAndComputeCost(newDatasetD)
val floatArrayCost = trainAndComputeCost(newDatasetF)

// checking the cost is fine enough as a sanity check
assert(trueCost ~== doubleArrayCost absTol 1e-6)
assert(trueCost ~== floatArrayCost absTol 1e-6)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -326,15 +326,15 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead
}

test("LDA with Array input") {
def trainAndLogLikehoodAndPerplexity(dataset: Dataset[_]): (Double, Double) = {
def trainAndLogLikelihoodAndPerplexity(dataset: Dataset[_]): (Double, Double) = {
val model = new LDA().setK(k).setOptimizer("online").setMaxIter(1).setSeed(1).fit(dataset)
(model.logLikelihood(dataset), model.logPerplexity(dataset))
}

val (newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
val (ll, lp) = trainAndLogLikehoodAndPerplexity(dataset)
val (llD, lpD) = trainAndLogLikehoodAndPerplexity(newDatasetD)
val (llF, lpF) = trainAndLogLikehoodAndPerplexity(newDatasetF)
val (newDataset, newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset)
val (ll, lp) = trainAndLogLikelihoodAndPerplexity(newDataset)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: the outputs are not used. I expect they will be used once we fix SPARK-22210.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I want to use this as the base for the comparison after we fix SPARK-22210.

val (llD, lpD) = trainAndLogLikelihoodAndPerplexity(newDatasetD)
val (llF, lpF) = trainAndLogLikelihoodAndPerplexity(newDatasetF)
// TODO: need to compare the result once we fix the seed issue for LDA (SPARK-22210)
assert(llD <= 0.0 && llD != Double.NegativeInfinity)
assert(llF <= 0.0 && llF != Double.NegativeInfinity)
Expand Down
19 changes: 13 additions & 6 deletions mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -248,13 +248,20 @@ object MLTestingUtils extends SparkFunSuite {
models.sliding(2).foreach { case Seq(m1, m2) => modelEquals(m1, m2)}
}

def generateArrayFeatureDataset(dataset: Dataset[_]): (Dataset[_], Dataset[_]) = {
val doubleUDF = udf { (features: Vector) => features.toArray.map(_.toFloat.toDouble)}
val floatUDF = udf { (features: Vector) => features.toArray.map(_.toFloat)}
val newDatasetD = dataset.withColumn("features", doubleUDF(col("features")))
val newDatasetF = dataset.withColumn("features", floatUDF(col("features")))
/**
* Helper function for testing different input types for features. Given a DataFrame, generate
* three output DataFrames: one having vector feature column with float precision, one having
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: the doc should say "features column" (the column is named "features") to make the contract clear.

* double array feature column with float precision, and one having float array feature column.
*/
def generateArrayFeatureDataset(dataset: Dataset[_]): (Dataset[_], Dataset[_], Dataset[_]) = {
val toFloatVectorUDF = udf { (features: Vector) => features.toArray.map(_.toFloat).toVector}
val toDoubleArrayUDF = udf { (features: Vector) => features.toArray}
val toFloatArrayUDF = udf { (features: Vector) => features.toArray.map(_.toFloat)}
val newDataset = dataset.withColumn("features", toFloatVectorUDF(col("features")))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: maybe useful to define "features" as a constant at the beginning of the function

val newDatasetD = dataset.withColumn("features", toDoubleArrayUDF(col("features")))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't truncate the precision to single. Did you want to use newDataset instead of dataset?

val newDatasetF = dataset.withColumn("features", toFloatArrayUDF(col("features")))
assert(newDatasetD.schema("features").dataType.equals(new ArrayType(DoubleType, false)))
assert(newDatasetF.schema("features").dataType.equals(new ArrayType(FloatType, false)))
(newDatasetD, newDatasetF)
(newDataset, newDatasetD, newDatasetF)
}
}