-
Notifications
You must be signed in to change notification settings - Fork 29k
[Spark-23975][ML] Add support of array input for all clustering methods #21195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
31226b4
45e6e96
877c126
c7a14bb
d065634
c9f478e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -326,15 +326,15 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRead | |
| } | ||
|
|
||
| test("LDA with Array input") { | ||
| def trainAndLogLikehoodAndPerplexity(dataset: Dataset[_]): (Double, Double) = { | ||
| def trainAndLogLikelihoodAndPerplexity(dataset: Dataset[_]): (Double, Double) = { | ||
| val model = new LDA().setK(k).setOptimizer("online").setMaxIter(1).setSeed(1).fit(dataset) | ||
| (model.logLikelihood(dataset), model.logPerplexity(dataset)) | ||
| } | ||
|
|
||
| val (newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset) | ||
| val (ll, lp) = trainAndLogLikehoodAndPerplexity(dataset) | ||
| val (llD, lpD) = trainAndLogLikehoodAndPerplexity(newDatasetD) | ||
| val (llF, lpF) = trainAndLogLikehoodAndPerplexity(newDatasetF) | ||
| val (newDataset, newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset) | ||
| val (ll, lp) = trainAndLogLikelihoodAndPerplexity(newDataset) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor: the output are not used. I expect they will be used once we fixed SPARK-22210
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. I want to use this as the base for the comparison after we fix SPARK-22210. |
||
| val (llD, lpD) = trainAndLogLikelihoodAndPerplexity(newDatasetD) | ||
| val (llF, lpF) = trainAndLogLikelihoodAndPerplexity(newDatasetF) | ||
| // TODO: need to compare the result once we fix the seed issue for LDA (SPARK-22210) | ||
| assert(llD <= 0.0 && llD != Double.NegativeInfinity) | ||
| assert(llF <= 0.0 && llF != Double.NegativeInfinity) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -248,13 +248,20 @@ object MLTestingUtils extends SparkFunSuite { | |
| models.sliding(2).foreach { case Seq(m1, m2) => modelEquals(m1, m2)} | ||
| } | ||
|
|
||
| def generateArrayFeatureDataset(dataset: Dataset[_]): (Dataset[_], Dataset[_]) = { | ||
| val doubleUDF = udf { (features: Vector) => features.toArray.map(_.toFloat.toDouble)} | ||
| val floatUDF = udf { (features: Vector) => features.toArray.map(_.toFloat)} | ||
| val newDatasetD = dataset.withColumn("features", doubleUDF(col("features"))) | ||
| val newDatasetF = dataset.withColumn("features", floatUDF(col("features"))) | ||
| /** | ||
| * Helper function for testing different input types for features. Given a DataFrame, generate | ||
| * three output DataFrames: one having vector feature column with float precision, one having | ||
|
||
| * double array feature column with float precision, and one having float array feature column. | ||
| */ | ||
| def generateArrayFeatureDataset(dataset: Dataset[_]): (Dataset[_], Dataset[_], Dataset[_]) = { | ||
| val toFloatVectorUDF = udf { (features: Vector) => features.toArray.map(_.toFloat).toVector} | ||
| val toDoubleArrayUDF = udf { (features: Vector) => features.toArray} | ||
| val toFloatArrayUDF = udf { (features: Vector) => features.toArray.map(_.toFloat)} | ||
| val newDataset = dataset.withColumn("features", toFloatVectorUDF(col("features"))) | ||
|
||
| val newDatasetD = dataset.withColumn("features", toDoubleArrayUDF(col("features"))) | ||
|
||
| val newDatasetF = dataset.withColumn("features", toFloatArrayUDF(col("features"))) | ||
| assert(newDatasetD.schema("features").dataType.equals(new ArrayType(DoubleType, false))) | ||
| assert(newDatasetF.schema("features").dataType.equals(new ArrayType(FloatType, false))) | ||
| (newDatasetD, newDatasetF) | ||
| (newDataset, newDatasetD, newDatasetF) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
minor: should use
===instead of==for assertions, the former gives a better error message. (not necessary to update this PR)