[Spark-7949] [MLlib] [Doc] update document with some missing save/load #6498
```diff
@@ -47,7 +47,7 @@ Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasin
 optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
 
 {% highlight scala %}
-import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
 import org.apache.spark.mllib.linalg.Vectors
 
 // Load and parse the data
@@ -62,6 +62,10 @@ val clusters = KMeans.train(parsedData, numClusters, numIterations)
 // Evaluate clustering by computing Within Set Sum of Squared Errors
 val WSSSE = clusters.computeCost(parsedData)
 println("Within Set Sum of Squared Errors = " + WSSSE)
+
+// Save and load model
+clusters.save(sc, "myModelPath")
+val sameModel = KMeansModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
```
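For readers who want the fragments above as one piece, here is a hedged sketch of the full round trip these doc changes describe. It assumes a Spark 1.4 shell where `sc` is the `SparkContext`; the input path `data/mllib/kmeans_data.txt` (whitespace-separated feature vectors) is an assumption, not quoted from this diff.

```scala
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

// Load and parse whitespace-separated feature vectors
// (the path is an assumption, not part of this diff).
val data = sc.textFile("data/mllib/kmeans_data.txt")
val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

// Cluster the data into two classes with 20 iterations.
val clusters = KMeans.train(parsedData, 2, 20)

// Save the fitted model, load it back, and check that the reloaded
// copy reports the same WSSSE on the same data.
clusters.save(sc, "myModelPath")
val sameModel = KMeansModel.load(sc, "myModelPath")
assert(clusters.computeCost(parsedData) == sameModel.computeCost(parsedData))
```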
```diff
@@ -110,6 +114,10 @@ public class KMeansExample {
 // Evaluate clustering by computing Within Set Sum of Squared Errors
 double WSSSE = clusters.computeCost(parsedData.rdd());
 System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
+
+// Save and load model
+clusters.save(sc.sc(), "myModelPath");
+KMeansModel sameModel = KMeansModel.load(sc.sc(), "myModelPath");
   }
 }
 {% endhighlight %}
```
```diff
@@ -124,7 +132,7 @@ Within Set Sum of Squared Error (WSSSE). You can reduce this error measure by in
 fact the optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
 
 {% highlight python %}
-from pyspark.mllib.clustering import KMeans
+from pyspark.mllib.clustering import KMeans, KMeansModel
 from numpy import array
 from math import sqrt
@@ -143,6 +151,10 @@ def error(point):
 
 WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
 print("Within Set Sum of Squared Error = " + str(WSSSE))
+
+# Save and load model
+clusters.save(sc, "myModelPath")
+sameModel = KMeansModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
```

Member (on `sameModel = KMeansModel.load(...)`): ditto: import KMeansModel
```diff
@@ -312,19 +324,23 @@ Calling `PowerIterationClustering.run` returns a
 which contains the computed clustering assignments.
 
 {% highlight scala %}
-import org.apache.spark.mllib.clustering.PowerIterationClustering
+import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
 import org.apache.spark.mllib.linalg.Vectors
 
 val similarities: RDD[(Long, Long, Double)] = ...
 
-val pic = new PowerIteartionClustering()
+val pic = new PowerIterationClustering()
   .setK(3)
   .setMaxIterations(20)
 val model = pic.run(similarities)
 
 model.assignments.foreach { a =>
   println(s"${a.id} -> ${a.cluster}")
 }
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")
 {% endhighlight %}
 
 A full example that produces the experiment described in the PIC paper can be found under
```

Member (on `val sameModel = PowerIterationClusteringModel.load(...)`): ditto: import PowerIterationClusteringModel
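Because the diff elides how `similarities` is constructed (`= ...`), here is a hedged, self-contained variant with a toy affinity graph; the five node ids, the edge weights, and the `picModelPath` output directory are invented for illustration.

```scala
import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}

// Toy affinity graph of (srcId, dstId, similarity): a tight triple
// {0, 1, 2} and a tight pair {3, 4} joined by one weak edge, so
// k = 2 has an obvious answer.
val similarities = sc.parallelize(Seq(
  (0L, 1L, 1.0), (1L, 2L, 1.0), (2L, 3L, 0.01), (3L, 4L, 1.0)))

val pic = new PowerIterationClustering()
  .setK(2)
  .setMaxIterations(20)
val model = pic.run(similarities)

// Round-trip the model through storage; the assignments are part of
// the saved model and survive the reload.
model.save(sc, "picModelPath")
val sameModel = PowerIterationClusteringModel.load(sc, "picModelPath")
sameModel.assignments.foreach { a =>
  println(s"${a.id} -> ${a.cluster}")
}
```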
```diff
@@ -360,6 +376,10 @@ PowerIterationClusteringModel model = pic.run(similarities);
 for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) {
   System.out.println(a.id() + " -> " + a.cluster());
 }
+
+// Save and load model
+model.save(sc.sc(), "myModelPath");
+PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc.sc(), "myModelPath");
 {% endhighlight %}
 </div>
```
Next file:

```diff
@@ -188,7 +188,7 @@ Here we assume the extracted file is `text8` and in same directory as you run th
 import org.apache.spark._
 import org.apache.spark.rdd._
 import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.Word2Vec
+import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
 
 val input = sc.textFile("text8").map(line => line.split(" ").toSeq)
@@ -201,6 +201,10 @@ val synonyms = model.findSynonyms("china", 40)
 for((synonym, cosineSimilarity) <- synonyms) {
   println(s"$synonym $cosineSimilarity")
 }
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = Word2VecModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 <div data-lang="python">
```

Member (on `val sameModel = Word2VecModel.load(...)`): import Word2VecModel
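The guide trains on the full `text8` dump; as a sketch that runs without that download, a tiny repeated corpus exercises the same save/load path (the sentence, the parameters, and `w2vModelPath` are invented, and the resulting vectors are meaningless).

```scala
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

// Stand-in corpus: repeat one sentence so every word clears
// word2vec's default minimum-count filter.
val sentence = "spark mllib can save and load fitted models".split(" ").toSeq
val input = sc.parallelize(Seq.fill(10)(sentence))

val model = new Word2Vec().setVectorSize(10).setSeed(42L).fit(input)

// Persist the word vectors, reload them, and query the restored copy.
model.save(sc, "w2vModelPath")
val sameModel = Word2VecModel.load(sc, "w2vModelPath")
for ((synonym, cosineSimilarity) <- sameModel.findSynonyms("spark", 3)) {
  println(s"$synonym $cosineSimilarity")
}
```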
Next file:

```diff
@@ -60,7 +60,7 @@ Model is created using the training set and a mean squared error is calculated f
 labels and real labels in the test set.
 
 {% highlight scala %}
-import org.apache.spark.mllib.regression.IsotonicRegression
+import org.apache.spark.mllib.regression.{IsotonicRegression, IsotonicRegressionModel}
 
 val data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt")
@@ -88,6 +88,10 @@ val predictionAndLabel = test.map { point =>
 // Calculate mean squared error between predicted and real labels.
 val meanSquaredError = predictionAndLabel.map{case(p, l) => math.pow((p - l), 2)}.mean()
 println("Mean Squared Error = " + meanSquaredError)
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = IsotonicRegressionModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
```

Member (on `val sameModel = IsotonicRegressionModel.load(...)`): import IsotonicRegressionModel
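The same round trip with an inline toy dataset in place of `sample_isotonic_regression_data.txt` (the four (label, feature, weight) triples and the `isoModelPath` directory are invented):

```scala
import org.apache.spark.mllib.regression.{IsotonicRegression, IsotonicRegressionModel}

// Toy (label, feature, weight) triples with an increasing trend.
val training = sc.parallelize(Seq(
  (1.0, 1.0, 1.0), (2.0, 2.0, 1.0), (1.8, 3.0, 1.0), (4.0, 4.0, 1.0)))

val model = new IsotonicRegression().setIsotonic(true).run(training)

// Save the fitted model and reload it; the restored model predicts
// by the same piecewise-linear interpolation as the original.
model.save(sc, "isoModelPath")
val sameModel = IsotonicRegressionModel.load(sc, "isoModelPath")
println(sameModel.predict(2.5))
```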
```diff
@@ -150,6 +154,10 @@ Double meanSquaredError = new JavaDoubleRDD(predictionAndLabel.map(
 ).rdd()).mean();
 
 System.out.println("Mean Squared Error = " + meanSquaredError);
+
+// Save and load model
+model.save(sc.sc(), "myModelPath");
+IsotonicRegressionModel sameModel = IsotonicRegressionModel.load(sc.sc(), "myModelPath");
 {% endhighlight %}
 </div>
 </div>
```
Review comment: Oops! I missed this before. This should go before the `endhighlight`. Also, you now need to import `KMeansModel`.
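To make both points concrete, the corrected snippet would look roughly like this (sketched for the Scala tab; the committed diff above already shows this final shape):

```
{% highlight scala %}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
...
// Save and load model (inside the block, before the closing tag)
clusters.save(sc, "myModelPath")
val sameModel = KMeansModel.load(sc, "myModelPath")
{% endhighlight %}
```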