
Commit 0674700

hhbyyh authored and jkbradley committed
[SPARK-7949] [MLLIB] [DOC] update document with some missing save/load
add save load for examples:
KMeansModel
PowerIterationClusteringModel
Word2VecModel
IsotonicRegressionModel

Author: Yuhao Yang <[email protected]>

Closes #6498 from hhbyyh/docSaveLoad and squashes the following commits:

7f9f06d [Yuhao Yang] add missing imports
c604cad [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into docSaveLoad
1dd77cc [Yuhao Yang] update document with some missing save/load
1 parent e1067d0 commit 0674700

File tree: 3 files changed, +38 -6 lines changed


docs/mllib-clustering.md

Lines changed: 24 additions & 4 deletions

@@ -47,7 +47,7 @@ Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasin
 optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
 
 {% highlight scala %}
-import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
 import org.apache.spark.mllib.linalg.Vectors
 
 // Load and parse the data
@@ -62,6 +62,10 @@ val clusters = KMeans.train(parsedData, numClusters, numIterations)
 // Evaluate clustering by computing Within Set Sum of Squared Errors
 val WSSSE = clusters.computeCost(parsedData)
 println("Within Set Sum of Squared Errors = " + WSSSE)
+
+// Save and load model
+clusters.save(sc, "myModelPath")
+val sameModel = KMeansModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 
@@ -110,6 +114,10 @@ public class KMeansExample {
     // Evaluate clustering by computing Within Set Sum of Squared Errors
     double WSSSE = clusters.computeCost(parsedData.rdd());
     System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
+
+    // Save and load model
+    clusters.save(sc.sc(), "myModelPath");
+    KMeansModel sameModel = KMeansModel.load(sc.sc(), "myModelPath");
   }
 }
 {% endhighlight %}
@@ -124,7 +132,7 @@ Within Set Sum of Squared Error (WSSSE). You can reduce this error measure by in
 fact the optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
 
 {% highlight python %}
-from pyspark.mllib.clustering import KMeans
+from pyspark.mllib.clustering import KMeans, KMeansModel
 from numpy import array
 from math import sqrt
 
@@ -143,6 +151,10 @@ def error(point):
 
 WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
 print("Within Set Sum of Squared Error = " + str(WSSSE))
+
+# Save and load model
+clusters.save(sc, "myModelPath")
+sameModel = KMeansModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>

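For readers skimming the diff, the KMeans change amounts to the round trip sketched below. This is a minimal sketch, not part of the commit: it assumes a live SparkContext `sc`, the bundled sample file `data/mllib/kmeans_data.txt`, and a writable path "myModelPath"; the cluster count and iteration count are arbitrary.

import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

// Parse whitespace-separated feature vectors
val parsedData = sc.textFile("data/mllib/kmeans_data.txt")
  .map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
  .cache()

// Train k-means with 2 clusters and 20 iterations
val clusters = KMeans.train(parsedData, 2, 20)

// Persist the fitted model, read it back, and confirm the centers survive the round trip
clusters.save(sc, "myModelPath")
val sameModel = KMeansModel.load(sc, "myModelPath")
println(sameModel.clusterCenters.mkString("\n"))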
@@ -312,19 +324,23 @@ Calling `PowerIterationClustering.run` returns a
 which contains the computed clustering assignments.
 
 {% highlight scala %}
-import org.apache.spark.mllib.clustering.PowerIterationClustering
+import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
 import org.apache.spark.mllib.linalg.Vectors
 
 val similarities: RDD[(Long, Long, Double)] = ...
 
-val pic = new PowerIteartionClustering()
+val pic = new PowerIterationClustering()
   .setK(3)
   .setMaxIterations(20)
 val model = pic.run(similarities)
 
 model.assignments.foreach { a =>
   println(s"${a.id} -> ${a.cluster}")
 }
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")
 {% endhighlight %}
 
 A full example that produces the experiment described in the PIC paper can be found under
@@ -360,6 +376,10 @@ PowerIterationClusteringModel model = pic.run(similarities);
 for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) {
   System.out.println(a.id() + " -> " + a.cluster());
 }
+
+// Save and load model
+model.save(sc.sc(), "myModelPath");
+PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc.sc(), "myModelPath");
 {% endhighlight %}
 </div>

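The PowerIterationClustering snippet in the diff leaves `similarities` as `...`; a fully spelled-out version might look like the following. The tiny hand-built similarity graph, k = 2, and the storage path are assumptions for illustration, not content from the commit.

import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}

// (srcId, dstId, similarity) triples for a toy graph of five points
val similarities = sc.parallelize(Seq(
  (0L, 1L, 0.9), (1L, 2L, 0.9), (2L, 3L, 0.1), (3L, 4L, 0.9)))

val pic = new PowerIterationClustering()
  .setK(2)
  .setMaxIterations(20)
val model = pic.run(similarities)

model.assignments.collect().foreach { a =>
  println(s"${a.id} -> ${a.cluster}")
}

// Round-trip the model through storage
model.save(sc, "myModelPath")
val sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")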
docs/mllib-feature-extraction.md

Lines changed: 5 additions & 1 deletion

@@ -188,7 +188,7 @@ Here we assume the extracted file is `text8` and in same directory as you run th
 import org.apache.spark._
 import org.apache.spark.rdd._
 import org.apache.spark.SparkContext._
-import org.apache.spark.mllib.feature.Word2Vec
+import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
 
 val input = sc.textFile("text8").map(line => line.split(" ").toSeq)
 
@@ -201,6 +201,10 @@ val synonyms = model.findSynonyms("china", 40)
 for((synonym, cosineSimilarity) <- synonyms) {
   println(s"$synonym $cosineSimilarity")
 }
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = Word2VecModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 <div data-lang="python">

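Putting the Word2Vec change together end to end gives roughly the sketch below. It assumes `sc`, the `text8` corpus already referenced in the page being edited, and a writable "myModelPath"; running a synonym query against the reloaded model is an illustration of equivalent behaviour, not something the diff itself asserts.

import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

// One whitespace-tokenized sentence per element
val input = sc.textFile("text8").map(line => line.split(" ").toSeq)

val model = new Word2Vec().fit(input)

// Persist the learned word vectors and read them back
model.save(sc, "myModelPath")
val sameModel = Word2VecModel.load(sc, "myModelPath")

// The reloaded model should answer the same synonym queries as the original
for ((synonym, cosineSimilarity) <- sameModel.findSynonyms("china", 10)) {
  println(s"$synonym $cosineSimilarity")
}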
docs/mllib-isotonic-regression.md

Lines changed: 9 additions & 1 deletion

@@ -60,7 +60,7 @@ Model is created using the training set and a mean squared error is calculated f
 labels and real labels in the test set.
 
 {% highlight scala %}
-import org.apache.spark.mllib.regression.IsotonicRegression
+import org.apache.spark.mllib.regression.{IsotonicRegression, IsotonicRegressionModel}
 
 val data = sc.textFile("data/mllib/sample_isotonic_regression_data.txt")
 
@@ -88,6 +88,10 @@ val predictionAndLabel = test.map { point =>
 // Calculate mean squared error between predicted and real labels.
 val meanSquaredError = predictionAndLabel.map{case(p, l) => math.pow((p - l), 2)}.mean()
 println("Mean Squared Error = " + meanSquaredError)
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = IsotonicRegressionModel.load(sc, "myModelPath")
 {% endhighlight %}
 </div>
 
@@ -150,6 +154,10 @@ Double meanSquaredError = new JavaDoubleRDD(predictionAndLabel.map(
 ).rdd()).mean();
 
 System.out.println("Mean Squared Error = " + meanSquaredError);
+
+// Save and load model
+model.save(sc.sc(), "myModelPath");
+IsotonicRegressionModel sameModel = IsotonicRegressionModel.load(sc.sc(), "myModelPath");
 {% endhighlight %}
 </div>
 </div>

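Finally, the isotonic-regression change as one self-contained round trip. A minimal sketch assuming `sc`, the bundled `data/mllib/sample_isotonic_regression_data.txt` (one "label,feature" pair per line), and a writable path; the unit weights and the single predict call are illustrative assumptions, not part of the commit.

import org.apache.spark.mllib.regression.{IsotonicRegression, IsotonicRegressionModel}

// Each line is "label,feature"; attach a unit weight to every example
val parsedData = sc.textFile("data/mllib/sample_isotonic_regression_data.txt").map { line =>
  val Array(label, feature) = line.split(',').map(_.toDouble)
  (label, feature, 1.0)
}

// Fit a monotonically increasing model
val model = new IsotonicRegression().setIsotonic(true).run(parsedData)

// Save the fitted model and load it back
model.save(sc, "myModelPath")
val sameModel = IsotonicRegressionModel.load(sc, "myModelPath")
println(sameModel.predict(0.5))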