@@ -47,7 +47,7 @@ Set Sum of Squared Error (WSSSE). You can reduce this error measure by increasin
optimal *k* is usually one where there is an "elbow" in the WSSSE graph.

{% highlight scala %}
-import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

// Load and parse the data
@@ -62,6 +62,10 @@ val clusters = KMeans.train(parsedData, numClusters, numIterations)
// Evaluate clustering by computing Within Set Sum of Squared Errors
val WSSSE = clusters.computeCost(parsedData)
println("Within Set Sum of Squared Errors = " + WSSSE)
+
+// Save and load model
+clusters.save(sc, "myModelPath")
+val sameModel = KMeansModel.load(sc, "myModelPath")
{% endhighlight %}
</div>

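The "elbow" heuristic mentioned in the surrounding text can be checked empirically by training one model per candidate *k* and comparing the resulting WSSSE values. The snippet below is a minimal sketch rather than part of this patch; it assumes an existing `SparkContext` named `sc`, the same whitespace-separated input format as the example above, and a placeholder file path and range of *k*.

{% highlight scala %}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Build an RDD[Vector] the same way as in the example above (the path is a placeholder).
val parsedData = sc.textFile("data/kmeans_data.txt")
  .map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
  .cache()

// Train one model per candidate k and print its WSSSE; the "elbow" is the k
// after which the error stops dropping sharply.
val numIterations = 20
(1 to 10).foreach { k =>
  val cost = KMeans.train(parsedData, k, numIterations).computeCost(parsedData)
  println(s"k = $k, WSSSE = $cost")
}
{% endhighlight %}
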
@@ -110,6 +114,10 @@ public class KMeansExample {
    // Evaluate clustering by computing Within Set Sum of Squared Errors
    double WSSSE = clusters.computeCost(parsedData.rdd());
    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
+
+    // Save and load model
+    clusters.save(sc.sc(), "myModelPath");
+    KMeansModel sameModel = KMeansModel.load(sc.sc(), "myModelPath");
  }
}
{% endhighlight %}
@@ -124,7 +132,7 @@ Within Set Sum of Squared Error (WSSSE). You can reduce this error measure by in
fact the optimal *k* is usually one where there is an "elbow" in the WSSSE graph.

{% highlight python %}
-from pyspark.mllib.clustering import KMeans
+from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

@@ -143,6 +151,10 @@ def error(point):

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))
+
+# Save and load model
+clusters.save(sc, "myModelPath")
+sameModel = KMeansModel.load(sc, "myModelPath")
{% endhighlight %}
</div>

@@ -312,19 +324,23 @@ Calling `PowerIterationClustering.run` returns a
which contains the computed clustering assignments.

{% highlight scala %}
-import org.apache.spark.mllib.clustering.PowerIterationClustering
+import org.apache.spark.mllib.clustering.{PowerIterationClustering, PowerIterationClusteringModel}
import org.apache.spark.mllib.linalg.Vectors

val similarities: RDD[(Long, Long, Double)] = ...

-val pic = new PowerIteartionClustering()
+val pic = new PowerIterationClustering()
  .setK(3)
  .setMaxIterations(20)
val model = pic.run(similarities)

model.assignments.foreach { a =>
  println(s"${a.id} -> ${a.cluster}")
}
+
+// Save and load model
+model.save(sc, "myModelPath")
+val sameModel = PowerIterationClusteringModel.load(sc, "myModelPath")
{% endhighlight %}

A full example that produces the experiment described in the PIC paper can be found under
@@ -360,6 +376,10 @@ PowerIterationClusteringModel model = pic.run(similarities);
for (PowerIterationClustering.Assignment a: model.assignments().toJavaRDD().collect()) {
  System.out.println(a.id() + " -> " + a.cluster());
}
+
+// Save and load model
+model.save(sc.sc(), "myModelPath");
+PowerIterationClusteringModel sameModel = PowerIterationClusteringModel.load(sc.sc(), "myModelPath");
{% endhighlight %}
</div>

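Both PIC snippets above start from an already-constructed `similarities: RDD[(Long, Long, Double)]` of (srcId, dstId, similarity) tuples and leave its construction elided. The sketch below is purely illustrative and not part of this patch; the toy 2-D points, the Gaussian kernel, and the `sigma` value are all assumptions. PIC expects non-negative similarities and treats them as a symmetric affinity matrix, so each unordered pair is emitted once here.

{% highlight scala %}
import org.apache.spark.mllib.clustering.PowerIterationClustering
import org.apache.spark.rdd.RDD

// Toy points as (id, x, y); in practice these would come from your data.
val points: RDD[(Long, Double, Double)] = sc.parallelize(Seq(
  (0L, 1.0, 1.0), (1L, 1.1, 0.9), (2L, 5.0, 5.0), (3L, 5.1, 4.9)))

// Gaussian (RBF) similarity on Euclidean distance; emit each unordered pair once.
val sigma = 1.0
val similarities: RDD[(Long, Long, Double)] = points.cartesian(points)
  .filter { case ((i, _, _), (j, _, _)) => i < j }
  .map { case ((i, xi, yi), (j, xj, yj)) =>
    val d2 = (xi - xj) * (xi - xj) + (yi - yj) * (yi - yj)
    (i, j, math.exp(-d2 / (2 * sigma * sigma)))
  }

val model = new PowerIterationClustering()
  .setK(2)
  .setMaxIterations(20)
  .run(similarities)
model.assignments.collect().foreach(a => println(s"${a.id} -> ${a.cluster}"))
{% endhighlight %}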