
Commit 37ab747
Parent: da0f27e

Fix some examples and docs due to changes in MLlib API
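
In substance, this commit tracks MLlib's move to Vector-based feature containers: examples now build features with org.apache.spark.mllib.linalg.Vectors.dense rather than passing a raw Array[Double]. A minimal before/after sketch of the pattern the updated examples follow (the values are made up for illustration):

    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint

    // Old API: features passed as a plain Array[Double]
    // val p = LabeledPoint(1.0, Array(0.5, 2.0))

    // New API: features wrapped in a Vector via Vectors.dense
    val p = LabeledPoint(1.0, Vectors.dense(0.5, 2.0))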

2 files changed: +33 -23 lines

docs/mllib-classification-regression.md (27 additions, 18 deletions)
@@ -356,16 +356,17 @@ error.
 import org.apache.spark.SparkContext
 import org.apache.spark.mllib.classification.SVMWithSGD
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.linalg.Vectors
 
 // Load and parse the data file
 val data = sc.textFile("mllib/data/sample_svm_data.txt")
 val parsedData = data.map { line =>
-  val parts = line.split(' ')
-  LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray)
+  val parts = line.split(' ').map(_.toDouble)
+  LabeledPoint(parts(0), Vectors.dense(parts.tail))
 }
 
 // Run training algorithm to build the model
-val numIterations = 20
+val numIterations = 100
 val model = SVMWithSGD.train(parsedData, numIterations)
 
 // Evaluate model on training examples and compute training error
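
The hunk ends at the evaluation comment. For context, with the updated API the training-error computation that follows in the doc would look roughly like this sketch (patterned on the regression example later in this commit; an illustration, not the doc's exact text):

    // Predict on each training point and pair the prediction with the true label
    val labelAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    // Fraction of training points the SVM misclassifies
    val trainErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / parsedData.count
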
@@ -401,29 +402,30 @@ val modelL1 = svmAlg.run(parsedData)
 The following example demonstrate how to load training data, parse it as an RDD of LabeledPoint.
 The example then uses LinearRegressionWithSGD to build a simple linear model to predict label
 values. We compute the Mean Squared Error at the end to evaluate
-[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit)
+[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).
 
 {% highlight scala %}
 import org.apache.spark.mllib.regression.LinearRegressionWithSGD
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.linalg.Vectors
 
 // Load and parse the data
 val data = sc.textFile("mllib/data/ridge-data/lpsa.data")
 val parsedData = data.map { line =>
   val parts = line.split(',')
-  LabeledPoint(parts(0).toDouble, parts(1).split(' ').map(x => x.toDouble).toArray)
+  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
 }
 
 // Building the model
-val numIterations = 20
+val numIterations = 100
 val model = LinearRegressionWithSGD.train(parsedData, numIterations)
 
 // Evaluate model on training examples and compute training error
 val valuesAndPreds = parsedData.map { point =>
   val prediction = model.predict(point.features)
   (point.label, prediction)
 }
-val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.reduce(_ + _)/valuesAndPreds.count
+val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.reduce(_ + _) / valuesAndPreds.count
 println("training Mean Squared Error = " + MSE)
 {% endhighlight %}
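
A side note on the MSE line above: valuesAndPreds.map{...} yields an RDD[Double], so the reduce-and-divide can equivalently be written with the mean() action (an alternative formulation, not what the doc uses; mean() comes from the implicit DoubleRDDFunctions conversion in org.apache.spark.SparkContext._, which the spark-shell imports automatically):

    // Average of squared residuals, computed in a single action
    val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.mean()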

@@ -518,18 +520,22 @@ and make predictions with the resulting model to compute the training error.
 
 {% highlight python %}
 from pyspark.mllib.classification import LogisticRegressionWithSGD
+from pyspark.mllib.regression import LabeledPoint
 from numpy import array
 
 # Load and parse the data
+def parsePoint(line):
+    values = [float(x) for x in line.split(' ')]
+    return LabeledPoint(values[0], values[1:])
+
 data = sc.textFile("mllib/data/sample_svm_data.txt")
-parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
-model = LogisticRegressionWithSGD.train(parsedData)
+parsedData = data.map(parsePoint)
 
 # Build the model
-labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)),
-        model.predict(point.take(range(1, point.size)))))
+model = LogisticRegressionWithSGD.train(parsedData)
 
 # Evaluating the model on training data
+labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
 trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
 print("Training Error = " + str(trainErr))
 {% endhighlight %}

@@ -538,22 +544,25 @@ print("Training Error = " + str(trainErr))
 The following example demonstrate how to load training data, parse it as an RDD of LabeledPoint.
 The example then uses LinearRegressionWithSGD to build a simple linear model to predict label
 values. We compute the Mean Squared Error at the end to evaluate
-[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit)
+[goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).
 
 {% highlight python %}
-from pyspark.mllib.regression import LinearRegressionWithSGD
+from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
 from numpy import array
 
 # Load and parse the data
+def parsePoint(line):
+    values = [float(x) for x in line.replace(',', ' ').split(' ')]
+    return LabeledPoint(values[0], values[1:])
+
 data = sc.textFile("mllib/data/ridge-data/lpsa.data")
-parsedData = data.map(lambda line: array([float(x) for x in line.replace(',', ' ').split(' ')]))
+parsedData = data.map(parsePoint)
 
 # Build the model
 model = LinearRegressionWithSGD.train(parsedData)
 
 # Evaluate the model on training data
-valuesAndPreds = parsedData.map(lambda point: (point.item(0),
-        model.predict(point.take(range(1, point.size)))))
-MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y)/valuesAndPreds.count()
+valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
+MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
 print("Mean Squared Error = " + str(MSE))
-{% endhighlight %}
+{% endhighlight %}

docs/mllib-clustering.md (6 additions, 5 deletions)
@@ -48,14 +48,15 @@ optimal *k* is usually one where there is an "elbow" in the WSSSE graph.
 
 {% highlight scala %}
 import org.apache.spark.mllib.clustering.KMeans
+import org.apache.spark.mllib.linalg.Vectors
 
 // Load and parse the data
-val data = sc.textFile("kmeans_data.txt")
-val parsedData = data.map( _.split(' ').map(_.toDouble))
+val data = sc.textFile("data/kmeans_data.txt")
+val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
 
 // Cluster the data into two classes using KMeans
-val numIterations = 20
 val numClusters = 2
+val numIterations = 20
 val clusters = KMeans.train(parsedData, numClusters, numIterations)
 
 // Evaluate clustering by computing Within Set Sum of Squared Errors
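
This hunk also stops at the WSSSE comment. With the Vector-based parsedData, the evaluation the comment refers to can be computed directly from the trained model (a sketch assuming KMeansModel.computeCost, which takes an RDD[Vector] in this era of the API):

    // Within Set Sum of Squared Errors: total squared distance from each
    // point to its nearest cluster center
    val WSSSE = clusters.computeCost(parsedData)
    println("Within Set Sum of Squared Errors = " + WSSSE)
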
@@ -85,12 +86,12 @@ from numpy import array
 from math import sqrt
 
 # Load and parse the data
-data = sc.textFile("kmeans_data.txt")
+data = sc.textFile("data/kmeans_data.txt")
 parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))
 
 # Build the model (cluster the data)
 clusters = KMeans.train(parsedData, 2, maxIterations=10,
-        runs=30, initialization_mode="random")
+        runs=10, initialization_mode="random")
 
 # Evaluate clustering by computing Within Set Sum of Squared Errors
 def error(point):
