@@ -356,16 +356,17 @@ error.
356356import org.apache.spark.SparkContext
357357import org.apache.spark.mllib.classification.SVMWithSGD
358358import org.apache.spark.mllib.regression.LabeledPoint
359+ import org.apache.spark.mllib.linalg.Vectors
359360
360361// Load and parse the data file
361362val data = sc.textFile("mllib/data/sample_svm_data.txt")
362363val parsedData = data.map { line =>
363- val parts = line.split(' ')
364- LabeledPoint(parts(0).toDouble, parts.tail.map(x => x.toDouble).toArray )
364+ val parts = line.split(' ').map( _ .toDouble)
365+ LabeledPoint(parts(0), Vectors.dense(parts.tail) )
365366}
366367
367368// Run training algorithm to build the model
368- val numIterations = 20
369+ val numIterations = 100
369370val model = SVMWithSGD.train(parsedData, numIterations)
370371
371372// Evaluate model on training examples and compute training error
@@ -401,29 +402,30 @@ val modelL1 = svmAlg.run(parsedData)
401402The following example demonstrates how to load training data, parse it as an RDD of LabeledPoint.
402403The example then uses LinearRegressionWithSGD to build a simple linear model to predict label
403404values. We compute the Mean Squared Error at the end to evaluate
404- [ goodness of fit] ( http://en.wikipedia.org/wiki/Goodness_of_fit )
405+ [goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).
405406
406407{% highlight scala %}
407408import org.apache.spark.mllib.regression.LinearRegressionWithSGD
408409import org.apache.spark.mllib.regression.LabeledPoint
410+ import org.apache.spark.mllib.linalg.Vectors
409411
410412// Load and parse the data
411413val data = sc.textFile("mllib/data/ridge-data/lpsa.data")
412414val parsedData = data.map { line =>
413415 val parts = line.split(',')
414- LabeledPoint(parts(0).toDouble, parts(1).split(' ').map(x => x .toDouble).toArray )
416+ LabeledPoint(parts(0).toDouble, Vectors.dense( parts(1).split(' ').map(_ .toDouble)) )
415417}
416418
417419// Building the model
418- val numIterations = 20
420+ val numIterations = 100
419421val model = LinearRegressionWithSGD.train(parsedData, numIterations)
420422
421423// Evaluate model on training examples and compute training error
422424val valuesAndPreds = parsedData.map { point =>
423425 val prediction = model.predict(point.features)
424426 (point.label, prediction)
425427}
426- val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.reduce(_ + _ )/ valuesAndPreds.count
428+ val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.reduce(_ + _ ) / valuesAndPreds.count
427429println("training Mean Squared Error = " + MSE)
428430{% endhighlight %}
429431
@@ -518,18 +520,22 @@ and make predictions with the resulting model to compute the training error.
518520
519521{% highlight python %}
520522from pyspark.mllib.classification import LogisticRegressionWithSGD
523+ from pyspark.mllib.regression import LabeledPoint
521524from numpy import array
522525
523526# Load and parse the data
527+ def parsePoint(line):
528+ values = [ float(x) for x in line.split(' ')]
529+ return LabeledPoint(values[ 0] , values[ 1:] )
530+
524531data = sc.textFile("mllib/data/sample_svm_data.txt")
525- parsedData = data.map(lambda line: array([ float(x) for x in line.split(' ')] ))
526- model = LogisticRegressionWithSGD.train(parsedData)
532+ parsedData = data.map(parsePoint)
527533
528534# Build the model
529- labelsAndPreds = parsedData.map(lambda point: (int(point.item(0)),
530- model.predict(point.take(range(1, point.size)))))
535+ model = LogisticRegressionWithSGD.train(parsedData)
531536
532537# Evaluating the model on training data
538+ labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
533539trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
534540print("Training Error = " + str(trainErr))
535541{% endhighlight %}
@@ -538,22 +544,25 @@ print("Training Error = " + str(trainErr))
538544The following example demonstrates how to load training data, parse it as an RDD of LabeledPoint.
539545The example then uses LinearRegressionWithSGD to build a simple linear model to predict label
540546values. We compute the Mean Squared Error at the end to evaluate
541- [ goodness of fit] ( http://en.wikipedia.org/wiki/Goodness_of_fit )
547+ [goodness of fit](http://en.wikipedia.org/wiki/Goodness_of_fit).
542548
543549{% highlight python %}
544- from pyspark.mllib.regression import LinearRegressionWithSGD
550+ from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
545551from numpy import array
546552
547553# Load and parse the data
554+ def parsePoint(line):
555+ values = [ float(x) for x in line.replace(',', ' ').split(' ')]
556+ return LabeledPoint(values[ 0] , values[ 1:] )
557+
548558data = sc.textFile("mllib/data/ridge-data/lpsa.data")
549- parsedData = data.map(lambda line: array( [ float(x) for x in line.replace(',', ' ').split(' ') ] ) )
559+ parsedData = data.map(parsePoint )
550560
551561# Build the model
552562model = LinearRegressionWithSGD.train(parsedData)
553563
554564# Evaluate the model on training data
555- valuesAndPreds = parsedData.map(lambda point: (point.item(0),
556- model.predict(point.take(range(1, point.size)))))
557- MSE = valuesAndPreds.map(lambda (v, p): (v - p)** 2).reduce(lambda x, y: x + y)/valuesAndPreds.count()
565+ valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
566+ MSE = valuesAndPreds.map(lambda (v, p): (v - p)** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
558567print("Mean Squared Error = " + str(MSE))
559- {% endhighlight %}
568+ {% endhighlight %}
0 commit comments