
Commit af0e124

Feynman Liang authored and mengxr committed

[SPARK-9905] [ML] [DOC] Adds LinearRegressionSummary user guide
* Adds user guide for `LinearRegressionSummary`
* Fixes unresolved issues in #8197

CC jkbradley mengxr

Author: Feynman Liang <[email protected]>

Closes #8491 from feynmanliang/SPARK-9905.
1 parent 30734d4 commit af0e124

File tree: 1 file changed (+127 -13 lines)

docs/ml-linear-methods.md

Lines changed: 127 additions & 13 deletions
@@ -34,7 +34,7 @@ net](http://users.stat.umn.edu/~zouxx019/Papers/elasticnet.pdf).
Mathematically, it is defined as a convex combination of the $L_1$ and
the $L_2$ regularization terms:
`\[
-\alpha~\lambda \|\wv\|_1 + (1-\alpha) \frac{\lambda}{2}\|\wv\|_2^2, \alpha \in [0, 1], \lambda \geq 0.
+\alpha \left( \lambda \|\wv\|_1 \right) + (1-\alpha) \left( \frac{\lambda}{2}\|\wv\|_2^2 \right), \alpha \in [0, 1], \lambda \geq 0
\]`
By setting $\alpha$ properly, elastic net contains both $L_1$ and $L_2$
regularization as special cases. For example, if a [linear
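
Setting $\alpha$ to its endpoints recovers the two classical penalties, which is the "special cases" claim in the surrounding context; a worked check of the revised formula (illustrative, not part of the commit):

`\[
\alpha = 1: \quad \lambda \|\wv\|_1 \ \text{(the lasso penalty)}, \qquad
\alpha = 0: \quad \frac{\lambda}{2}\|\wv\|_2^2 \ \text{(the ridge penalty)}.
\]`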
@@ -95,15 +95,15 @@ public class LogisticRegressionWithElasticNetExample {

    SparkContext sc = new SparkContext(conf);
    SQLContext sql = new SQLContext(sc);
-   String path = "sample_libsvm_data.txt";
+   String path = "data/mllib/sample_libsvm_data.txt";

    // Load training data
    DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class);

    LogisticRegression lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
-     .setElasticNetParam(0.8)
+     .setElasticNetParam(0.8);

    // Fit the model
    LogisticRegressionModel lrModel = lr.fit(training);
@@ -158,10 +158,12 @@ This will likely change when multiclass classification is supported.
Continuing the earlier example:

{% highlight scala %}
+import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
+
// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example
val trainingSummary = lrModel.summary

-// Obtain the loss per iteration.
+// Obtain the objective per iteration.
val objectiveHistory = trainingSummary.objectiveHistory
objectiveHistory.foreach(loss => println(loss))

@@ -173,17 +175,14 @@ val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]
// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
val roc = binarySummary.roc
roc.show()
-roc.select("FPR").show()
println(binarySummary.areaUnderROC)

-// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
-// this selected threshold.
+// Set the model threshold to maximize F-Measure
val fMeasure = binarySummary.fMeasureByThreshold
val maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0)
val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).
  select("threshold").head().getDouble(0)
-logReg.setThreshold(bestThreshold)
-logReg.fit(logRegDataFrame)
+lrModel.setThreshold(bestThreshold)
{% endhighlight %}
</div>
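
The revised Scala snippet above sets the tuned threshold on the already-trained `lrModel` rather than refitting via the stale `logReg` references. A minimal follow-up sketch of how the tuned model would then be applied, assuming the `training` DataFrame from the earlier example is still in scope (the column selection is illustrative, not part of the commit):

{% highlight scala %}
// Score the training data with the tuned threshold; transform() appends
// rawPrediction, probability, and prediction columns to the input DataFrame.
val predictions = lrModel.transform(training)
predictions.select("label", "probability", "prediction").show()
{% endhighlight %}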

@@ -199,8 +198,12 @@ This will likely change when multiclass classification is supported.
Continuing the earlier example:

{% highlight java %}
+import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary;
+import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary;
+import org.apache.spark.sql.functions;
+
// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example
-LogisticRegressionTrainingSummary trainingSummary = logRegModel.summary();
+LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();

// Obtain the loss per iteration.
double[] objectiveHistory = trainingSummary.objectiveHistory();
@@ -222,20 +225,131 @@ System.out.println(binarySummary.areaUnderROC());
// Get the threshold corresponding to the maximum F-Measure and rerun LogisticRegression with
// this selected threshold.
DataFrame fMeasure = binarySummary.fMeasureByThreshold();
-double maxFMeasure = fMeasure.select(max("F-Measure")).head().getDouble(0);
+double maxFMeasure = fMeasure.select(functions.max("F-Measure")).head().getDouble(0);
double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)).
  select("threshold").head().getDouble(0);
-logReg.setThreshold(bestThreshold);
-logReg.fit(logRegDataFrame);
+lrModel.setThreshold(bestThreshold);
{% endhighlight %}
</div>

+<!--- TODO: Add python model summaries once implemented -->
<div data-lang="python" markdown="1">
Logistic regression model summary is not yet supported in Python.
</div>

</div>

+## Example: Linear Regression
+
+The interface for working with linear regression models and model
+summaries is similar to the logistic regression case. The following
+example demonstrates training an elastic net regularized linear
+regression model and extracting model summary statistics.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.mllib.util.MLUtils
+
+// Load training data
+val training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+
+val lr = new LinearRegression()
+  .setMaxIter(10)
+  .setRegParam(0.3)
+  .setElasticNetParam(0.8)
+
+// Fit the model
+val lrModel = lr.fit(training)
+
+// Print the weights and intercept for linear regression
+println(s"Weights: ${lrModel.weights} Intercept: ${lrModel.intercept}")
+
+// Summarize the model over the training set and print out some metrics
+val trainingSummary = lrModel.summary
+println(s"numIterations: ${trainingSummary.totalIterations}")
+println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
+trainingSummary.residuals.show()
+println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
+println(s"r2: ${trainingSummary.r2}")
+{% endhighlight %}
+</div>
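
The RMSE and r2 values printed by the new summary follow the standard definitions; with labels $y_i$, predictions $\hat{y}_i$, and label mean $\bar{y}$ over $n$ training points (a worked note, not part of the commit):

`\[
\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^n (y_i - \hat{y}_i)^2}, \qquad
r^2 = 1 - \frac{\sum_{i=1}^n (y_i - \hat{y}_i)^2}{\sum_{i=1}^n (y_i - \bar{y})^2}.
\]`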
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+import org.apache.spark.ml.regression.LinearRegression;
+import org.apache.spark.ml.regression.LinearRegressionModel;
+import org.apache.spark.ml.regression.LinearRegressionTrainingSummary;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+
+public class LinearRegressionWithElasticNetExample {
+  public static void main(String[] args) {
+    SparkConf conf = new SparkConf()
+      .setAppName("Linear Regression with Elastic Net Example");
+
+    SparkContext sc = new SparkContext(conf);
+    SQLContext sql = new SQLContext(sc);
+    String path = "data/mllib/sample_libsvm_data.txt";
+
+    // Load training data
+    DataFrame training = sql.createDataFrame(MLUtils.loadLibSVMFile(sc, path).toJavaRDD(), LabeledPoint.class);
+
+    LinearRegression lr = new LinearRegression()
+      .setMaxIter(10)
+      .setRegParam(0.3)
+      .setElasticNetParam(0.8);
+
+    // Fit the model
+    LinearRegressionModel lrModel = lr.fit(training);
+
+    // Print the weights and intercept for linear regression
+    System.out.println("Weights: " + lrModel.weights() + " Intercept: " + lrModel.intercept());
+
+    // Summarize the model over the training set and print out some metrics
+    LinearRegressionTrainingSummary trainingSummary = lrModel.summary();
+    System.out.println("numIterations: " + trainingSummary.totalIterations());
+    System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
+    trainingSummary.residuals().show();
+    System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
+    System.out.println("r2: " + trainingSummary.r2());
+  }
+}
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+<!--- TODO: Add python model summaries once implemented -->
+{% highlight python %}
+from pyspark.ml.regression import LinearRegression
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
+
+# Load training data
+training = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+
+lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
+
+# Fit the model
+lrModel = lr.fit(training)
+
+# Print the weights and intercept for linear regression
+print("Weights: " + str(lrModel.weights))
+print("Intercept: " + str(lrModel.intercept))
+
+# Linear regression model summary is not yet supported in Python.
+{% endhighlight %}
+</div>
+
+</div>
+
# Optimization

The optimization algorithm underlying the implementation is called
