-
[`LogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionTrainingSummary)
-provides an interface to access information such as `objectiveHistory` and metrics
-to evaluate the performance on the training data directly with very less code to be rewritten by
-the user. [`LogisticRegression`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression)
-currently supports only binary classification and hence in order to access the binary metrics
-the summary must be explicitly cast to
-[BinaryLogisticRegressionTrainingSummary](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary)
-as done in the code below. This avoids raising errors for multiclass outputs while providing
-extensiblity when multiclass classification is supported in the future.
+provides a summary for a
+[`LogisticRegressionModel`](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegressionModel).
+Currently, only binary classification is supported and the
+summary must be explicitly cast to
+[`BinaryLogisticRegressionTrainingSummary`](api/scala/index.html#org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummary).
+This will likely change when multiclass classification is supported.
-This example illustrates the use of `LogisticRegressionTrainingSummary` on some toy data.
+Continuing the earlier example:
{% highlight scala %}
-import org.apache.spark.ml.classification.{LogisticRegression, BinaryLogisticRegressionSummary}
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.sql.Row
-
-// Use some random data for demonstration.
-// Note that the RDD of LabeledPoints can be converted to a dataframe directly.
-val data = sc.parallelize(Array(
- LabeledPoint(0.0, Vectors.dense(0.2, 4.5, 1.6)),
- LabeledPoint(1.0, Vectors.dense(3.1, 6.8, 3.6)),
- LabeledPoint(0.0, Vectors.dense(2.4, 0.9, 1.9)),
- LabeledPoint(1.0, Vectors.dense(9.1, 3.1, 3.6)),
- LabeledPoint(0.0, Vectors.dense(2.5, 1.9, 9.1)))
-)
-val logRegDataFrame = data.toDF()
-
-// Run Logistic Regression on your toy data.
-// Since LogisticRegression is an estimator, it returns an instance of LogisticRegressionModel
-// which is a transformer.
-val logReg = new LogisticRegression().setMaxIter(5).setRegParam(0.01)
-val logRegModel = logReg.fit(logRegDataFrame)
-
-// Extract the summary directly from the returned LogisticRegressionModel instance.
-val trainingSummary = logRegModel.summary
+// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example
+val trainingSummary = lrModel.summary
// Obtain the loss per iteration.
val objectiveHistory = trainingSummary.objectiveHistory
@@ -206,60 +187,30 @@ logReg.fit(logRegDataFrame)
{% endhighlight %}
-
-[`LogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary)
-provides an interface to access information such as `objectiveHistory` and metrics
-to evaluate the performance on the training data directly with very less code to be rewritten by
-the user. [`LogisticRegression`](api/java/org/apache/spark/ml/classification/LogisticRegression)
-currently supports only binary classification and hence in order to access the binary metrics
-the summary must be explicitly cast to
-[BinaryLogisticRegressionTrainingSummary](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary)
-as done in the code below. This avoids raising errors for multiclass outputs while providing
-extensiblity when multiclass classification is supported in the future
+
+[`LogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/LogisticRegressionTrainingSummary.html)
+provides a summary for a
+[`LogisticRegressionModel`](api/java/org/apache/spark/ml/classification/LogisticRegressionModel.html).
+Currently, only binary classification is supported and the
+summary must be explicitly cast to
+[`BinaryLogisticRegressionTrainingSummary`](api/java/org/apache/spark/ml/classification/BinaryLogisticRegressionTrainingSummary.html).
+This will likely change when multiclass classification is supported.
-This example illustrates the use of `LogisticRegressionTrainingSummary` on some toy data.
+Continuing the earlier example:
{% highlight java %}
-import com.google.common.collect.Lists;
-
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary;
-import org.apache.spark.ml.classification.LogisticRegression;
-import org.apache.spark.ml.classification.LogisticRegressionModel;
-import org.apache.spark.ml.classification.LogisticRegressionTrainingSummary;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.sql.DataFrame;
-import org.apache.spark.sql.Row;
-import static org.apache.spark.sql.functions.*;
-
-// Use some random data for demonstration.
-// Note that the RDD of LabeledPoints can be converted to a dataframe directly.
-JavaRDD data = sc.parallelize(Lists.newArrayList(
- new LabeledPoint(0.0, Vectors.dense(0.2, 4.5, 1.6)),
- new LabeledPoint(1.0, Vectors.dense(3.1, 6.8, 3.6)),
- new LabeledPoint(0.0, Vectors.dense(2.4, 0.9, 1.9)),
- new LabeledPoint(1.0, Vectors.dense(9.1, 3.1, 3.6)),
- new LabeledPoint(0.0, Vectors.dense(2.5, 1.9, 9.1)))
-);
-DataFrame logRegDataFrame = sql.createDataFrame(data, LabeledPoint.class);
-
-// Run Logistic Regression on your toy data.
-// Since LogisticRegression is an estimator, it returns an instance of LogisticRegressionModel
-// which is a transformer.
-LogisticRegression logReg = new LogisticRegression().setMaxIter(5).setRegParam(0.01);
-LogisticRegressionModel logRegModel = logReg.fit(logRegDataFrame);
-
-// Extract the summary directly from the returned LogisticRegressionModel instance.
+// Extract the summary from the returned LogisticRegressionModel instance trained in the earlier example
LogisticRegressionTrainingSummary trainingSummary = logRegModel.summary();
// Obtain the loss per iteration.
double[] objectiveHistory = trainingSummary.objectiveHistory();
-for (double lossPerIteration: objectiveHistory) {
+for (double lossPerIteration : objectiveHistory) {
System.out.println(lossPerIteration);
}
// Obtain the metrics useful to judge performance on test data.
+// We cast the summary to a BinaryLogisticRegressionSummary since the problem is a
+// binary classification problem.
BinaryLogisticRegressionSummary binarySummary = (BinaryLogisticRegressionSummary) trainingSummary;
// Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
@@ -278,4 +229,18 @@ logReg.setThreshold(bestThreshold);
logReg.fit(logRegDataFrame);
{% endhighlight %}
+
+
+Logistic regression model summary is not yet supported in Python.
+
+
+
+# Optimization
+
+The optimization algorithm underlying the implementation is called
+[Orthant-Wise Limited-memory
+Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
+(OWL-QN). It is an extension of L-BFGS that can effectively handle L1
+regularization and elastic net.
+