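Fixed bugs from previous commit: examples now select the renamed myProbability column and match it as a Vector; removed the constructor calls that disabled raw-prediction/probability output columns by default; made PredictionModel public; cleaned up unused imports and Scaladoc.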

apache · jkbradley · Nov 24, 2014 · Nov 24, 2014 · Dec 1, 2014 · Dec 4, 2014
commit 934f97b7c876fe704dbe8a55d2b8bcff798b6b59
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -102,7 +102,7 @@ public static void main(String[] args) {
     // 'probability' column since we renamed the lr.probabilityCol parameter previously.
     model2.transform(test).registerTempTable("results");
     DataFrame results =
-        jsql.sql("SELECT features, label, probability, prediction FROM results");
+        jsql.sql("SELECT features, label, myProbability, prediction FROM results");
     for (Row r: results.collect()) {
       System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
           + ", prediction=" + r.get(3));

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
@@ -43,8 +43,6 @@ object DeveloperApiExample {
     import sqlContext._
 
     // Prepare training data.
-    // We use LabeledPoint, which is a case class.  Spark SQL can convert RDDs of Java Beans
-    // into SchemaRDDs, where it uses the bean metadata to infer the schema.
     val training = sparkContext.parallelize(Seq(
       LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
       LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),

diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -93,7 +93,7 @@ object SimpleParamsExample {
     model2.transform(test)
       .select('features, 'label, 'myProbability, 'prediction)
       .collect()
-      .foreach { case Row(features: Vector, label: Double, prob: Double, prediction: Double) =>
+      .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
         println("(" + features + ", " + label + ") -> prob=" + prob + ", prediction=" + prediction)
       }
 

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.ml.classification
 
-import scala.reflect.runtime.universe._
-
 import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
 import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams}
 import org.apache.spark.ml.param.{Params, ParamMap, HasRawPredictionCol}
@@ -62,8 +60,6 @@ abstract class Classifier[
   extends Predictor[FeaturesType, Learner, M]
   with ClassifierParams {
 
-  setRawPredictionCol("") // Do not output by default
-
   def setRawPredictionCol(value: String): Learner =
     set(rawPredictionCol, value).asInstanceOf[Learner]
 
@@ -82,8 +78,6 @@ abstract class Classifier[
 abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[FeaturesType, M]]
   extends PredictionModel[FeaturesType, M] with ClassifierParams {
 
-  setRawPredictionCol("") // Do not output by default
-
   def setRawPredictionCol(value: String): M = set(rawPredictionCol, value).asInstanceOf[M]
 
   /** Number of classes (values which the label can take). */

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.classification
 import org.apache.spark.annotation.AlphaComponent
 import org.apache.spark.ml.param._
 import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
-import org.apache.spark.mllib.linalg.{VectorUDT, Vectors, BLAS, Vector}
+import org.apache.spark.mllib.linalg.{BLAS, Vector, VectorUDT, Vectors}
 import org.apache.spark.sql._
 import org.apache.spark.sql.Dsl._
 import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
@@ -35,6 +35,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
 
 /**
  * :: AlphaComponent ::
+ *
  * Logistic regression.
  * Currently, this class only supports binary classification.
  */
@@ -86,6 +87,7 @@ class LogisticRegression
 
 /**
  * :: AlphaComponent ::
+ *
  * Model produced by [[LogisticRegression]].
  */
 @AlphaComponent

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
@@ -17,8 +17,6 @@
 
 package org.apache.spark.ml.classification
 
-import scala.reflect.runtime.universe._
-
 import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
 import org.apache.spark.ml.param.{HasProbabilityCol, ParamMap, Params}
 import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
@@ -42,8 +40,10 @@ private[classification] trait ProbabilisticClassifierParams
   }
 }
 
+
 /**
  * :: AlphaComponent ::
+ *
  * Single-label binary or multiclass classifier which can output class conditional probabilities.
  *
  * @tparam FeaturesType  Type of input features.  E.g., [[Vector]]
@@ -57,13 +57,13 @@ abstract class ProbabilisticClassifier[
     M <: ProbabilisticClassificationModel[FeaturesType, M]]
   extends Classifier[FeaturesType, Learner, M] with ProbabilisticClassifierParams {
 
-  setProbabilityCol("") // Do not output by default
-
   def setProbabilityCol(value: String): Learner = set(probabilityCol, value).asInstanceOf[Learner]
 }
 
+
 /**
  * :: AlphaComponent ::
+ *
  * Model produced by a [[ProbabilisticClassifier]].
  * Classes are indexed {0, 1, ..., numClasses - 1}.
  *
@@ -76,8 +76,6 @@ abstract class ProbabilisticClassificationModel[
     M <: ProbabilisticClassificationModel[FeaturesType, M]]
   extends ClassificationModel[FeaturesType, M] with ProbabilisticClassifierParams {
 
-  setProbabilityCol("") // Do not output by default
-
   def setProbabilityCol(value: String): M = set(probabilityCol, value).asInstanceOf[M]
 
   /**

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -27,6 +27,7 @@ import org.apache.spark.sql.types.DoubleType
 
 /**
  * :: AlphaComponent ::
+ *
  * Evaluator for binary classification, which expects two input columns: score and label.
  */
 @AlphaComponent

diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.impl.estimator
 
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.mllib.linalg.Vector
@@ -62,6 +62,8 @@ trait PredictorParams extends Params
 }
 
 /**
+ * :: AlphaComponent ::
+ *
  * Abstraction for prediction problems (regression and classification).
  *
  * @tparam FeaturesType  Type of features.
@@ -71,7 +73,7 @@ trait PredictorParams extends Params
  * @tparam M  Specialization of [[PredictionModel]].  If you subclass this type, use this type
  *            parameter to specify the concrete type for the corresponding model.
  */
-@DeveloperApi
+@AlphaComponent
 abstract class Predictor[
     FeaturesType,
     Learner <: Predictor[FeaturesType, Learner, M],
@@ -124,7 +126,18 @@ abstract class Predictor[
   }
 }
 
-private[ml] abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, M]]
+/**
+ * :: AlphaComponent ::
+ *
+ * Abstraction for a model for prediction tasks (regression and classification).
+ *
+ * @tparam FeaturesType  Type of features.
+ *                       E.g., [[org.apache.spark.mllib.linalg.Vector]] for vector features.
+ * @tparam M  Specialization of [[PredictionModel]].  If you subclass this type, use this type
+ *            parameter to specify the concrete type for the corresponding model.
+ */
+@AlphaComponent
+abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, M]]
   extends Model[M] with PredictorParams {
 
   def setFeaturesCol(value: String): M = set(featuresCol, value).asInstanceOf[M]

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -19,14 +19,12 @@ package org.apache.spark.ml.param
 
 import scala.annotation.varargs
 import scala.collection.mutable
-import scala.reflect.runtime.universe._
 
 import java.lang.reflect.Modifier
 
-import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
+import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
 import org.apache.spark.ml.Identifiable
 import org.apache.spark.sql._
-import org.apache.spark.sql.catalyst.ScalaReflection
 
 /**
  * :: AlphaComponent ::

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -32,7 +32,8 @@ private[regression] trait LinearRegressionParams extends RegressorParams
 
 /**
  * :: AlphaComponent ::
- * Logistic regression.
+ *
+ * Linear regression.
  */
 @AlphaComponent
 class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel]
@@ -78,6 +79,7 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress
 
 /**
  * :: AlphaComponent ::
+ *
  * Model produced by [[LinearRegression]].
  */
 @AlphaComponent

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
@@ -30,6 +30,7 @@ trait RegressorParams extends PredictorParams
 
 /**
  * :: AlphaComponent ::
+ *
  * Single-label regression
  *
  * @tparam FeaturesType  Type of input features.  E.g., [[org.apache.spark.mllib.linalg.Vector]]
@@ -49,6 +50,7 @@ abstract class Regressor[
 
 /**
  * :: AlphaComponent ::
+ *
  * Model produced by a [[Regressor]].
  *
  * @tparam FeaturesType  Type of input features.  E.g., [[org.apache.spark.mllib.linalg.Vector]]

diff --git a/mllib/src/test/java/org/apache/spark/ml/JavaLabeledPointSuite.java b/mllib/src/test/java/org/apache/spark/ml/JavaLabeledPointSuite.java
diff --git a/mllib/src/test/scala/org/apache/spark/ml/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/LabeledPointSuite.scala
-Original file line number
+Diff line change
@@ Expand Up / @@ -27,6 +27,7 @@ import org.apache.spark.sql.types.DoubleType @@
     /**
      * :: AlphaComponent ::
+     *
      * Evaluator for binary classification, which expects two input columns: score and label.
      */
     @AlphaComponent
@@ Expand Down @@