tuning summary

apache · hhbyyh · Dec 3, 2016 · Dec 5, 2016 · Dec 5, 2016 · Dec 5, 2016
commit d1e22d58f5ecb5972f2ea528dc18d1230d678424
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -20,11 +20,10 @@ package org.apache.spark.ml.tuning
 import java.util.{List => JList}
 
 import scala.collection.JavaConverters._
-
 import com.github.fommil.netlib.F2jBLAS
 import org.apache.hadoop.fs.Path
+import org.apache.spark.SparkException
 import org.json4s.DefaultFormats
-
 import org.apache.spark.annotation.Since
 import org.apache.spark.internal.Logging
 import org.apache.spark.ml._
@@ -127,7 +126,10 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
     logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
     logInfo(s"Best cross-validation metric: $bestMetric.")
     val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
-    copyValues(new CrossValidatorModel(uid, bestModel, metrics).setParent(this))
+    val model = copyValues(new CrossValidatorModel(uid, bestModel, metrics).setParent(this))
+    val summary = new TuningSummary(bestModel.transform(dataset), epm, metrics, bestIndex)
+    model.setSummary(Some(summary))
+    model
   }
 
   @Since("1.4.0")
@@ -234,6 +236,29 @@ class CrossValidatorModel private[ml] (
 
   @Since("1.6.0")
   override def write: MLWriter = new CrossValidatorModel.CrossValidatorModelWriter(this)
+
+  private var trainingSummary: Option[TuningSummary] = None // initially empty; see setSummary
+
+  private[tuning] def setSummary(summary: Option[TuningSummary]): this.type = {
+    this.trainingSummary = summary // replaces any previously stored summary
+    this // returns this instance, so the call can be chained
+  }
+
+  /**
+   * Returns true if a tuning summary has been set on this model.
+   */
+  @Since("2.2.0")
+  def hasSummary: Boolean = trainingSummary.nonEmpty
+
+  /**
+   * Gets summary of model on training set. A `SparkException` is
+   * thrown if no summary has been set on this model (see `hasSummary`).
+   */
+  @Since("2.2.0")
+  def summary: TuningSummary = trainingSummary.getOrElse {
+    throw new SparkException(
+      s"No training summary available for the ${this.getClass.getSimpleName}")
+  }
 }
 
 @Since("1.6.0")

diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala
@@ -25,6 +25,7 @@ import scala.language.existentials
 import org.apache.hadoop.fs.Path
 import org.json4s.DefaultFormats
 
+import org.apache.spark.SparkException
 import org.apache.spark.annotation.Since
 import org.apache.spark.internal.Logging
 import org.apache.spark.ml.{Estimator, Model}
@@ -123,7 +124,10 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St
     logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
     logInfo(s"Best train validation split metric: $bestMetric.")
     val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
-    copyValues(new TrainValidationSplitModel(uid, bestModel, metrics).setParent(this))
+    val model = copyValues(new TrainValidationSplitModel(uid, bestModel, metrics).setParent(this))
+    val summary = new TuningSummary(bestModel.transform(dataset), epm, metrics, bestIndex)
+    model.setSummary(Some(summary))
+    model
   }
 
   @Since("1.5.0")
@@ -226,6 +230,29 @@ class TrainValidationSplitModel private[ml] (
 
   @Since("2.0.0")
   override def write: MLWriter = new TrainValidationSplitModel.TrainValidationSplitModelWriter(this)
+
+  private var trainingSummary: Option[TuningSummary] = None // initially empty; see setSummary
+
+  private[tuning] def setSummary(summary: Option[TuningSummary]): this.type = {
+    this.trainingSummary = summary // replaces any previously stored summary
+    this // returns this instance, so the call can be chained
+  }
+
+  /**
+   * Returns true if a tuning summary has been set on this model.
+   */
+  @Since("2.2.0")
+  def hasSummary: Boolean = trainingSummary.nonEmpty
+
+  /**
+   * Gets summary of model on training set. A `SparkException` is
+   * thrown if no summary has been set on this model (see `hasSummary`).
+   */
+  @Since("2.2.0")
+  def summary: TuningSummary = trainingSummary.getOrElse {
+    throw new SparkException(
+      s"No training summary available for the ${this.getClass.getSimpleName}")
+  }
 }
 
 @Since("2.0.0")
@@ -275,3 +302,4 @@ object TrainValidationSplitModel extends MLReadable[TrainValidationSplitModel] {
     }
   }
 }
+
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TuningSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TuningSummary.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.tuning
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+
+/**
+ * :: Experimental ::
+ * Summary of grid search tuning.
+ *
+ * @param predictions  output of the best model's `transform` on the dataset passed to `fit`
+ * @param params  estimator param maps
+ * @param metrics  Corresponding evaluation metrics for the param maps
+ * @param bestIndex  index of the best param map within `params` (and of its metric)
+ */
+@Since("2.2.0")
+@Experimental
+class TuningSummary private[tuning](
+    @transient val predictions: DataFrame,
+    val params: Array[ParamMap],
+    val metrics: Array[Double],
+    val bestIndex: Int) {
+
+  /** One all-string row per param map: its values sorted by param name, then its metric. */
+  def trainingMetrics: DataFrame = {
+    require(params.nonEmpty, "estimator param maps should not be empty")
+    require(params.length == metrics.length, "estimator param maps number should match metrics")
+    val fields = params(0).toSeq.sortBy(_.param.name).map(_.param.name) ++ Seq("metrics")
+    val schema = new StructType(fields.map(name => StructField(name, StringType)).toArray)
+    val rows = params.zip(metrics).map { case (param, metric) =>
+      Row.fromSeq(param.toSeq.sortBy(_.param.name).map(_.value.toString) :+ metric.toString)
+    }
+    val sqlContext = predictions.sqlContext
+    sqlContext.createDataFrame(sqlContext.sparkContext.parallelize(rows), schema)
+  }
+}
+
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -45,18 +45,19 @@ class TrainValidationSplitSuite
       .addGrid(lr.maxIter, Array(0, 10))
       .build()
     val eval = new BinaryClassificationEvaluator
-    val cv = new TrainValidationSplit()
+    val tv = new TrainValidationSplit()
       .setEstimator(lr)
       .setEstimatorParamMaps(lrParamMaps)
       .setEvaluator(eval)
       .setTrainRatio(0.5)
       .setSeed(42L)
-    val cvModel = cv.fit(dataset)
-    val parent = cvModel.bestModel.parent.asInstanceOf[LogisticRegression]
-    assert(cv.getTrainRatio === 0.5)
+    val tvModel = tv.fit(dataset)
+    val parent = tvModel.bestModel.parent.asInstanceOf[LogisticRegression]
+    assert(tv.getTrainRatio === 0.5)
     assert(parent.getRegParam === 0.001)
     assert(parent.getMaxIter === 10)
-    assert(cvModel.validationMetrics.length === lrParamMaps.length)
+    assert(tvModel.validationMetrics.length === lrParamMaps.length)
+    assert(tvModel.summary.params === lrParamMaps)
   }
 
   test("train validation with linear regression") {