apache · WeichenXu123 · Sep 11, 2017 · Sep 12, 2017 · Sep 14, 2017 · Sep 27, 2017
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala
@@ -82,7 +82,11 @@ private[shared] object SharedParamsCodeGen {
         "all instance weights as 1.0"),
       ParamDesc[String]("solver", "the solver algorithm for optimization", finalFields = false),
       ParamDesc[Int]("aggregationDepth", "suggested depth for treeAggregate (>= 2)", Some("2"),
-        isValid = "ParamValidators.gtEq(2)", isExpertParam = true))
+        isValid = "ParamValidators.gtEq(2)", isExpertParam = true),
+      ParamDesc[Boolean]("collectSubModels", "whether to collect a list of sub-models trained " +
+        "during tuning",
+        Some("false"), isExpertParam = true)
+    )
 
     val code = genSharedParams(params)
     val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala"

diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala
@@ -402,4 +402,21 @@ private[ml] trait HasAggregationDepth extends Params {
   /** @group expertGetParam */
   final def getAggregationDepth: Int = $(aggregationDepth)
 }
+
+/**
+ * Trait for shared param collectSubModels (default: false).
+ */
+private[ml] trait HasCollectSubModels extends Params {
+
+  /**
+   * Param for whether to collect a list of sub-models trained during tuning.
+   * @group expertParam
+   */
+  final val collectSubModels: BooleanParam = new BooleanParam(this, "collectSubModels", "whether to collect a list of sub-models trained during tuning")
+
+  setDefault(collectSubModels, false)
+
+  /** @group expertGetParam */
+  final def getCollectSubModels: Boolean = $(collectSubModels)
+}
 // scalastyle:on
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.ml.tuning
 
+import java.io.IOException
 import java.util.{List => JList}
 
 import scala.collection.JavaConverters._
@@ -31,7 +32,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.evaluation.Evaluator
 import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
-import org.apache.spark.ml.param.shared.HasParallelism
+import org.apache.spark.ml.param.shared.{HasCollectSubModels, HasParallelism}
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.{DataFrame, Dataset}
@@ -67,7 +68,8 @@ private[ml] trait CrossValidatorParams extends ValidatorParams {
 @Since("1.2.0")
 class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
   extends Estimator[CrossValidatorModel]
-  with CrossValidatorParams with HasParallelism with MLWritable with Logging {
+  with CrossValidatorParams with HasParallelism with HasCollectSubModels
+  with MLWritable with Logging {
 
   @Since("1.2.0")
   def this() = this(Identifiable.randomUID("cv"))
@@ -101,6 +103,10 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
   @Since("2.3.0")
   def setParallelism(value: Int): this.type = set(parallelism, value)
 
+  /** @group expertSetParam */
+  @Since("2.3.0")
+  def setCollectSubModels(value: Boolean): this.type = set(collectSubModels, value)
+
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): CrossValidatorModel = {
     val schema = dataset.schema
@@ -117,6 +123,12 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
     instr.logParams(numFolds, seed, parallelism)
     logTuningParams(instr)
 
+    val collectSubModelsParam = $(collectSubModels)
+
+    var subModels: Option[Array[Array[Model[_]]]] = if (collectSubModelsParam) {
+      Some(Array.fill($(numFolds))(Array.fill[Model[_]](epm.length)(null)))
+    } else None
+
     // Compute metrics for each model over each split
     val splits = MLUtils.kFold(dataset.toDF.rdd, $(numFolds), $(seed))
     val metrics = splits.zipWithIndex.map { case ((training, validation), splitIndex) =>
@@ -125,10 +137,14 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
       logDebug(s"Train split $splitIndex with multiple sets of parameters.")
 
       // Fit models in a Future for training in parallel
-      val modelFutures = epm.map { paramMap =>
+      val modelFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) =>
         Future[Model[_]] {
-          val model = est.fit(trainingDataset, paramMap)
-          model.asInstanceOf[Model[_]]
+          val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]]
+
+          if (collectSubModelsParam) {
+            subModels.get(splitIndex)(paramIndex) = model
+          }
+          model
         } (executionContext)
       }
 
@@ -160,7 +176,7 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String)
     logInfo(s"Best cross-validation metric: $bestMetric.")
     val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
     instr.logSuccess(bestModel)
-    copyValues(new CrossValidatorModel(uid, bestModel, metrics).setParent(this))
+    copyValues(new CrossValidatorModel(uid, bestModel, metrics, subModels).setParent(this))
   }
 
   @Since("1.4.0")
@@ -236,12 +252,17 @@ object CrossValidator extends MLReadable[CrossValidator] {
 class CrossValidatorModel private[ml] (
     @Since("1.4.0") override val uid: String,
     @Since("1.2.0") val bestModel: Model[_],
-    @Since("1.5.0") val avgMetrics: Array[Double])
+    @Since("1.5.0") val avgMetrics: Array[Double],
+    @Since("2.3.0") val subModels: Option[Array[Array[Model[_]]]])
   extends Model[CrossValidatorModel] with CrossValidatorParams with MLWritable {
 
   /** A Python-friendly auxiliary constructor. */
   private[ml] def this(uid: String, bestModel: Model[_], avgMetrics: JList[Double]) = {
-    this(uid, bestModel, avgMetrics.asScala.toArray)
+    this(uid, bestModel, avgMetrics.asScala.toArray, null)
+  }
+
+  private[ml] def this(uid: String, bestModel: Model[_], avgMetrics: Array[Double]) = {
+    this(uid, bestModel, avgMetrics, null)
   }
 
   @Since("2.0.0")
@@ -260,17 +281,39 @@ class CrossValidatorModel private[ml] (
     val copied = new CrossValidatorModel(
       uid,
       bestModel.copy(extra).asInstanceOf[Model[_]],
-      avgMetrics.clone())
+      avgMetrics.clone(),
+      CrossValidatorModel.copySubModels(subModels))
     copyValues(copied, extra).setParent(parent)
   }
 
   @Since("1.6.0")
   override def write: MLWriter = new CrossValidatorModel.CrossValidatorModelWriter(this)
+
+  @Since("2.3.0")
+  @throws[IOException]("If the input path already exists but overwrite is not enabled.")
+  def save(path: String, persistSubModels: Boolean): Unit = {
+    write.asInstanceOf[CrossValidatorModel.CrossValidatorModelWriter]
+      .persistSubModels(persistSubModels).save(path)
+  }
 }
 
 @Since("1.6.0")
 object CrossValidatorModel extends MLReadable[CrossValidatorModel] {
 
+  private[CrossValidatorModel] def copySubModels(subModels: Option[Array[Array[Model[_]]]]) = {
+    subModels.map { subModels =>
+      val numFolds = subModels.length
+      val numParamMaps = subModels(0).length
+      val copiedSubModels = Array.fill(numFolds)(Array.fill[Model[_]](numParamMaps)(null))
+      for (i <- 0 until numFolds) {
+        for (j <- 0 until numParamMaps) {
+          copiedSubModels(i)(j) = subModels(i)(j).copy(ParamMap.empty).asInstanceOf[Model[_]]
+        }
+      }
+      copiedSubModels
+    }
+  }
+
   @Since("1.6.0")
   override def read: MLReader[CrossValidatorModel] = new CrossValidatorModelReader
 
@@ -282,12 +325,35 @@ object CrossValidatorModel extends MLReadable[CrossValidatorModel] {
 
     ValidatorParams.validateParams(instance)
 
+    protected var shouldPersistSubModels: Boolean = false
+
+    /**
+     * Set option for persist sub models.
+     */
+    @Since("2.3.0")
+    def persistSubModels(persist: Boolean): this.type = {
+      shouldPersistSubModels = persist
+      this
+    }
+
     override protected def saveImpl(path: String): Unit = {
       import org.json4s.JsonDSL._
-      val extraMetadata = "avgMetrics" -> instance.avgMetrics.toSeq
+      val extraMetadata = ("avgMetrics" -> instance.avgMetrics.toSeq) ~
+        ("shouldPersistSubModels" -> shouldPersistSubModels)
       ValidatorParams.saveImpl(path, instance, sc, Some(extraMetadata))
       val bestModelPath = new Path(path, "bestModel").toString
       instance.bestModel.asInstanceOf[MLWritable].save(bestModelPath)
+      if (shouldPersistSubModels) {
+        require(instance.subModels.isDefined, "Cannot get sub models to persist.")
+        val subModelsPath = new Path(path, "subModels")
+        for (splitIndex <- 0 until instance.getNumFolds) {
+          val splitPath = new Path(subModelsPath, splitIndex.toString)
+          for (paramIndex <- 0 until instance.getEstimatorParamMaps.length) {
+            val modelPath = new Path(splitPath, paramIndex.toString).toString
+            instance.subModels.get(splitIndex)(paramIndex).asInstanceOf[MLWritable].save(modelPath)
+          }
+        }
+      }
     }
   }
 
@@ -301,11 +367,28 @@ object CrossValidatorModel extends MLReadable[CrossValidatorModel] {
 
       val (metadata, estimator, evaluator, estimatorParamMaps) =
         ValidatorParams.loadImpl(path, sc, className)
+      val numFolds = (metadata.params \ "numFolds").extract[Int]
       val bestModelPath = new Path(path, "bestModel").toString
       val bestModel = DefaultParamsReader.loadParamsInstance[Model[_]](bestModelPath, sc)
       val avgMetrics = (metadata.metadata \ "avgMetrics").extract[Seq[Double]].toArray
-
-      val model = new CrossValidatorModel(metadata.uid, bestModel, avgMetrics)
+      val shouldPersistSubModels = (metadata.metadata \ "shouldPersistSubModels").extract[Boolean]
+
+      val subModels: Option[Array[Array[Model[_]]]] = if (shouldPersistSubModels) {
+        val subModelsPath = new Path(path, "subModels")
+        val _subModels = Array.fill(numFolds)(Array.fill[Model[_]](
+          estimatorParamMaps.length)(null))
+        for (splitIndex <- 0 until numFolds) {
+          val splitPath = new Path(subModelsPath, splitIndex.toString)
+          for (paramIndex <- 0 until estimatorParamMaps.length) {
+            val modelPath = new Path(splitPath, paramIndex.toString).toString
+            _subModels(splitIndex)(paramIndex) =
+              DefaultParamsReader.loadParamsInstance(modelPath, sc)
+          }
+        }
+        Some(_subModels)
+      } else None
+
+      val model = new CrossValidatorModel(metadata.uid, bestModel, avgMetrics, subModels)
       model.set(model.estimator, estimator)
         .set(model.evaluator, evaluator)
         .set(model.estimatorParamMaps, estimatorParamMaps)