Skip to content

Commit b987319

Browse files
committed
Partly done with adding checks, but blocking on adding checking functionality to Param
1 parent dbc9fb2 commit b987319

File tree

5 files changed

+82
-13
lines changed

5 files changed

+82
-13
lines changed

mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import scala.collection.mutable.ListBuffer
2121

2222
import org.apache.spark.Logging
2323
import org.apache.spark.annotation.{AlphaComponent, DeveloperApi}
24-
import org.apache.spark.ml.param.{Param, ParamMap}
24+
import org.apache.spark.ml.param.{Params, Param, ParamMap}
2525
import org.apache.spark.sql.DataFrame
2626
import org.apache.spark.sql.types.StructType
2727

@@ -86,6 +86,13 @@ class Pipeline extends Estimator[PipelineModel] {
8686
def setStages(value: Array[PipelineStage]): this.type = { set(stages, value); this }
8787
def getStages: Array[PipelineStage] = getOrDefault(stages)
8888

89+
override def validate(paramMap: ParamMap): Unit = {
90+
val map = extractParamMap(paramMap)
91+
getStages.foreach {
92+
case pStage: Params => pStage.validate(map)
93+
}
94+
}
95+
8996
/**
9097
* Fits the pipeline to the input dataset with additional parameters. If a stage is an
9198
* [[Estimator]], its [[Estimator#fit]] method will be called on the input dataset to fit a model.
@@ -140,7 +147,7 @@ class Pipeline extends Estimator[PipelineModel] {
140147
override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = {
141148
val map = extractParamMap(paramMap)
142149
val theStages = map(stages)
143-
require(theStages.toSet.size == theStages.size,
150+
require(theStages.toSet.size == theStages.length,
144151
"Cannot have duplicate components in a pipeline.")
145152
theStages.foldLeft(schema)((cur, stage) => stage.transformSchema(cur, paramMap))
146153
}
@@ -157,6 +164,11 @@ class PipelineModel private[ml] (
157164
private[ml] val stages: Array[Transformer])
158165
extends Model[PipelineModel] with Logging {
159166

167+
override def validate(paramMap: ParamMap): Unit = {
168+
val map = fittingParamMap ++ extractParamMap(paramMap)
169+
stages.foreach(_.validate(map))
170+
}
171+
160172
/**
161173
* Gets the model produced by the input estimator. Throws a NoSuchElementException if the input
162174
* estimator does not exist in the pipeline.
@@ -168,7 +180,7 @@ class PipelineModel private[ml] (
168180
}
169181
if (matched.isEmpty) {
170182
throw new NoSuchElementException(s"Cannot find stage $stage from the pipeline.")
171-
} else if (matched.size > 1) {
183+
} else if (matched.length > 1) {
172184
throw new IllegalStateException(s"Cannot have duplicate estimators in the same pipeline.")
173185
} else {
174186
matched.head.asInstanceOf[M]

mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ import org.apache.spark.sql.types._
3838
@AlphaComponent
3939
class VectorAssembler extends Transformer with HasInputCols with HasOutputCol {
4040

41+
override def validate(paramMap: ParamMap): Unit = { }
42+
4143
/** @group setParam */
4244
def setInputCols(value: Array[String]): this.type = set(inputCols, value)
4345

mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,23 @@ private[ml] trait VectorIndexerParams extends Params with HasInputCol with HasOu
3737
/**
3838
* Threshold for the number of values a categorical feature can take.
3939
* If a feature is found to have > maxCategories values, then it is declared continuous.
40+
* Must be >= 2.
4041
*
4142
* (default = 20)
4243
*/
4344
val maxCategories = new IntParam(this, "maxCategories",
44-
"Threshold for the number of values a categorical feature can take." +
45+
"Threshold for the number of values a categorical feature can take (>= 2)." +
4546
" If a feature is found to have > maxCategories values, then it is declared continuous.")
4647

48+
setDefault(maxCategories -> 20)
49+
4750
/** @group getParam */
4851
def getMaxCategories: Int = getOrDefault(maxCategories)
4952

50-
setDefault(maxCategories -> 20)
53+
override def validate(paramMap: ParamMap): Unit = {
54+
require(getOrDefault(maxCategories) >= 2,
55+
s"VectorIndexer maxCategories must be >= 2, but was ${getOrDefault(maxCategories)}")
56+
}
5157
}
5258

5359
/**

mllib/src/main/scala/org/apache/spark/ml/impl/tree/treeParams.scala

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,13 @@ import org.apache.spark.mllib.tree.loss.{Loss => OldLoss}
3838
private[ml] trait DecisionTreeParams extends PredictorParams {
3939

4040
/**
41-
* Maximum depth of the tree.
41+
* Maximum depth of the tree (>= 0).
4242
* E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
4343
* (default = 5)
4444
* @group param
4545
*/
4646
final val maxDepth: IntParam =
47-
new IntParam(this, "maxDepth", "Maximum depth of the tree." +
47+
new IntParam(this, "maxDepth", "Maximum depth of the tree (>= 0)." +
4848
" E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
4949

5050
/**
@@ -173,6 +173,24 @@ private[ml] trait DecisionTreeParams extends PredictorParams {
173173
/** @group expertGetParam */
174174
final def getCheckpointInterval: Int = getOrDefault(checkpointInterval)
175175

176+
/**
177+
* Same as [[validate()]], but renamed to force concrete classes to explicitly implement
178+
* validation (in case concrete classes have their own parameters).
179+
*/
180+
protected def validateImpl(paramMap: ParamMap): Unit = {
181+
val map = extractParamMap(paramMap)
182+
require(map(maxDepth) >= 0, s"${this.getClass.getSimpleName}" +
183+
s" maxDepth must be >= 0, but was ${map(maxDepth)}")
184+
require(map(maxBins) >= 2, s"${this.getClass.getSimpleName}" +
185+
s" maxBins must be >= 2, but was ${map(maxBins)}")
186+
require(map(minInstancesPerNode) >= 1, s"${this.getClass.getSimpleName}" +
187+
s" minInstancesPerNode must be >= 1, but was ${map(minInstancesPerNode)}")
188+
require(map(maxMemoryInMB) > 0, s"${this.getClass.getSimpleName}" +
189+
s" maxMemoryInMB must be > 0, but was ${map(maxMemoryInMB)}")
190+
require(map(checkpointInterval) >= 1, s"${this.getClass.getSimpleName}" +
191+
s" checkpointInterval must be >= 1, but was ${map(checkpointInterval)}")
192+
}
193+
176194
/** (private[ml]) Create a Strategy instance to use with the old API. */
177195
private[ml] def getOldStrategy(
178196
categoricalFeatures: Map[Int, Int],
@@ -299,12 +317,12 @@ private[ml] object TreeRegressorParams {
299317
private[ml] trait TreeEnsembleParams extends DecisionTreeParams with HasSeed {
300318

301319
/**
302-
* Fraction of the training data used for learning each decision tree.
320+
* Fraction of the training data used for learning each decision tree, in range (0, 1].
303321
* (default = 1.0)
304322
* @group param
305323
*/
306324
final val subsamplingRate: DoubleParam = new DoubleParam(this, "subsamplingRate",
307-
"Fraction of the training data used for learning each decision tree.")
325+
"Fraction of the training data used for learning each decision tree, in range (0, 1].")
308326

309327
setDefault(subsamplingRate -> 1.0)
310328

@@ -321,6 +339,14 @@ private[ml] trait TreeEnsembleParams extends DecisionTreeParams with HasSeed {
321339
/** @group setParam */
322340
def setSeed(value: Long): this.type = set(seed, value)
323341

342+
override protected def validateImpl(paramMap: ParamMap): Unit = {
343+
super.validateImpl(paramMap)
344+
val map = extractParamMap(paramMap)
345+
val rate = map(subsamplingRate)
346+
require(0.0 < rate && rate <= 1.0, s"${this.getClass.getSimpleName}" +
347+
s" subsamplingRate must be in range (0, 1], but was $rate")
348+
}
349+
324350
/**
325351
* Create a Strategy instance to use with the old API.
326352
* NOTE: The caller should set impurity and seed.
@@ -402,6 +428,18 @@ private[ml] trait RandomForestParams extends TreeEnsembleParams {
402428

403429
/** @group getParam */
404430
final def getFeatureSubsetStrategy: String = getOrDefault(featureSubsetStrategy)
431+
432+
override protected def validateImpl(paramMap: ParamMap): Unit = {
433+
super.validateImpl(paramMap)
434+
val map = extractParamMap(paramMap)
435+
require(map(numTrees) >= 1, s"${this.getClass.getSimpleName}" +
436+
s" numTrees must be >= 1, but was ${map(numTrees)}")
437+
require(
438+
RandomForestParams.supportedFeatureSubsetStrategies.contains(map(featureSubsetStrategy)),
439+
s"RandomForestParams was given unrecognized featureSubsetStrategy:" +
440+
s" ${map(featureSubsetStrategy)}. Supported" +
441+
s" options: ${RandomForestParams.supportedFeatureSubsetStrategies.mkString(", ")}")
442+
}
405443
}
406444

407445
private[ml] object RandomForestParams {

mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,26 @@ private[ml] trait CrossValidatorParams extends Params {
6161
def getEvaluator: Evaluator = getOrDefault(evaluator)
6262

6363
/**
64-
* param for number of folds for cross validation
64+
* Param for number of folds for cross validation. Must be >= 2.
65+
* Default: 3
6566
* @group param
6667
*/
67-
val numFolds: IntParam = new IntParam(this, "numFolds", "number of folds for cross validation")
68+
val numFolds: IntParam =
69+
new IntParam(this, "numFolds", "number of folds for cross validation (>= 2)")
6870

6971
/** @group getParam */
7072
def getNumFolds: Int = getOrDefault(numFolds)
7173

7274
setDefault(numFolds -> 3)
75+
76+
override def validate(paramMap: ParamMap): Unit = {
77+
require(getOrDefault(numFolds) >= 2,
78+
s"CrossValidator numFolds must be >= 2, but was ${getOrDefault(numFolds)}")
79+
val map = extractParamMap(paramMap)
80+
getEstimatorParamMaps.foreach { eMap =>
81+
getEstimator.validate(map ++ eMap)
82+
}
83+
}
7384
}
7485

7586
/**
@@ -101,8 +112,8 @@ class CrossValidator extends Estimator[CrossValidatorModel] with CrossValidatorP
101112
val est = map(estimator)
102113
val eval = map(evaluator)
103114
val epm = map(estimatorParamMaps)
104-
val numModels = epm.size
105-
val metrics = new Array[Double](epm.size)
115+
val numModels = epm.length
116+
val metrics = new Array[Double](epm.length)
106117
val splits = MLUtils.kFold(dataset.rdd, map(numFolds), 0)
107118
splits.zipWithIndex.foreach { case ((training, validation), splitIndex) =>
108119
val trainingDataset = sqlCtx.createDataFrame(training, schema).cache()

0 commit comments

Comments
 (0)