[SPARK-22397][ML] add multiple columns support to QuantileDiscretizer

## What changes were proposed in this pull request? add multi columns support to QuantileDiscretizer. When calculating the splits, we can either merge together all the probabilities into one array by calculating approxQuantiles on multiple columns at once, or compute approxQuantiles separately for each column. After doing the performance comparision, we found it’s better to calculating approxQuantiles on multiple columns at once. Here is how we measuring the performance time: ``` var duration = 0.0 for (i<- 0 until 10) { val start = System.nanoTime() discretizer.fit(df) val end = System.nanoTime() duration += (end - start) / 1e9 } println(duration/10) ``` Here is the performance test result: |numCols |NumRows | compute each approxQuantiles separately|compute multiple columns approxQuantiles at one time| |--------|----------|--------------------------------|-------------------------------------------| |10 |60 |0.3623195839 |0.1626658607 | |10 |6000 |0.7537239841 |0.3869370046 | |22 |6000 |1.6497598557 |0.4767903059 | |50 |6000 |3.2268305752 |0.7217818396 | ## How was this patch tested? add UT in QuantileDiscretizerSuite to test multi columns supports Author: Huaxin Gao <[email protected]> Closes apache#19715 from huaxingao/spark_22397.
fjh100456 · fjh100456 · Jan 8, 2018 · Dec 23, 2017 · Dec 23, 2017 · Dec 23, 2017
commit 3d8837e59aadd726805371041567ceff375194c0
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.ml._
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol, HasOutputCol}
+import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol, HasInputCols, HasOutputCol, HasOutputCols}
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.Dataset
 import org.apache.spark.sql.types.StructType
@@ -50,10 +50,28 @@ private[feature] trait QuantileDiscretizerBase extends Params
   /** @group getParam */
   def getNumBuckets: Int = getOrDefault(numBuckets)
 
+  /**
+   * Array of number of buckets (quantiles, or categories) into which data points are grouped.
+   * Each value must be greater than or equal to 2
+   *
+   * See also [[handleInvalid]], which can optionally create an additional bucket for NaN values.
+   *
+   * @group param
+   */
+  val numBucketsArray = new IntArrayParam(this, "numBucketsArray", "Array of number of buckets " +
+    "(quantiles, or categories) into which data points are grouped. This is for multiple " +
+    "columns input. If transforming multiple columns and numBucketsArray is not set, but " +
+    "numBuckets is set, then numBuckets will be applied across all columns.",
+    (arrayOfNumBuckets: Array[Int]) => arrayOfNumBuckets.forall(ParamValidators.gtEq(2)))
+
+  /** @group getParam */
+  def getNumBucketsArray: Array[Int] = $(numBucketsArray)
+
   /**
    * Relative error (see documentation for
    * `org.apache.spark.sql.DataFrameStatFunctions.approxQuantile` for description)
    * Must be in the range [0, 1].
+   * Note that in multiple columns case, relative error is applied to all columns.
    * default: 0.001
    * @group param
    */
@@ -68,7 +86,9 @@ private[feature] trait QuantileDiscretizerBase extends Params
   /**
    * Param for how to handle invalid entries. Options are 'skip' (filter out rows with
    * invalid values), 'error' (throw an error), or 'keep' (keep invalid values in a special
-   * additional bucket).
+   * additional bucket). Note that in the multiple columns case, the invalid handling is applied
+   * to all columns. That said for 'error' it will throw an error if any invalids are found in
+   * any column, for 'skip' it will skip rows with any invalids in any columns, etc.
    * Default: "error"
    * @group param
    */
@@ -86,6 +106,11 @@ private[feature] trait QuantileDiscretizerBase extends Params
  * categorical features. The number of bins can be set using the `numBuckets` parameter. It is
  * possible that the number of buckets used will be smaller than this value, for example, if there
  * are too few distinct values of the input to create enough distinct quantiles.
+ * Since 2.3.0, `QuantileDiscretizer` can map multiple columns at once by setting the `inputCols`
+ * parameter. If both of the `inputCol` and `inputCols` parameters are set, an Exception will be
+ * thrown. To specify the number of buckets for each column, the `numBucketsArray` parameter can
+ * be set, or if the number of buckets should be the same across columns, `numBuckets` can be
+ * set as a convenience.
  *
  * NaN handling:
  * null and NaN values will be ignored from the column during `QuantileDiscretizer` fitting. This
@@ -104,7 +129,8 @@ private[feature] trait QuantileDiscretizerBase extends Params
  */
 @Since("1.6.0")
 final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val uid: String)
-  extends Estimator[Bucketizer] with QuantileDiscretizerBase with DefaultParamsWritable {
+  extends Estimator[Bucketizer] with QuantileDiscretizerBase with DefaultParamsWritable
+    with HasInputCols with HasOutputCols {
 
   @Since("1.6.0")
   def this() = this(Identifiable.randomUID("quantileDiscretizer"))
@@ -129,34 +155,96 @@ final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val ui
   @Since("2.1.0")
   def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
 
+  /** @group setParam */
+  @Since("2.3.0")
+  def setNumBucketsArray(value: Array[Int]): this.type = set(numBucketsArray, value)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setOutputCols(value: Array[String]): this.type = set(outputCols, value)
+
+  private[feature] def getInOutCols: (Array[String], Array[String]) = {
+    require((isSet(inputCol) && isSet(outputCol) && !isSet(inputCols) && !isSet(outputCols)) ||
+      (!isSet(inputCol) && !isSet(outputCol) && isSet(inputCols) && isSet(outputCols)),
+      "QuantileDiscretizer only supports setting either inputCol/outputCol or" +
+        "inputCols/outputCols."
+    )
+
+    if (isSet(inputCol)) {
+      (Array($(inputCol)), Array($(outputCol)))
+    } else {
+      require($(inputCols).length == $(outputCols).length,
+        "inputCols number do not match outputCols")
+      ($(inputCols), $(outputCols))
+    }
+  }
+
   @Since("1.6.0")
   override def transformSchema(schema: StructType): StructType = {
-    SchemaUtils.checkNumericType(schema, $(inputCol))
-    val inputFields = schema.fields
-    require(inputFields.forall(_.name != $(outputCol)),
-      s"Output column ${$(outputCol)} already exists.")
-    val attr = NominalAttribute.defaultAttr.withName($(outputCol))
-    val outputFields = inputFields :+ attr.toStructField()
+    val (inputColNames, outputColNames) = getInOutCols
+    val existingFields = schema.fields
+    var outputFields = existingFields
+    inputColNames.zip(outputColNames).foreach { case (inputColName, outputColName) =>
+      SchemaUtils.checkNumericType(schema, inputColName)
+      require(existingFields.forall(_.name != outputColName),
+        s"Output column ${outputColName} already exists.")
+      val attr = NominalAttribute.defaultAttr.withName(outputColName)
+      outputFields :+= attr.toStructField()
+    }
     StructType(outputFields)
   }
 
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): Bucketizer = {
     transformSchema(dataset.schema, logging = true)
-    val splits = dataset.stat.approxQuantile($(inputCol),
-      (0.0 to 1.0 by 1.0/$(numBuckets)).toArray, $(relativeError))
+    val bucketizer = new Bucketizer(uid).setHandleInvalid($(handleInvalid))
+    if (isSet(inputCols)) {
+      val splitsArray = if (isSet(numBucketsArray)) {
+        val probArrayPerCol = $(numBucketsArray).map { numOfBuckets =>
+          (0.0 to 1.0 by 1.0 / numOfBuckets).toArray
+        }
+
+        val probabilityArray = probArrayPerCol.flatten.sorted.distinct
+        val splitsArrayRaw = dataset.stat.approxQuantile($(inputCols),
+          probabilityArray, $(relativeError))
+
+        splitsArrayRaw.zip(probArrayPerCol).map { case (splits, probs) =>
+          val probSet = probs.toSet
+          val idxSet = probabilityArray.zipWithIndex.collect {
+            case (p, idx) if probSet(p) =>
+              idx
+          }.toSet
+          splits.zipWithIndex.collect {
+            case (s, idx) if idxSet(idx) =>
+              s
+          }
+        }
+      } else {
+        dataset.stat.approxQuantile($(inputCols),
+          (0.0 to 1.0 by 1.0 / $(numBuckets)).toArray, $(relativeError))
+      }
+      bucketizer.setSplitsArray(splitsArray.map(getDistinctSplits))
+    } else {
+      val splits = dataset.stat.approxQuantile($(inputCol),
+        (0.0 to 1.0 by 1.0 / $(numBuckets)).toArray, $(relativeError))
+      bucketizer.setSplits(getDistinctSplits(splits))
+    }
+    copyValues(bucketizer.setParent(this))
+  }
+
+  private def getDistinctSplits(splits: Array[Double]): Array[Double] = {
     splits(0) = Double.NegativeInfinity
     splits(splits.length - 1) = Double.PositiveInfinity
-
     val distinctSplits = splits.distinct
     if (splits.length != distinctSplits.length) {
       log.warn(s"Some quantiles were identical. Bucketing to ${distinctSplits.length - 1}" +
         s" buckets as a result.")
     }
-    val bucketizer = new Bucketizer(uid)
-      .setSplits(distinctSplits.sorted)
-      .setHandleInvalid($(handleInvalid))
-    copyValues(bucketizer.setParent(this))
+    distinctSplits.sorted
   }
 
   @Since("1.6.0")