Added VectorSizeHint Transformer in ml.feature.

apache · MrBago · Nov 14, 2017 · Nov 20, 2017 · Nov 20, 2017 · Nov 21, 2017
commit 24cc41792770c7f08481a2bbcb120a119631e5ee
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSizeHint.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSizeHint.scala
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.attribute.AttributeGroup
+import org.apache.spark.ml.linalg.{Vector, VectorUDT}
+import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators}
+import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol}
+import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
+import org.apache.spark.sql.{Column, DataFrame, Dataset}
+import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.types.StructType
+
+/**
+ * :: Experimental ::
+ * A feature transformer that adds vector size information to the metadata of a vector column.
+ *
+ * The transformer rewrites `inputCol` in place: the column keeps its name and gains an
+ * [[AttributeGroup]] whose size equals the `size` param. Depending on `handleInvalid`, rows
+ * whose vector is null or has a different size are rejected with an error (`error`, the
+ * default), filtered out (`skip`), or passed through unchecked (`optimistic`).
+ */
+@Experimental
+@Since("2.3.0")
+class VectorSizeHint @Since("2.3.0") (@Since("2.3.0") override val uid: String)
+  extends Transformer with HasInputCol with HasHandleInvalid with DefaultParamsWritable {
+
+  @Since("2.3.0")
+  def this() = this(Identifiable.randomUID("vectSizeHint"))
+
+  /**
+   * The size of the vectors in `inputCol`. Must be non-negative.
+   *
+   * Declared as an `IntParam` rather than a bare `Param[Int]` so that the value can be
+   * JSON-encoded when the transformer is persisted.
+   *
+   * @group param
+   */
+  @Since("2.3.0")
+  val size: IntParam = new IntParam(
+    this,
+    "size",
+    "Size of vectors in column.",
+    {s: Int => s >= 0})
+
+  /** @group getParam */
+  @Since("2.3.0")
+  def getSize: Int = getOrDefault(size)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setSize(value: Int): this.type = set(size, value)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /**
+   * How to handle invalid vectors in `inputCol`. Invalid vectors are nulls and vectors whose
+   * size differs from `size`. Supported values are listed in
+   * `VectorSizeHint.supportedHandleInvalids`; the default is `error`.
+   *
+   * @group param
+   */
+  @Since("2.3.0")
+  override val handleInvalid: Param[String] = new Param[String](
+    this,
+    "handleInvalid",
+    "How to handle invalid vectors in inputCol. Invalid vectors include nulls and vectors with " +
+      "the wrong size. The options are `skip` (filter out rows with invalid vectors), `error` " +
+      "(throw an error) and `optimistic` (don't check the vector size).",
+    ParamValidators.inArray(VectorSizeHint.supportedHandleInvalids))
+
+  setDefault(handleInvalid, VectorSizeHint.ERROR_INVALID)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
+
+  /**
+   * Attaches the size metadata to `inputCol` and applies the configured invalid-row policy.
+   *
+   * @param dataset input dataset; `inputCol` must be a vector column
+   * @return a DataFrame in which `inputCol` carries an attribute group of size `size`
+   */
+  @Since("2.3.0")
+  override def transform(dataset: Dataset[_]): DataFrame = {
+    val localInputCol = getInputCol
+    val localSize = getSize
+    val localHandleInvalid = getHandleInvalid
+
+    val group = AttributeGroup.fromStructField(dataset.schema(localInputCol))
+    if (localHandleInvalid == VectorSizeHint.OPTIMISTIC_INVALID && group.size == localSize) {
+      // The metadata is already correct and no row check is requested: nothing to do.
+      dataset.toDF
+    } else {
+      val newGroup = if (group.size == localSize) {
+        // Pass along any existing metadata about vector.
+        group
+      } else {
+        new AttributeGroup(localInputCol, localSize)
+      }
+
+      val newCol: Column = localHandleInvalid match {
+        case VectorSizeHint.OPTIMISTIC_INVALID => col(localInputCol)
+        case VectorSizeHint.ERROR_INVALID =>
+          // Fail on the first null or wrongly sized vector.
+          val checkVectorSize = { vector: Vector =>
+            if (vector == null) {
+              throw new VectorSizeHint.InvalidEntryException(s"Got null vector in VectorSizeHint," +
+                s" set `handleInvalid` to 'skip' to filter invalid rows.")
+            }
+            if (vector.size != localSize) {
+              throw new VectorSizeHint.InvalidEntryException(s"VectorSizeHint Expecting a vector " +
+                s"of size $localSize but got ${vector.size}")
+            }
+            vector
+          }
+          udf(checkVectorSize, new VectorUDT)(col(localInputCol))
+        case VectorSizeHint.SKIP_INVALID =>
+          // Map every invalid vector to null; the null rows are filtered out below.
+          val checkVectorSize = { vector: Vector =>
+            if (vector != null && vector.size == localSize) {
+              vector
+            } else {
+              null
+            }
+          }
+          udf(checkVectorSize, new VectorUDT)(col(localInputCol))
+      }
+
+      val res = dataset.withColumn(localInputCol, newCol.as(localInputCol, newGroup.toMetadata))
+      if (localHandleInvalid == VectorSizeHint.SKIP_INVALID) {
+        res.filter(col(localInputCol).isNotNull)
+      } else {
+        res
+      }
+    }
+  }
+
+  /**
+   * Checks that `inputCol` is a vector column and returns the schema unchanged.
+   *
+   * Note that the size metadata is only attached by `transform`; it is not reflected in the
+   * schema returned here.
+   */
+  @Since("2.3.0")
+  override def transformSchema(schema: StructType): StructType = {
+    val inputColType = schema(getInputCol).dataType
+    require(
+      inputColType.isInstanceOf[VectorUDT],
+      s"Input column, $getInputCol must be of Vector type, got $inputColType"
+    )
+    schema
+  }
+
+  /**
+   * Creates a copy of this transformer with the same uid and params, plus `extra`.
+   *
+   * The declared return type must be `VectorSizeHint`: `defaultCopy` instantiates this class,
+   * so declaring any unrelated type makes every call fail with a `ClassCastException`.
+   */
+  @Since("2.3.0")
+  override def copy(extra: ParamMap): VectorSizeHint = defaultCopy(extra)
+}
+
+/**
+ * Companion of the `VectorSizeHint` transformer: holds the accepted `handleInvalid` values,
+ * the exception raised for invalid entries, and the persistence loader.
+ */
+@Experimental
+@Since("2.3.0")
+object VectorSizeHint extends DefaultParamsReadable[VectorSizeHint] {
+
+  // The individual `handleInvalid` modes.
+  private[feature] val OPTIMISTIC_INVALID: String = "optimistic"
+  private[feature] val ERROR_INVALID: String = "error"
+  private[feature] val SKIP_INVALID: String = "skip"
+
+  // Every mode accepted by the `handleInvalid` param validator.
+  private[feature] val supportedHandleInvalids: Array[String] = {
+    Array(OPTIMISTIC_INVALID, ERROR_INVALID, SKIP_INVALID)
+  }
+
+  /** Loads a previously saved `VectorSizeHint` from `path`. */
+  @Since("2.3.0")
+  override def load(path: String): VectorSizeHint = super.load(path)
+
+  /** Thrown by the `error` handling mode when a vector is null or has an unexpected size. */
+  @Since("2.3.0")
+  class InvalidEntryException(msg: String) extends Exception(msg)
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.ml.attribute.AttributeGroup
+import org.apache.spark.ml.linalg.{Vector, Vectors}
+import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.execution.streaming.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+
+/**
+ * Batch tests for `VectorSizeHint`: param validation, size metadata, attribute preservation
+ * and each `handleInvalid` mode.
+ */
+class VectorSizeHintSuite
+  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
+
+  import testImplicits._
+
+  test("Test Param Validators") {
+    assertThrows[IllegalArgumentException] (new VectorSizeHint().setHandleInvalid("invalidValue"))
+    assertThrows[IllegalArgumentException] (new VectorSizeHint().setSize(-3))
+  }
+
+  test("Adding size to column of vectors.") {
+
+    val size = 3
+    val denseVector = Vectors.dense(1, 2, 3)
+    val sparseVector = Vectors.sparse(size, Array(), Array())
+
+    val data = Seq(denseVector, denseVector, sparseVector).map(Tuple1.apply)
+    val dataFrame = data.toDF("vector")
+
+    val transformer = new VectorSizeHint()
+      .setInputCol("vector")
+      .setSize(size)
+      .setHandleInvalid("error")
+    val withSize = transformer.transform(dataFrame)
+    assert(
+      AttributeGroup.fromStructField(withSize.schema("vector")).size == size,
+      "Transformer did not add expected size data.")
+  }
+
+  test("Size hint preserves attributes.") {
+
+    val size = 3
+    val rawData = Seq((1, 2, 3), (2, 3, 3)).toDF("x", "y", "z")
+
+    // VectorAssembler attaches per-feature attributes that the size hint must keep.
+    val assembler = new VectorAssembler()
+      .setInputCols(Array("x", "y", "z"))
+      .setOutputCol("vector")
+    val dataFrameWithMetadata = assembler.transform(rawData)
+    val group = AttributeGroup.fromStructField(dataFrameWithMetadata.schema("vector"))
+
+    val transformer = new VectorSizeHint()
+      .setInputCol("vector")
+      .setSize(size)
+      .setHandleInvalid("error")
+    val withSize = transformer.transform(dataFrameWithMetadata)
+
+    val newGroup = AttributeGroup.fromStructField(withSize.schema("vector"))
+    assert(newGroup.size == size, "Transformer did not add expected size data.")
+    assert(
+      newGroup.attributes.get.deep === group.attributes.get.deep,
+      "SizeHintTransformer did not preserve attributes.")
+  }
+
+  test("Handle invalid does the right thing.") {
+
+    val vector = Vectors.dense(1, 2, 3)
+    val short = Vectors.dense(2)
+    val dataWithNull = Seq(vector, null).map(Tuple1.apply).toDF("vector")
+    val dataWithShort = Seq(vector, short).map(Tuple1.apply).toDF("vector")
+
+    val sizeHint = new VectorSizeHint()
+      .setInputCol("vector")
+      .setHandleInvalid("error")
+      .setSize(3)
+
+    // `error`: the failure surfaces as a SparkException once the job actually runs.
+    assertThrows[SparkException](sizeHint.transform(dataWithNull).collect)
+    assertThrows[SparkException](sizeHint.transform(dataWithShort).collect)
+
+    // `skip`: only the valid row survives.
+    sizeHint.setHandleInvalid("skip")
+    assert(sizeHint.transform(dataWithNull).count() === 1)
+    assert(sizeHint.transform(dataWithShort).count() === 1)
+
+    // `optimistic`: no row is checked, so every row is kept.
+    sizeHint.setHandleInvalid("optimistic")
+    assert(sizeHint.transform(dataWithNull).count() === 2)
+    assert(sizeHint.transform(dataWithShort).count() === 2)
+  }
+}
+
+/**
+ * Streaming test: checks that `VectorAssembler` can assemble vector columns of a streaming
+ * DataFrame once `VectorSizeHint` has been applied to each of them.
+ */
+class VectorSizeHintStreamingSuite extends StreamTest {
+
+  import testImplicits._
+
+  test("Test assemble vectors with size hint in streaming.") {
+    val a = Vectors.dense(0, 1, 2)
+    val b = Vectors.sparse(4, Array(0, 3), Array(3, 6))
+
+    val stream = MemoryStream[(Vector, Vector)]
+    val streamingDF = stream.toDS.toDF("a", "b")
+    val sizeHintA = new VectorSizeHint()
+      .setSize(3)
+      .setInputCol("a")
+    val sizeHintB = new VectorSizeHint()
+      .setSize(4)
+      .setInputCol("b")
+    val vectorAssembler = new VectorAssembler()
+      .setInputCols(Array("a", "b"))
+      .setOutputCol("assembled")
+    // Apply both size hints, then the assembler, in order.
+    val output = Seq(sizeHintA, sizeHintB, vectorAssembler).foldLeft(streamingDF) {
+      case (data, transform) => transform.transform(data)
+    }.select("assembled")
+
+    // `a` (3 values) followed by `b` (4 values, densified).
+    val expected = Vectors.dense(0, 1, 2, 3, 0, 0, 6)
+
+    testStream (output) (
+      AddData(stream, (a, b), (a, b)),
+      CheckAnswerRows(Seq(Row(expected), Row(expected)), false, false)
+    )
+  }
+}