SignRandomProjection: LSH Classes for cosine distance metrics

apache · Yunni · Sep 13, 2016 · Sep 13, 2016 · Sep 15, 2016 · Sep 19, 2016
commit fb120afc65fee1badc23d3e502f7196dc1d3c4fe
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SignRandomProjection.scala
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import scala.util.Random
+
+import breeze.linalg.normalize
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT}
+import org.apache.spark.ml.param.shared.HasSeed
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.sql.types.StructType
+
+/**
+ * :: Experimental ::
+ * Model produced by [[SignRandomProjection]]. Every hash function takes the sign of the dot
+ * product between the input key and one of the stored vectors.
+ *
+ * @param randUnitVectors An array of random unit vectors. Each vector represents a hash function.
+ */
+@Experimental
+@Since("2.1.0")
+class SignRandomProjectionModel private[ml] (
+    override val uid: String,
+    val randUnitVectors: Array[Vector])
+  extends LSHModel[SignRandomProjectionModel] {
+
+  /**
+   * Hashes a key into a dense vector with one entry per element of `randUnitVectors`; each entry
+   * is the signum of the dot product between the key and the corresponding vector.
+   */
+  @Since("2.1.0")
+  override protected[this] val hashFunction: (Vector) => Vector = { (key: Vector) =>
+    val signs: Array[Double] = randUnitVectors.map { direction =>
+      math.signum(BLAS.dot(key, direction))
+    }
+    Vectors.dense(signs)
+  }
+
+  /**
+   * Cosine distance between two keys: one minus their dot product divided by the product of
+   * their 2-norms.
+   *
+   * There is no guard against zero-norm input; in that case the denominator is 0 and the result
+   * is not a finite distance.
+   */
+  @Since("2.1.0")
+  override protected[ml] def keyDistance(x: Vector, y: Vector): Double = {
+    val dotProduct = BLAS.dot(x, y)
+    val normProduct = Vectors.norm(x, 2) * Vectors.norm(y, 2)
+    // 1 - cosine similarity
+    1 - dotProduct / normProduct
+  }
+
+  /**
+   * Smallest absolute element-wise difference between two hash vectors, so the result is 0 as
+   * soon as a single position holds the same value in both.
+   */
+  @Since("2.1.0")
+  override protected[ml] def hashDistance(x: Vector, y: Vector): Double = {
+    // Hash vectors are built with Vectors.dense in hashFunction, so toDense is expected to be cheap.
+    val left = x.toDense.values
+    val right = y.toDense.values
+    left.zip(right).map { case (a, b) => math.abs(a - b) }.min
+  }
+}
+
+/**
+ * :: Experimental ::
+ * This [[SignRandomProjection]] implements Locality Sensitive Hashing functions for cosine
+ * distance metrics.
+ *
+ * The input is dense or sparse vectors, each of which represents a point in the space. The output
+ * will be vectors of configurable dimension, taking values from {-1, 1, 0}. Hash value in the same
+ * dimension is calculated by the same hash function.
+ *
+ * References:
+ * Wang, Jingdong et al. "Hashing for similarity search: A survey." arXiv preprint
+ * arXiv:1408.2927 (2014).
+ */
+@Experimental
+@Since("2.1.0")
+class SignRandomProjection(override val uid: String) extends LSH[SignRandomProjectionModel]
+  with HasSeed {
+
+  /** @group setParam */
+  @Since("2.1.0")
+  override def setInputCol(value: String): this.type = super.setInputCol(value)
+
+  /** @group setParam */
+  @Since("2.1.0")
+  override def setOutputCol(value: String): this.type = super.setOutputCol(value)
+
+  /** @group setParam */
+  @Since("2.1.0")
+  override def setOutputDim(value: Int): this.type = super.setOutputDim(value)
+
+  /** Creates an estimator with a freshly generated uid. */
+  @Since("2.1.0")
+  def this() = {
+    // The prefix names this estimator; the previous "random projection" prefix was ambiguous
+    // and contained a space.
+    this(Identifiable.randomUID("signRandomProjection"))
+  }
+
+  /** @group setParam */
+  @Since("2.1.0")
+  def setSeed(value: Long): this.type = set(seed, value)
+
+  /**
+   * Builds a model holding `outputDim` random vectors of length `inputDim`.
+   *
+   * Each vector is filled with Gaussian samples from a generator seeded with `seed` and then
+   * passed through breeze's `normalize`, so the same seed and dimensions give the same model.
+   *
+   * @param inputDim Length of every generated vector.
+   * @return A [[SignRandomProjectionModel]] sharing this estimator's uid.
+   */
+  @Since("2.1.0")
+  override protected[this] def createRawLSHModel(inputDim: Int): SignRandomProjectionModel = {
+    val rand = new Random($(seed))
+    val randUnitVectors: Array[Vector] = {
+      Array.fill($(outputDim)) {
+        val randArray = Array.fill(inputDim)(rand.nextGaussian())
+        Vectors.fromBreeze(normalize(breeze.linalg.Vector(randArray)))
+      }
+    }
+    new SignRandomProjectionModel(uid, randUnitVectors)
+  }
+
+  /**
+   * Checks that the input column has the vector type before delegating to
+   * `validateAndTransformSchema` for the resulting schema.
+   */
+  @Since("2.1.0")
+  override def transformSchema(schema: StructType): StructType = {
+    SchemaUtils.checkColumnType(schema, $(inputCol), new VectorUDT)
+    validateAndTransformSchema(schema)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/SignRandomProjectionSuite.scala
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import breeze.numerics.{cos, sin}
+import breeze.numerics.constants.Pi
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+class SignRandomProjectionSuite extends SparkFunSuite with MLlibTestSparkContext {
+
+  /** DataFrame with a single "keys" column holding every integer grid point in [-5, 5) x [-5, 5). */
+  private def gridKeys() = {
+    val points = (-5 until 5).flatMap { i =>
+      (-5 until 5).map(j => Vectors.dense(i.toDouble, j.toDouble))
+    }
+    spark.createDataFrame(points.map(Tuple1.apply)).toDF("keys")
+  }
+
+  /** Fresh estimator reading "keys", writing "values", with the seed fixed to 0. */
+  private def newEstimator() = new SignRandomProjection()
+    .setInputCol("keys")
+    .setOutputCol("values")
+    .setSeed(0)
+
+  test("SignRandomProjection") {
+    val df = gridKeys()
+    val srp = newEstimator()
+
+    // Both error rates reported by the shared LSH test helper must stay below 10%.
+    val (falsePositive, falseNegative) = LSHTest.calculateLSHProperty(df, srp, 1.6, 0.4)
+    assert(falsePositive < 0.1)
+    assert(falseNegative < 0.1)
+  }
+
+  test("approxNearestNeighbors for cosine distance") {
+    val df = gridKeys()
+    val key = Vectors.dense(1.2, 3.4)
+    val srp = newEstimator()
+
+    val (precision, recall) = LSHTest.calculateApproxNearestNeighbors(srp, df, key, 30,
+      singleProbing = true)
+    assert(precision >= 0.8)
+    assert(recall >= 0.8)
+  }
+
+  test("approxSimilarityJoin for cosine distance") {
+    val dfA = gridKeys()
+
+    // 24 points spaced Pi / 12 apart on a circle of radius 10.
+    val circle = (0 until 24).map { i =>
+      val angle = Pi / 12 * i
+      Vectors.dense(10 * sin(angle), 10 * cos(angle))
+    }
+    val dfB = spark.createDataFrame(circle.map(Tuple1.apply)).toDF("keys")
+
+    val srp = newEstimator()
+
+    val (precision, recall) = LSHTest.calculateApproxSimilarityJoin(srp, dfA, dfB, 0.5)
+    assert(precision == 1.0)
+    assert(recall >= 0.8)
+  }
+}