Add warning for multi-probe in MinHash

apache · Yunni · Nov 7, 2016 · Nov 8, 2016 · Nov 8, 2016 · Nov 8, 2016
commit b546dbd207a04e73bde097f25cae8c927322c2ae
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala
@@ -99,20 +99,27 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
     validateAndTransformSchema(schema)
   }
 
+  /**
+   * Check prerequisites for nearest neighbor search. No-op by default; subclasses may override.
+   *
+   * @param singleProbe True for using single-probe; false for multi-probe
+   */
+  protected[this] def checkNearestNeighbor(singleProbe: Boolean): Unit = {}
+
   /**
    * Given a large dataset and an item, approximately find at most k items which have the closest
    * distance to the item. If the [[outputCol]] is missing, the method will transform the data; if
    * the [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the
    * transformed data when necessary.
    *
    * This method implements two ways of fetching k nearest neighbors:
-   *  - Single Probing: Fast, return at most k elements (Probing only one buckets)
-   *  - Multiple Probing: Slow, return exact k elements (Probing multiple buckets close to the key)
+   *  - Single-probe: Fast, return at most k elements (Probing only one bucket)
+   *  - Multi-probe: Slow, return exactly k elements (Probing multiple buckets close to the key)
    *
    * @param dataset the dataset to search for nearest neighbors of the key
    * @param key Feature vector representing the item to search for
    * @param numNearestNeighbors The maximum number of nearest neighbors
-   * @param singleProbing True for using Single Probing; false for multiple probing
+   * @param singleProbe True for using single-probe; false for multi-probe
    * @param distCol Output column for storing the distance between each result row and the key
    * @return A dataset containing at most k items closest to the key. A distCol is added to show
    *         the distance between each row and the key.
@@ -121,9 +128,10 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
       dataset: Dataset[_],
       key: Vector,
       numNearestNeighbors: Int,
-      singleProbing: Boolean,
+      singleProbe: Boolean,
       distCol: String): Dataset[_] = {
     require(numNearestNeighbors > 0, "The number of nearest neighbors cannot be less than 1")
+    checkNearestNeighbor(singleProbe)
     // Get Hash Value of the key
     val keyHash = hashFunction(key)
     val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) {
@@ -136,7 +144,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
     val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType)
     val hashDistCol = hashDistUDF(col($(outputCol)))
 
-    val modelSubset = if (singleProbing) {
+    val modelSubset = if (singleProbe) {
       modelDataset.filter(hashDistCol === 0.0)
     } else {
       // Compute threshold to get exact k elements.

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHash.scala
@@ -83,6 +83,14 @@ class MinHashModel private[ml] (
     }
   }
 
+  @Since("2.1.0")
+  override protected[this] def checkNearestNeighbor(singleProbe: Boolean): Unit = {
+    if (!singleProbe) { // warn only: this check never rejects the request
+      log.warn("Multi-probe for MinHash will run brute force nearest neighbor when there " +
+        "aren't enough candidates.")
+    }
+  }
+
   @Since("2.1.0")
   override def copy(extra: ParamMap): this.type = defaultCopy(extra)
 

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LSHTest.scala
@@ -83,6 +83,7 @@ private[ml] object LSHTest {
    * @param dataset the dataset to look for the key
    * @param key The key to hash for the item
    * @param k The maximum number of items closest to the key
+   * @param singleProbe True for using single-probe; false for multi-probe
    * @tparam T The class type of lsh
    * @return A tuple of two doubles, representing precision and recall rate
    */
@@ -91,22 +92,22 @@ private[ml] object LSHTest {
       dataset: Dataset[_],
       key: Vector,
       k: Int,
-      singleProbing: Boolean): (Double, Double) = {
+      singleProbe: Boolean): (Double, Double) = {
     val model = lsh.fit(dataset)
 
     // Compute expected
     val distUDF = udf((x: Vector) => model.keyDistance(x, key), DataTypes.DoubleType)
     val expected = dataset.sort(distUDF(col(model.getInputCol))).limit(k)
 
     // Compute actual
-    val actual = model.approxNearestNeighbors(dataset, key, k, singleProbing, "distCol")
+    val actual = model.approxNearestNeighbors(dataset, key, k, singleProbe, "distCol")
 
     assert(actual.schema.sameType(model
       .transformSchema(dataset.schema)
       .add("distCol", DataTypes.DoubleType))
     )
 
-    if (!singleProbing) {
+    if (!singleProbe) {
       assert(actual.count() == k)
     }