Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
559c099
[SPARK-18334] MinHash should use binary hash distance
Nov 7, 2016
517a97b
Remove misleading documentation as requested
Yunni Nov 8, 2016
b546dbd
Add warning for multi-probe in MinHash
Nov 8, 2016
a3cd928
Merge branch 'SPARK-18334-yunn-minhash-bug' of https://github.com/Yun…
Nov 8, 2016
c8243c7
(1) Fix documentation as CR suggested (2) Fix typo in unit test
Nov 9, 2016
6aac8b3
Fix typo in unit test
Nov 9, 2016
9870743
[SPARK-18408] API Improvements for LSH
Nov 14, 2016
0e9250b
(1) Fix description for numHashFunctions (2) Make numEntries in MinHa…
Nov 14, 2016
adbbefe
Add assertion for hashFunction in BucketedRandomProjectionLSHSuite
Nov 14, 2016
c115ed3
Revert AND-amplification for a future PR
Nov 14, 2016
033ae5d
Code Review Comments
Nov 15, 2016
c597f4c
Add unit tests to run on Jenkins.
Nov 16, 2016
d759875
Add unit tests to run on Jenkins.
Nov 16, 2016
596eb06
CR comments
Nov 17, 2016
00d08bf
Merge branch 'master' of https://github.com/apache/spark into SPARK-1…
Nov 17, 2016
3d0810f
Update comments
Nov 17, 2016
257ef19
Add scaladoc for approximately min-wise independence
Yunni Nov 18, 2016
2c264b7
Change documentation reference
Yunni Nov 18, 2016
36ca278
Removing modulo numEntries
Nov 19, 2016
4508393
Merge branch 'SPARK-18408-yunn-api-improvements' of https://github.co…
Nov 19, 2016
939e9d5
Code Review Comments
Nov 22, 2016
8b9403d
Minimize the test cases by directly using artificial models
Nov 22, 2016
f0ebcb7
Code review comments
Nov 22, 2016
e198080
Merge branch 'master' of https://github.com/apache/spark into SPARK-1…
Yunni Nov 28, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add warning for multi-probe in MinHash
  • Loading branch information
Yun Ni committed Nov 8, 2016
commit b546dbd207a04e73bde097f25cae8c927322c2ae
18 changes: 13 additions & 5 deletions mllib/src/main/scala/org/apache/spark/ml/feature/LSH.scala
Original file line number Diff line number Diff line change
Expand Up @@ -99,20 +99,27 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
validateAndTransformSchema(schema)
}

/**
 * Check the prerequisites for a nearest neighbor search. The default implementation does
 * nothing; this method is meant to be overridden in subclasses.
 *
 * @param singleProbe True for using single-probe; false for multi-probe
 */
protected[this] def checkNearestNeighbor(singleProbe: Boolean): Unit = {}

/**
* Given a large dataset and an item, approximately find at most k items which have the closest
* distance to the item. If the [[outputCol]] is missing, the method will transform the data; if
* the [[outputCol]] exists, it will use the [[outputCol]]. This allows caching of the
* transformed data when necessary.
*
* This method implements two ways of fetching k nearest neighbors:
* - Single Probing: Fast, return at most k elements (Probing only one buckets)
* - Multiple Probing: Slow, return exact k elements (Probing multiple buckets close to the key)
* - Single-probe: Fast, return at most k elements (Probing only one bucket)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Probing only one bucket"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

* - Multi-probe: Slow, return exactly k elements (Probing multiple buckets close to the key)
*
* @param dataset the dataset to search for nearest neighbors of the key
* @param key Feature vector representing the item to search for
* @param numNearestNeighbors The maximum number of nearest neighbors
* @param singleProbing True for using Single Probing; false for multiple probing
* @param singleProbe True for using single-probe; false for multi-probe
* @param distCol Output column for storing the distance between each result row and the key
* @return A dataset containing at most k items closest to the key. A distCol is added to show
* the distance between each row and the key.
Expand All @@ -121,9 +128,10 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
dataset: Dataset[_],
key: Vector,
numNearestNeighbors: Int,
singleProbing: Boolean,
singleProbe: Boolean,
distCol: String): Dataset[_] = {
require(numNearestNeighbors > 0, "The number of nearest neighbors cannot be less than 1")
checkNearestNeighbor(singleProbe)
// Get Hash Value of the key
val keyHash = hashFunction(key)
val modelDataset: DataFrame = if (!dataset.columns.contains($(outputCol))) {
Expand All @@ -136,7 +144,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
val hashDistUDF = udf((x: Vector) => hashDistance(x, keyHash), DataTypes.DoubleType)
val hashDistCol = hashDistUDF(col($(outputCol)))

val modelSubset = if (singleProbing) {
val modelSubset = if (singleProbe) {
modelDataset.filter(hashDistCol === 0.0)
} else {
// Compute threshold to get exact k elements.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ class MinHashModel private[ml] (
}
}

/**
 * Logs a warning when multi-probe is requested; a single-probe request does nothing.
 *
 * @param singleProbe True for using single-probe; false for multi-probe
 */
@Since("2.1.0")
override protected[this] def checkNearestNeighbor(singleProbe: Boolean): Unit = {
  if (!singleProbe) {
    log.warn("Multi-probe for MinHash will run brute force nearest neighbor when there " +
      "aren't enough candidates.")
  }
}

@Since("2.1.0")
override def copy(extra: ParamMap): this.type = defaultCopy(extra)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ private[ml] object LSHTest {
* @param dataset the dataset to look for the key
* @param key The key to hash for the item
* @param k The maximum number of items closest to the key
* @param singleProbe True for using single-probe; false for multi-probe
* @tparam T The class type of lsh
* @return A tuple of two doubles, representing precision and recall rate
*/
Expand All @@ -91,22 +92,22 @@ private[ml] object LSHTest {
dataset: Dataset[_],
key: Vector,
k: Int,
singleProbing: Boolean): (Double, Double) = {
singleProbe: Boolean): (Double, Double) = {
val model = lsh.fit(dataset)

// Compute expected
val distUDF = udf((x: Vector) => model.keyDistance(x, key), DataTypes.DoubleType)
val expected = dataset.sort(distUDF(col(model.getInputCol))).limit(k)

// Compute actual
val actual = model.approxNearestNeighbors(dataset, key, k, singleProbing, "distCol")
val actual = model.approxNearestNeighbors(dataset, key, k, singleProbe, "distCol")

assert(actual.schema.sameType(model
.transformSchema(dataset.schema)
.add("distCol", DataTypes.DoubleType))
)

if (!singleProbing) {
if (!singleProbe) {
assert(actual.count() == k)
}

Expand Down