Commits (27)
85d22c3
Locality Sensitive Hashing (LSH) Python API.
yanboliang Nov 4, 2016
cdeca1c
Fix typos.
yanboliang Nov 4, 2016
66d308b
Merge branch 'spark-18080' of https://github.com/yanboliang/spark int…
Jan 25, 2017
d62a2d0
Merge branch 'master' of https://github.com/apache/spark into spark-1…
Jan 26, 2017
dafc4d1
Changes to fix LSH Python API
Jan 26, 2017
ac1f4f7
Merge branch 'spark-18080' of https://github.com/Yunni/spark into spa…
Yunni Jan 26, 2017
3a21f26
Fix examples and class definition
Yunni Jan 26, 2017
65dab3e
Add python examples and updated the user guide
Jan 26, 2017
3d3bcf0
Fix lint issues
Jan 26, 2017
69dccde
Fix python doc issues
Jan 26, 2017
e7542d0
Fix 'Definition list ends without a blank line'
Jan 26, 2017
5cfc9c5
Fix python unit tests
Jan 26, 2017
ccabbf4
Merge branch 'master' of https://github.com/apache/spark into spark-1…
Feb 7, 2017
2508a2f
Code Review Comments
Feb 8, 2017
2dd6aad
Merge branch 'master' of https://github.com/apache/spark into spark-1…
Feb 8, 2017
8e5468f
Add printing messages for the LSH Scala/Java/Python examples
Feb 8, 2017
6e85e1a
(1) Rename 'keys''values' to 'features''hashes' (2) Printing the ids …
Feb 8, 2017
4bc670c
Fix jenkins build
Feb 9, 2017
b45ec0a
Fix failed jenkins test
Feb 9, 2017
1b70b91
Fix Jenkins test
Feb 9, 2017
b1da01e
Code Review Comments for the LSH examples
Feb 10, 2017
8f1d708
Add alias for similarity join examples
Feb 10, 2017
49edc93
Merge branch 'master' of https://github.com/apache/spark into spark-1…
Feb 14, 2017
c64d50b
Code Review Comments
Feb 14, 2017
5d55752
Code Review Comments: Some minor fixes
Feb 14, 2017
d849c3a
Code Review Comment
Feb 15, 2017
36fd9bc
Merge branch 'master' of https://github.com/apache/spark into spark-1…
Feb 15, 2017
Add alias for similarity join examples
Yun Ni committed Feb 10, 2017
commit 8f1d70819e9ed6d8b8bb8540dcc1ca9747b67cae

examples/src/main/java/org/apache/spark/examples/ml/JavaBucketedRandomProjectionLSHExample.java
@@ -35,6 +35,8 @@
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.functions.*;

Contributor: just import col here and minhash

Contributor Author: Done.

// $example off$

public class JavaBucketedRandomProjectionLSHExample {
@@ -85,7 +87,10 @@ public static void main(String[] args) {
// We could avoid computing hashes by passing in the already-transformed dataset, e.g.
// `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
System.out.println("Approximately joining dfA and dfB on distance smaller than 1.5:");
-model.approxSimilarityJoin(dfA, dfB, 1.5).show();
+model.approxSimilarityJoin(dfA, dfB, 1.5)
+  .select(col("datasetA.id").alias("idA"),
+    col("datasetB.id").alias("idB"),
+    col("distCol").alias("EuclideanDistance")).show();

// Compute the locality sensitive hashes for the input rows, then perform approximate nearest
// neighbor search.

examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashLSHExample.java
@@ -35,6 +35,8 @@
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.functions.*;
// $example off$

public class JavaMinHashLSHExample {
@@ -85,8 +87,9 @@ public static void main(String[] args) {
// `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
System.out.println("Approximately joining dfA and dfB on Jaccard distance smaller than 0.6:");
model.approxSimilarityJoin(dfA, dfB, 0.6)

Contributor: Can we make it be "exactDistance" or "euclideanDistance" and "jaccardSimilarity" here and in all the examples, for random projection and minhash respectively. I think it will be much clearer to the user what distCol represents.

-  .select("datasetA.id", "datasetB.id", "distCol")
-  .show();
+  .select(col("datasetA.id").alias("idA"),
+    col("datasetB.id").alias("idB"),
+    col("distCol").alias("JaccardDistance")).show();

// Compute the locality sensitive hashes for the input rows, then perform approximate nearest
// neighbor search.

examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
@@ -21,6 +21,7 @@
# $example on$
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
+from pyspark.sql.functions import col
# $example off$
from pyspark.sql import SparkSession

@@ -65,7 +66,9 @@
# `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
print("Approximately joining dfA and dfB on Euclidean distance smaller than 1.5:")
model.approxSimilarityJoin(dfA, dfB, 1.5)\
-    .select("datasetA.id", "datasetB.id", "distCol").show()
+    .select(col("datasetA.id").alias("idA"),
+            col("datasetB.id").alias("idB"),
+            col("distCol").alias("EuclideanDistance")).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate nearest
# neighbor search.
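The comment in the hunk above notes that approxSimilarityJoin can reuse already-hashed input. As a hedged sketch (not part of this diff; it reuses the dfA, dfB, and model objects defined earlier in this example file), that variant would look like:

from pyspark.sql.functions import col

# Hash each dataset once; approxSimilarityJoin detects the existing output
# column on a pre-transformed dataset and skips recomputing the hashes.
transformedA = model.transform(dfA)
transformedB = model.transform(dfB)
model.approxSimilarityJoin(transformedA, transformedB, 1.5) \
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("distCol").alias("EuclideanDistance")).show()
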
5 changes: 4 additions & 1 deletion examples/src/main/python/ml/min_hash_lsh_example.py
@@ -21,6 +21,7 @@
# $example on$
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
+from pyspark.sql.functions import col
# $example off$
from pyspark.sql import SparkSession

@@ -62,7 +63,9 @@
# `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
print("Approximately joining dfA and dfB on distance smaller than 0.6:")
model.approxSimilarityJoin(dfA, dfB, 0.6)\
-    .select("datasetA.id", "datasetB.id", "distCol").show()
+    .select(col("datasetA.id").alias("idA"),
+            col("datasetB.id").alias("idB"),
+            col("distCol").alias("JaccardDistance")).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate nearest
# neighbor search.
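The trailing comment mentions the approximate nearest neighbor step, which this hunk does not show. A minimal sketch of that step (the query key below is an illustrative value, not taken from the diff; dfA and model come from the example above):

from pyspark.ml.linalg import Vectors

# Illustrative query point in the same 6-dimensional space as the example data.
key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

# Return the 2 approximate nearest neighbors of `key` in dfA under Jaccard
# distance; the model hashes dfA on the fly if it is not pre-transformed.
print("Approximately searching dfA for 2 nearest neighbors of the key:")
model.approxNearestNeighbors(dfA, key, 2).show()
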
examples/src/main/scala/org/apache/spark/examples/ml/BucketedRandomProjectionLSHExample.scala
@@ -21,6 +21,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.BucketedRandomProjectionLSH
import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.sql.functions._
// $example off$
import org.apache.spark.sql.SparkSession

@@ -67,8 +68,9 @@ object BucketedRandomProjectionLSHExample {
// `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
println("Approximately joining dfA and dfB on Euclidean distance smaller than 1.5:")
model.approxSimilarityJoin(dfA, dfB, 1.5)
-  .select("datasetA.id", "datasetB.id", "distCol")
-  .show()
+  .select(col("datasetA.id").alias("idA"),
+    col("datasetB.id").alias("idB"),
+    col("distCol").alias("EuclideanDistance")).show()

Contributor: We can just pass distCol = EuclideanDistance here, and for approxNearestNeighbors.

We can do this throughout the examples (and obviously for min hash change it to jaccard accordingly).

Contributor Author: Done in 6 places.


// Compute the locality sensitive hashes for the input rows, then perform approximate nearest
// neighbor search.
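In Python terms, the suggestion above amounts to naming the distance column at the source rather than aliasing it afterwards; both methods accept a distCol argument. A sketch, assuming the dfA, dfB, model, and key objects from the examples:

from pyspark.sql.functions import col

# Pass distCol instead of aliasing "distCol" after the join.
model.approxSimilarityJoin(dfA, dfB, 1.5, distCol="EuclideanDistance") \
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("EuclideanDistance")).show()

# approxNearestNeighbors takes the same parameter.
model.approxNearestNeighbors(dfA, key, 2, distCol="EuclideanDistance").show()
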
examples/src/main/scala/org/apache/spark/examples/ml/MinHashLSHExample.scala
@@ -21,6 +21,7 @@ package org.apache.spark.examples.ml
// $example on$
import org.apache.spark.ml.feature.MinHashLSH
import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.sql.functions._

Contributor: just import col here and above

Contributor Author: Done.

// $example off$
import org.apache.spark.sql.SparkSession

@@ -64,8 +65,9 @@ object MinHashLSHExample {
// `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
println("Approximately joining dfA and dfB on Jaccard distance smaller than 0.6:")
model.approxSimilarityJoin(dfA, dfB, 0.6)
-  .select("datasetA.id", "datasetB.id", "distCol")
-  .show()
+  .select(col("datasetA.id").alias("idA"),
+    col("datasetB.id").alias("idB"),
+    col("distCol").alias("JaccardDistance")).show()

Contributor: pass distCol as method parameter instead of alias

Contributor Author: Done.


// Compute the locality sensitive hashes for the input rows, then perform approximate nearest
// neighbor search.
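The same parameter-based form for the MinHash example, again as a Python sketch rather than part of this diff:

from pyspark.sql.functions import col

# distCol passed as a method parameter; the join output then carries the
# final column name directly.
model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance") \
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("JaccardDistance")).show()
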
38 changes: 21 additions & 17 deletions python/pyspark/ml/feature.py
@@ -212,6 +212,7 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
.. seealso:: `Hashing for Similarity Search: A Survey <https://arxiv.org/abs/1408.2927>`_

>>> from pyspark.ml.linalg import Vectors
+>>> from pyspark.sql.functions import col
>>> data = [(0, Vectors.dense([-1.0, -1.0 ]),),
... (1, Vectors.dense([-1.0, 1.0 ]),),
... (2, Vectors.dense([1.0, -1.0 ]),),
@@ -229,14 +230,15 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
>>> df2 = spark.createDataFrame(data2, ["id", "features"])
>>> model.approxNearestNeighbors(df2, Vectors.dense([1.0, 2.0]), 1).collect()
[Row(id=4, features=DenseVector([2.0, 2.0]), hashes=[DenseVector([1.0])], distCol=1.0)]
->>> model.approxSimilarityJoin(df, df2, 3.0).select("datasetA.id",
-...                                                 "datasetB.id",
-...                                                 "distCol").show()
-+---+---+----------------+
-| id| id|         distCol|
-+---+---+----------------+
-|  3|  6|2.23606797749979|
-+---+---+----------------+
+>>> model.approxSimilarityJoin(df, df2, 3.0).select(
+...     col("datasetA.id").alias("idA"),
+...     col("datasetB.id").alias("idB"),
+...     col("distCol").alias("EuclideanDistance")).show()
++---+---+-----------------+
+|idA|idB|EuclideanDistance|
++---+---+-----------------+
+|  3|  6| 2.23606797749979|
++---+---+-----------------+
...
>>> brpPath = temp_path + "/brp"
>>> brp.save(brpPath)
@@ -962,6 +964,7 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
.. seealso:: `Wikipedia on MinHash <https://en.wikipedia.org/wiki/MinHash>`_

>>> from pyspark.ml.linalg import Vectors
+>>> from pyspark.sql.functions import col
>>> data = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
... (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
... (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
@@ -977,15 +980,16 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
>>> key = Vectors.sparse(6, [1, 2], [1.0, 1.0])
>>> model.approxNearestNeighbors(df2, key, 1).collect()
[Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([-163892...
->>> model.approxSimilarityJoin(df, df2, 0.6).select("datasetA.id",
-...                                                 "datasetB.id",
-...                                                 "distCol").show()
-+---+---+-------+
-| id| id|distCol|
-+---+---+-------+
-|  1|  4|    0.5|
-|  0|  5|    0.5|
-+---+---+-------+
+>>> model.approxSimilarityJoin(df, df2, 0.6).select(
+...     col("datasetA.id").alias("idA"),
+...     col("datasetB.id").alias("idB"),
+...     col("distCol").alias("JaccardDistance")).show()
++---+---+---------------+
+|idA|idB|JaccardDistance|
++---+---+---------------+
+|  1|  4|            0.5|
+|  0|  5|            0.5|
++---+---+---------------+
...
>>> mhPath = temp_path + "/mh"
>>> mh.save(mhPath)
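For context, the doctest excerpts above elide the estimator setup between the imports and the approxNearestNeighbors call. A rough sketch of the elided steps, assuming a running SparkSession named spark and the data list defined earlier; the parameter values are illustrative assumptions, not copied from the diff:

>>> from pyspark.ml.feature import MinHashLSH
>>> df = spark.createDataFrame(data, ["id", "features"])   # data as defined above
>>> mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345)
>>> model = mh.fit(df)
>>> model.transform(df).head()  # rows gain the "hashes" column used by the joins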