-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-7140][MLLIB] only scan the first 16 entries in Vector.hashCode #5697
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -52,7 +52,7 @@ sealed trait Vector extends Serializable { | |
|
|
||
| override def equals(other: Any): Boolean = { | ||
| other match { | ||
| case v2: Vector => { | ||
| case v2: Vector => | ||
| if (this.size != v2.size) return false | ||
| (this, v2) match { | ||
| case (s1: SparseVector, s2: SparseVector) => | ||
|
|
@@ -63,20 +63,28 @@ sealed trait Vector extends Serializable { | |
| Vectors.equals(0 until d1.size, d1.values, s1.indices, s1.values) | ||
| case (_, _) => util.Arrays.equals(this.toArray, v2.toArray) | ||
| } | ||
| } | ||
| case _ => false | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns a hash code value for the vector. The hash code is based on its size and its nonzeros | ||
| * in the first 16 entries, using a hash algorithm similar to [[java.util.Arrays.hashCode]]. | ||
| */ | ||
| override def hashCode(): Int = { | ||
| var result: Int = size + 31 | ||
| this.foreachActive { case (index, value) => | ||
| // ignore explict 0 for comparison between sparse and dense | ||
| if (value != 0) { | ||
| result = 31 * result + index | ||
| // refer to {@link java.util.Arrays.equals} for hash algorithm | ||
| val bits = java.lang.Double.doubleToLongBits(value) | ||
| result = 31 * result + (bits ^ (bits >>> 32)).toInt | ||
| // This is a reference implementation. It calls return in foreachActive, which is slow. | ||
| // Subclasses should override it with optimized implementation. | ||
| var result: Int = 31 + size | ||
| this.foreachActive { (index, value) => | ||
| if (index < 16) { | ||
| // ignore explicit 0 for comparison between sparse and dense | ||
| if (value != 0) { | ||
| result = 31 * result + index | ||
| val bits = java.lang.Double.doubleToLongBits(value) | ||
| result = 31 * result + (bits ^ (bits >>> 32)).toInt | ||
| } | ||
| } else { | ||
| return result | ||
| } | ||
| } | ||
| result | ||
|
|
@@ -317,7 +325,7 @@ object Vectors { | |
| case SparseVector(n, ids, vs) => vs | ||
| case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) | ||
| } | ||
| val size = values.size | ||
| val size = values.length | ||
|
|
||
| if (p == 1) { | ||
| var sum = 0.0 | ||
|
|
@@ -371,8 +379,8 @@ object Vectors { | |
| val v1Indices = v1.indices | ||
| val v2Values = v2.values | ||
| val v2Indices = v2.indices | ||
| val nnzv1 = v1Indices.size | ||
| val nnzv2 = v2Indices.size | ||
| val nnzv1 = v1Indices.length | ||
| val nnzv2 = v2Indices.length | ||
|
|
||
| var kv1 = 0 | ||
| var kv2 = 0 | ||
|
|
@@ -401,7 +409,7 @@ object Vectors { | |
|
|
||
| case (DenseVector(vv1), DenseVector(vv2)) => | ||
| var kv = 0 | ||
| val sz = vv1.size | ||
| val sz = vv1.length | ||
| while (kv < sz) { | ||
| val score = vv1(kv) - vv2(kv) | ||
| squaredDistance += score * score | ||
|
|
@@ -422,7 +430,7 @@ object Vectors { | |
| var kv2 = 0 | ||
| val indices = v1.indices | ||
| var squaredDistance = 0.0 | ||
| val nnzv1 = indices.size | ||
| val nnzv1 = indices.length | ||
| val nnzv2 = v2.size | ||
| var iv1 = if (nnzv1 > 0) indices(kv1) else -1 | ||
|
|
||
|
|
@@ -451,8 +459,8 @@ object Vectors { | |
| v1Values: Array[Double], | ||
| v2Indices: IndexedSeq[Int], | ||
| v2Values: Array[Double]): Boolean = { | ||
| val v1Size = v1Values.size | ||
| val v2Size = v2Values.size | ||
| val v1Size = v1Values.length | ||
| val v2Size = v2Values.length | ||
| var k1 = 0 | ||
| var k2 = 0 | ||
| var allEqual = true | ||
|
|
@@ -493,14 +501,30 @@ class DenseVector(val values: Array[Double]) extends Vector { | |
|
|
||
| private[spark] override def foreachActive(f: (Int, Double) => Unit) = { | ||
| var i = 0 | ||
| val localValuesSize = values.size | ||
| val localValuesSize = values.length | ||
| val localValues = values | ||
|
|
||
| while (i < localValuesSize) { | ||
| f(i, localValues(i)) | ||
| i += 1 | ||
| } | ||
| } | ||
|
|
||
| override def hashCode(): Int = { | ||
| var result: Int = 31 + size | ||
| var i = 0 | ||
| val end = math.min(values.length, 16) | ||
| while (i < end) { | ||
| val v = values(i) | ||
| if (v != 0.0) { | ||
| result = 31 * result + i | ||
| val bits = java.lang.Double.doubleToLongBits(values(i)) | ||
| result = 31 * result + (bits ^ (bits >>> 32)).toInt | ||
| } | ||
| i += 1 | ||
| } | ||
| result | ||
| } | ||
| } | ||
|
|
||
| object DenseVector { | ||
|
|
@@ -522,8 +546,8 @@ class SparseVector( | |
| val values: Array[Double]) extends Vector { | ||
|
|
||
| require(indices.length == values.length, "Sparse vectors require that the dimension of the" + | ||
| s" indices match the dimension of the values. You provided ${indices.size} indices and " + | ||
| s" ${values.size} values.") | ||
| s" indices match the dimension of the values. You provided ${indices.length} indices and " + | ||
| s" ${values.length} values.") | ||
|
|
||
| override def toString: String = | ||
| "(%s,%s,%s)".format(size, indices.mkString("[", ",", "]"), values.mkString("[", ",", "]")) | ||
|
|
@@ -547,7 +571,7 @@ class SparseVector( | |
|
|
||
| private[spark] override def foreachActive(f: (Int, Double) => Unit) = { | ||
| var i = 0 | ||
| val localValuesSize = values.size | ||
| val localValuesSize = values.length | ||
| val localIndices = indices | ||
| val localValues = values | ||
|
|
||
|
|
@@ -556,6 +580,28 @@ class SparseVector( | |
| i += 1 | ||
| } | ||
| } | ||
|
|
||
| override def hashCode(): Int = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should SparseVector use the first 16 non-zero values instead of the first 16? For sparse data, it would not be surprising for a bunch of vectors to have all-zeros in the first 16 elements.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For dense vectors, it may contains many zeros at the beginning, and then the cost is still high if we look for the first 16 nonzeros. The
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, sounds fine. Curious: Why do DenseVector and SparseVector need to implement the same hash code? I was thinking DenseVectors could stay as they are (the first 16 elements), while SparseVectors could look at the first 16 non-zeros to reduce collisions. But I'm not familiar with this code.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a contract in Java (http://en.wikipedia.org/wiki/Java_hashCode()):
So |
||
| var result: Int = 31 + size | ||
| val end = values.length | ||
| var continue = true | ||
| var k = 0 | ||
| while ((k < end) & continue) { | ||
| val i = indices(k) | ||
| if (i < 16) { | ||
| val v = values(k) | ||
| if (v != 0.0) { | ||
| result = 31 * result + i | ||
| val bits = java.lang.Double.doubleToLongBits(v) | ||
| result = 31 * result + (bits ^ (bits >>> 32)).toInt | ||
| } | ||
| } else { | ||
| continue = false | ||
| } | ||
| k += 1 | ||
| } | ||
| result | ||
| } | ||
| } | ||
|
|
||
| object SparseVector { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is Vector.hashCode overridden, even though it will never be called (since Sparse & Dense Vector both override it)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just a reference implementation for subclasses to optimize.