88 changes: 67 additions & 21 deletions mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -52,7 +52,7 @@ sealed trait Vector extends Serializable {

override def equals(other: Any): Boolean = {
other match {
case v2: Vector => {
case v2: Vector =>
if (this.size != v2.size) return false
(this, v2) match {
case (s1: SparseVector, s2: SparseVector) =>
@@ -63,20 +63,28 @@ sealed trait Vector extends Serializable {
Vectors.equals(0 until d1.size, d1.values, s1.indices, s1.values)
case (_, _) => util.Arrays.equals(this.toArray, v2.toArray)
}
}
case _ => false
}
}

/**
* Returns a hash code value for the vector. The hash code is based on its size and its nonzeros
* in the first 16 entries, using a hash algorithm similar to [[java.util.Arrays.hashCode]].
*/
override def hashCode(): Int = {
Member: Why is Vector.hashCode overridden, even though it will never be called (since SparseVector and DenseVector both override it)?

Contributor Author: This is just a reference implementation for subclasses to optimize.

var result: Int = size + 31
this.foreachActive { case (index, value) =>
// ignore explict 0 for comparison between sparse and dense
if (value != 0) {
result = 31 * result + index
// refer to {@link java.util.Arrays.equals} for hash algorithm
val bits = java.lang.Double.doubleToLongBits(value)
result = 31 * result + (bits ^ (bits >>> 32)).toInt
// This is a reference implementation. It calls return in foreachActive, which is slow.
// Subclasses should override it with optimized implementation.
var result: Int = 31 + size
this.foreachActive { (index, value) =>
if (index < 16) {
// ignore explicit 0 for comparison between sparse and dense
if (value != 0) {
result = 31 * result + index
val bits = java.lang.Double.doubleToLongBits(value)
result = 31 * result + (bits ^ (bits >>> 32)).toInt
}
} else {
return result
}
}
result
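
As the inline comment notes, the trait-level version is only a reference implementation: the `return` inside `foreachActive` compiles to a thrown `scala.runtime.NonLocalReturnControl`, which is why it is slow and why both subclasses override it below. A minimal standalone sketch of the accumulation (not part of the PR; `hash` and `active` are hypothetical names) shows the `java.util.Arrays.hashCode`-style mixing over the nonzeros among the first 16 entries, and why a dense and a sparse encoding of the same vector hash identically:

```scala
// Hypothetical sketch, not from the PR: reproduces the hash accumulation above.
object HashSketch {
  def hash(size: Int, active: Seq[(Int, Double)]): Int = {
    var result = 31 + size
    // Indices are assumed sorted, so filtering i < 16 here matches the
    // early exit at index >= 16 in the real implementations.
    for ((i, v) <- active if i < 16 && v != 0.0) {
      result = 31 * result + i
      val bits = java.lang.Double.doubleToLongBits(v)
      result = 31 * result + (bits ^ (bits >>> 32)).toInt
    }
    result
  }

  def main(args: Array[String]): Unit = {
    val dense  = Seq(0 -> 1.0, 1 -> 0.0, 2 -> 3.0) // all slots active, one explicit zero
    val sparse = Seq(0 -> 1.0, 2 -> 3.0)           // only nonzeros stored
    // Skipping explicit zeros makes both encodings hash identically.
    assert(hash(3, dense) == hash(3, sparse))
  }
}
```
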
@@ -317,7 +325,7 @@ object Vectors {
case SparseVector(n, ids, vs) => vs
case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
}
val size = values.size
val size = values.length

if (p == 1) {
var sum = 0.0
@@ -371,8 +379,8 @@ object Vectors {
val v1Indices = v1.indices
val v2Values = v2.values
val v2Indices = v2.indices
val nnzv1 = v1Indices.size
val nnzv2 = v2Indices.size
val nnzv1 = v1Indices.length
val nnzv2 = v2Indices.length

var kv1 = 0
var kv2 = 0
@@ -401,7 +409,7 @@

case (DenseVector(vv1), DenseVector(vv2)) =>
var kv = 0
val sz = vv1.size
val sz = vv1.length
while (kv < sz) {
val score = vv1(kv) - vv2(kv)
squaredDistance += score * score
@@ -422,7 +430,7 @@
var kv2 = 0
val indices = v1.indices
var squaredDistance = 0.0
val nnzv1 = indices.size
val nnzv1 = indices.length
val nnzv2 = v2.size
var iv1 = if (nnzv1 > 0) indices(kv1) else -1

@@ -451,8 +459,8 @@
v1Values: Array[Double],
v2Indices: IndexedSeq[Int],
v2Values: Array[Double]): Boolean = {
val v1Size = v1Values.size
val v2Size = v2Values.size
val v1Size = v1Values.length
val v2Size = v2Values.length
var k1 = 0
var k2 = 0
var allEqual = true
@@ -493,14 +501,30 @@ class DenseVector(val values: Array[Double]) extends Vector {

private[spark] override def foreachActive(f: (Int, Double) => Unit) = {
var i = 0
val localValuesSize = values.size
val localValuesSize = values.length
val localValues = values

while (i < localValuesSize) {
f(i, localValues(i))
i += 1
}
}

override def hashCode(): Int = {
var result: Int = 31 + size
var i = 0
val end = math.min(values.length, 16)
while (i < end) {
val v = values(i)
if (v != 0.0) {
result = 31 * result + i
val bits = java.lang.Double.doubleToLongBits(values(i))
result = 31 * result + (bits ^ (bits >>> 32)).toInt
}
i += 1
}
result
}
}

object DenseVector {
@@ -522,8 +546,8 @@ class SparseVector(
val values: Array[Double]) extends Vector {

require(indices.length == values.length, "Sparse vectors require that the dimension of the" +
s" indices match the dimension of the values. You provided ${indices.size} indices and " +
s" ${values.size} values.")
s" indices match the dimension of the values. You provided ${indices.length} indices and " +
s" ${values.length} values.")

override def toString: String =
"(%s,%s,%s)".format(size, indices.mkString("[", ",", "]"), values.mkString("[", ",", "]"))
Expand All @@ -547,7 +571,7 @@ class SparseVector(

private[spark] override def foreachActive(f: (Int, Double) => Unit) = {
var i = 0
val localValuesSize = values.size
val localValuesSize = values.length
val localIndices = indices
val localValues = values

@@ -556,6 +580,28 @@
i += 1
}
}

override def hashCode(): Int = {
Member: Should SparseVector use the first 16 non-zero values instead of the first 16 entries? For sparse data, it would not be surprising for a bunch of vectors to have all zeros in the first 16 elements.

Contributor Author: For dense vectors, it may contain many zeros at the beginning, and then the cost is still high if we look for the first 16 nonzeros. The hashCode is not used in any MLlib code; this is just to reduce the overhead of the Pyrolite serializer.

Member: OK, sounds fine. Curious: why do DenseVector and SparseVector need to implement the same hash code? I was thinking DenseVector could stay as it is (the first 16 entries), while SparseVector could look at the first 16 non-zeros to reduce collisions. But I'm not familiar with this code.

Contributor Author: This is a contract in Java (http://en.wikipedia.org/wiki/Java_hashCode()): "The general contract for overridden implementations of this method is that they behave in a way consistent with the same object's equals() method: that a given object must consistently report the same hash value (unless it is changed so that the new version is no longer considered 'equal' to the old), and that two objects which equals() says are equal must report the same hash value." So sv == dv -> sv.## == dv.##.

var result: Int = 31 + size
val end = values.length
var continue = true
var k = 0
while ((k < end) & continue) {
val i = indices(k)
if (i < 16) {
val v = values(k)
if (v != 0.0) {
result = 31 * result + i
val bits = java.lang.Double.doubleToLongBits(v)
result = 31 * result + (bits ^ (bits >>> 32)).toInt
}
} else {
continue = false
}
k += 1
}
result
}
}
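
Following the contract discussion above, a quick usage check (a sketch assuming the equals/hashCode behavior this PR adds; `ContractCheck` is a hypothetical driver): a DenseVector and a SparseVector that compare equal must report the same hash.

```scala
import org.apache.spark.mllib.linalg.Vectors

object ContractCheck {
  def main(args: Array[String]): Unit = {
    val dv = Vectors.dense(1.0, 0.0, 3.0)
    val sv = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))
    assert(dv == sv)                   // equals ignores the explicit zero
    assert(dv.hashCode == sv.hashCode) // sv == dv -> sv.## == dv.##
  }
}
```
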

object SparseVector {