Update the OpenHashSet comment to make sense (there is no constant EX…

…ISTENCE_MASK), provide default params for rehashIfNeeded, and switch GraphImpl to use it instead of the Java hash set, which is probably what Reynold meant in the CR feedback
apache · holdenk · Feb 27, 2014 · Mar 8, 2014 · Mar 8, 2014 · Mar 8, 2014
commit e055a31ef3c252b4d38513e267031f5b90dca152
diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala
@@ -114,7 +114,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
    * The caller is responsible for calling rehashIfNeeded.
    *
    * Use (retval & POSITION_MASK) to get the actual position, and
-   * (retval & EXISTENCE_MASK) != 0 for prior existence.
+   * (retval & NONEXISTENCE_MASK) == 0 for prior existence.
    *
    * @return The position where the key is placed, plus the highest order bit is set if the key
    *         did not exist previously.
@@ -151,7 +151,8 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
    * @param moveFunc Callback invoked when we move the key from one position (in the old data array)
    *                 to a new position (in the new data array).
    */
-  def rehashIfNeeded(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) {
+  def rehashIfNeeded(k: T, allocateFunc: (Int) => Unit = grow,
+    moveFunc: (Int, Int) => Unit = move) {
     if (_size > _growThreshold) {
       rehash(k, allocateFunc, moveFunc)
     }

diff --git a/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/OpenHashSetSuite.scala
@@ -73,6 +73,9 @@ class OpenHashSetSuite extends FunSuite with ShouldMatchers {
     assert(set.contains(50))
     assert(set.contains(999))
     assert(!set.contains(10000))
+
+    assert(set.addWithoutResize(50) & set.NONEXISTENCE_MASK != 0)
+    assert(set.addWithoutResize(10000) & set.NONEXISTENCE_MASK === 0)
   }
 
   test("primitive long") {

diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala
@@ -17,11 +17,10 @@
 
 package org.apache.spark.graphx.impl
 
-import java.util.HashSet
-
 import scala.reflect.{classTag, ClassTag}
 
 import org.apache.spark.util.collection.PrimitiveVector
+import org.apache.spark.util.collection.OpenHashSet
 import org.apache.spark.{HashPartitioner, Partitioner}
 import org.apache.spark.SparkContext._
 import org.apache.spark.graphx._
@@ -392,8 +391,13 @@ object GraphImpl {
       partitioner: Partitioner): RDD[(VertexId, Int)] = {
     new ShuffledRDD[VertexId, Int, (VertexId, Int)](
       edges.collectVertexIds.mapPartitions { vids =>
-        val present = new HashSet[VertexId]()
-        vids.filter(vid => present.add(vid)).map(vid => (vid, 0))
+        val present = new OpenHashSet[VertexId](vids.size)
+        vids.filter{ vid => 
+          // This is a bit ugly, but we can't just call add here because add returns Unit
+          val isPresent = ((present.addWithoutResize(vid) & OpenHashSet.NONEXISTENCE_MASK) != 0)
+          present.rehashIfNeeded(vid)
+          isPresent
+        }.map(vid => (vid, 0))
       },
       partitioner)
       .setSerializer(classOf[VertexIdMsgSerializer].getName)