SPARK-8309: Support for more than 12M items in OpenHashMap
SlavikBaranov committed Jun 11, 2015
commit 39206563dc84e0d200c605717b69ff7485ea9c54
@@ -223,6 +223,7 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
   */
  private def rehash(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) {
    val newCapacity = _capacity * 2
    require(newCapacity <= (1 << 29), "Can't make capacity bigger than 2^29 elements")
    allocateFunc(newCapacity)
    val newBitset = new BitSet(newCapacity)
    val newData = new Array[T](newCapacity)
@@ -278,7 +279,7 @@ object OpenHashSet {

  val INVALID_POS = -1
  val NONEXISTENCE_MASK = 0x80000000
  val POSITION_MASK = 0xEFFFFFF
  val POSITION_MASK = 0x1FFFFFFF
Member
I think you're right that this is a subtle but important bug, but it looks like the intent is to use all but the top bit. That's 0x7FFFFFFF, not 0x1FFFFFFF. Therefore the max position and size are 2^31 - 1, not 2^29, and that's already the max value of an Int, so I don't think the check is needed. You could check for a negative value, though. Basically it's reusing the sign bit, which would never otherwise be used, since position and size must be positive.
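To make the masking arithmetic concrete, here is a small standalone sketch (not part of the patch; the object and constant names are made up for this demo, only the hex values are quoted from the diff and the comment above). It shows why the original 0xEFFFFFF mask corrupts positions once bit 24 comes into play, and how both the 0x1FFFFFFF value from this commit and the 0x7FFFFFFF value suggested above recover the position intact:

```scala
// Standalone illustration of how the different position masks behave.
object PositionMaskDemo {
  val NONEXISTENCE_MASK = 0x80000000
  val OLD_POSITION_MASK = 0xEFFFFFF   // pre-patch constant: bit 24 is not set in the mask
  val NEW_POSITION_MASK = 0x1FFFFFFF  // value introduced by this commit: the low 29 bits
  val SIGN_BIT_MASK = 0x7FFFFFFF      // suggested alternative: every bit except the sign bit

  def main(args: Array[String]): Unit = {
    // A slot index above 2^24, reachable once the table has grown past ~16M slots.
    val pos = 20000000
    // A position with the "nonexistence" bit set, mimicking how a newly inserted slot is flagged.
    val tagged = pos | NONEXISTENCE_MASK

    println(tagged & OLD_POSITION_MASK) // 3222784  -- bit 24 is dropped, the position is wrong
    println(tagged & NEW_POSITION_MASK) // 20000000 -- recovered correctly
    println(tagged & SIGN_BIT_MASK)     // 20000000 -- recovered correctly
  }
}
```

Assuming the default 0.7 load factor, roughly 12M insertions push a 2^24-slot table past its growth threshold and into a 2^25-slot table, at which point positions with bit 24 set start to appear; that is presumably why the regression test below uses 12M items.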

Contributor Author
It's easy to make it support 2^30 capacity, but supporting 2^31 would require some hacks. In JDK 8 the maximum array size is 2^31 - 1, so we'd need to store the item with hashCode 2^31 - 1 somewhere else, and that additional check would probably affect performance.
As I remember, in JDK 6 the max array size is either 2^31 - 4 or 2^31 - 5, so JDK 6 support would require some additional work.

I see the following possibilities:

  1. Leave the fix as is
  2. Update it to support capacity 2^30
  3. Make it support 2^31 with some hacks
  4. Make it support even larger capacity by splitting value storage into several arrays.

IMO, the second option is the most reasonable, since a 1B max capacity is definitely better than 500M. :)
On the other hand, options 3 & 4 look like overkill: due to the distributed nature of Spark, it's usually not necessary to collect more than a billion items on a single machine, even when working with multi-billion-item datasets.
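For illustration only, here is a rough sketch of what option 2 might look like (this is not the change that was eventually merged; the object, MAX_CAPACITY, and the grow helper are hypothetical names, and the guard is just one way to express the limit):

```scala
// Hypothetical sketch of "option 2": cap the power-of-two capacity at 2^30 and reuse the
// sign bit for NONEXISTENCE_MASK, as suggested in the review comment above.
object CapacityLimitSketch {
  val MAX_CAPACITY = 1 << 30          // largest power-of-two capacity an Int-indexed array allows
  val NONEXISTENCE_MASK = 0x80000000
  val POSITION_MASK = 0x7FFFFFFF      // everything except the reused sign bit

  // Doubling helper with an overflow-safe guard: capacity * 2 overflows to a negative
  // number once capacity reaches 2^30, which the `> 0` check catches.
  def grow(capacity: Int): Int = {
    val newCapacity = capacity * 2
    require(newCapacity > 0 && newCapacity <= MAX_CAPACITY,
      s"Can't grow the hash table past $MAX_CAPACITY slots")
    newCapacity
  }

  def main(args: Array[String]): Unit = {
    println(grow(1 << 29))            // prints 1073741824: growing to 2^30 is still allowed
    // grow(1 << 30)                  // would fail the require: doubling overflows Int
  }
}
```

Capping the capacity at 2^30 keeps doubling within the positive Int range and clear of the JVM array-size limit, while a 0x7FFFFFFF mask leaves the sign bit free for NONEXISTENCE_MASK.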


  /**
   * A set of specialized hash function implementation to avoid boxing hash code computation
@@ -186,4 +186,14 @@ class OpenHashMapSuite extends SparkFunSuite with Matchers {
    map(null) = 0
    assert(map.contains(null))
  }

  test("support for more than 12M items") {
    val cnt = 12000000 // 12M
    val map = new OpenHashMap[Int, Int](cnt)
    for (i <- 0 until cnt) {
      map(i) = 1
    }
    val numInvalidValues = map.iterator.count(_._2 == 0)
    assertResult(0)(numInvalidValues)
  }
}