Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
address review comment
  • Loading branch information
kiszk committed Aug 4, 2018
commit 352743f0e0fe18c88c37f67f535fff12351bae06
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,13 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
protected var _capacity = nextPowerOf2(initialCapacity)
protected var _mask = _capacity - 1
protected var _size = 0
protected var _occupied = 0
protected var _growThreshold = (loadFactor * _capacity).toInt
def g: Int = _growThreshold
def o: Int = _occupied
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, please use more descriptive and comprehensible names

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, sorry for putting this. This is used only for my debugging. This should be removed.


protected var _bitset = new BitSet(_capacity)
protected var _bitsetDeleted: BitSet = null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why protected ? Make it private instead.


def getBitSet: BitSet = _bitset

Expand Down Expand Up @@ -122,9 +126,13 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
* Remove an element from the set. If an element does not exists in the set, nothing is done.
*/
def remove(k: T): Unit = {
if (_bitsetDeleted == null) {
_bitsetDeleted = new BitSet(_capacity)
}
val pos = getPos(k)
if (pos != INVALID_POS && _bitset.get(pos)) {
if (pos != INVALID_POS) {
_bitset.unset(pos)
_bitsetDeleted.set(pos)
_size -= 1
}
}
Expand Down Expand Up @@ -152,19 +160,24 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
var delta = 1
while (true) {
if (!_bitset.get(pos)) {
// This is a new key.
_data(pos) = k
_bitset.set(pos)
_size += 1
return pos | NONEXISTENCE_MASK
if (_bitsetDeleted == null || !_bitsetDeleted.get(pos)) {
// This is a new key.
_data(pos) = k
_bitset.set(pos)
if (_bitsetDeleted != null) {
_bitsetDeleted.unset(pos)
}
_size += 1
_occupied += 1
return pos | NONEXISTENCE_MASK
}
} else if (_data(pos) == k) {
// Found an existing key.
return pos
} else {
// quadratic probing with values increase by 1, 2, 3, ...
pos = (pos + delta) & _mask
delta += 1
}
// quadratic probing with values increase by 1, 2, 3, ...
pos = (pos + delta) & _mask
delta += 1
}
throw new RuntimeException("Should never reach here.")
}
Expand All @@ -178,7 +191,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
* to a new position (in the new data array).
*/
def rehashIfNeeded(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) {
if (_size > _growThreshold) {
if (_occupied > _growThreshold) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dont see any value in _occupied - on contrary it can cause very bad behavior if there is a lot of remove's expected.
_size is a better metric to decide to rehash and grow.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When 'remove' is called, '_size' is decremented. But, an entry is not released. This is a motivation to introduce 'occupied'.
I will try to use another implementation without 'remove' while it may introduce some overhead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no explicitly entry here - it is simply unoccupied slots in an array.
The slot is free, it can be used by some other (new) entry when insert is called.

It must be trivial to see how very bad behavior can happen with actual size of set being very small - with a series of add/remove's : resulting in unending growth of the set.

something like this, for example, is enough to cause set to blow to 2B entries:

var i = 0
while (i < Int.MaxValue) {
  set.add(1)
  set.remove(1)
  assert (0 == set.size)
  i += 1
}

Copy link
Contributor

@mridulm mridulm Jul 18, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For accuracy sake - my example snippet above will fail much earlier - due to OpenHashSet. MAX_CAPACITY. Though that is probably not the point anyway :-)

rehash(k, allocateFunc, moveFunc)
}
}
Expand All @@ -191,14 +204,15 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
var delta = 1
while (true) {
if (!_bitset.get(pos)) {
return INVALID_POS
if (_bitsetDeleted == null || !_bitsetDeleted.get(pos)) {
return INVALID_POS
}
} else if (k == _data(pos)) {
return pos
} else {
// quadratic probing with values increase by 1, 2, 3, ...
pos = (pos + delta) & _mask
delta += 1
}
// quadratic probing with values increase by 1, 2, 3, ...
pos = (pos + delta) & _mask
delta += 1
}
throw new RuntimeException("Should never reach here.")
}
Expand All @@ -219,6 +233,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
/** Return the value at the specified position. */
def getValueSafe(pos: Int): T = {
assert(_bitset.get(pos))
assert(_bitsetDeleted == null || !_bitsetDeleted.get(pos))
_data(pos)
}

Expand Down Expand Up @@ -274,6 +289,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
}

_bitset = newBitset
_bitsetDeleted = null
_data = newData
_capacity = newCapacity
_mask = newMask
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,15 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers {
assert(set.contains(1132))
assert(!set.contains(10000))

set.remove(1132)
set.remove(999)
assert(set.size === 3)
assert(set.contains(10))
assert(set.contains(50))
assert(set.contains(999))
assert(!set.contains(1132))
assert(!set.contains(999))
assert(set.contains(1132))
assert(!set.contains(10000))

set.remove(999)
set.remove(1132)
assert(set.size === 2)
assert(set.contains(10))
assert(set.contains(50))
Expand All @@ -106,6 +106,22 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers {
assert(set.contains(1132))
assert(!set.contains(10000))

set.add(999)
assert(set.size === 4)
assert(set.contains(10))
assert(set.contains(50))
assert(set.contains(999))
assert(set.contains(1132))
assert(!set.contains(10000))

set.remove(999)
assert(set.size === 3)
assert(set.contains(10))
assert(set.contains(50))
assert(!set.contains(999))
assert(set.contains(1132))
assert(!set.contains(10000))

set.remove(10000)
assert(set.size === 3)
assert(set.contains(10))
Expand Down Expand Up @@ -159,15 +175,15 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers {
assert(set.contains(1132L))
assert(!set.contains(10000L))

set.remove(1132)
set.remove(999L)
assert(set.size === 3)
assert(set.contains(10L))
assert(set.contains(50L))
assert(set.contains(999L))
assert(!set.contains(1132L))
assert(!set.contains(999L))
assert(set.contains(1132L))
assert(!set.contains(10000L))

set.remove(999L)
set.remove(1132L)
assert(set.size === 2)
assert(set.contains(10L))
assert(set.contains(50L))
Expand All @@ -183,6 +199,22 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers {
assert(set.contains(1132L))
assert(!set.contains(10000L))

set.add(999L)
assert(set.size === 4)
assert(set.contains(10L))
assert(set.contains(50L))
assert(set.contains(999L))
assert(set.contains(1132L))
assert(!set.contains(10000L))

set.remove(999L)
assert(set.size === 3)
assert(set.contains(10L))
assert(set.contains(50L))
assert(!set.contains(999L))
assert(set.contains(1132L))
assert(!set.contains(10000L))

set.remove(10000L)
assert(set.size === 3)
assert(set.contains(10L))
Expand Down Expand Up @@ -352,6 +384,27 @@ class OpenHashSetSuite extends SparkFunSuite with Matchers {
assert(set.capacity > 1000)
}

test("growth with remove") {
val loadFactor = 0.7
val set = new OpenHashSet[Int](8, loadFactor)
for (i <- 1 to 5) {
set.add(i)
assert(set.contains(i))
set.remove(i)
assert(!set.contains(i))
}
assert(set.size == 0)
assert(set.capacity == 8)

// resize should occur
set.add(6)
assert(set.contains(6))
set.remove(6)
assert(!set.contains(6))
assert(set.size == 0)
assert(set.capacity > 8)
}

test("SPARK-18200 Support zero as an initial set size") {
val set = new OpenHashSet[Long](0)
assert(set.size === 0)
Expand Down