Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
858f54e
Unify unique/factorize, remove kwargs (perf); enable inverse for unique
h-vetinari Oct 15, 2018
4ed354a
Template over {return_inverse, ignore_na} for perf
h-vetinari Oct 26, 2018
906cd50
Re-add kwargs to method signature
h-vetinari Oct 26, 2018
0d6dad0
Merge remote-tracking branch 'upstream/master' into unique_inverse_cython
h-vetinari Oct 28, 2018
19c7c1f
Fix small oversight
h-vetinari Oct 28, 2018
a8f079f
Simplify an if-condition
h-vetinari Oct 29, 2018
1c5b97a
Reword comment
h-vetinari Oct 29, 2018
d71d68c
Merge remote-tracking branch 'upstream/master' into unique_inverse_cython
h-vetinari Oct 29, 2018
c7327fd
Revert templating over {return_inverse, ignore_na}
h-vetinari Oct 31, 2018
e09f6fe
Merge branch 'master' into unique_inverse_cython
h-vetinari Nov 2, 2018
f93a912
Merge branch 'master' into unique_inverse_cython
h-vetinari Nov 3, 2018
a06494e
Add new kwargs at the end (review jreback)
h-vetinari Nov 3, 2018
799fdfb
Merge branch 'master' into unique_inverse_cython
h-vetinari Nov 6, 2018
0ad2272
Merge remote-tracking branch 'upstream/master' into unique_inverse_cython
h-vetinari Nov 11, 2018
906a2b9
Retrigger CircleCI
h-vetinari Nov 11, 2018
29aecdd
Retrigger CI after flaky hypothesis test
h-vetinari Nov 11, 2018
746c0e3
Retrigger CircleCI
h-vetinari Nov 11, 2018
8da33f4
Retrigger CI after timeout
h-vetinari Nov 11, 2018
ba9d8b8
Retrigger CircleCI
h-vetinari Nov 11, 2018
2423a05
Merge remote-tracking branch 'upstream/master' into unique_inverse_cython
h-vetinari Nov 12, 2018
6079c26
Merge remote-tracking branch 'upstream/master' into unique_inverse_cython
h-vetinari Nov 18, 2018
30de418
Always calculate inverse
h-vetinari Nov 18, 2018
feb32d6
Merge remote-tracking branch 'upstream/master' into unique_inverse_cython
h-vetinari Nov 19, 2018
0b85759
Revert "Always calculate inverse"
h-vetinari Nov 19, 2018
aad4b91
Merge remote-tracking branch 'upstream/master' into unique_inverse_cython
h-vetinari Nov 21, 2018
4451854
Add comments to ignore_na branches (review jreback)
h-vetinari Nov 21, 2018
e6b71b5
Merge remote-tracking branch 'upstream/master' into unique_inverse_cython
h-vetinari Nov 27, 2018
00a304d
Switch signature of hashtable.factorize (review jreback)
h-vetinari Nov 27, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Always calculate inverse
  • Loading branch information
h-vetinari committed Nov 18, 2018
commit 30de418936cda9aa9c26a8cfa4c3a0b42906e2b2
106 changes: 41 additions & 65 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -358,8 +358,7 @@ cdef class {{name}}HashTable(HashTable):
@cython.wraparound(False)
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
object na_value=None, bint ignore_na=False):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -382,15 +381,12 @@ cdef class {{name}}HashTable(HashTable):
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
return_inverse : boolean, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.

Returns
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse=True)
labels : ndarray[int64]
The labels from values to uniques
"""
cdef:
Expand All @@ -402,8 +398,7 @@ cdef class {{name}}HashTable(HashTable):
{{name}}VectorData *ud
bint use_na_value

if return_inverse:
labels = np.empty(n, dtype=np.int64)
labels = np.empty(n, dtype=np.int64)
ud = uniques.data
use_na_value = na_value is not None

Expand Down Expand Up @@ -440,19 +435,15 @@ cdef class {{name}}HashTable(HashTable):
"Vector.resize() needed")
uniques.resize()
append_data_{{dtype}}(ud, val)
if return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
elif return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
else:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = idx

if return_inverse:
return uniques.to_array(), np.asarray(labels)
return uniques.to_array()
return uniques.to_array(), np.asarray(labels)

def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
"""
Expand All @@ -474,8 +465,10 @@ cdef class {{name}}HashTable(HashTable):
The labels from values to uniques
"""
uniques = {{name}}Vector()
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
uniques, inverse = self._unique(values, uniques, ignore_na=False)
if return_inverse:
return uniques, inverse
return uniques

def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
Expand Down Expand Up @@ -507,8 +500,7 @@ cdef class {{name}}HashTable(HashTable):
uniques_vector = {{name}}Vector()
uniques, labels = self._unique(values, uniques_vector,
na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)
na_value=na_value, ignore_na=True)
# factorize has reversed outputs compared to _unique
return labels, uniques

Expand All @@ -517,7 +509,7 @@ cdef class {{name}}HashTable(HashTable):
object na_value=None):
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
ignore_na=True)
return labels

@cython.boundscheck(False)
Expand Down Expand Up @@ -709,8 +701,7 @@ cdef class StringHashTable(HashTable):
@cython.wraparound(False)
def _unique(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
object na_value=None, bint ignore_na=False):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -733,15 +724,12 @@ cdef class StringHashTable(HashTable):
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
return_inverse : boolean, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.

Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse=True)
labels : ndarray[int64]
The labels from values to uniques
"""
cdef:
Expand All @@ -755,8 +743,7 @@ cdef class StringHashTable(HashTable):
khiter_t k
bint use_na_value

if return_inverse:
labels = np.zeros(n, dtype=np.int64)
labels = np.zeros(n, dtype=np.int64)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

Expand Down Expand Up @@ -787,13 +774,11 @@ cdef class StringHashTable(HashTable):
# k hasn't been seen yet
k = kh_put_str(self.table, v, &ret)
uindexer[count] = i
if return_inverse:
self.table.vals[k] = count
labels[i] = <int64_t>count
self.table.vals[k] = count
labels[i] = <int64_t>count
count += 1
elif return_inverse:
else:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = <int64_t>idx

Expand All @@ -803,9 +788,7 @@ cdef class StringHashTable(HashTable):
for i in range(count):
uniques.append(values[uindexer[i]])

if return_inverse:
return uniques.to_array(), np.asarray(labels)
return uniques.to_array()
return uniques.to_array(), np.asarray(labels)

def unique(self, ndarray[object] values, bint return_inverse=False):
"""
Expand All @@ -827,8 +810,10 @@ cdef class StringHashTable(HashTable):
The labels from values to uniques
"""
uniques = ObjectVector()
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
uniques, inverse = self._unique(values, uniques, ignore_na=False)
if return_inverse:
return uniques, inverse
return uniques

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
Expand Down Expand Up @@ -860,8 +845,7 @@ cdef class StringHashTable(HashTable):
uniques_vector = ObjectVector()
uniques, labels = self._unique(values, uniques_vector,
na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)
na_value=na_value, ignore_na=True)
# factorize has reversed outputs compared to _unique
return labels, uniques

Expand All @@ -870,7 +854,7 @@ cdef class StringHashTable(HashTable):
object na_value=None):
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
ignore_na=True)
return labels


Expand Down Expand Up @@ -963,8 +947,7 @@ cdef class PyObjectHashTable(HashTable):
@cython.wraparound(False)
def _unique(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
object na_value=None, bint ignore_na=False):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -987,15 +970,12 @@ cdef class PyObjectHashTable(HashTable):
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
return_inverse : boolean, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.

Returns
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse=True)
labels : ndarray[int64]
The labels from values to uniques
"""
cdef:
Expand All @@ -1006,8 +986,7 @@ cdef class PyObjectHashTable(HashTable):
khiter_t k
bint use_na_value

if return_inverse:
labels = np.empty(n, dtype=np.int64)
labels = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

for i in range(n):
Expand All @@ -1024,19 +1003,15 @@ cdef class PyObjectHashTable(HashTable):
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
if return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
elif return_inverse:
self.table.vals[k] = count
labels[i] = count
count += 1
else:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = idx

if return_inverse:
return uniques.to_array(), np.asarray(labels)
return uniques.to_array()
return uniques.to_array(), np.asarray(labels)

def unique(self, ndarray[object] values, bint return_inverse=False):
"""
Expand All @@ -1058,8 +1033,10 @@ cdef class PyObjectHashTable(HashTable):
The labels from values to uniques
"""
uniques = ObjectVector()
return self._unique(values, uniques, ignore_na=False,
return_inverse=return_inverse)
uniques, inverse = self._unique(values, uniques, ignore_na=False)
if return_inverse:
return uniques, inverse
return uniques

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
Expand Down Expand Up @@ -1091,8 +1068,7 @@ cdef class PyObjectHashTable(HashTable):
uniques_vector = ObjectVector()
uniques, labels = self._unique(values, uniques_vector,
na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)
na_value=na_value, ignore_na=True)
# factorize has reversed outputs compared to _unique
return labels, uniques

Expand All @@ -1101,5 +1077,5 @@ cdef class PyObjectHashTable(HashTable):
object na_value=None):
_, labels = self._unique(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value,
ignore_na=True, return_inverse=True)
ignore_na=True)
return labels