Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
640162f
Fix ASV import error
h-vetinari Oct 3, 2018
31d0dc5
Add return_inverse to hashtable.unique
h-vetinari Sep 27, 2018
c5e5147
Pure copy/paste: Group unique/factorize functions next to each other
h-vetinari Sep 30, 2018
9918d52
Unify hashtable.factorize and .unique
h-vetinari Oct 3, 2018
52ae84e
Force compilation of different code paths
h-vetinari Oct 4, 2018
dbe4e0e
Add separate functions for return_inverse=False
h-vetinari Oct 4, 2018
8481e19
Finish split in _unique_with_inverse and _unique_no_inverse
h-vetinari Oct 4, 2018
27ceb4d
Add cython.wraparound(False)
h-vetinari Oct 4, 2018
f5cd5e9
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 6, 2018
b1705a9
Unmove unique-implementation (review jreback)
h-vetinari Oct 6, 2018
a6ed5dd
Undo line artefacts
h-vetinari Oct 6, 2018
17752ce
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 7, 2018
19eaf32
Clean up test_algos.test_vector_resize
h-vetinari Oct 7, 2018
ce7626f
Add test for hashtable.unique (esp. for return_inverse=True)
h-vetinari Oct 7, 2018
7b9014f
Review (jreback)
h-vetinari Oct 7, 2018
471c4da
Fix typo
h-vetinari Oct 8, 2018
9d45378
Small fixes
h-vetinari Oct 8, 2018
00b2ccb
Review (jorisvandenbossche)
h-vetinari Oct 8, 2018
8687315
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 11, 2018
a267d4a
Review (jorisvandenbossche)
h-vetinari Oct 11, 2018
7f1bb40
Improve comment
h-vetinari Oct 11, 2018
9593992
Test for writable; expand comments
h-vetinari Oct 12, 2018
08d7f50
Simplify factorize test
h-vetinari Oct 12, 2018
d91be98
Add simple test
h-vetinari Oct 12, 2018
e27ec9a
Tiny fixes
h-vetinari Oct 12, 2018
d825be0
Remove idx_duplicated from test (now unnecessary)
h-vetinari Oct 12, 2018
1a342d0
Review (jreback)
h-vetinari Oct 14, 2018
28e0441
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 15, 2018
d65e4fd
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 15, 2018
bca615c
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 17, 2018
3438727
Review (jreback)
h-vetinari Oct 17, 2018
facc111
Merge remote-tracking branch 'upstream/master' into re_factor_ize
h-vetinari Oct 18, 2018
6d0e86b
Retrigger Circle
h-vetinari Oct 18, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Review (jorisvandenbossche)
  • Loading branch information
h-vetinari committed Oct 11, 2018
commit a267d4a2e872a48986b35c771814fdf8617b0792
66 changes: 33 additions & 33 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -357,9 +357,9 @@ cdef class {{name}}HashTable(HashTable):

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_with_inverse(self, const {{dtype}}_t[:] values,
{{name}}Vector uniques, Py_ssize_t count_prior=0,
Py_ssize_t na_sentinel=-1, object na_value=None):
def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Expand Down Expand Up @@ -437,20 +437,20 @@ cdef class {{name}}HashTable(HashTable):
labels[i] = count
count += 1

return uniques.to_array(), np.asarray(labels)
return np.asarray(labels)

def factorize(self, {{dtype}}_t[:] values):
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
uniques = {{name}}Vector()
return self._unique_with_inverse(values, uniques=uniques)
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
return labels, uniques.to_array()

def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
_, labels = self._unique_with_inverse(values, uniques,
count_prior=count_prior,
na_sentinel=na_sentinel,
na_value=na_value)
return labels
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)

@cython.boundscheck(False)
def get_labels_groupby(self, const {{dtype}}_t[:] values):
Expand Down Expand Up @@ -727,9 +727,9 @@ cdef class StringHashTable(HashTable):

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_with_inverse(self, ndarray[object] values,
ObjectVector uniques, Py_ssize_t count_prior=0,
Py_ssize_t na_sentinel=-1, object na_value=None):
def _factorize(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Expand Down Expand Up @@ -806,20 +806,20 @@ cdef class StringHashTable(HashTable):
for i in range(count):
uniques.append(values[uindexer[i]])

return uniques.to_array(), np.asarray(labels)
return np.asarray(labels)

def factorize(self, ndarray[object] values):
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
uniques = ObjectVector()
return self._unique_with_inverse(values, uniques=uniques)
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
return labels, uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
_, labels = self._unique_with_inverse(values, uniques,
count_prior=count_prior,
na_sentinel=na_sentinel,
na_value=na_value)
return labels
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)


cdef class PyObjectHashTable(HashTable):
Expand Down Expand Up @@ -942,9 +942,9 @@ cdef class PyObjectHashTable(HashTable):

@cython.boundscheck(False)
@cython.wraparound(False)
def _unique_with_inverse(self, ndarray[object] values,
ObjectVector uniques, Py_ssize_t count_prior=0,
Py_ssize_t na_sentinel=-1, object na_value=None):
def _factorize(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
"""
Calculate unique values and labels (no sorting); ignores all NA-values

Expand Down Expand Up @@ -1002,17 +1002,17 @@ cdef class PyObjectHashTable(HashTable):
labels[i] = count
count += 1

return uniques.to_array(), np.asarray(labels)
return np.asarray(labels)

def factorize(self, ndarray[object] values):
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
uniques = ObjectVector()
return self._unique_with_inverse(values, uniques=uniques)
labels = self._factorize(values, uniques=uniques,
na_sentinel=na_sentinel, na_value=na_value)
return labels, uniques.to_array()

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None):
_, labels = self._unique_with_inverse(values, uniques,
count_prior=count_prior,
na_sentinel=na_sentinel,
na_value=na_value)
return labels
return self._factorize(values, uniques, count_prior=count_prior,
na_sentinel=na_sentinel, na_value=na_value)
8 changes: 3 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,15 +468,13 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
-------
labels, uniques : ndarray
"""
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)
(hash_klass, _), values = _get_data_algo(values, _hashtables)

table = hash_klass(size_hint or len(values))
uniques = vec_klass()
labels = table.get_labels(values, uniques, 0, na_sentinel,
na_value=na_value)
labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
na_value=na_value)

labels = ensure_platform_int(labels)
uniques = uniques.to_array()
return labels, uniques


Expand Down
54 changes: 52 additions & 2 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1322,14 +1322,64 @@ def test_hashtable_unique(self, htable, tm_dtype):
s.loc[500:502] = [np.nan, None, pd.NaT]

# create duplicated selection
s_duplicated = s.sample(frac=3, replace=True)
s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)

# drop_duplicates has own cython code (hash_table_func_helper.pxi)
# and is tested separately; keeps first occurrence like ht.unique()
# and is tested separately; keeps first occurrence like ht.unique()
expected_unique = s_duplicated.drop_duplicates(keep='first').values
result_unique = htable().unique(s_duplicated.values)
tm.assert_numpy_array_equal(result_unique, expected_unique)

@pytest.mark.parametrize('na_sentinel', [-1])
@pytest.mark.parametrize('htable, tm_dtype', [
(ht.PyObjectHashTable, 'String'),
(ht.StringHashTable, 'String'),
(ht.Float64HashTable, 'Float'),
(ht.Int64HashTable, 'Int'),
(ht.UInt64HashTable, 'UInt')])
def test_hashtable_factorize(self, htable, tm_dtype, na_sentinel):
# output of maker has guaranteed unique elements
maker = getattr(tm, 'make' + tm_dtype + 'Index')
s = Series(maker(1000))
if htable == ht.Float64HashTable:
# add NaN for float column
s.loc[500] = np.nan
elif htable == ht.PyObjectHashTable:
# use different NaN types for object column
s.loc[500:502] = [np.nan, None, pd.NaT]

# create duplicated selection
idx_duplicated = pd.Series(s.index).sample(frac=3, replace=True)
s_duplicated = s[idx_duplicated.values].reset_index(drop=True)
na_mask = s_duplicated.isna().values

result_inverse, result_unique = htable().factorize(s_duplicated.values)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather you not test unrelated things directly here; these are pure unit tests.

It just makes the test confusing. Add it with the other duplicated tests if you must,
though I think these are already well covered; if not, you can add them there.

Copy link
Contributor Author

@h-vetinari h-vetinari Oct 14, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not unrelated - it's the central functionality of factorize, and a bit more thorough than the few hand-made tests further up.

The code for duplicated is completely separate (in cython) and so this should have a separate test as well IMO

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know you really want to show that this works, but these tests need hard-coded expectations. It is very hard for a reader to verify what you are doing, because you are computing the expected results here as well. I think it's pretty easy to cook up an example where you start with a duplicated array and can just assert the results directly. We do lots of these kinds of things already in the drop_duplicates testing.


# drop_duplicates has own cython code (hash_table_func_helper.pxi)
# and is tested separately; keeps first occurrence like ht.unique()
expected_unique = s_duplicated.dropna().drop_duplicates(keep='first')
expected_unique = expected_unique.values
tm.assert_numpy_array_equal(result_unique, expected_unique)

# ignore NaNs for calculating inverse
_, values2unique, unique2values = np.unique(idx_duplicated[~na_mask],
return_inverse=True,
return_index=True)
expected_inverse = np.ones(s_duplicated.shape,
dtype=np.intp) * na_sentinel

# np.unique yields a __SORTED__ list of uniques, and values2unique
# resp. unique2values are relative to this order. To restore the
# original order, we argsort values2unique, because values2unique would
# be ordered if np.unique had not sorted implicitly. The first argsort
# gives the permutation from values2unique to its sorted form, but we
# need the inverse permutation (the map from the unsorted uniques to
# values2unique, from which we can continue with unique2values).
# This inversion (as a permutation) is achieved by the second argsort.
inverse_no_na = np.argsort(np.argsort(values2unique))[unique2values]
expected_inverse[~na_mask] = inverse_no_na
tm.assert_numpy_array_equal(result_inverse, expected_inverse)


def test_quantile():
s = Series(np.random.randn(100))
Expand Down