Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
CLN: str.cat internals
  • Loading branch information
h-vetinari committed Oct 11, 2018
commit 5f8890c1e6aeeda41e9818ce0c46b259efaa7f2f
165 changes: 52 additions & 113 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from pandas.compat import zip
from pandas.core.dtypes.generic import ABCSeries, ABCIndex
from pandas.core.dtypes.missing import isna, notna
from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.common import (
is_bool_dtype,
is_categorical_dtype,
Expand Down Expand Up @@ -36,114 +36,28 @@
_shared_docs = dict()


def _get_array_list(arr, others):
"""
Auxiliary function for :func:`str_cat`

Parameters
----------
arr : ndarray
The left-most ndarray of the concatenation
others : list, ndarray, Series
The rest of the content to concatenate. If list of list-likes,
all elements must be passable to ``np.asarray``.

Returns
-------
list
List of all necessary arrays
"""
from pandas.core.series import Series

if len(others) and isinstance(com.values_from_object(others)[0],
(list, np.ndarray, Series)):
arrays = [arr] + list(others)
else:
arrays = [arr, others]

return [np.asarray(x, dtype=object) for x in arrays]


def str_cat(arr, others=None, sep=None, na_rep=None):
"""
def interleave_sep(all_cols, sep):
    """
    Auxiliary function for :meth:`str.cat`

    Interleaves the separator between the given columns, so that a
    subsequent element-wise ``np.sum`` over the result concatenates each
    row with the separator in between.

    Parameters
    ----------
    all_cols : list of numpy arrays
        List of arrays to be concatenated with sep
    sep : string
        The separator string for concatenating the columns

    Returns
    -------
    list
        The list of arrays interleaved with sep; to be fed to np.sum
    """
    if sep == '':
        # no need to add empty strings
        return all_cols
    # Lay out [col_0, sep, col_1, sep, ..., col_n]: allocate every slot
    # as the separator, then overwrite the even positions with columns.
    result = [sep] * (2 * len(all_cols) - 1)
    result[::2] = all_cols
    return result
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would simply do np.sum(result) here, no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that's reasonable. Refactored the function as necessary



def _na_map(f, arr, na_result=np.nan, dtype=object):
Expand Down Expand Up @@ -2283,6 +2197,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):

if isinstance(others, compat.string_types):
raise ValueError("Did you mean to supply a `sep` keyword?")
if sep is None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not anything you need to change in this PR but if we just overwrite None here it would probably make more sense to change the default function signature to simply be ''

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed - thought about that as well, but didn't know if that's considered an API change.

sep = ''

if isinstance(self._orig, Index):
data = Series(self._orig, index=self._orig)
Expand All @@ -2291,9 +2207,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):

# concatenate Series/Index with itself if no "others"
if others is None:
result = str_cat(data, others=others, sep=sep, na_rep=na_rep)
return self._wrap_result(result,
use_codes=(not self._is_categorical))
data = data.astype(object).values
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this astype needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used it because data may be categorical, and then values is not necessarily a numpy array. Changed to ensure_object which you mentioned below, hope this is better.

mask = isna(data)
if mask.any():
if na_rep is None:
return sep.join(data[~mask])
return sep.join(np.where(mask, na_rep, data))
return sep.join(data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we do a single sep.join, and just have the branches mask the data as needed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


try:
# turn anything in "others" into lists of Series
Expand All @@ -2320,23 +2240,42 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
"'outer'|'inner'|'right'`. The future default will "
"be `join='left'`.", FutureWarning, stacklevel=2)

# align if required
if join is not None:
# if join is None, _get_series_list already aligned indexes
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps as a follow up but if the alignment behavior is going to be scattered across these two methods would probably make more sense as a dedicated private method which can be called rather than having various arguments trigger the behavior in various functions

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You've got a point, and actually that line is not necessary anymore after I introduced the if any(...) condition right below.

Just as a comment about doing things in different places: _get_series_list will transform everything without an index (i.e. lists/ndarray/iterators) into Series which inherit the index of the caller. This is a necessary step to be able to talk about alignment of indexes at all (second step).

join = 'left' if join is None else join

if any(not data.index.equals(x.index) for x in others):
# Need to add keys for uniqueness in case of duplicate columns
others = concat(others, axis=1,
join=(join if join == 'inner' else 'outer'),
keys=range(len(others)))
keys=range(len(others)), copy=False)
data, others = data.align(others, join=join)
others = [others[x] for x in others] # again list of Series

# str_cat discards index
res = str_cat(data, others=others, sep=sep, na_rep=na_rep)
all_cols = [x.astype(object).values for x in [data] + others]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do you need the astype, much prefer ensure_object generally

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

masks = np.array([isna(x) for x in all_cols])
union_mask = np.logical_or.reduce(masks, axis=0)

if na_rep is None and union_mask.any():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment on these cases

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added comments

result = np.empty(len(data), dtype=object)
np.putmask(result, union_mask, np.nan)

not_masked = ~union_mask
all_cols = interleave_sep([x[not_masked] for x in all_cols], sep)

result[not_masked] = np.sum(all_cols, axis=0)
elif na_rep is not None and union_mask.any():
# fill NaNs
all_cols = [np.where(masks[i], na_rep, all_cols[i])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use zip(masks, all_cols)

for i in range(len(all_cols))]
result = np.sum(interleave_sep(all_cols, sep), axis=0)
else: # no NaNs
result = np.sum(interleave_sep(all_cols, sep), axis=0)

if isinstance(self._orig, Index):
res = Index(res, name=self._orig.name)
result = Index(result, name=self._orig.name)
else: # Series
res = Series(res, index=data.index, name=self._orig.name)
return res
result = Series(result, index=data.index, name=self._orig.name)
return result

_shared_docs['str_split'] = ("""
Split strings around given separator/delimiter.
Expand Down
49 changes: 1 addition & 48 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,53 +97,6 @@ def test_iter_object_try_string(self):
assert i == 100
assert s == 'h'

def test_cat(self):
    # Tests for the module-level helper strings.str_cat
    one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_)
    two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_)

    # single array: NaNs are skipped when na_rep is None
    result = strings.str_cat(one)
    exp = 'aabbc'
    assert result == exp

    result = strings.str_cat(one, na_rep='NA')
    exp = 'aabbcNA'
    assert result == exp

    result = strings.str_cat(one, na_rep='-')
    exp = 'aabbc-'
    assert result == exp

    result = strings.str_cat(one, sep='_', na_rep='NA')
    exp = 'a_a_b_b_c_NA'
    assert result == exp

    result = strings.str_cat(two, sep='-')
    exp = 'a-b-d-foo'
    assert result == exp

    # Multiple arrays: element-wise concatenation
    result = strings.str_cat(one, [two], na_rep='NA')
    exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'],
                   dtype=np.object_)
    tm.assert_numpy_array_equal(result, exp)

    result = strings.str_cat(one, two)
    exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_)
    tm.assert_almost_equal(result, exp)

    # error for incorrect lengths
    rgx = 'All arrays must be same length'
    three = Series(['1', '2', '3'])

    with tm.assert_raises_regex(ValueError, rgx):
        strings.str_cat(one, three)

    # error for incorrect type
    rgx = "Must pass arrays containing strings to str_cat"
    with tm.assert_raises_regex(ValueError, rgx):
        strings.str_cat(one, 'three')

@pytest.mark.parametrize('box', [Series, Index])
@pytest.mark.parametrize('other', [None, Series, Index])
def test_str_cat_name(self, box, other):
Expand Down Expand Up @@ -3136,7 +3089,7 @@ def test_method_on_bytes(self):
lhs = Series(np.array(list('abc'), 'S1').astype(object))
rhs = Series(np.array(list('def'), 'S1').astype(object))
if compat.PY3:
pytest.raises(TypeError, lhs.str.cat, rhs)
pytest.raises(TypeError, lhs.str.cat, rhs, sep=',')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this the bytes concat?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

else:
result = lhs.str.cat(rhs)
expected = Series(np.array(
Expand Down