-
-
Notifications
You must be signed in to change notification settings - Fork 19.2k
CLN/ERR: str.cat internals #22725
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CLN/ERR: str.cat internals #22725
Changes from 1 commit
5f8890c
285a1f7
28e7859
807f18e
ed27c66
0d3c6d2
36c6240
a97fe67
e58ec9d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,7 +3,7 @@ | |
|
|
||
| from pandas.compat import zip | ||
| from pandas.core.dtypes.generic import ABCSeries, ABCIndex | ||
| from pandas.core.dtypes.missing import isna, notna | ||
| from pandas.core.dtypes.missing import isna | ||
| from pandas.core.dtypes.common import ( | ||
| is_bool_dtype, | ||
| is_categorical_dtype, | ||
|
|
@@ -36,114 +36,28 @@ | |
| _shared_docs = dict() | ||
|
|
||
|
|
||
| def _get_array_list(arr, others): | ||
| """ | ||
| Auxiliary function for :func:`str_cat` | ||
|
|
||
| Parameters | ||
| ---------- | ||
| arr : ndarray | ||
| The left-most ndarray of the concatenation | ||
| others : list, ndarray, Series | ||
| The rest of the content to concatenate. If list of list-likes, | ||
| all elements must be passable to ``np.asarray``. | ||
|
|
||
| Returns | ||
| ------- | ||
| list | ||
| List of all necessary arrays | ||
| """ | ||
| from pandas.core.series import Series | ||
|
|
||
| if len(others) and isinstance(com.values_from_object(others)[0], | ||
| (list, np.ndarray, Series)): | ||
| arrays = [arr] + list(others) | ||
| else: | ||
| arrays = [arr, others] | ||
|
|
||
| return [np.asarray(x, dtype=object) for x in arrays] | ||
|
|
||
|
|
||
| def str_cat(arr, others=None, sep=None, na_rep=None): | ||
| """ | ||
| def interleave_sep(all_cols, sep): | ||
| ''' | ||
|
||
| Auxiliary function for :meth:`str.cat` | ||
|
|
||
| If `others` is specified, this function concatenates the Series/Index | ||
| and elements of `others` element-wise. | ||
| If `others` is not being passed then all values in the Series are | ||
| concatenated in a single string with a given `sep`. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| others : list-like, or list of list-likes, optional | ||
| List-likes (or a list of them) of the same length as calling object. | ||
| If None, returns str concatenating strings of the Series. | ||
| sep : string or None, default None | ||
| If None, concatenates without any separator. | ||
| na_rep : string or None, default None | ||
| If None, NA in the series are ignored. | ||
| all_cols : list of numpy arrays | ||
| List of arrays to be concatenated with sep | ||
| sep : string | ||
| The separator string for concatenating the columns | ||
|
|
||
| Returns | ||
| ------- | ||
| concat | ||
| ndarray containing concatenated results (if `others is not None`) | ||
| or str (if `others is None`) | ||
| """ | ||
| if sep is None: | ||
| sep = '' | ||
|
|
||
| if others is not None: | ||
| arrays = _get_array_list(arr, others) | ||
|
|
||
| n = _length_check(arrays) | ||
| masks = np.array([isna(x) for x in arrays]) | ||
| cats = None | ||
|
|
||
| if na_rep is None: | ||
| na_mask = np.logical_or.reduce(masks, axis=0) | ||
|
|
||
| result = np.empty(n, dtype=object) | ||
| np.putmask(result, na_mask, np.nan) | ||
|
|
||
| notmask = ~na_mask | ||
|
|
||
| tuples = zip(*[x[notmask] for x in arrays]) | ||
| cats = [sep.join(tup) for tup in tuples] | ||
|
|
||
| result[notmask] = cats | ||
| else: | ||
| for i, x in enumerate(arrays): | ||
| x = np.where(masks[i], na_rep, x) | ||
| if cats is None: | ||
| cats = x | ||
| else: | ||
| cats = cats + sep + x | ||
|
|
||
| result = cats | ||
|
|
||
| return result | ||
| else: | ||
| arr = np.asarray(arr, dtype=object) | ||
| mask = isna(arr) | ||
| if na_rep is None and mask.any(): | ||
| if sep == '': | ||
| na_rep = '' | ||
| else: | ||
| return sep.join(arr[notna(arr)]) | ||
| return sep.join(np.where(mask, na_rep, arr)) | ||
|
|
||
|
|
||
| def _length_check(others): | ||
| n = None | ||
| for x in others: | ||
| try: | ||
| if n is None: | ||
| n = len(x) | ||
| elif len(x) != n: | ||
| raise ValueError('All arrays must be same length') | ||
| except TypeError: | ||
| raise ValueError('Must pass arrays containing strings to str_cat') | ||
| return n | ||
| list | ||
| The list of arrays interleaved with sep; to be fed to np.sum | ||
| ''' | ||
| if sep == '': | ||
| # no need to add empty strings | ||
| return all_cols | ||
| result = [sep] * (2 * len(all_cols) - 1) | ||
| result[::2] = all_cols | ||
| return result | ||
|
||
|
|
||
|
|
||
| def _na_map(f, arr, na_result=np.nan, dtype=object): | ||
|
|
@@ -2283,6 +2197,8 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): | |
|
|
||
| if isinstance(others, compat.string_types): | ||
| raise ValueError("Did you mean to supply a `sep` keyword?") | ||
| if sep is None: | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not anything you need to change in this PR but if we just overwrite There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed - thought about that as well, but didn't know if that's considered an API change. |
||
| sep = '' | ||
|
|
||
| if isinstance(self._orig, Index): | ||
| data = Series(self._orig, index=self._orig) | ||
|
|
@@ -2291,9 +2207,13 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): | |
|
|
||
| # concatenate Series/Index with itself if no "others" | ||
| if others is None: | ||
| result = str_cat(data, others=others, sep=sep, na_rep=na_rep) | ||
| return self._wrap_result(result, | ||
| use_codes=(not self._is_categorical)) | ||
| data = data.astype(object).values | ||
|
||
| mask = isna(data) | ||
| if mask.any(): | ||
| if na_rep is None: | ||
| return sep.join(data[~mask]) | ||
| return sep.join(np.where(mask, na_rep, data)) | ||
| return sep.join(data) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we do a single sep.join, and just have the branches mask the data as needed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
|
||
| try: | ||
| # turn anything in "others" into lists of Series | ||
|
|
@@ -2320,23 +2240,42 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): | |
| "'outer'|'inner'|'right'`. The future default will " | ||
| "be `join='left'`.", FutureWarning, stacklevel=2) | ||
|
|
||
| # align if required | ||
| if join is not None: | ||
| # if join is None, _get_series_list already aligned indexes | ||
|
||
| join = 'left' if join is None else join | ||
|
|
||
| if any(not data.index.equals(x.index) for x in others): | ||
| # Need to add keys for uniqueness in case of duplicate columns | ||
| others = concat(others, axis=1, | ||
| join=(join if join == 'inner' else 'outer'), | ||
| keys=range(len(others))) | ||
| keys=range(len(others)), copy=False) | ||
| data, others = data.align(others, join=join) | ||
| others = [others[x] for x in others] # again list of Series | ||
|
|
||
| # str_cat discards index | ||
| res = str_cat(data, others=others, sep=sep, na_rep=na_rep) | ||
| all_cols = [x.astype(object).values for x in [data] + others] | ||
|
||
| masks = np.array([isna(x) for x in all_cols]) | ||
| union_mask = np.logical_or.reduce(masks, axis=0) | ||
|
|
||
| if na_rep is None and union_mask.any(): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comment on these cases There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added comments |
||
| result = np.empty(len(data), dtype=object) | ||
| np.putmask(result, union_mask, np.nan) | ||
|
|
||
| not_masked = ~union_mask | ||
| all_cols = interleave_sep([x[not_masked] for x in all_cols], sep) | ||
|
|
||
| result[not_masked] = np.sum(all_cols, axis=0) | ||
| elif na_rep is not None and union_mask.any(): | ||
| # fill NaNs | ||
| all_cols = [np.where(masks[i], na_rep, all_cols[i]) | ||
|
||
| for i in range(len(all_cols))] | ||
| result = np.sum(interleave_sep(all_cols, sep), axis=0) | ||
| else: # no NaNs | ||
| result = np.sum(interleave_sep(all_cols, sep), axis=0) | ||
|
|
||
| if isinstance(self._orig, Index): | ||
| res = Index(res, name=self._orig.name) | ||
| result = Index(result, name=self._orig.name) | ||
| else: # Series | ||
| res = Series(res, index=data.index, name=self._orig.name) | ||
| return res | ||
| result = Series(result, index=data.index, name=self._orig.name) | ||
| return result | ||
|
|
||
| _shared_docs['str_split'] = (""" | ||
| Split strings around given separator/delimiter. | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -97,53 +97,6 @@ def test_iter_object_try_string(self): | |||
| assert i == 100 | ||||
| assert s == 'h' | ||||
|
|
||||
| def test_cat(self): | ||||
| one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_) | ||||
| two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_) | ||||
|
|
||||
| # single array | ||||
| result = strings.str_cat(one) | ||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Please have a look here - this is directly importing the internal method and testing it (not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||||
| exp = 'aabbc' | ||||
| assert result == exp | ||||
|
|
||||
| result = strings.str_cat(one, na_rep='NA') | ||||
| exp = 'aabbcNA' | ||||
| assert result == exp | ||||
|
|
||||
| result = strings.str_cat(one, na_rep='-') | ||||
| exp = 'aabbc-' | ||||
| assert result == exp | ||||
|
|
||||
| result = strings.str_cat(one, sep='_', na_rep='NA') | ||||
| exp = 'a_a_b_b_c_NA' | ||||
| assert result == exp | ||||
|
|
||||
| result = strings.str_cat(two, sep='-') | ||||
| exp = 'a-b-d-foo' | ||||
| assert result == exp | ||||
|
|
||||
| # Multiple arrays | ||||
| result = strings.str_cat(one, [two], na_rep='NA') | ||||
| exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'], | ||||
| dtype=np.object_) | ||||
| tm.assert_numpy_array_equal(result, exp) | ||||
|
|
||||
| result = strings.str_cat(one, two) | ||||
| exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) | ||||
| tm.assert_almost_equal(result, exp) | ||||
|
|
||||
| # error for incorrect lengths | ||||
| rgx = 'All arrays must be same length' | ||||
| three = Series(['1', '2', '3']) | ||||
|
|
||||
| with tm.assert_raises_regex(ValueError, rgx): | ||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I can't mark lines that are not in the diff, but check out pandas/pandas/tests/test_strings.py Line 119 in fcd11b5
I replicated the removed test (acting on |
||||
| strings.str_cat(one, three) | ||||
|
|
||||
| # error for incorrect type | ||||
| rgx = "Must pass arrays containing strings to str_cat" | ||||
| with tm.assert_raises_regex(ValueError, rgx): | ||||
| strings.str_cat(one, 'three') | ||||
|
|
||||
| @pytest.mark.parametrize('box', [Series, Index]) | ||||
| @pytest.mark.parametrize('other', [None, Series, Index]) | ||||
| def test_str_cat_name(self, box, other): | ||||
|
|
@@ -3136,7 +3089,7 @@ def test_method_on_bytes(self): | |||
| lhs = Series(np.array(list('abc'), 'S1').astype(object)) | ||||
| rhs = Series(np.array(list('def'), 'S1').astype(object)) | ||||
| if compat.PY3: | ||||
| pytest.raises(TypeError, lhs.str.cat, rhs) | ||||
| pytest.raises(TypeError, lhs.str.cat, rhs, sep=',') | ||||
|
||||
| else: | ||||
| result = lhs.str.cat(rhs) | ||||
| expected = Series(np.array( | ||||
|
|
||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use triple-double quotes