Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
API: deprecate setting of .ordered directly (GH9347, GH9190)
     add set_ordered method for setting ordered
     default for Categorical is now to NOT order unless explicity specified
  • Loading branch information
jreback committed Mar 8, 2015
commit 39e17f24cb45d5a2cc21e4206d7c00c9c98198db
86 changes: 57 additions & 29 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull,
is_categorical_dtype, is_integer_dtype, is_object_dtype,
_possibly_infer_to_datetimelike, get_dtype_kinds,
is_list_like, is_sequence, is_null_slice,
is_list_like, is_sequence, is_null_slice, is_bool,
_ensure_platform_int, _ensure_object, _ensure_int64,
_coerce_indexer_dtype, _values_from_object, take_1d)
from pandas.util.terminal import get_terminal_size
Expand Down Expand Up @@ -141,7 +141,7 @@ class Categorical(PandasObject):
to be the unique values of values.
ordered : boolean, optional
Whether or not this categorical is treated as a ordered categorical. If not given,
the resulting categorical will be ordered if values can be sorted.
the resulting categorical will not be ordered.
name : str, optional
Name for the Categorical variable. If name is None, will attempt
to infer from values.
Expand Down Expand Up @@ -184,7 +184,6 @@ class Categorical(PandasObject):
dtype = CategoricalDtype()
"""The dtype (always "category")"""

ordered = None
"""Whether or not this Categorical is ordered.

Only ordered `Categoricals` can be sorted (according to the order
Expand All @@ -201,18 +200,17 @@ class Categorical(PandasObject):
# For comparisons, so that numpy uses our implementation if the compare ops, which raise
__array_priority__ = 1000
_typ = 'categorical'
ordered = False
name = None

def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
def __init__(self, values, categories=None, ordered=False, name=None, fastpath=False,
levels=None):

if fastpath:
# fast path
self._codes = _coerce_indexer_dtype(values, categories)
self.name = name
self.categories = categories
self.ordered = ordered
self._ordered = ordered
return

if name is None:
Expand All @@ -237,8 +235,6 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
cat = values.values
if categories is None:
categories = cat.categories
if ordered is None:
ordered = cat.ordered
values = values.__array__()

elif isinstance(values, Index):
Expand All @@ -263,18 +259,12 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa

if categories is None:
try:
codes, categories = factorize(values, sort=True)
# If the underlying data structure was sortable, and the user doesn't want to
# "forget" this order, the categorical also is sorted/ordered
if ordered is None:
ordered = True
codes, categories = factorize(values, sort=ordered)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes it "not option 4": if the constructor does not get a categories array and ordered=False, it will switch to not-lexi-sorted categories.

Only the if ordered is None part (+ comment) should be removed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, I think sorting is MORE confusing is it not?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was the disccusion in #9346 and especially #9347. This would make the categories sorted by order of appearance if ordered==False (which it is now per default). Before the categories were almost always lexi-ordered.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so categories not None implies ordered=True? in the constructor. Then how would you specify the list of categories that are NOT ordered?

except TypeError:
codes, categories = factorize(values, sort=False)
if ordered:
# raise, as we don't have a sortable data structure and so the user should
# give us one by specifying categories
raise TypeError("'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument.")
# raise, as we don't have a sortable data structure and so the user should
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here, just replace the if ordered: part with a warning that categories were not lexi-sorted and the user has to specify one, if he wants to have the categories in a specific order (!= the Categorical ordered)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This wasn't tested anywhere, can you come up with an example which triggers this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the test which triggered this codepath (without the if ordered part... :-/ ):

    def test_constructor_unsortable(self):

        # it works!
        arr = np.array([1, 2, 3, datetime.now()], dtype='O')
        factor = Categorical.from_array(arr)
        self.assertFalse(factor.ordered)

e.g. this should give different results now:

        arr = np.array([1, 2, 3, datetime.now()], dtype='O')
        factor = Categorical(arr, ordered=True)

before it errored with a type error, now it either succeeds or errors in the factorize step, but it should print the warning/raises an error that the values are not sortable and the user should supply a user-defined order via supplied categories=[...]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, right, but is this ONLY if the user doesn't supply categories AND ordered=False (which is now the default). Before I think it tried first to factorize (e.g. sort=True), which failed then it warned/raised?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# give us one by specifying categories
raise TypeError("'values' is not factorizable, please pass "
"categories order by passing in a categories argument.")
except ValueError:

### FIXME ####
Expand All @@ -300,12 +290,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
warn("None of the categories were found in values. Did you mean to use\n"
"'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

# if we got categories, we can assume that the order is intended
# if ordered is unspecified
if ordered is None:
ordered = True

self.ordered = False if ordered is None else ordered
self._ordered = ordered
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not guard against None, which was valid before. Maybe simple use set_ordered(ordered), which has the check?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes I agree, None is no longer valid

self.categories = categories
self.name = name
self._codes = _coerce_indexer_dtype(codes, categories)
Expand Down Expand Up @@ -460,6 +445,37 @@ def _get_levels(self):
# TODO: Remove after deprecation period in 2017/ after 0.18
levels = property(fget=_get_levels, fset=_set_levels)

_ordered = None

def _set_ordered(self, value):
""" Sets the ordered attribute to the boolean value """
warn("Setting 'ordered' directly is deprecated, use 'set_ordered'", FutureWarning)
self.set_ordered(value, inplace=True)

def set_ordered(self, value, inplace=False):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still like "as_ordered/unordered" (or as_nominal/ordinal) more, as that implies that a value is returned and not that it is inplace and "set_ordered" is so near "set_order" which would be the "set to this categories".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could buy as_ordered() and as_unordered(), but maybe in addition to set_ordered (but the former you can't programatically do anything), e.g. if I want to set_ordered(x)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed to using as_ordered/as_unordered

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, sounds like a good solution.

"""
Sets the ordered attribute to the boolean value

Parameters
----------
value : boolean to set whether this categorical is ordered (True) or not (False)
inplace : boolean (default: False)
Whether or not to set the ordered attribute inplace or return a copy of this categorical
with ordered set to the value
"""
if not is_bool(value):
raise TypeError("ordered must be a boolean value")
cat = self if inplace else self.copy()
cat._ordered = value
if not inplace:
return cat

def _get_ordered(self):
""" Gets the ordered attribute """
return self._ordered

ordered = property(fget=_get_ordered, fset=_set_ordered)

def set_categories(self, new_categories, ordered=None, rename=False, inplace=False):
""" Sets the categories to the specified new_categories.

Expand All @@ -486,7 +502,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
----------
new_categories : Index-like
The categories in new order.
ordered : boolean, optional
ordered : boolean, (default: False)
Whether or not the categorical is treated as a ordered categorical. If not given,
do not change the ordered information.
rename : boolean (default: False)
Expand Down Expand Up @@ -520,8 +536,9 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
cat._codes = _get_codes_for_values(values, new_categories)
cat._categories = new_categories

if not ordered is None:
cat.ordered = ordered
if ordered is None:
ordered = self.ordered
cat.set_ordered(ordered, inplace=True)

if not inplace:
return cat
Expand Down Expand Up @@ -765,6 +782,15 @@ def __setstate__(self, state):
state['_categories'] = \
self._validate_categories(state.pop('_levels'))

# 0.16.0 ordered change
if '_ordered' not in state:

# >=15.0 < 0.16.0
if 'ordered' in state:
state['_ordered'] = state.pop('ordered')
else:
state['_ordered'] = False

for k, v in compat.iteritems(state):
setattr(self, k, v)

Expand Down Expand Up @@ -1498,6 +1524,7 @@ class CategoricalAccessor(PandasDelegate):
>>> s.cat.remove_categories(['d'])
>>> s.cat.remove_unused_categories()
>>> s.cat.set_categories(list('abcde'))
>>> s.cat.set_ordered(True)

"""

Expand Down Expand Up @@ -1533,7 +1560,8 @@ def _delegate_method(self, name, *args, **kwargs):
"add_categories",
"remove_categories",
"remove_unused_categories",
"set_categories"],
"set_categories",
"set_ordered"],
typ='method')

##### utility routines #####
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3628,7 +3628,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
name = None if names is None else names[0]
return Index(arrays[0], name=name)

cats = [Categorical.from_array(arr) for arr in arrays]
cats = [Categorical.from_array(arr, ordered=True) for arr in arrays]
levels = [c.categories for c in cats]
labels = [c.codes for c in cats]
if names is None:
Expand Down Expand Up @@ -3721,7 +3721,7 @@ def from_product(cls, iterables, sortorder=None, names=None):
from pandas.core.categorical import Categorical
from pandas.tools.util import cartesian_product

categoricals = [Categorical.from_array(it) for it in iterables]
categoricals = [Categorical.from_array(it, ordered=True) for it in iterables]
labels = cartesian_product([c.codes for c in categoricals])

return MultiIndex(levels=[c.categories for c in categoricals],
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ def panel_index(time, panels, names=['time', 'panel']):
(1962, 'C')], dtype=object)
"""
time, panels = _ensure_like_indices(time, panels)
time_factor = Categorical.from_array(time)
panel_factor = Categorical.from_array(panels)
time_factor = Categorical.from_array(time, ordered=True)
panel_factor = Categorical.from_array(panels, ordered=True)

labels = [time_factor.codes, panel_factor.codes]
levels = [time_factor.categories, panel_factor.categories]
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ def get_result(self):
# may need to coerce categoricals here
if self.is_categorical is not None:
values = [ Categorical.from_array(values[:,i],
categories=self.is_categorical.categories)
categories=self.is_categorical.categories,
ordered=True)
for i in range(values.shape[-1]) ]

return DataFrame(values, index=index, columns=columns)
Expand Down Expand Up @@ -1049,7 +1050,7 @@ def check_len(item, name):

def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
# Series avoids inconsistent NaN handling
cat = Categorical.from_array(Series(data))
cat = Categorical.from_array(Series(data), ordered=True)
levels = cat.categories

# if all NaN
Expand Down Expand Up @@ -1117,7 +1118,7 @@ def make_axis_dummies(frame, axis='minor', transform=None):
labels = frame.index.labels[num]
if transform is not None:
mapped_items = items.map(transform)
cat = Categorical.from_array(mapped_items.take(labels))
cat = Categorical.from_array(mapped_items.take(labels), ordered=True)
labels = cat.codes
items = cat.categories

Expand Down
2 changes: 1 addition & 1 deletion pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3637,7 +3637,7 @@ def read(self, where=None, columns=None, **kwargs):
if not self.read_axes(where=where, **kwargs):
return None

factors = [Categorical.from_array(a.values) for a in self.index_axes]
factors = [Categorical.from_array(a.values, ordered=True) for a in self.index_axes]
levels = [f.categories for f in factors]
N = [len(f.categories) for f in factors]
labels = [f.codes for f in factors]
Expand Down
Binary file added pandas/tests/data/categorical_0_15_2.pickle
Binary file not shown.
Loading