Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ Fixed regressions
- Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`)
- Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
- Fixed regression in :meth:`.DataFrameGroupBy.apply` when a user-defined function is called on an empty DataFrame (:issue:`47985`)
- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper has a nullable data type (e.g. :class:`Int64`) or is a PyArrow-backed string array, contains null values, and ``dropna=False`` is passed (:issue:`48794`)

.. ---------------------------------------------------------------------------

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,8 +619,8 @@ def factorize(
na_mask = indices.values == -1
na_index = na_mask.argmax()
if na_mask[na_index]:
uniques = uniques.insert(na_index, self.dtype.na_value)
na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1
na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
uniques = uniques.insert(na_code, self.dtype.na_value)
indices[indices >= na_code] += 1
indices[indices == -1] = na_code
else:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,7 +913,7 @@ def factorize(
else:
# mypy error: Slice index must be an integer or None
# https://github.com/python/mypy/issues/2410
na_code = codes[:na_index].argmax() + 1 # type: ignore[misc]
na_code = codes[:na_index].max() + 1 # type: ignore[misc]
codes[codes >= na_code] += 1
codes[codes == -1] = na_code
# dummy value for uniques; not used since uniques_mask will be True
Expand Down
113 changes: 62 additions & 51 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,75 +393,86 @@ def test_groupby_drop_nan_with_multi_index():
tm.assert_frame_equal(result, expected)


# Test all combinations of values e.g. 1, 2, and NA. Use string labels to
# correspond to various dtypes. "z" always corresponds to NA.
@pytest.mark.parametrize("sequence0", ["x", "y", "z"])
@pytest.mark.parametrize("sequence1", ["x", "y", "z"])
@pytest.mark.parametrize("sequence2", ["x", "y", "z"])
@pytest.mark.parametrize("sequence3", ["x", "y", "z"])
@pytest.mark.parametrize(
"values, dtype",
"uniques, dtype",
[
([2, np.nan, 1, 2], None),
([2, np.nan, 1, 2], "UInt8"),
([2, np.nan, 1, 2], "Int8"),
([2, np.nan, 1, 2], "UInt16"),
([2, np.nan, 1, 2], "Int16"),
([2, np.nan, 1, 2], "UInt32"),
([2, np.nan, 1, 2], "Int32"),
([2, np.nan, 1, 2], "UInt64"),
([2, np.nan, 1, 2], "Int64"),
([2, np.nan, 1, 2], "Float32"),
([2, np.nan, 1, 2], "Int64"),
([2, np.nan, 1, 2], "Float64"),
({"x": 1, "y": 2, "z": np.nan}, None),
({"x": 1, "y": 2, "z": pd.NA}, "UInt8"),
({"x": 1, "y": 2, "z": pd.NA}, "Int8"),
({"x": 1, "y": 2, "z": pd.NA}, "UInt16"),
({"x": 1, "y": 2, "z": pd.NA}, "Int16"),
({"x": 1, "y": 2, "z": pd.NA}, "UInt32"),
({"x": 1, "y": 2, "z": pd.NA}, "Int32"),
({"x": 1, "y": 2, "z": pd.NA}, "UInt64"),
({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
({"x": 1, "y": 2, "z": pd.NA}, "Float32"),
({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
({"x": 1, "y": 2, "z": pd.NA}, "Float64"),
({"x": "x", "y": "y", "z": None}, "category"),
({"x": "x", "y": "y", "z": pd.NA}, "string"),
pytest.param(
["y", None, "x", "y"],
"category",
marks=pytest.mark.xfail(
reason="dropna=False not correct for categorical, GH#48645"
),
),
(["y", pd.NA, "x", "y"], "string"),
pytest.param(
["y", pd.NA, "x", "y"],
{"x": "x", "y": "y", "z": pd.NA},
"string[pyarrow]",
marks=pytest.mark.skipif(
pa_version_under1p01, reason="pyarrow is not installed"
),
),
(
["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"],
{"x": "2016-01-01", "y": "2017-01-01", "z": np.datetime64("NaT")},
"datetime64[ns]",
),
(
[
pd.Period("2012-02-01", freq="D"),
pd.NaT,
pd.Period("2012-01-01", freq="D"),
pd.Period("2012-02-01", freq="D"),
],
{
"x": pd.Period("2012-01-01", freq="D"),
"y": pd.Period("2012-02-01", freq="D"),
"z": pd.NaT,
},
None,
),
(pd.arrays.SparseArray([2, np.nan, 1, 2]), None),
],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(values, dtype, test_series):
# GH#46584
key = pd.Series(values, dtype=dtype)
df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]})
def test_no_sort_keep_na(
request, sequence0, sequence1, sequence2, sequence3, uniques, dtype
):
# GH#46584, GH#48794
sequence = "".join([sequence0, sequence1, sequence2, sequence3])
if dtype == "category" and "z" in sequence:
# Only xfail when nulls are present
msg = "dropna=False not correct for categorical, GH#48645"
request.node.add_marker(pytest.mark.xfail(reason=msg))
if dtype == "datetime64[ns]" and sequence == "zzzz":
msg = "Cannot construct datetime of all nulls"
request.node.add_marker(pytest.mark.xfail(reason=msg))
weights = {"x": 1, "y": 2, "z": 3}

key = pd.Series([uniques[label] for label in sequence], dtype=dtype)
df = pd.DataFrame({"key": key, "a": [weights[label] for label in sequence]})
gb = df.groupby("key", dropna=False, sort=False)
if test_series:
gb = gb["a"]

warn = None
if isinstance(values, pd.arrays.SparseArray):
warn = FutureWarning
msg = "passing a SparseArray to pd.Index will store that array directly"
with tm.assert_produces_warning(warn, match=msg):
result = gb.sum()
expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))

if test_series:
expected = expected["a"]
if expected.index.is_categorical():
# TODO: Slicing reorders categories?
expected.index = expected.index.reorder_categories(["y", "x"])
tm.assert_equal(result, expected)
result = gb.sum()
# Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
# issues with hashing np.nan
summed = {}
for label in sequence:
summed[label] = summed.get(label, 0) + weights[label]

if dtype == "category":
index = pd.CategoricalIndex(
[uniques[label] for label in summed],
# Get the nonnull categories in the order they appear ignoring duplicates
list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
name="key",
)
else:
index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
expected = pd.Series(summed.values(), index=index, name="a", dtype=None).to_frame()
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("test_series", [True, False])
Expand Down