From 6d758d71f99a6ef0665cd510038993a10a18106a Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Sep 2025 11:33:42 -0700 Subject: [PATCH 1/5] ENH: add value_counts to EA interface --- doc/source/reference/extensions.rst | 1 + pandas/core/arrays/base.py | 12 +++++++++- pandas/core/arrays/interval.py | 24 ------------------- pandas/core/arrays/string_.py | 5 +--- pandas/tests/extension/decimal/array.py | 4 ---- .../tests/extension/decimal/test_decimal.py | 20 ---------------- pandas/tests/extension/json/test_json.py | 10 ++++---- 7 files changed, 17 insertions(+), 59 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index e412793a328a3..b2f4bd998daa5 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -68,6 +68,7 @@ objects. api.extensions.ExtensionArray.ndim api.extensions.ExtensionArray.shape api.extensions.ExtensionArray.tolist + api.extensions.ExtensionArray.value_counts Additionally, we have some utility methods for ensuring your object behaves correctly. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1cd10a9eef9d1..bf892d965c0e1 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -99,7 +99,10 @@ npt, ) - from pandas import Index + from pandas import ( + Index, + Series, + ) _extension_array_shared_docs: dict[str, str] = {} @@ -1673,6 +1676,13 @@ def repeat(self, repeats: int | Sequence[int], axis: AxisInt | None = None) -> S ind = np.arange(len(self)).repeat(repeats) return self.take(ind) + def value_counts(self, dropna: bool = True) -> Series: + from pandas.core.algorithms import value_counts_internal as value_counts + + result = value_counts(self.to_numpy(copy=False), sort=False, dropna=dropna) + result.index = result.index.astype(self.dtype) + return result + # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 09706f4e53d60..a883e57e75f4f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -75,7 +75,6 @@ isin, take, unique, - value_counts_internal as value_counts, ) from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ( @@ -105,7 +104,6 @@ from pandas import ( Index, - Series, ) @@ -1197,28 +1195,6 @@ def _validate_setitem_value(self, value): return value_left, value_right - def value_counts(self, dropna: bool = True) -> Series: - """ - Returns a Series containing counts of each interval. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - # TODO: implement this is a non-naive way! - result = value_counts(np.asarray(self), dropna=dropna) - result.index = result.index.astype(self.dtype) - return result - # --------------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f58d0b1c0b948..0d5cee63ba796 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1029,10 +1029,7 @@ def sum( return self._wrap_reduction_result(axis, result) def value_counts(self, dropna: bool = True) -> Series: - from pandas.core.algorithms import value_counts_internal as value_counts - - result = value_counts(self._ndarray, sort=False, dropna=dropna) - result.index = result.index.astype(self.dtype) + result = super().value_counts(dropna=dropna) if self.dtype.na_value is libmissing.NA: result = result.astype("Int64") diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 65fb6f33b0ea3..6f7733ad7693e 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -25,7 +25,6 @@ is_scalar, ) from pandas.core import arraylike -from pandas.core.algorithms import value_counts_internal as value_counts from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( ExtensionArray, @@ -291,9 +290,6 @@ def convert_values(param): return np.asarray(res, dtype=bool) - def value_counts(self, dropna: bool = True): - return value_counts(self.to_numpy(), dropna=dropna) - # We override fillna here to simulate a 3rd party EA that has done so. This # lets us test a 3rd-party EA that has not yet updated to include a "copy" # keyword in its fillna method. diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index a7fc5061a267d..39ce93d37da45 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -171,26 +171,6 @@ def test_fillna_limit_series(self, data_missing): ): super().test_fillna_limit_series(data_missing) - @pytest.mark.parametrize("dropna", [True, False]) - def test_value_counts(self, all_data, dropna): - all_data = all_data[:10] - if dropna: - other = np.array(all_data[~all_data.isna()]) - else: - other = all_data - - vcs = pd.Series(all_data).value_counts(dropna=dropna) - vcs_ex = pd.Series(other).value_counts(dropna=dropna) - - with decimal.localcontext() as ctx: - # avoid raising when comparing Decimal("NAN") < Decimal(2) - ctx.traps[decimal.InvalidOperation] = False - - result = vcs.sort_index() - expected = vcs_ex.sort_index() - - tm.assert_series_equal(result, expected) - def test_series_repr(self, data): # Overriding this base test to explicitly test that # the custom _formatter is used diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 4bc9562f1895d..5e1980c202f62 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -189,14 +189,12 @@ def test_ffill_limit_area( data_missing, limit_area, input_ilocs, expected_ilocs ) - @unhashable - def test_value_counts(self, all_data, dropna): + def test_value_counts(self, all_data, dropna, request): + if len(all_data) == 100 or dropna: + mark = pytest.mark.xfail(reason="unhashable") + request.applymarker(mark) super().test_value_counts(all_data, dropna) - @unhashable - def test_value_counts_with_normalize(self, data): - super().test_value_counts_with_normalize(data) - @unhashable def test_sort_values_frame(self): # TODO (EA.factorize): see if _values_for_factorize allows this. From eccd22720f1b64ce2d5f19e942a6a2e432e2dd1d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Sep 2025 10:06:06 -0700 Subject: [PATCH 2/5] docstring --- pandas/core/arrays/base.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7a0d824eee5b9..84ec38e2f75d1 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1677,6 +1677,18 @@ def repeat(self, repeats: int | Sequence[int], axis: AxisInt | None = None) -> S return self.take(ind) def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of unique values. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of NA values. + + Returns + ------- + Series + """ from pandas.core.algorithms import value_counts_internal as value_counts result = value_counts(self.to_numpy(copy=False), sort=False, dropna=dropna) From 13dcb0d02503caff4359cbf4eb5ee2252a7fd6d1 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 9 Sep 2025 09:36:01 -0700 Subject: [PATCH 3/5] lint fixup --- ci/code_checks.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index cebb9cda1e480..ce01120460f2a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,6 +73,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.errors.IncompatibleFrequency SA01,SS06,EX01" \ + -i "pandas.core.arrays.base.ExtensionArray.value_counts EX01,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ From f1d3154bbb1971e1cebb979bf804fbcd379ac86d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 9 Sep 2025 09:58:23 -0700 Subject: [PATCH 4/5] adjust code_checks --- ci/code_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ce01120460f2a..68ca06564d3a6 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,7 +73,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.errors.IncompatibleFrequency SA01,SS06,EX01" \ - -i "pandas.core.arrays.base.ExtensionArray.value_counts EX01,RT03,SA01" \ + -i "pandas.api.extensions.ExtensionArray.value_counts EX01,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ From 2e483bd2c47ef8bb5b143baaf7c3791217142557 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 9 Sep 2025 13:20:59 -0700 Subject: [PATCH 5/5] troubleshoot docbuild --- doc/source/reference/extensions.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index b2f4bd998daa5..e412793a328a3 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -68,7 +68,6 @@ objects. api.extensions.ExtensionArray.ndim api.extensions.ExtensionArray.shape api.extensions.ExtensionArray.tolist - api.extensions.ExtensionArray.value_counts Additionally, we have some utility methods for ensuring your object behaves correctly.