From a7a7481300a57823e92bd2dfb15d7789001b13e5 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 2 Apr 2022 13:58:31 +0200 Subject: [PATCH 01/49] Backport PR #46609: DOC: Start v1.4.3 release notes (#46612) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.4.2.rst | 2 +- doc/source/whatsnew/v1.4.3.rst | 45 ++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v1.4.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 0f546d04ea0e7..47a46c86c3a44 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.4 .. toctree:: :maxdepth: 2 + v1.4.3 v1.4.2 v1.4.1 v1.4.0 diff --git a/doc/source/whatsnew/v1.4.2.rst b/doc/source/whatsnew/v1.4.2.rst index 8a2bb4c6b3201..64c36632bfefe 100644 --- a/doc/source/whatsnew/v1.4.2.rst +++ b/doc/source/whatsnew/v1.4.2.rst @@ -42,4 +42,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v1.4.1..v1.4.2|HEAD +.. contributors:: v1.4.1..v1.4.2 diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst new file mode 100644 index 0000000000000..d53acc698c3bb --- /dev/null +++ b/doc/source/whatsnew/v1.4.3.rst @@ -0,0 +1,45 @@ +.. _whatsnew_143: + +What's new in 1.4.3 (April ??, 2022) +------------------------------------ + +These are the changes in pandas 1.4.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_143.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_143.bug_fixes: + +Bug fixes +~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_143.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_143.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v1.4.2..v1.4.3|HEAD From 9f71f519488de0ec263d59856bdc6a2c941a5ae2 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 6 Apr 2022 02:48:53 +0200 Subject: [PATCH 02/49] Backport PR #46647: CI/DOC: Unpin jinja2 (#46649) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- environment.yml | 4 ++-- requirements-dev.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 753e210e6066a..c23bb99c736dc 100644 --- a/environment.yml +++ b/environment.yml @@ -44,7 +44,7 @@ dependencies: - types-setuptools # documentation (jupyter notebooks) - - nbconvert>=5.4.1 + - nbconvert>=6.4.5 - nbsphinx - pandoc @@ -86,7 +86,7 @@ dependencies: - bottleneck>=1.3.1 - ipykernel - ipython>=7.11.1 - - jinja2<=3.0.3 # pandas.Styler + - jinja2 # pandas.Styler - matplotlib>=3.3.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.7.1 - scipy>=1.4.1 diff --git a/requirements-dev.txt b/requirements-dev.txt index c4f6bb30c59ec..6caa9a7512faf 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -28,7 +28,7 @@ types-python-dateutil types-PyMySQL types-pytz types-setuptools -nbconvert>=5.4.1 +nbconvert>=6.4.5 nbsphinx pandoc dask @@ -58,7 +58,7 @@ blosc bottleneck>=1.3.1 ipykernel ipython>=7.11.1 -jinja2<=3.0.3 +jinja2 matplotlib>=3.3.2 numexpr>=2.7.1 scipy>=1.4.1 From 56b7e319da64a197b8142bc221e809326de63ff6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 7 Apr 2022 01:13:57 +0100 Subject: [PATCH 03/49] Backport PR #46663: CI/DOC: pin pydata-sphinx-theme to 0.8.0 (#46667) Co-authored-by: Joris Van den Bossche --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index c23bb99c736dc..a90f28a2c9e16 100644 --- a/environment.yml +++ b/environment.yml @@ -34,7 +34,7 @@ dependencies: - gitdb - numpydoc < 1.2 # 2021-02-09 1.2dev breaking CI - pandas-dev-flaker=0.4.0 - - pydata-sphinx-theme + - pydata-sphinx-theme=0.8.0 - pytest-cython - sphinx - sphinx-panels diff --git a/requirements-dev.txt b/requirements-dev.txt index 6caa9a7512faf..bb6c5d9427d38 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -20,7 +20,7 @@ gitpython gitdb numpydoc < 1.2 pandas-dev-flaker==0.4.0 -pydata-sphinx-theme +pydata-sphinx-theme==0.8.0 pytest-cython sphinx sphinx-panels From b49b16469f5dd06ffe7c1b802cfea6ca59453b0b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 9 Apr 2022 01:25:58 +0200 Subject: [PATCH 04/49] Backport PR #46692: CI fix ci-failure from bs4 new version (#46696) Co-authored-by: Marco Edward Gorelli --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 05d7c2998ef27..cebedd18664e4 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -577,7 +577,7 @@ def _parse_tables(self, doc, match, attrs): for elem in table.find_all(style=re.compile(r"display:\s*none")): elem.decompose() - if table not in unique_tables and table.find(text=match) is not None: + if table not in unique_tables and table.find(string=match) is not None: result.append(table) unique_tables.add(table) From 63d2af576a3718c480195b11d3ed865a20bb1ae3 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 9 Apr 2022 03:57:13 +0200 
Subject: [PATCH 05/49] Backport PR #46697: DOC: indicate that `month_name` and `day_name` also applies to Series (#46710) Co-authored-by: Tim Swast --- pandas/core/arrays/datetimes.py | 38 +++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6843e4f7eeb58..9d07a5862f11f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1217,7 +1217,8 @@ def to_perioddelta(self, freq) -> TimedeltaArray: def month_name(self, locale=None): """ - Return the month names of the DateTimeIndex with specified locale. + Return the month names of the :class:`~pandas.Series` or + :class:`~pandas.DatetimeIndex` with specified locale. Parameters ---------- @@ -1227,11 +1228,23 @@ def month_name(self, locale=None): Returns ------- - Index - Index of month names. + Series or Index + Series or Index of month names. Examples -------- + >>> s = pd.Series(pd.date_range(start='2018-01', freq='M', periods=3)) + >>> s + 0 2018-01-31 + 1 2018-02-28 + 2 2018-03-31 + dtype: datetime64[ns] + >>> s.dt.month_name() + 0 January + 1 February + 2 March + dtype: object + >>> idx = pd.date_range(start='2018-01', freq='M', periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], @@ -1247,7 +1260,8 @@ def month_name(self, locale=None): def day_name(self, locale=None): """ - Return the day names of the DateTimeIndex with specified locale. + Return the day names of the :class:`~pandas.Series` or + :class:`~pandas.DatetimeIndex` with specified locale. Parameters ---------- @@ -1257,11 +1271,23 @@ def day_name(self, locale=None): Returns ------- - Index - Index of day names. + Series or Index + Series or Index of day names. Examples -------- + >>> s = pd.Series(pd.date_range(start='2018-01-01', freq='D', periods=3)) + >>> s + 0 2018-01-01 + 1 2018-01-02 + 2 2018-01-03 + dtype: datetime64[ns] + >>> s.dt.day_name() + 0 Monday + 1 Tuesday + 2 Wednesday + dtype: object + >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3) >>> idx DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], From 46fcc756eb88e38cc57d9ba374fd9ae5c39eca64 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 9 Apr 2022 03:57:25 +0200 Subject: [PATCH 06/49] Backport PR #46691: REGR: read_fwf raising ValueError when widths was specified with usecols (#46709) Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 2 +- pandas/io/parsers/readers.py | 3 ++- pandas/tests/io/parser/test_read_fwf.py | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index d53acc698c3bb..8572c136c28a9 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 7d0e78ce43b71..7480874fa7b23 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -867,7 +867,8 @@ def read_fwf( len_index = 1 else: len_index = len(index_col) - if len(names) + len_index != len(colspecs): + if kwds.get("usecols") is None and len(names) + len_index != len(colspecs): + # If usecols is used colspec may be longer than names raise ValueError("Length of colspecs must match length of names") kwds["colspecs"] = colspecs diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 3de73e51ce6b6..f3d41332502af 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -930,3 +930,26 @@ def test_names_and_infer_colspecs(): result = read_fwf(StringIO(data), skiprows=1, usecols=[0, 2], names=["a", "b"]) expected = DataFrame({"a": [959.0], "b": 22.2}) tm.assert_frame_equal(result, expected) + + +def test_widths_and_usecols(): + # GH#46580 + data = """0 1 n -0.4100.1 +0 2 p 0.2 90.1 +0 3 n -0.3140.4""" + result = read_fwf( + StringIO(data), + header=None, + usecols=(0, 1, 3), + widths=(3, 5, 1, 5, 5), + index_col=False, + names=("c0", "c1", "c3"), + ) + expected = DataFrame( + { + "c0": 0, + "c1": [1, 2, 3], + "c3": [-0.4, 0.2, -0.3], + } + ) + tm.assert_frame_equal(result, expected) From 00934f627541011a357bac0d01a1cb744cc24ab6 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 9 Apr 2022 03:59:20 +0200 Subject: [PATCH 07/49] Backport PR #46674: DOC: generate docs for the `Series.dt.isocalendar()` method. (#46686) Co-authored-by: Tim Swast --- doc/source/reference/series.rst | 1 + pandas/core/indexes/accessors.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index a60dab549e66d..fcdc9ea9b95da 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -342,6 +342,7 @@ Datetime methods :toctree: api/ :template: autosummary/accessor_method.rst + Series.dt.isocalendar Series.dt.to_period Series.dt.to_pydatetime Series.dt.tz_localize diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 8c2813f2b57ec..78beda95d4658 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -277,12 +277,13 @@ def isocalendar(self): @property def weekofyear(self): """ - The week ordinal of the year. + The week ordinal of the year according to the ISO 8601 standard. .. deprecated:: 1.1.0 - Series.dt.weekofyear and Series.dt.week have been deprecated. - Please use Series.dt.isocalendar().week instead. + Series.dt.weekofyear and Series.dt.week have been deprecated. Please + call :func:`Series.dt.isocalendar` and access the ``week`` column + instead. """ warnings.warn( "Series.dt.weekofyear and Series.dt.week have been deprecated. 
" From 5432af25d92b47e7339d28f4c504f56a9968ce15 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 9 Apr 2022 13:49:37 +0200 Subject: [PATCH 08/49] Backport PR #46690: CI fix-ci-isocalendar (#46720) Co-authored-by: Marco Edward Gorelli --- pandas/core/arrays/datetimes.py | 5 ++--- pandas/core/indexes/accessors.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 9d07a5862f11f..2c296ae1d1b0c 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1342,15 +1342,14 @@ def date(self) -> npt.NDArray[np.object_]: def isocalendar(self) -> DataFrame: """ - Returns a DataFrame with the year, week, and day calculated according to - the ISO 8601 standard. + Calculate year, week, and day according to the ISO 8601 standard. .. versionadded:: 1.1.0 Returns ------- DataFrame - with columns year, week and day + With columns year, week and day. See Also -------- diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 78beda95d4658..ed41fd15177ba 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -243,15 +243,14 @@ def freq(self): def isocalendar(self): """ - Returns a DataFrame with the year, week, and day calculated according to - the ISO 8601 standard. + Calculate year, week, and day according to the ISO 8601 standard. .. versionadded:: 1.1.0 Returns ------- DataFrame - with columns year, week and day + With columns year, week and day. See Also -------- From dfbc1dc31de00fd85a76801794145d65e61f4e80 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 10 Apr 2022 17:46:20 +0100 Subject: [PATCH 09/49] Backport PR #46633 on branch 1.4.x (CI: Remove grep from asv call (using strict parameter instead)) (#46727) Co-authored-by: Marc Garcia --- .github/workflows/code-checks.yml | 16 +--------------- environment.yml | 2 +- requirements-dev.txt | 2 +- 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 59fb81b167bd4..87b80204d0c19 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -140,22 +140,8 @@ jobs: - name: Run ASV benchmarks run: | cd asv_bench - asv check -E existing - git remote add upstream https://github.com/pandas-dev/pandas.git - git fetch upstream asv machine --yes - asv dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log - if grep "failed" benchmarks.log > /dev/null ; then - exit 1 - fi - if: ${{ steps.build.outcome == 'success' }} - - - name: Publish benchmarks artifact - uses: actions/upload-artifact@v3 - with: - name: Benchmarks log - path: asv_bench/benchmarks.log - if: failure() + asv run --quick --dry-run --strict --durations=30 --python=same build_docker_dev_environment: name: Build Docker Dev Environment diff --git a/environment.yml b/environment.yml index a90f28a2c9e16..0fbfb8236a135 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: - pytz # benchmarks - - asv < 0.5.0 # 2022-02-08: v0.5.0 > leads to ASV checks running > 3 hours on CI + - asv # building # The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms. 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index bb6c5d9427d38..5ede7b99a3d22 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,7 +4,7 @@
 numpy>=1.18.5
 python-dateutil>=2.8.1
 pytz
-asv < 0.5.0
+asv
 cython>=0.29.24
 black==22.3.0
 cpplint

From 28863884390468073b2522b6be23199d97d1eab1 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Tue, 12 Apr 2022 14:30:57 +0100
Subject: [PATCH 10/49] Backport PR #46656: BUG: df.nsmallest get wrong results
 when NaN in the sorting column (#46748)

Co-authored-by: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com>
---
 doc/source/whatsnew/v1.4.3.rst               |  1 +
 pandas/core/algorithms.py                    |  6 +++++-
 pandas/tests/frame/methods/test_nlargest.py  | 21 ++++++++++++++++++++
 pandas/tests/series/methods/test_nlargest.py | 12 +++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst
index 8572c136c28a9..0c326e15d90ed 100644
--- a/doc/source/whatsnew/v1.4.3.rst
+++ b/doc/source/whatsnew/v1.4.3.rst
@@ -14,6 +14,7 @@ including other versions of pandas.

 Fixed regressions
 ~~~~~~~~~~~~~~~~~
+- Fixed regression in :meth:`DataFrame.nsmallest` that led to wrong results when ``np.nan`` was in the sorting column (:issue:`46589`)
 - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
 -

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 36eabe93dbd7e..32e3e19688a63 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1216,7 +1216,6 @@ def compute(self, method: str) -> Series:
             arr = arr[::-1]

         nbase = n
-        findex = len(self.obj)
         narr = len(arr)
         n = min(n, narr)

@@ -1229,6 +1228,11 @@ def compute(self, method: str) -> Series:
         if self.keep != "all":
             inds = inds[:n]
             findex = nbase
+        else:
+            if len(inds) < nbase and len(nan_index) + len(inds) >= nbase:
+                findex = len(nan_index) + len(inds)
+            else:
+                findex = len(inds)

         if self.keep == "last":
             # reverse indices
diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py
index 1b2db80d782ce..a317dae562ae0 100644
--- a/pandas/tests/frame/methods/test_nlargest.py
+++ b/pandas/tests/frame/methods/test_nlargest.py
@@ -216,3 +216,24 @@ def test_nlargest_nan(self):
         result = df.nlargest(5, 0)
         expected = df.sort_values(0, ascending=False).head(5)
         tm.assert_frame_equal(result, expected)
+
+    def test_nsmallest_nan_after_n_element(self):
+        # GH#46589
+        df = pd.DataFrame(
+            {
+                "a": [1, 2, 3, 4, 5, None, 7],
+                "b": [7, 6, 5, 4, 3, 2, 1],
+                "c": [1, 1, 2, 2, 3, 3, 3],
+            },
+            index=range(7),
+        )
+        result = df.nsmallest(5, columns=["a", "b"])
+        expected = pd.DataFrame(
+            {
+                "a": [1, 2, 3, 4, 5],
+                "b": [7, 6, 5, 4, 3],
+                "c": [1, 1, 2, 2, 3],
+            },
+            index=range(5),
+        ).astype({"a": "float"})
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py
index ee96ab08ad66c..4f07257038bc9 100644
--- a/pandas/tests/series/methods/test_nlargest.py
+++ b/pandas/tests/series/methods/test_nlargest.py
@@ -231,3 +231,15 @@ def test_nlargest_nullable(self, any_numeric_ea_dtype):
         .astype(dtype)
     )
     tm.assert_series_equal(result, expected)
+
+    def test_nsmallest_nan_when_keep_is_all(self):
+        # GH#46589
+        s = Series([1, 2, 3, 3, 3, None])
+        result = s.nsmallest(3, keep="all")
+        expected = Series([1.0, 2.0, 3.0, 3.0, 3.0])
+        tm.assert_series_equal(result, expected)
+
+        s = Series([1, 2, None, None, None])
+        result = 
s.nsmallest(3, keep="all") + expected = Series([1, 2, None, None, None]) + tm.assert_series_equal(result, expected) From d1717e98a228f3b72be140340e46fdfb8a8ce162 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 14 Apr 2022 15:55:39 +0200 Subject: [PATCH 11/49] Backport PR #46767: CI: Fail Numpy Dev build on DeprecationWarnings from numpy only (#46773) Co-authored-by: Matthew Roeschke --- .github/workflows/posix.yml | 4 ++-- pandas/tests/util/test_show_versions.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index ea9df610c1dff..0a914dd965a5e 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -61,7 +61,7 @@ jobs: env_file: actions-310-numpydev.yaml pattern: "not slow and not network and not single_cpu" pandas_testing_mode: "deprecate" - test_args: "-W error" + test_args: "-W error::DeprecationWarning:numpy" fail-fast: false name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }} env: @@ -174,7 +174,7 @@ jobs: if: always() - name: Build Version - run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + run: conda list - name: Publish test results uses: actions/upload-artifact@v3 diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 7a1363099e7a0..54a8f395444ed 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -4,6 +4,7 @@ import pytest +from pandas.compat import is_numpy_dev from pandas.util._print_versions import ( _get_dependency_info, _get_sys_info, @@ -11,6 +12,14 @@ import pandas as pd +# This is failing on the Numpy Dev build, +# but the error may just be from distutils? +pytestmark = pytest.mark.xfail( + is_numpy_dev, + reason="_distutils not in python3.10/distutils/core.py", + raises=AssertionError, +) + @pytest.mark.filterwarnings( # openpyxl From f7f67adf4d2885128925c7ce7c86937b642d2edd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 25 Apr 2022 05:44:50 -0700 Subject: [PATCH 12/49] CI/TST: xfail git test failure on 32 bit job (#46842) (#46857) * CI/TST: xfail git test failure on 32 bit job * Add reason kwarg * Ensure docker image has PANDAS_CI=1 (cherry picked from commit 976404bde733c4751da56b41fba0aa95b4c8165d) --- azure-pipelines.yml | 3 ++- pandas/tests/test_common.py | 8 ++++++++ pandas/tests/util/test_show_versions.py | 9 ++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ec798bd607034..d84f2d7784935 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,7 +48,8 @@ jobs: pip install cython numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \ python setup.py build_ext -q -j2 && \ python -m pip install --no-build-isolation -e . 
&& \ - pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml" + export PANDAS_CI=1 && \ + pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml" displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' - task: PublishTestResults@2 diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index cbd11cd6d8685..b2f2a5f672edb 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -5,6 +5,11 @@ import numpy as np import pytest +from pandas.compat import ( + IS64, + is_ci_environment, +) + import pandas as pd from pandas import Series import pandas._testing as tm @@ -157,6 +162,9 @@ def test_standardize_mapping(): assert isinstance(com.standardize_mapping(dd), partial) +@pytest.mark.xfail( + is_ci_environment() and not IS64, reason="Failing on 32 bit Python CI job" +) def test_git_version(): # GH 21295 git_version = pd.__git_version__ diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 54a8f395444ed..468a5e544122c 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -4,7 +4,11 @@ import pytest -from pandas.compat import is_numpy_dev +from pandas.compat import ( + IS64, + is_ci_environment, + is_numpy_dev, +) from pandas.util._print_versions import ( _get_dependency_info, _get_sys_info, @@ -77,6 +81,9 @@ def test_show_versions_console_json(capsys): assert result == expected +@pytest.mark.xfail( + is_ci_environment() and not IS64, reason="Failing on 32 bit Python CI job" +) def test_show_versions_console(capsys): # gh-32041 # gh-32041 From 22e384d97ff502167b262170199a07c44a78131a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 27 Apr 2022 07:31:44 -0500 Subject: [PATCH 13/49] Backport PR #45247: PERF: find_stack_level (#46881) Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/util/_exceptions.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 806e2abe83a92..ef467f096e963 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -29,17 +29,20 @@ def find_stack_level() -> int: Find the first place in the stack that is not inside pandas (tests notwithstanding). 
""" - stack = inspect.stack() import pandas as pd pkg_dir = os.path.dirname(pd.__file__) test_dir = os.path.join(pkg_dir, "tests") - for n in range(len(stack)): - fname = stack[n].filename + # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow + frame = inspect.currentframe() + n = 0 + while frame: + fname = inspect.getfile(frame) if fname.startswith(pkg_dir) and not fname.startswith(test_dir): - continue + frame = frame.f_back + n += 1 else: break return n From 21de060fd4864a2cf68ba53962ec85f37aa993f0 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 2 May 2022 15:31:43 -0500 Subject: [PATCH 14/49] Backport PR #46912: CI: More targeted pyarrow version testing (#46925) Co-authored-by: Matthew Roeschke --- .github/workflows/posix.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index 0a914dd965a5e..f5cbb0e88ff11 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -62,6 +62,15 @@ jobs: pattern: "not slow and not network and not single_cpu" pandas_testing_mode: "deprecate" test_args: "-W error::DeprecationWarning:numpy" + exclude: + - env_file: actions-39.yaml + pyarrow_version: "6" + - env_file: actions-39.yaml + pyarrow_version: "7" + - env_file: actions-310.yaml + pyarrow_version: "6" + - env_file: actions-310.yaml + pyarrow_version: "7" fail-fast: false name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }} env: From b18925d844fcf10d0af38a794e1b7b275776a03e Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 9 May 2022 19:05:29 -0500 Subject: [PATCH 15/49] Backport PR #46960: CI: Move Windows build from Azure to GHA (#46971) Co-authored-by: Matthew Roeschke --- .github/actions/build_pandas/action.yml | 5 +- .github/workflows/posix.yml | 1 - .github/workflows/windows.yml | 75 +++++++++++++++++++++++++ azure-pipelines.yml | 5 -- ci/azure/windows.yml | 58 ------------------- 5 files changed, 79 insertions(+), 65 deletions(-) create mode 100644 .github/workflows/windows.yml delete mode 100644 ci/azure/windows.yml diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index e916d5bfde5fb..5e5a3bdf0f024 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -12,6 +12,9 @@ runs: - name: Build Pandas run: | - python setup.py build_ext -j 2 + python setup.py build_ext -j $N_JOBS python -m pip install -e . 
--no-build-isolation --no-use-pep517 --no-index shell: bash -el {0} + env: + # Cannot use parallel compilation on Windows, see https://github.com/pandas-dev/pandas/issues/30873 + N_JOBS: ${{ runner.os == 'Windows' && 1 || 2 }} diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index f5cbb0e88ff11..b86dcea59edb8 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -180,7 +180,6 @@ jobs: run: ci/run_tests.sh # TODO: Don't continue on error for PyPy continue-on-error: ${{ env.IS_PYPY == 'true' }} - if: always() - name: Build Version run: conda list diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 0000000000000..6f267357554a3 --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,75 @@ +name: Windows + +on: + push: + branches: + - main + - 1.4.x + pull_request: + branches: + - main + - 1.4.x + paths-ignore: + - "doc/**" + +env: + PANDAS_CI: 1 + PYTEST_TARGET: pandas + PYTEST_WORKERS: auto + PATTERN: "not slow and not db and not network and not single_cpu" + + +jobs: + pytest: + runs-on: windows-latest + defaults: + run: + shell: bash -el {0} + timeout-minutes: 90 + strategy: + matrix: + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] + fail-fast: false + concurrency: + # https://github.amrom.workers.devmunity/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-windows + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Install Dependencies + uses: conda-incubator/setup-miniconda@v2.1.1 + with: + mamba-version: "*" + channels: conda-forge + activate-environment: pandas-dev + channel-priority: strict + environment-file: ci/deps/${{ matrix.env_file }} + use-only-tar-bz2: true + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Test + run: ci/run_tests.sh + + - name: Build Version + run: conda list + + - name: Publish test results + uses: actions/upload-artifact@v3 + with: + name: Test results + path: test-data.xml + if: failure() + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + with: + flags: unittests + name: codecov-pandas + fail_ci_if_error: false diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d84f2d7784935..0b2a9f5b2b0cd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -27,11 +27,6 @@ jobs: name: macOS vmImage: macOS-10.15 -- template: ci/azure/windows.yml - parameters: - name: Windows - vmImage: windows-2019 - - job: py38_32bit pool: vmImage: ubuntu-18.04 diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml deleted file mode 100644 index 02c6564579aa2..0000000000000 --- a/ci/azure/windows.yml +++ /dev/null @@ -1,58 +0,0 @@ -parameters: - name: '' - vmImage: '' - -jobs: -- job: ${{ parameters.name }} - timeoutInMinutes: 90 - pool: - vmImage: ${{ parameters.vmImage }} - strategy: - matrix: - py38: - ENV_FILE: ci/deps/actions-38.yaml - CONDA_PY: "38" - - py39: - ENV_FILE: ci/deps/actions-39.yaml - CONDA_PY: "39" - - py310: - ENV_FILE: ci/deps/actions-310.yaml - CONDA_PY: "310" - - steps: - - powershell: | - Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" - Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" - displayName: 'Add conda to PATH' - - bash: conda install -yv -c conda-forge -n base 'mamba>=0.21.2' - displayName: 'Install mamba' - - - bash: | - # See 
https://github.com/mamba-org/mamba/issues/1370 - # See https://github.com/mamba-org/mamba/issues/633 - C:\\Miniconda\\condabin\\mamba.bat create -n pandas-dev - C:\\Miniconda\\condabin\\mamba.bat env update -n pandas-dev --file ci\\deps\\actions-$(CONDA_PY).yaml - # TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941 - C:\\Miniconda\\condabin\\mamba.bat install -n pandas-dev 'setuptools<60' - C:\\Miniconda\\condabin\\mamba.bat list -n pandas-dev - displayName: 'Create anaconda environment' - - bash: | - source activate pandas-dev - conda list - python setup.py build_ext -q -j 2 - python -m pip install --no-build-isolation -e . - displayName: 'Build' - - bash: | - source activate pandas-dev - wmic.exe cpu get caption, deviceid, name, numberofcores, maxclockspeed - ci/run_tests.sh - displayName: 'Test' - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - failTaskOnFailedTests: true - testResultsFiles: 'test-data.xml' - testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} - displayName: 'Publish test results' From cd99b01946ef9f684ad25b1bf601dc2a62aaee36 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 11 May 2022 07:07:07 -0500 Subject: [PATCH 16/49] Backport PR #46991: CI/TST: Fix test for pyarrow 8.0 release (#46992) Co-authored-by: Matthew Roeschke --- pandas/compat/pyarrow.py | 2 ++ pandas/tests/io/test_parquet.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 00b205d018e89..eef2bb6639c36 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -14,6 +14,7 @@ pa_version_under5p0 = _palv < Version("5.0.0") pa_version_under6p0 = _palv < Version("6.0.0") pa_version_under7p0 = _palv < Version("7.0.0") + pa_version_under8p0 = _palv < Version("8.0.0") except ImportError: pa_version_under1p01 = True pa_version_under2p0 = True @@ -22,3 +23,4 @@ pa_version_under5p0 = True pa_version_under6p0 = True pa_version_under7p0 = True + pa_version_under8p0 = True diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d2a3a2eebef02..3df59a2eeef1f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,6 +17,7 @@ pa_version_under2p0, pa_version_under5p0, pa_version_under6p0, + pa_version_under8p0, ) import pandas.util._test_decorators as td @@ -717,11 +718,14 @@ def test_duplicate_columns(self, pa): df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() self.check_error_on_write(df, pa, ValueError, "Duplicate column names found") - def test_unsupported(self, pa): - # timedelta + def test_timedelta(self, pa): df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) - self.check_external_error_on_write(df, pa, NotImplementedError) + if pa_version_under8p0: + self.check_external_error_on_write(df, pa, NotImplementedError) + else: + check_round_trip(df, pa) + def test_unsupported(self, pa): # mixed python objects df = pd.DataFrame({"a": ["a", 1, 2.0]}) # pyarrow 0.11 raises ArrowTypeError From fce6e05d8f3aaf62500870446f8724f905808ab3 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 14 May 2022 23:32:06 -0500 Subject: [PATCH 17/49] Backport PR #46981: CI: Move MacOS build from Azure to GHA (#47007) Co-authored-by: Matthew Roeschke --- .../{windows.yml => macos-windows.yml} | 17 +++++-- 
azure-pipelines.yml | 5 -- ci/azure/posix.yml | 50 ------------------- ci/setup_env.sh | 9 ---- 4 files changed, 13 insertions(+), 68 deletions(-) rename .github/workflows/{windows.yml => macos-windows.yml} (70%) delete mode 100644 ci/azure/posix.yml diff --git a/.github/workflows/windows.yml b/.github/workflows/macos-windows.yml similarity index 70% rename from .github/workflows/windows.yml rename to .github/workflows/macos-windows.yml index 6f267357554a3..560a421ec74ec 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/macos-windows.yml @@ -1,4 +1,4 @@ -name: Windows +name: Windows-MacOS on: push: @@ -21,18 +21,20 @@ env: jobs: pytest: - runs-on: windows-latest defaults: run: shell: bash -el {0} timeout-minutes: 90 strategy: matrix: + os: [macos-latest, windows-latest] env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] fail-fast: false + runs-on: ${{ matrix.os }} + name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} concurrency: # https://github.amrom.workers.devmunity/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-windows + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} cancel-in-progress: true steps: @@ -47,10 +49,17 @@ jobs: mamba-version: "*" channels: conda-forge activate-environment: pandas-dev - channel-priority: strict + channel-priority: ${{ matrix.os == 'macos-latest' && 'flexible' || 'strict' }} environment-file: ci/deps/${{ matrix.env_file }} use-only-tar-bz2: true + # ImportError: 2): Library not loaded: @rpath/libssl.1.1.dylib + # Referenced from: /Users/runner/miniconda3/envs/pandas-dev/lib/libthrift.0.13.0.dylib + # Reason: image not found + - name: Upgrade pyarrow on MacOS + run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=6 + if: ${{ matrix.os == 'macos-latest' }} + - name: Build Pandas uses: ./.github/actions/build_pandas diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0b2a9f5b2b0cd..0c6195ff6924b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,11 +22,6 @@ variables: PANDAS_CI: 1 jobs: -- template: ci/azure/posix.yml - parameters: - name: macOS - vmImage: macOS-10.15 - - job: py38_32bit pool: vmImage: ubuntu-18.04 diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml deleted file mode 100644 index df1d5049be33d..0000000000000 --- a/ci/azure/posix.yml +++ /dev/null @@ -1,50 +0,0 @@ -parameters: - name: '' - vmImage: '' - -jobs: -- job: ${{ parameters.name }} - timeoutInMinutes: 90 - pool: - vmImage: ${{ parameters.vmImage }} - strategy: - matrix: - py38: - ENV_FILE: ci/deps/actions-38.yaml - CONDA_PY: "38" - - py39: - ENV_FILE: ci/deps/actions-39.yaml - CONDA_PY: "39" - - py310: - ENV_FILE: ci/deps/actions-310.yaml - CONDA_PY: "310" - - steps: - - script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - displayName: 'Set conda path' - - - script: rm /usr/local/miniconda/pkgs/cache/*.json - displayName: 'Workaround for mamba-org/mamba#488' - - - script: ci/setup_env.sh - displayName: 'Setup environment and build pandas' - - - script: | - conda run -n pandas-dev --no-capture-output ci/run_tests.sh - displayName: 'Test' - - - script: | - pushd /tmp - conda run -n pandas-dev python -c "import pandas; pandas.show_versions()" - popd - displayName: 'Build versions' - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - failTaskOnFailedTests: true - testResultsFiles: 'test-data.xml' - testRunTitle: 
${{ format('{0}-$(CONDA_PY)', parameters.name) }} - displayName: 'Publish test results' diff --git a/ci/setup_env.sh b/ci/setup_env.sh index a85767eb6f1b4..483353cfcb3cd 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -73,15 +73,6 @@ mamba install -n pandas-dev 'setuptools<60' echo "conda list -n pandas-dev" conda list -n pandas-dev -# From pyarrow on MacOS -# ImportError: 2): Library not loaded: @rpath/libssl.1.1.dylib -# Referenced from: /Users/runner/miniconda3/envs/pandas-dev/lib/libthrift.0.13.0.dylib -# Reason: image not found -if [[ "$(uname)" == 'Darwin' ]]; then - echo "Update pyarrow for pyarrow on MacOS" - conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=6 -fi - if [[ "$BITS32" == "yes" ]]; then # activate 32-bit compiler export CONDA_BUILD=1 From b9e83487e774f02d2dbdc6746c8538802253c763 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 15 May 2022 14:53:41 +0100 Subject: [PATCH 18/49] Backport PR #47020: CI: Move 32 bit Linux build to GHA (#47029) Co-authored-by: Matthew Roeschke --- .github/workflows/32-bit-linux.yml | 43 ++++++++++++++++ .github/workflows/python-dev.yml | 2 +- README.md | 1 - azure-pipelines.yml | 50 ------------------- .../development/contributing_codebase.rst | 6 +-- pandas/conftest.py | 2 +- pandas/tests/io/conftest.py | 2 +- pandas/tests/window/test_numba.py | 2 +- pandas/tests/window/test_online.py | 2 +- 9 files changed, 50 insertions(+), 60 deletions(-) create mode 100644 .github/workflows/32-bit-linux.yml delete mode 100644 azure-pipelines.yml diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml new file mode 100644 index 0000000000000..500e800a082d9 --- /dev/null +++ b/.github/workflows/32-bit-linux.yml @@ -0,0 +1,43 @@ +name: 32 Bit Linux + +on: + push: + branches: + - main + - 1.4.x + pull_request: + branches: + - main + - 1.4.x + paths-ignore: + - "doc/**" + +jobs: + pytest: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Run 32-bit manylinux2014 Docker Build / Tests + run: | + docker pull quay.io/pypa/manylinux2014_i686 + docker run --platform linux/386 -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ + /bin/bash -xc "cd pandas && \ + /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ + . ~/virtualenvs/pandas-dev/bin/activate && \ + python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ + pip install cython numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \ + python setup.py build_ext -q -j2 && \ + python -m pip install --no-build-isolation --no-use-pep517 -e . 
&& \ + export PANDAS_CI=1 && \ + pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml" + + - name: Publish test results for Python 3.8-32 bit full Linux + uses: actions/upload-artifact@v3 + with: + name: Test results + path: test-data.xml + if: failure() diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 8ca4cce155e96..23a48e567dfe9 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -2,7 +2,7 @@ # Unfreeze(by commentingthe if: false() condition) once the # next Python Dev version has released beta 1 and both Cython and numpy support it # After that Python has released, migrate the workflows to the -# posix GHA workflows/Azure pipelines and "freeze" this file by +# posix GHA workflows and "freeze" this file by # uncommenting the if: false() condition # Feel free to modify this comment as necessary. diff --git a/README.md b/README.md index 26aed081de4af..4eb983cfb24e8 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,6 @@ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) [![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE) -[![Azure Build Status](https://dev.azure.com/pandas-dev/pandas/_apis/build/status/pandas-dev.pandas?branch=main)](https://dev.azure.com/pandas-dev/pandas/_build/latest?definitionId=1&branch=main) [![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=main)](https://codecov.io/gh/pandas-dev/pandas) [![Downloads](https://static.pepy.tech/personalized-badge/pandas?period=month&units=international_system&left_color=black&right_color=orange&left_text=PyPI%20downloads%20per%20month)](https://pepy.tech/project/pandas) [![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas) diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 0c6195ff6924b..0000000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,50 +0,0 @@ -# Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml -trigger: - branches: - include: - - main - - 1.4.x - paths: - exclude: - - 'doc/*' - -pr: - autoCancel: true - branches: - include: - - main - - 1.4.x - -variables: - PYTEST_WORKERS: auto - PYTEST_TARGET: pandas - PATTERN: "not slow and not high_memory and not db and not network and not single_cpu" - PANDAS_CI: 1 - -jobs: -- job: py38_32bit - pool: - vmImage: ubuntu-18.04 - - steps: - # TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941 - - script: | - docker pull quay.io/pypa/manylinux2014_i686 - docker run -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ - /bin/bash -xc "cd pandas && \ - /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ - . ~/virtualenvs/pandas-dev/bin/activate && \ - python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ - pip install cython numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \ - python setup.py build_ext -q -j2 && \ - python -m pip install --no-build-isolation -e . 
&& \ - export PANDAS_CI=1 && \ - pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml" - displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' - - - task: PublishTestResults@2 - condition: succeededOrFailed() - inputs: - testResultsFiles: '**/test-*.xml' - failTaskOnFailedTests: true - testRunTitle: 'Publish test results for Python 3.8-32 bit full Linux' diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 4826921d4866b..6a8f07663578e 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -443,13 +443,11 @@ library. This makes type checkers aware of the type annotations shipped with pan Testing with continuous integration ----------------------------------- -The pandas test suite will run automatically on `GitHub Actions `__ and -`Azure Pipelines `__ +The pandas test suite will run automatically on `GitHub Actions `__ continuous integration services, once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, then the continuous integration services need to be hooked to your GitHub repository. Instructions are here -for `GitHub Actions `__ and -`Azure Pipelines `__. +for `GitHub Actions `__. A pull-request will be considered for merging when you have an all 'green' build. If any tests are failing, then you will get a red 'X', where you can click through to see the individual failed tests. diff --git a/pandas/conftest.py b/pandas/conftest.py index 958df72b3f607..148f8bea16b0c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -649,7 +649,7 @@ def index_with_missing(request): """ # GH 35538. 
Use deep copy to avoid illusive bug on np-dev - # Azure pipeline that writes into indices_dict despite copy + # GHA pipeline that writes into indices_dict despite copy ind = indices_dict[request.param].copy(deep=True) vals = ind.values if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index ff31d93947776..522d25205eeb0 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -71,7 +71,7 @@ def s3_base(worker_id): if is_platform_arm() or is_platform_mac() or is_platform_windows(): # NOT RUN on Windows/MacOS/ARM, only Ubuntu # - subprocess in CI can cause timeouts - # - Azure pipelines/Github Actions do not support + # - Github Actions do not support # container services for the above OSs # - CircleCI will probably hit the Docker rate pull limit pytest.skip( diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 9fe7ae7a5bb90..cf63ab2fe31c7 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -21,7 +21,7 @@ # TODO(GH#44584): Mark these as pytest.mark.single_cpu pytestmark = pytest.mark.skipif( is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On Azure CI, Windows can fail with " + reason="On GHA CI, Windows can fail with " "'Windows fatal exception: stack overflow' " "and MacOS can timeout", ) diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index ab435a39a497b..b98129e1b07ec 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -17,7 +17,7 @@ # TODO(GH#44584): Mark these as pytest.mark.single_cpu pytestmark = pytest.mark.skipif( is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On Azure CI, Windows can fail with " + reason="On GHA CI, Windows can fail with " "'Windows fatal exception: stack overflow' " "and MacOS can timeout", ) From cba1b55fbb9c5e1112e6059ebbf4905f43ff82b5 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 15 May 2022 10:22:52 -0500 Subject: [PATCH 19/49] Backport PR #47015: CI: Ensure no-use-pep517 with no-build-isolation with new pip version (#47031) Co-authored-by: Matthew Roeschke --- ci/setup_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 483353cfcb3cd..80448319f7918 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -104,6 +104,6 @@ echo "Build extensions" python setup.py build_ext -q -j3 echo "Install pandas" -python -m pip install --no-build-isolation -e . +python -m pip install --no-build-isolation --no-use-pep517 -e . 
echo "done"

From 45918492815964203494105f631a85da840aec86 Mon Sep 17 00:00:00 2001
From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com>
Date: Tue, 17 May 2022 06:06:53 -0500
Subject: [PATCH 20/49] Backport PR #46394: CI: Use conda-forge PyPy (#47040)

Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
---
 .github/workflows/posix.yml  | 13 -------------
 ci/deps/actions-pypy-38.yaml |  1 +
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml
index b86dcea59edb8..35c40f2a4aa54 100644
--- a/.github/workflows/posix.yml
+++ b/.github/workflows/posix.yml
@@ -155,24 +155,11 @@ jobs:
         channel-priority: flexible
         environment-file: ${{ env.ENV_FILE }}
         use-only-tar-bz2: true
-      if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support

     - name: Upgrade Arrow version
       run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=${{ matrix.pyarrow_version }}
       if: ${{ matrix.pyarrow_version }}

-    - name: Setup PyPy
-      uses: actions/setup-python@v3
-      with:
-        python-version: "pypy-3.8"
-      if: ${{ env.IS_PYPY == 'true' }}
-
-    - name: Setup PyPy dependencies
-      run: |
-        # TODO: re-enable cov, its slowing the tests down though
-        pip install Cython numpy python-dateutil pytz pytest>=6.0 pytest-xdist>=1.31.0 pytest-asyncio>=0.17 hypothesis>=5.5.3
-      if: ${{ env.IS_PYPY == 'true' }}
-
     - name: Build Pandas
       uses: ./.github/actions/build_pandas

diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-38.yaml
index ad05d2ab2dacc..eda35ee14ec65 100644
--- a/ci/deps/actions-pypy-38.yaml
+++ b/ci/deps/actions-pypy-38.yaml
@@ -11,6 +11,7 @@ dependencies:
   - cython>=0.29.24
   - pytest>=6.0
   - pytest-cov
+  - pytest-asyncio
   - pytest-xdist>=1.31
   - hypothesis>=5.5.3

From 806fabadc7c39ddcf0d4ba509802c15d6b80715b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 19 May 2022 16:03:25 -0700
Subject: [PATCH 21/49] Backport PR #47057 on branch 1.4.x (BUG:
 groupby.transform/agg with engine='numba' and a MultiIndex) (#47062)

BUG: groupby.transform/agg with engine='numba' and a MultiIndex (#47057)

Co-authored-by: Jeff Reback
(cherry picked from commit c4027ad704c452ffb0f5ba962b5b789decbd29bd)
---
 doc/source/whatsnew/v1.4.3.rst                |  3 ++-
 pandas/core/groupby/groupby.py                | 11 +++++++-
 pandas/tests/groupby/aggregate/test_numba.py  | 27 ++++++++++++++++++++
 pandas/tests/groupby/transform/test_numba.py  | 27 ++++++++++++++++++++
 4 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst
index 0c326e15d90ed..7c09eec212d69 100644
--- a/doc/source/whatsnew/v1.4.3.rst
+++ b/doc/source/whatsnew/v1.4.3.rst
@@ -16,7 +16,8 @@ Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed regression in :meth:`DataFrame.nsmallest` that led to wrong results when ``np.nan`` was in the sorting column (:issue:`46589`)
 - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`)
--
+- Fixed regression in :meth:`.GroupBy.transform` and :meth:`.GroupBy.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`)
+- Fixed regression in :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`)

 .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2afce8898134e..1adadf75ca0cb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1257,7 +1257,16 @@ def _numba_prep(self, func, data): sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False) sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() - sorted_index_data = data.index.take(sorted_index).to_numpy() + if len(self.grouper.groupings) > 1: + raise NotImplementedError( + "More than 1 grouping labels are not supported with engine='numba'" + ) + # GH 46867 + index_data = data.index + if isinstance(index_data, MultiIndex): + group_key = self.grouper.groupings[0].name + index_data = index_data.get_level_values(group_key) + sorted_index_data = index_data.take(sorted_index).to_numpy() starts, ends = lib.generate_slices(sorted_ids, ngroups) return ( diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index e7fa2e0690066..f6fccfb7d43de 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -187,3 +187,30 @@ def f(values, index): [-1.5, -3.0], columns=["v"], index=Index(["A", "B"], name="group") ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_multiindex_one_key(nogil, parallel, nopython): + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + result = df.groupby("A").agg( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"]) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + with pytest.raises(NotImplementedError, match="More than 1 grouping labels"): + df.groupby(["A", "B"]).agg( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 4e1b777296d5b..93ce7287c59ab 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -176,3 +176,30 @@ def f(values, index): result = df.groupby("group").transform(f, engine="numba") expected = DataFrame([-4.0, -3.0, -2.0], columns=["v"], index=[-1, -2, -3]) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_multiindex_one_key(nogil, parallel, nopython): + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel} + result = df.groupby("A").transform( + numba_func, engine="numba", engine_kwargs=engine_kwargs + ) + expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): + def numba_func(values, index): + return 1 + + df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"]) + engine_kwargs = {"nopython": nopython, "nogil": 
nogil, "parallel": parallel}
+    with pytest.raises(NotImplementedError, match="More than 1 grouping labels"):
+        df.groupby(["A", "B"]).transform(
+            numba_func, engine="numba", engine_kwargs=engine_kwargs
+        )

From 59de03a9d6b48245f0d8b047ab052bdddaa068db Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Sat, 21 May 2022 20:54:26 +0100
Subject: [PATCH 22/49] Backport PR #47059 on branch 1.4.x (REGR: Styler buf
 and encoding in to_latex and to_html) (#47072)

---
 pandas/io/formats/style.py                  | 11 ++++++-----
 pandas/tests/io/formats/style/test_style.py |  7 +++++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 10cf8541effd6..4a8169c0609fd 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -1049,10 +1049,12 @@ def to_latex(
             clines=clines,
         )

-        encoding = encoding or get_option("styler.render.encoding")
-        return save_to_buffer(
-            latex, buf=buf, encoding=None if buf is None else encoding
+        encoding = (
+            (encoding or get_option("styler.render.encoding"))
+            if isinstance(buf, str)  # i.e. a filepath
+            else encoding
         )
+        return save_to_buffer(latex, buf=buf, encoding=encoding)

     def to_html(
         self,
@@ -1173,7 +1175,6 @@ def to_html(
         if caption is not None:
             obj.set_caption(caption)

-        encoding = encoding or get_option("styler.render.encoding")
         # Build HTML string..
         html = obj._render_html(
             sparse_index=sparse_index,
@@ -1181,7 +1182,7 @@ def to_html(
             max_rows=max_rows,
             max_cols=max_columns,
             exclude_styles=exclude_styles,
-            encoding=encoding,
+            encoding=encoding or get_option("styler.render.encoding"),
             doctype_html=doctype_html,
             **kwargs,
         )
diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py
index fa054ff7ca6c0..1a056f8cb3363 100644
--- a/pandas/tests/io/formats/style/test_style.py
+++ b/pandas/tests/io/formats/style/test_style.py
@@ -1547,3 +1547,10 @@ def test_col_trimming_hide_columns():
         assert ctx["head"][0][c + 2]["is_visible"] == vals[1]

     assert len(ctx["body"][0]) == 6  # index + 2 hidden + 2 visible + trimming col
+
+
+@pytest.mark.parametrize("format", ["html", "latex"])
+def test_output_buffer(mi_styler, format):
+    # gh 47053
+    with open(f"delete_me.{format}", "w") as f:
+        getattr(mi_styler, f"to_{format}")(f)
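A minimal sketch of the behavior the patch above restores — writing ``Styler`` output to an already-open buffer rather than a filepath, so no default encoding is resolved. This is not taken from the patch itself: the ``io.StringIO`` buffer and the two-row frame are illustrative assumptions, and jinja2 is assumed to be installed for ``DataFrame.style``.

```python
# Sketch only: exercises the buffer code path of Styler.to_latex after the fix.
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
buf = io.StringIO()
df.style.to_latex(buf)  # buf is a buffer, not a filepath, so encoding stays None
assert "\\begin{tabular}" in buf.getvalue()
```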
``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) - Fixed regression is :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`) +- Fixed regression in :meth:`.DataFrameGroupBy.agg` when used with list-likes or dict-likes and ``axis=1`` that would give incorrect results; now raises ``NotImplementedError`` (:issue:`46995`) +- Fixed regression in :meth:`DataFrame.resample` and :meth:`DataFrame.rolling` when used with list-likes or dict-likes and ``axis=1`` that would raise an unintuitive error message; now raises ``NotImplementedError`` (:issue:`46904`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 64ee843f1d946..53e1b01d2a3d0 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -325,6 +325,9 @@ def agg_list_like(self) -> DataFrame | Series: obj = self.obj arg = cast(List[AggFuncTypeBase], self.f) + if getattr(obj, "axis", 0) == 1: + raise NotImplementedError("axis other than 0 is not supported") + if not isinstance(obj, SelectionMixin): # i.e. obj is Series or DataFrame selected_obj = obj @@ -456,6 +459,9 @@ def agg_dict_like(self) -> DataFrame | Series: obj = self.obj arg = cast(AggFuncTypeDict, self.f) + if getattr(obj, "axis", 0) == 1: + raise NotImplementedError("axis other than 0 is not supported") + if not isinstance(obj, SelectionMixin): # i.e. obj is Series or DataFrame selected_obj = obj diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 1ea44871eea4d..15ad81e3ffba8 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1393,3 +1393,14 @@ def test_groupby_complex_raises(func): msg = "No matching signature found" with pytest.raises(TypeError, match=msg): data.groupby(data.index % 2).agg(func) + + +@pytest.mark.parametrize( + "func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}] +) +def test_multi_axis_1_raises(func): + # GH#46995 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]}) + gb = df.groupby("a", axis=1) + with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"): + gb.agg(func) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index bb49450b8414e..c052870fe0bfd 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -513,6 +513,20 @@ def test_agg_misc(): t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) +@pytest.mark.parametrize( + "func", [["min"], ["mean", "max"], {"A": "sum"}, {"A": "prod", "B": "median"}] +) +def test_multi_agg_axis_1_raises(func): + # GH#46904 + np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index).T + res = df.resample("M", axis=1) + with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"): + res.agg(func) + + def test_agg_nested_dicts(): np.random.seed(1234) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index f84a579247630..e551aceab762b 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -127,6 +127,17 @@ def test_agg(): tm.assert_frame_equal(result, expected, check_like=True) +@pytest.mark.parametrize( + "func", 
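# The same guard is exercised here for rolling: before 1.4.3, list-like or
# dict-like aggregations with axis=1 gave incorrect results (groupby) or an
# unintuitive error (resample/rolling). A quick sketch (hypothetical frame):
#
#     import pandas as pd
#
#     df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]})
#     df.rolling(window=3, axis=1).agg(["min"])
#     # NotImplementedError: axis other than 0 is not supported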
[["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}] +) +def test_multi_axis_1_raises(func): + # GH#46904 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]}) + r = df.rolling(window=3, axis=1) + with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"): + r.agg(func) + + def test_agg_apply(raw): # passed lambda From 4bb454ff51dd2919faffe991cd4481082373e2df Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 25 May 2022 14:31:52 -0500 Subject: [PATCH 24/49] Backport PR #47096 on branch 1.4.x (fix pandas.show_versions() and remove pin for setuptools) (#47118) Backport PR #47096: fix pandas.show_versions() and remove pin for setuptools Co-authored-by: Simon Hawkins --- .github/workflows/python-dev.yml | 3 +-- .github/workflows/sdist.yml | 5 +---- ci/setup_env.sh | 5 +---- pandas/tests/util/test_show_versions.py | 9 --------- pandas/util/_print_versions.py | 2 +- 5 files changed, 4 insertions(+), 20 deletions(-) diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 23a48e567dfe9..753e288f5e391 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -54,11 +54,10 @@ jobs: with: python-version: '3.11-dev' - # TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941 - name: Install dependencies shell: bash -el {0} run: | - python -m pip install --upgrade pip "setuptools<60.0.0" wheel + python -m pip install --upgrade pip setuptools wheel pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy pip install git+https://github.com/nedbat/coveragepy.git pip install cython python-dateutil pytz hypothesis pytest>=6.2.5 pytest-xdist pytest-cov diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml index 8406743889f71..4eb780d941031 100644 --- a/.github/workflows/sdist.yml +++ b/.github/workflows/sdist.yml @@ -39,10 +39,9 @@ jobs: with: python-version: ${{ matrix.python-version }} - # TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941 - name: Install dependencies run: | - python -m pip install --upgrade pip "setuptools<60.0.0" wheel + python -m pip install --upgrade pip setuptools wheel # GH 39416 pip install numpy @@ -64,10 +63,8 @@ jobs: channels: conda-forge python-version: '${{ matrix.python-version }}' - # TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941 - name: Install pandas from sdist run: | - python -m pip install --upgrade "setuptools<60.0.0" pip list python -m pip install dist/*.gz diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 80448319f7918..c03a7ff4be8b3 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -51,8 +51,7 @@ echo echo "update conda" conda config --set ssl_verify false conda config --set quiet true --set always_yes true --set changeps1 false -# TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941 -conda install -y -c conda-forge -n base 'mamba>=0.21.2' pip +conda install -y -c conda-forge -n base 'mamba>=0.21.2' pip setuptools echo "conda info -a" conda info -a @@ -67,8 +66,6 @@ echo "mamba env update --file=${ENV_FILE}" # See https://github.com/mamba-org/mamba/issues/633 mamba create -q -n pandas-dev time mamba env update -n pandas-dev --file="${ENV_FILE}" -# TODO: GH#44980 https://github.com/pypa/setuptools/issues/2941 -mamba install -n pandas-dev 'setuptools<60' echo "conda list -n pandas-dev" conda list -n 
pandas-dev diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 468a5e544122c..4a962520460b0 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -7,7 +7,6 @@ from pandas.compat import ( IS64, is_ci_environment, - is_numpy_dev, ) from pandas.util._print_versions import ( _get_dependency_info, @@ -16,14 +15,6 @@ import pandas as pd -# This is failing on the Numpy Dev build, -# but the error may just be from distutils? -pytestmark = pytest.mark.xfail( - is_numpy_dev, - reason="_distutils not in python3.10/distutils/core.py", - raises=AssertionError, -) - @pytest.mark.filterwarnings( # openpyxl diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 289900c47375c..91d518d1ab496 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -60,8 +60,8 @@ def _get_dependency_info() -> dict[str, JSONSerializable]: "pytz", "dateutil", # install / build, - "pip", "setuptools", + "pip", "Cython", # test "pytest", From 3b3162b9ad0ae759d36499c6c7306281155b6a56 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 25 May 2022 20:32:07 -0500 Subject: [PATCH 25/49] Backport PR #46636 on branch 1.4.x (REGR: Replace changes the dtype of other columns) (#47123) Backport PR #46636: REGR: Replace changes the dtype of other columns Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/internals/blocks.py | 14 ++++++++------ pandas/tests/frame/methods/test_replace.py | 19 +++++++++++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 415a3ff4efda0..bf414ab77cf65 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -14,6 +14,7 @@ including other versions of pandas. 
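A compact illustration of the regression recorded in the entry below (GH 46634; hypothetical data): before the fix, ``replace({3: None})`` cast the untouched categorical column to object even though it contains no ``3``; with the fix, only the column where a replacement actually happens changes dtype.

    import pandas as pd

    df = pd.DataFrame(
        {
            "id": pd.Series([5.0, 4.0, 3.0, 2.0]),
            "col": pd.Series(["b", "b", "b", "d"], dtype="category"),
        }
    )
    df.replace({3: None}).dtypes  # col stays category; id becomes object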
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8a09e4ff2d5b7..941b1648a9778 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -875,12 +875,14 @@ def _replace_coerce( ) else: if value is None: - # gh-45601, gh-45836 - nb = self.astype(np.dtype(object), copy=False) - if nb is self and not inplace: - nb = nb.copy() - putmask_inplace(nb.values, mask, value) - return [nb] + # gh-45601, gh-45836, gh-46634 + if mask.any(): + nb = self.astype(np.dtype(object), copy=False) + if nb is self and not inplace: + nb = nb.copy() + putmask_inplace(nb.values, mask, value) + return [nb] + return [self] if inplace else [self.copy()] return self.replace( to_replace=to_replace, value=value, inplace=inplace, mask=mask ) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index b84af7b0e0b52..fd2044fd4fa7a 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -675,6 +675,25 @@ def test_replace_NAT_with_None(self): expected = DataFrame([None, None]) tm.assert_frame_equal(result, expected) + def test_replace_with_None_keeps_categorical(self): + # gh-46634 + cat_series = Series(["b", "b", "b", "d"], dtype="category") + df = DataFrame( + { + "id": Series([5, 4, 3, 2], dtype="float64"), + "col": cat_series, + } + ) + result = df.replace({3: None}) + + expected = DataFrame( + { + "id": Series([5.0, 4.0, None, 2.0], dtype="object"), + "col": cat_series, + } + ) + tm.assert_frame_equal(result, expected) + def test_replace_value_is_none(self, datetime_frame): orig_value = datetime_frame.iloc[0, 0] orig2 = datetime_frame.iloc[1, 0] From a9dcba2796278069896c870662194ba7d32ba502 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 27 May 2022 08:14:53 -0500 Subject: [PATCH 26/49] Backport PR #47135 on branch 1.4.x (CLN: ensure clean up styler tests) (#47141) Backport PR #47135: CLN: ensure clean up styler tests Co-authored-by: JHM Darbyshire <24256554+attack68@users.noreply.github.com> --- pandas/tests/io/formats/style/test_style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 1a056f8cb3363..ab0fa8349b9c7 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -1552,5 +1552,5 @@ def test_col_trimming_hide_columns(): @pytest.mark.parametrize("format", ["html", "latex"]) def test_output_buffer(mi_styler, format): # gh 47053 - with open(f"delete_me.{format}", "w") as f: + with tm.ensure_clean(f"delete_me.{format}") as f: getattr(mi_styler, f"to_{format}")(f) From 8319ca36bc898eea7cc4b1e78655577b8c3ef163 Mon Sep 17 
00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 27 May 2022 09:26:37 -0500 Subject: [PATCH 27/49] Backport PR #47139 on branch 1.4.x (REGR: index_col False and header=None inferring index names in some cases) (#47142) Backport PR #47139: REGR: index_col False and header=None inferring index names in some cases Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/io/parsers/python_parser.py | 2 +- pandas/tests/io/parser/test_python_parser_only.py | 12 ++++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index bf414ab77cf65..f0e208a117068 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) - Fixed regression is :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`) +- Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`) - Fixed regression in :meth:`.DataFrameGroupBy.agg` when used with list-likes or dict-likes and ``axis=1`` that would give incorrect results; now raises ``NotImplementedError`` (:issue:`46995`) - Fixed regression in :meth:`DataFrame.resample` and :meth:`DataFrame.rolling` when used with list-likes or dict-likes and ``axis=1`` that would raise an unintuitive error message; now raises ``NotImplementedError`` (:issue:`46904`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 04639123c5cfc..3b20777370e26 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -917,7 +917,7 @@ def _get_index_name(self, columns: list[Hashable]): implicit_first_cols = len(line) - self.num_original_columns # Case 0 - if next_line is not None: + if next_line is not None and self.header is not None: if len(next_line) == len(line) + self.num_original_columns: # column and index names on diff rows self.index_col = list(range(len(line))) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 73a6c8226b554..dadf1903ba1c7 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -458,3 +458,15 @@ def test_on_bad_lines_index_col_inferred(python_parser_only): result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"]) expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4]) tm.assert_frame_equal(result, expected) + + +def test_index_col_false_and_header_none(python_parser_only): + # GH#46955 + parser = python_parser_only + data = """ +0.5,0.03 +0.1,0.2,0.3,2 +""" + result = parser.read_csv(StringIO(data), sep=",", header=None, index_col=False) + expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]}) + tm.assert_frame_equal(result, expected) From dd1e611fc231a5a823347bf304093a9eede3d6a5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 27 May 2022 16:23:23 +0100 Subject: [PATCH 28/49] Backport PR #47093 on branch 1.4.x (DOC/CI: Use more explicit data setup 
in user guides) (#47144) Backport PR #47093: DOC/CI: Use more explicit data setup in user guides Co-authored-by: Matthew Roeschke --- doc/data/fx_prices | Bin 16177 -> 0 bytes doc/data/mindex_ex.csv | 16 -- doc/data/test.xls | Bin 30720 -> 0 bytes doc/source/user_guide/io.rst | 259 +++++++++--------------- doc/source/user_guide/scale.rst | 19 ++ doc/source/whatsnew/v0.9.1.rst | 9 +- pandas/tests/util/test_show_versions.py | 4 +- 7 files changed, 127 insertions(+), 180 deletions(-) delete mode 100644 doc/data/fx_prices delete mode 100644 doc/data/mindex_ex.csv delete mode 100644 doc/data/test.xls diff --git a/doc/data/fx_prices b/doc/data/fx_prices deleted file mode 100644 index 38cadf26909a37e119c1c1cf93a7de94a9020d26..0000000000000000000000000000000000000000 Binary files a/doc/data/fx_prices and /dev/null differ diff --git a/doc/data/mindex_ex.csv b/doc/data/mindex_ex.csv deleted file mode 100644 index 935ff936cd842..0000000000000 --- a/doc/data/mindex_ex.csv +++ /dev/null @@ -1,16 +0,0 @@ -year,indiv,zit,xit -1977,"A",1.2,.6 -1977,"B",1.5,.5 -1977,"C",1.7,.8 -1978,"A",.2,.06 -1978,"B",.7,.2 -1978,"C",.8,.3 -1978,"D",.9,.5 -1978,"E",1.4,.9 -1979,"C",.2,.15 -1979,"D",.14,.05 -1979,"E",.5,.15 -1979,"F",1.2,.5 -1979,"G",3.4,1.9 -1979,"H",5.4,2.7 -1979,"I",6.4,1.2 diff --git a/doc/data/test.xls b/doc/data/test.xls deleted file mode 100644 index db0f9dec7d5e42c87dc0b0d297b66305f7af4225..0000000000000000000000000000000000000000 Binary files a/doc/data/test.xls and /dev/null differ diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index be761bb97f320..705861a3aa568 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -837,14 +837,11 @@ input text data into ``datetime`` objects. The simplest case is to just pass in ``parse_dates=True``: .. ipython:: python - :suppress: f = open("foo.csv", "w") f.write("date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5") f.close() -.. ipython:: python - # Use a column as an index, and parse it as dates.
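# (A rough sketch of the effect, runnable outside the docs build against the
# foo.csv written above: the first column becomes the index and is parsed
# into a DatetimeIndex.)
#
#     import pandas as pd
#
#     df = pd.read_csv("foo.csv", index_col=0, parse_dates=True)
#     df.index  # DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], ...)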
df = pd.read_csv("foo.csv", index_col=0, parse_dates=True) df @@ -862,7 +859,6 @@ order) and the new column names will be the concatenation of the component column names: .. ipython:: python - :suppress: data = ( "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" @@ -876,9 +872,6 @@ column names: with open("tmp.csv", "w") as fh: fh.write(data) -.. ipython:: python - - print(open("tmp.csv").read()) df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) df @@ -1058,19 +1051,20 @@ While US date formats tend to be MM/DD/YYYY, many international formats use DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: .. ipython:: python - :suppress: data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + print(data) with open("tmp.csv", "w") as fh: fh.write(data) -.. ipython:: python - - print(open("tmp.csv").read()) - pd.read_csv("tmp.csv", parse_dates=[0]) pd.read_csv("tmp.csv", dayfirst=True, parse_dates=[0]) +.. ipython:: python + :suppress: + + os.remove("tmp.csv") + Writing CSVs to binary file objects +++++++++++++++++++++++++++++++++++ @@ -1133,8 +1127,9 @@ For large numbers that have been written with a thousands separator, you can set the ``thousands`` keyword to a string of length 1 so that integers will be parsed correctly: +By default, numbers with a thousands separator will be parsed as strings: + .. ipython:: python - :suppress: data = ( "ID|level|category\n" @@ -1146,11 +1141,6 @@ correctly: with open("tmp.csv", "w") as fh: fh.write(data) -By default, numbers with a thousands separator will be parsed as strings: - -.. ipython:: python - - print(open("tmp.csv").read()) df = pd.read_csv("tmp.csv", sep="|") df @@ -1160,7 +1150,6 @@ The ``thousands`` keyword allows integers to be parsed correctly: .. ipython:: python - print(open("tmp.csv").read()) df = pd.read_csv("tmp.csv", sep="|", thousands=",") df @@ -1239,16 +1228,13 @@ as a ``Series``: ``read_csv`` instead. .. ipython:: python - :suppress: + :okwarning: data = "level\nPatient1,123000\nPatient2,23000\nPatient3,1234018" with open("tmp.csv", "w") as fh: fh.write(data) -.. ipython:: python - :okwarning: - print(open("tmp.csv").read()) output = pd.read_csv("tmp.csv", squeeze=True) @@ -1365,15 +1351,11 @@ The ``dialect`` keyword gives greater flexibility in specifying the file format. By default it uses the Excel dialect but you can specify either the dialect name or a :class:`python:csv.Dialect` instance. -.. ipython:: python - :suppress: - - data = "label1,label2,label3\n" 'index1,"a,c,e\n' "index2,b,d,f" - Suppose you had data with unenclosed quotes: .. ipython:: python + data = "label1,label2,label3\n" 'index1,"a,c,e\n' "index2,b,d,f" print(data) By default, ``read_csv`` uses the Excel dialect and treats the double quote as @@ -1449,8 +1431,9 @@ a different usage of the ``delimiter`` parameter: Can be used to specify the filler character of the fields if it is not spaces (e.g., '~'). +Consider a typical fixed-width data file: + .. ipython:: python - :suppress: f = open("bar.csv", "w") data1 = ( @@ -1463,12 +1446,6 @@ a different usage of the ``delimiter`` parameter: f.write(data1) f.close() -Consider a typical fixed-width data file: - -.. ipython:: python - - print(open("bar.csv").read()) - In order to parse this file into a ``DataFrame``, we simply need to supply the column specifications to the ``read_fwf`` function along with the file name: @@ -1523,19 +1500,15 @@ Indexes Files with an "implicit" index column +++++++++++++++++++++++++++++++++++++ -.. 
ipython:: python - :suppress: - - f = open("foo.csv", "w") - f.write("A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5") - f.close() - Consider a file with one less entry in the header than the number of data column: .. ipython:: python - print(open("foo.csv").read()) + data = "A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5" + print(data) + with open("foo.csv", "w") as f: + f.write(data) In this special case, ``read_csv`` assumes that the first column is to be used as the index of the ``DataFrame``: @@ -1567,7 +1540,10 @@ Suppose you have data indexed by two columns: .. ipython:: python - print(open("data/mindex_ex.csv").read()) + data = 'year,indiv,zit,xit\n1977,"A",1.2,.6\n1977,"B",1.5,.5' + print(data) + with open("mindex_ex.csv", mode="w") as f: + f.write(data) The ``index_col`` argument to ``read_csv`` can take a list of column numbers to turn multiple columns into a ``MultiIndex`` for the index of the @@ -1575,9 +1551,14 @@ returned object: .. ipython:: python - df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1]) + df = pd.read_csv("mindex_ex.csv", index_col=[0, 1]) df - df.loc[1978] + df.loc[1977] + +.. ipython:: python + :suppress: + + os.remove("mindex_ex.csv") .. _io.multi_index_columns: @@ -1601,16 +1582,12 @@ rows will skip the intervening rows. of multi-columns indices. .. ipython:: python - :suppress: data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12" - fh = open("mi2.csv", "w") - fh.write(data) - fh.close() - -.. ipython:: python + print(data) + with open("mi2.csv", "w") as fh: + fh.write(data) - print(open("mi2.csv").read()) pd.read_csv("mi2.csv", header=[0, 1], index_col=0) Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it @@ -1632,16 +1609,16 @@ comma-separated) files, as pandas uses the :class:`python:csv.Sniffer` class of the csv module. For this, you have to specify ``sep=None``. .. ipython:: python - :suppress: df = pd.DataFrame(np.random.randn(10, 4)) - df.to_csv("tmp.sv", sep="|") - df.to_csv("tmp2.sv", sep=":") + df.to_csv("tmp.csv", sep="|") + df.to_csv("tmp2.csv", sep=":") + pd.read_csv("tmp2.csv", sep=None, engine="python") .. ipython:: python + :suppress: - print(open("tmp2.sv").read()) - pd.read_csv("tmp2.sv", sep=None, engine="python") + os.remove("tmp2.csv") .. _io.multiple_files: @@ -1662,8 +1639,9 @@ rather than reading the entire file into memory, such as the following: .. ipython:: python - print(open("tmp.sv").read()) - table = pd.read_csv("tmp.sv", sep="|") + df = pd.DataFrame(np.random.randn(10, 4)) + df.to_csv("tmp.csv", sep="|") + table = pd.read_csv("tmp.csv", sep="|") table @@ -1672,7 +1650,7 @@ value will be an iterable object of type ``TextFileReader``: .. ipython:: python - with pd.read_csv("tmp.sv", sep="|", chunksize=4) as reader: + with pd.read_csv("tmp.csv", sep="|", chunksize=4) as reader: reader for chunk in reader: print(chunk) @@ -1685,14 +1663,13 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: .. ipython:: python - with pd.read_csv("tmp.sv", sep="|", iterator=True) as reader: + with pd.read_csv("tmp.csv", sep="|", iterator=True) as reader: reader.get_chunk(5) .. ipython:: python :suppress: - os.remove("tmp.sv") - os.remove("tmp2.sv") + os.remove("tmp.csv") Specifying the parser engine '''''''''''''''''''''''''''' @@ -2594,27 +2571,38 @@ Read in the content of the file from the above URL and pass it to ``read_html`` as a string: .. 
ipython:: python - :suppress: - rel_path = os.path.join("..", "pandas", "tests", "io", "data", "html", - "banklist.html") - file_path = os.path.abspath(rel_path) + html_str = """ + <table> + <tr> + <th>A</th> + <th colspan="1">B</th> + <th rowspan="1">C</th> + </tr> + <tr> + <td>a</td> + <td>b</td> + <td>c</td> + </tr> + </table>
+ """ + + with open("tmp.html", "w") as f: + f.write(html_str) + df = pd.read_html("tmp.html") + df[0] .. ipython:: python + :suppress: - with open(file_path, "r") as f: - dfs = pd.read_html(f.read()) - dfs + os.remove("tmp.html") You can even pass in an instance of ``StringIO`` if you so desire: .. ipython:: python - with open(file_path, "r") as f: - sio = StringIO(f.read()) - - dfs = pd.read_html(sio) - dfs + dfs = pd.read_html(StringIO(html_str)) + dfs[0] .. note:: @@ -2748,77 +2736,48 @@ in the method ``to_string`` described above. brevity's sake. See :func:`~pandas.core.frame.DataFrame.to_html` for the full set of options. -.. ipython:: python - :suppress: +.. note:: - def write_html(df, filename, *args, **kwargs): - static = os.path.abspath(os.path.join("source", "_static")) - with open(os.path.join(static, filename + ".html"), "w") as f: - df.to_html(f, *args, **kwargs) + In an HTML-rendering supported environment like a Jupyter Notebook, ``display(HTML(...))``` + will render the raw HTML into the environment. .. ipython:: python + from IPython.display import display, HTML + df = pd.DataFrame(np.random.randn(2, 2)) df - print(df.to_html()) # raw html - -.. ipython:: python - :suppress: - - write_html(df, "basic") - -HTML: - -.. raw:: html - :file: ../_static/basic.html + html = df.to_html() + print(html) # raw html + display(HTML(html)) The ``columns`` argument will limit the columns shown: .. ipython:: python - print(df.to_html(columns=[0])) - -.. ipython:: python - :suppress: - - write_html(df, "columns", columns=[0]) - -HTML: - -.. raw:: html - :file: ../_static/columns.html + html = df.to_html(columns=[0]) + print(html) + display(HTML(html)) ``float_format`` takes a Python callable to control the precision of floating point values: .. ipython:: python - print(df.to_html(float_format="{0:.10f}".format)) - -.. ipython:: python - :suppress: - - write_html(df, "float_format", float_format="{0:.10f}".format) + html = df.to_html(float_format="{0:.10f}".format) + print(html) + display(HTML(html)) -HTML: - -.. raw:: html - :file: ../_static/float_format.html ``bold_rows`` will make the row labels bold by default, but you can turn that off: .. ipython:: python - print(df.to_html(bold_rows=False)) - -.. ipython:: python - :suppress: - - write_html(df, "nobold", bold_rows=False) + html = df.to_html(bold_rows=False) + print(html) + display(HTML(html)) -.. raw:: html - :file: ../_static/nobold.html The ``classes`` argument provides the ability to give the resulting HTML table CSS classes. Note that these classes are *appended* to the existing @@ -2839,17 +2798,9 @@ that contain URLs. "url": ["https://www.python.org/", "https://pandas.pydata.org"], } ) - print(url_df.to_html(render_links=True)) - -.. ipython:: python - :suppress: - - write_html(url_df, "render_links", render_links=True) - -HTML: - -.. raw:: html - :file: ../_static/render_links.html + html = url_df.to_html(render_links=True) + print(html) + display(HTML(html)) Finally, the ``escape`` argument allows you to control whether the "<", ">" and "&" characters escaped in the resulting HTML (by default it is @@ -2859,30 +2810,21 @@ Finally, the ``escape`` argument allows you to control whether the df = pd.DataFrame({"a": list("&<>"), "b": np.random.randn(3)}) - -.. ipython:: python - :suppress: - - write_html(df, "escape") - write_html(df, "noescape", escape=False) - Escaped: .. ipython:: python - print(df.to_html()) - -.. 
raw:: html - :file: ../_static/escape.html + html = df.to_html() + print(html) + display(HTML(html)) Not escaped: .. ipython:: python - print(df.to_html(escape=False)) - -.. raw:: html - :file: ../_static/noescape.html + html = df.to_html(escape=False) + print(html) + display(HTML(html)) .. note:: @@ -3062,13 +3004,10 @@ Read in the content of the "books.xml" file and pass it to ``read_xml`` as a string: .. ipython:: python - :suppress: - rel_path = os.path.join("..", "pandas", "tests", "io", "data", "xml", - "books.xml") - file_path = os.path.abspath(rel_path) - -.. ipython:: python + file_path = "books.xml" + with open(file_path, "w") as f: + f.write(xml) with open(file_path, "r") as f: df = pd.read_xml(f.read()) @@ -3128,6 +3067,11 @@ Specify only elements or only attributes to parse: df = pd.read_xml(file_path, attrs_only=True) df +.. ipython:: python + :suppress: + + os.remove("books.xml") + XML documents can have namespaces with prefixes and default namespaces without prefixes both of which are denoted with a special attribute ``xmlns``. In order to parse by node under a namespace context, ``xpath`` must reference a prefix. @@ -5672,7 +5616,6 @@ the database using :func:`~pandas.DataFrame.to_sql`. .. ipython:: python - :suppress: import datetime @@ -5685,10 +5628,8 @@ the database using :func:`~pandas.DataFrame.to_sql`. data = pd.DataFrame(d, columns=c) -.. ipython:: python - - data - data.to_sql("data", engine) + data + data.to_sql("data", engine) With some databases, writing large DataFrames can result in errors due to packet size limitations being exceeded. This can be avoided by setting the diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 71aef4fdd75f6..a8591c5d3a2c7 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -82,6 +82,13 @@ Option 2 only loads the columns we request. pd.read_parquet("timeseries_wide.parquet", columns=columns) +.. ipython:: python + :suppress: + + import os + + os.remove("timeseries_wide.parquet") + If we were to measure the memory usage of the two calls, we'd see that specifying ``columns`` uses about 1/10th the memory in this case. @@ -102,6 +109,11 @@ can store larger datasets in memory. ts = pd.read_parquet("timeseries.parquet") ts +.. ipython:: python + :suppress: + + os.remove("timeseries.parquet") + Now, let's inspect the data types and memory usage to see where we should focus our attention. @@ -364,6 +376,13 @@ out of memory. At that point it's just a regular pandas object. @savefig dask_resample.png ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() +.. ipython:: python + :suppress: + + import shutil + + shutil.rmtree("data/timeseries") + These Dask examples have all be done using multiple processes on a single machine. Dask can be `deployed on a cluster `_ to scale up to even larger diff --git a/doc/source/whatsnew/v0.9.1.rst b/doc/source/whatsnew/v0.9.1.rst index 6b05e5bcded7e..ce89b47e35da0 100644 --- a/doc/source/whatsnew/v0.9.1.rst +++ b/doc/source/whatsnew/v0.9.1.rst @@ -95,11 +95,12 @@ New features - Enable referencing of Excel columns by their column names (:issue:`1936`) - .. ipython:: python + .. 
code-block:: ipython + + In [1]: xl = pd.ExcelFile('data/test.xls') - xl = pd.ExcelFile('data/test.xls') - xl.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols='A:D') + In [2]: xl.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A:D') - Added option to disable pandas-style tick locators and formatters diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 4a962520460b0..53521cda5d271 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -89,7 +89,9 @@ def test_show_versions_console(capsys): # check required dependency # 2020-12-09 npdev has "dirty" in the tag - assert re.search(r"numpy\s*:\s([0-9\.\+a-g\_]|dev)+(dirty)?\n", result) + # 2022-05-25 npdev released with RC wo/ "dirty". + # Just ensure we match [0-9]+\..* since npdev version is variable + assert re.search(r"numpy\s*:\s[0-9]+\..*\n", result) # check optional dependency assert re.search(r"pyarrow\s*:\s([0-9\.]+|None)\n", result) From 48a9a956a1ddef02cb5c08f3ccf2b118c5a796f7 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 28 May 2022 04:53:43 -0500 Subject: [PATCH 29/49] Backport PR #47150 on branch 1.4.x (DEPS: Bump Cython) (#47152) * Backport PR #47150: DEPS: Bump Cython * cython==0.29.30 in ci/deps/actions-310-numpydev.yaml Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> Co-authored-by: Simon Hawkins --- ci/deps/actions-310-numpydev.yaml | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38-minimum_versions.yaml | 2 +- ci/deps/actions-38.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-38.yaml | 2 +- ci/deps/circle-38-arm64.yaml | 2 +- doc/source/whatsnew/v1.4.3.rst | 2 +- environment.yml | 2 +- requirements-dev.txt | 2 +- setup.py | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ci/deps/actions-310-numpydev.yaml b/ci/deps/actions-310-numpydev.yaml index 401be14aaca02..2ac97cc479196 100644 --- a/ci/deps/actions-310-numpydev.yaml +++ b/ci/deps/actions-310-numpydev.yaml @@ -16,7 +16,7 @@ dependencies: - pytz - pip - pip: - - cython==0.29.24 # GH#34014 + - cython==0.29.30 # GH#34014 - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - "--pre" - "numpy" diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index dac1219245e84..7a879b5ac9648 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.10 # test dependencies - - cython=0.29.24 + - cython=0.29.30 - pytest>=6.0 - pytest-cov - pytest-xdist>=1.31 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 01415122e6076..cd501b1e018ef 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.8 # test dependencies - - cython=0.29.24 + - cython=0.29.30 - pytest>=6.0 - pytest-cov - pytest-xdist>=1.31 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index f3a967f67cbc3..f6c3f3855aa02 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -7,7 +7,7 @@ dependencies: - python=3.8.0 # test dependencies - - cython=0.29.24 + - cython=0.29.30 - pytest>=6.0 - pytest-cov - pytest-xdist>=1.31 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 
79cd831051c2f..4d2ffbfbd9ce5 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.8 # test dependencies - - cython=0.29.24 + - cython=0.29.30 - pytest>=6.0 - pytest-cov - pytest-xdist>=1.31 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 1c681104f3196..202db035a6c52 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.9 # test dependencies - - cython=0.29.24 + - cython=0.29.30 - pytest>=6.0 - pytest-cov - pytest-xdist>=1.31 diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-38.yaml index eda35ee14ec65..1a3c73cb4ae2f 100644 --- a/ci/deps/actions-pypy-38.yaml +++ b/ci/deps/actions-pypy-38.yaml @@ -8,7 +8,7 @@ dependencies: - python=3.8[build=*_pypy] # TODO: use this once pypy3.8 is available # tools - - cython>=0.29.24 + - cython>=0.29.30 - pytest>=6.0 - pytest-cov - pytest-asyncio diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 66fedccc5eca7..5d3f79602ed37 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.8 # test dependencies - - cython=0.29.24 + - cython=0.29.30 - pytest>=6.0 - pytest-cov - pytest-xdist>=1.31 diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index f0e208a117068..85e5983f877e8 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -38,7 +38,7 @@ Bug fixes Other ~~~~~ -- +- The minimum version of Cython needed to compile pandas is now ``0.29.30`` (:issue:`41935`) - .. --------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml index 0fbfb8236a135..83b00c0dd6421 100644 --- a/environment.yml +++ b/environment.yml @@ -15,7 +15,7 @@ dependencies: # The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms. 
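The new pin here must stay in step with the version guard in ``setup.py`` later in this patch; a simplified sketch of that check (not the file's exact code; assumes ``packaging`` and Cython are importable):

    from packaging.version import Version

    import Cython

    min_cython_ver = "0.29.30"  # note: sync with pyproject.toml and environment.yml
    if Version(Cython.__version__) < Version(min_cython_ver):
        raise RuntimeError(f"Cython >= {min_cython_ver} is required")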
- c-compiler - cxx-compiler - - cython>=0.29.24 + - cython>=0.29.30 # code checks - black=22.3.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index 5ede7b99a3d22..05a9f0426440d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,7 +5,7 @@ numpy>=1.18.5 python-dateutil>=2.8.1 pytz asv -cython>=0.29.24 +cython>=0.29.30 black==22.3.0 cpplint flake8==4.0.1 diff --git a/setup.py b/setup.py index db65ea72e4a96..ec3fa3c0433f4 100755 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def is_platform_mac(): # note: sync with pyproject.toml, environment.yml and asv.conf.json -min_cython_ver = "0.29.24" +min_cython_ver = "0.29.30" try: from Cython import ( From d7cb419059e3af3a5b79a7ee6193a731ad2d29c0 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 28 May 2022 06:46:21 -0500 Subject: [PATCH 30/49] Backport PR #47051 on branch 1.4.x (BUG: `DataFrame.shift` shows different behavior for `axis=1` when `freq` is specified) (#47158) Backport PR #47051: BUG: `DataFrame.shift` shows different behavior for `axis=1` when `freq` is specified Co-authored-by: Wenjun Si --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/frame.py | 8 +++++++- pandas/tests/frame/methods/test_shift.py | 10 ++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 85e5983f877e8..b78188409d350 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`) - Fixed regression in :meth:`.DataFrameGroupBy.agg` when used with list-likes or dict-likes and ``axis=1`` that would give incorrect results; now raises ``NotImplementedError`` (:issue:`46995`) - Fixed regression in :meth:`DataFrame.resample` and :meth:`DataFrame.rolling` when used with list-likes or dict-likes and ``axis=1`` that would raise an unintuitive error message; now raises ``NotImplementedError`` (:issue:`46904`) +- Fixed regression in :meth:`DataFrame.shift` when ``axis`` is ``columns`` and ``fill_value`` is absent, ``freq`` is ignored (:issue:`47039`) .. 
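The shift regression noted above, in one line (hypothetical frame whose columns form a DatetimeIndex; on 1.4.2 the ``freq`` was silently ignored for ``axis=1`` when no ``fill_value`` was given):

    import pandas as pd

    df = pd.DataFrame([[1, 2]], columns=pd.date_range("2022-01-03", periods=2, freq="B"))
    df.shift(5, freq=pd.offsets.BDay(), axis=1)  # column labels advance 5 business days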
--------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bc50d04f938ec..391c12905adae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5333,7 +5333,13 @@ def shift( axis = self._get_axis_number(axis) ncols = len(self.columns) - if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: + if ( + axis == 1 + and periods != 0 + and freq is None + and fill_value is lib.no_default + and ncols > 0 + ): # We will infer fill_value to match the closest column # Use a column that we know is valid for our column's dtype GH#38434 diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 2463e81d78edd..92db00368d088 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -255,6 +255,16 @@ def test_shift_named_axis(self): result = df.shift(1, axis="columns") tm.assert_frame_equal(result, expected) + def test_shift_other_axis_with_freq(self, datetime_frame): + obj = datetime_frame.T + offset = offsets.BDay() + + # GH#47039 + shifted = obj.shift(5, freq=offset, axis=1) + assert len(shifted) == len(obj) + unshifted = shifted.shift(-5, freq=offset, axis=1) + tm.assert_equal(unshifted, obj) + def test_shift_bool(self): df = DataFrame({"high": [True, False], "low": [False, False]}) rs = df.shift(1) From d84073c55ea5c3dd20beb7ef4de3ca68114d73e4 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 28 May 2022 19:19:31 -0500 Subject: [PATCH 31/49] Backport PR #47159 on branch 1.4.x (TST: Re-enable test after Cython upgrade) (#47161) Backport PR #47159: TST: Re-enable test after Cython upgrade Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- pandas/tests/io/parser/common/test_common_basic.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index bde69e365cfd1..69b087eff8a20 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -12,7 +12,6 @@ import numpy as np import pytest -from pandas.compat import PY310 from pandas.errors import ( EmptyDataError, ParserError, @@ -676,11 +675,6 @@ def test_read_table_equivalency_to_read_csv(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.skipif( - PY310, - reason="GH41935 This test is leaking only on Python 3.10," - "causing other tests to fail with a cryptic error.", -) @pytest.mark.parametrize("read_func", ["read_csv", "read_table"]) def test_read_csv_and_table_sys_setprofile(all_parsers, read_func): # GH#41069 From 23fea24fc5998b2df11dc3e88477eb021e4a92a7 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 31 May 2022 19:51:00 -0500 Subject: [PATCH 32/49] Backport PR #47165 on branch 1.4.x (BUG: do not suppress errors when closing file handles) (#47178) --- doc/source/whatsnew/v1.4.3.rst | 2 +- pandas/io/common.py | 7 ++----- pandas/tests/io/test_common.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index b78188409d350..54cad82366e43 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -30,7 +30,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Most I/O methods do 
no longer suppress ``OSError`` and ``ValueError`` when closing file handles (:issue:`47136`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 0331d320725ac..f02c43da7cdb1 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -112,11 +112,8 @@ def close(self) -> None: self.handle.flush() self.handle.detach() self.created_handles.remove(self.handle) - try: - for handle in self.created_handles: - handle.close() - except (OSError, ValueError): - pass + for handle in self.created_handles: + handle.close() self.created_handles = [] self.is_wrapped = False diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 36048601b3248..25a36c86eeaae 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -600,3 +600,15 @@ def test_fail_mmap(): with pytest.raises(UnsupportedOperation, match="fileno"): with BytesIO() as buffer: icom.get_handle(buffer, "rb", memory_map=True) + + +def test_close_on_error(): + # GH 47136 + class TestError: + def close(self): + raise OSError("test") + + with pytest.raises(OSError, match="test"): + with BytesIO() as buffer: + with icom.get_handle(buffer, "rb") as handles: + handles.created_handles.append(TestError()) From dc52fd6212a02f4d80827d244b40badf154d343d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 1 Jun 2022 10:13:46 -0500 Subject: [PATCH 33/49] Backport PR #47179 on branch 1.4.x (CI: Versioneer not picking up version in 32-bit build) (#47190) Backport PR #47179: CI: Versioneer not picking up version in 32-bit build Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/32-bit-linux.yml | 6 ++++++ pandas/tests/test_common.py | 8 -------- pandas/tests/util/test_show_versions.py | 7 ------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml index 500e800a082d9..be894e6a5a63e 100644 --- a/.github/workflows/32-bit-linux.yml +++ b/.github/workflows/32-bit-linux.yml @@ -23,9 +23,15 @@ jobs: - name: Run 32-bit manylinux2014 Docker Build / Tests run: | + # Without this (line 34), versioneer will not be able to determine the pandas version. + # This is because of a security update to git that blocks it from reading the config folder if + # it is not owned by the current user. We hit this since the "mounted" folder is not hit by the + # Docker container. + # xref https://github.com/pypa/manylinux/issues/1309 docker pull quay.io/pypa/manylinux2014_i686 docker run --platform linux/386 -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ /bin/bash -xc "cd pandas && \ + git config --global --add safe.directory /pandas && \ /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ . 
~/virtualenvs/pandas-dev/bin/activate && \ python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index b2f2a5f672edb..cbd11cd6d8685 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -5,11 +5,6 @@ import numpy as np import pytest -from pandas.compat import ( - IS64, - is_ci_environment, -) - import pandas as pd from pandas import Series import pandas._testing as tm @@ -162,9 +157,6 @@ def test_standardize_mapping(): assert isinstance(com.standardize_mapping(dd), partial) -@pytest.mark.xfail( - is_ci_environment() and not IS64, reason="Failing on 32 bit Python CI job" -) def test_git_version(): # GH 21295 git_version = pd.__git_version__ diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 53521cda5d271..f49ee317327fe 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -4,10 +4,6 @@ import pytest -from pandas.compat import ( - IS64, - is_ci_environment, -) from pandas.util._print_versions import ( _get_dependency_info, _get_sys_info, @@ -72,9 +68,6 @@ def test_show_versions_console_json(capsys): assert result == expected -@pytest.mark.xfail( - is_ci_environment() and not IS64, reason="Failing on 32 bit Python CI job" -) def test_show_versions_console(capsys): # gh-32041 pd.show_versions(as_json=False) From fb27ba96514613594de4387c83f30ed17e0fef79 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 5 Jun 2022 17:10:49 -0500 Subject: [PATCH 34/49] Backport PR #47239 on branch 1.4.x (Bump cython version in asv.conf.json to match environment.yml) (#47243) --- asv_bench/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index daf2834c50d6a..46640505a4c84 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -42,7 +42,7 @@ // followed by the pip installed packages).
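# [editor's note] Illustrative sketch, not part of the patches above: roughly
# what the re-enabled test_git_version asserts once versioneer can read the
# repository again -- the build-time hash is a full 40-character sha1 string.
import string
import pandas as pd

git_version = pd.__git_version__
assert len(git_version) == 40
assert all(c in string.hexdigits for c in git_version)
# [end editor's note]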
"matrix": { "numpy": [], - "Cython": ["0.29.24"], + "Cython": ["0.29.30"], "matplotlib": [], "sqlalchemy": [], "scipy": [], From 75a799cbf108deae6228ba4ab00c0ff19008bba6 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 6 Jun 2022 02:54:45 -0500 Subject: [PATCH 35/49] Backport PR #47206 on branch 1.4.x (REGR: concat not sorting columns for mixed column names) (#47251) Backport PR #47206: REGR: concat not sorting columns for mixed column names Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/indexes/api.py | 11 ++++++++- pandas/tests/reshape/concat/test_dataframe.py | 23 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 54cad82366e43..5898e51ab5f52 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) +- Fixed regression in :func:`concat` not sorting columns for mixed column names (:issue:`47127`) - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) - Fixed regression is :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`) - Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 922c344510375..1e740132e3464 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,6 +1,9 @@ from __future__ import annotations import textwrap +from typing import cast + +import numpy as np from pandas._libs import ( NaT, @@ -10,6 +13,7 @@ from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.algorithms import safe_sort from pandas.core.indexes.base import ( Index, _new_Index, @@ -154,7 +158,12 @@ def _get_combined_index( if sort: try: - index = index.sort_values() + array_sorted = safe_sort(index) + array_sorted = cast(np.ndarray, array_sorted) + if isinstance(index, MultiIndex): + index = MultiIndex.from_tuples(array_sorted, names=index.names) + else: + index = Index(array_sorted, name=index.name) except TypeError: pass diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index 01763926c6d89..1018fc2806fee 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -205,3 +205,26 @@ def test_concat_copies(self, axis, order, ignore_index): for arr in res._iter_column_arrays(): for arr2 in df._iter_column_arrays(): assert not np.shares_memory(arr, arr2) + + def test_outer_sort_columns(self): + # GH#47127 + df1 = DataFrame({"A": [0], "B": [1], 0: 1}) + df2 = DataFrame({"A": [100]}) + result = concat([df1, df2], 
ignore_index=True, join="outer", sort=True) + expected = DataFrame({0: [1.0, np.nan], "A": [0, 100], "B": [1.0, np.nan]}) + tm.assert_frame_equal(result, expected) + + def test_inner_sort_columns(self): + # GH#47127 + df1 = DataFrame({"A": [0], "B": [1], 0: 1}) + df2 = DataFrame({"A": [100], 0: 2}) + result = concat([df1, df2], ignore_index=True, join="inner", sort=True) + expected = DataFrame({0: [1, 2], "A": [0, 100]}) + tm.assert_frame_equal(result, expected) + + def test_sort_columns_one_df(self): + # GH#47127 + df1 = DataFrame({"A": [100], 0: 2}) + result = concat([df1], ignore_index=True, join="inner", sort=True) + expected = DataFrame({0: [2], "A": [100]}) + tm.assert_frame_equal(result, expected) From 34259fb7963a5644bc9785c6b79fccdf662f5088 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 6 Jun 2022 02:58:52 -0500 Subject: [PATCH 36/49] Backport PR #47085 on branch 1.4.x (BUG: Eval scopes ignoring empty dictionaries (#47084)) (#47250) Backport PR #47085: BUG: Eval scopes ignoring empty dictionaries (#47084) Co-authored-by: Alex-Blade <44120047+Alex-Blade@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/computation/pytables.py | 2 +- pandas/core/computation/scope.py | 6 ++++-- pandas/tests/computation/test_eval.py | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 5898e51ab5f52..d274458e24262 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -31,6 +31,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :meth:`pd.eval`, :meth:`DataFrame.eval` and :meth:`DataFrame.query` where passing empty ``local_dict`` or ``global_dict`` was treated as passing ``None`` (:issue:`47084`) - Most I/O methods no longer suppress ``OSError`` and ``ValueError`` when closing file handles (:issue:`47136`) - diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 3e041c088f566..0ed0046d36678 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -563,7 +563,7 @@ def __init__( self._visitor = None # capture the environment if needed - local_dict: DeepChainMap[Any, Any] = DeepChainMap() + local_dict: DeepChainMap[Any, Any] | None = None if isinstance(where, PyTablesExpr): local_dict = where.env.scope diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index a561824f868f2..32e979eae991e 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -133,11 +133,13 @@ def __init__( # shallow copy here because we don't want to replace what's in # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) - scope_global = self.scope.new_child((global_dict or frame.f_globals).copy()) + scope_global = self.scope.new_child( + (global_dict if global_dict is not None else frame.f_globals).copy() + ) self.scope = DeepChainMap(scope_global) if not isinstance(local_dict, Scope): scope_local = self.scope.new_child( - (local_dict or frame.f_locals).copy() + (local_dict if local_dict is not None else frame.f_locals).copy() ) self.scope = DeepChainMap(scope_local) finally: diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 8fa28300b8345..3517068b3d0cc 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -43,6 +43,7 @@
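# [editor's note] Illustrative sketch, not part of the patch: with the
# scope.py fix above, an explicitly empty local_dict is honoured instead of
# silently falling back to the caller's locals, so the function-local ``x``
# is no longer visible to the expression.
import pandas as pd

def demo():
    x = 1  # a true local, mirroring the test added by the patch
    pd.eval("x + 1", local_dict={})

try:
    demo()
except Exception as err:  # pandas raises an UndefinedVariableError here
    print(type(err).__name__)
# [end editor's note]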
from pandas.core.computation.ops import ( ARITH_OPS_SYMS, SPECIAL_CASE_ARITH_OPS_SYMS, + UndefinedVariableError, _binary_math_ops, _binary_ops_dict, _unary_math_ops, @@ -1747,6 +1748,20 @@ def test_no_new_globals(self, engine, parser): gbls2 = globals().copy() assert gbls == gbls2 + def test_empty_locals(self, engine, parser): + # GH 47084 + x = 1 # noqa: F841 + msg = "name 'x' is not defined" + with pytest.raises(UndefinedVariableError, match=msg): + pd.eval("x + 1", engine=engine, parser=parser, local_dict={}) + + def test_empty_globals(self, engine, parser): + # GH 47084 + msg = "name '_var_s' is not defined" + e = "_var_s * 2" + with pytest.raises(UndefinedVariableError, match=msg): + pd.eval(e, engine=engine, parser=parser, global_dict={}) + @td.skip_if_no_ne def test_invalid_engine(): From 3fdfa663bf0efee1f58eb2aacc831caa62419fe1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 6 Jun 2022 12:26:18 +0100 Subject: [PATCH 37/49] Backport PR #47143 on branch 1.4.x (REGR: setitem writing into RangeIndex instead of creating a copy) (#47256) Backport PR #47143: REGR: setitem writing into RangeIndex instead of creating a copy Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/construction.py | 4 ++-- pandas/tests/frame/indexing/test_setitem.py | 9 ++++++++ pandas/tests/test_downstream.py | 24 +++++++++++++++++++++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index d274458e24262..f594aae2c7f9f 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) +- Fixed regression when setting values with :meth:`DataFrame.loc` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - Fixed regression in :func:`concat` not sorting columns for mixed column names (:issue:`47127`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e496125683c09..2595cff5c43c4 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -508,7 +508,7 @@ def sanitize_array( dtype = dtype.numpy_dtype # extract ndarray or ExtensionArray, ensure we have no PandasArray - data = extract_array(data, extract_numpy=True) + data = extract_array(data, extract_numpy=True, extract_range=True) if isinstance(data, np.ndarray) and data.ndim == 0: if dtype is None: @@ -583,7 +583,7 @@ def sanitize_array( # materialize e.g. generators, convert e.g. tuples, abc.ValueView if hasattr(data, "__array__"): # e.g. 
dask array GH#38645 - data = np.asarray(data) + data = np.array(data, copy=copy) else: data = list(data) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index cd0a0a0467742..673d347917832 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -852,6 +852,15 @@ def test_frame_setitem_newcol_timestamp(self): data[ts] = np.nan # works, mostly a smoke-test assert np.isnan(data[ts]).all() + def test_frame_setitem_rangeindex_into_new_col(self): + # GH#47128 + df = DataFrame({"a": ["a", "b"]}) + df["b"] = df.index + df.loc[[False, True], "b"] = 100 + result = df.loc[[1], :] + expected = DataFrame({"a": ["b"], "b": [100]}, index=[1]) + tm.assert_frame_equal(result, expected) + class TestDataFrameSetItemSlicing: def test_setitem_slice_position(self): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index be9f187c0c44b..a4180d38c670c 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -269,3 +269,27 @@ def test_missing_required_dependency(): output = exc.value.stdout.decode() for name in ["numpy", "pytz", "dateutil"]: assert name in output + + +def test_frame_setitem_dask_array_into_new_col(): + # GH#47128 + + # dask sets "compute.use_numexpr" to False, so catch the current value + # and ensure to reset it afterwards to avoid impacting other tests + olduse = pd.get_option("compute.use_numexpr") + + try: + dask = import_module("dask") # noqa:F841 + + import dask.array as da + + dda = da.array([1, 2]) + df = DataFrame({"a": ["a", "b"]}) + df["b"] = dda + df["c"] = dda + df.loc[[False, True], "b"] = 100 + result = df.loc[[1], :] + expected = DataFrame({"a": ["b"], "b": [100], "c": [2]}, index=[1]) + tm.assert_frame_equal(result, expected) + finally: + pd.set_option("compute.use_numexpr", olduse) From 17386d1c573b06cbb67289affdf23e676e5055f5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 7 Jun 2022 15:16:07 +0100 Subject: [PATCH 38/49] Backport PR #47121 on branch 1.4.x (BUG: read_excel loading some xlsx ints as floats) (#47271) Backport PR #47121: BUG: read_excel loading some xlsx ints as floats Co-authored-by: Andrew Hawyrluk <50434302+ahawryluk@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/io/excel/_openpyxl.py | 10 ++++++++-- .../data/excel/ints_spelled_with_decimals.xlsx | Bin 0 -> 4734 bytes pandas/tests/io/excel/test_openpyxl.py | 8 ++++++++ 4 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/io/data/excel/ints_spelled_with_decimals.xlsx diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index f594aae2c7f9f..ca8b8ca15ec47 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`) - Fixed regression in :meth:`.DataFrameGroupBy.agg` when used with list-likes or dict-likes and ``axis=1`` that would give incorrect results; now raises ``NotImplementedError`` (:issue:`46995`) - Fixed regression in :meth:`DataFrame.resample` and :meth:`DataFrame.rolling` when used with list-likes or dict-likes and ``axis=1`` that would raise an unintuitive error message; now raises ``NotImplementedError`` (:issue:`46904`) +- Fixed regression in :func:`read_excel` returning ints as floats on certain input sheets (:issue:`46988`)
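# [editor's note] Illustrative sketch, not part of the patch: with the
# _convert_cell fix below, numeric cells stored in the sheet as e.g. "1.0"
# load as ints again under the (default, deprecated) convert_float=True.
# The path to the fixture added by this patch is an assumption, and openpyxl
# must be installed.
import pandas as pd

df = pd.read_excel("pandas/tests/io/data/excel/ints_spelled_with_decimals.xlsx")
print(df.dtypes)  # integer, not float, for the whole-number column
# [end editor's note]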
- Fixed regression in :meth:`DataFrame.shift` when ``axis`` is ``columns`` and ``fill_value`` is absent, ``freq`` is ignored (:issue:`47039`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 27c03d4a74bc1..8ec24b35779a0 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -560,8 +560,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return "" # compat with xlrd elif cell.data_type == TYPE_ERROR: return np.nan - elif not convert_float and cell.data_type == TYPE_NUMERIC: - return float(cell.value) + elif cell.data_type == TYPE_NUMERIC: + # GH5394, GH46988 + if convert_float: + val = int(cell.value) + if val == cell.value: + return val + else: + return float(cell.value) return cell.value diff --git a/pandas/tests/io/data/excel/ints_spelled_with_decimals.xlsx b/pandas/tests/io/data/excel/ints_spelled_with_decimals.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..a667be86283b8a721446657122100460f19f0d7f GIT binary patch literal 4734 [base85 payload of the 4734-byte xlsx fixture omitted: garbled beyond recovery in extraction, along with the pandas/tests/io/excel/test_openpyxl.py hunk and the From header of the next commit] Date: Wed, 15 Jun 2022 08:15:50 -0500 Subject: [PATCH 39/49] Backport PR #47326 on branch 1.4.x (REGR: Fix nan comparison for same Index object) (#47364) Backport PR #47326: REGR: Fix nan comparison for same Index object Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/test_base.py | 14 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index ca8b8ca15ec47..bfc9422711690 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - Fixed regression in :func:`concat` not sorting columns for mixed column names (:issue:`47127`) - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) +- Fixed regression in ``NaN`` comparison for :class:`Index` operations where the same object was compared (:issue:`47105`) - Fixed regression in :meth:`.Styler.to_latex` and :meth:`.Styler.to_html` where ``buf`` failed in combination with ``encoding`` (:issue:`47053`) - Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2abd649d00b78..7175b85e966d7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6660,7 +6660,7 @@ def _cmp_method(self, other, op): # TODO: should set MultiIndex._can_hold_na = False?
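# [editor's note] Illustrative sketch, not part of the patch: the restored
# behaviour of the identity fastpath in _cmp_method above -- ordered
# comparisons of an Index with itself return False in NaN positions,
# matching the elementwise comparison against a copy.
import numpy as np
import pandas as pd

idx = pd.Index([np.nan])
print(idx > idx)         # [False]
print(idx < idx.copy())  # [False]
# [end editor's note]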
arr[self.isna()] = False return arr - elif op in {operator.ne, operator.lt, operator.gt}: + elif op is operator.ne: arr = np.zeros(len(self), dtype=bool) if self._can_hold_na and not isinstance(self, ABCMultiIndex): arr[self.isna()] = True diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 222f1fc3e7648..7b851d329c405 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2,6 +2,7 @@ from datetime import datetime from io import StringIO import math +import operator import re import numpy as np @@ -1587,3 +1588,16 @@ def test_get_attributes_dict_deprecated(): with tm.assert_produces_warning(DeprecationWarning): attrs = idx._get_attributes_dict() assert attrs == {"name": None} + + +@pytest.mark.parametrize("op", [operator.lt, operator.gt]) +def test_nan_comparison_same_object(op): + # GH#47105 + idx = Index([np.nan]) + expected = np.array([False]) + + result = op(idx, idx) + tm.assert_numpy_array_equal(result, expected) + + result = op(idx, idx.copy()) + tm.assert_numpy_array_equal(result, expected) From d2683598a053958f8a14671e96fc0f23b57bc37b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 15 Jun 2022 08:22:30 -0500 Subject: [PATCH 40/49] Backport PR #47325 on branch 1.4.x (REGR: Avoid regression warning with ea dtype and assert_index_equal order False) (#47366) Backport PR #47325: REGR: Avoid regression warning with ea dtype and assert_index_equal order False Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/_testing/asserters.py | 4 ++-- pandas/tests/util/test_assert_index_equal.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index bfc9422711690..6f8b73db69083 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -25,6 +25,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`) - Fixed regression in :meth:`.DataFrameGroupBy.agg` when used with list-likes or dict-likes and ``axis=1`` that would give incorrect results; now raises ``NotImplementedError`` (:issue:`46995`) - Fixed regression in :meth:`DataFrame.resample` and :meth:`DataFrame.rolling` when used with list-likes or dict-likes and ``axis=1`` that would raise an unintuitive error message; now raises ``NotImplementedError`` (:issue:`46904`) +- Fixed regression in :func:`assert_index_equal` when ``check_order=False`` and :class:`Index` has extension or object dtype (:issue:`47207`) - Fixed regression in :func:`read_excel` returning ints as floats on certain input sheets (:issue:`46988`) - Fixed regression in :meth:`DataFrame.shift` when ``axis`` is ``columns`` and ``fill_value`` is absent, ``freq`` is ignored (:issue:`47039`) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 4fa9c1aabe716..031a1a48760e6 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -374,8 +374,8 @@ def _get_ilevel_values(index, level): # If order doesn't matter then sort the index entries if not check_order: - left = Index(safe_sort(left)) - right = Index(safe_sort(right)) + left = Index(safe_sort(left), dtype=left.dtype) + right = Index(safe_sort(right), dtype=right.dtype) # MultiIndex special comparison for little-friendly error messages 
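# [editor's note] Illustrative sketch, not part of the patch: with the
# asserters.py fix above, the sort performed for check_order=False keeps the
# extension dtype, so two Int64 indexes with the same values in different
# order compare equal again instead of failing on an unwanted upcast.
import pandas as pd
import pandas._testing as tm

left = pd.Index([1, 3], dtype="Int64")
right = pd.Index([3, 1], dtype="Int64")
tm.assert_index_equal(left, right, check_order=False)
# [end editor's note]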
if left.nlevels > 1: diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 8211b52fed650..e3461e62b4eda 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -242,3 +242,17 @@ def test_assert_index_equal_mixed_dtype(): # GH#39168 idx = Index(["foo", "bar", 42]) tm.assert_index_equal(idx, idx, check_order=False) + + +def test_assert_index_equal_ea_dtype_order_false(any_numeric_ea_dtype): + # GH#47207 + idx1 = Index([1, 3], dtype=any_numeric_ea_dtype) + idx2 = Index([3, 1], dtype=any_numeric_ea_dtype) + tm.assert_index_equal(idx1, idx2, check_order=False) + + +def test_assert_index_equal_object_ints_order_false(): + # GH#47207 + idx1 = Index([1, 3], dtype="object") + idx2 = Index([3, 1], dtype="object") + tm.assert_index_equal(idx1, idx2, check_order=False) From de071c641318f85a1085c723e18e73fda2fd8c3d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 15 Jun 2022 10:23:19 -0500 Subject: [PATCH 41/49] Backport PR #47349 on branch 1.4.x (REGR: MultiIndex.dtypes has regular Index instead of MultiIndex index) (#47369) Backport PR #47349: REGR: MultiIndex.dtypes has regular Index instead of MultiIndex index Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/indexes/multi.py | 2 +- pandas/tests/indexes/multi/test_constructors.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 6f8b73db69083..05ace0509e0b7 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -15,6 +15,7 @@ including other versions of pandas. 
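# [editor's note] Illustrative sketch, not part of the patch: the
# representation regression fixed below -- with tuple level names, ``dtypes``
# of a MultiIndex is again labelled by a MultiIndex rather than a flat Index
# of tuples.
import pandas as pd

mi = pd.MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")])
print(mi.dtypes)
print(type(mi.dtypes.index).__name__)  # MultiIndex
# [end editor's note]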
Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) +- Fixed regression in representation of ``dtypes`` attribute of :class:`MultiIndex` (:issue:`46900`) - Fixed regression when setting values with :meth:`DataFrame.loc` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1168325378e92..68db372ff4e51 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -741,7 +741,7 @@ def dtypes(self) -> Series: from pandas import Series names = com.fill_missing_names([level.name for level in self.levels]) - return Series([level.dtype for level in self.levels], index=names) + return Series([level.dtype for level in self.levels], index=Index(names)) def __len__(self) -> int: return len(self.codes[0]) diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 63b0bd235e57c..7fad59fc6654c 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -827,3 +827,13 @@ def test_multiindex_inference_consistency(): mi = MultiIndex.from_tuples([(x,) for x in arr]) lev = mi.levels[0] assert lev.dtype == object + + +def test_dtype_representation(): + # GH#46900 + pmidx = MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")]) + result = pmidx.dtypes + expected = Series( + ["int64", "object"], index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]) + ) + tm.assert_series_equal(result, expected) From f0af15c8418d2d08efa4a4a3b1176fa8f5ecc236 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 17 Jun 2022 02:52:28 -0500 Subject: [PATCH 42/49] Backport PR #47347 on branch 1.4.x (REGR: Regression in to_csv for ea dtype categorical) (#47388) * Backport PR #47347: REGR: Regression in to_csv for ea dtype categorical * inclusive -> closed Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/internals/blocks.py | 2 +- pandas/tests/frame/methods/test_to_csv.py | 29 +++++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 05ace0509e0b7..a4d81533df23d 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -15,6 +15,7 @@ including other versions of pandas. 
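# [editor's note] Illustrative sketch, not part of the patch: the to_csv
# regression fixed below -- writing a categorical column whose categories
# have an extension dtype (Int16 here) no longer raises.
import pandas as pd

df = pd.DataFrame({"a": "x", "b": [1, pd.NA]})
df["b"] = df["b"].astype("Int16").astype("category")
print(df.to_csv())  # ",a,b\n0,x,1\n1,x,\n"
# [end editor's note]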
Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.replace` when the replacement value was explicitly ``None`` when passed in a dictionary to ``to_replace`` also casting other columns to object dtype even when there were no values to replace (:issue:`46634`) +- Fixed regression in :meth:`DataFrame.to_csv` raising error when :class:`DataFrame` contains extension dtype categorical column (:issue:`46297`, :issue:`46812`) - Fixed regression in representation of ``dtypes`` attribute of :class:`MultiIndex` (:issue:`46900`) - Fixed regression when setting values with :meth:`DataFrame.loc` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 941b1648a9778..c5654db653de2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2160,7 +2160,7 @@ def to_native_types( **kwargs, ) -> np.ndarray: """convert to our native types format""" - if isinstance(values, Categorical): + if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": # GH#40754 Convert categorical datetimes to datetime array values = take_nd( values.categories._values, diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 8a857c033a2de..c7c3c41a07a1e 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1333,3 +1333,32 @@ def test_to_csv_na_quoting(self): ) expected = '""\n""\n' assert result == expected + + def test_to_csv_categorical_and_ea(self): + # GH#46812 + df = DataFrame({"a": "x", "b": [1, pd.NA]}) + df["b"] = df["b"].astype("Int16") + df["b"] = df["b"].astype("category") + result = df.to_csv() + expected_rows = [",a,b", "0,x,1", "1,x,"] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected + + def test_to_csv_categorical_and_interval(self): + # GH#46297 + df = DataFrame( + { + "a": [ + pd.Interval( + Timestamp("2020-01-01"), + Timestamp("2020-01-02"), + closed="both", + ) + ] + } + ) + df["a"] = df["a"].astype("category") + result = df.to_csv() + expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"'] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected From 2d370af40a7407d019be300c84ea8342e5e8a023 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 21 Jun 2022 13:41:45 -0500 Subject: [PATCH 43/49] Backport PR #47318 on branch 1.4.x (CI: Pin PYTEST_WORKERS=1 for Windows builds due to memory errors) (#47445) Backport PR #47318: CI: Pin PYTEST_WORKERS=1 for Windows builds due to memory errors Co-authored-by: Matthew Roeschke --- .github/workflows/macos-windows.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index 560a421ec74ec..923ac8f2e0fd6 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -15,7 +15,6 @@ on: env: PANDAS_CI: 1 PYTEST_TARGET: pandas - PYTEST_WORKERS: auto PATTERN: "not slow and not db and not network and not single_cpu" @@ -36,6 +35,9 @@ jobs: # https://github.amrom.workers.devmunity/t/concurrecy-not-work-for-push/183068/7 group: ${{ github.event_name == 'push' && github.run_number || github.ref 
}}-${{ matrix.env_file }}-${{ matrix.os }} cancel-in-progress: true + env: + # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors + PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} steps: - name: Checkout From a70b49476bf33a5cbc6fbbc2da343018dea884f9 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 22 Jun 2022 06:07:32 -0500 Subject: [PATCH 44/49] Backport PR #47431 on branch 1.4.x (Fix segmentation fault when JSON serializing a PeriodIndex) (#47457) Backport PR #47431: Fix segmentation fault when JSON serializing a PeriodIndex Co-authored-by: Robert de Vries --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/_libs/src/ujson/python/objToJSON.c | 4 +++- pandas/tests/io/json/test_ujson.py | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index a4d81533df23d..4034655ccd325 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -30,6 +30,7 @@ Fixed regressions - Fixed regression in :func:`assert_index_equal` when ``check_order=False`` and :class:`Index` has extension or object dtype (:issue:`47207`) - Fixed regression in :func:`read_excel` returning ints as floats on certain input sheets (:issue:`46988`) - Fixed regression in :meth:`DataFrame.shift` when ``axis`` is ``columns`` and ``fill_value`` is absent, ``freq`` is ignored (:issue:`47039`) +- Fixed regression in :meth:`DataFrame.to_json` causing a segmentation violation when :class:`DataFrame` is created with an ``index`` parameter of the type :class:`PeriodIndex` (:issue:`46683`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c4609992342c3..5ad8029b38754 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -228,8 +228,10 @@ static PyObject *get_values(PyObject *obj) { PyErr_Clear(); } else if (PyObject_HasAttrString(values, "__array__")) { // We may have gotten a Categorical or Sparse array so call np.array + PyObject *array_values = PyObject_CallMethod(values, "__array__", + NULL); Py_DECREF(values); - values = PyObject_CallMethod(values, "__array__", NULL); + values = array_values; } else if (!PyArray_CheckExact(values)) { // Didn't get a numpy array, so keep trying Py_DECREF(values); diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 41a417f6b3ef4..982d751692eb9 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -24,6 +24,7 @@ DatetimeIndex, Index, NaT, + PeriodIndex, Series, Timedelta, Timestamp, @@ -1242,3 +1243,9 @@ def test_encode_timedelta_iso(self, td): expected = f'"{td.isoformat()}"' assert result == expected + + def test_encode_periodindex(self): + # GH 46683 + p = PeriodIndex(["2022-04-06", "2022-04-07"], freq="D") + df = DataFrame(index=p) + assert df.to_json() == "{}" From 30a3b98ae64c235092c22346b6870d49dfe04f31 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 22 Jun 2022 15:52:47 -0500 Subject: [PATCH 45/49] Backport PR #47393 on branch 1.4.x (CI/TST: Don't require length for construct_1d_arraylike_from_scalar cast to float64) (#47460) * Backport PR #47393: CI/TST: Don't require length for construct_1d_arraylike_from_scalar cast to float64 
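# [editor's note] Illustrative sketch, not part of the patches here: the
# crash case fixed by the objToJSON.c change above (PR #47431) -- serializing
# a DataFrame built on a PeriodIndex no longer segfaults; an empty frame
# simply yields "{}".
import pandas as pd

p = pd.PeriodIndex(["2022-04-06", "2022-04-07"], freq="D")
df = pd.DataFrame(index=p)
print(df.to_json())  # {}
# [end editor's note]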
Co-authored-by: Matthew Roeschke Co-authored-by: Simon Hawkins --- pandas/core/construction.py | 5 ++++- pandas/core/dtypes/cast.py | 9 +++++++-- pandas/core/reshape/merge.py | 32 ++++++++++++++++++-------------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 2595cff5c43c4..957fcf4ac10fc 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -533,7 +533,10 @@ def sanitize_array( if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: - subarr = _try_cast(data, dtype, copy, True) + # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int + # casting aligning with IntCastingNaNError below + with np.errstate(invalid="ignore"): + subarr = _try_cast(data, dtype, copy, True) except IntCastingNaNError: warnings.warn( "In a future version, passing float-dtype values containing NaN " diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e70fd443d61e8..1f7789e72be2c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1914,7 +1914,9 @@ def construct_1d_arraylike_from_scalar( value = maybe_unbox_datetimelike_tz_deprecation(value, dtype) subarr = np.empty(length, dtype=dtype) - subarr.fill(value) + if length: + # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes + subarr.fill(value) return subarr @@ -2218,7 +2220,10 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: if isinstance(element, np.ndarray) and element.dtype.kind == "f": # If all can be losslessly cast to integers, then we can hold them # We do something similar in putmask_smart - casted = element.astype(dtype) + + # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int + with np.errstate(invalid="ignore"): + casted = element.astype(dtype) comp = casted == element if comp.all(): return element diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 3a39713f18d65..d3ec9fec4640d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1201,23 +1201,27 @@ def _maybe_coerce_merge_keys(self) -> None: # check whether ints and floats elif is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype): - if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all(): - warnings.warn( - "You are merging on int and float " - "columns where the float values " - "are not equal to their int representation.", - UserWarning, - ) + # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int + with np.errstate(invalid="ignore"): + if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all(): + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int representation.", + UserWarning, + ) continue elif is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype): - if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all(): - warnings.warn( - "You are merging on int and float " - "columns where the float values " - "are not equal to their int representation.", - UserWarning, - ) + # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int + with np.errstate(invalid="ignore"): + if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all(): + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int representation.", + UserWarning, + ) continue # let's infer and see if we are ok From ad7dc56b53a26c4a6c6972eae25a73ca5f734c28 Mon Sep 17 00:00:00 2001 From: Simon 
Hawkins Date: Wed, 22 Jun 2022 23:52:53 +0100 Subject: [PATCH 46/49] Backport PR #47372 on branch 1.4.x (REGR: revert behaviour change for concat with empty/all-NaN data) (#47472) Backport PR #47372: REGR: revert behaviour change for concat with empty/all-NaN data Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.4.0.rst | 13 +- doc/source/whatsnew/v1.4.3.rst | 11 + pandas/core/dtypes/missing.py | 38 +++ pandas/core/internals/concat.py | 376 ++++++++++++++------- pandas/tests/extension/base/setitem.py | 14 + pandas/tests/frame/methods/test_append.py | 15 +- pandas/tests/reshape/concat/test_concat.py | 48 +++ pandas/tests/reshape/merge/test_merge.py | 8 +- 8 files changed, 383 insertions(+), 140 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7340f2475e1f6..9f9bde65b482f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -271,6 +271,9 @@ the given ``dayfirst`` value when the value is a delimited date string (e.g. Ignoring dtypes in concat with empty or all-NA columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. note:: + This behaviour change has been reverted in pandas 1.4.3. + When using :func:`concat` to concatenate two or more :class:`DataFrame` objects, if one of the DataFrames was empty or had all-NA values, its dtype was *sometimes* ignored when finding the concatenated dtype. These are now @@ -301,9 +304,15 @@ object, the ``np.nan`` is retained. *New behavior*: -.. ipython:: python +.. code-block:: ipython + + In [4]: res + Out[4]: + bar + 0 2013-01-01 00:00:00 + 1 NaN + - res .. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerce_to_nan: diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 4034655ccd325..f1532871d33c6 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -10,6 +10,17 @@ including other versions of pandas. .. --------------------------------------------------------------------------- +.. _whatsnew_143.concat: + +Behaviour of ``concat`` with empty or all-NA DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The behaviour change in version 1.4.0 to stop ignoring the data type +of empty or all-NA columns with float or object dtype in :func:`concat` +(:ref:`whatsnew_140.notable_bug_fixes.concat_with_empty_or_all_na`) has been +reverted (:issue:`45637`). + + .. _whatsnew_143.regressions: Fixed regressions diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index dd3fcb260fdbd..d6c89824b619b 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -14,6 +14,7 @@ import pandas._libs.missing as libmissing from pandas._libs.tslibs import ( NaT, + Period, iNaT, ) from pandas._typing import ( @@ -668,3 +669,40 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: # fallback, default to allowing NaN, None, NA, NaT return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal)) + + +def isna_all(arr: ArrayLike) -> bool: + """ + Optimized equivalent to isna(arr).all() + """ + total_len = len(arr) + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. 
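# [editor's note] Illustrative sketch, not part of the patch, using the
# df1/df2 setup from the v1.4.0 notes referenced above: under the reverted
# behaviour the all-NaN float column's dtype is ignored again, so the
# concatenated column stays datetime64 and the missing value becomes NaT.
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"bar": [pd.Timestamp("2013-01-01")]}, index=range(1))
df2 = pd.DataFrame({"bar": np.nan}, index=range(1, 2))
res = pd.concat([df1, df2])
print(res["bar"].dtype)  # datetime64[ns] under the restored behaviour
# [end editor's note]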
+ # parameters 1000 and 40 were chosen arbitrarily + chunk_len = max(total_len // 40, 1000) + + dtype = arr.dtype + if dtype.kind == "f": + checker = nan_checker + + elif dtype.kind in ["m", "M"] or dtype.type is Period: + # error: Incompatible types in assignment (expression has type + # "Callable[[Any], Any]", variable has type "ufunc") + checker = lambda x: np.asarray(x.view("i8")) == iNaT # type: ignore[assignment] + + else: + # error: Incompatible types in assignment (expression has type "Callable[[Any], + # Any]", variable has type "ufunc") + checker = lambda x: _isna_array( # type: ignore[assignment] + x, inf_as_na=INF_AS_NA + ) + + return all( + # error: Argument 1 to "__call__" of "ufunc" has incompatible type + # "Union[ExtensionArray, Any]"; expected "Union[Union[int, float, complex, str, + # bytes, generic], Sequence[Union[int, float, complex, str, bytes, generic]], + # Sequence[Sequence[Any]], _SupportsArray]" + checker(arr[i : i + chunk_len]).all() # type: ignore[arg-type] + for i in range(0, total_len, chunk_len) + ) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 782842d167570..2c21708aede0f 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,5 +1,6 @@ from __future__ import annotations +import copy import itertools from typing import ( TYPE_CHECKING, @@ -13,6 +14,7 @@ NaT, internals as libinternals, ) +from pandas._libs.missing import NA from pandas._typing import ( ArrayLike, DtypeObj, @@ -30,17 +32,26 @@ is_1d_only_ea_obj, is_datetime64tz_dtype, is_dtype_equal, + is_scalar, + needs_i8_conversion, ) from pandas.core.dtypes.concat import ( cast_to_common_type, concat_compat, ) from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna, + isna_all, +) +import pandas.core.algorithms as algos from pandas.core.arrays import ( DatetimeArray, ExtensionArray, ) +from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.internals.array_manager import ( ArrayManager, @@ -192,29 +203,19 @@ def concatenate_managers( if isinstance(mgrs_indexers[0][0], ArrayManager): return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) - # Assertions disabled for performance - # for tup in mgrs_indexers: - # # caller is responsible for ensuring this - # indexers = tup[1] - # assert concat_axis not in indexers - - if concat_axis == 0: - return _concat_managers_axis0(mgrs_indexers, axes, copy) - mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) - # Assertion disabled for performance - # assert all(not x[1] for x in mgrs_indexers) - - concat_plans = [_get_mgr_concatenation_plan(mgr) for mgr, _ in mgrs_indexers] - concat_plan = _combine_concat_plans(concat_plans) + concat_plans = [ + _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers + ] + concat_plan = _combine_concat_plans(concat_plans, concat_axis) blocks = [] for placement, join_units in concat_plan: unit = join_units[0] blk = unit.block - if len(join_units) == 1: + if len(join_units) == 1 and not join_units[0].indexers: values = blk.values if copy: values = values.copy() @@ -238,7 +239,7 @@ def concatenate_managers( fastpath = blk.values.dtype == values.dtype else: - values = _concatenate_join_units(join_units, copy=copy) + values = _concatenate_join_units(join_units, concat_axis, copy=copy) fastpath = False if fastpath: @@ -251,42 +252,6 @@ def concatenate_managers( 
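# [editor's note] Illustrative sketch, not part of the patch: the idea behind
# the ``isna_all`` helper restored above -- scan the array in chunks so a
# block with early non-NA values short-circuits without building one
# full-size mask. Simplified here to float ndarrays; the chunk length is
# illustrative.
import numpy as np

def isna_all_chunked(arr: np.ndarray, chunk_len: int = 1000) -> bool:
    return all(
        np.isnan(arr[i : i + chunk_len]).all()
        for i in range(0, len(arr), chunk_len)
    )

print(isna_all_chunked(np.full(5000, np.nan)))  # True
print(isna_all_chunked(np.arange(5000.0)))      # False
# [end editor's note]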
return BlockManager(tuple(blocks), axes) -def _concat_managers_axis0( - mgrs_indexers, axes: list[Index], copy: bool -) -> BlockManager: - """ - concat_managers specialized to concat_axis=0, with reindexing already - having been done in _maybe_reindex_columns_na_proxy. - """ - had_reindexers = { - i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers)) - } - mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) - - mgrs = [x[0] for x in mgrs_indexers] - - offset = 0 - blocks = [] - for i, mgr in enumerate(mgrs): - # If we already reindexed, then we definitely don't need another copy - made_copy = had_reindexers[i] - - for blk in mgr.blocks: - if made_copy: - nb = blk.copy(deep=False) - elif copy: - nb = blk.copy() - else: - # by slicing instead of copy(deep=False), we get a new array - # object, see test_concat_copy - nb = blk.getitem_block(slice(None)) - nb._mgr_locs = nb._mgr_locs.add(offset) - blocks.append(nb) - - offset += len(mgr.items) - return BlockManager(tuple(blocks), axes) - - def _maybe_reindex_columns_na_proxy( axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] ) -> list[tuple[BlockManager, dict[int, np.ndarray]]]: @@ -297,33 +262,36 @@ def _maybe_reindex_columns_na_proxy( Columns added in this reindexing have dtype=np.void, indicating they should be ignored when choosing a column's final dtype. """ - new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = [] - + new_mgrs_indexers = [] for mgr, indexers in mgrs_indexers: - # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this - # is a cheap reindexing. - for i, indexer in indexers.items(): - mgr = mgr.reindex_indexer( - axes[i], - indexers[i], - axis=i, + # We only reindex for axis=0 (i.e. columns), as this can be done cheaply + if 0 in indexers: + new_mgr = mgr.reindex_indexer( + axes[0], + indexers[0], + axis=0, copy=False, - only_slice=True, # only relevant for i==0 + only_slice=True, allow_dups=True, - use_na_proxy=True, # only relevant for i==0 + use_na_proxy=True, ) - new_mgrs_indexers.append((mgr, {})) + new_indexers = indexers.copy() + del new_indexers[0] + new_mgrs_indexers.append((new_mgr, new_indexers)) + else: + new_mgrs_indexers.append((mgr, indexers)) return new_mgrs_indexers -def _get_mgr_concatenation_plan(mgr: BlockManager): +def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]): """ - Construct concatenation plan for given block manager. + Construct concatenation plan for given block manager and indexers. Parameters ---------- mgr : BlockManager + indexers : dict of {axis: indexer} Returns ------- @@ -333,11 +301,15 @@ def _get_mgr_concatenation_plan(mgr: BlockManager): # Calculate post-reindex shape , save for item axis which will be separate # for each block anyway. 
mgr_shape_list = list(mgr.shape) + for ax, indexer in indexers.items(): + mgr_shape_list[ax] = len(indexer) mgr_shape = tuple(mgr_shape_list) + assert 0 not in indexers + if mgr.is_single_block: blk = mgr.blocks[0] - return [(blk.mgr_locs, JoinUnit(blk, mgr_shape))] + return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] blknos = mgr.blknos blklocs = mgr.blklocs @@ -348,6 +320,8 @@ def _get_mgr_concatenation_plan(mgr: BlockManager): assert placements.is_slice_like assert blkno != -1 + join_unit_indexers = indexers.copy() + shape_list = list(mgr_shape) shape_list[0] = len(placements) shape = tuple(shape_list) @@ -372,14 +346,13 @@ def _get_mgr_concatenation_plan(mgr: BlockManager): ) ) - if not unit_no_ax0_reindexing: - # create block from subset of columns - blk = blk.getitem_block(ax0_blk_indexer) + # Omit indexer if no item reindexing is required. + if unit_no_ax0_reindexing: + join_unit_indexers.pop(0, None) + else: + join_unit_indexers[0] = ax0_blk_indexer - # Assertions disabled for performance - # assert blk._mgr_locs.as_slice == placements.as_slice - # assert blk.shape[0] == shape[0] - unit = JoinUnit(blk, shape) + unit = JoinUnit(blk, shape, join_unit_indexers) plan.append((placements, unit)) @@ -387,82 +360,192 @@ def _get_mgr_concatenation_plan(mgr: BlockManager): class JoinUnit: - def __init__(self, block: Block, shape: Shape): + def __init__(self, block: Block, shape: Shape, indexers=None): # Passing shape explicitly is required for cases when block is None. + # Note: block is None implies indexers is None, but not vice-versa + if indexers is None: + indexers = {} self.block = block + self.indexers = indexers self.shape = shape def __repr__(self) -> str: - return f"{type(self).__name__}({repr(self.block)})" + return f"{type(self).__name__}({repr(self.block)}, {self.indexers})" + + @cache_readonly + def needs_filling(self) -> bool: + for indexer in self.indexers.values(): + # FIXME: cache results of indexer == -1 checks. + if (indexer == -1).any(): + return True + + return False + + @cache_readonly + def dtype(self): + blk = self.block + if blk.values.dtype.kind == "V": + raise AssertionError("Block is None, no dtype") + + if not self.needs_filling: + return blk.dtype + return ensure_dtype_can_hold_na(blk.dtype) + + def _is_valid_na_for(self, dtype: DtypeObj) -> bool: + """ + Check that we are all-NA of a type/dtype that is compatible with this dtype. + Augments `self.is_na` with an additional check of the type of NA values. + """ + if not self.is_na: + return False + if self.block.dtype.kind == "V": + return True + + if self.dtype == object: + values = self.block.values + return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K")) + + na_value = self.block.fill_value + if na_value is NaT and not is_dtype_equal(self.dtype, dtype): + # e.g. we are dt64 and other is td64 + # fill_values match but we should not cast self.block.values to dtype + # TODO: this will need updating if we ever have non-nano dt64/td64 + return False + + if na_value is NA and needs_i8_conversion(dtype): + # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat + # e.g. self.dtype == "Int64" and dtype is td64, we dont want + # to consider these as matching + return False + + # TODO: better to use can_hold_element? 
+ return is_valid_na_for_dtype(na_value, dtype) @cache_readonly def is_na(self) -> bool: blk = self.block if blk.dtype.kind == "V": return True - return False - - def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike: - values: ArrayLike - if self.is_na: - return make_na_array(empty_dtype, self.shape) + if not blk._can_hold_na: + return False + values = blk.values + if values.size == 0: + return True + if isinstance(values.dtype, SparseDtype): + return False + + if values.ndim == 1: + # TODO(EA2D): no need for special case with 2D EAs + val = values[0] + if not is_scalar(val) or not isna(val): + # ideally isna_all would do this short-circuiting + return False + return isna_all(values) else: + val = values[0][0] + if not is_scalar(val) or not isna(val): + # ideally isna_all would do this short-circuiting + return False + return all(isna_all(row) for row in values) + + def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: + values: ArrayLike - if not self.block._can_consolidate: + if upcasted_na is None and self.block.dtype.kind != "V": + # No upcasting is necessary + fill_value = self.block.fill_value + values = self.block.get_values() + else: + fill_value = upcasted_na + + if self._is_valid_na_for(empty_dtype): + # note: always holds when self.block.dtype.kind == "V" + blk_dtype = self.block.dtype + + if blk_dtype == np.dtype("object"): + # we want to avoid filling with np.nan if we are + # using None; we already know that we are all + # nulls + values = self.block.values.ravel(order="K") + if len(values) and values[0] is None: + fill_value = None + + if is_datetime64tz_dtype(empty_dtype): + i8values = np.full(self.shape, fill_value.value) + return DatetimeArray(i8values, dtype=empty_dtype) + + elif is_1d_only_ea_dtype(empty_dtype): + empty_dtype = cast(ExtensionDtype, empty_dtype) + cls = empty_dtype.construct_array_type() + + missing_arr = cls._from_sequence([], dtype=empty_dtype) + ncols, nrows = self.shape + assert ncols == 1, ncols + empty_arr = -1 * np.ones((nrows,), dtype=np.intp) + return missing_arr.take( + empty_arr, allow_fill=True, fill_value=fill_value + ) + elif isinstance(empty_dtype, ExtensionDtype): + # TODO: no tests get here, a handful would if we disabled + # the dt64tz special-case above (which is faster) + cls = empty_dtype.construct_array_type() + missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype) + missing_arr[:] = fill_value + return missing_arr + else: + # NB: we should never get here with empty_dtype integer or bool; + # if we did, the missing_arr.fill would cast to gibberish + missing_arr = np.empty(self.shape, dtype=empty_dtype) + missing_arr.fill(fill_value) + return missing_arr + + if (not self.indexers) and (not self.block._can_consolidate): # preserve these for validation in concat_compat return self.block.values - # No dtype upcasting is done here, it will be performed during - # concatenation itself. - values = self.block.values + if self.block.is_bool: + # External code requested filling/upcasting, bool values must + # be upcasted to object to avoid being upcasted to numeric. + values = self.block.astype(np.dtype("object")).values + else: + # No dtype upcasting is done here, it will be performed during + # concatenation itself. + values = self.block.values - return values + if not self.indexers: + # If there's no indexing to be done, we want to signal outside + # code that this array must be copied explicitly. This is done + # by returning a view and checking `retval.base`. 
+ values = values.view() + else: + for ax, indexer in self.indexers.items(): + values = algos.take_nd(values, indexer, axis=ax) -def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike: - """ - Construct an np.ndarray or ExtensionArray of the given dtype and shape - holding all-NA values. - """ - if is_datetime64tz_dtype(dtype): - # NaT here is analogous to dtype.na_value below - i8values = np.full(shape, NaT.value) - return DatetimeArray(i8values, dtype=dtype) - - elif is_1d_only_ea_dtype(dtype): - dtype = cast(ExtensionDtype, dtype) - cls = dtype.construct_array_type() - - missing_arr = cls._from_sequence([], dtype=dtype) - nrows = shape[-1] - taker = -1 * np.ones((nrows,), dtype=np.intp) - return missing_arr.take(taker, allow_fill=True, fill_value=dtype.na_value) - elif isinstance(dtype, ExtensionDtype): - # TODO: no tests get here, a handful would if we disabled - # the dt64tz special-case above (which is faster) - cls = dtype.construct_array_type() - missing_arr = cls._empty(shape=shape, dtype=dtype) - missing_arr[:] = dtype.na_value - return missing_arr - else: - # NB: we should never get here with dtype integer or bool; - # if we did, the missing_arr.fill would cast to gibberish - missing_arr = np.empty(shape, dtype=dtype) - fill_value = _dtype_to_na_value(dtype) - missing_arr.fill(fill_value) - return missing_arr + return values -def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike: +def _concatenate_join_units( + join_units: list[JoinUnit], concat_axis: int, copy: bool +) -> ArrayLike: """ - Concatenate values from several join units along axis=1. + Concatenate values from several join units along selected axis. """ + if concat_axis == 0 and len(join_units) > 1: + # Concatenating join units along ax0 is handled in _merge_blocks. + raise AssertionError("Concatenating join units along axis0") empty_dtype = _get_empty_dtype(join_units) - to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype) for ju in join_units] + has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) + upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) + + to_concat = [ + ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) + for ju in join_units + ] if len(to_concat) == 1: # Only one block, nothing to concatenate. @@ -492,12 +575,12 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike concat_values = ensure_block_shape(concat_values, 2) else: - concat_values = concat_compat(to_concat, axis=1) + concat_values = concat_compat(to_concat, axis=concat_axis) return concat_values -def _dtype_to_na_value(dtype: DtypeObj): +def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): """ Find the NA value to go with this dtype. 
""" @@ -511,6 +594,9 @@ def _dtype_to_na_value(dtype: DtypeObj): # different from missing.na_value_for_dtype return None elif dtype.kind in ["i", "u"]: + if not has_none_blocks: + # different from missing.na_value_for_dtype + return None return np.nan elif dtype.kind == "O": return np.nan @@ -535,12 +621,14 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: empty_dtype = join_units[0].block.dtype return empty_dtype - needs_can_hold_na = any(unit.is_na for unit in join_units) + has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) - dtypes = [unit.block.dtype for unit in join_units if not unit.is_na] + dtypes = [unit.dtype for unit in join_units if not unit.is_na] + if not len(dtypes): + dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"] dtype = find_common_type(dtypes) - if needs_can_hold_na: + if has_none_blocks: dtype = ensure_dtype_can_hold_na(dtype) return dtype @@ -572,6 +660,9 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: # unless we're an extension dtype. all(not ju.is_na or ju.block.is_extension for ju in join_units) and + # no blocks with indexers (as then the dimensions do not fit) + all(not ju.indexers for ju in join_units) + and # only use this path when there is something to concatenate len(join_units) > 1 ) @@ -591,17 +682,28 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit: Extra items that didn't fit are returned as a separate block. """ + if 0 not in join_unit.indexers: + extra_indexers = join_unit.indexers + + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) + else: + extra_block = join_unit.block - extra_block = join_unit.block.getitem_block(slice(length, None)) - join_unit.block = join_unit.block.getitem_block(slice(length)) + extra_indexers = copy.copy(join_unit.indexers) + extra_indexers[0] = extra_indexers[0][length:] + join_unit.indexers[0] = join_unit.indexers[0][:length] extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] join_unit.shape = (length,) + join_unit.shape[1:] - return JoinUnit(block=extra_block, shape=extra_shape) + return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape) -def _combine_concat_plans(plans): +def _combine_concat_plans(plans, concat_axis: int): """ Combine multiple concatenation plans into one. 
@@ -611,6 +713,18 @@ def _combine_concat_plans(plans): for p in plans[0]: yield p[0], [p[1]] + elif concat_axis == 0: + offset = 0 + for plan in plans: + last_plc = None + + for plc, unit in plan: + yield plc.add(offset), [unit] + last_plc = plc + + if last_plc is not None: + offset += last_plc.as_slice.stop + else: # singleton list so we can modify it as a side-effect within _next_or_none num_ended = [0] diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 208a1a1757be2..a15cc2e8af66f 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -349,6 +349,20 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): self.assert_frame_equal(result, expected) + def test_setitem_with_expansion_row(self, data, na_value): + df = pd.DataFrame({"data": data[:1]}) + + df.loc[1, "data"] = data[1] + expected = pd.DataFrame({"data": data[:2]}) + self.assert_frame_equal(df, expected) + + # https://github.com/pandas-dev/pandas/issues/47284 + df.loc[2, "data"] = na_value + expected = pd.DataFrame( + {"data": pd.Series([data[0], data[1], na_value], dtype=data.dtype)} + ) + self.assert_frame_equal(df, expected) + def test_setitem_series(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 ser = pd.Series(data, name="data") diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 5cfad472e0134..f8e6e07050aca 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -159,7 +159,7 @@ def test_append_empty_dataframe(self): expected = df1.copy() tm.assert_frame_equal(result, expected) - def test_append_dtypes(self): + def test_append_dtypes(self, using_array_manager): # GH 5754 # row appends of different dtypes (so need to do by-item) @@ -183,7 +183,10 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) - expected = expected.astype(object) + if using_array_manager: + # TODO(ArrayManager) decide on exact casting rules in concat + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) @@ -192,7 +195,9 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} ) - expected = expected.astype(object) + if using_array_manager: + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": np.nan}, index=range(1)) @@ -201,7 +206,9 @@ def test_append_dtypes(self): expected = DataFrame( {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} ) - expected = expected.astype(object) + if using_array_manager: + # With ArrayManager, all-NaN float is not ignored + expected = expected.astype(object) tm.assert_frame_equal(result, expected) df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index a7b3c77e6ea0a..cc2f2ab7f7c1c 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -12,6 +12,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -744,3 +745,50 
@@ def test_concat_retain_attrs(data):
     df2.attrs = {1: 1}
     df = concat([df1, df2])
     assert df.attrs[1] == 1
+
+
+@td.skip_array_manager_invalid_test
+@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
+@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
+def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
+    # https://github.com/pandas-dev/pandas/issues/45637
+    df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
+    empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)
+    result = concat([empty, df])
+    expected = df
+    if df_dtype == "int64":
+        # TODO what exact behaviour do we want for integer eventually?
+        if empty_dtype == "float64":
+            expected = df.astype("float64")
+        else:
+            expected = df.astype("object")
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_array_manager_invalid_test
+@pytest.mark.parametrize("df_dtype", ["float64", "int64", "datetime64[ns]"])
+@pytest.mark.parametrize("empty_dtype", [None, "float64", "object"])
+def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
+    df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
+    empty = DataFrame({"foo": [np.nan], "bar": [np.nan]}, dtype=empty_dtype)
+    result = concat([empty, df], ignore_index=True)
+
+    if df_dtype == "int64":
+        # TODO what exact behaviour do we want for integer eventually?
+        if empty_dtype == "object":
+            df_dtype = "object"
+        else:
+            df_dtype = "float64"
+    expected = DataFrame({"foo": [None, 1, 2], "bar": [None, 1, 2]}, dtype=df_dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_array_manager_invalid_test
+def test_concat_ignore_empty_from_reindex():
+    # https://github.com/pandas-dev/pandas/pull/43507#issuecomment-920375856
+    df1 = DataFrame({"a": [1], "b": [pd.Timestamp("2012-01-01")]})
+    df2 = DataFrame({"a": [2]})
+
+    result = concat([df1, df2.reindex(columns=df1.columns)], ignore_index=True)
+    expected = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 1249194d3a36d..7e62500df3e8c 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -682,7 +682,7 @@ def _constructor(self):
 
         assert isinstance(result, NotADataFrame)
 
-    def test_join_append_timedeltas(self):
+    def test_join_append_timedeltas(self, using_array_manager):
         # timedelta64 issues with join/merge
         # GH 5695
 
@@ -696,9 +696,11 @@ def test_join_append_timedeltas(self):
             {
                 "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
                 "t": [timedelta(0, 22500), timedelta(0, 22500)],
-            },
-            dtype=object,
+            }
         )
+        if using_array_manager:
+            # TODO(ArrayManager) decide on exact casting rules in concat
+            expected = expected.astype(object)
         tm.assert_frame_equal(result, expected)
 
     def test_join_append_timedeltas2(self):

From 8f846ae5bab2dc8a9183e7c6692d9fa9ffbd7483 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Thu, 23 Jun 2022 00:31:09 +0100
Subject: [PATCH 47/49] Backport PR #47327 on branch 1.4.x (REGR: Fix fillna
 making a copy when dict was given as fill value and inplace is set) (#47448)

* Backport PR #47327: REGR: Fix fillna making a copy when dict was given as fill value and inplace is set

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
---
 doc/source/whatsnew/v1.4.3.rst | 1 +
 pandas/core/frame.py | 2 +-
 pandas/core/generic.py | 4 +++-
pandas/tests/frame/methods/test_fillna.py | 12 ++++++++++++ pandas/tests/frame/methods/test_update.py | 13 +++++++++++++ 5 files changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index f1532871d33c6..2550a12ebbb9d 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -29,6 +29,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_csv` raising error when :class:`DataFrame` contains extension dtype categorical column (:issue:`46297`, :issue:`46812`) - Fixed regression in representation of ``dtypes`` attribute of :class:`MultiIndex` (:issue:`46900`) - Fixed regression when setting values with :meth:`DataFrame.loc` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) +- Fixed regression in :meth:`DataFrame.fillna` and :meth:`DataFrame.update` creating a copy when updating inplace (:issue:`47188`) - Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - Fixed regression in :func:`concat` not sorting columns for mixed column names (:issue:`47127`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 391c12905adae..5b25f5be01d29 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7594,7 +7594,7 @@ def update( if mask.all(): continue - self[col] = expressions.where(mask, this, that) + self.loc[:, col] = expressions.where(mask, this, that) # ---------------------------------------------------------------------- # Data reshaping diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d924093203d7e..6357a670e6ba6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6482,7 +6482,9 @@ def fillna( if k not in result: continue downcast_k = downcast if not is_dict else downcast.get(k) - result[k] = result[k].fillna(v, limit=limit, downcast=downcast_k) + result.loc[:, k] = result[k].fillna( + v, limit=limit, downcast=downcast_k + ) return result if not inplace else None elif not is_list_like(value): diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index f4957efcd228a..33bd32ad65371 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -265,6 +265,7 @@ def test_fillna_downcast_false(self, frame_or_series): result = obj.fillna("", downcast=False) tm.assert_equal(result, obj) + @td.skip_array_manager_invalid_test @pytest.mark.parametrize("columns", [["A", "A", "B"], ["A", "A"]]) def test_fillna_dictlike_value_duplicate_colnames(self, columns): # GH#43476 @@ -654,6 +655,17 @@ def test_fillna_inplace_with_columns_limit_and_value(self): df.fillna(axis=1, value=100, limit=1, inplace=True) tm.assert_frame_equal(df, expected) + @td.skip_array_manager_invalid_test + @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}]) + def test_inplace_dict_update_view(self, val): + # GH#47188 + df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]}) + result_view = df[:] + df.fillna(val, inplace=True) + expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(result_view, expected) + def test_fillna_nonconsolidated_frame(): # https://github.com/pandas-dev/pandas/issues/36495 diff --git a/pandas/tests/frame/methods/test_update.py 
b/pandas/tests/frame/methods/test_update.py index 408113e9bc417..d3257ac09a0ab 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -146,3 +148,14 @@ def test_update_with_different_dtype(self): expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) tm.assert_frame_equal(df, expected) + + @td.skip_array_manager_invalid_test + def test_update_modify_view(self): + # GH#47188 + df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) + df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) + result_view = df2[:] + df2.update(df) + expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(result_view, expected) From bb726d38172fd7cdeab978a1675d9565a63009ed Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 22 Jun 2022 19:57:30 -0500 Subject: [PATCH 48/49] Backport PR #47476 on branch 1.4.x (DOC: v1.4.3 release date) (#47478) Backport PR #47476: DOC: v1.4.3 release date Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.4.3.rst | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 2550a12ebbb9d..be5ac74201be8 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -1,7 +1,7 @@ .. _whatsnew_143: -What's new in 1.4.3 (April ??, 2022) ------------------------------------- +What's new in 1.4.3 (June 23, 2022) +----------------------------------- These are the changes in pandas 1.4.3. See :ref:`release` for a full changelog including other versions of pandas. @@ -12,10 +12,10 @@ including other versions of pandas. .. _whatsnew_143.concat: -Behaviour of ``concat`` with empty or all-NA DataFrame columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Behavior of ``concat`` with empty or all-NA DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The behaviour change in version 1.4.0 to stop ignoring the data type +The behavior change in version 1.4.0 to stop ignoring the data type of empty or all-NA columns with float or object dtype in :func:`concat` (:ref:`whatsnew_140.notable_bug_fixes.concat_with_empty_or_all_na`) has been reverted (:issue:`45637`). 
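
Concretely, with the revert in place an empty frame no longer drags the result to ``object`` — a small illustration of the restored behavior (a sketch assuming pandas 1.4.3, mirroring the new ``test_concat_ignore_empty_object_float`` above):

    import pandas as pd

    df = pd.DataFrame({"foo": [1.0, 2.0], "bar": [1.0, 2.0]})    # float64 columns
    empty = pd.DataFrame(columns=["foo", "bar"], dtype=object)   # no rows, object dtype

    result = pd.concat([empty, df])
    print(result.dtypes)  # foo and bar stay float64: the empty object columns are ignored
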
@@ -30,7 +30,7 @@ Fixed regressions - Fixed regression in representation of ``dtypes`` attribute of :class:`MultiIndex` (:issue:`46900`) - Fixed regression when setting values with :meth:`DataFrame.loc` updating :class:`RangeIndex` when index was set as new column and column was updated afterwards (:issue:`47128`) - Fixed regression in :meth:`DataFrame.fillna` and :meth:`DataFrame.update` creating a copy when updating inplace (:issue:`47188`) -- Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when ``np.nan`` in the sorting column (:issue:`46589`) +- Fixed regression in :meth:`DataFrame.nsmallest` led to wrong results when the sorting column has ``np.nan`` values (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - Fixed regression in :func:`concat` not sorting columns for mixed column names (:issue:`47127`) - Fixed regression in :meth:`.Groupby.transform` and :meth:`.Groupby.agg` failing with ``engine="numba"`` when the index was a :class:`MultiIndex` (:issue:`46867`) @@ -39,7 +39,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` with ``index_col=False`` identifying first row as index names when ``header=None`` (:issue:`46955`) - Fixed regression in :meth:`.DataFrameGroupBy.agg` when used with list-likes or dict-likes and ``axis=1`` that would give incorrect results; now raises ``NotImplementedError`` (:issue:`46995`) - Fixed regression in :meth:`DataFrame.resample` and :meth:`DataFrame.rolling` when used with list-likes or dict-likes and ``axis=1`` that would raise an unintuitive error message; now raises ``NotImplementedError`` (:issue:`46904`) -- Fixed regression in :func:`assert_index_equal` when ``check_order=False`` and :class:`Index` has extension or object dtype (:issue:`47207`) +- Fixed regression in :func:`testing.assert_index_equal` when ``check_order=False`` and :class:`Index` has extension or object dtype (:issue:`47207`) - Fixed regression in :func:`read_excel` returning ints as floats on certain input sheets (:issue:`46988`) - Fixed regression in :meth:`DataFrame.shift` when ``axis`` is ``columns`` and ``fill_value`` is absent, ``freq`` is ignored (:issue:`47039`) - Fixed regression in :meth:`DataFrame.to_json` causing a segmentation violation when :class:`DataFrame` is created with an ``index`` parameter of the type :class:`PeriodIndex` (:issue:`46683`) @@ -50,9 +50,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in :meth:`pd.eval`, :meth:`DataFrame.eval` and :meth:`DataFrame.query` where passing empty ``local_dict`` or ``global_dict`` was treated as passing ``None`` (:issue:`47084`) -- Most I/O methods do no longer suppress ``OSError`` and ``ValueError`` when closing file handles (:issue:`47136`) -- +- Bug in :func:`pandas.eval`, :meth:`DataFrame.eval` and :meth:`DataFrame.query` where passing empty ``local_dict`` or ``global_dict`` was treated as passing ``None`` (:issue:`47084`) +- Most I/O methods no longer suppress ``OSError`` and ``ValueError`` when closing file handles (:issue:`47136`) .. --------------------------------------------------------------------------- @@ -61,7 +60,6 @@ Bug fixes Other ~~~~~ - The minimum version of Cython needed to compile pandas is now ``0.29.30`` (:issue:`41935`) -- .. --------------------------------------------------------------------------- From e8093ba372f9adfe79439d90fe74b0b5b6dea9d6 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Thu, 23 Jun 2022 11:35:25 +0100 Subject: [PATCH 49/49] RLS: 1.4.3
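
The ``fillna``/``update`` regression fix backported in patch 47 is easiest to see through the view behavior its new tests assert — a short sketch (assuming pandas 1.4.3, mirroring ``test_inplace_dict_update_view`` above):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]})
    view = df[:]  # shares the underlying data with df
    df.fillna({"x": -1, "y": -1}, inplace=True)

    # With the fix, a dict-valued inplace fillna no longer makes a copy,
    # so the pre-existing view reflects the update as well.
    print(view)
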