Skip to content

Commit cadcc3e

Browse files
committed
Merge remote-tracking branch 'upstream/master' into styler_to_html_sparse_args
2 parents 71c9dae + 8924277 commit cadcc3e

File tree

17 files changed

+290
-77
lines changed

17 files changed

+290
-77
lines changed

ci/code_checks.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
121121
pandas/io/parsers/ \
122122
pandas/io/sas/ \
123123
pandas/io/sql.py \
124-
pandas/tseries/ \
125-
pandas/io/formats/style_render.py
124+
pandas/tseries/
126125
RET=$(($RET + $?)) ; echo $MSG "DONE"
127126

128127
fi

doc/source/whatsnew/v1.4.0.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ Numeric
205205
^^^^^^^
206206
- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`)
207207
- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` is used (:issue:`41931`)
208-
-
208+
- Bug in ``numexpr`` engine still being used when the option ``compute.use_numexpr`` is set to ``False`` (:issue:`32556`)
209209

210210
Conversion
211211
^^^^^^^^^^
@@ -262,10 +262,11 @@ Groupby/resample/rolling
262262
^^^^^^^^^^^^^^^^^^^^^^^^
263263
- Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
264264
- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
265-
-
265+
- Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
266266

267267
Reshaping
268268
^^^^^^^^^
269+
- Improved error message when creating a :class:`DataFrame` column from a multi-dimensional :class:`numpy.ndarray` (:issue:`42463`)
269270
- :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`)
270271
- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`)
271272
-
@@ -287,6 +288,7 @@ Styler
287288

288289
Other
289290
^^^^^
291+
- Bug in :meth:`CustomBusinessMonthBegin.__add__` (:meth:`CustomBusinessMonthEnd.__add__`) not applying the extra ``offset`` parameter when beginning (end) of the target month is already a business day (:issue:`41356`)
290292

291293
.. ***DO NOT USE THIS SECTION***
292294

pandas/_libs/algos.pyx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,8 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
217217
This is a reverse of the label factorization process.
218218
"""
219219
cdef:
220-
Py_ssize_t i, loc, label, n
221-
ndarray[intp_t] indexer, where, counts
220+
Py_ssize_t i, label, n
221+
intp_t[::1] indexer, where, counts
222222

223223
counts = np.zeros(ngroups + 1, dtype=np.intp)
224224
n = len(index)
@@ -241,7 +241,7 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
241241
indexer[where[label]] = i
242242
where[label] += 1
243243

244-
return indexer, counts
244+
return indexer.base, counts.base
245245

246246

247247
cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:

pandas/_libs/tslibs/offsets.pyx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3370,7 +3370,10 @@ cdef class _CustomBusinessMonth(BusinessMixin):
33703370
"""
33713371
Define default roll function to be called in apply method.
33723372
"""
3373-
cbday = CustomBusinessDay(n=self.n, normalize=False, **self.kwds)
3373+
cbday_kwds = self.kwds.copy()
3374+
cbday_kwds['offset'] = timedelta(0)
3375+
3376+
cbday = CustomBusinessDay(n=1, normalize=False, **cbday_kwds)
33743377

33753378
if self._prefix.endswith("S"):
33763379
# MonthBegin
@@ -3414,6 +3417,9 @@ cdef class _CustomBusinessMonth(BusinessMixin):
34143417

34153418
new = cur_month_offset_date + n * self.m_offset
34163419
result = self.cbday_roll(new)
3420+
3421+
if self.offset:
3422+
result = result + self.offset
34173423
return result
34183424

34193425

pandas/_libs/window/aggregations.pyx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,10 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
310310
t = y - mean_x[0]
311311
compensation[0] = t + mean_x[0] - y
312312
delta = t
313-
mean_x[0] = mean_x[0] + delta / nobs[0]
313+
if nobs[0]:
314+
mean_x[0] = mean_x[0] + delta / nobs[0]
315+
else:
316+
mean_x[0] = 0
314317
ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0])
315318

316319

pandas/core/computation/eval.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,10 @@ def _check_engine(engine: str | None) -> str:
4343
Engine name.
4444
"""
4545
from pandas.core.computation.check import NUMEXPR_INSTALLED
46+
from pandas.core.computation.expressions import USE_NUMEXPR
4647

4748
if engine is None:
48-
engine = "numexpr" if NUMEXPR_INSTALLED else "python"
49+
engine = "numexpr" if USE_NUMEXPR else "python"
4950

5051
if engine not in ENGINES:
5152
valid_engines = list(ENGINES.keys())

pandas/core/groupby/groupby.py

Lines changed: 44 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2897,16 +2897,15 @@ def _get_cythonized_result(
28972897

28982898
ids, _, ngroups = grouper.group_info
28992899
output: dict[base.OutputKey, np.ndarray] = {}
2900-
base_func = getattr(libgroupby, how)
2901-
2902-
error_msg = ""
2903-
for idx, obj in enumerate(self._iterate_slices()):
2904-
name = obj.name
2905-
values = obj._values
29062900

2907-
if numeric_only and not is_numeric_dtype(values.dtype):
2908-
continue
2901+
base_func = getattr(libgroupby, how)
2902+
base_func = partial(base_func, labels=ids)
2903+
if needs_ngroups:
2904+
base_func = partial(base_func, ngroups=ngroups)
2905+
if min_count is not None:
2906+
base_func = partial(base_func, min_count=min_count)
29092907

2908+
def blk_func(values: ArrayLike) -> ArrayLike:
29102909
if aggregate:
29112910
result_sz = ngroups
29122911
else:
@@ -2915,54 +2914,31 @@ def _get_cythonized_result(
29152914
result = np.zeros(result_sz, dtype=cython_dtype)
29162915
if needs_2d:
29172916
result = result.reshape((-1, 1))
2918-
func = partial(base_func, result)
2917+
func = partial(base_func, out=result)
29192918

29202919
inferences = None
29212920

29222921
if needs_counts:
29232922
counts = np.zeros(self.ngroups, dtype=np.int64)
2924-
func = partial(func, counts)
2923+
func = partial(func, counts=counts)
29252924

29262925
if needs_values:
29272926
vals = values
29282927
if pre_processing:
2929-
try:
2930-
vals, inferences = pre_processing(vals)
2931-
except TypeError as err:
2932-
error_msg = str(err)
2933-
howstr = how.replace("group_", "")
2934-
warnings.warn(
2935-
"Dropping invalid columns in "
2936-
f"{type(self).__name__}.{howstr} is deprecated. "
2937-
"In a future version, a TypeError will be raised. "
2938-
f"Before calling .{howstr}, select only columns which "
2939-
"should be valid for the function.",
2940-
FutureWarning,
2941-
stacklevel=3,
2942-
)
2943-
continue
2928+
vals, inferences = pre_processing(vals)
2929+
29442930
vals = vals.astype(cython_dtype, copy=False)
29452931
if needs_2d:
29462932
vals = vals.reshape((-1, 1))
2947-
func = partial(func, vals)
2948-
2949-
func = partial(func, ids)
2950-
2951-
if min_count is not None:
2952-
func = partial(func, min_count)
2933+
func = partial(func, values=vals)
29532934

29542935
if needs_mask:
29552936
mask = isna(values).view(np.uint8)
2956-
func = partial(func, mask)
2957-
2958-
if needs_ngroups:
2959-
func = partial(func, ngroups)
2937+
func = partial(func, mask=mask)
29602938

29612939
if needs_nullable:
29622940
is_nullable = isinstance(values, BaseMaskedArray)
29632941
func = partial(func, nullable=is_nullable)
2964-
if post_processing:
2965-
post_processing = partial(post_processing, nullable=is_nullable)
29662942

29672943
func(**kwargs) # Call func to modify indexer values in place
29682944

@@ -2973,9 +2949,38 @@ def _get_cythonized_result(
29732949
result = algorithms.take_nd(values, result)
29742950

29752951
if post_processing:
2976-
result = post_processing(result, inferences)
2952+
pp_kwargs = {}
2953+
if needs_nullable:
2954+
pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)
29772955

2978-
key = base.OutputKey(label=name, position=idx)
2956+
result = post_processing(result, inferences, **pp_kwargs)
2957+
2958+
return result
2959+
2960+
error_msg = ""
2961+
for idx, obj in enumerate(self._iterate_slices()):
2962+
values = obj._values
2963+
2964+
if numeric_only and not is_numeric_dtype(values.dtype):
2965+
continue
2966+
2967+
try:
2968+
result = blk_func(values)
2969+
except TypeError as err:
2970+
error_msg = str(err)
2971+
howstr = how.replace("group_", "")
2972+
warnings.warn(
2973+
"Dropping invalid columns in "
2974+
f"{type(self).__name__}.{howstr} is deprecated. "
2975+
"In a future version, a TypeError will be raised. "
2976+
f"Before calling .{howstr}, select only columns which "
2977+
"should be valid for the function.",
2978+
FutureWarning,
2979+
stacklevel=3,
2980+
)
2981+
continue
2982+
2983+
key = base.OutputKey(label=obj.name, position=idx)
29792984
output[key] = result
29802985

29812986
# error_msg is "" on an frame/series with no rows or columns

pandas/core/indexes/base.py

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,11 @@ def _format_duplicate_message(self) -> DataFrame:
672672
assert len(duplicates)
673673

674674
out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates]
675+
if self._is_multi:
676+
# test_format_duplicate_labels_message_multi
677+
# error: "Type[Index]" has no attribute "from_tuples" [attr-defined]
678+
out.index = type(self).from_tuples(out.index) # type: ignore[attr-defined]
679+
675680
if self.nlevels == 1:
676681
out = out.rename_axis("label")
677682
return out.to_frame(name="positions")
@@ -5400,22 +5405,15 @@ def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]
54005405

54015406
self._raise_if_missing(keyarr, indexer, axis_name)
54025407

5403-
if (
5404-
needs_i8_conversion(self.dtype)
5405-
or is_categorical_dtype(self.dtype)
5406-
or is_interval_dtype(self.dtype)
5407-
):
5408-
# For CategoricalIndex take instead of reindex to preserve dtype.
5409-
# For IntervalIndex this is to map integers to the Intervals they match to.
5410-
keyarr = self.take(indexer)
5411-
if keyarr.dtype.kind in ["m", "M"]:
5412-
# DTI/TDI.take can infer a freq in some cases when we dont want one
5413-
if isinstance(key, list) or (
5414-
isinstance(key, type(self))
5415-
# "Index" has no attribute "freq"
5416-
and key.freq is None # type: ignore[attr-defined]
5417-
):
5418-
keyarr = keyarr._with_freq(None)
5408+
keyarr = self.take(indexer)
5409+
if keyarr.dtype.kind in ["m", "M"]:
5410+
# DTI/TDI.take can infer a freq in some cases when we dont want one
5411+
if isinstance(key, list) or (
5412+
isinstance(key, type(self))
5413+
# "Index" has no attribute "freq"
5414+
and key.freq is None # type: ignore[attr-defined]
5415+
):
5416+
keyarr = keyarr._with_freq(None)
54195417

54205418
return keyarr, indexer
54215419

pandas/core/internals/construction.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,8 @@ def _extract_index(data) -> Index:
615615
elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
616616
have_raw_arrays = True
617617
raw_lengths.append(len(val))
618+
elif isinstance(val, np.ndarray) and val.ndim > 1:
619+
raise ValueError("Per-column arrays must each be 1-dimensional")
618620

619621
if not indexes and not raw_lengths:
620622
raise ValueError("If using all scalar values, you must pass an index")

pandas/tests/computation/test_eval.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1865,6 +1865,35 @@ def test_invalid_engine():
18651865
pd.eval("x + y", local_dict={"x": 1, "y": 2}, engine="asdf")
18661866

18671867

1868+
@td.skip_if_no_ne
1869+
@pytest.mark.parametrize(
1870+
("use_numexpr", "expected"),
1871+
(
1872+
(True, "numexpr"),
1873+
(False, "python"),
1874+
),
1875+
)
1876+
def test_numexpr_option_respected(use_numexpr, expected):
1877+
# GH 32556
1878+
from pandas.core.computation.eval import _check_engine
1879+
1880+
with pd.option_context("compute.use_numexpr", use_numexpr):
1881+
result = _check_engine(None)
1882+
assert result == expected
1883+
1884+
1885+
@td.skip_if_no_ne
1886+
def test_numexpr_option_incompatible_op():
1887+
# GH 32556
1888+
with pd.option_context("compute.use_numexpr", False):
1889+
df = DataFrame(
1890+
{"A": [True, False, True, False, None, None], "B": [1, 2, 3, 4, 5, 6]}
1891+
)
1892+
result = df.query("A.isnull()")
1893+
expected = DataFrame({"A": [None, None], "B": [5, 6]}, index=[4, 5])
1894+
tm.assert_frame_equal(result, expected)
1895+
1896+
18681897
@td.skip_if_no_ne
18691898
def test_invalid_parser():
18701899
msg = "Invalid parser 'asdf' passed"

0 commit comments

Comments
 (0)