From ab531e1269b32883eec433e5fcf6f5d0e8793a6d Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 14 Apr 2021 09:51:34 +0900 Subject: [PATCH] [SPARK-34495] Port/integrate Koalas remaining codes into PySpark --- python/pyspark/pandas/generic.py | 4 + python/pyspark/pandas/indexing.py | 19 +++++ .../pandas/tests/test_ops_on_diff_frames.py | 76 ++++++++----------- 3 files changed, 56 insertions(+), 43 deletions(-) diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py index a08e80655f09..7edaabbd8676 100644 --- a/python/pyspark/pandas/generic.py +++ b/python/pyspark/pandas/generic.py @@ -2877,6 +2877,10 @@ def to_markdown(self, buf=None, mode=None) -> str: str Series or DataFrame in Markdown-friendly format. + Notes + ----- + Requires the `tabulate <https://pypi.org/project/tabulate>`_ package. + Examples -------- >>> kser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal") diff --git a/python/pyspark/pandas/indexing.py b/python/pyspark/pandas/indexing.py index 0adf45733c10..7f964df8bf07 100644 --- a/python/pyspark/pandas/indexing.py +++ b/python/pyspark/pandas/indexing.py @@ -1697,6 +1697,25 @@ def _select_cols_else( ) def __setitem__(self, key, value): + if is_list_like(value) and not isinstance(value, spark.Column): + iloc_item = self[key] + if not is_list_like(key) or not is_list_like(iloc_item): + raise ValueError("setting an array element with a sequence.") + else: + shape_iloc_item = iloc_item.shape + len_iloc_item = shape_iloc_item[0] + len_value = len(value) + if len_iloc_item != len_value: + if self._is_series: + raise ValueError( + "cannot set using a list-like indexer with a different length than " + "the value" + ) + else: + raise ValueError( + "shape mismatch: value array of shape ({},) could not be broadcast " + "to indexing result of shape {}".format(len_value, shape_iloc_item) + ) super().__setitem__(key, value) # Update again with resolved_copy to drop extra columns. 
self._kdf._update_internal_frame( diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index 9070b5ad7ada..d567bae3cd91 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -1151,25 +1151,17 @@ def test_frame_iloc_setitem(self): pdf.iloc[[0, 1, 2], 1] = -pdf.max_speed self.assert_eq(kdf, pdf) - # TODO: matching the behavior with pandas 1.2 and uncomment below test - # with self.assertRaisesRegex( - # ValueError, - # "shape mismatch: value array of shape (3,) could not be broadcast to indexing " - # "result of shape (2,1)", - # ): - # kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed + with self.assertRaisesRegex( + ValueError, "shape mismatch", + ): + kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed kdf.iloc[[0, 1, 2], 1] = 10 * another_kdf.max_speed pdf.iloc[[0, 1, 2], 1] = 10 * pdf.max_speed self.assert_eq(kdf, pdf) - # TODO: matching the behavior with pandas 1.2 and uncomment below test - # with self.assertRaisesRegex( - # ValueError, - # "shape mismatch: value array of shape (3,) could not be broadcast to indexing " - # "result of shape (1,)", - # ): - # kdf.iloc[[0], 1] = 10 * another_kdf.max_speed + with self.assertRaisesRegex(ValueError, "shape mismatch"): + kdf.iloc[[0], 1] = 10 * another_kdf.max_speed def test_series_loc_setitem(self): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"]) @@ -1269,12 +1261,11 @@ def test_series_iloc_setitem(self): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # TODO: matching the behavior with pandas 1.2 and uncomment below test. 
- # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kser.iloc[[1, 2]] = -kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kser.iloc[[1, 2]] = -kser_another kser.iloc[[0, 1, 2]] = 10 * kser_another pser.iloc[[0, 1, 2]] = 10 * pser_another @@ -1282,11 +1273,11 @@ def test_series_iloc_setitem(self): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kser.iloc[[0]] = 10 * kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kser.iloc[[0]] = 10 * kser_another kser1.iloc[[0, 1, 2]] = -kser_another pser1.iloc[[0, 1, 2]] = -pser_another @@ -1294,11 +1285,11 @@ def test_series_iloc_setitem(self): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kser1.iloc[[1, 2]] = -kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kser1.iloc[[1, 2]] = -kser_another pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"]) kdf = ps.from_pandas(pdf) @@ -1317,12 +1308,11 @@ def test_series_iloc_setitem(self): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # TODO: matching the behavior with pandas 1.2 and uncomment below test. 
- # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kiloc[[1, 2]] = -kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kiloc[[1, 2]] = -kser_another kiloc[[0, 1, 2]] = 10 * kser_another piloc[[0, 1, 2]] = 10 * pser_another @@ -1330,11 +1320,11 @@ def test_series_iloc_setitem(self): self.assert_eq(kdf, pdf) self.assert_eq(ksery, psery) - # with self.assertRaisesRegex( - # ValueError, - # "cannot set using a list-like indexer with a different length than the value", - # ): - # kiloc[[0]] = 10 * kser_another + with self.assertRaisesRegex( + ValueError, + "cannot set using a list-like indexer with a different length than the value", + ): + kiloc[[0]] = 10 * kser_another def test_update(self): pdf = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]}) @@ -1863,7 +1853,7 @@ def test_frame_iloc_setitem(self): another_kdf = ps.DataFrame(pdf) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - kdf.iloc[[1, 2], [1]] = another_kdf.max_speed + kdf.iloc[[1, 2], [1]] = another_kdf.max_speed.iloc[[1, 2]] def test_series_loc_setitem(self): pser = pd.Series([1, 2, 3], index=["cobra", "viper", "sidewinder"]) @@ -1889,7 +1879,7 @@ def test_series_iloc_setitem(self): kser_another = ps.from_pandas(pser_another) with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"): - kser.iloc[[1]] = -kser_another + kser.iloc[[1]] = -kser_another.iloc[[1]] def test_where(self): pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]})