Skip to content

Commit 9c35c43

Browse files
committed
[SPARK-52580][PS] Avoid CAST_INVALID_INPUT of replace in ANSI mode
### What changes were proposed in this pull request? Avoid CAST_INVALID_INPUT of `replace` in ANSI mode. Specifically, under ANSI mode: - used try_cast() to safely cast values - for NaN checks, we now avoid F.isnan() on non-numeric types. An example of the spark plan difference between ANSI on/off is: ``` # if the original column is of StringType # ANSI off Column<'CASE WHEN in(C, 0, 1, 2, 3, 5, 6) THEN 4 ELSE C END'> # ANSI on Column<'CASE WHEN in(C, TRY_CAST(0 AS STRING), TRY_CAST(1 AS STRING), TRY_CAST(2 AS STRING), TRY_CAST(3 AS STRING), TRY_CAST(5 AS STRING), TRY_CAST(6 AS STRING)) THEN TRY_CAST(4 AS STRING) ELSE TRY_CAST(C AS STRING) END'> ``` ### Why are the changes needed? Ensure pandas on Spark works well with ANSI mode on. Part of https://issues.apache.org/jira/browse/SPARK-52556. ### Does this PR introduce _any_ user-facing change? Yes, `replace` works in ANSI mode, for example ```py >>> ps.set_option("compute.fail_on_ansi_mode", False) >>> ps.set_option("compute.ansi_mode_support", True) >>> pdf = pd.DataFrame( ... {"A": [0, 1, 2, 3, np.nan], "B": [5, 6, 7, 8, np.nan], "C": ["a", "b", "c", "d", None]}, ... index=np.random.rand(5), ... ) >>> psdf = ps.from_pandas(pdf) >>> psdf["C"].replace([0, 1, 2, 3, 5, 6], 4) 0.458472 a 0.749773 b 0.222904 c 0.397280 d 0.293933 None Name: C, dtype: object >>> psdf.replace([0, 1, 2, 3, 5, 6], [6, 5, 4, 3, 2, 1]) A B C 0.458472 6.0 2.0 a 0.749773 5.0 1.0 b 0.222904 4.0 7.0 c 0.397280 3.0 8.0 d 0.293933 NaN NaN None ``` ### How was this patch tested? Unit tests ### Was this patch authored or co-authored using generative AI tooling? No Closes #51297 from xinrong-meng/replace. Authored-by: Xinrong Meng <[email protected]> Signed-off-by: Xinrong Meng <[email protected]>
1 parent e9a285e commit 9c35c43

File tree

2 files changed

+46
-11
lines changed

2 files changed

+46
-11
lines changed

python/pyspark/pandas/series.py

Lines changed: 46 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
from pyspark.pandas.plot import PandasOnSparkPlotAccessor
105105
from pyspark.pandas.utils import (
106106
combine_frames,
107+
is_ansi_mode_enabled,
107108
is_name_like_tuple,
108109
is_name_like_value,
109110
name_like_string,
@@ -5106,33 +5107,68 @@ def replace(
51065107
)
51075108
)
51085109
to_replace = {k: v for k, v in zip(to_replace, value)}
5110+
5111+
spark_session = self._internal.spark_frame.sparkSession
5112+
ansi_mode = is_ansi_mode_enabled(spark_session)
5113+
col_type = self.spark.data_type
5114+
51095115
if isinstance(to_replace, dict):
51105116
is_start = True
51115117
if len(to_replace) == 0:
51125118
current = self.spark.column
51135119
else:
51145120
for to_replace_, value in to_replace.items():
5115-
cond = (
5116-
(F.isnan(self.spark.column) | self.spark.column.isNull())
5117-
if pd.isna(to_replace_)
5118-
else (self.spark.column == F.lit(to_replace_))
5119-
)
5121+
if pd.isna(to_replace_):
5122+
if ansi_mode and isinstance(col_type, NumericType):
5123+
cond = F.isnan(self.spark.column) | self.spark.column.isNull()
5124+
else:
5125+
cond = self.spark.column.isNull()
5126+
else:
5127+
to_replace_lit = (
5128+
F.lit(to_replace_).try_cast(col_type)
5129+
if ansi_mode
5130+
else F.lit(to_replace_)
5131+
)
5132+
cond = self.spark.column == to_replace_lit
5133+
value_expr = F.lit(value).try_cast(col_type) if ansi_mode else F.lit(value)
51205134
if is_start:
5121-
current = F.when(cond, value)
5135+
current = F.when(cond, value_expr)
51225136
is_start = False
51235137
else:
5124-
current = current.when(cond, value)
5138+
current = current.when(cond, value_expr)
51255139
current = current.otherwise(self.spark.column)
51265140
else:
51275141
if regex:
51285142
# to_replace must be a string
51295143
cond = self.spark.column.rlike(cast(str, to_replace))
51305144
else:
5131-
cond = self.spark.column.isin(to_replace)
5145+
if ansi_mode:
5146+
to_replace_values = (
5147+
[to_replace]
5148+
if not is_list_like(to_replace) or isinstance(to_replace, str)
5149+
else to_replace
5150+
)
5151+
to_replace_values = cast(List[Any], to_replace_values)
5152+
literals = [F.lit(v).try_cast(col_type) for v in to_replace_values]
5153+
cond = self.spark.column.isin(literals)
5154+
else:
5155+
cond = self.spark.column.isin(to_replace)
51325156
# to_replace may be a scalar
51335157
if np.array(pd.isna(to_replace)).any():
5134-
cond = cond | F.isnan(self.spark.column) | self.spark.column.isNull()
5135-
current = F.when(cond, value).otherwise(self.spark.column)
5158+
if ansi_mode:
5159+
if isinstance(col_type, NumericType):
5160+
cond = cond | F.isnan(self.spark.column) | self.spark.column.isNull()
5161+
else:
5162+
cond = cond | self.spark.column.isNull()
5163+
else:
5164+
cond = cond | F.isnan(self.spark.column) | self.spark.column.isNull()
5165+
5166+
if ansi_mode:
5167+
value_expr = F.lit(value).try_cast(col_type)
5168+
current = F.when(cond, value_expr).otherwise(self.spark.column.try_cast(col_type))
5169+
5170+
else:
5171+
current = F.when(cond, value).otherwise(self.spark.column)
51365172

51375173
return self._with_new_scol(current) # TODO: dtype?
51385174

python/pyspark/pandas/tests/computation/test_missing_data.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,6 @@ def test_fillna(self):
274274
pdf.fillna({("x", "a"): -1, ("x", "b"): -2, ("y", "c"): -5}),
275275
)
276276

277-
@unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
278277
def test_replace(self):
279278
pdf = pd.DataFrame(
280279
{

0 commit comments

Comments
 (0)