cloud-fan · cloud-fan · Nov 26, 2024 · Nov 22, 2024
diff --git a/python/pyspark/sql/classic/column.py b/python/pyspark/sql/classic/column.py
@@ -605,6 +605,10 @@ def over(self, window: "WindowSpec") -> ParentColumn:
         jc = self._jc.over(window._jspec)
         return Column(jc)
 
+    def outer(self) -> ParentColumn:
+        # Delegate to the wrapped `_jc` object's outer() and re-wrap the result as a Column.
+        return Column(self._jc.outer())
+
     def __nonzero__(self) -> None:
         raise PySparkValueError(
             errorClass="CANNOT_CONVERT_COLUMN_INTO_BOOL",

diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
@@ -1521,6 +1521,24 @@ def over(self, window: "WindowSpec") -> "Column":
         """
         ...
 
+    @dispatch_col_method
+    def outer(self) -> "Column":
+        """
+        Mark this Column as outer if its expression may reference columns in the outer plan.
+
+        This triggers lazy analysis of a Spark Classic DataFrame, so that the DataFrame can be
+        used to build subquery expressions. A Spark Connect DataFrame is always lazily analyzed
+        and does not need this method.
+
+        .. versionadded:: 4.0.0
+
+        See Also
+        --------
+        pyspark.sql.dataframe.DataFrame.scalar
+        pyspark.sql.dataframe.DataFrame.exists
+        """
+        ...
+
     @dispatch_col_method
     def __nonzero__(self) -> None:
         ...

diff --git a/python/pyspark/sql/connect/column.py b/python/pyspark/sql/connect/column.py
@@ -34,6 +34,7 @@
     PySparkTypeError,
     PySparkAttributeError,
     PySparkValueError,
+    PySparkNotImplementedError,
 )
 from pyspark.sql.types import DataType
 from pyspark.sql.utils import enum_to_value
@@ -458,6 +459,13 @@ def over(self, window: "WindowSpec") -> ParentColumn:  # type: ignore[override]
 
         return Column(WindowExpression(windowFunction=self._expr, windowSpec=window))
 
+    def outer(self) -> ParentColumn:
+        # TODO(SPARK-50134): Implement this method
+        # Until then, outer() on a Spark Connect Column unconditionally raises NOT_IMPLEMENTED.
+        raise PySparkNotImplementedError(
+            messageParameters={"feature": "outer()"}, errorClass="NOT_IMPLEMENTED"
+        )
+
     def isin(self, *cols: Any) -> ParentColumn:
         if len(cols) == 1 and isinstance(cols[0], (list, set)):
             _cols = list(cols[0])

diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py
@@ -42,7 +42,7 @@
 
 import numpy as np
 
-from pyspark.errors import PySparkNotImplementedError, PySparkTypeError, PySparkValueError
+from pyspark.errors import PySparkTypeError, PySparkValueError
 from pyspark.errors.utils import _with_origin
 from pyspark.sql.dataframe import DataFrame as ParentDataFrame
 from pyspark.sql import Column
@@ -257,17 +257,6 @@ def col(col: str) -> Column:
 column = col
 
 
-def outer(colOrExprSQL: Union[Column, str]) -> Column:
-    # TODO(SPARK-50134): Implement this method
-    raise PySparkNotImplementedError(
-        errorClass="NOT_IMPLEMENTED",
-        messageParameters={"feature": "outer()"},
-    )
-
-
-outer.__doc__ = pysparkfuncs.outer.__doc__
-
-
 def lit(col: Any) -> Column:
     from pyspark.sql.connect.column import Column as ConnectColumn
 

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
@@ -6525,7 +6525,7 @@ def scalar(self) -> Column:
         >>> employees.alias("e1").where(
         ...     sf.col("salary")
         ...     > employees.alias("e2").where(
-        ...         sf.col("e2.department_id") == sf.outer(sf.col("e1.department_id"))
+        ...         sf.col("e2.department_id") == sf.col("e1.department_id").outer()
         ...     ).select(sf.avg("salary")).scalar()
         ... ).select("name", "salary", "department_id").show()
         +-----+------+-------------+
@@ -6544,7 +6544,7 @@ def scalar(self) -> Column:
         ...     sf.format_number(
         ...         sf.lit(100) * sf.col("salary") /
         ...             employees.alias("e2").where(
-        ...                 sf.col("e2.department_id") == sf.outer(sf.col("e1.department_id"))
+        ...                 sf.col("e2.department_id") == sf.col("e1.department_id").outer()
         ...             ).select(sf.sum("salary")).scalar().alias("avg_salary"),
         ...         1
         ...     ).alias("salary_proportion_in_department")
@@ -6599,7 +6599,7 @@ def exists(self) -> Column:
         >>> from pyspark.sql import functions as sf
         >>> customers.alias("c").where(
         ...     orders.alias("o").where(
-        ...         sf.col("o.customer_id") == sf.outer(sf.col("c.customer_id"))
+        ...         sf.col("o.customer_id") == sf.col("c.customer_id").outer()
         ...     ).exists()
         ... ).orderBy("customer_id").show()
         +-----------+-------------+-------+
@@ -6615,7 +6615,7 @@ def exists(self) -> Column:
         >>> from pyspark.sql import functions as sf
         >>> customers.alias("c").where(
         ...     ~orders.alias("o").where(
-        ...         sf.col("o.customer_id") == sf.outer(sf.col("c.customer_id"))
+        ...         sf.col("o.customer_id") == sf.col("c.customer_id").outer()
         ...     ).exists()
         ... ).orderBy("customer_id").show()
         +-----------+-------------+---------+
@@ -6629,7 +6629,7 @@ def exists(self) -> Column:
         >>> from pyspark.sql import functions as sf
         >>> orders.alias("o").where(
         ...     customers.alias("c").where(
-        ...         (sf.col("c.customer_id") == sf.outer(sf.col("o.customer_id")))
+        ...         (sf.col("c.customer_id") == sf.col("o.customer_id").outer())
         ...         & (sf.col("country") == "USA")
         ...     ).exists()
         ... ).orderBy("order_id").show()

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
@@ -327,32 +327,6 @@ def col(col: str) -> Column:
 column = col
 
 
-@_try_remote_functions
-def outer(colOrExprSQL: Union[Column, str]) -> Column:
-    """
-    Mark a column or an expression as it could contain outer references.
-
-    .. versionadded:: 4.0.0
-
-    See Also
-    --------
-    pyspark.sql.dataframe.DataFrame.scalar
-    pyspark.sql.dataframe.DataFrame.exists
-
-    Parameters
-    ----------
-    colOrExprSQL : :class:`~pyspark.sql.Column` or str
-        Target column or SQL expression to mark as outer.
-    """
-    from pyspark.sql.classic.column import _to_java_column
-
-    if isinstance(colOrExprSQL, str):
-        arg = colOrExprSQL
-    else:
-        arg = _to_java_column(colOrExprSQL)
-    return _invoke_function("outer", arg)
-
-
 @_try_remote_functions
 def asc(col: "ColumnOrName") -> Column:
     """

diff --git a/python/pyspark/sql/tests/test_subquery.py b/python/pyspark/sql/tests/test_subquery.py
@@ -49,10 +49,21 @@ def df2(self):
 
     def test_noop_outer(self):
         assertDataFrameEqual(
-            self.spark.range(1).select(sf.outer(sf.col("id"))),
+            self.spark.range(1).select(sf.col("id").outer()),
             self.spark.range(1).select(sf.col("id")),
         )
 
+        with self.assertRaises(AnalysisException) as pe:
+            self.spark.range(1).select(sf.col("outer_col").outer()).collect()
+
+        self.check_error(
+            exception=pe.exception,
+            errorClass="UNRESOLVED_COLUMN.WITH_SUGGESTION",
+            messageParameters={"objectName": "`outer_col`", "proposal": "`id`"},
+            query_context_type=QueryContextType.DataFrame,
+            fragment="col",
+        )
+
     def test_simple_uncorrelated_scalar_subquery(self):
         assertDataFrameEqual(
             self.spark.range(1).select(self.spark.range(1).select(sf.lit(1)).scalar().alias("b")),
@@ -181,7 +192,7 @@ def test_scalar_subquery_against_local_relations(self):
                     "c1",
                     (
                         self.spark.table("t2")
-                        .where(sf.outer(sf.col("t1.c2")) == sf.col("t2.c2"))
+                        .where(sf.col("t1.c2").outer() == sf.col("t2.c2"))
                         .select(sf.max("c1"))
                         .scalar()
                     ),
@@ -198,10 +209,9 @@ def test_correlated_scalar_subquery(self):
 
             with self.subTest("in where"):
                 for cond in [
-                    sf.outer(sf.col("a")) == sf.col("c"),
-                    sf.outer("a") == sf.col("c"),
-                    sf.outer(sf.col("a") == sf.col("c")),
-                    sf.outer("a = c"),
+                    sf.col("a").outer() == sf.col("c"),
+                    (sf.col("a") == sf.col("c")).outer(),
+                    sf.expr("a = c").outer(),
                 ]:
                     with self.subTest(cond=cond):
                         assertDataFrameEqual(
@@ -219,10 +229,9 @@ def test_correlated_scalar_subquery(self):
                 df2 = self.spark.table("l").alias("t2")
 
                 for cond in [
-                    sf.col("t1.a") == sf.outer(sf.col("t2.a")),
-                    sf.col("t1.a") == sf.outer("t2.a"),
-                    sf.outer(sf.col("t1.a") == sf.col("t2.a")),
-                    sf.outer("t1.a = t2.a"),
+                    sf.col("t1.a") == sf.col("t2.a").outer(),
+                    (sf.col("t1.a") == sf.col("t2.a")).outer(),
+                    sf.expr("t1.a = t2.a").outer(),
                 ]:
                     with self.subTest(cond=cond):
                         assertDataFrameEqual(
@@ -245,7 +254,7 @@ def test_correlated_scalar_subquery(self):
                         "a",
                         (
                             self.spark.table("r")
-                            .where(sf.col("b") == sf.outer(sf.col("a")))
+                            .where(sf.col("b") == sf.col("a").outer())
                             .select(sf.sum("d"))
                             .scalar()
                             .alias("sum_d")
@@ -264,7 +273,7 @@ def test_correlated_scalar_subquery(self):
                     df1.select(
                         "a",
                         (
-                            df2.where(sf.col("t2.a").eqNullSafe(sf.outer(sf.col("t1.a"))))
+                            df2.where(sf.col("t2.a").eqNullSafe(sf.col("t1.a").outer()))
                             .select(sf.sum("b"))
                             .scalar()
                             .alias("sum_b")
@@ -284,7 +293,7 @@ def test_correlated_scalar_subquery(self):
                         "a",
                         (
                             self.spark.table("r")
-                            .where(sf.outer(sf.col("a")) == sf.col("c"))
+                            .where(sf.col("a").outer() == sf.col("c"))
                             .select(sf.sum("d"))
                             .scalar()
                             .alias("sum_d")
@@ -305,7 +314,7 @@ def test_correlated_scalar_subquery(self):
                 with self.assertRaises(SparkRuntimeException) as pe:
                     df1.select(
                         "a",
-                        df2.where(sf.col("t1.a") == sf.outer(sf.col("t2.a"))).select("b").scalar(),
+                        df2.where(sf.col("t1.a") == sf.col("t2.a").outer()).select("b").scalar(),
                     ).collect()
 
                 self.check_error(
@@ -322,7 +331,7 @@ def test_correlated_scalar_subquery(self):
                     df1.select(
                         "a",
                         (
-                            df2.where(sf.col("t2.a") < sf.outer(sf.col("t1.a")))
+                            df2.where(sf.col("t2.a") < sf.col("t1.a").outer())
                             .select(sf.sum("b"))
                             .scalar()
                             .alias("sum_b")
@@ -339,11 +348,8 @@ def test_correlated_scalar_subquery(self):
                     .where(
                         self.spark.table("r")
                         .where(
-                            ((sf.outer(sf.col("a")) == sf.col("c")) & (sf.col("d") == sf.lit(2.0)))
-                            | (
-                                (sf.outer(sf.col("a")) == sf.col("c"))
-                                & (sf.col("d") == sf.lit(1.0))
-                            )
+                            ((sf.col("a").outer() == sf.col("c")) & (sf.col("d") == sf.lit(2.0)))
+                            | ((sf.col("a").outer() == sf.col("c")) & (sf.col("d") == sf.lit(1.0)))
                         )
                         .select(sf.count(sf.lit(1)))
                         .scalar()
@@ -368,9 +374,9 @@ def test_exists_subquery(self):
 
             with self.subTest("EXISTS"):
                 for cond in [
-                    sf.outer(sf.col("a")) == sf.col("c"),
-                    sf.outer("a") == sf.col("c"),
-                    sf.outer("a = c"),
+                    sf.col("a").outer() == sf.col("c"),
+                    (sf.col("a") == sf.col("c")).outer(),
+                    sf.expr("a = c").outer(),
                 ]:
                     with self.subTest(cond=cond):
                         assertDataFrameEqual(
@@ -395,7 +401,7 @@ def test_exists_subquery(self):
             with self.subTest("NOT EXISTS"):
                 assertDataFrameEqual(
                     self.spark.table("l").where(
-                        ~self.spark.table("r").where(sf.outer(sf.col("a")) == sf.col("c")).exists()
+                        ~self.spark.table("r").where(sf.col("a").outer() == sf.col("c")).exists()
                     ),
                     self.spark.sql(
                         """select * from l where not exists (select * from r where l.a = r.c)"""
@@ -407,8 +413,8 @@ def test_exists_subquery(self):
                         ~(
                             self.spark.table("r")
                             .where(
-                                (sf.outer(sf.col("a")) == sf.col("c"))
-                                & (sf.outer(sf.col("b")) < sf.col("d"))
+                                (sf.col("a").outer() == sf.col("c"))
+                                & (sf.col("b").outer() < sf.col("d"))
                             )
                             .exists()
                         )
@@ -424,8 +430,8 @@ def test_exists_subquery(self):
             with self.subTest("EXISTS within OR"):
                 assertDataFrameEqual(
                     self.spark.table("l").where(
-                        self.spark.table("r").where(sf.outer(sf.col("a")) == sf.col("c")).exists()
-                        | self.spark.table("r").where(sf.outer(sf.col("a")) == sf.col("c")).exists()
+                        self.spark.table("r").where(sf.col("a").outer() == sf.col("c")).exists()
+                        | self.spark.table("r").where(sf.col("a").outer() == sf.col("c")).exists()
                     ),
                     self.spark.sql(
                         """
@@ -439,11 +445,11 @@ def test_exists_subquery(self):
                     self.spark.table("l").where(
                         self.spark.table("r")
                         .where(
-                            (sf.outer(sf.col("a")) == sf.col("c"))
-                            & (sf.outer(sf.col("b")) < sf.col("d"))
+                            (sf.col("a").outer() == sf.col("c"))
+                            & (sf.col("b").outer() < sf.col("d"))
                         )
                         .exists()
-                        | self.spark.table("r").where(sf.outer(sf.col("a")) == sf.col("c")).exists()
+                        | self.spark.table("r").where(sf.col("a").outer() == sf.col("c")).exists()
                     ),
                     self.spark.sql(
                         """

diff --git a/sql/api/src/main/scala/org/apache/spark/sql/Column.scala b/sql/api/src/main/scala/org/apache/spark/sql/Column.scala
@@ -1383,8 +1383,8 @@ class Column(val node: ColumnNode) extends Logging {
   def over(): Column = over(Window.spec)
 
   /**
-   * Mark this as an outer Column if its expression may reference columns in the outer plan. This is
-   * used to trigger lazy analysis of Spark Classic DataFrame, so that we can use it to build
+   * Mark this as an outer Column if its expression may reference columns in the outer plan. This
+   * is used to trigger lazy analysis of Spark Classic DataFrame, so that we can use it to build
    * subquery expressions. Spark Connect DataFrame is always lazily analyzed and does not need to
    * use this function.
    *

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -83,8 +83,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
       "product", // Discussed in https://github.com/apache/spark/pull/30745
       "unwrap_udt",
       "timestamp_add",
-      "timestamp_diff",
-      "outer"
+      "timestamp_diff"
     )
 
     // We only consider functions matching this pattern, this excludes symbolic and other