Closed

Changes from 8 commits (24 commits in total)
2c1d5d8
Prototype
itholic Mar 4, 2024
376fc46
Merge branch 'master' of https://github.com/apache/spark into error_c…
itholic Apr 1, 2024
174a929
Merge branch 'master' of https://github.com/apache/spark into error_c…
itholic Apr 2, 2024
8ab1edf
Support query context testing and added UTs
itholic Apr 2, 2024
5906852
Merge branch 'master' of https://github.com/apache/spark into error_c…
itholic Apr 3, 2024
f3a7bd4
resolve comments
itholic Apr 3, 2024
bbaa399
Add JIRA pointer for testing
itholic Apr 3, 2024
b9f54f1
Silence the linter
itholic Apr 3, 2024
c8d98ea
Adjusted comments
itholic Apr 3, 2024
ef7f1df
Merge branch 'master' of https://github.com/apache/spark into error_c…
itholic Apr 4, 2024
cc52aab
Update displayed string and add comment for PySparkCurrentOrigin
itholic Apr 5, 2024
9c323d4
Using queue to ensure multiple call sites can be logged in order and …
itholic Apr 5, 2024
f5ad1c4
remove unnecessary comment
itholic Apr 5, 2024
4f12dc7
Extends Origin and WithOrigin to PySpark context support
itholic Apr 8, 2024
001c71e
Reusing fn for PySpark logging
itholic Apr 9, 2024
daa08cd
Add document for extended PySpark specific logging functions
itholic Apr 9, 2024
92faffe
remove unused code
itholic Apr 9, 2024
2514afb
Merge branch 'master' of https://github.com/apache/spark into error_c…
itholic Apr 9, 2024
672c176
Adress None properly
itholic Apr 9, 2024
1304c2b
Simplifying
itholic Apr 9, 2024
ff4037b
Merge branch 'master' of https://github.com/apache/spark into error_c…
itholic Apr 10, 2024
1d8df34
Respect spark.sql.stackTracesInDataFrameContext
itholic Apr 10, 2024
95f7848
Add captureStackTrace to remove duplication
itholic Apr 10, 2024
1dd53ed
pysparkLoggingInfo -> pysparkErrorContext
itholic Apr 10, 2024
8 changes: 8 additions & 0 deletions python/pyspark/errors/exceptions/captured.py
@@ -379,5 +379,13 @@ def fragment(self) -> str:
def callSite(self) -> str:
return str(self._q.callSite())

def pysparkFragment(self) -> Optional[str]: # type: ignore[return]
if self.contextType() == QueryContextType.DataFrame:
return str(self._q.pysparkFragment())

def pysparkCallSite(self) -> Optional[str]: # type: ignore[return]
if self.contextType() == QueryContextType.DataFrame:
return str(self._q.pysparkCallSite())

def summary(self) -> str:
return str(self._q.summary())
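
The two accessors added above expose the PySpark-side fragment and call site of a DataFrame query context. Below is a minimal sketch, not part of this diff, of how a caller might read them from a captured exception; it assumes a local SparkSession with ANSI mode enabled so the division raises DIVIDE_BY_ZERO.

from pyspark.sql import SparkSession
from pyspark.errors import ArithmeticException, QueryContextType

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.ansi.enabled", True)
df = spark.range(10)

try:
    df.withColumn("div_zero", df.id / 0).collect()
except ArithmeticException as e:
    for ctx in e.getQueryContext():
        if ctx.contextType() == QueryContextType.DataFrame:
            # New in this change: where in user code the failing expression was built.
            print(ctx.pysparkFragment(), ctx.pysparkCallSite())
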
65 changes: 64 additions & 1 deletion python/pyspark/errors/utils.py
@@ -16,11 +16,16 @@
#

import re
from typing import Dict, Match
import functools
import inspect
from typing import Any, Callable, Dict, Match, TypeVar, Type

from pyspark.errors.error_classes import ERROR_CLASSES_MAP


T = TypeVar("T")


class ErrorClassesReader:
"""
A reader to load error information from error_classes.py.
@@ -119,3 +124,61 @@ def get_message_template(self, error_class: str) -> str:
message_template = main_message_template + " " + sub_message_template

return message_template


def _capture_call_site(fragment: str) -> None:
"""
Capture the call site information including file name, line number, and function name.

This function updates the server-side thread-local storage (PySparkCurrentOrigin)
with the current call site information whenever a PySpark API function is called.

Parameters
----------
fragment : str
The name of the PySpark API function being captured; it is recorded as the error-context fragment.

Notes
-----
The call site information is used to enhance error messages with the exact location
in the user code that led to the error.
"""
from pyspark.sql.session import SparkSession

spark = SparkSession._getActiveSessionOrCreate()
assert spark._jvm is not None

stack = inspect.stack()
frame_info = stack[-1]
filename = frame_info.filename
lineno = frame_info.lineno
call_site = f"{filename}:{lineno}"

pyspark_origin = spark._jvm.org.apache.spark.sql.catalyst.trees.PySparkCurrentOrigin
pyspark_origin.set(fragment, call_site)


def with_origin(func: Callable[..., Any]) -> Callable[..., Any]:
"""
A decorator to capture and provide the call site information to the server side
when PySpark API functions are invoked.
"""

@functools.wraps(func)
def wrapper(*args: Any, **kwargs: Any) -> Any:
# Update call site when the function is called
_capture_call_site(func.__name__)

return func(*args, **kwargs)

return wrapper


def with_origin_to_class(cls: Type[T]) -> Type[T]:
"""
Decorate all methods of a class with `with_origin` to capture call site information.
"""
for name, method in cls.__dict__.items():
if callable(method) and name != "__init__":
setattr(cls, name, with_origin(method))
return cls
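
Taken together, _capture_call_site, with_origin, and with_origin_to_class record the user-code location each time a decorated PySpark API is invoked and hand it to the JVM side. The following is a standalone, Spark-free sketch of the same decorator pattern, for illustration only; demo_origin is a hypothetical stand-in for the JVM-side PySparkCurrentOrigin used above.

import functools
import inspect
from typing import Any, Callable, Dict

demo_origin: Dict[str, str] = {}  # fragment -> call site, mimicking PySparkCurrentOrigin

def with_origin_demo(func: Callable[..., Any]) -> Callable[..., Any]:
    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        # stack()[-1] is the outermost frame, i.e. the user's top-level call site.
        frame = inspect.stack()[-1]
        demo_origin[func.__name__] = f"{frame.filename}:{frame.lineno}"
        return func(*args, **kwargs)
    return wrapper

@with_origin_demo
def divide(a: int, b: int) -> float:
    return a / b

divide(10, 2)
print(demo_origin)  # e.g. {'divide': '/path/to/script.py:21'}
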
3 changes: 3 additions & 0 deletions python/pyspark/sql/column.py
@@ -35,6 +35,7 @@

from pyspark.context import SparkContext
from pyspark.errors import PySparkAttributeError, PySparkTypeError, PySparkValueError
from pyspark.errors.utils import with_origin_to_class
from pyspark.sql.types import DataType
from pyspark.sql.utils import get_active_spark_context

@@ -177,6 +178,7 @@ def _(
return Column(njc)

_.__doc__ = doc
_.__name__ = name
return _


@@ -195,6 +197,7 @@ def _(self: "Column", other: Union["LiteralType", "DecimalLiteral"]) -> "Column"
return _


@with_origin_to_class
class Column:

"""
4 changes: 4 additions & 0 deletions python/pyspark/sql/tests/connect/test_parity_dataframe.py
@@ -30,6 +30,10 @@ def test_help_command(self):
def test_toDF_with_schema_string(self):
super().test_toDF_with_schema_string()

@unittest.skip("Spark Connect does not support DataFrameQueryContext currently.")
def test_dataframe_error_context(self):
super().test_dataframe_error_context()


if __name__ == "__main__":
import unittest
169 changes: 169 additions & 0 deletions python/pyspark/sql/tests/test_dataframe.py
@@ -37,6 +37,9 @@
AnalysisException,
IllegalArgumentException,
PySparkTypeError,
ArithmeticException,
QueryContextType,
NumberFormatException,
)
from pyspark.testing.sqlutils import (
ReusedSQLTestCase,
@@ -825,6 +828,172 @@ def test_duplicate_field_names(self):
self.assertEqual(df.schema, schema)
self.assertEqual(df.collect(), data)

def test_dataframe_error_context(self):
# SPARK-47274: Add more useful contexts for PySpark DataFrame API errors.
with self.sql_conf({"spark.sql.ansi.enabled": True}):
df = self.spark.range(10)

# DataFrameQueryContext with pysparkCallSite - divide
with self.assertRaises(ArithmeticException) as pe:
df.withColumn("div_zero", df.id / 0).collect()
self.check_error(
exception=pe.exception,
error_class="DIVIDE_BY_ZERO",
message_parameters={"config": '"spark.sql.ansi.enabled"'},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="divide",
)

# DataFrameQueryContext with pysparkCallSite - plus
with self.assertRaises(NumberFormatException) as pe:
df.withColumn("plus_invalid_type", df.id + "string").collect()
self.check_error(
exception=pe.exception,
error_class="CAST_INVALID_INPUT",
message_parameters={
"expression": "'string'",
"sourceType": '"STRING"',
"targetType": '"BIGINT"',
"ansiConfig": '"spark.sql.ansi.enabled"',
},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="plus",
)

# DataFrameQueryContext with pysparkCallSite - minus
with self.assertRaises(NumberFormatException) as pe:
df.withColumn("minus_invalid_type", df.id - "string").collect()
self.check_error(
exception=pe.exception,
error_class="CAST_INVALID_INPUT",
message_parameters={
"expression": "'string'",
"sourceType": '"STRING"',
"targetType": '"BIGINT"',
"ansiConfig": '"spark.sql.ansi.enabled"',
},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="minus",
)

# DataFrameQueryContext with pysparkCallSite - multiply
with self.assertRaises(NumberFormatException) as pe:
df.withColumn("multiply_invalid_type", df.id * "string").collect()
self.check_error(
exception=pe.exception,
error_class="CAST_INVALID_INPUT",
message_parameters={
"expression": "'string'",
"sourceType": '"STRING"',
"targetType": '"BIGINT"',
"ansiConfig": '"spark.sql.ansi.enabled"',
},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="multiply",
)

# DataFrameQueryContext with pysparkCallSite - chained (`divide` is problematic)
with self.assertRaises(ArithmeticException) as pe:
df.withColumn("multiply_ten", df.id * 10).withColumn(
"divide_zero", df.id / 0
).withColumn("plus_ten", df.id + 10).withColumn("minus_ten", df.id - 10).collect()
self.check_error(
exception=pe.exception,
error_class="DIVIDE_BY_ZERO",
message_parameters={"config": '"spark.sql.ansi.enabled"'},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="divide",
)

# DataFrameQueryContext with pysparkCallSite - chained (`plus` is problematic)
with self.assertRaises(NumberFormatException) as pe:
df.withColumn("multiply_ten", df.id * 10).withColumn(
"divide_ten", df.id / 10
).withColumn("plus_string", df.id + "string").withColumn(
"minus_ten", df.id - 10
).collect()
self.check_error(
exception=pe.exception,
error_class="CAST_INVALID_INPUT",
message_parameters={
"expression": "'string'",
"sourceType": '"STRING"',
"targetType": '"BIGINT"',
"ansiConfig": '"spark.sql.ansi.enabled"',
},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="plus",
)

# DataFrameQueryContext with pysparkCallSite - chained (`minus` is problematic)
with self.assertRaises(NumberFormatException) as pe:
df.withColumn("multiply_ten", df.id * 10).withColumn(
"divide_ten", df.id / 10
).withColumn("plus_ten", df.id + 10).withColumn(
"minus_string", df.id - "string"
).collect()
self.check_error(
exception=pe.exception,
error_class="CAST_INVALID_INPUT",
message_parameters={
"expression": "'string'",
"sourceType": '"STRING"',
"targetType": '"BIGINT"',
"ansiConfig": '"spark.sql.ansi.enabled"',
},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="minus",
)

# DataFrameQueryContext with pysparkCallSite - chained (`multiply` is problematic)
with self.assertRaises(NumberFormatException) as pe:
df.withColumn("multiply_string", df.id * "string").withColumn(
"divide_ten", df.id / 10
).withColumn("plus_ten", df.id + 10).withColumn("minus_ten", df.id - 10).collect()
self.check_error(
exception=pe.exception,
error_class="CAST_INVALID_INPUT",
message_parameters={
"expression": "'string'",
"sourceType": '"STRING"',
"targetType": '"BIGINT"',
"ansiConfig": '"spark.sql.ansi.enabled"',
},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="multiply",
)

# DataFrameQueryContext without pysparkCallSite
with self.assertRaises(AnalysisException) as pe:
df.select("non-existing-column")
self.check_error(
exception=pe.exception,
error_class="UNRESOLVED_COLUMN.WITH_SUGGESTION",
message_parameters={"objectName": "`non-existing-column`", "proposal": "`id`"},
query_context_type=QueryContextType.DataFrame,
pyspark_fragment="",
)

# SQLQueryContext
with self.assertRaises(ArithmeticException) as pe:
self.spark.sql("select 10/0").collect()
self.check_error(
exception=pe.exception,
error_class="DIVIDE_BY_ZERO",
message_parameters={"config": '"spark.sql.ansi.enabled"'},
query_context_type=QueryContextType.SQL,
)

# No QueryContext
with self.assertRaises(AnalysisException) as pe:
self.spark.sql("select * from non-existing-table")
self.check_error(
exception=pe.exception,
error_class="INVALID_IDENTIFIER",
message_parameters={"ident": "non-existing-table"},
query_context_type=None,
itholic (Contributor, Author) commented on Apr 3, 2024:

FYI: None is the default, so it does not need to be passed when no QueryContext exists, but it is spelled out in this test as an explicit example.

)


class DataFrameTests(DataFrameTestsMixin, ReusedSQLTestCase):
pass
30 changes: 30 additions & 0 deletions python/pyspark/testing/utils.py
@@ -54,6 +54,8 @@

from pyspark import SparkContext, SparkConf
from pyspark.errors import PySparkAssertionError, PySparkException
from pyspark.errors.exceptions.captured import CapturedException
from pyspark.errors.exceptions.base import QueryContextType
from pyspark.find_spark_home import _find_spark_home
from pyspark.sql.dataframe import DataFrame
from pyspark.sql import Row
@@ -280,7 +282,14 @@ def check_error(
exception: PySparkException,
error_class: str,
message_parameters: Optional[Dict[str, str]] = None,
query_context_type: Optional[QueryContextType] = None,
pyspark_fragment: Optional[str] = None,
):
query_context = exception.getQueryContext()
assert bool(query_context) == (query_context_type is not None), (
"`query_context_type` is required when QueryContext exists. "
f"QueryContext: {query_context}."
)
# Test if given error is an instance of PySparkException.
self.assertIsInstance(
exception,
@@ -302,6 +311,27 @@
expected, actual, f"Expected message parameters was '{expected}', got '{actual}'"
)

# Test query context
if query_context:
expected = query_context_type
actual_contexts = exception.getQueryContext()
for actual_context in actual_contexts:
actual = actual_context.contextType()
self.assertEqual(
expected, actual, f"Expected QueryContext was '{expected}', got '{actual}'"
)
if actual == QueryContextType.DataFrame:
assert (
pyspark_fragment is not None
), "`pyspark_fragment` is required when QueryContextType is DataFrame."
expected = pyspark_fragment
actual = actual_context.pysparkFragment()
self.assertEqual(
expected,
actual,
f"Expected PySpark fragment was '{expected}', got '{actual}'",
)


def assertSchemaEqual(
actual: StructType,