@@ -70,6 +70,7 @@ private[connect] object MetricGenerator extends AdaptiveSparkPlanHelper
.newBuilder()
.setName(p.nodeName)
.setPlanId(p.id)
.setParent(parentId)
.putAllExecutionMetrics(mv.asJava)
.build()
Seq(mo) ++ transformChildren(p)
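
The newly propagated parent id is what lets a client rebuild the plan hierarchy from the otherwise flat metric list. A minimal client-side sketch of that reconstruction, assuming the PlanMetrics class this PR moves into pyspark.sql.metrics (it exposes name, plan_id, parent_plan_id, and metrics, as visible in the core.py diff below) and a caller-supplied root parent id:

from collections import defaultdict
from typing import Dict, List

from pyspark.sql.metrics import PlanMetrics  # module introduced by this PR


def print_plan_tree(plans: List[PlanMetrics], root_parent_id: int) -> None:
    # Group nodes by the parent id that setParent(parentId) now populates.
    children: Dict[int, List[PlanMetrics]] = defaultdict(list)
    for p in plans:
        children[p.parent_plan_id].append(p)

    seen = set()  # guards against a root node that reports itself as its own parent

    def walk(parent_id: int, indent: int) -> None:
        for node in children[parent_id]:
            if node.plan_id in seen:
                continue
            seen.add(node.plan_id)
            rendered = ", ".join(f"{m.name}={m.value}" for m in node.metrics)
            print("  " * indent + f"{node.name} (id={node.plan_id}) [{rendered}]")
            walk(node.plan_id, indent + 1)

    walk(root_parent_id, 0)
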
3 changes: 3 additions & 0 deletions dev/requirements.txt
@@ -60,6 +60,9 @@ mypy-protobuf==3.3.0
googleapis-common-protos-stubs==2.2.0
grpc-stubs==1.24.11

# Debug for Spark and Spark Connect
graphviz==0.20.3

# TorchDistributor dependencies
torch
torchvision
1 change: 1 addition & 0 deletions dev/sparktestsupport/modules.py
@@ -1062,6 +1062,7 @@ def __hash__(self):
"pyspark.sql.tests.connect.test_parity_pandas_udf_window",
"pyspark.sql.tests.connect.test_resources",
"pyspark.sql.tests.connect.shell.test_progress",
"pyspark.sql.tests.connect.test_df_debug",
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
5 changes: 5 additions & 0 deletions python/pyspark/errors/error-conditions.json
@@ -149,6 +149,11 @@
"Cannot <condition1> without <condition2>."
]
},
"CLASSIC_OPERATION_NOT_SUPPORTED_ON_DF": {
"message": [
"Calling property or member <member> is not supported in PySpark Classic, please use Spark Connect instead."
]
},
"COLLATION_INVALID_PROVIDER" : {
"message" : [
"The value <provider> does not represent a correct collation provider. Supported providers are: [<supportedProviders>]."
8 changes: 8 additions & 0 deletions python/pyspark/sql/classic/dataframe.py
@@ -94,6 +94,7 @@
from pyspark.sql.session import SparkSession
from pyspark.sql.group import GroupedData
from pyspark.sql.observation import Observation
from pyspark.sql.metrics import QueryExecution


class DataFrame(ParentDataFrame, PandasMapOpsMixin, PandasConversionMixin):
@@ -1835,6 +1836,13 @@ def toArrow(self) -> "pa.Table":
def toPandas(self) -> "PandasDataFrameLike":
return PandasConversionMixin.toPandas(self)

@property
def queryExecution(self) -> Optional["QueryExecution"]:
raise PySparkValueError(
error_class="CLASSIC_OPERATION_NOT_SUPPORTED_ON_DF",
message_parameters={"member": "queryExecution"},
)


def _to_scala_map(sc: "SparkContext", jm: Dict) -> "JavaObject":
"""
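
On the Classic path the property only raises. A minimal sketch of what that looks like from user code, assuming a local Classic session; the error class is the CLASSIC_OPERATION_NOT_SUPPORTED_ON_DF entry added to error-conditions.json above:

from pyspark.errors import PySparkValueError
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.range(5)

try:
    _ = df.queryExecution  # not supported on a Classic DataFrame
except PySparkValueError as e:
    # Expected to carry CLASSIC_OPERATION_NOT_SUPPORTED_ON_DF and point users to Spark Connect.
    print(e)
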
68 changes: 15 additions & 53 deletions python/pyspark/sql/connect/client/core.py
@@ -61,6 +61,7 @@
from pyspark.loose_version import LooseVersion
from pyspark.version import __version__
from pyspark.resource.information import ResourceInformation
from pyspark.sql.metrics import MetricValue, PlanMetrics, QueryExecution, ObservedMetrics
from pyspark.sql.connect.client.artifact import ArtifactManager
from pyspark.sql.connect.client.logging import logger
from pyspark.sql.connect.profiler import ConnectProfilerCollector
@@ -447,56 +448,7 @@ def toChannel(self) -> grpc.Channel:
return self._secure_channel(self.endpoint, creds)


class MetricValue:
def __init__(self, name: str, value: Union[int, float], type: str):
self._name = name
self._type = type
self._value = value

def __repr__(self) -> str:
return f"<{self._name}={self._value} ({self._type})>"

@property
def name(self) -> str:
return self._name

@property
def value(self) -> Union[int, float]:
return self._value

@property
def metric_type(self) -> str:
return self._type


class PlanMetrics:
def __init__(self, name: str, id: int, parent: int, metrics: List[MetricValue]):
self._name = name
self._id = id
self._parent_id = parent
self._metrics = metrics

def __repr__(self) -> str:
return f"Plan({self._name})={self._metrics}"

@property
def name(self) -> str:
return self._name

@property
def plan_id(self) -> int:
return self._id

@property
def parent_plan_id(self) -> int:
return self._parent_id

@property
def metrics(self) -> List[MetricValue]:
return self._metrics


class PlanObservedMetrics:
class PlanObservedMetrics(ObservedMetrics):
def __init__(self, name: str, metrics: List[pb2.Expression.Literal], keys: List[str]):
self._name = name
self._metrics = metrics
@@ -513,6 +465,13 @@ def name(self) -> str:
def metrics(self) -> List[pb2.Expression.Literal]:
return self._metrics

@property
def pairs(self) -> dict[str, Any]:
result = {}
for x in range(len(self._metrics)):
result[self.keys[x]] = LiteralExpression._to_value(self.metrics[x])
return result

@property
def keys(self) -> List[str]:
return self._keys
@@ -920,16 +879,19 @@ def to_table_as_iterator(

def to_table(
self, plan: pb2.Plan, observations: Dict[str, Observation]
) -> Tuple["pa.Table", Optional[StructType]]:
) -> Tuple["pa.Table", Optional[StructType], QueryExecution]:
"""
Return given plan as a PyArrow Table.
"""
logger.info(f"Executing plan {self._proto_to_string(plan)}")
req = self._execute_plan_request_with_metadata()
req.plan.CopyFrom(plan)
table, schema, _, _, _ = self._execute_and_fetch(req, observations)
table, schema, metrics, observed_metrics, _ = self._execute_and_fetch(req, observations)

# Create a query execution object.
qe = QueryExecution(metrics, observed_metrics)
assert table is not None
return table, schema
return table, schema, qe

def to_pandas(self, plan: pb2.Plan, observations: Dict[str, Observation]) -> "pd.DataFrame":
"""
19 changes: 18 additions & 1 deletion python/pyspark/sql/connect/dataframe.py
@@ -101,6 +101,7 @@
from pyspark.sql.connect.observation import Observation
from pyspark.sql.connect.session import SparkSession
from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
from pyspark.sql.metrics import QueryExecution


class DataFrame(ParentDataFrame):
@@ -137,6 +138,7 @@ def __init__(
# by __repr__ and _repr_html_ while eager evaluation opens.
self._support_repr_html = False
self._cached_schema: Optional[StructType] = None
self._query_execution: Optional["QueryExecution"] = None

def __reduce__(self) -> Tuple:
"""
@@ -1836,7 +1838,9 @@ def collect(self) -> List[Row]:

def _to_table(self) -> Tuple["pa.Table", Optional[StructType]]:
query = self._plan.to_proto(self._session.client)
table, schema = self._session.client.to_table(query, self._plan.observations)
table, schema, self._query_execution = self._session.client.to_table(
query, self._plan.observations
)
assert table is not None
return (table, schema)

@@ -2202,6 +2206,19 @@ def rdd(self) -> "RDD[Row]":
message_parameters={"feature": "rdd"},
)

@property
def queryExecution(self) -> Optional["QueryExecution"]:
"""
The queryExecution property allows introspecting information about the actual
query execution after it has completed successfully. Accessing this member
before the query has been executed will return None.

Returns
-------
An instance of QueryExecution, or None if the query has not been executed yet.
"""
return self._query_execution


class DataFrameNaFunctions(ParentDataFrameNaFunctions):
def __init__(self, df: ParentDataFrame):
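
With the client now returning a QueryExecution from to_table, the property becomes usable after any action. A minimal usage sketch, assuming a reachable Spark Connect endpoint (the address is hypothetical) and only what this diff shows about QueryExecution:

from pyspark.sql import SparkSession

# Hypothetical Spark Connect endpoint; adjust to the actual server address.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

df = spark.range(100).selectExpr("id % 10 AS bucket").groupBy("bucket").count()

# No action has run yet, so the property is still None.
assert df.queryExecution is None

df.collect()  # triggers execution; _to_table() stores the QueryExecution

qe = df.queryExecution
if qe is not None:
    # What QueryExecution exposes is defined in pyspark.sql.metrics (not shown in this
    # excerpt); it is constructed from the per-plan and observed metrics returned above.
    print(qe)
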
5 changes: 5 additions & 0 deletions python/pyspark/sql/dataframe.py
@@ -64,6 +64,7 @@
ArrowMapIterFunction,
DataFrameLike as PandasDataFrameLike,
)
from pyspark.sql.metrics import QueryExecution


__all__ = ["DataFrame", "DataFrameNaFunctions", "DataFrameStatFunctions"]
Expand Down Expand Up @@ -6281,6 +6282,10 @@ def toPandas(self) -> "PandasDataFrameLike":
"""
...

@property
def queryExecution(self) -> Optional["QueryExecution"]:
Member:

Should probably have a docstring here, with the added version.

Member:

And I wouldn't make it Optional.

Contributor Author:

I added the version. For the optional part:

  • In Scala, QueryExecution is always present, but then you have to check executedPlan. The thing I'm worried about is bringing that complexity to the client; the QueryExecution object allows too much direct manipulation of the query, which is not ideal.

Member:

IC, so it has to be set after execution. Should we have a dedicated Spark Connect API instead? I think it'd be confusing if it has the same name as the Scala-side df.queryExecution.

...


class DataFrameNaFunctions:
"""Functionality for working with missing data in :class:`DataFrame`.