[SPARK-48755][SS][PYTHON] transformWithState pyspark base implementation and ValueState support #47133
Changes to the pandas group operations module, adding the transformWithStateInPandas API to grouped data:
@@ -15,14 +15,17 @@
 # limitations under the License.
 #
 import sys
-from typing import List, Union, TYPE_CHECKING, cast
+from typing import Any, Iterator, List, Union, TYPE_CHECKING, cast
 import warnings

 from pyspark.errors import PySparkTypeError
 from pyspark.util import PythonEvalType
 from pyspark.sql.column import Column
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.functions.builtin import udf
 from pyspark.sql.streaming.state import GroupStateTimeout
+from pyspark.sql.streaming.state_api_client import StateApiClient, StatefulProcessorHandleState
+from pyspark.sql.streaming.stateful_processor import StatefulProcessor, StatefulProcessorHandle
+from pyspark.sql.types import StructType, _parse_datatype_string

 if TYPE_CHECKING:
@@ -33,6 +36,7 @@
         PandasCogroupedMapFunction,
         ArrowGroupedMapFunction,
         ArrowCogroupedMapFunction,
+        DataFrameLike as PandasDataFrameLike
     )
     from pyspark.sql.group import GroupedData
@@ -358,6 +362,120 @@ def applyInPandasWithState(
        )
        return DataFrame(jdf, self.session)

    def transformWithStateInPandas(self,
                                   stateful_processor: StatefulProcessor,
                                   outputStructType: Union[StructType, str],
                                   outputMode: str,
                                   timeMode: str) -> DataFrame:
| """ | ||
| Invokes methods defined in the stateful processor used in arbitrary state API v2. | ||
bogao007 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| We allow the user to act on per-group set of input rows along with keyed state and the | ||
| user can choose to output/return 0 or more rows. | ||
|
|
||
| For a streaming dataframe, we will repeatedly invoke the interface methods for new rows | ||
| in each trigger and the user's state/state variables will be stored persistently across | ||
| invocations. | ||
|
|
||
| The `stateful_processor` should be a Python class that implements the interface defined in | ||
| pyspark.sql.streaming.stateful_processor.StatefulProcessor. | ||
|
|
||
| The `outputStructType` should be a :class:`StructType` describing the schema of all | ||
| elements in the returned value, `pandas.DataFrame`. The column labels of all elements in | ||
| returned `pandas.DataFrame` must either match the field names in the defined schema if | ||
| specified as strings, or match the field data types by position if not strings, | ||
| e.g. integer indices. | ||
|
|
||
| The size of each `pandas.DataFrame` in both the input and output can be arbitrary. The | ||
| number of `pandas.DataFrame` in both the input and output can also be arbitrary. | ||
bogao007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| .. versionadded:: 4.0.0 | ||
|
|
||
| Parameters | ||
| ---------- | ||
| stateful_processor : :class:`pyspark.sql.streaming.stateful_processor.StatefulProcessor` | ||
| Instance of StatefulProcessor whose functions will be invoked by the operator. | ||
| outputStructType : :class:`pyspark.sql.types.DataType` or str | ||
| The type of the output records. The value can be either a | ||
| :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. | ||
| outputMode : str | ||
| The output mode of the stateful processor. | ||
| timeMode : str | ||
| The time mode semantics of the stateful processor for timers and TTL. | ||
|
|
||
HyukjinKwon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Examples | ||
| -------- | ||
| >>> import pandas as pd | ||
| >>> from pyspark.sql.streaming import StatefulProcessor, StatefulProcessorHandle | ||
| >>> from pyspark.sql.types import StructType, StructField, LongType, StringType | ||
| >>> from typing import Iterator | ||
| >>> output_schema = StructType([ | ||
| ... StructField("value", LongType(), True) | ||
| ... ]) | ||
| >>> state_schema = StructType([ | ||
bogao007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ... StructField("value", StringType(), True) | ||
| ... ]) | ||
| >>> class SimpleStatefulProcessor(StatefulProcessor): | ||
| ... def init(self, handle: StatefulProcessorHandle) -> None: | ||
| ... self.value_state = handle.getValueState("testValueState", state_schema) | ||
| ... def handleInputRows(self, key, rows) -> Iterator[pd.DataFrame]: | ||
| ... self.value_state.update("test_value") | ||
| ... exists = self.value_state.exists() | ||
| ... value = self.value_state.get() | ||
| ... self.value_state.clear() | ||
| ... return rows | ||
| ... def close(self) -> None: | ||
| ... pass | ||
|
||
| ... | ||
| >>> df.groupBy("value").transformWithStateInPandas(stateful_processor = | ||
| ... SimpleStatefulProcessor(), outputStructType=output_schema, outputMode="Update", | ||
| ... timeMode="None") # doctest: +SKIP | ||
|
|
||
| Notes | ||
| ----- | ||
| This function requires a full shuffle. | ||
|
|
||
| This API is experimental. | ||
| """ | ||
|
|
||
        from pyspark.sql import GroupedData
        from pyspark.sql.functions import pandas_udf

        assert isinstance(self, GroupedData)

        def transformWithStateUDF(state_api_client: StateApiClient, key: Any,
                                  inputRows: Iterator["PandasDataFrameLike"]
                                  ) -> Iterator["PandasDataFrameLike"]:
            handle = StatefulProcessorHandle(state_api_client)

            if state_api_client.handle_state == StatefulProcessorHandleState.CREATED:
                stateful_processor.init(handle)
                state_api_client.set_handle_state(StatefulProcessorHandleState.INITIALIZED)

            state_api_client.set_implicit_key(str(key[0]))
            result = stateful_processor.handleInputRows(key, inputRows)
            state_api_client.remove_implicit_key()

            return result

        if isinstance(outputStructType, str):
            outputStructType = cast(StructType, _parse_datatype_string(outputStructType))

        udf = pandas_udf(
            transformWithStateUDF,  # type: ignore[call-overload]
            returnType=outputStructType,
            functionType=PythonEvalType.SQL_TRANSFORM_WITH_STATE,
        )
        df = self._df
        udf_column = udf(*[df[col] for col in df.columns])

        jdf = self._jgd.transformWithStateInPandas(
            udf_column._jc.expr(),
            self.session._jsparkSession.parseDataType(outputStructType.json()),
            outputMode,
            timeMode,
        )
        return DataFrame(jdf, self.session)

    def applyInArrow(
        self, func: "ArrowGroupedMapFunction", schema: Union[StructType, str]
    ) -> "DataFrame":
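To make the end-to-end flow of the new API concrete, here is a minimal usage sketch of transformWithStateInPandas in a streaming query. It is illustrative only: the socket source, the column names, the RunningCountProcessor class, and the assumption that ValueState.get() returns the value previously passed to update() (as in the docstring example above) are not part of this diff.

# Minimal, hedged usage sketch; RunningCountProcessor and the socket source are assumptions.
import pandas as pd
from typing import Iterator
from pyspark.sql import SparkSession
from pyspark.sql.streaming import StatefulProcessor, StatefulProcessorHandle
from pyspark.sql.types import StructType, StructField, StringType, LongType

output_schema = StructType([
    StructField("key", StringType(), True),
    StructField("count", LongType(), True),
])
count_schema = StructType([StructField("count", StringType(), True)])

class RunningCountProcessor(StatefulProcessor):
    """Keeps a running count per grouping key in a ValueState variable."""

    def init(self, handle: StatefulProcessorHandle) -> None:
        self.count_state = handle.getValueState("countState", count_schema)

    def handleInputRows(self, key, rows) -> Iterator[pd.DataFrame]:
        # Assumption: get() returns the string previously stored with update().
        previous = self.count_state.get() if self.count_state.exists() else "0"
        count = int(previous) + sum(len(batch) for batch in rows)
        self.count_state.update(str(count))
        yield pd.DataFrame({"key": [key[0]], "count": [count]})

    def close(self) -> None:
        pass

spark = SparkSession.builder.getOrCreate()
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).load()
query = (
    lines.groupBy("value")
    .transformWithStateInPandas(
        stateful_processor=RunningCountProcessor(),
        outputStructType=output_schema,
        outputMode="Update",
        timeMode="None",
    )
    .writeStream.outputMode("update")
    .format("console")
    .start()
)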
Changes to the pandas serializers module (serializers for PyArrow and pandas conversions):
@@ -19,9 +19,14 @@
 Serializers for PyArrow and pandas conversions. See `pyspark.serializers` for more details.
 """

+from enum import Enum
+from itertools import groupby
+import os
+import socket
+from typing import Any
 from pyspark.errors import PySparkRuntimeError, PySparkTypeError, PySparkValueError
 from pyspark.loose_version import LooseVersion
-from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer, CPickleSerializer
+from pyspark.serializers import Serializer, read_int, write_int, UTF8Deserializer, CPickleSerializer, write_with_length
 from pyspark.sql.pandas.types import (
     from_arrow_type,
     to_arrow_type,
@@ -1116,3 +1121,71 @@ def init_stream_yield_batches(batches):
        batches_to_write = init_stream_yield_batches(serialize_batches())

        return ArrowStreamSerializer.dump_stream(self, batches_to_write, stream)

class TransformWithStateInPandasSerializer(ArrowStreamPandasUDFSerializer):
    """
    Serializer used by the Python worker to evaluate the UDF for transformWithStateInPandas.

    Parameters
    ----------
    timezone : str
        A timezone to respect when handling timestamp values
    safecheck : bool
        If True, conversion from Arrow to Pandas checks for overflow/truncation
    assign_cols_by_name : bool
        If True, then Pandas DataFrames will get columns by name
    arrow_max_records_per_batch : int
        Limit of the number of records that can be written to a single ArrowRecordBatch in memory.
    """

    def __init__(
            self,
            timezone,
            safecheck,
            assign_cols_by_name,
            arrow_max_records_per_batch):
        super(
            TransformWithStateInPandasSerializer,
            self
        ).__init__(timezone, safecheck, assign_cols_by_name)
        self.arrow_max_records_per_batch = arrow_max_records_per_batch
        self.key_offsets = None

    # Nothing special here, we need to create the handle and read
    # data in groups.
    def load_stream(self, stream):
        """
        Read ArrowRecordBatches from stream, deserialize them to populate a list of pairs
        (grouping key, data chunk), and convert the data into a list of pandas.Series.

        Please refer to the doc of the inner function `generate_data_batches` for more details
        on how this function works overall.

        In addition, this function further groups the return of `generate_data_batches` by the
        grouping key and produces an iterator of data chunks for each group, so that the caller
        can lazily materialize the data chunks.
        """
        import pyarrow as pa

        def generate_data_batches(batches):
            for batch in batches:
                data_pandas = [
                    self.arrow_to_pandas(c)
                    for c in pa.Table.from_batches([batch]).itercolumns()
                ]
                key_series = [data_pandas[o] for o in self.key_offsets]
                batch_key = tuple(s[0] for s in key_series)
                yield (batch_key, data_pandas)

        _batches = super(ArrowStreamPandasSerializer, self).load_stream(stream)
        data_batches = generate_data_batches(_batches)

        for k, g in groupby(data_batches, key=lambda x: x[0]):
            yield (k, g)

    def dump_stream(self, iterator, stream):
        """
        Read through an iterator of (iterator of pandas DataFrame), serialize them to Arrow
        RecordBatches, and write batches to stream.
        """
        result = [(b, t) for x in iterator for y, t in x for b in y]
        super().dump_stream(result, stream)
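The lazy grouping in load_stream relies on itertools.groupby seeing all chunks for one grouping key arrive contiguously from the stream. A small standalone sketch of that pattern, with plain tuples standing in for the deserialized Arrow batches, shows the shape of the yielded (key, group-iterator) pairs.

# Standalone sketch of the grouping pattern used by load_stream above:
# consecutive (key, chunk) pairs are grouped lazily, one grouping key at a time.
from itertools import groupby

def generate_data_batches():
    # Stand-ins for deserialized batches; chunks for a given key arrive contiguously.
    yield (("a",), [1, 2])
    yield (("a",), [3])
    yield (("b",), [4, 5])

for key, group in groupby(generate_data_batches(), key=lambda x: x[0]):
    # Each group is itself a lazy iterator; chunks materialize only when consumed.
    chunks = [chunk for _, chunk in group]
    print(key, chunks)  # ('a',) [[1, 2], [3]]  then  ('b',) [[4, 5]]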
New protobuf message definitions used by the state API client:

@@ -0,0 +1,86 @@
| syntax = "proto3"; | ||
bogao007 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| package pyspark.sql.streaming; | ||
|
|
||
| message StateRequest { | ||
|
||
| int32 version = 1; | ||
| oneof method { | ||
| StatefulProcessorCall statefulProcessorCall = 2; | ||
| StateVariableRequest stateVariableRequest = 3; | ||
| ImplicitGroupingKeyRequest implicitGroupingKeyRequest = 4; | ||
| } | ||
| } | ||
|
|
||
| message StateResponse { | ||
| int32 statusCode = 1; | ||
| string errorMessage = 2; | ||
| } | ||
|
|
||
| message StatefulProcessorCall { | ||
| oneof method { | ||
| SetHandleState setHandleState = 1; | ||
| StateCallCommand getValueState = 2; | ||
| StateCallCommand getListState = 3; | ||
| StateCallCommand getMapState = 4; | ||
| } | ||
| } | ||
|
|
||
| message StateVariableRequest { | ||
| oneof method { | ||
| ValueStateCall valueStateCall = 1; | ||
| } | ||
| } | ||
|
|
||
| message ImplicitGroupingKeyRequest { | ||
| oneof method { | ||
| SetImplicitKey setImplicitKey = 1; | ||
| RemoveImplicitKey removeImplicitKey = 2; | ||
| } | ||
| } | ||
|
|
||
| message StateCallCommand { | ||
| string stateName = 1; | ||
| string schema = 2; | ||
| } | ||
|
|
||
| message ValueStateCall { | ||
| string stateName = 1; | ||
| oneof method { | ||
| Exists exists = 2; | ||
| Get get = 3; | ||
| ValueStateUpdate valueStateUpdate = 4; | ||
| Clear clear = 5; | ||
| } | ||
| } | ||
|
|
||
| message SetImplicitKey { | ||
| string key = 1; | ||
| } | ||
|
|
||
| message RemoveImplicitKey { | ||
bogao007 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| message Exists { | ||
| } | ||
|
|
||
| message Get { | ||
| } | ||
|
|
||
| message ValueStateUpdate { | ||
| string schema = 1; | ||
| bytes value = 2; | ||
| } | ||
|
|
||
| message Clear { | ||
| } | ||
|
|
||
| enum HandleState { | ||
| CREATED = 0; | ||
| INITIALIZED = 1; | ||
| DATA_PROCESSED = 2; | ||
| CLOSED = 3; | ||
| } | ||
|
|
||
| message SetHandleState { | ||
| HandleState state = 1; | ||
| } | ||
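As a rough illustration of how these messages compose on the Python side, the sketch below builds a StateRequest that sets the implicit grouping key and serializes it. The generated module name StateMessage_pb2 is an assumption about how the .proto file would be compiled (protoc --python_out), not something defined in this diff.

# Hypothetical use of the classes protoc would generate from this file
# (the module name StateMessage_pb2 is assumed, not part of the diff).
import StateMessage_pb2 as pb

request = pb.StateRequest()
request.version = 1
# Setting a nested message field selects the corresponding oneof branch.
request.implicitGroupingKeyRequest.setImplicitKey.key = "user_1"

payload = request.SerializeToString()  # bytes sent to the JVM side of the state API

response = pb.StateResponse()
response.ParseFromString(b"\x08\x00")  # e.g. a reply encoding statusCode = 0
assert response.statusCode == 0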