init #24
Closed

Changes from 1 commit (of the 32 commits in this pull request):
9f038aa  [SPARK-50112] Moving Avro files to sql/core so they can be used by Tr…  (ericm-db, Oct 25, 2024)
28c3dbd  moving scala to scala dir  (ericm-db, Oct 25, 2024)
2e33fd1  adding deprecated one  (ericm-db, Oct 25, 2024)
b037859  init  (ericm-db, Oct 25, 2024)
c1db91d  adding enum  (ericm-db, Oct 25, 2024)
a30a29d  feedback and test  (ericm-db, Oct 25, 2024)
2ebf6a8  creating utils class  (ericm-db, Oct 25, 2024)
0559480  micheal feedback  (ericm-db, Oct 31, 2024)
d3845a5  ValueState post-refactor  (ericm-db, Nov 1, 2024)
35b3b0d  multivalue state encoder  (ericm-db, Nov 1, 2024)
dcf0df7  encodeToUnsafeRow avro method  (ericm-db, Nov 2, 2024)
dfc6b1e  using correct val  (ericm-db, Nov 4, 2024)
5b98aa6  comments  (ericm-db, Nov 4, 2024)
0d37ffd  calling encodeUnsafeRow  (ericm-db, Nov 4, 2024)
9a1f825  merge into upstream/master  (ericm-db, Nov 5, 2024)
5c8dd33  Merge remote-tracking branch 'upstream/master' into avro  (ericm-db, Nov 5, 2024)
9b8dd5d  [SPARK-50127] Implement Avro encoding for MapState and PrefixKeyScanS…  (ericm-db, Nov 7, 2024)
448ea76  making schema conversion lazy  (ericm-db, Nov 7, 2024)
386fbf1  batch succeeds  (ericm-db, Nov 7, 2024)
896e24f  actually enabling ttl  (ericm-db, Nov 7, 2024)
15c5f71  including hidden files  (ericm-db, Nov 7, 2024)
1f5e5f7  testWithEncodingTypes  (ericm-db, Nov 7, 2024)
1826d5a  no longer relying on unsaferow  (ericm-db, Nov 8, 2024)
c5ef895  everything but batch works  (ericm-db, Nov 8, 2024)
e22e1a2  splitting it up  (ericm-db, Nov 8, 2024)
730cae0  easy feedback to address  (ericm-db, Nov 9, 2024)
754ce6c  batch works  (ericm-db, Nov 9, 2024)
b6dbfdb  added test suite for non-contiguous ordinals  (ericm-db, Nov 11, 2024)
e6f0b7a  using negative/null val marker  (ericm-db, Nov 11, 2024)
ca660c0  removing log line  (ericm-db, Nov 11, 2024)
41de8ae  getAvroEnc  (ericm-db, Nov 11, 2024)
c49acd2  init  (ericm-db, Nov 5, 2024)
Commit shown in this diff:
c5ef895875cd8d677ec70f7cf7612116d06e0c8b  everything but batch works  (committed by ericm-db, Nov 8, 2024)
@@ -259,6 +259,19 @@ class IncrementalExecution(
     }
   }
 
+  object StateStoreColumnFamilySchemas extends SparkPlanPartialRule {
+    override val rule: PartialFunction[SparkPlan, SparkPlan] = {
+      case statefulOp: StatefulOperator =>
+        statefulOp match {
+          case transformWithStateExec: TransformWithStateExec =>
+            transformWithStateExec.copy(
+              columnFamilySchemas = transformWithStateExec.getColFamilySchemas()
+            )
+          case _ => statefulOp
+        }
+    }
+  }
+
   object StateOpIdRule extends SparkPlanPartialRule {
     override val rule: PartialFunction[SparkPlan, SparkPlan] = {
       case StateStoreSaveExec(keys, None, None, None, None, stateFormatVersion,
@@ -552,9 +565,9 @@ class IncrementalExecution(
       // The rule below doesn't change the plan but can cause the side effect that
       // metadata/schema is written in the checkpoint directory of stateful operator.
       planWithStateOpId transform StateSchemaAndOperatorMetadataRule.rule
 
-      simulateWatermarkPropagation(planWithStateOpId)
-      planWithStateOpId transform WatermarkPropagationRule.rule
+      val planWithStateSchemas = planWithStateOpId transform StateStoreColumnFamilySchemas.rule
+      simulateWatermarkPropagation(planWithStateSchemas)
+      planWithStateSchemas transform WatermarkPropagationRule.rule
     }
   }
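The new StateStoreColumnFamilySchemas rule above follows Catalyst's partial-rule pattern: `transform` walks the physical plan and applies a `PartialFunction` only to the nodes it is defined for, here copying the column family schemas onto `TransformWithStateExec` before watermark propagation runs. A minimal, self-contained sketch of that pattern, using toy plan types rather than Spark's classes:

```scala
// Toy model of Catalyst's `transform` with a partial rule. None of these
// types are Spark's; they only illustrate how a rule like
// StateStoreColumnFamilySchemas rewrites matching nodes and leaves the rest alone.
sealed trait Plan {
  def children: Seq[Plan]
  def withChildren(newChildren: Seq[Plan]): Plan
  // Rewrite children first, then apply the rule to this node if defined.
  def transform(rule: PartialFunction[Plan, Plan]): Plan = {
    val rewritten = withChildren(children.map(_.transform(rule)))
    rule.applyOrElse(rewritten, identity[Plan])
  }
}
case class Scan(table: String) extends Plan {
  def children: Seq[Plan] = Nil
  def withChildren(newChildren: Seq[Plan]): Plan = this
}
// Stand-in for TransformWithStateExec: schemas start empty and are filled
// in by the rule, mirroring the copy(...) call in the real rule.
case class StatefulOp(schemas: Map[String, String], child: Plan) extends Plan {
  def children: Seq[Plan] = Seq(child)
  def withChildren(newChildren: Seq[Plan]): Plan = copy(child = newChildren.head)
}

object PartialRuleDemo extends App {
  val rule: PartialFunction[Plan, Plan] = {
    case op: StatefulOp => op.copy(schemas = Map("valueState" -> "struct<value:long>"))
  }
  val plan = StatefulOp(Map.empty, Scan("events"))
  println(plan.transform(rule)) // schemas filled in; Scan child untouched
}
```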
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.streaming
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.streaming.state.{AvroEncoderSpec, NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors}
 import org.apache.spark.sql.streaming.ListState
 import org.apache.spark.sql.types.StructType
 
@@ -42,7 +42,7 @@ class ListStateImpl[S](
     keyExprEnc: ExpressionEncoder[Any],
     valEncoder: ExpressionEncoder[Any],
     metrics: Map[String, SQLMetric] = Map.empty,
-    avroEnc: Option[AvroEncoderSpec] = None)
+    avroEnc: Option[AvroEncoder] = None)
   extends ListStateMetricsImpl
   with ListState[S]
   with Logging {
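The `AvroEncoderSpec` to `AvroEncoder` rename seen here recurs through the rest of the diff. The definition of `AvroEncoder` itself is not shown; judging from the six-argument call site in `StateStoreColumnFamilySchemaUtils` (`AvroEncoder(keySer, keyDe, valueSerializer, valueDeserializer, suffixKeySer, suffixKeyDe)`), it presumably bundles one serializer/deserializer pair each for the key, the value, and an optional suffix key. A hypothetical sketch, with the field names assumed:

```scala
package org.apache.spark.sql.execution.streaming.state

import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer}

// Presumed shape of AvroEncoder, inferred from its call site; the actual
// definition is not part of this diff, and these field names are guesses.
case class AvroEncoder(
    keySerializer: AvroSerializer,
    keyDeserializer: AvroDeserializer,
    valueSerializer: AvroSerializer,
    valueDeserializer: AvroDeserializer,
    suffixKeySerializer: Option[AvroSerializer] = None,
    suffixKeyDeserializer: Option[AvroDeserializer] = None)
```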
@@ -20,7 +20,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._
-import org.apache.spark.sql.execution.streaming.state.{AvroEncoderSpec, NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors}
 import org.apache.spark.sql.streaming.{ListState, TTLConfig}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.NextIterator
@@ -38,7 +38,7 @@ import org.apache.spark.util.NextIterator
  * @param metrics - metrics to be updated as part of stateful processing
  * @param avroEnc - optional Avro serializer and deserializer for this state variable that
  *                  is used by the StateStore to encode state in Avro format
- * @param ttlAvroEnc - optional Avro serializer and deserializer for TTL state that
+ * @param secondaryIndexAvroEnc - optional Avro serializer and deserializer for TTL state that
  *                  is used by the StateStore to encode state in Avro format
  * @tparam S - data type of object that will be stored
  */
@@ -50,9 +50,10 @@ class ListStateImplWithTTL[S](
     ttlConfig: TTLConfig,
     batchTimestampMs: Long,
     metrics: Map[String, SQLMetric] = Map.empty,
-    avroEnc: Option[AvroEncoderSpec] = None,
-    ttlAvroEnc: Option[AvroEncoderSpec] = None)
-  extends SingleKeyTTLStateImpl(stateName, store, keyExprEnc, batchTimestampMs, ttlAvroEnc)
+    avroEnc: Option[AvroEncoder] = None,
+    secondaryIndexAvroEnc: Option[AvroEncoder] = None)
+  extends SingleKeyTTLStateImpl(
+    stateName, store, keyExprEnc, batchTimestampMs, secondaryIndexAvroEnc)
   with ListStateMetricsImpl
   with ListState[S] {
@@ -20,7 +20,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._
-import org.apache.spark.sql.execution.streaming.state.{AvroEncoderSpec, PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors, UnsafeRowPair}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors, UnsafeRowPair}
 import org.apache.spark.sql.streaming.MapState
 import org.apache.spark.sql.types.StructType
 
@@ -44,7 +44,7 @@ class MapStateImpl[K, V](
     userKeyEnc: ExpressionEncoder[Any],
     valEncoder: ExpressionEncoder[Any],
     metrics: Map[String, SQLMetric] = Map.empty,
-    avroEnc: Option[AvroEncoderSpec] = None) extends MapState[K, V] with Logging {
+    avroEnc: Option[AvroEncoder] = None) extends MapState[K, V] with Logging {
 
   // Pack grouping key and user key together as a prefixed composite key
   private val schemaForCompositeKeyRow: StructType = {
@@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._
-import org.apache.spark.sql.execution.streaming.state.{AvroEncoderSpec, PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors}
 import org.apache.spark.sql.streaming.{MapState, TTLConfig}
 import org.apache.spark.util.NextIterator
 
@@ -38,25 +38,25 @@ import org.apache.spark.util.NextIterator
  * @param metrics - metrics to be updated as part of stateful processing
  * @param avroEnc - optional Avro serializer and deserializer for this state variable that
  *                  is used by the StateStore to encode state in Avro format
- * @param ttlAvroEnc - optional Avro serializer and deserializer for TTL state that
+ * @param secondaryIndexAvroEnc - optional Avro serializer and deserializer for TTL state that
  *                  is used by the StateStore to encode state in Avro format
  * @tparam K - type of key for map state variable
  * @tparam V - type of value for map state variable
  * @return - instance of MapState of type [K,V] that can be used to store state persistently
  */
 class MapStateImplWithTTL[K, V](
-    store: StateStore,
-    stateName: String,
-    keyExprEnc: ExpressionEncoder[Any],
-    userKeyEnc: ExpressionEncoder[Any],
-    valEncoder: ExpressionEncoder[Any],
-    ttlConfig: TTLConfig,
-    batchTimestampMs: Long,
-    metrics: Map[String, SQLMetric] = Map.empty,
-    avroEnc: Option[AvroEncoderSpec] = None,
-    ttlAvroEnc: Option[AvroEncoderSpec] = None)
+    store: StateStore,
+    stateName: String,
+    keyExprEnc: ExpressionEncoder[Any],
+    userKeyEnc: ExpressionEncoder[Any],
+    valEncoder: ExpressionEncoder[Any],
+    ttlConfig: TTLConfig,
+    batchTimestampMs: Long,
+    metrics: Map[String, SQLMetric] = Map.empty,
+    avroEnc: Option[AvroEncoder] = None,
+    secondaryIndexAvroEnc: Option[AvroEncoder] = None)
   extends CompositeKeyTTLStateImpl[K](stateName, store,
-    keyExprEnc, userKeyEnc, batchTimestampMs, ttlAvroEnc)
+    keyExprEnc, userKeyEnc, batchTimestampMs, secondaryIndexAvroEnc)
   with MapState[K, V] with Logging {
 
   private val stateTypesEncoder = new CompositeKeyStateEncoder(
@@ -21,15 +21,14 @@ import org.apache.spark.sql.Encoder
 import org.apache.spark.sql.avro.{AvroDeserializer, AvroOptions, AvroSerializer, SchemaConverters}
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._
-import org.apache.spark.sql.execution.streaming.state.{AvroEncoderSpec, NoPrefixKeyStateEncoderSpec, PrefixKeyScanStateEncoderSpec, RangeKeyScanStateEncoderSpec, StateStoreColFamilySchema}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, NoPrefixKeyStateEncoderSpec, PrefixKeyScanStateEncoderSpec, RangeKeyScanStateEncoderSpec, StateStoreColFamilySchema}
 import org.apache.spark.sql.types.{BinaryType, BooleanType, ByteType, DataType, DoubleType, FloatType, IntegerType, LongType, NullType, ShortType, StructField, StructType}
 
 object StateStoreColumnFamilySchemaUtils {
 
   def apply(initializeAvroSerde: Boolean): StateStoreColumnFamilySchemaUtils =
     new StateStoreColumnFamilySchemaUtils(initializeAvroSerde)
 
-
   /**
    * Avro uses zig-zag encoding for some fixed-length types, like Longs and Ints. For range scans
    * we want to use big-endian encoding, so we need to convert the source schema to replace these
@@ -76,6 +75,16 @@
  */
 class StateStoreColumnFamilySchemaUtils(initializeAvroSerde: Boolean) extends Logging {
 
+  private def getAvroSerdeForSchema(schema: StructType): (AvroSerializer, AvroDeserializer) = {
+    val avroType = SchemaConverters.toAvroType(schema)
+    val avroOptions = AvroOptions(Map.empty)
+    val serializer = new AvroSerializer(schema, avroType, nullable = false)
+    val deserializer = new AvroDeserializer(avroType, schema,
+      avroOptions.datetimeRebaseModeInRead, avroOptions.useStableIdForUnionType,
+      avroOptions.stableIdPrefixForUnionType, avroOptions.recursiveFieldMaxDepth)
+    (serializer, deserializer)
+  }
+
   /**
    * If initializeAvroSerde is true, this method will create an Avro Serializer and Deserializer
    * for a particular key and value schema.
@@ -84,30 +93,19 @@ class StateStoreColumnFamilySchemaUtils(initializeAvroSerde: Boolean) extends Lo
       keySchema: StructType,
       valSchema: StructType,
       suffixKeySchema: Option[StructType] = None
-  ): Option[AvroEncoderSpec] = {
+  ): Option[AvroEncoder] = {
     if (initializeAvroSerde) {
-      val avroType = SchemaConverters.toAvroType(valSchema)
-      val avroOptions = AvroOptions(Map.empty)
-      val keyAvroType = SchemaConverters.toAvroType(keySchema)
-      val keySer = new AvroSerializer(keySchema, keyAvroType, nullable = false)
-      val keyDe = new AvroDeserializer(keyAvroType, keySchema,
-        avroOptions.datetimeRebaseModeInRead, avroOptions.useStableIdForUnionType,
-        avroOptions.stableIdPrefixForUnionType, avroOptions.recursiveFieldMaxDepth)
-      val valueSerializer = new AvroSerializer(valSchema, avroType, nullable = false)
-      val valueDeserializer = new AvroDeserializer(avroType, valSchema,
-        avroOptions.datetimeRebaseModeInRead, avroOptions.useStableIdForUnionType,
-        avroOptions.stableIdPrefixForUnionType, avroOptions.recursiveFieldMaxDepth)
+      val (keySer, keyDe) =
+        getAvroSerdeForSchema(keySchema)
+      val (valueSerializer, valueDeserializer) =
+        getAvroSerdeForSchema(valSchema)
       val (suffixKeySer, suffixKeyDe) = if (suffixKeySchema.isDefined) {
-        val userKeyAvroType = SchemaConverters.toAvroType(suffixKeySchema.get)
-        val skSer = new AvroSerializer(suffixKeySchema.get, userKeyAvroType, nullable = false)
-        val skDe = new AvroDeserializer(userKeyAvroType, suffixKeySchema.get,
-          avroOptions.datetimeRebaseModeInRead, avroOptions.useStableIdForUnionType,
-          avroOptions.stableIdPrefixForUnionType, avroOptions.recursiveFieldMaxDepth)
-        (Some(skSer), Some(skDe))
+        val serde = getAvroSerdeForSchema(suffixKeySchema.get)
+        (Some(serde._1), Some(serde._2))
       } else {
        (None, None)
      }
-      Some(AvroEncoderSpec(
+      Some(AvroEncoder(
        keySer, keyDe, valueSerializer, valueDeserializer, suffixKeySer, suffixKeyDe))
    } else {
      None
@@ -164,6 +162,11 @@ class StateStoreColumnFamilySchemaUtils(initializeAvroSerde: Boolean) extends Lo
     )
   }
 
+  // This function creates the StateStoreColFamilySchema for
+  // the TTL secondary index.
+  // Because we want to encode fixed-length types as binary types
+  // if we are using Avro, we need to do some schema conversion to ensure
+  // we can use range scan
   def getTtlStateSchema(
       stateName: String,
       keyEncoder: ExpressionEncoder[Any]): StateStoreColFamilySchema = {
@@ -184,6 +187,11 @@ class StateStoreColumnFamilySchemaUtils(initializeAvroSerde: Boolean) extends Lo
     )
   }
 
+  // This function creates the StateStoreColFamilySchema for
+  // the TTL secondary index.
+  // Because we want to encode fixed-length types as binary types
+  // if we are using Avro, we need to do some schema conversion to ensure
+  // we can use range scan
   def getTtlStateSchema(
       stateName: String,
       keyEncoder: ExpressionEncoder[Any],
@@ -221,6 +229,11 @@ class StateStoreColumnFamilySchemaUtils(initializeAvroSerde: Boolean) extends Lo
     ))
   }
 
+  // This function creates the StateStoreColFamilySchema for
+  // Timers' secondary index.
+  // Because we want to encode fixed-length types as binary types
+  // if we are using Avro, we need to do some schema conversion to ensure
+  // we can use range scan
   def getTimerStateSchemaForSecIndex(
       stateName: String,
       keySchema: StructType,
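The comments added in this file all point at the same constraint: Avro serializes Ints and Longs as zig-zag varints, which are not byte-wise comparable, while the state store's range scans compare raw key bytes. Writing those fields as fixed-width big-endian binary, with the sign bit flipped, makes unsigned lexicographic byte order agree with signed numeric order, which is why the TTL and timer schemas swap fixed-length types for BinaryType. A self-contained sketch of the trick (illustrative only, not this PR's code):

```scala
import java.nio.ByteBuffer
import java.util.Arrays

object BigEndianOrderDemo extends App {
  // Flip the sign bit, then write fixed-width big-endian bytes. After this,
  // unsigned byte-wise comparison of the arrays matches signed Long order.
  def encodeLong(v: Long): Array[Byte] =
    ByteBuffer.allocate(8).putLong(v ^ Long.MinValue).array()

  val values = Seq(-2L, -1L, 0L, 1L, 42L)
  val sortedByBytes = values.sortWith { (x, y) =>
    Arrays.compareUnsigned(encodeLong(x), encodeLong(y)) < 0
  }
  assert(sortedByBytes == values.sorted) // byte order == numeric order
  println(sortedByBytes) // List(-2, -1, 0, 1, 42)
}
```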
@@ -394,10 +394,10 @@ class DriverStatefulProcessorHandleImpl(
     val stateName = TimerStateUtils.getTimerStateVarName(timeMode.toString)
     val secIndexColFamilyName = TimerStateUtils.getSecIndexColFamilyName(timeMode.toString)
     val timerEncoder = new TimerKeyEncoder(keyExprEnc)
-    val colFamilySchema = schemaUtils.
-      getTimerStateSchema(stateName, timerEncoder.schemaForKeyRow, timerEncoder.schemaForValueRow)
-    val secIndexColFamilySchema = schemaUtils.
-      getTimerStateSchemaForSecIndex(secIndexColFamilyName,
+    val colFamilySchema = schemaUtils
+      .getTimerStateSchema(stateName, timerEncoder.schemaForKeyRow, timerEncoder.schemaForValueRow)
+    val secIndexColFamilySchema = schemaUtils
+      .getTimerStateSchemaForSecIndex(secIndexColFamilyName,
       timerEncoder.keySchemaForSecIndex,
       timerEncoder.schemaForValueRow)
     columnFamilySchemas.put(stateName, colFamilySchema)
@@ -458,8 +458,8 @@ class DriverStatefulProcessorHandleImpl(
     }
 
     val stateEncoder = encoderFor[T]
-    val colFamilySchema = schemaUtils.
-      getListStateSchema(stateName, keyExprEnc, stateEncoder, ttlEnabled)
+    val colFamilySchema = schemaUtils
+      .getListStateSchema(stateName, keyExprEnc, stateEncoder, ttlEnabled)
     checkIfDuplicateVariableDefined(stateName)
     columnFamilySchemas.put(stateName, colFamilySchema)
     val stateVariableInfo = TransformWithStateVariableUtils.
@@ -494,8 +494,8 @@ class DriverStatefulProcessorHandleImpl(
     }
 
 
-    val colFamilySchema = schemaUtils.
-      getMapStateSchema(stateName, keyExprEnc, userKeyEnc, valEncoder, ttlEnabled)
+    val colFamilySchema = schemaUtils
+      .getMapStateSchema(stateName, keyExprEnc, userKeyEnc, valEncoder, ttlEnabled)
     columnFamilySchemas.put(stateName, colFamilySchema)
     val stateVariableInfo = TransformWithStateVariableUtils.
       getMapState(stateName, ttlEnabled = ttlEnabled)
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
 import org.apache.spark.sql.execution.streaming.StateStoreColumnFamilySchemaUtils.getTtlColFamilyName
 import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._
-import org.apache.spark.sql.execution.streaming.state.{AvroEncoderSpec, RangeKeyScanStateEncoderSpec, StateStore}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, RangeKeyScanStateEncoderSpec, StateStore}
 import org.apache.spark.sql.types._
 
 object StateTTLSchema {
@@ -81,7 +81,7 @@ abstract class SingleKeyTTLStateImpl(
     store: StateStore,
     keyExprEnc: ExpressionEncoder[Any],
     ttlExpirationMs: Long,
-    avroEnc: Option[AvroEncoderSpec] = None)
+    avroEnc: Option[AvroEncoder] = None)
   extends TTLState {
 
   import org.apache.spark.sql.execution.streaming.StateTTLSchema._
@@ -202,7 +202,7 @@ abstract class CompositeKeyTTLStateImpl[K](
     keyExprEnc: ExpressionEncoder[Any],
     userKeyEncoder: ExpressionEncoder[Any],
     ttlExpirationMs: Long,
-    avroEnc: Option[AvroEncoderSpec] = None)
+    avroEnc: Option[AvroEncoder] = None)
   extends TTLState {
 
   import org.apache.spark.sql.execution.streaming.StateTTLSchema._
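For context on why `SingleKeyTTLStateImpl` and `CompositeKeyTTLStateImpl` take an encoder geared for range scans (`RangeKeyScanStateEncoderSpec` is imported above): TTL state acts as a secondary index ordered by expiration timestamp, so expired entries can be found with a bounded range scan instead of a full pass over the primary state. A toy sketch of that idea, not the StateStore API:

```scala
import scala.collection.mutable

object TtlIndexDemo extends App {
  // Primary state: key -> value.
  val primary = mutable.Map.empty[String, String]
  // Secondary index sorted by (expirationMs, key); this ordering is what the
  // range-scan key encoder must preserve at the byte level.
  val ttlIndex = mutable.TreeMap.empty[(Long, String), Unit]

  def put(key: String, value: String, expirationMs: Long): Unit = {
    primary(key) = value
    ttlIndex((expirationMs, key)) = ()
  }

  // Evict everything that expired strictly before nowMs with one range scan.
  def evictExpired(nowMs: Long): Unit = {
    val expired = ttlIndex.range((Long.MinValue, ""), (nowMs, "")).keys.toSeq
    expired.foreach { case (exp, key) =>
      ttlIndex.remove((exp, key))
      primary.remove(key)
    }
  }

  put("a", "1", expirationMs = 100L)
  put("b", "2", expirationMs = 200L)
  evictExpired(nowMs = 150L)
  println(primary) // Map(b -> 2)
}
```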
@@ -65,8 +65,8 @@ class TimerStateImpl(
     store: StateStore,
     timeMode: TimeMode,
     keyExprEnc: ExpressionEncoder[Any],
-    avroEnc: Option[AvroEncoderSpec] = None,
-    secIndexAvroEnc: Option[AvroEncoderSpec] = None) extends Logging {
+    avroEnc: Option[AvroEncoder] = None,
+    secIndexAvroEnc: Option[AvroEncoder] = None) extends Logging {
 
   private val EMPTY_ROW =
     UnsafeProjection.create(Array[DataType](NullType)).apply(InternalRow.apply(null))