init

ericm-db · ericm-db · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
commit c49acd28e8a965ce9462819a94cfcfc51920d092
diff --git a/...core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala b/...core/src/main/scala/org/apache/spark/sql/execution/streaming/TransformWithStateExec.scala
@@ -475,7 +475,8 @@ case class TransformWithStateExec(
       newSchemas.values.toList, session.sessionState, stateSchemaVersion,
       storeName = StateStoreId.DEFAULT_STORE_NAME,
       oldSchemaFilePath = oldStateSchemaFilePath,
-      newSchemaFilePath = Some(newStateSchemaFilePath)))
+      newSchemaFilePath = Some(newStateSchemaFilePath),
+      usingAvro = true))
   }
 
   /** Metadata of this stateful operator and its states stores. */

diff --git a/...cala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala b/...cala/org/apache/spark/sql/execution/streaming/state/StateSchemaCompatibilityChecker.scala
@@ -17,14 +17,16 @@
 
 package org.apache.spark.sql.execution.streaming.state
 
+import scala.jdk.CollectionConverters.IterableHasAsJava
 import scala.util.Try
 
+import org.apache.avro.SchemaValidatorBuilder
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkUnsupportedOperationException
 import org.apache.spark.internal.{Logging, LogKeys, MDC}
-import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer}
+import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer, SchemaConverters}
 import org.apache.spark.sql.catalyst.util.UnsafeRowUtils
 import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, StatefulOperatorStateInfo}
 import org.apache.spark.sql.execution.streaming.state.SchemaHelper.{SchemaReader, SchemaWriter}
@@ -151,22 +153,36 @@ class StateSchemaCompatibilityChecker(
   private def check(
       oldSchema: StateStoreColFamilySchema,
       newSchema: StateStoreColFamilySchema,
-      ignoreValueSchema: Boolean) : Unit = {
+      ignoreValueSchema: Boolean,
+      usingAvro: Boolean) : Boolean = {
     val (storedKeySchema, storedValueSchema) = (oldSchema.keySchema,
       oldSchema.valueSchema)
     val (keySchema, valueSchema) = (newSchema.keySchema, newSchema.valueSchema)
 
     if (storedKeySchema.equals(keySchema) &&
       (ignoreValueSchema || storedValueSchema.equals(valueSchema))) {
       // schema is exactly same
+      false
     } else if (!schemasCompatible(storedKeySchema, keySchema)) {
       throw StateStoreErrors.stateStoreKeySchemaNotCompatible(storedKeySchema.toString,
         keySchema.toString)
+    } else if (!ignoreValueSchema && usingAvro) {
+      // By this point, we know that old value schema is not equal to new value schema
+      val oldAvroSchema = SchemaConverters.toAvroType(storedValueSchema)
+      val newAvroSchema = SchemaConverters.toAvroType(valueSchema)
+      val validator = new SchemaValidatorBuilder().canReadStrategy.validateAll()
+      // This will throw a SchemaValidation exception if the schema has evolved in an
+      // unacceptable way.
+      validator.validate(newAvroSchema, Iterable(oldAvroSchema).asJava)
+      // If no exception is thrown, then we know that the schema evolved in an
+      // acceptable way
+      true
     } else if (!ignoreValueSchema && !schemasCompatible(storedValueSchema, valueSchema)) {
       throw StateStoreErrors.stateStoreValueSchemaNotCompatible(storedValueSchema.toString,
         valueSchema.toString)
     } else {
       logInfo("Detected schema change which is compatible. Allowing to put rows.")
+      true
     }
   }
 
@@ -180,7 +196,8 @@ class StateSchemaCompatibilityChecker(
   def validateAndMaybeEvolveStateSchema(
       newStateSchema: List[StateStoreColFamilySchema],
       ignoreValueSchema: Boolean,
-      stateSchemaVersion: Int): Boolean = {
+      stateSchemaVersion: Int,
+      usingAvro: Boolean): Boolean = {
     val existingStateSchemaList = getExistingKeyAndValueSchema()
     val newStateSchemaList = newStateSchema
 
@@ -195,18 +212,18 @@ class StateSchemaCompatibilityChecker(
       }.toMap
       // For each new state variable, we want to compare it to the old state variable
       // schema with the same name
-      newStateSchemaList.foreach { newSchema =>
-        existingSchemaMap.get(newSchema.colFamilyName).foreach { existingStateSchema =>
-          check(existingStateSchema, newSchema, ignoreValueSchema)
-        }
+      val hasEvolvedSchema = newStateSchemaList.exists { newSchema =>
+        existingSchemaMap.get(newSchema.colFamilyName)
+          .exists(existingSchema => check(existingSchema, newSchema, ignoreValueSchema, usingAvro))
       }
       val colFamiliesAddedOrRemoved =
         (newStateSchemaList.map(_.colFamilyName).toSet != existingSchemaMap.keySet)
-      if (stateSchemaVersion == SCHEMA_FORMAT_V3 && colFamiliesAddedOrRemoved) {
+      val newSchemaFileWritten = hasEvolvedSchema || colFamiliesAddedOrRemoved
+      if (stateSchemaVersion == SCHEMA_FORMAT_V3 && newSchemaFileWritten) {
         createSchemaFile(newStateSchemaList.sortBy(_.colFamilyName), stateSchemaVersion)
       }
       // TODO: [SPARK-49535] Write Schema files after schema has changed for StateSchemaV3
-      colFamiliesAddedOrRemoved
+      newSchemaFileWritten
     }
   }
 
@@ -255,7 +272,8 @@ object StateSchemaCompatibilityChecker {
       extraOptions: Map[String, String] = Map.empty,
       storeName: String = StateStoreId.DEFAULT_STORE_NAME,
       oldSchemaFilePath: Option[Path] = None,
-      newSchemaFilePath: Option[Path] = None): StateSchemaValidationResult = {
+      newSchemaFilePath: Option[Path] = None,
+      usingAvro: Boolean = false): StateSchemaValidationResult = {
     // SPARK-47776: collation introduces the concept of binary (in)equality, which means
     // in some collation we no longer be able to just compare the binary format of two
     // UnsafeRows to determine equality. For example, 'aaa' and 'AAA' can be "semantically"
@@ -286,7 +304,7 @@ object StateSchemaCompatibilityChecker {
     val result = Try(
       checker.validateAndMaybeEvolveStateSchema(newStateSchema,
         ignoreValueSchema = !storeConf.formatValidationCheckValue,
-        stateSchemaVersion = stateSchemaVersion)
+        stateSchemaVersion = stateSchemaVersion, usingAvro)
     ).toEither.fold(Some(_),
       hasEvolvedSchema => {
         evolvedSchema = hasEvolvedSchema