MaxGekk
diff --git a/‎.github/workflows/build_and_test.yml‎
Lines changed: 6 additions & 5 deletions b/‎.github/workflows/build_and_test.yml‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎LICENSE-binary‎
Lines changed: 0 additions & 1 deletion b/‎LICENSE-binary‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎assembly/pom.xml‎
Lines changed: 0 additions & 10 deletions b/‎assembly/pom.xml‎
Lines changed: 0 additions & 10 deletions
diff --git a/‎bin/spark-pipelines‎
Lines changed: 8 additions & 1 deletion b/‎bin/spark-pipelines‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java‎
Lines changed: 8 additions & 5 deletions b/‎common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎common/utils/src/main/java/org/apache/spark/SparkThrowable.java‎
Lines changed: 6 additions & 0 deletions b/‎common/utils/src/main/java/org/apache/spark/SparkThrowable.java‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎common/utils/src/main/resources/error/error-conditions.json‎
Lines changed: 31 additions & 6 deletions b/‎common/utils/src/main/resources/error/error-conditions.json‎
Lines changed: 31 additions & 6 deletions
diff --git a/‎common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala‎
Lines changed: 52 additions & 5 deletions b/‎common/utils/src/main/scala/org/apache/spark/ErrorClassesJSONReader.scala‎
Lines changed: 52 additions & 5 deletions
diff --git a/‎common/utils/src/main/scala/org/apache/spark/SparkThrowableHelper.scala‎
Lines changed: 21 additions & 0 deletions b/‎common/utils/src/main/scala/org/apache/spark/SparkThrowableHelper.scala‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala‎
Lines changed: 11 additions & 4 deletions b/‎connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala‎
Lines changed: 11 additions & 4 deletions
@@ -362,7 +362,7 @@ jobs:
     - name: Install Python packages (Python 3.11)
       if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') || contains(matrix.modules, 'yarn')
       run: |
-        python3.11 -m pip install 'numpy>=1.22' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1'
+        python3.11 -m pip install 'numpy>=1.22' pyarrow pandas pyyaml scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1'
         python3.11 -m pip list
     # Run the tests.
     - name: Run tests
@@ -499,7 +499,8 @@ jobs:
     if: (!cancelled()) && (fromJson(needs.precondition.outputs.required).pyspark == 'true' || fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true')
     name: "Build modules: ${{ matrix.modules }}"
     runs-on: ubuntu-latest
-    timeout-minutes: 120
+    # TODO(SPARK-53605): Restore pyspark execution timeout to 2 hours after fixing test_pandas_transform_with_state
+    timeout-minutes: 150
     container:
       image: ${{ needs.precondition.outputs.image_pyspark_url_link }}
     strategy:
@@ -947,7 +948,7 @@ jobs:
     - uses: actions/setup-java@v4
       with:
         distribution: zulu
-        java-version: 25-ea
+        java-version: 25
     - name: Build with Maven
       run: |
         export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
@@ -1323,9 +1324,9 @@ jobs:
           sudo apt update
           sudo apt-get install r-base
       - name: Start Minikube
-        uses: medyagh/setup-minikube@v0.0.19
+        uses: medyagh/setup-minikube@v0.0.20
         with:
-          kubernetes-version: "1.33.0"
+          kubernetes-version: "1.34.0"
           # Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic
           cpus: 2
           memory: 6144m
 
@@ -479,7 +479,6 @@ dev.ludovic.netlib:blas
 dev.ludovic.netlib:arpack
 dev.ludovic.netlib:lapack
 net.razorvine:pickle
-org.bouncycastle:bcprov-jdk18on
 org.checkerframework:checker-qual
 org.typelevel:algebra_2.13:jar
 org.typelevel:cats-kernel_2.13
 
@@ -136,16 +136,6 @@
       <artifactId>guava</artifactId>
       <scope>${hadoop.deps.scope}</scope>
     </dependency>
-
-    <!--
-      SPARK-51311: HDFS-15098 (3.4.0) adds hard dependency on bcprov-jdk18on, Spark fails to submit
-      to Kerberized cluster without this dependency, until HADOOP-19152 (3.5.0, unreleased)
-      -->
-    <dependency>
-      <groupId>org.bouncycastle</groupId>
-      <artifactId>bcprov-jdk18on</artifactId>
-      <scope>${hadoop.deps.scope}</scope>
-    </dependency>
   </dependencies>
 
   <build>
 
@@ -30,4 +30,11 @@ fi
 export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
 export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.9-src.zip:$PYTHONPATH"
 
-exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkPipelines "$@"
+SDP_CLI_PY_FILE_PATH=$("${PYSPARK_PYTHON}" - <<'EOF'
+import pyspark, os
+from pathlib import Path
+print(Path(os.path.dirname(pyspark.__file__)) / "pipelines" / "cli.py")
+EOF
+)
+
+exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkPipelines "$SDP_CLI_PY_FILE_PATH" "$@"
@@ -642,9 +642,13 @@ public UTF8String substring(final int start, final int until) {
     }
 
     int j = i;
-    while (i < numBytes && c < until) {
-      i += numBytesForFirstByte(getByte(i));
-      c += 1;
+    if (until == Integer.MAX_VALUE) {
+      i = numBytes;
+    } else {
+      while (i < numBytes && c < until) {
+        i += numBytesForFirstByte(getByte(i));
+        c += 1;
+      }
     }
 
     if (i > j) {
@@ -663,9 +667,8 @@ public UTF8String substringSQL(int pos, int length) {
     // refers to element i-1 in the sequence. If a start index i is less than 0, it refers
     // to the -ith element before the end of the sequence. If a start index i is 0, it
     // refers to the first element.
-    int len = numChars();
     // `len + pos` does not overflow as `len >= 0`.
-    int start = (pos > 0) ? pos -1 : ((pos < 0) ? len + pos : 0);
+    int start = (pos > 0) ? pos -1 : ((pos < 0) ? numChars() + pos : 0);
 
     int end;
     if ((long) start + length > Integer.MAX_VALUE) {
 
@@ -60,6 +60,12 @@ default boolean isInternalError() {
     return SparkThrowableHelper.isInternalError(this.getCondition());
   }
 
+  // Returns null when this error condition is not associated with a breaking change.
+  default BreakingChangeInfo getBreakingChangeInfo() {
+    return SparkThrowableHelper.getBreakingChangeInfo(
+        this.getCondition()).getOrElse(() -> null);
+  }
+
   default Map<String, String> getMessageParameters() {
     return new HashMap<>();
   }
 
@@ -368,6 +368,11 @@
           "The change log writer version cannot be <version>."
         ]
       },
+      "INVALID_CHECKPOINT_LINEAGE" : {
+        "message" : [
+          "Invalid checkpoint lineage: <lineage>. <message>"
+        ]
+      },
       "KEY_ROW_FORMAT_VALIDATION_FAILURE" : {
         "message" : [
           "<msg>"
@@ -2737,6 +2742,12 @@
     ],
     "sqlState" : "42001"
   },
+  "INVALID_EXPR_TYPE_FOR_QUERY_EXECUTE_IMMEDIATE" : {
+    "message" : [
+      "Expression type must be string type but got <exprType>."
+    ],
+    "sqlState" : "42K09"
+  },
   "INVALID_EXTERNAL_TYPE" : {
     "message" : [
       "The external type <externalType> is not valid for the type <type> at the expression <expr>."
@@ -3914,12 +3925,6 @@
     },
     "sqlState" : "42K0M"
   },
-  "INVALID_VARIABLE_TYPE_FOR_QUERY_EXECUTE_IMMEDIATE" : {
-    "message" : [
-      "Variable type must be string type but got <varType>."
-    ],
-    "sqlState" : "42K09"
-  },
   "INVALID_VARIANT_CAST" : {
     "message" : [
       "The variant value `<value>` cannot be cast into `<dataType>`. Please use `try_variant_get` instead."
@@ -4915,6 +4920,12 @@
     ],
     "sqlState" : "42601"
   },
+  "REMAINDER_BY_ZERO" : {
+    "message" : [
+      "Remainder by zero. Use `try_mod` to tolerate divisor being 0 and return NULL instead. If necessary set <config> to \"false\" to bypass this error."
+    ],
+    "sqlState" : "22012"
+  },
   "RENAME_SRC_PATH_NOT_FOUND" : {
     "message" : [
       "Failed to rename as <sourcePath> was not found."
@@ -5162,6 +5173,12 @@
     ],
     "sqlState" : "42802"
   },
+  "STATE_STORE_CHECKPOINT_IDS_NOT_SUPPORTED" : {
+    "message" : [
+      "<msg>"
+    ],
+    "sqlState" : "KD002"
+  },
   "STATE_STORE_CHECKPOINT_LOCATION_NOT_EMPTY" : {
     "message" : [
       "The checkpoint location <checkpointLocation> should be empty on batch 0",
@@ -5407,6 +5424,14 @@
     },
     "sqlState" : "42616"
   },
+  "STDS_MIXED_CHECKPOINT_FORMAT_VERSIONS_NOT_SUPPORTED" : {
+    "message" : [
+      "Reading state across different checkpoint format versions is not supported.",
+      "startBatchId=<startBatchId>, endBatchId=<endBatchId>.",
+      "startFormatVersion=<startFormatVersion>, endFormatVersion=<endFormatVersion>."
+    ],
+    "sqlState" : "KD002"
+  },
   "STDS_NO_PARTITION_DISCOVERED_IN_STATE_STORE" : {
     "message" : [
       "The state does not have any partition. Please double check that the query points to the valid state. options: <sourceOptions>"
 
@@ -75,6 +75,22 @@ class ErrorClassesJsonReader(jsonFileURLs: Seq[URL]) {
     matches.map(m => m.stripSuffix(">").stripPrefix("<"))
   }
 
+  def getBreakingChangeInfo(errorClass: String): Option[BreakingChangeInfo] = {
+    val errorClasses = errorClass.split('.')
+    errorClasses match {
+      case Array(mainClass) =>
+        errorInfoMap.get(mainClass).flatMap(_.breakingChangeInfo)
+      case Array(mainClass, subClass) =>
+        errorInfoMap.get(mainClass).flatMap{
+          errorInfo =>
+            errorInfo.subClass.flatMap(_.get(subClass))
+              .flatMap(_.breakingChangeInfo)
+              .orElse(errorInfo.breakingChangeInfo)
+        }
+      case _ => None
+    }
+  }
+
   def getMessageTemplate(errorClass: String): String = {
     val errorClasses = errorClass.split("\\.")
     assert(errorClasses.length == 1 || errorClasses.length == 2)
@@ -128,7 +144,7 @@ private object ErrorClassesJsonReader {
     val map = mapper.readValue(url, new TypeReference[Map[String, ErrorInfo]]() {})
     val errorClassWithDots = map.collectFirst {
       case (errorClass, _) if errorClass.contains('.') => errorClass
-      case (_, ErrorInfo(_, Some(map), _)) if map.keys.exists(_.contains('.')) =>
+      case (_, ErrorInfo(_, Some(map), _, _)) if map.keys.exists(_.contains('.')) =>
         map.keys.collectFirst { case s if s.contains('.') => s }.get
     }
     if (errorClassWithDots.isEmpty) {
@@ -147,28 +163,59 @@ private object ErrorClassesJsonReader {
  * @param subClass SubClass associated with this class.
  * @param message Message format with optional placeholders (e.g. &lt;parm&gt;).
  *                The error message is constructed by concatenating the lines with newlines.
+ * @param breakingChangeInfo Additional metadata if the error is due to a breaking change.
  */
 private case class ErrorInfo(
     message: Seq[String],
     subClass: Option[Map[String, ErrorSubInfo]],
-    sqlState: Option[String]) {
+    sqlState: Option[String],
+    breakingChangeInfo: Option[BreakingChangeInfo] = None) {
   // For compatibility with multi-line error messages
   @JsonIgnore
-  val messageTemplate: String = message.mkString("\n")
+  val messageTemplate: String = message.mkString("\n") +
+    breakingChangeInfo.map(_.migrationMessage.mkString(" ", "\n", "")).getOrElse("")
 }
 
 /**
  * Information associated with an error subclass.
  *
  * @param message Message format with optional placeholders (e.g. &lt;parm&gt;).
  *                The error message is constructed by concatenating the lines with newlines.
+ * @param breakingChangeInfo Additional metadata if the error is due to a breaking change.
  */
-private case class ErrorSubInfo(message: Seq[String]) {
+private case class ErrorSubInfo(
+    message: Seq[String],
+    breakingChangeInfo: Option[BreakingChangeInfo] = None) {
   // For compatibility with multi-line error messages
   @JsonIgnore
-  val messageTemplate: String = message.mkString("\n")
+  val messageTemplate: String = message.mkString("\n") +
+      breakingChangeInfo.map(_.migrationMessage.mkString(" ", "\n", "")).getOrElse("")
 }
 
+/**
+ * Additional information if the error was caused by a breaking change.
+ *
+ * @param migrationMessage A message explaining how the user can migrate their job to work
+ *                         with the breaking change.
+ * @param mitigationConfig A Spark config flag that can be used to mitigate the
+ *                         breaking change.
+ * @param needsAudit If true, the breaking change should be inspected manually.
+ *                   If false, the Spark job should be retried by setting the
+ *                   mitigationConfig.
+ */
+case class BreakingChangeInfo(
+    migrationMessage: Seq[String],
+    mitigationConfig: Option[MitigationConfig] = None,
+    needsAudit: Boolean = true
+)
+
+/**
+ * A Spark config flag that can be used to mitigate a breaking change.
+ * @param key The Spark config key.
+ * @param value The Spark config value that mitigates the breaking change.
+ */
+case class MitigationConfig(key: String, value: String)
+
 /**
  * Information associated with an error state / SQLSTATE.
  *
 
@@ -73,6 +73,14 @@ private[spark] object SparkThrowableHelper {
     errorReader.getMessageParameters(errorClass)
   }
 
+  def getBreakingChangeInfo(errorClass: String): Option[BreakingChangeInfo] = {
+    if (errorClass == null) {
+      None
+    } else {
+      errorReader.getBreakingChangeInfo(errorClass)
+    }
+  }
+
   def isInternalError(errorClass: String): Boolean = {
     errorClass != null && errorClass.startsWith("INTERNAL_ERROR")
   }
@@ -99,6 +107,19 @@ private[spark] object SparkThrowableHelper {
           g.writeStringField("errorClass", errorClass)
           if (format == STANDARD) {
             g.writeStringField("messageTemplate", errorReader.getMessageTemplate(errorClass))
+            errorReader.getBreakingChangeInfo(errorClass).foreach { breakingChangeInfo =>
+              g.writeObjectFieldStart("breakingChangeInfo")
+              g.writeStringField("migrationMessage",
+                  breakingChangeInfo.migrationMessage.mkString("\n"))
+              breakingChangeInfo.mitigationConfig.foreach { mitigationConfig =>
+                g.writeObjectFieldStart("mitigationConfig")
+                g.writeStringField("key", mitigationConfig.key)
+                g.writeStringField("value", mitigationConfig.value)
+                g.writeEndObject()
+              }
+              g.writeBooleanField("needsAudit", breakingChangeInfo.needsAudit)
+              g.writeEndObject()
+            }
           }
           val sqlState = e.getSqlState
           if (sqlState != null) g.writeStringField("sqlState", sqlState)
 
@@ -91,6 +91,8 @@ private[kafka010] class KafkaMicroBatchStream(
 
   private var allDataForTriggerAvailableNow: PartitionOffsetMap = _
 
+  private var isTriggerAvailableNow: Boolean = false
+
   /**
    * Lazily initialize `initialPartitionOffsets` to make sure that `KafkaConsumer.poll` is only
    * called in StreamExecutionThread. Otherwise, interrupting a thread while running
@@ -126,8 +128,14 @@ private[kafka010] class KafkaMicroBatchStream(
     val startPartitionOffsets = start.asInstanceOf[KafkaSourceOffset].partitionToOffsets
 
     // Use the pre-fetched list of partition offsets when Trigger.AvailableNow is enabled.
-    latestPartitionOffsets = if (allDataForTriggerAvailableNow != null) {
-      allDataForTriggerAvailableNow
+    latestPartitionOffsets = if (isTriggerAvailableNow) {
+      if (allDataForTriggerAvailableNow != null) {
+        allDataForTriggerAvailableNow
+      } else {
+        allDataForTriggerAvailableNow =
+          kafkaOffsetReader.fetchLatestOffsets(Some(startPartitionOffsets))
+        allDataForTriggerAvailableNow
+      }
     } else {
       kafkaOffsetReader.fetchLatestOffsets(Some(startPartitionOffsets))
     }
@@ -359,8 +367,7 @@ private[kafka010] class KafkaMicroBatchStream(
   }
 
   override def prepareForTriggerAvailableNow(): Unit = {
-    allDataForTriggerAvailableNow = kafkaOffsetReader.fetchLatestOffsets(
-      Some(getOrCreateInitialPartitionOffsets()))
+    isTriggerAvailableNow = true
   }
 }
Original file line number	Diff line number	Diff line change
`@@ -60,6 +60,12 @@ default boolean isInternalError() {`
`60`	`60`	`return SparkThrowableHelper.isInternalError(this.getCondition());`
`61`	`61`	`}`
`62`	`62`
	`63`	`+ // Returns null when this error condition is not associated with a breaking change.`
	`64`	`+ default BreakingChangeInfo getBreakingChangeInfo() {`
	`65`	`+ return SparkThrowableHelper.getBreakingChangeInfo(`
	`66`	`+ this.getCondition()).getOrElse(() -> null);`
	`67`	`+ }`
	`68`	`+`
`63`	`69`	`default Map<String, String> getMessageParameters() {`
`64`	`70`	`return new HashMap<>();`
`65`	`71`	`}`