Skip to content
Closed

init #24

Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9f038aa
[SPARK-50112] Moving Avro files to sql/core so they can be used by Tr…
ericm-db Oct 25, 2024
28c3dbd
moving scala to scala dir
ericm-db Oct 25, 2024
2e33fd1
adding deprecated one
ericm-db Oct 25, 2024
b037859
init
ericm-db Oct 25, 2024
c1db91d
adding enum
ericm-db Oct 25, 2024
a30a29d
feedback and test
ericm-db Oct 25, 2024
2ebf6a8
creating utils class
ericm-db Oct 25, 2024
0559480
micheal feedback
ericm-db Oct 31, 2024
d3845a5
ValueState post-refactor
ericm-db Nov 1, 2024
35b3b0d
multivalue state encoder
ericm-db Nov 1, 2024
dcf0df7
encodeToUnsafeRow avro method
ericm-db Nov 2, 2024
dfc6b1e
using correct val
ericm-db Nov 4, 2024
5b98aa6
comments
ericm-db Nov 4, 2024
0d37ffd
calling encodeUnsafeRow
ericm-db Nov 4, 2024
9a1f825
merge into upstream/master
ericm-db Nov 5, 2024
5c8dd33
Merge remote-tracking branch 'upstream/master' into avro
ericm-db Nov 5, 2024
9b8dd5d
[SPARK-50127] Implement Avro encoding for MapState and PrefixKeyScanS…
ericm-db Nov 7, 2024
448ea76
making schema conversion lazy
ericm-db Nov 7, 2024
386fbf1
batch succeeds
ericm-db Nov 7, 2024
896e24f
actually enabling ttl
ericm-db Nov 7, 2024
15c5f71
including hidden files
ericm-db Nov 7, 2024
1f5e5f7
testWithEncodingTypes
ericm-db Nov 7, 2024
1826d5a
no longer relying on unsaferow
ericm-db Nov 8, 2024
c5ef895
everything but batch works
ericm-db Nov 8, 2024
e22e1a2
splitting it up
ericm-db Nov 8, 2024
730cae0
easy feedback to address
ericm-db Nov 9, 2024
754ce6c
batch works
ericm-db Nov 9, 2024
b6dbfdb
added test suite for non-contiguous ordinals
ericm-db Nov 11, 2024
e6f0b7a
using negative/null val marker
ericm-db Nov 11, 2024
ca660c0
removing log line
ericm-db Nov 11, 2024
41de8ae
getAvroEnc
ericm-db Nov 11, 2024
c49acd2
init
ericm-db Nov 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
init
  • Loading branch information
ericm-db committed Nov 11, 2024
commit c49acd28e8a965ce9462819a94cfcfc51920d092
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,8 @@ case class TransformWithStateExec(
newSchemas.values.toList, session.sessionState, stateSchemaVersion,
storeName = StateStoreId.DEFAULT_STORE_NAME,
oldSchemaFilePath = oldStateSchemaFilePath,
newSchemaFilePath = Some(newStateSchemaFilePath)))
newSchemaFilePath = Some(newStateSchemaFilePath),
usingAvro = true))
}

/** Metadata of this stateful operator and its states stores. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@

package org.apache.spark.sql.execution.streaming.state

import scala.jdk.CollectionConverters.IterableHasAsJava
import scala.util.Try

import org.apache.avro.SchemaValidatorBuilder
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkUnsupportedOperationException
import org.apache.spark.internal.{Logging, LogKeys, MDC}
import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer}
import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer, SchemaConverters}
import org.apache.spark.sql.catalyst.util.UnsafeRowUtils
import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, StatefulOperatorStateInfo}
import org.apache.spark.sql.execution.streaming.state.SchemaHelper.{SchemaReader, SchemaWriter}
Expand Down Expand Up @@ -151,22 +153,36 @@ class StateSchemaCompatibilityChecker(
private def check(
oldSchema: StateStoreColFamilySchema,
newSchema: StateStoreColFamilySchema,
ignoreValueSchema: Boolean) : Unit = {
ignoreValueSchema: Boolean,
usingAvro: Boolean) : Boolean = {
val (storedKeySchema, storedValueSchema) = (oldSchema.keySchema,
oldSchema.valueSchema)
val (keySchema, valueSchema) = (newSchema.keySchema, newSchema.valueSchema)

if (storedKeySchema.equals(keySchema) &&
(ignoreValueSchema || storedValueSchema.equals(valueSchema))) {
// schema is exactly same
false
} else if (!schemasCompatible(storedKeySchema, keySchema)) {
throw StateStoreErrors.stateStoreKeySchemaNotCompatible(storedKeySchema.toString,
keySchema.toString)
} else if (!ignoreValueSchema && usingAvro) {
// By this point, we know that old value schema is not equal to new value schema
val oldAvroSchema = SchemaConverters.toAvroType(storedValueSchema)
val newAvroSchema = SchemaConverters.toAvroType(valueSchema)
val validator = new SchemaValidatorBuilder().canReadStrategy.validateAll()
// This will throw a SchemaValidation exception if the schema has evolved in an
// unacceptable way.
validator.validate(newAvroSchema, Iterable(oldAvroSchema).asJava)
// If no exception is thrown, then we know that the schema evolved in an
// acceptable way
true
} else if (!ignoreValueSchema && !schemasCompatible(storedValueSchema, valueSchema)) {
throw StateStoreErrors.stateStoreValueSchemaNotCompatible(storedValueSchema.toString,
valueSchema.toString)
} else {
logInfo("Detected schema change which is compatible. Allowing to put rows.")
true
}
}

Expand All @@ -180,7 +196,8 @@ class StateSchemaCompatibilityChecker(
def validateAndMaybeEvolveStateSchema(
newStateSchema: List[StateStoreColFamilySchema],
ignoreValueSchema: Boolean,
stateSchemaVersion: Int): Boolean = {
stateSchemaVersion: Int,
usingAvro: Boolean): Boolean = {
val existingStateSchemaList = getExistingKeyAndValueSchema()
val newStateSchemaList = newStateSchema

Expand All @@ -195,18 +212,18 @@ class StateSchemaCompatibilityChecker(
}.toMap
// For each new state variable, we want to compare it to the old state variable
// schema with the same name
newStateSchemaList.foreach { newSchema =>
existingSchemaMap.get(newSchema.colFamilyName).foreach { existingStateSchema =>
check(existingStateSchema, newSchema, ignoreValueSchema)
}
val hasEvolvedSchema = newStateSchemaList.exists { newSchema =>
existingSchemaMap.get(newSchema.colFamilyName)
.exists(existingSchema => check(existingSchema, newSchema, ignoreValueSchema, usingAvro))
}
val colFamiliesAddedOrRemoved =
(newStateSchemaList.map(_.colFamilyName).toSet != existingSchemaMap.keySet)
if (stateSchemaVersion == SCHEMA_FORMAT_V3 && colFamiliesAddedOrRemoved) {
val newSchemaFileWritten = hasEvolvedSchema || colFamiliesAddedOrRemoved
if (stateSchemaVersion == SCHEMA_FORMAT_V3 && newSchemaFileWritten) {
createSchemaFile(newStateSchemaList.sortBy(_.colFamilyName), stateSchemaVersion)
}
// TODO: [SPARK-49535] Write Schema files after schema has changed for StateSchemaV3
colFamiliesAddedOrRemoved
newSchemaFileWritten
}
}

Expand Down Expand Up @@ -255,7 +272,8 @@ object StateSchemaCompatibilityChecker {
extraOptions: Map[String, String] = Map.empty,
storeName: String = StateStoreId.DEFAULT_STORE_NAME,
oldSchemaFilePath: Option[Path] = None,
newSchemaFilePath: Option[Path] = None): StateSchemaValidationResult = {
newSchemaFilePath: Option[Path] = None,
usingAvro: Boolean = false): StateSchemaValidationResult = {
// SPARK-47776: collation introduces the concept of binary (in)equality, which means
// in some collation we no longer be able to just compare the binary format of two
// UnsafeRows to determine equality. For example, 'aaa' and 'AAA' can be "semantically"
Expand Down Expand Up @@ -286,7 +304,7 @@ object StateSchemaCompatibilityChecker {
val result = Try(
checker.validateAndMaybeEvolveStateSchema(newStateSchema,
ignoreValueSchema = !storeConf.formatValidationCheckValue,
stateSchemaVersion = stateSchemaVersion)
stateSchemaVersion = stateSchemaVersion, usingAvro)
).toEither.fold(Some(_),
hasEvolvedSchema => {
evolvedSchema = hasEvolvedSchema
Expand Down
Loading