Skip to content

Commit fd77ec6

Browse files
cashmand authored and cloud-fan committed
[SPARK-53291][SQL] Fix nullability for value column
### What changes were proposed in this pull request? For shredded Variant, we currently always set the `value` column to be nullable. But when there is no corresponding `typed_value`, and the value doesn't represent an object field (where null implies missing from the object), the `value` is never null, and we can set the column to be required. ### Why are the changes needed? This shouldn't affect results as read by Spark, but it may cause the parquet file to be marginally larger, and the [spec](https://github.com/apache/parquet-format/blob/master/VariantShredding.md) wording indicates that `value` must be required in these situations, so a strict reader could reject the schema as it's currently being produced. ### Does this PR introduce _any_ user-facing change? Variant parquet file schema may change slightly. ### How was this patch tested? Unit test extended to cover this case. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #52043 from cashmand/fix_nullability. Authored-by: cashmand <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
1 parent 923d70f commit fd77ec6

File tree

2 files changed

+48
-7
lines changed

2 files changed

+48
-7
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/SparkShreddingUtils.scala

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -473,13 +473,15 @@ case object SparkShreddingUtils {
473473
* b: struct<typed_value: string, value: binary>>>
474474
*
475475
*/
476-
def variantShreddingSchema(dataType: DataType, isTopLevel: Boolean = true): StructType = {
476+
def variantShreddingSchema(dataType: DataType,
477+
isTopLevel: Boolean = true,
478+
isObjectField : Boolean = false): StructType = {
477479
val fields = dataType match {
478480
case ArrayType(elementType, _) =>
479481
// Always set containsNull to false. One of value or typed_value must always be set for
480482
// array elements.
481483
val arrayShreddingSchema =
482-
ArrayType(variantShreddingSchema(elementType, false), containsNull = false)
484+
ArrayType(variantShreddingSchema(elementType, false, false), containsNull = false)
483485
Seq(
484486
StructField(VariantValueFieldName, BinaryType, nullable = true),
485487
StructField(TypedValueFieldName, arrayShreddingSchema, nullable = true)
@@ -489,15 +491,17 @@ case object SparkShreddingUtils {
489491
// "value" columna as "00", and missing values are represented by setting both "value" and
490492
// "typed_value" to null.
491493
val objectShreddingSchema = StructType(fields.map(f =>
492-
f.copy(dataType = variantShreddingSchema(f.dataType, false), nullable = false)))
494+
f.copy(dataType = variantShreddingSchema(f.dataType, false, true), nullable = false)))
493495
Seq(
494496
StructField(VariantValueFieldName, BinaryType, nullable = true),
495497
StructField(TypedValueFieldName, objectShreddingSchema, nullable = true)
496498
)
497499
case VariantType =>
498-
// For Variant, we don't need a typed column
500+
// For Variant, we don't need a typed column. If there is no typed column, value is required
501+
// for array elements or top-level fields, but optional for objects (where a null represents
502+
// a missing field).
499503
Seq(
500-
StructField(VariantValueFieldName, BinaryType, nullable = true)
504+
StructField(VariantValueFieldName, BinaryType, nullable = isObjectField)
501505
)
502506
case _: NumericType | BooleanType | _: StringType | BinaryType | _: DatetimeType =>
503507
Seq(

sql/core/src/test/scala/org/apache/spark/sql/VariantWriteShreddingSuite.scala

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,12 @@ class VariantWriteShreddingSuite extends SparkFunSuite with ExpressionEvalHelper
7777
StructField("value", BinaryType, nullable = true),
7878
StructField("typed_value", IntegerType, nullable = true))))
7979

80+
// If typed_value is not provided, value is required.
81+
assert(SparkShreddingUtils.variantShreddingSchema(VariantType) ==
82+
StructType(Seq(
83+
StructField("metadata", BinaryType, nullable = false),
84+
StructField("value", BinaryType, nullable = false))))
85+
8086
val fieldA = StructType(Seq(
8187
StructField("value", BinaryType, nullable = true),
8288
StructField("typed_value", TimestampNTZType, nullable = true)))
@@ -86,10 +92,22 @@ class VariantWriteShreddingSuite extends SparkFunSuite with ExpressionEvalHelper
8692
val fieldB = StructType(Seq(
8793
StructField("value", BinaryType, nullable = true),
8894
StructField("typed_value", arrayType, nullable = true)))
95+
// If typed_value is not provided for an object field, value is still optional.
96+
val fieldC = StructType(Seq(
97+
StructField("value", BinaryType, nullable = true)))
98+
// If typed_value is not provided for an array element, value is required.
99+
val untypedArrayType = ArrayType(StructType(Seq(
100+
StructField("value", BinaryType, nullable = false))), containsNull = false)
101+
val fieldD = StructType(Seq(
102+
StructField("value", BinaryType, nullable = true),
103+
StructField("typed_value", untypedArrayType, nullable = true)))
89104
val objectType = StructType(Seq(
90105
StructField("a", fieldA, nullable = false),
91-
StructField("b", fieldB, nullable = false)))
92-
val structSchema = DataType.fromDDL("a timestamp_ntz, b array<string>")
106+
StructField("b", fieldB, nullable = false),
107+
StructField("c", fieldC, nullable = false),
108+
StructField("d", fieldD, nullable = false)))
109+
val structSchema = DataType.fromDDL(
110+
"a timestamp_ntz, b array<string>, c variant, d array<variant>")
93111
assert(SparkShreddingUtils.variantShreddingSchema(structSchema) ==
94112
StructType(Seq(
95113
StructField("metadata", BinaryType, nullable = false),
@@ -185,6 +203,8 @@ class VariantWriteShreddingSuite extends SparkFunSuite with ExpressionEvalHelper
185203
testWithSchema(obj, t, Row(obj.getMetadata, untypedValue(obj), null))
186204
}
187205

206+
testWithSchema(obj, VariantType, Row(obj.getMetadata, untypedValue(obj)))
207+
188208
// Happy path
189209
testWithSchema(obj, StructType.fromDDL("a int, b string"),
190210
Row(obj.getMetadata, null, Row(Row(null, 1), Row(null, "hello"))))
@@ -210,6 +230,11 @@ class VariantWriteShreddingSuite extends SparkFunSuite with ExpressionEvalHelper
210230
testWithSchema(obj, ArrayType(StructType.fromDDL("a int, b string")),
211231
Row(obj.getMetadata, untypedValue(obj), null))
212232

233+
// Shred with no typed_value in field schema
234+
testWithSchema(obj, StructType.fromDDL("a variant, b variant"),
235+
Row(obj.getMetadata, null,
236+
Row(Row(untypedValue("1")), Row(untypedValue("\"hello\"")))))
237+
213238
// Similar to the case above where "b" was not in the shredding schema, but with the unshredded
214239
// value being an object. Check that the copied value has correct dictionary IDs.
215240
val obj2 = parseJson("""{"a": 1, "b": {"c": "hello"}}""")
@@ -230,6 +255,9 @@ class VariantWriteShreddingSuite extends SparkFunSuite with ExpressionEvalHelper
230255
StructType.fromDDL("a int, b string")).foreach { t =>
231256
testWithSchema(arr, t, Row(arr.getMetadata, untypedValue(arr), null))
232257
}
258+
259+
testWithSchema(arr, VariantType, Row(arr.getMetadata, untypedValue(arr)))
260+
233261
// First element is shredded
234262
testWithSchema(arr, ArrayType(StructType.fromDDL("a int, b string")),
235263
Row(arr.getMetadata, null, Array(
@@ -254,6 +282,15 @@ class VariantWriteShreddingSuite extends SparkFunSuite with ExpressionEvalHelper
254282
Row(null, 2),
255283
Row(null, 3)
256284
)))
285+
286+
// No typed_value in element schema
287+
testWithSchema(arr, ArrayType(VariantType),
288+
Row(arr.getMetadata, null, Array(
289+
Row(untypedValue("""{"a": 1, "b": "hello"}""")),
290+
Row(untypedValue("2")),
291+
Row(untypedValue("null")),
292+
Row(untypedValue("4"))
293+
)))
257294
}
258295

259296
}

0 commit comments

Comments (0)