update parquet row converter for arrays
sadikovi committed Sep 20, 2021
commit d3d47f736286af906e66c3faf3dae7995eb28c9c
@@ -602,8 +602,10 @@ private[parquet] class ParquetRowConverter(
// matches the Catalyst array element type. If it doesn't match, then it's case 1; otherwise,
// it's case 2.
val guessedElementType = schemaConverter.convertField(repeatedType)
// We also need to check if the list element follows the backward compatible pattern.
Contributor:

shall we also update the long code comment above?

Contributor Author:

Yes, we can. I will update, thanks.

val isLegacy = schemaConverter.isElementType(repeatedType, parquetSchema.getName())
Member:

Interesting: does it mean that in the parquet-mr read path Spark was not able to handle the legacy list format? Also, do we need to do something similar for the legacy map format?

BTW: you can remove the () in parquetSchema.getName() since it is an accessor method.

Member:

I see, the existing schemaConverter.convertField(repeatedType) already covered legacy-format lists, but this particular issue is about schema evolution with newly added struct fields. I wonder whether it's better to just expand equalsIgnoreCompatibleNullability and allow the element type to contain guessedElementType.

Contributor Author:

Yes, that is correct: the legacy format would still be read by Spark; it was schema evolution of a list element that could trigger this issue. If all of the files have the same schema, everything should work just fine.

I considered having something like "contains" instead of "equals", but I had a concern that this might introduce issues when the schema "contains" the element type but should still be treated as a 3-level LIST. Also, I could not find a "contains" method for DataType in the codebase. IMHO, it is better to check Parquet compatibility using the Parquet schema rather than the Catalyst schema, which was meant to reconcile those types anyway.
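
For illustration only (this sketch is not part of the PR), the mismatch described above can be shown at the Catalyst level. Using the field names from the test added in this change, the element type guessed from an old Parquet file no longer equals the element type requested by the evolved read schema, so an equality check alone cannot decide whether the repeated group is the element of a legacy 2-level list:

import org.apache.spark.sql.types._

// Element type inferred from a file written before the new field was added.
val guessedElementType = StructType(StructField("col-0", IntegerType) :: Nil)

// Element type requested by the evolved read schema, which has the extra field.
val requestedElementType = StructType(
  StructField("col-0", IntegerType) ::
  StructField("col-1", IntegerType) ::
  Nil)

// The types differ (even ignoring nullability), which is why the converter also
// consults the Parquet-level legacy-list check (isLegacy) before falling back.
println(guessedElementType == requestedElementType) // false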

Contributor Author:

I know, making the method non-private is not ideal, but neither is adding a new DataType."contains" method or duplicating the code in another function. Let me know what you think would be a better (or the least intrusive) approach. Thanks.

Member:

Got it. Yeah, it's just a nit from me, and this looks OK to me too.


-    if (DataType.equalsIgnoreCompatibleNullability(guessedElementType, elementType)) {
+    if (DataType.equalsIgnoreCompatibleNullability(guessedElementType, elementType) || isLegacy) {
// If the repeated field corresponds to the element type, creates a new converter using the
// type of the repeated field.
newConverter(repeatedType, elementType, new ParentContainerUpdater {
@@ -266,7 +266,7 @@ class ParquetToSparkSchemaConverter(
// Here we implement Parquet LIST backwards-compatibility rules.
// See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
// scalastyle:on
-  private def isElementType(repeatedType: Type, parentName: String): Boolean = {
+  private[parquet] def isElementType(repeatedType: Type, parentName: String): Boolean = {
{
// For legacy 2-level list types with primitive element type, e.g.:
//
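For illustration only (not part of the PR), here is a rough sketch of the two Parquet LIST shapes that the backward-compatibility rules behind isElementType have to distinguish, built with parquet-mr's schema parser; the field names are made up:

import org.apache.parquet.schema.MessageTypeParser

// Legacy 2-level list: the repeated group named "array" is itself the element
// (here, a struct with a single int field).
val legacyList = MessageTypeParser.parseMessageType(
  """message spark_schema {
    |  optional group f0 (LIST) {
    |    repeated group array {
    |      optional int32 a;
    |    }
    |  }
    |}""".stripMargin)

// Standard 3-level list: the repeated group "list" only wraps the real element.
val standardList = MessageTypeParser.parseMessageType(
  """message spark_schema {
    |  optional group f0 (LIST) {
    |    repeated group list {
    |      optional group element {
    |        optional int32 a;
    |      }
    |    }
    |  }
    |}""".stripMargin)

// In both schemas the repeated group has exactly one child, so only the
// name-based rules (e.g. a repeated group named "array") reveal whether the
// repeated group itself is the element.
println(legacyList)
println(standardList)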
@@ -29,6 +29,7 @@ import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.{ArrayType, IntegerType, StructField, StructType}

class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedSparkSession {
test("parquet files with different physical schemas but share the same logical schema") {
@@ -96,6 +97,58 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedSparkSession {
}
}

test("parquet files with legacy mode and schema evolution") {
Member:

Suggested change
test("parquet files with legacy mode and schema evolution") {
test("SPARK-36803: parquet files with legacy mode and schema evolution") {

// This test case writes arrays in Parquet legacy mode and schema evolution and verifies that
// the data can be correctly read back.

Seq(false, true).foreach { legacyMode =>
withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> legacyMode.toString) {
withTempPath { tableDir =>
val schema1 = StructType(
StructField("col-0", ArrayType(
StructType(
StructField("col-0", IntegerType, true) ::
Nil
),
containsNull = false // allows to create 2-level Parquet LIST type in legacy mode
)) ::
Nil
)
val row1 = Row(Seq(Row(1)))
val df1 = spark.createDataFrame(spark.sparkContext.parallelize(row1 :: Nil, 1), schema1)
df1.write.parquet(tableDir.getAbsolutePath)

val schema2 = StructType(
StructField("col-0", ArrayType(
StructType(
StructField("col-0", IntegerType, true) ::
StructField("col-1", IntegerType, true) :: // additional field
Nil
),
containsNull = false
)) ::
Nil
)
val row2 = Row(Seq(Row(1, 2)))
val df2 = spark.createDataFrame(spark.sparkContext.parallelize(row2 :: Nil, 1), schema2)
df2.write.mode("append").parquet(tableDir.getAbsolutePath)

// Reading of data should succeed and should not fail with
// java.lang.ClassCastException: optional int32 col-0 is not a group
withAllParquetReaders {
checkAnswer(
spark.read.schema(schema2).parquet(tableDir.getAbsolutePath),
Seq(
Row(Seq(Row(1, null))),
Row(Seq(Row(1, 2)))
)
)
}
}
}
}
}

test("parquet timestamp conversion") {
// Make a table with one parquet file written by impala, and one parquet file written by spark.
// We should only adjust the timestamps in the impala file, and only if the conf is set