Handle differences in letter case in columns and fields between query

projections and filters, and the underlying Parquet file schema
apache · mallman · Jun 24, 2016 · Jun 4, 2018 · Jun 4, 2018 · Jun 4, 2018
commit 97b3a51d478f19890ded73aa78d94c055a9f144c
diff --git a/.../main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala b/.../main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources.parquet
 
-import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Expression, NamedExpression}
+import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, NamedExpression}
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
@@ -45,7 +45,9 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
       case op @ PhysicalOperation(projects, filters,
           l @ LogicalRelation(hadoopFsRelation: HadoopFsRelation, _, _, _))
         if canPruneRelation(hadoopFsRelation) =>
-        val requestedRootFields = identifyRootFields(projects, filters)
+        val (normalizedProjects, normalizedFilters) =
+          normalizeAttributeRefNames(l, projects, filters)
+        val requestedRootFields = identifyRootFields(normalizedProjects, normalizedFilters)
 
         // If requestedRootFields includes a nested field, continue. Otherwise,
         // return op
@@ -64,7 +66,8 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
             val prunedRelation = buildPrunedRelation(l, prunedParquetRelation)
             val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
 
-            buildNewProjection(projects, filters, prunedRelation, projectionOverSchema)
+            buildNewProjection(normalizedProjects, normalizedFilters, prunedRelation,
+              projectionOverSchema)
           } else {
             op
           }
@@ -79,6 +82,27 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
   private def canPruneRelation(fsRelation: HadoopFsRelation) =
     fsRelation.fileFormat.isInstanceOf[ParquetFileFormat]
 
+  /**
+   * Renames each attribute reference in the given projects and filters to the name of the
+   * logical relation output attribute that has the same expression id. This makes it possible
+   * to compare attributes and fields by name. Returns the normalized projects and filters as a
+   * tuple, in that order.
+   */
+  private def normalizeAttributeRefNames(
+      logicalRelation: LogicalRelation,
+      projects: Seq[NamedExpression],
+      filters: Seq[Expression]): (Seq[NamedExpression], Seq[Expression]) = {
+    val relationNameById = logicalRelation.output.map(att => att.exprId -> att.name).toMap
+    // Shared rewrite: attribute references unknown to the relation are left untouched
+    val renameToRelationName: PartialFunction[Expression, Expression] = {
+      case att: AttributeReference if relationNameById.contains(att.exprId) =>
+        att.withName(relationNameById(att.exprId))
+    }
+    val normalizedProjects = projects.map(_.transform(renameToRelationName))
+      .map { case expr: NamedExpression => expr }
+    (normalizedProjects, filters.map(_.transform(renameToRelationName)))
+  }
+
   /**
    * Returns the set of fields from the Parquet file that the query plan needs.
    */
@@ -142,23 +166,27 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
     sortLeftFieldsByRight(mergedDataSchema, fileDataSchema).asInstanceOf[StructType]
   }
 
+  /**
+   * Builds a pruned copy of the output relation. The copy wraps the pruned base relation and its
+   * output comes from that relation's schema, reusing the original expression ids by name.
+   */
   private def buildPrunedRelation(
       outputRelation: LogicalRelation,
-      parquetRelation: HadoopFsRelation) = {
+      prunedBaseRelation: HadoopFsRelation) = {
     // We need to replace the expression ids of the pruned relation output attributes
     // with the expression ids of the original relation output attributes so that
     // references to the original relation's output are not broken
     val outputIdMap = outputRelation.output.map(att => (att.name, att.exprId)).toMap
     val prunedRelationOutput =
-      parquetRelation
+      prunedBaseRelation
         .schema
         .toAttributes
         .map {
           case att if outputIdMap.contains(att.name) =>
             att.withExprId(outputIdMap(att.name))
           case att => att
         }
-    outputRelation.copy(relation = parquetRelation, output = prunedRelationOutput)
+    outputRelation.copy(relation = prunedBaseRelation, output = prunedRelationOutput)
   }
 
   /**

diff --git a/.../scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/.../scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
@@ -199,6 +199,69 @@ class ParquetSchemaPruningSuite
     }
   }
 
+  // Fixture types with mixed-case field names (a, B, CoL1, coL2)
+  case class MixedCaseColumn(a: String, B: Int)
+  case class MixedCase(id: Int, CoL1: String, coL2: MixedCaseColumn)
+
+  private val mixedCaseData = List(
+    MixedCase(0, "r0c1", MixedCaseColumn("abc", 1)),
+    MixedCase(1, "r1c1", MixedCaseColumn("123", 2)))
+
+  testMixedCasePruning("select with exact column names") {
+    val df = sql("select CoL1, coL2.B from mixedcase")
+    val expectedRows = Seq(
+      Row("r0c1", 1),
+      Row("r1c1", 2))
+    checkScan(df, "struct<CoL1:string,coL2:struct<B:int>>")
+    checkAnswer(df.orderBy("id"), expectedRows)
+  }
+
+  testMixedCasePruning("select with lowercase column names") {
+    val df = sql("select col1, col2.b from mixedcase")
+    val expectedRows = Seq(
+      Row("r0c1", 1),
+      Row("r1c1", 2))
+    checkScan(df, "struct<CoL1:string,coL2:struct<B:int>>")
+    checkAnswer(df.orderBy("id"), expectedRows)
+  }
+
+  testMixedCasePruning("select with different-case column names") {
+    val df = sql("select cOL1, cOl2.b from mixedcase")
+    val expectedRows = Seq(
+      Row("r0c1", 1),
+      Row("r1c1", 2))
+    checkScan(df, "struct<CoL1:string,coL2:struct<B:int>>")
+    checkAnswer(df.orderBy("id"), expectedRows)
+  }
+
+  testMixedCasePruning("filter with different-case column names") {
+    val df = sql("select id from mixedcase where Col2.b = 2")
+    // Schema pruning does not yet take filters into account, so for now the reader scans the id
+    // column together with the whole coL2 struct. Enable the scan check below once pruning with
+    // filters is supported:
+    // checkScan(df, "struct<id:int,coL2:struct<B:int>>")
+    checkAnswer(df.orderBy("id"), Seq(Row(1)))
+  }
+
+  // Registers the test once per Parquet reader. The conf is set inside the test body because
+  // test() only registers the body; a conf wrapped around test() is reset before the body runs.
+  private def testMixedCasePruning(testName: String)(testThunk: => Unit): Unit = {
+    Seq("Spark vectorized reader" -> "true", "Parquet-mr reader" -> "false").foreach {
+      case (reader, vectorized) =>
+        test(s"$reader - mixed-case schema - $testName") {
+          withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) {
+            withMixedCaseData(testThunk)
+          }
+        }
+    }
+  }
+
+  // Evaluates testThunk within withParquetTable, using mixedCaseData and the name "mixedcase"
+  private def withMixedCaseData(testThunk: => Unit): Unit = {
+    val tableName = "mixedcase"
+    withParquetTable(mixedCaseData, tableName)(testThunk)
+  }
+
   private val schemaEquality = new Equality[StructType] {
     override def areEqual(a: StructType, b: Any): Boolean =
       b match {