Improve readability of ParquetSchemaPruning and ParquetSchemaPruningSuite. Add test to exercise whether the requested root fields in a query exclude any attributes.
apache · mallman · Jun 24, 2016 · Jun 4, 2018 · Jun 4, 2018 · Jun 4, 2018
commit 9488cb5c9d33670bd05f14ed00a24e68ae79f2ea
diff --git a/.../main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala b/.../main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
@@ -44,15 +44,15 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
       case op @ PhysicalOperation(projects, filters,
           l @ LogicalRelation(hadoopFsRelation @ HadoopFsRelation(_, partitionSchema,
             dataSchema, _, parquetFormat: ParquetFileFormat, _), _, _, _)) =>
-        val projectionFields = projects.flatMap(getFields)
-        val filterFields = filters.flatMap(getFields)
-        val requestedFields = (projectionFields ++ filterFields).distinct
+        val projectionRootFields = projects.flatMap(getRootFields)
+        val filterRootFields = filters.flatMap(getRootFields)
+        val requestedRootFields = (projectionRootFields ++ filterRootFields).distinct
 
-        // If [[requestedFields]] includes a nested field, continue. Otherwise,
+        // If [[requestedRootFields]] includes a nested field, continue. Otherwise,
         // return [[op]]
-        if (requestedFields.exists { case (_, optAtt) => optAtt.isEmpty }) {
-          val prunedSchema = requestedFields
-            .map { case (field, _) => StructType(Array(field)) }
+        if (requestedRootFields.exists { case RootField(_, derivedFromAtt) => !derivedFromAtt }) {
+          val prunedSchema = requestedRootFields
+            .map { case RootField(field, _) => StructType(Array(field)) }
             .reduceLeft(_ merge _)
           val dataSchemaFieldNames = dataSchema.fieldNames.toSet
           val prunedDataSchema =
@@ -123,17 +123,17 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
     }
 
   /**
-   * Gets the top-level (no-parent) [[StructField]]s for the given [[Expression]].
-   * When [[expr]] is an [[Attribute]], construct a field around it and return the
-   * attribute as the second component of the returned tuple.
+   * Gets the root (aka top-level, no-parent) [[StructField]]s for the given [[Expression]].
+   * When [[expr]] is an [[Attribute]], construct a field around it and indicate that that
+   * field was derived from an attribute.
    */
-  private def getFields(expr: Expression): Seq[(StructField, Option[Attribute])] = {
+  private def getRootFields(expr: Expression): Seq[RootField] = {
     expr match {
       case att: Attribute =>
-        (StructField(att.name, att.dataType, att.nullable), Some(att)) :: Nil
-      case SelectedField(field) => (field, None) :: Nil
+        RootField(StructField(att.name, att.dataType, att.nullable), true) :: Nil
+      case SelectedField(field) => RootField(field, false) :: Nil
       case _ =>
-        expr.children.flatMap(getFields)
+        expr.children.flatMap(getRootFields)
     }
   }
 
@@ -151,4 +151,10 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
       case _ => 1
     }
   }
+
+  /**
+   * A "root" schema field (aka top-level, no-parent) and whether it was derived from
+   * an attribute or had a proper child.
+   */
+  private case class RootField(field: StructField, derivedFromAtt: Boolean)
 }
diff --git a/.../scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/.../scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala
@@ -43,15 +43,16 @@ class ParquetSchemaPruningSuite
     BriefContact(Name("Janet", "Jones"), "567 Maple Drive") ::
     BriefContact(Name("Jim", "Jones"), "6242 Ash Street") :: Nil
 
-  testStandardAndLegacyModes("partial schema intersection - select missing subfield") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-
-      makeParquetFile(contacts, new File(path + "/contacts/p=1"))
-      makeParquetFile(briefContacts, new File(path + "/contacts/p=2"))
-
-      spark.read.parquet(path + "/contacts").createOrReplaceTempView("contacts")
+  testStandardAndLegacyModes("prune a single field") {
+    withContacts {
+      val query = sql("select name.middle from contacts")
+      checkScanSchemata(query, "struct<name:struct<middle:string>>")
+      checkAnswer(query, Row("X.") :: Row("Y.") :: Row(null) :: Row(null) :: Nil)
+    }
+  }
 
+  testStandardAndLegacyModes("partial schema intersection - select missing subfield") {
+    withContacts {
       val query = sql("select name.middle, address from contacts where p=2")
       checkScanSchemata(query, "struct<name:struct<middle:string>,address:string>")
       checkAnswer(query,
@@ -61,14 +62,7 @@ class ParquetSchemaPruningSuite
   }
 
   testStandardAndLegacyModes("partial schema intersection - filter on subfield") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-
-      makeParquetFile(contacts, new File(path + "/contacts/p=1"))
-      makeParquetFile(briefContacts, new File(path + "/contacts/p=2"))
-
-      spark.read.parquet(path + "/contacts").createOrReplaceTempView("contacts")
-
+    withContacts {
       val query =
         sql("select name.middle, name.first, pets, address from contacts where " +
           "name.first = 'Janet' and p=2")
@@ -80,14 +74,7 @@ class ParquetSchemaPruningSuite
   }
 
   testStandardAndLegacyModes("no unnecessary schema pruning") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-
-      makeParquetFile(contacts, new File(path + "/contacts/p=1"))
-      makeParquetFile(briefContacts, new File(path + "/contacts/p=2"))
-
-      spark.read.parquet(path + "/contacts").createOrReplaceTempView("contacts")
-
+    withContacts {
       val query =
         sql("select name.last, name.middle, name.first, relatives[''].last, " +
           "relatives[''].middle, relatives[''].first, friends[0].last, friends[0].middle, " +
@@ -107,6 +94,15 @@ class ParquetSchemaPruningSuite
   }
 
   testStandardAndLegacyModes("empty schema intersection") {
+    withContacts {
+      val query = sql("select name.middle from contacts where p=2")
+      checkScanSchemata(query, "struct<name:struct<middle:string>>")
+      checkAnswer(query,
+        Row(null) :: Row(null) :: Nil)
+    }
+  }
+
+  private def withContacts(testThunk: => Unit) {
     withTempPath { dir =>
       val path = dir.getCanonicalPath
 
@@ -115,10 +111,7 @@ class ParquetSchemaPruningSuite
 
       spark.read.parquet(path + "/contacts").createOrReplaceTempView("contacts")
 
-      val query = sql("select name.middle from contacts where p=2")
-      checkScanSchemata(query, "struct<name:struct<middle:string>>")
-      checkAnswer(query,
-        Row(null) :: Row(null) :: Nil)
+      testThunk
     }
   }
 }