Closed
Changes from 1 commit (24 commits in this pull request)
6e41081  [SPARK-4502][SQL] Parquet nested column pruning (Jun 24, 2016)
0d0e8a0  Refactor SelectedFieldSuite to make its tests simpler and more (mallman, Jun 4, 2018)
44e78cb  Remove test "select function over nested data" of unknown origin and (mallman, Jun 4, 2018)
9488cb5  Improve readability of ParquetSchemaPruning and (mallman, Jun 4, 2018)
f3735b1  Don't handle non-data-field partition column names specially when (mallman, Jun 4, 2018)
2120ab5  Add test coverage for ParquetSchemaPruning for partitioned tables whose (mallman, Jun 4, 2018)
8d53bbd  Remove the ColumnarFileFormat type to put it in another PR (mallman, Jun 12, 2018)
e213471  Add test coverage for the enhancements to "is not null" constraint (mallman, Jun 12, 2018)
9e6ef5f  Revert changes to QueryPlanConstraints.scala and basicPhysicalOperato… (mallman, Jun 24, 2018)
e6ea9c2  Revert a whitespace change in DataSourceScanExec.scala (mallman, Jun 24, 2018)
2d02ab3  Remove modifications to ParquetFileFormat.scala and (mallman, Jul 21, 2018)
cfffc95  PR review: simplify some syntax and add a code doc (mallman, Jul 21, 2018)
2779351  When creating a pruned schema by merging an array of root structs, sort (mallman, Jul 28, 2018)
9329f77  Re-enable ignored test in ParquetSchemaPruningSuite.scala that is (mallman, Aug 4, 2018)
ec313c1  Enable schema pruning by default (mallman, Aug 4, 2018)
42aff39  Revert "Enable schema pruning by default" (mallman, Aug 5, 2018)
71f4c7b  Add a method to not only check a query's scan schemata, but verify th… (mallman, Aug 5, 2018)
0e5594b  Revert "Revert "Enable schema pruning by default"" (mallman, Aug 9, 2018)
1573ae8  Revert changes to ParquetTest.scala. I'm sure they were useful at some (mallman, Aug 17, 2018)
09dd655  Refactor code based on code review comments (ajacques, Aug 2, 2018)
61c7937  Update terminology for parquet-mr reader in (mallman, Aug 20, 2018)
97b3a51  Handle differences in letter case in columns and fields between query (mallman, Aug 21, 2018)
2711746  Add test permutations to the "testMixedCasePruning" method to test (mallman, Aug 21, 2018)
e6baf68  Disable SQL schema pruning by default and revert changes to (mallman, Aug 23, 2018)
[SPARK-4502][SQL] Parquet nested column pruning
Michael Allman authored and mallman committed Aug 9, 2018
commit 6e41081e98be40fde7f9099670c4ef3a84531cf0
@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.planning

import org.apache.spark.sql.catalyst.expressions.{Expression, GetStructField}
import org.apache.spark.sql.types.StructField

/**
* A Scala extractor that extracts the child expression and struct field from a [[GetStructField]].
* This is in contrast to the [[GetStructField]] case class extractor which returns the field
* ordinal instead of the field itself.
*/
private[planning] object GetStructFieldObject {
  def unapply(getStructField: GetStructField): Option[(Expression, StructField)] =
    Some((
      getStructField.child,
      getStructField.childSchema(getStructField.ordinal)))
}
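As a rough illustration (not part of the patch), a pattern match over some complex type extractor expression `expr` might use this extractor to bind the child expression and the selected StructField rather than the field ordinal:

// `expr` is assumed to be an arbitrary Catalyst Expression.
expr match {
  case GetStructFieldObject(child, field) =>
    println(s"accesses field '${field.name}: ${field.dataType}' of $child")
  case _ =>
    println("not a struct field access")
}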
@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.planning

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

/**
* A Scala extractor that projects an expression over a given schema. Data types,
* field indexes and field counts of complex type extractors and attributes
* are adjusted to fit the schema. All other expressions are left as-is. This
* class is motivated by columnar nested schema pruning.
*/
case class ProjectionOverSchema(schema: StructType) {

Member: This still looks weird that we place this under catalyst since we currently only use it under execution.

Contributor Author: Okay. So...

Member (@HyukjinKwon, Jul 25, 2018): can we move it to execution?

Contributor: We can move this to sql.execution if we move all three classes: ProjectionOverSchema, GetStructFieldObject, and SelectedField. Is there a difference in the catalyst.planning vs the execution packages?

  private val fieldNames = schema.fieldNames.toSet

  def unapply(expr: Expression): Option[Expression] = getProjection(expr)

  private def getProjection(expr: Expression): Option[Expression] =
    expr match {
      case a @ AttributeReference(name, _, _, _) if (fieldNames.contains(name)) =>
        Some(a.copy(dataType = schema(name).dataType)(a.exprId, a.qualifier))
      case GetArrayItem(child, arrayItemOrdinal) =>
        getProjection(child).map {
          case projection =>
            GetArrayItem(projection, arrayItemOrdinal)

Member: nit:

getProjection(child).map { projection => GetArrayItem(projection, arrayItemOrdinal) }

        }
      case GetArrayStructFields(child, StructField(name, _, _, _), _, numFields, containsNull) =>

Member: I would avoid a pattern like StructField(name, _, _, _), _, to be honest, when it looks excessive (per https://github.com/databricks/scala-style-guide#pattern-matching).

        getProjection(child).map(p => (p, p.dataType)).map {
          case (projection, ArrayType(projSchema @ StructType(_), _)) =>
            GetArrayStructFields(projection,
              projSchema(name), projSchema.fieldIndex(name), projSchema.size, containsNull)
        }
      case GetMapValue(child, key) =>
        getProjection(child).map {
          case projection =>

Member: nit:

getProjection(child).map { projection => GetMapValue(projection, key) }

            GetMapValue(projection, key)
        }
      case GetStructFieldObject(child, StructField(name, _, _, _)) =>
        getProjection(child).map(p => (p, p.dataType)).map {
          case (projection, projSchema @ StructType(_)) =>
            GetStructField(projection, projSchema.fieldIndex(name))
        }
      case _ =>
        None
    }
}
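A minimal sketch of how this extractor is meant to be applied (hypothetical names; the rule that actually drives it is the Parquet schema pruning rule added elsewhere in this PR). Given a pruned schema and an expression built against the original relation output, the expression is rewritten top-down so that field ordinals, field counts and data types line up with the pruned schema:

import org.apache.spark.sql.types._

// A pruned schema keeping only `name.first`.
val prunedSchema = StructType(Seq(
  StructField("name", StructType(Seq(StructField("first", StringType))))))
val projectionOverSchema = ProjectionOverSchema(prunedSchema)

// `expr` is assumed to be an expression over the unpruned relation output.
val rewritten = expr.transformDown {
  case projectionOverSchema(adjusted) => adjusted
}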
@@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.planning

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

/**
* A Scala extractor that builds a [[org.apache.spark.sql.types.StructField]] from a Catalyst
* complex type extractor. For example, consider a relation with the following schema:
*
* {{{
* root
* |-- name: struct (nullable = true)
* | |-- first: string (nullable = true)
* | |-- last: string (nullable = true)
* }}}
*
* Further, suppose we take the select expression `name.first`. This will parse into an
* `Alias(child, "first")`. Ignoring the alias, `child` matches the following pattern:
*
* {{{
* GetStructFieldObject(
* AttributeReference("name", StructType(_), _, _),
* StructField("first", StringType, _, _))
* }}}
*
* [[SelectedField]] converts that expression into
*
* {{{
* StructField("name", StructType(Array(StructField("first", StringType))))
* }}}
*
* by mapping each complex type extractor to a [[org.apache.spark.sql.types.StructField]] with the
* same name as its child (or "parent" going right to left in the select expression) and a data
* type appropriate to the complex type extractor. In our example, the name of the child expression
* is "name" and its data type is a [[org.apache.spark.sql.types.StructType]] with a single string
* field named "first".
*
* @param expr the top-level complex type extractor
*/
object SelectedField {
  def unapply(expr: Expression): Option[StructField] = {

Contributor: nit: Expression -> ExtractValue?

Contributor Author: The code does not compile with that change.

Contributor:

Error:(61, 12) constructor cannot be instantiated to expected type;
 found   : org.apache.spark.sql.catalyst.expressions.Alias
 required: org.apache.spark.sql.catalyst.expressions.ExtractValue
      case Alias(child, _) => child

Alias takes: Alias(child: Expression, name: String)

    // If this expression is an alias, work on its child instead
    val unaliased = expr match {
      case Alias(child, _) => child
      case expr => expr
    }
    selectField(unaliased, None)
  }

  private def selectField(expr: Expression, fieldOpt: Option[StructField]): Option[StructField] = {
    expr match {
      // No children. Returns a StructField with the attribute name or None if fieldOpt is None.
      case AttributeReference(name, dataType, nullable, metadata) =>
        fieldOpt.map(field =>
          StructField(name, wrapStructType(dataType, field), nullable, metadata))
      // Handles case "expr0.field[n]", where "expr0" is of struct type and "expr0.field" is of
      // array type.
      case GetArrayItem(x @ GetStructFieldObject(child, field @ StructField(name,
          dataType, nullable, metadata)), _) =>
        val childField = fieldOpt.map(field => StructField(name,
          wrapStructType(dataType, field), nullable, metadata)).getOrElse(field)
        selectField(child, Some(childField))
      // Handles case "expr0.field[n]", where "expr0.field" is of array type.
      case GetArrayItem(child, _) =>
        selectField(child, fieldOpt)
      // Handles case "expr0.field.subfield", where "expr0" and "expr0.field" are of array type.
      case GetArrayStructFields(child: GetArrayStructFields,
          field @ StructField(name, dataType, nullable, metadata), _, _, _) =>
        val childField = fieldOpt.map(field => StructField(name,
          wrapStructType(dataType, field),
          nullable, metadata)).getOrElse(field)
        selectField(child, Some(childField))
      // Handles case "expr0.field", where "expr0" is of array type.
      case GetArrayStructFields(child,
          field @ StructField(name, dataType, nullable, metadata), _, _, containsNull) =>
        val childField =
          fieldOpt.map(field => StructField(name,
            wrapStructType(dataType, field),
            nullable, metadata)).getOrElse(field)
        selectField(child, Some(childField))
      // Handles case "expr0.field[key]", where "expr0" is of struct type and "expr0.field" is of
      // map type.
      case GetMapValue(x @ GetStructFieldObject(child, field @ StructField(name,
          dataType,
          nullable, metadata)), _) =>
        val childField = fieldOpt.map(field => StructField(name,
          wrapStructType(dataType, field),
          nullable, metadata)).getOrElse(field)
        selectField(child, Some(childField))
      // Handles case "expr0.field[key]", where "expr0.field" is of map type.
      case GetMapValue(child, _) =>
        selectField(child, fieldOpt)
      // Handles case "expr0.field", where expr0 is of struct type.
      case GetStructFieldObject(child,
          field @ StructField(name, dataType, nullable, metadata)) =>
        val childField = fieldOpt.map(field => StructField(name,
          wrapStructType(dataType, field),
          nullable, metadata)).getOrElse(field)
        selectField(child, Some(childField))
      case _ =>
        None
    }
  }

  // Constructs a composition of complex types with a StructType(Array(field)) at its core. Returns
  // a StructType for a StructType, an ArrayType for an ArrayType and a MapType for a MapType.
  private def wrapStructType(dataType: DataType, field: StructField): DataType = {
    dataType match {
      case _: StructType =>
        StructType(Array(field))
      case ArrayType(elementType, containsNull) =>
        ArrayType(wrapStructType(elementType, field), containsNull)
      case MapType(keyType, valueType, valueContainsNull) =>
        MapType(keyType, wrapStructType(valueType, field), valueContainsNull)
    }
  }
}
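A small sketch mirroring the example in the doc comment above (hypothetical; `expr` is assumed to be the parsed, possibly aliased expression for the select item `name.first`):

val prunedField: Option[StructField] = expr match {
  // field == StructField("name", StructType(Array(StructField("first", StringType))))
  case SelectedField(field) => Some(field)
  case _ => None
}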
@@ -81,8 +81,9 @@ trait ConstraintHelper

  /**
   * Infers a set of `isNotNull` constraints from null intolerant expressions as well as
-  * non-nullable attributes. For e.g., if an expression is of the form (`a > 5`), this
-  * returns a constraint of the form `isNotNull(a)`
+  * non-nullable attributes and complex type extractors. For example, if an expression is of the
+  * form (`a > 5`), this returns a constraint of the form `isNotNull(a)`. For an expression of the
+  * form (`a.b > 5`), this returns the more precise constraint `isNotNull(a.b)`.
   */
  def constructIsNotNullConstraints(
      constraints: Set[Expression],
@@ -99,27 +100,28 @@
  }

  /**
-  * Infer the Attribute-specific IsNotNull constraints from the null intolerant child expressions
-  * of constraints.
+  * Infer the Attribute and ExtractValue-specific IsNotNull constraints from the null intolerant
+  * child expressions of constraints.
   */
  private def inferIsNotNullConstraints(constraint: Expression): Seq[Expression] =
    constraint match {
      // When the root is IsNotNull, we can push IsNotNull through the child null intolerant
      // expressions
-     case IsNotNull(expr) => scanNullIntolerantAttribute(expr).map(IsNotNull(_))
+     case IsNotNull(expr) => scanNullIntolerantField(expr).map(IsNotNull(_))
      // Constraints always return true for all the inputs. That means, null will never be returned.
      // Thus, we can infer `IsNotNull(constraint)`, and also push IsNotNull through the child
      // null intolerant expressions.
-     case _ => scanNullIntolerantAttribute(constraint).map(IsNotNull(_))
+     case _ => scanNullIntolerantField(constraint).map(IsNotNull(_))
    }

  /**
-  * Recursively explores the expressions which are null intolerant and returns all attributes
-  * in these expressions.
+  * Recursively explores the expressions which are null intolerant and returns all attributes and
+  * complex type extractors in these expressions.
   */
- private def scanNullIntolerantAttribute(expr: Expression): Seq[Attribute] = expr match {
+ private def scanNullIntolerantField(expr: Expression): Seq[Expression] = expr match {
+   case ev: ExtractValue => Seq(ev)

Member: For this improvement, can we do it in a separate PR? The corresponding unit test cases are needed in InferFiltersFromConstraintsSuite instead of ParquetSchemaPruningSuite.

Contributor Author (@mallman, May 24, 2018): I agree adding more direct and independent test coverage for this change is a good idea. However, omitting this change will weaken the capabilities of this PR. It would also imply the removal of the failing test case in ParquetSchemaPruningSuite, which would imply two follow-on PRs. The first would be to add this specific change plus the right test coverage. The next would be to restore the test case removed from ParquetSchemaPruningSuite.

Let me suggest an alternative. As this change is a valuable enhancement for this PR, let me try adding an appropriate test case in InferFiltersFromConstraintsSuite as part of this PR. That will eliminate the requirement for two more follow-on PRs.

Member: Because of this change, we also need to change the code in basicPhysicalOperators.scala. I do not think this is the right solution. More importantly, the changes in basicPhysicalOperators.scala might break the others. We need a separate PR for these changes. Please remove the changes made in basicPhysicalOperators.scala and QueryPlanConstraints.scala.

    case a: Attribute => Seq(a)
-   case _: NullIntolerant => expr.children.flatMap(scanNullIntolerantAttribute)
+   case _: NullIntolerant => expr.children.flatMap(scanNullIntolerantField)
    case _ => Seq.empty[Attribute]
  }
}
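For intuition, a rough sketch of the intended effect (hypothetical table and column names; this is not code from the patch):

// Assume a table `people` with a column `person: struct<name: string, age: int>`.
import spark.implicits._
val q = spark.table("people").filter($"person.age" > 21)
println(q.queryExecution.optimizedPlan)
// With complex type extractors treated as null-intolerant fields, the Filter
// carries a constraint on person.age rather than only on person, which is what
// lets a columnar scan satisfy the filter by reading just the nested leaf.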
@@ -1381,8 +1381,18 @@ object SQLConf {
"issues. Turn on this config to insert a local sort before actually doing repartition " +
"to generate consistent repartition results. The performance of repartition() may go " +
"down since we insert extra local sort before it.")
.booleanConf
.createWithDefault(true)

val NESTED_SCHEMA_PRUNING_ENABLED =
buildConf("spark.sql.nestedSchemaPruning.enabled")
.internal()
.doc("Prune nested fields from a logical relation's output which are unnecessary in " +
"satisfying a query. This optimization allows columnar file format readers to avoid " +
"reading unnecessary nested column data. Currently Parquet is the only data source that " +

Member: How about ORC?

cc @dongjoon-hyun Do you know whether it is also doable in the latest ORC version?

Member (@dongjoon-hyun): Thank you for pinging me, @gatorsmile. Let me check it.

Contributor Author: ORC should be able to support this capability as well, but this PR does not address that.

"implements this optimization.")
.booleanConf
.createWithDefault(true)
.createWithDefault(false)

@DaimonPl (Jun 15, 2018): how about enabling it as default? there should be enough time to find any unexpected problems with 2.4.0

additionally nested column pruning would be enabled during all other automatic tests

Contributor: +1

Contributor Author: I'm against enabling this feature by default with a known failing test case. For example, https://github.com/apache/spark/pull/21320/files#diff-0c6c7481232e9637b91c179f1005426aR71.


  val TOP_K_SORT_FALLBACK_THRESHOLD =
    buildConf("spark.sql.execution.topKSortFallbackThreshold")
@@ -1863,6 +1873,8 @@ class SQLConf extends Serializable with Logging {
  def partitionOverwriteMode: PartitionOverwriteMode.Value =
    PartitionOverwriteMode.withName(getConf(PARTITION_OVERWRITE_MODE))

  def nestedSchemaPruningEnabled: Boolean = getConf(NESTED_SCHEMA_PRUNING_ENABLED)

  def csvColumnPruning: Boolean = getConf(SQLConf.CSV_PARSER_COLUMN_PRUNING)

  def legacySizeOfNull: Boolean = getConf(SQLConf.LEGACY_SIZE_OF_NULL)
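For reference, a minimal sketch of exercising the new flag from a Spark session (hypothetical path; the flag is internal and its default changed back and forth during review):

// Enable nested schema pruning for the current session.
spark.conf.set("spark.sql.nestedSchemaPruning.enabled", "true")

// With pruning on, selecting a single struct leaf should narrow the Parquet
// read schema to just that leaf, which is visible in the scan's ReadSchema.
spark.read.parquet("/tmp/contacts").select("name.first").explain()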
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.internal.SQLConf.NESTED_SCHEMA_PRUNING_ENABLED

/**
* A PlanTest that ensures that all tests in this suite are run with nested schema pruning enabled.
* Remove this trait once the default value of SQLConf.NESTED_SCHEMA_PRUNING_ENABLED is set to true.
*/
private[sql] trait SchemaPruningTest extends PlanTest with BeforeAndAfterAll {
  private var originalConfSchemaPruningEnabled = false

  override protected def beforeAll(): Unit = {
    // Call `withSQLConf` eagerly because some subtypes of `PlanTest` (I'm looking at you,
    // `SQLTestUtils`) override `withSQLConf` to reset the existing `SQLConf` with a new one without
    // copying existing settings first. This here is an awful, ugly way to get around that behavior
    // by initializing the "real" `SQLConf` with a noop call to `withSQLConf`. I don't want to risk
    // "fixing" the downstream behavior, breaking everything else that's expecting these semantics.
    // Oh well...
    withSQLConf()(())
    originalConfSchemaPruningEnabled = conf.nestedSchemaPruningEnabled
    conf.setConf(NESTED_SCHEMA_PRUNING_ENABLED, true)
    super.beforeAll()
  }

  override protected def afterAll(): Unit = {
    try {
      super.afterAll()
    } finally {
      conf.setConf(NESTED_SCHEMA_PRUNING_ENABLED, originalConfSchemaPruningEnabled)
    }
  }
}
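A sketch of how a suite might opt in to this trait (hypothetical suite name and test; assumes the usual Spark SQL test mixins):

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.SchemaPruningTest
import org.apache.spark.sql.functions.struct
import org.apache.spark.sql.test.SharedSQLContext

class ExamplePruningSuite extends QueryTest with SharedSQLContext with SchemaPruningTest {
  import testImplicits._

  test("nested field reads correctly with pruning forced on") {
    val df = Seq(("Jane", "Doe")).toDF("first", "last")
      .select(struct($"first", $"last").as("name"))
    checkAnswer(df.select("name.first"), Row("Jane"))
  }
}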