Commits (52)
2ca2c38  init commit (lianhuiwang, Jun 3, 2016)
edea710  fix unit test (lianhuiwang, Jun 3, 2016)
8426522  Merge branch 'apache-master' into metadata-only (lianhuiwang, Jun 3, 2016)
153293e  fix unit test (lianhuiwang, Jun 3, 2016)
7dfb743  update (lianhuiwang, Jun 24, 2016)
68e6d6d  Revert "fix unit test" (lianhuiwang, Jun 24, 2016)
595ef36  Revert "fix unit test" (lianhuiwang, Jun 24, 2016)
7d7ece0  Merge branch 'apache-master' into metadata-only (lianhuiwang, Jun 24, 2016)
2e55a9d  Merge branch 'apache-master' into metadata-only (lianhuiwang, Jun 24, 2016)
b2b6eba  update (lianhuiwang, Jun 24, 2016)
c5a291e  Merge branch 'apache-master' into metadata-only (lianhuiwang, Jun 24, 2016)
6404c1f  update opt for core (lianhuiwang, Jun 24, 2016)
1bb5812  refactor (lianhuiwang, Jun 24, 2016)
7e3729e  add ut (lianhuiwang, Jun 24, 2016)
fbf5d61  fix ut (lianhuiwang, Jun 24, 2016)
3411fd6  fix project (lianhuiwang, Jun 26, 2016)
aefab7f  address comments (lianhuiwang, Jun 27, 2016)
c5ccdea  fix cube/rollup (lianhuiwang, Jun 27, 2016)
ae6cf9f  fix style (lianhuiwang, Jun 27, 2016)
159331b  refactor (lianhuiwang, Jun 27, 2016)
3a1438b  refactor (lianhuiwang, Jun 28, 2016)
c0a7d59  update (lianhuiwang, Jun 29, 2016)
a4045ca  add comments (lianhuiwang, Jun 29, 2016)
0a023e7  fix minor (lianhuiwang, Jun 29, 2016)
a9b38ab  rename (lianhuiwang, Jun 29, 2016)
a5ea995  update (lianhuiwang, Jun 29, 2016)
1bed08d  fix monir (lianhuiwang, Jun 29, 2016)
a22e962  refactor (lianhuiwang, Jul 1, 2016)
41fef2c  update (lianhuiwang, Jul 1, 2016)
bd53678  Merge branch 'apache-master' into metadata-only (lianhuiwang, Jul 1, 2016)
88f7308  update (lianhuiwang, Jul 1, 2016)
2568193  add ut (lianhuiwang, Jul 1, 2016)
26a97f4  address comments (lianhuiwang, Jul 4, 2016)
4297f9f  update name (lianhuiwang, Jul 6, 2016)
1a65aa7  address comments (lianhuiwang, Jul 6, 2016)
d5e0df4  update (lianhuiwang, Jul 6, 2016)
9d6dd76  update2 (lianhuiwang, Jul 6, 2016)
9cb01d8  update (lianhuiwang, Jul 6, 2016)
3e2687d  doc improve (cloud-fan, Jul 6, 2016)
2b4faf3  update (cloud-fan, Jul 7, 2016)
88fd3bf  Merge pull request #2 from cloud-fan/metadata-only (lianhuiwang, Jul 7, 2016)
a894bb7  delete cases (lianhuiwang, Jul 7, 2016)
9546b40  Merge branch 'metadata-only' of https://github.com/lianhuiwang/spark … (lianhuiwang, Jul 7, 2016)
85b695b  update ut (lianhuiwang, Jul 7, 2016)
bcfe8e5  Merge branch 'master' of https://github.com/apache/spark into metadat… (lianhuiwang, Jul 7, 2016)
67211be  Merge branch 'master' of https://github.com/apache/spark into metadat… (lianhuiwang, Jul 7, 2016)
501f93b  address commetns (lianhuiwang, Jul 11, 2016)
8ee2a8c  refactor (lianhuiwang, Jul 11, 2016)
d888c85  fix minor (lianhuiwang, Jul 11, 2016)
ff16509  update (lianhuiwang, Jul 12, 2016)
358ad13  remove duplicate code (lianhuiwang, Jul 12, 2016)
030776a  fix minor (lianhuiwang, Jul 12, 2016)
DataSourceScanExec.scala
@@ -340,7 +340,8 @@ private[sql] object DataSourceScanExec {
rdd: RDD[InternalRow],
relation: BaseRelation,
metadata: Map[String, String] = Map.empty,
metastoreTableIdentifier: Option[TableIdentifier] = None): DataSourceScanExec = {
metastoreTableIdentifier: Option[TableIdentifier] = None,
isSupportBatch: Boolean = true): DataSourceScanExec = {
val outputPartitioning = {
val bucketSpec = relation match {
// TODO: this should be closer to bucket planning.
@@ -364,7 +365,8 @@

relation match {
case r: HadoopFsRelation
if r.fileFormat.supportBatch(r.sparkSession, StructType.fromAttributes(output)) =>
if isSupportBatch &&
r.fileFormat.supportBatch(r.sparkSession, StructType.fromAttributes(output)) =>
BatchedDataSourceScanExec(
output, rdd, relation, outputPartitioning, metadata, metastoreTableIdentifier)
case _ =>
FileSourceStrategy.scala
@@ -22,9 +22,11 @@ import scala.collection.mutable.ArrayBuffer
import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.{expressions, InternalRow}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.DataSourceScanExec
@@ -109,108 +111,45 @@ private[sql] object FileSourceStrategy extends Strategy with Logging {
val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter)
logInfo(s"Pushed Filters: ${pushedDownFilters.mkString(",")}")

val readFile = files.fileFormat.buildReaderWithPartitionValues(
sparkSession = files.sparkSession,
dataSchema = files.dataSchema,
partitionSchema = files.partitionSchema,
requiredSchema = prunedDataSchema,
filters = pushedDownFilters,
options = files.options,
hadoopConf = files.sparkSession.sessionState.newHadoopConfWithOptions(files.options))

val plannedPartitions = files.bucketSpec match {
case Some(bucketing) if files.sparkSession.sessionState.conf.bucketingEnabled =>
logInfo(s"Planning with ${bucketing.numBuckets} buckets")
val bucketed =
selectedPartitions.flatMap { p =>
p.files.map { f =>
val hosts = getBlockHosts(getBlockLocations(f), 0, f.getLen)
PartitionedFile(p.values, f.getPath.toUri.toString, 0, f.getLen, hosts)
}
}.groupBy { f =>
BucketingUtils
.getBucketId(new Path(f.filePath).getName)
.getOrElse(sys.error(s"Invalid bucket file ${f.filePath}"))
}

(0 until bucketing.numBuckets).map { bucketId =>
FilePartition(bucketId, bucketed.getOrElse(bucketId, Nil))
}

case _ =>
val defaultMaxSplitBytes = files.sparkSession.sessionState.conf.filesMaxPartitionBytes
val openCostInBytes = files.sparkSession.sessionState.conf.filesOpenCostInBytes
val defaultParallelism = files.sparkSession.sparkContext.defaultParallelism
val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum
val bytesPerCore = totalBytes / defaultParallelism
val maxSplitBytes = Math.min(defaultMaxSplitBytes,
Math.max(openCostInBytes, bytesPerCore))
logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " +
s"open cost is considered as scanning $openCostInBytes bytes.")

val splitFiles = selectedPartitions.flatMap { partition =>
partition.files.flatMap { file =>
val blockLocations = getBlockLocations(file)
(0L until file.getLen by maxSplitBytes).map { offset =>
val remaining = file.getLen - offset
val size = if (remaining > maxSplitBytes) maxSplitBytes else remaining
val hosts = getBlockHosts(blockLocations, offset, size)
PartitionedFile(partition.values, file.getPath.toUri.toString, offset, size, hosts)
}
}
}.toArray.sortBy(_.length)(implicitly[Ordering[Long]].reverse)

val partitions = new ArrayBuffer[FilePartition]
val currentFiles = new ArrayBuffer[PartitionedFile]
var currentSize = 0L

/** Add the given file to the current partition. */
def addFile(file: PartitionedFile): Unit = {
currentSize += file.length + openCostInBytes
currentFiles.append(file)
}

/** Close the current partition and move to the next. */
def closePartition(): Unit = {
if (currentFiles.nonEmpty) {
val newPartition =
FilePartition(
partitions.size,
currentFiles.toArray.toSeq) // Copy to a new Array.
partitions.append(newPartition)
}
currentFiles.clear()
currentSize = 0
}

// Assign files to partitions using "First Fit Decreasing" (FFD)
// TODO: consider adding a slop factor here?
splitFiles.foreach { file =>
if (currentSize + file.length > maxSplitBytes) {
closePartition()
}
addFile(file)
}
closePartition()
partitions
val optimizerMetadataOnly =
readDataColumns.isEmpty && files.sparkSession.sessionState.conf.optimizerMetadataOnly
val scanRdd: RDD[InternalRow] = if (optimizerMetadataOnly) {
val partitionSchema = files.partitionSchema.toAttributes
lazy val converter = GenerateUnsafeProjection.generate(partitionSchema, partitionSchema)
val partitionValues = selectedPartitions.map(_.values)
files.sqlContext.sparkContext.parallelize(partitionValues, 1).map(converter(_))
Contributor: what if this partition has more than one data file?

Contributor Author: In this PR the default of spark.sql.optimizer.metadataOnly is false, so if a user needs this feature, they have to set spark.sql.optimizer.metadataOnly=true explicitly.

Contributor: I think the optimizer should never affect the correctness of the query result. If this optimization is too hard to implement with the current code base, we should improve the code base first instead of rushing in a partial implementation.

Contributor Author: Yes, I will rethink this and add a metadataOnly optimizer rule to the optimizer list. Thanks.
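As an aside, a minimal sketch of the correctness concern raised above (illustrative only, not part of the patch; the table t, its partition column p1, the row counts, and an active SparkSession named spark are all assumptions):

// Suppose t is partitioned by p1 and partition p1=1 holds two data files of 10 rows each.
// Safe for a one-row-per-partition scan: the answer depends only on distinct partition values.
spark.sql("SELECT DISTINCT p1 FROM t").show()                 // 1

// Not safe: no data columns are read (readDataColumns is empty), so this plan would take the
// metadata-only path, yet the correct answer depends on how many rows the data files contain.
spark.sql("SELECT p1, count(*) FROM t GROUP BY p1").show()    // should be (1, 20), not (1, 1)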

} else {
val readFile = files.fileFormat.buildReaderWithPartitionValues(
sparkSession = files.sparkSession,
dataSchema = files.dataSchema,
partitionSchema = files.partitionSchema,
requiredSchema = prunedDataSchema,
filters = pushedDownFilters,
options = files.options,
hadoopConf = files.sparkSession.sessionState.newHadoopConfWithOptions(files.options))

val plannedPartitions = getFilePartitions(files, selectedPartitions)
new FileScanRDD(
files.sparkSession,
readFile,
plannedPartitions)
}

val meta = Map(
"Format" -> files.fileFormat.toString,
"ReadSchema" -> prunedDataSchema.simpleString,
"metadataOnly" -> optimizerMetadataOnly.toString,
PUSHED_FILTERS -> pushedDownFilters.mkString("[", ", ", "]"),
INPUT_PATHS -> files.location.paths.mkString(", "))

val scan =
DataSourceScanExec.create(
readDataColumns ++ partitionColumns,
new FileScanRDD(
files.sparkSession,
readFile,
plannedPartitions),
scanRdd,
files,
meta,
table)
table,
!optimizerMetadataOnly)

val afterScanFilter = afterScanFilters.toSeq.reduceOption(expressions.And)
val withFilter = afterScanFilter.map(execution.FilterExec(_, scan)).getOrElse(scan)
@@ -225,6 +164,85 @@ private[sql] object FileSourceStrategy extends Strategy with Logging {
case _ => Nil
}

private def getFilePartitions(
files: HadoopFsRelation,
selectedPartitions: Seq[Partition]): Seq[FilePartition] = files.bucketSpec match {
case Some(bucketing) if files.sparkSession.sessionState.conf.bucketingEnabled =>
logInfo(s"Planning with ${bucketing.numBuckets} buckets")
val bucketed =
selectedPartitions.flatMap { p =>
p.files.map { f =>
val hosts = getBlockHosts(getBlockLocations(f), 0, f.getLen)
PartitionedFile(p.values, f.getPath.toUri.toString, 0, f.getLen, hosts)
}
}.groupBy { f =>
BucketingUtils
.getBucketId(new Path(f.filePath).getName)
.getOrElse(sys.error(s"Invalid bucket file ${f.filePath}"))
}

(0 until bucketing.numBuckets).map { bucketId =>
FilePartition(bucketId, bucketed.getOrElse(bucketId, Nil))
}

case _ =>
val defaultMaxSplitBytes = files.sparkSession.sessionState.conf.filesMaxPartitionBytes
val openCostInBytes = files.sparkSession.sessionState.conf.filesOpenCostInBytes
val defaultParallelism = files.sparkSession.sparkContext.defaultParallelism
val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum
val bytesPerCore = totalBytes / defaultParallelism
val maxSplitBytes = Math.min(defaultMaxSplitBytes,
Math.max(openCostInBytes, bytesPerCore))
logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " +
s"open cost is considered as scanning $openCostInBytes bytes.")

val splitFiles = selectedPartitions.flatMap { partition =>
partition.files.flatMap { file =>
val blockLocations = getBlockLocations(file)
(0L until file.getLen by maxSplitBytes).map { offset =>
val remaining = file.getLen - offset
val size = if (remaining > maxSplitBytes) maxSplitBytes else remaining
val hosts = getBlockHosts(blockLocations, offset, size)
PartitionedFile(partition.values, file.getPath.toUri.toString, offset, size, hosts)
}
}
}.toArray.sortBy(_.length)(implicitly[Ordering[Long]].reverse)

val partitions = new ArrayBuffer[FilePartition]
val currentFiles = new ArrayBuffer[PartitionedFile]
var currentSize = 0L

/** Add the given file to the current partition. */
def addFile(file: PartitionedFile): Unit = {
currentSize += file.length + openCostInBytes
currentFiles.append(file)
}

/** Close the current partition and move to the next. */
def closePartition(): Unit = {
if (currentFiles.nonEmpty) {
val newPartition =
FilePartition(
partitions.size,
currentFiles.toArray.toSeq) // Copy to a new Array.
partitions.append(newPartition)
}
currentFiles.clear()
currentSize = 0
}

// Assign files to partitions using "First Fit Decreasing" (FFD)
// TODO: consider adding a slop factor here?
splitFiles.foreach { file =>
if (currentSize + file.length > maxSplitBytes) {
closePartition()
}
addFile(file)
}
closePartition()
partitions
}

private def getBlockLocations(file: FileStatus): Array[BlockLocation] = file match {
case f: LocatedFileStatus => f.getBlockLocations
case f => Array.empty[BlockLocation]
SQLConf.scala
@@ -258,6 +258,11 @@ object SQLConf {
.booleanConf
.createWithDefault(false)

val OPTIMIZER_METADATA_ONLY = SQLConfigBuilder("spark.sql.optimizer.metadataOnly")
.doc("When true, enable the metadata-only query optimization.")
Contributor: Please update the doc to explain what a metadata-only query means.

.booleanConf
.createWithDefault(false)
Contributor: Can we turn it on by default?
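To illustrate the doc request above, a sketch of what a metadata-only query means in practice (assumptions, not part of the patch: a SparkSession named spark and a table t partitioned by ds and hr). These queries touch only partitioning columns, so they can in principle be answered from the catalog's partition metadata without reading any data files:

spark.conf.set("spark.sql.optimizer.metadataOnly", "true")     // off by default in this PR
spark.sql("SELECT DISTINCT ds FROM t")                         // distinct partition values
spark.sql("SELECT MAX(ds) FROM t")                             // MIN/MAX over a partition column
spark.sql("SELECT ds, COUNT(DISTINCT hr) FROM t GROUP BY ds")  // distinct aggregate on partition columns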


val NATIVE_VIEW = SQLConfigBuilder("spark.sql.nativeView")
.internal()
.doc("When true, CREATE VIEW will be handled by Spark SQL instead of Hive native commands. " +
@@ -599,6 +604,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf with Logging {

def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)

def optimizerMetadataOnly: Boolean = getConf(OPTIMIZER_METADATA_ONLY)

def nativeView: Boolean = getConf(NATIVE_VIEW)

def wholeStageEnabled: Boolean = getConf(WHOLESTAGE_CODEGEN_ENABLED)
FileSourceStrategySuite.scala
@@ -340,6 +340,21 @@ class FileSourceStrategySuite extends QueryTest with SharedSQLContext with Predi
}
}

test("optimize metadataOnly") {
withSQLConf("spark.sql.optimizer.metadataOnly" -> "true") {
val table =
createTable(
files = Seq(
"p1=1/file1" -> 10,
"p1=2/file2" -> 10))

checkDataset(table.select($"p1"), Row(1), Row(2))
checkDataset(table.where("p1 = 1").select($"p1"), Row(1))
val df = table.where("p1 = 1 AND (p1 + c1) = 2 AND c1 = 1")
assert(getPhysicalFilters(df) contains resolve(df, "c1 = 1"))
}
}

// Helpers for checking the arguments passed to the FileFormat.

protected val checkPartitionSchema =
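Beyond the unit test, a hedged end-to-end usage sketch of the new flag (the table name, partition column, and app name are assumptions, not from the patch):

import org.apache.spark.sql.SparkSession

// Assumes a partitioned table t with partition column p1 already exists in the catalog.
val spark = SparkSession.builder().appName("metadata-only-demo").getOrCreate()
import spark.implicits._

spark.conf.set("spark.sql.optimizer.metadataOnly", "true")
val distinctPartitions = spark.table("t").select($"p1").distinct()
// With the flag on and no data columns read, the scan's metadata map gains
// "metadataOnly" -> "true", which should show up in the explain output.
distinctPartitions.explain()
distinctPartitions.show()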