[SPARK-26327][SQL][FOLLOW-UP] Refactor the code and restore the metrics name #23328

Changes from 2 commits
```diff
@@ -17,7 +17,7 @@
 package org.apache.spark.sql.execution
 
-import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.{ArrayBuffer, HashMap}
 
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}
```
|
```diff
@@ -167,14 +167,26 @@ case class FileSourceScanExec(
       partitionSchema = relation.partitionSchema,
       relation.sparkSession.sessionState.conf)
 
-  private var fileListingTime = 0L
+  val driverMetrics: HashMap[String, Long] = HashMap.empty
+
+  /**
+   * Send the driver-side metrics. Before calling this function, selectedPartitions has
+   * been initialized. See SPARK-26327 for more details.
+   */
+  private def sendDriverMetrics(): Unit = {
+    driverMetrics.foreach(e => metrics(e._1).add(e._2))
+    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId,
+      metrics.filter(e => driverMetrics.contains(e._1)).values.toSeq)
+  }
 
   @transient private lazy val selectedPartitions: Seq[PartitionDirectory] = {
     val optimizerMetadataTimeNs = relation.location.metadataOpsTimeNs.getOrElse(0L)
     val startTime = System.nanoTime()
     val ret = relation.location.listFiles(partitionFilters, dataFilters)
+    driverMetrics("filesNum") = ret.map(_.files.size.toLong).sum
     val timeTakenMs = ((System.nanoTime() - startTime) + optimizerMetadataTimeNs) / 1000 / 1000
-    fileListingTime = timeTakenMs
+    driverMetrics("metadataTime") = timeTakenMs
     ret
   }
```
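The refactor replaces the single `fileListingTime` field with a `driverMetrics` map: driver-side values are recorded wherever they are computed and flushed to the registered `SQLMetric`s in one pass by `sendDriverMetrics()`. A minimal, self-contained Scala sketch of that pattern is below; the `Metric` class and the names used are illustrative stand-ins, not Spark's actual `SQLMetrics` API.

```scala
import scala.collection.mutable.HashMap

// Illustrative stand-in for an accumulator-backed SQL metric.
final class Metric(val description: String) {
  private var value = 0L
  def add(v: Long): Unit = value += v
  def get: Long = value
}

object DriverMetricsSketch {
  // Metrics registered up front, keyed by the same names used in driverMetrics.
  val metrics: Map[String, Metric] = Map(
    "numFiles" -> new Metric("number of files"),
    "metadataTime" -> new Metric("metadata time"))

  // Driver-side values accumulated as work happens (e.g. while listing files).
  val driverMetrics: HashMap[String, Long] = HashMap.empty

  // Flush everything collected so far into the registered metrics in one pass.
  def sendDriverMetrics(): Unit =
    driverMetrics.foreach { case (name, v) => metrics(name).add(v) }

  def main(args: Array[String]): Unit = {
    driverMetrics("numFiles") = 42L      // recorded during "file listing"
    driverMetrics("metadataTime") = 17L  // elapsed listing time in ms
    sendDriverMetrics()
    metrics.foreach { case (k, m) => println(s"$k (${m.description}) = ${m.get}") }
  }
}
```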
|
```diff
@@ -286,8 +298,6 @@ case class FileSourceScanExec(
   }
 
   private lazy val inputRDD: RDD[InternalRow] = {
-    // Update metrics for taking effect in both code generation node and normal node.
-    updateDriverMetrics()
     val readFile: (PartitionedFile) => Iterator[InternalRow] =
       relation.fileFormat.buildReaderWithPartitionValues(
         sparkSession = relation.sparkSession,
```
|
```diff
@@ -298,12 +308,14 @@
         options = relation.options,
         hadoopConf = relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options))
 
-    relation.bucketSpec match {
+    val readRDD = relation.bucketSpec match {
       case Some(bucketing) if relation.sparkSession.sessionState.conf.bucketingEnabled =>
         createBucketedReadRDD(bucketing, readFile, selectedPartitions, relation)
       case _ =>
         createNonBucketedReadRDD(readFile, selectedPartitions, relation)
     }
+    sendDriverMetrics()
```
**Member (Author):** Line 313 and line 315, both are calling
```diff
+    readRDD
   }
 
   override def inputRDDs(): Seq[RDD[InternalRow]] = {
```
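Per the doc comment on `sendDriverMetrics()`, the flush must happen only after `selectedPartitions` has been initialized, which is why the call sits after the `match` whose branches force that lazy val. A small Scala sketch of this ordering constraint, with made-up names, is shown below: the side effects inside a lazy val only run when it is first forced, so flushing earlier would see an empty map.

```scala
import scala.collection.mutable.HashMap

object LazyOrderingSketch {
  val driverMetrics: HashMap[String, Long] = HashMap.empty

  // Side effects inside a lazy val run only when the val is first forced.
  lazy val selectedPartitions: Seq[String] = {
    driverMetrics("numFiles") = 3L
    Seq("p=1", "p=2", "p=3")
  }

  def sendDriverMetrics(): Unit =
    println(s"flushing ${driverMetrics.size} metric(s): $driverMetrics")

  def main(args: Array[String]): Unit = {
    sendDriverMetrics()                 // too early: nothing has been recorded yet
    val partitions = selectedPartitions // forcing the lazy val populates the map
    println(s"listed ${partitions.size} partition(s)")
    sendDriverMetrics()                 // now the flush sees the recorded values
  }
}
```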
|
|
```diff
@@ -313,7 +325,7 @@ case class FileSourceScanExec(
   override lazy val metrics =
     Map("numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
       "numFiles" -> SQLMetrics.createMetric(sparkContext, "number of files"),
-      "fileListingTime" -> SQLMetrics.createMetric(sparkContext, "file listing time (ms)"),
+      "metadataTime" -> SQLMetrics.createMetric(sparkContext, "metadata time"),
```
**Member (Author):** The original name is more straightforward to end users who have no idea about file listing.

**Contributor:** I'm not sure we will add more metadata operations and reuse this metric. Anyway, it was not a good idea to do the renaming in a bug-fix PR. @xuanyuanking can you create a ticket and send a new PR for the renaming?

**Member:** Copy that; the original thinking was that this bug fix is part of https://issues.apache.org/jira/browse/SPARK-26222.
```diff
       "scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "scan time"))
 
   protected override def doExecute(): RDD[InternalRow] = {
```
|
```diff
@@ -504,19 +516,6 @@ case class FileSourceScanExec(
     }
   }
 
-  /**
-   * Send the updated metrics to driver, while this function calling, selectedPartitions has
```
**Member (Author):** This is wrong.
```diff
-   * been initialized. See SPARK-26327 for more detail.
-   */
-  private def updateDriverMetrics() = {
-    metrics("numFiles").add(selectedPartitions.map(_.files.size.toLong).sum)
-    metrics("fileListingTime").add(fileListingTime)
-
-    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
-    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId,
-      metrics("numFiles") :: metrics("fileListingTime") :: Nil)
-  }
 
   override def doCanonicalize(): FileSourceScanExec = {
     FileSourceScanExec(
       relation,
```
Hi, @gatorsmile. It looks like a typo of `numFiles`.