[SPARK-18120 ][SQL] Call QueryExecutionListener callback methods for … #16664
DataFrameWriter.scala
@@ -26,10 +26,12 @@ import org.apache.spark.sql.catalyst.TableIdentifier | |
| import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation} | ||
| import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogRelation, CatalogTable, CatalogTableType} | ||
| import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable | ||
| import org.apache.spark.sql.execution.QueryExecution | ||
| import org.apache.spark.sql.execution.command.DDLUtils | ||
| import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation} | ||
| import org.apache.spark.sql.sources.BaseRelation | ||
| import org.apache.spark.sql.types.StructType | ||
| import org.apache.spark.sql.util.{OutputParams, QueryExecutionListener} | ||
|
|
||
| /** | ||
| * Interface used to write a [[Dataset]] to external storage systems (e.g. file systems, | ||
|
|
@@ -189,6 +191,33 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| this | ||
| } | ||
|
|
||
| /** | ||
| * Executes the query and calls the {@link org.apache.spark.sql.util.QueryExecutionListener} | ||
| * methods. | ||
|
||
| * | ||
| * @param funcName An identifier for the method executing the query | ||
| * @param qe the @see [[QueryExecution]] object associated with the | ||
| * query | ||
|
||
| * @param outputParams The output parameters useful for query analysis | ||
| * @param action the function that executes the query after which the listener methods get | ||
| * called. | ||
| */ | ||
| private def executeAndCallQEListener( | ||
|
Member: How about renaming it to `withAction`?

Author: I believe you are saying rename the method executeAndCallQEListener to withAction?

Member: Yes.
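For readability, here is the helper from the diff below restated under the suggested name, as a sketch of the same logic (assuming the surrounding `DataFrameWriter` context with its `df` field and session listener manager):

```scala
// Sketch of the diffed helper renamed to withAction: time the action, then notify
// the session's listener manager of success or failure before rethrowing.
private def withAction(
    funcName: String,
    qe: QueryExecution,
    outputParams: OutputParams)(action: => Unit): Unit = {
  try {
    val start = System.nanoTime()
    action
    val end = System.nanoTime()
    df.sparkSession.listenerManager.onSuccess(funcName, qe, end - start, Some(outputParams))
  } catch {
    case e: Exception =>
      df.sparkSession.listenerManager.onFailure(funcName, qe, e, Some(outputParams))
      throw e
  }
}
```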
||
| funcName: String, | ||
|
||
| qe: QueryExecution, | ||
| outputParams: OutputParams)(action: => Unit) = { | ||
| try { | ||
| val start = System.nanoTime() | ||
|
|
||
| action | ||
| val end = System.nanoTime() | ||
| df.sparkSession.listenerManager.onSuccess(funcName, qe, end - start, Some(outputParams)) | ||
| } catch { | ||
| case e: Exception => | ||
| df.sparkSession.listenerManager.onFailure(funcName, qe, e, Some(outputParams)) | ||
| throw e | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Saves the content of the `DataFrame` at the specified path. | ||
| * | ||
|
|
@@ -218,7 +247,17 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| bucketSpec = getBucketSpec, | ||
| options = extraOptions.toMap) | ||
|
|
||
| dataSource.write(mode, df) | ||
| val destination = source match { | ||
| case "jdbc" => extraOptions.get("dbtable") | ||
|
||
| case _ => extraOptions.get("path") | ||
|
Member: For the external data source connectors, it might not have a `path` option.

Author: Yes, for methods like saveAsTable() there is no path. Do you see an issue here?

Member: It sounds like

Contributor: Being the person who requested this class instead of an opaque map, I think using an opaque map makes for a really bad user API. The listener now needs to know about "magic keys" that have special meaning, which can vary depending on the destination. So you end up making up some contract that certain keys have special meanings and all sources need to use them that way, so basically you end up encoding this class in a map. That being said, I'm not super happy with the way JDBC works, because there's still some information embedded in the map. I thought about it a little but didn't come up with a good solution; embedding the table name in the JDBC URI sounds hacky and brittle. Best one I got is a separate field in this class (e.g.

Member: I think we need to make it more general instead of introducing a class for the write path only.

Contributor: Yes, it is. e.g.

Contributor: Actually all the "magic keys" in the options used by

Contributor: That's good to know, but they only seem to be, at best, indirectly documented.

I agree that it needs a careful design and the current one doesn't cover all the options. But this PR is of very marginal value without this information being exposed in some way. If you guys feel strongly that it should be a map and that's it, I guess it will be hard to argue. Then we'll have to do that and document all the keys used internally by Spark and make them public, and promise ourselves that we'll never break them. My belief is that a more structured type would help here. Since the current code is obviously not enough, we could have something that's more future-proof, like the sketch below. Then listeners can easily handle future params by matching and handling the generic params. Anyway, my opinion is that a raw map is not a very good API, regardless of API stability; it's hard to use and easy to break. But I'll defer to you guys if you really don't like my suggestions.
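One possible shape for the "more future-proof" type mentioned above (an illustrative sketch only; the names are invented here and are not the commenter's actual proposal):

```scala
// Illustrative only: structured variants a listener can pattern-match on, plus a
// generic fallback so listeners keep working when new kinds of output appear.
sealed trait OutputInfo {
  def options: Map[String, String]
}
case class FileOutputInfo(format: String, path: String,
    options: Map[String, String]) extends OutputInfo
case class TableOutputInfo(format: String, table: String,
    options: Map[String, String]) extends OutputInfo
case class JdbcOutputInfo(url: String, table: String,
    options: Map[String, String]) extends OutputInfo
case class GenericOutputInfo(params: Map[String, String],
    options: Map[String, String]) extends OutputInfo
```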
Contributor: Yes, those are public APIs.
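To make the `destination` logic in this hunk concrete, a hedged usage sketch (the URL, table, and path values are made up; `df` is any DataFrame):

```scala
// JDBC write: destination comes from the "dbtable" option -> Some("public.orders").
df.write
  .format("jdbc")
  .option("url", "jdbc:postgresql://db.example.com/sales")
  .option("dbtable", "public.orders")
  .option("user", "etl")
  .option("password", "secret")
  .save()

// File-based write: destination comes from the "path" option -> Some("/tmp/orders_parquet").
df.write
  .format("parquet")
  .option("path", "/tmp/orders_parquet")
  .save()
```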
||
| } | ||
|
|
||
| executeAndCallQEListener( | ||
| "save", | ||
| df.queryExecution, | ||
| OutputParams(source, destination, extraOptions.toMap)) { | ||
|
||
| dataSource.write(mode, df) | ||
| } | ||
|
Member: Nit: the style issue.

    withAction("save", df.queryExecution, OutputParams(source, destination, extraOptions.toMap)) {
      dataSource.write(mode, df)
    }
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -244,6 +283,11 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| * | ||
| * Because it inserts data to an existing table, format or options will be ignored. | ||
| * | ||
| * Calls the callback methods of @see[[QueryExecutionListener]] after query execution with | ||
| * @see[[OutputParams]] having datasourceType set as the string parameter passed to the | ||
| * @see[[DataFrameWriter#format]] method and destination set as the name of the table into which | ||
| * data is being inserted. | ||
| * | ||
| * @since 1.4.0 | ||
| */ | ||
| def insertInto(tableName: String): Unit = { | ||
|
|
@@ -261,13 +305,19 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| ) | ||
| } | ||
|
|
||
| df.sparkSession.sessionState.executePlan( | ||
| val qe = df.sparkSession.sessionState.executePlan( | ||
| InsertIntoTable( | ||
| table = UnresolvedRelation(tableIdent), | ||
| partition = Map.empty[String, Option[String]], | ||
| child = df.logicalPlan, | ||
| overwrite = mode == SaveMode.Overwrite, | ||
| ifNotExists = false)).toRdd | ||
| ifNotExists = false)) | ||
| executeAndCallQEListener( | ||
| "insertInto", | ||
| qe, | ||
| new OutputParams(source, Some(tableIdent.unquotedString), extraOptions.toMap)) { | ||
| qe.toRdd | ||
| } | ||
|
||
| } | ||
|
|
||
| private def normalizedParCols: Option[Seq[String]] = partitioningColumns.map { cols => | ||
|
|
@@ -324,7 +374,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
|
|
||
| private def assertNotPartitioned(operation: String): Unit = { | ||
| if (partitioningColumns.isDefined) { | ||
| throw new AnalysisException( s"'$operation' does not support partitioning") | ||
| throw new AnalysisException(s"'$operation' does not support partitioning") | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -359,6 +409,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL | ||
| * specific format. | ||
| * | ||
| * Calls the callback methods of @see[[QueryExecutionListener]] after query execution with a | ||
| * @see[[OutputParams]] object having datasourceType set as the string parameter passed to the | ||
| * @see[[DataFrameWriter#format]] and destination set as the name of the table being | ||
| * written to | ||
| * @since 1.4.0 | ||
| */ | ||
| def saveAsTable(tableName: String): Unit = { | ||
|
|
@@ -428,8 +482,14 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| partitionColumnNames = partitioningColumns.getOrElse(Nil), | ||
| bucketSpec = getBucketSpec | ||
| ) | ||
| df.sparkSession.sessionState.executePlan( | ||
| CreateTable(tableDesc, mode, Some(df.logicalPlan))).toRdd | ||
| val qe = df.sparkSession.sessionState.executePlan( | ||
| CreateTable(tableDesc, mode, Some(df.logicalPlan))) | ||
| executeAndCallQEListener( | ||
| "saveAsTable", | ||
| qe, | ||
| new OutputParams(source, Some(tableIdent.unquotedString), extraOptions.toMap)) { | ||
|
||
| qe.toRdd | ||
|
||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -493,6 +553,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| * indicates a timestamp format. Custom date formats follow the formats at | ||
| * `java.text.SimpleDateFormat`. This applies to timestamp type.</li> | ||
| * </ul> | ||
| * Calls the callback methods in @see[[QueryExecutionListener]] methods after query execution with | ||
| * @see[[OutputParams]] having datasourceType set as string constant "json" and | ||
| * destination set as the path to which the data is written | ||
| * | ||
| * @since 1.4.0 | ||
| */ | ||
|
|
@@ -514,6 +577,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| * shorten names(none, `snappy`, `gzip`, and `lzo`). This will override | ||
| * `spark.sql.parquet.compression.codec`.</li> | ||
| * </ul> | ||
| * Calls the callback methods in @see[[QueryExecutionListener]] methods after query execution with | ||
| * @see[[OutputParams]] having datasourceType set as string constant "parquet" and | ||
| * destination set as the path to which the data is written | ||
|
||
| * | ||
| * @since 1.4.0 | ||
| */ | ||
|
|
@@ -534,6 +600,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| * one of the known case-insensitive shorten names(`none`, `snappy`, `zlib`, and `lzo`). | ||
| * This will override `orc.compress`.</li> | ||
| * </ul> | ||
| * Calls the callback methods in @see[[QueryExecutionListener]] methods after query execution with | ||
| * @see[[OutputParams]] having datasourceType set as string constant "orc" and | ||
| * destination set as the path to which the data is written | ||
| * | ||
| * @since 1.5.0 | ||
| * @note Currently, this method can only be used after enabling Hive support | ||
|
|
@@ -560,6 +629,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| * one of the known case-insensitive shorten names (`none`, `bzip2`, `gzip`, `lz4`, | ||
| * `snappy` and `deflate`). </li> | ||
| * </ul> | ||
| * Calls the callback methods in @see[[QueryExecutionListener]] methods after query execution | ||
| * with @see[[OutputParams]] having datasourceType set as string constant "text" and | ||
| * destination set as the path to which the data is written | ||
| * | ||
| * @since 1.6.0 | ||
| */ | ||
|
|
@@ -599,6 +671,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { | |
| * indicates a timestamp format. Custom date formats follow the formats at | ||
| * `java.text.SimpleDateFormat`. This applies to timestamp type.</li> | ||
| * </ul> | ||
| * Calls the callback methods in @see[[QueryExecutionListener]] methods after query execution with | ||
| * @see[[OutputParams]] having datasourceType set as string constant "csv" and | ||
| * destination set as the path to which the data is written | ||
| * | ||
| * @since 2.0.0 | ||
| */ | ||
|
|
||
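Taken together, the scaladoc additions in this file describe the following behavior (a hedged summary; `df` is any DataFrame with a listener registered on its session, the table and path names are placeholders, and `<default source>` stands for whatever `spark.sql.sources.default` resolves to):

```scala
// What a registered listener would receive for each entry point, per the diff above.
df.write.parquet("/tmp/out")       // funcName = "save",        OutputParams("parquet", Some("/tmp/out"), options)
df.write.json("/tmp/out_json")     // funcName = "save",        OutputParams("json", Some("/tmp/out_json"), options)
df.write.insertInto("db.events")   // funcName = "insertInto",  OutputParams(<default source>, Some("db.events"), options)
df.write.saveAsTable("db.copy")    // funcName = "saveAsTable", OutputParams(<default source>, Some("db.copy"), options)
```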
SparkSession.scala
@@ -40,12 +40,12 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Range} | |
| import org.apache.spark.sql.execution._ | ||
| import org.apache.spark.sql.execution.datasources.LogicalRelation | ||
| import org.apache.spark.sql.execution.ui.SQLListener | ||
| import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState} | ||
| import org.apache.spark.sql.internal.{CatalogImpl, SessionState, SharedState, SQLConf} | ||
| import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION | ||
| import org.apache.spark.sql.sources.BaseRelation | ||
| import org.apache.spark.sql.streaming._ | ||
| import org.apache.spark.sql.types.{DataType, LongType, StructType} | ||
| import org.apache.spark.sql.util.ExecutionListenerManager | ||
| import org.apache.spark.sql.util.{ExecutionListenerManager, QueryExecutionListener} | ||
| import org.apache.spark.util.Utils | ||
|
|
||
|
|
||
|
|
@@ -876,6 +876,9 @@ object SparkSession { | |
| } | ||
| session = new SparkSession(sparkContext) | ||
| options.foreach { case (k, v) => session.sessionState.conf.setConfString(k, v) } | ||
| for (qeListener <- createQueryExecutionListeners(session.sparkContext.getConf)) { | ||
| session.listenerManager.register(qeListener) | ||
| } | ||
| defaultSession.set(session) | ||
|
|
||
| // Register a successfully instantiated context to the singleton. This should be at the | ||
|
|
@@ -893,6 +896,12 @@ object SparkSession { | |
| } | ||
| } | ||
|
|
||
| private def createQueryExecutionListeners(conf: SparkConf): Seq[QueryExecutionListener] = { | ||
| conf.get(SQLConf.QUERY_EXECUTION_LISTENERS) | ||
| .map(Utils.classForName(_)) | ||
|
Member: Nit: ->
||
| .map(_.newInstance().asInstanceOf[QueryExecutionListener]) | ||
|
Member: Simply throwing the raw exception is not very helpful here. Could you use try and catch to issue a better error message when we are unable to create/initialize the class? Thanks!
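One possible way to address the request above (a sketch, not the PR's code; the error message wording is invented):

```scala
private def createQueryExecutionListeners(conf: SparkConf): Seq[QueryExecutionListener] = {
  conf.get(SQLConf.QUERY_EXECUTION_LISTENERS).map { className =>
    try {
      // Instantiate each configured listener via its zero-arg constructor.
      Utils.classForName(className).newInstance().asInstanceOf[QueryExecutionListener]
    } catch {
      case e: Exception =>
        throw new IllegalArgumentException(
          s"Error while instantiating QueryExecutionListener '$className' configured " +
            "through spark.sql.queryExecutionListeners", e)
    }
  }
}
```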
||
| } | ||
|
|
||
| /** | ||
| * Creates a [[SparkSession.Builder]] for constructing a [[SparkSession]]. | ||
| * | ||
|
|
||
SQLConf.scala
@@ -655,6 +655,13 @@ object SQLConf { | |
| .booleanConf | ||
| .createWithDefault(false) | ||
|
|
||
| val QUERY_EXECUTION_LISTENERS = | ||
|
||
| ConfigBuilder("spark.sql.queryExecutionListeners") | ||
| .doc("QueryExecutionListeners to be attached to the SparkSession") | ||
|
||
| .stringConf | ||
| .toSequence | ||
| .createWithDefault(Nil) | ||
|
|
||
| object Deprecated { | ||
| val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" | ||
| } | ||
|
|
||
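A hedged usage sketch for the new key (the class `com.example.AuditListener` is made up; it must be a `QueryExecutionListener` with a zero-argument constructor available on the driver classpath):

```scala
import org.apache.spark.sql.SparkSession

// Set the config before the session/context is created so the builder change in
// SparkSession.scala above can instantiate and register the listener automatically.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("qe-listener-demo")
  .config("spark.sql.queryExecutionListeners", "com.example.AuditListener")
  .getOrCreate()
```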
QueryExecutionListener.scala
@@ -44,27 +44,49 @@ trait QueryExecutionListener { | |
| * @param qe the QueryExecution object that carries detail information like logical plan, | ||
| * physical plan, etc. | ||
| * @param durationNs the execution time for this query in nanoseconds. | ||
| * | ||
| * @note This can be invoked by multiple different threads. | ||
| * @param outputParams The output parameters in case the method is invoked as a result of a | ||
| * write operation. In case of a read it will be @see[[None]] | ||
| */ | ||
| @DeveloperApi | ||
| def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit | ||
|
|
||
| def onSuccess( | ||
| funcName: String, | ||
| qe: QueryExecution, | ||
| durationNs: Long, | ||
| outputParams: Option[OutputParams]): Unit | ||
| /** | ||
| * A callback function that will be called when a query execution failed. | ||
| * | ||
| * @param funcName the name of the action that triggered this query. | ||
| * @param qe the QueryExecution object that carries detail information like logical plan, | ||
| * physical plan, etc. | ||
| * @param exception the exception that failed this query. | ||
| * @param outputParams The output parameters in case the method is invoked as a result of a | ||
| * write operation. In case of a read it will be @see[[None]] | ||
| * | ||
| * @note This can be invoked by multiple different threads. | ||
| */ | ||
| @DeveloperApi | ||
| def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit | ||
| def onFailure( | ||
| funcName: String, | ||
| qe: QueryExecution, | ||
| exception: Exception, | ||
| outputParams: Option[OutputParams]): Unit | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Contains extra information useful for query analysis passed on from the methods in | ||
| * @see[[org.apache.spark.sql.DataFrameWriter]] while writing to a datasource | ||
| * @param datasourceType type of data source written to like csv, parquet, json, hive, jdbc etc. | ||
| * @param destination path or table name written to | ||
| * @param options the map containing the output options for the underlying datasource | ||
| * specified by using the @see [[org.apache.spark.sql.DataFrameWriter#option]] method | ||
| * @param writeParams will contain any extra information that the write method wants to provide | ||
| */ | ||
| case class OutputParams( | ||
|
Contributor: It looks reasonable to provide more information to the listeners for write operations. However, this will be public, so I think we should think about it more carefully to get a better design. Can we do it later?

Contributor: Sorry, the arguments to this class seem to have been picked pretty randomly. Can you explain why these parameters were picked?
||
| datasourceType: String, | ||
| destination: Option[String], | ||
| options: Map[String, String], | ||
| writeParams: Map[String, String] = Map.empty) | ||
| /** | ||
| * :: Experimental :: | ||
| * | ||
|
|
@@ -98,18 +120,26 @@ class ExecutionListenerManager private[sql] () extends Logging { | |
| listeners.clear() | ||
| } | ||
|
|
||
| private[sql] def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { | ||
| private[sql] def onSuccess( | ||
| funcName: String, | ||
| qe: QueryExecution, | ||
| duration: Long, | ||
| outputParams: Option[OutputParams] = None): Unit = { | ||
| readLock { | ||
| withErrorHandling { listener => | ||
| listener.onSuccess(funcName, qe, duration) | ||
| listener.onSuccess(funcName, qe, duration, outputParams) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private[sql] def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { | ||
| private[sql] def onFailure( | ||
| funcName: String, | ||
| qe: QueryExecution, | ||
| exception: Exception, | ||
| outputParams: Option[OutputParams] = None): Unit = { | ||
| readLock { | ||
| withErrorHandling { listener => | ||
| listener.onFailure(funcName, qe, exception) | ||
| listener.onFailure(funcName, qe, exception, outputParams) | ||
| } | ||
| } | ||
| } | ||
|
|
||
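To tie the API changes together, a hedged end-to-end sketch of a listener implementing the extended callbacks (the class name and log output are invented; the signatures follow the diff above):

```scala
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.{OutputParams, QueryExecutionListener}

class AuditListener extends QueryExecutionListener {

  override def onSuccess(
      funcName: String,
      qe: QueryExecution,
      durationNs: Long,
      outputParams: Option[OutputParams]): Unit = {
    val fmt = outputParams.map(_.datasourceType).getOrElse("<none>")
    val dest = outputParams.flatMap(_.destination).getOrElse("<none>")
    println(s"$funcName succeeded in ${durationNs / 1e6} ms: format=$fmt, destination=$dest")
  }

  override def onFailure(
      funcName: String,
      qe: QueryExecution,
      exception: Exception,
      outputParams: Option[OutputParams]): Unit = {
    val dest = outputParams.flatMap(_.destination).getOrElse("<none>")
    println(s"$funcName failed for destination=$dest: ${exception.getMessage}")
  }
}

// Programmatic registration, as an alternative to the new configuration key:
//   spark.listenerManager.register(new AuditListener)
```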
I don't think this new option belongs in this section. It has nothing to do with performance and this description now sounds weird. A separate section for it would be better, even if it's the only option there.