Allow custom plugin for AQE cost evaluator

apache · c21 · Jun 17, 2021 · Jun 17, 2021 · Jul 1, 2021 · Jul 2, 2021
commit 494b8bc1198214d4e0e72ec188b1e32e4ddf8e4e
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -678,6 +678,13 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
+  val ADAPTIVE_COST_EVALUATOR_CLASS = buildConf("spark.sql.adaptive.costEvaluatorClass")
+    .internal()
+    .doc("The CostEvaluator implementation class (with a no-arg constructor) used by AQE.")
+    .version("3.2.0")
+    .stringConf
+    .createWithDefault("org.apache.spark.sql.execution.adaptive.SimpleCostEvaluator")
+
   val SUBEXPRESSION_ELIMINATION_ENABLED =
     buildConf("spark.sql.subexpressionElimination.enabled")
       .internal()
@@ -3582,6 +3589,8 @@ class SQLConf extends Serializable with Logging {
 
   def coalesceShufflePartitionsEnabled: Boolean = getConf(COALESCE_PARTITIONS_ENABLED)
 
+  def adaptiveCostEvaluatorClass: String = getConf(SQLConf.ADAPTIVE_COST_EVALUATOR_CLASS)
+
   def minBatchesToRetain: Int = getConf(MIN_BATCHES_TO_RETAIN)
 
   def maxBatchesToRetainInMemory: Int = getConf(MAX_BATCHES_TO_RETAIN_IN_MEMORY)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -130,7 +130,9 @@ case class AdaptiveSparkPlanExec(
     }
   }
 
-  @transient private val costEvaluator = SimpleCostEvaluator
+  // Built from the class name held in `conf`, so the evaluator implementation can be swapped.
+  @transient private val costEvaluator: CostEvaluator =
+    CostEvaluator.instantiate(conf.adaptiveCostEvaluatorClass)
 
   @transient val initialPlan = context.session.withActive {
     applyPhysicalRules(

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/costing.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/costing.scala
@@ -17,16 +17,32 @@
 
 package org.apache.spark.sql.execution.adaptive
 
+import org.apache.spark.internal.Logging
 import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.util.Utils
 
 /**
- * Represents the cost of a plan.
+ * An interface to represent the cost of a plan; costs are [[Ordered]] so they can be compared.
  */
 trait Cost extends Ordered[Cost]
 
 /**
- * Evaluates the cost of a physical plan.
+ * Evaluates the cost of a physical plan; classes plugged in by name need a no-arg constructor.
  */
 trait CostEvaluator {
   def evaluateCost(plan: SparkPlan): Cost
 }
+
+object CostEvaluator extends Logging {
+
+  /**
+   * Instantiates the [[CostEvaluator]] named by `className` through its no-arg constructor.
+   * Throws a ClassCastException, before constructing anything, if the class is not one.
+   */
+  def instantiate(className: String): CostEvaluator = {
+    logDebug(s"Creating CostEvaluator $className")
+    // asSubclass validates the loaded class up front instead of relying on a cast after `new`.
+    val clazz = Utils.classForName[AnyRef](className).asSubclass(classOf[CostEvaluator])
+    clazz.getDeclaredConstructor().newInstance()
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala
@@ -38,7 +38,7 @@ case class SimpleCost(value: Long) extends Cost {
  * A simple implementation of [[CostEvaluator]], which counts the number of
  * [[ShuffleExchangeLike]] nodes in the plan.
  */
-object SimpleCostEvaluator extends CostEvaluator {
+case class SimpleCostEvaluator() extends CostEvaluator {
 
   override def evaluateCost(plan: SparkPlan): Cost = {
     val cost = plan.collect {

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -1898,4 +1898,53 @@ class AdaptiveQueryExecSuite
       assert(coalesceReader.head.partitionSpecs.length == 1)
     }
   }
+
+  test("SPARK-35794: Allow custom plugin for cost evaluator") {
+    // getName gives the binary class name, the form expected when loading a class by name.
+    val custom = classOf[SimpleShuffleSortCostEvaluator].getName
+    val builtin = classOf[SimpleCostEvaluator].getName
+    val invalid = classOf[InvalidCostEvaluator].getName
+    assert(CostEvaluator.instantiate(custom) == SimpleShuffleSortCostEvaluator())
+    assert(CostEvaluator.instantiate(builtin) == SimpleCostEvaluator())
+    intercept[ClassCastException](CostEvaluator.instantiate(invalid))
+
+    withSQLConf(
+      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+      SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80") {
+      val query = "SELECT * FROM testData join testData2 ON key = a where value = '1'"
+
+      // AQE should still replace the sort-merge join with a broadcast hash join.
+      withSQLConf(SQLConf.ADAPTIVE_COST_EVALUATOR_CLASS.key -> custom) {
+        val (plan, adaptivePlan) = runAdaptiveAndVerifyResult(query)
+        assert(findTopLevelSortMergeJoin(plan).size == 1)
+        assert(findTopLevelBroadcastHashJoin(adaptivePlan).size == 1)
+        checkNumLocalShuffleReaders(adaptivePlan)
+      }
+
+      // A class that does not extend CostEvaluator must make the adaptive query fail.
+      withSQLConf(SQLConf.ADAPTIVE_COST_EVALUATOR_CLASS.key -> invalid) {
+        intercept[ClassCastException] {
+          runAdaptiveAndVerifyResult(query)
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Does not extend [[CostEvaluator]]; used to check that invalid plugin classes are rejected.
+ */
+private class InvalidCostEvaluator
+
+/**
+ * A [[CostEvaluator]] that counts the [[ShuffleExchangeLike]] and [[SortExec]] nodes of a plan.
+ */
+private case class SimpleShuffleSortCostEvaluator() extends CostEvaluator {
+  override def evaluateCost(plan: SparkPlan): Cost = {
+    // Every shuffle or sort operator collected from the plan adds one to the cost.
+    val shufflesAndSorts = plan.collect {
+      case node @ (_: ShuffleExchangeLike | _: SortExec) => node
+    }
+    SimpleCost(shufflesAndSorts.size)
+  }
 }