passing conf map to runner, tests pass
BryanCutler committed Jun 18, 2018
commit 5a7edb2bc30dc7fe93d19504e78fbf83c1f525d9
9 changes: 7 additions & 2 deletions python/pyspark/worker.py
@@ -224,8 +224,13 @@ def read_udfs(pickleSer, infile, eval_type):
PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):
timezone = utf8_deserializer.loads(infile)
ser = ArrowStreamPandasSerializer(timezone)
runner_conf = {}
num_conf = read_int(infile)
for i in range(num_conf):
k = utf8_deserializer.loads(infile)
v = utf8_deserializer.loads(infile)
runner_conf[k] = v
ser = ArrowStreamPandasSerializer(runner_conf.get("spark.sql.session.timeZone", None))
else:
ser = BatchedSerializer(PickleSerializer(), 100)

sql/core/src/main/scala/org/apache/spark/sql/execution/python/AggregateInPandasExec.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning}
import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.util.Utils

@@ -134,11 +135,23 @@ case class AggregateInPandasExec(
rows
}

val timeZoneConf = if (pandasRespectSessionTimeZone) {
Seq(SQLConf.SESSION_LOCAL_TIMEZONE.key -> sessionLocalTimeZone)
} else {
Nil
}
val runnerConfEntries = Seq() ++ timeZoneConf
val runnerConf = Map(runnerConfEntries: _*)

val columnarBatchIter = new ArrowPythonRunner(
pyFuncs, bufferSize, reuseWorker,
PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF, argOffsets, aggInputSchema,
sessionLocalTimeZone, pandasRespectSessionTimeZone)
.compute(projectedRowIter, context.partitionId(), context)
pyFuncs,
bufferSize,
reuseWorker,
PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
argOffsets,
aggInputSchema,
sessionLocalTimeZone,
runnerConf).compute(projectedRowIter, context.partitionId(), context)

val joinedAttributes =
groupingExpressions.map(_.toAttribute) ++ udfExpressions.map(_.resultAttribute)
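Note: the same conditional conf-map construction is repeated in each exec node touched by this commit (AggregateInPandasExec, ArrowEvalPythonExec, FlatMapGroupsInPandasExec, WindowInPandasExec). A minimal standalone sketch of the pattern follows; the literal values stand in for what the real plans read from SQLConf, and the "example.extra.conf" entry is purely hypothetical, only to show how further settings would be appended alongside the timezone.

// Sketch of the runner-conf pattern used in the exec nodes below.
object RunnerConfSketch {
  def main(args: Array[String]): Unit = {
    val pandasRespectSessionTimeZone = true          // read from SQLConf in the real plan
    val sessionLocalTimeZone = "America/Los_Angeles" // read from SQLConf in the real plan

    // Only ship the session timezone when the respect-timezone flag is on.
    val timeZoneConf = if (pandasRespectSessionTimeZone) {
      Seq("spark.sql.session.timeZone" -> sessionLocalTimeZone)
    } else {
      Nil
    }
    // Hypothetical additional entry, showing how the map would grow.
    val extraConf = Seq("example.extra.conf" -> "value")

    val runnerConf = Map(timeZoneConf ++ extraConf: _*)
    println(runnerConf)
  }
}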
sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowEvalPythonExec.scala
@@ -24,6 +24,7 @@ import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

/**
@@ -79,11 +80,23 @@ case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi
// DO NOT use iter.grouped(). See BatchIterator.
val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter)

val timeZoneConf = if (pandasRespectSessionTimeZone) {
Seq(SQLConf.SESSION_LOCAL_TIMEZONE.key -> sessionLocalTimeZone)
} else {
Nil
}
val runnerConfEntries = Seq() ++ timeZoneConf
val runnerConf = Map(runnerConfEntries: _*)

val columnarBatchIter = new ArrowPythonRunner(
funcs, bufferSize, reuseWorker,
PythonEvalType.SQL_SCALAR_PANDAS_UDF, argOffsets, schema,
sessionLocalTimeZone, pandasRespectSessionTimeZone)
.compute(batchIter, context.partitionId(), context)
funcs,
bufferSize,
reuseWorker,
PythonEvalType.SQL_SCALAR_PANDAS_UDF,
argOffsets,
schema,
sessionLocalTimeZone,
runnerConf).compute(batchIter, context.partitionId(), context)

new Iterator[InternalRow] {

sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala
@@ -45,7 +45,7 @@ class ArrowPythonRunner(
argOffsets: Array[Array[Int]],
schema: StructType,
timeZoneId: String,
respectTimeZone: Boolean)
conf: Map[String, String])
extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch](
funcs, bufferSize, reuseWorker, evalType, argOffsets) {

@@ -59,17 +59,17 @@

protected override def writeCommand(dataOut: DataOutputStream): Unit = {
PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets)
if (respectTimeZone) {
PythonRDD.writeUTF(timeZoneId, dataOut)
} else {
dataOut.writeInt(SpecialLengths.NULL)
dataOut.writeInt(conf.size)
@icexelloss (Contributor) commented on Jun 19, 2018:
maybe put this in a writeConf method to be more specific?

@BryanCutler (Member, Author) replied:
I think it's fine, but I will add some comments

@icexelloss (Contributor) replied:
Ok, SGTM.

for ((k, v) <- conf) {
PythonRDD.writeUTF(k, dataOut)
PythonRDD.writeUTF(v, dataOut)
}
}

protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = {
val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId)
val allocator = ArrowUtils.rootAllocator.newChildAllocator(
s"stdout writer for $pythonExec", 0, Long.MaxValue)
val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId)
@BryanCutler (Member, Author) commented:
change this back, accidental

val root = VectorSchemaRoot.create(arrowSchema, allocator)

Utils.tryWithSafeFinally {
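For reference, a self-contained sketch of the conf wire format that writeCommand now emits and worker.py now reads: an entry count, then each key and value written the way PythonRDD.writeUTF does (a 4-byte big-endian length followed by UTF-8 bytes). The writeConf/readConf helpers below are hypothetical, along the lines of the writeConf method suggested in the review thread above, not part of this change.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
import java.nio.charset.StandardCharsets

object ConfProtocolSketch {

  // Hypothetical helper: write the number of entries, then each key and value as
  // length-prefixed UTF-8 bytes, which is what the Python worker's
  // read_int/utf8_deserializer pair expects.
  def writeConf(dataOut: DataOutputStream, conf: Map[String, String]): Unit = {
    dataOut.writeInt(conf.size)
    for ((k, v) <- conf) {
      writeUTF(k, dataOut)
      writeUTF(v, dataOut)
    }
  }

  private def writeUTF(str: String, dataOut: DataOutputStream): Unit = {
    val bytes = str.getBytes(StandardCharsets.UTF_8)
    dataOut.writeInt(bytes.length)
    dataOut.write(bytes)
  }

  // Read side, mirroring the loop added to worker.py.
  def readConf(dataIn: DataInputStream): Map[String, String] = {
    val numConf = dataIn.readInt()
    (0 until numConf).map(_ => readUTF(dataIn) -> readUTF(dataIn)).toMap
  }

  private def readUTF(dataIn: DataInputStream): String = {
    val bytes = new Array[Byte](dataIn.readInt())
    dataIn.readFully(bytes)
    new String(bytes, StandardCharsets.UTF_8)
  }

  def main(args: Array[String]): Unit = {
    // Round-trip a conf map through an in-memory stream to check the framing.
    val conf = Map("spark.sql.session.timeZone" -> "America/Los_Angeles")
    val buffer = new ByteArrayOutputStream()
    writeConf(new DataOutputStream(buffer), conf)
    val decoded = readConf(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray)))
    assert(decoded == conf)
  }
}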
sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning}
import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

/**
@@ -137,12 +138,23 @@ case class FlatMapGroupsInPandasExec(
}

val context = TaskContext.get()
val timeZoneConf = if (pandasRespectSessionTimeZone) {
Seq(SQLConf.SESSION_LOCAL_TIMEZONE.key -> sessionLocalTimeZone)
} else {
Nil
}
val runnerConfEntries = Seq() ++ timeZoneConf
val runnerConf = Map(runnerConfEntries: _*)

val columnarBatchIter = new ArrowPythonRunner(
chainedFunc, bufferSize, reuseWorker,
PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, argOffsets, dedupSchema,
sessionLocalTimeZone, pandasRespectSessionTimeZone)
.compute(grouped, context.partitionId(), context)
chainedFunc,
bufferSize,
reuseWorker,
PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
argOffsets,
dedupSchema,
sessionLocalTimeZone,
runnerConf).compute(grouped, context.partitionId(), context)

columnarBatchIter.flatMap(_.rowIterator.asScala).map(UnsafeProjection.create(output, output))
}
sql/core/src/main/scala/org/apache/spark/sql/execution/python/WindowInPandasExec.scala
@@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.util.Utils

@@ -153,12 +154,23 @@ case class WindowInPandasExec(
}
}

val timeZoneConf = if (pandasRespectSessionTimeZone) {
Seq(SQLConf.SESSION_LOCAL_TIMEZONE.key -> sessionLocalTimeZone)
} else {
Nil
}
val runnerConfEntries = Seq() ++ timeZoneConf
val runnerConf = Map(runnerConfEntries: _*)

val windowFunctionResult = new ArrowPythonRunner(
pyFuncs, bufferSize, reuseWorker,
pyFuncs,
bufferSize,
reuseWorker,
PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF,
argOffsets, windowInputSchema,
sessionLocalTimeZone, pandasRespectSessionTimeZone)
.compute(pythonInput, context.partitionId(), context)
argOffsets,
windowInputSchema,
sessionLocalTimeZone,
runnerConf).compute(pythonInput, context.partitionId(), context)

val joined = new JoinedRow
val resultProj = createResultProjection(expressions)