vectorized udfs working but hardcoded for ArrowPandasSerializer
BryanCutler committed Sep 1, 2017
commit be81ef6be3f9e8965c2c182c2f5733bbaa78c4d2
32 changes: 30 additions & 2 deletions python/pyspark/serializers.py
@@ -187,8 +187,14 @@ class ArrowSerializer(FramedSerializer):
Serializes an Arrow stream.
"""

def dumps(self, obj):
raise NotImplementedError
def dumps(self, batch):
import pyarrow as pa
import io
sink = io.BytesIO()
writer = pa.RecordBatchFileWriter(sink, batch.schema)
writer.write_batch(batch)
writer.close()
return sink.getvalue()

def loads(self, obj):
import pyarrow as pa
@@ -199,6 +205,28 @@ def __repr__(self):
return "ArrowSerializer"


class ArrowPandasSerializer(ArrowSerializer):

def __init__(self):
super(ArrowPandasSerializer, self).__init__()
Review comment (Member): Do we need this?

Reply (BryanCutler, author): No, that was leftover code; I'll remove it in a follow-up.


# make an ArrowRecordBatch from a Pandas Series and serialize
def dumps(self, series):
import pyarrow as pa
Review comment (Member): Should we catch ImportError?

Reply (BryanCutler, author): Yeah, it would probably be best to handle it the same way as in toPandas(). That got me thinking that it is a little weird to have an SQLConf "spark.sql.execution.arrow.enable" that is set for toPandas() but has no bearing on pandas_udf. It doesn't need to, since this is an explicit call, but it seems a little contradictory; what do you think?

Review comment (Member): Ah, hm, let me check the previous discussions and think about this a bit more.

Review comment (@HyukjinKwon, Sep 16, 2017): I am okay with leaving it as is here. I think we should catch and throw it with better messages in all cases (probably at entry points) later, but let's talk about this in another place later.

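A minimal sketch of the guard suggested above, modeled loosely on the toPandas() handling the thread refers to; the message text and its placement are assumptions, not part of this commit:

    def dumps(self, series):
        try:
            import pyarrow as pa
        except ImportError:
            # Hypothetical message; the point is to fail with a clearer hint
            # than a bare ImportError raised from inside the serializer.
            raise ImportError("pyarrow must be installed to use ArrowPandasSerializer")
        arr = pa.Array.from_pandas(series)
        batch = pa.RecordBatch.from_arrays([arr], ["_0"])
        return super(ArrowPandasSerializer, self).dumps(batch)
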
# TODO: iterator could be a tuple
arr = pa.Array.from_pandas(series)
batch = pa.RecordBatch.from_arrays([arr], ["_0"])
return super(ArrowPandasSerializer, self).dumps(batch)

# deserialize an ArrowRecordBatch to an Arrow table and return as a list of pandas.Series
def loads(self, obj):
table = super(ArrowPandasSerializer, self).loads(obj)
return [c.to_pandas() for c in table.itercolumns()]

def __repr__(self):
return "ArrowPandasSerializer"


class BatchedSerializer(Serializer):

"""
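For orientation, a quick local round trip through the two serializers above (illustrative only, not part of the commit; it assumes loads() returns the columns as pandas.Series as in the diff, and uses the single hard-coded column name "_0"):

    import pandas as pd
    from pyspark.serializers import ArrowPandasSerializer

    ser = ArrowPandasSerializer()
    s = pd.Series([1, 2, 3])
    payload = ser.dumps(s)        # pandas.Series -> Arrow record batch -> bytes
    columns = ser.loads(payload)  # bytes -> Arrow table -> [pandas.Series]
    assert columns[0].tolist() == [1, 2, 3]
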
13 changes: 10 additions & 3 deletions python/pyspark/worker.py
@@ -30,7 +30,8 @@
from pyspark.taskcontext import TaskContext
from pyspark.files import SparkFiles
from pyspark.serializers import write_with_length, write_int, read_long, \
write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, BatchedSerializer
write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, BatchedSerializer, \
ArrowPandasSerializer
from pyspark import shuffle

pickleSer = PickleSerializer()
@@ -101,8 +102,14 @@ def read_udfs(pickleSer, infile):
mapper_str = "lambda a: (%s)" % (", ".join(call_udf))
mapper = eval(mapper_str, udfs)

func = lambda _, it: map(mapper, it)
ser = BatchedSerializer(PickleSerializer(), 100)
# Batched Data
#func = lambda _, it: map(mapper, it)
#ser = BatchedSerializer(PickleSerializer(), 100)

# Arrow Data
func = lambda _, series_list_generator: mapper(list(series_list_generator)[0])
ser = ArrowPandasSerializer()

# profiling is not supported for UDF
return func, None, ser, ser

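The switch in read_udfs above means the mapper is now applied once per deserialized Arrow batch rather than once per pickled row, so the wrapped function sees pandas.Series arguments. A simplified illustration of that contract (not the actual worker loop; the doubling function is a stand-in for the chained UDF):

    import pandas as pd

    def mapper(args):
        # args is the list of pandas.Series deserialized from one Arrow batch;
        # a vectorized UDF returns a Series of the same length.
        return args[0] * 2

    batch = [pd.Series([1, 2, 3])]   # one single-column batch
    result = mapper(batch)           # applied once per batch, not per row
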
ArrowConverters.scala (org.apache.spark.sql.execution.arrow)
@@ -82,7 +82,6 @@ private[sql] object ArrowConverters {

val root = VectorSchemaRoot.create(arrowSchema, allocator)
val arrowWriter = ArrowWriter.create(root)

Review comment (Member): (Looks like an unrelated change.)

var closed = false

context.addTaskCompletionListener { _ =>
@@ -203,4 +202,20 @@ private[sql] object ArrowConverters {
reader.close()
}
}

private[arrow] def execByteArrayAsVectors(
batchBytes: Array[Byte],
allocator: BufferAllocator)(block: (VectorSchemaRoot) => Unit): Unit = {
val in = new ByteArrayReadableSeekableByteChannel(batchBytes)
val reader = new ArrowFileReader(in, allocator)

// Read a batch from a byte stream, ensure the reader is closed
Utils.tryWithSafeFinally {
val root = reader.getVectorSchemaRoot // throws IOException
reader.loadNextBatch() // throws IOException
block(root)
} {
reader.close()
}
}
}
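A rough Python-side counterpart to execByteArrayAsVectors, reading back one batch written in the same Arrow file format as ArrowSerializer.dumps above (a pyarrow sketch for orientation only; the JVM side uses ArrowFileReader as shown):

    import io
    import pyarrow as pa

    def read_single_batch(batch_bytes):
        # Open the byte payload as an Arrow file and load its first
        # (and, in this prototype, only) record batch.
        reader = pa.RecordBatchFileReader(io.BytesIO(batch_bytes))
        return reader.get_batch(0)
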
ArrowEvalPythonExec.scala (new file, org.apache.spark.sql.execution.python)
@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.python

import java.io.{DataOutputStream, File}

import org.apache.spark.api.python.{ChainedPythonFunctions, PythonRunner}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.arrow.{ArrowConverters, ArrowPayload}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.util.Utils
import org.apache.spark.{SparkEnv, TaskContext}

import scala.collection.mutable.ArrayBuffer


/**
* A physical plan that evaluates a [[PythonUDF]],
*/
case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
extends SparkPlan {

def children: Seq[SparkPlan] = child :: Nil

override def producedAttributes: AttributeSet = AttributeSet(output.drop(child.output.length))

private def collectFunctions(udf: PythonUDF): (ChainedPythonFunctions, Seq[Expression]) = {
Review comment (@viirya, Sep 18, 2017): producedAttributes and collectFunctions look duplicated between ArrowEvalPythonExec and BatchEvalPythonExec. We can de-duplicate them, maybe in a later PR.

Reply (BryanCutler, author): Yes, these functions are duplicated, as well as some code in doExecute(). I could add a common base class like EvalPythonExec to clean this up, and maybe move them to the same file?

udf.children match {
case Seq(u: PythonUDF) =>
val (chained, children) = collectFunctions(u)
(ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children)
case children =>
// There should not be any other UDFs, or the children can't be evaluated directly.
assert(children.forall(_.find(_.isInstanceOf[PythonUDF]).isEmpty))
(ChainedPythonFunctions(Seq(udf.func)), udf.children)
}
}

protected override def doExecute(): RDD[InternalRow] = {
val inputRDD = child.execute().map(_.copy())
val bufferSize = inputRDD.conf.getInt("spark.buffer.size", 65536)
val reuseWorker = inputRDD.conf.getBoolean("spark.python.worker.reuse", defaultValue = true)

inputRDD.mapPartitions { iter =>

// The queue used to buffer input rows so we can drain it to
// combine input with output from Python.
val queue = HybridRowQueue(TaskContext.get().taskMemoryManager(),
new File(Utils.getLocalDir(SparkEnv.get.conf)), child.output.length)
TaskContext.get().addTaskCompletionListener({ ctx =>
queue.close()
})

val (pyFuncs, inputs) = udfs.map(collectFunctions).unzip

// flatten all the arguments
val allInputs = new ArrayBuffer[Expression]
val dataTypes = new ArrayBuffer[DataType]
val argOffsets = inputs.map { input =>
input.map { e =>
if (allInputs.exists(_.semanticEquals(e))) {
allInputs.indexWhere(_.semanticEquals(e))
} else {
allInputs += e
dataTypes += e.dataType
allInputs.length - 1
}
}.toArray
}.toArray
val projection = newMutableProjection(allInputs, child.output)
val schema = StructType(dataTypes.map(dt => StructField("", dt)))

// Input iterator to Python: input rows are grouped so we send them in batches to Python.
// For each row, add it to the queue.
Review comment (Member): The comment is wrong now. We don't group input rows here.

val projectedRowIter = iter.map { inputRow =>
queue.add(inputRow.asInstanceOf[UnsafeRow])
projection(inputRow)
}

val inputIterator = ArrowConverters.toPayloadIterator(projectedRowIter, schema, 0).
map(_.asPythonSerializable)

val context = TaskContext.get()

// Output iterator for results from Python.
val outputIterator = new PythonRunner(pyFuncs, bufferSize, reuseWorker, true, argOffsets).
compute(inputIterator, context.partitionId(), context)
Review comment (Member): nit: I think we usually write it in a style like:

    val outputIterator = new PythonRunner(
        pyFuncs, bufferSize, reuseWorker, PythonEvalType.SQL_PANDAS_UDF, argOffsets)
      .compute(inputIterator, context.partitionId(), context)

There are similar styles above, e.g. map { case (attr, i) => ... -> .map { case (attr, i) => ....


val joined = new JoinedRow
val resultProj = UnsafeProjection.create(output, output)

val outputRowIterator = ArrowConverters.fromPayloadIterator(
outputIterator.map(ArrowPayload(_)))

outputRowIterator.map { outputRow =>
resultProj(joined(queue.remove(), outputRow))
}
}
}
}
ExtractPythonUDFs.scala (org.apache.spark.sql.execution.python)
@@ -138,7 +138,8 @@ object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper {
val resultAttrs = udfs.zipWithIndex.map { case (u, i) =>
AttributeReference(s"pythonUDF$i", u.dataType)()
}
val evaluation = BatchEvalPythonExec(validUdfs, child.output ++ resultAttrs, child)
//val evaluation = BatchEvalPythonExec(validUdfs, child.output ++ resultAttrs, child)
val evaluation = ArrowEvalPythonExec(validUdfs, child.output ++ resultAttrs, child)
attributeMap ++= validUdfs.zip(resultAttrs)
evaluation
} else {
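With ExtractPythonUDFs now planning ArrowEvalPythonExec instead of BatchEvalPythonExec, an ordinary Python UDF is evaluated through the Arrow path and its function receives one pandas.Series per input column. A hedged end-to-end sketch of what that implies at this commit (there is no separate pandas_udf API yet, and "spark" is assumed to be an existing SparkSession):

    from pyspark.sql.functions import col, udf
    from pyspark.sql.types import LongType

    # Because the worker now applies the function once per Arrow batch, the
    # body must be vectorized: it operates on a pandas.Series, not one value.
    plus_one = udf(lambda s: s + 1, LongType())

    df = spark.range(10)
    df.select(plus_one(col("id"))).show()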