Closed
Changes from 1 commit
Commits
41 commits
2e0b308
initial commit of cogroup
d80tb7 Jun 20, 2019
64ff5ac
minor tidy up
d80tb7 Jun 20, 2019
6d039e3
removed incorrect test
d80tb7 Jun 21, 2019
d8a5c5d
tidies up test, fixed output cols
d80tb7 Jun 25, 2019
73188f6
removed incorrect file
d80tb7 Jun 25, 2019
690fa14
Revert: removed incorrect test
d80tb7 Jun 25, 2019
c86b2bf
Merge branch 'master' of https://github.com/d80tb7/spark into SPARK-2…
d80tb7 Jun 25, 2019
e3b66ac
fix for resolving key cols
d80tb7 Jun 25, 2019
8007fa6
common trait for grouped pandas udfs
d80tb7 Jun 27, 2019
d4cf6d0
poc using arrow streams
d80tb7 Jun 27, 2019
87aeb92
more unit tests for cogroup
d80tb7 Jun 27, 2019
e7528d0
argspec includes grouping key
d80tb7 Jul 2, 2019
b85ec75
fixed tests und
d80tb7 Jul 2, 2019
6a8ecff
keys now handled properly. Validation of udf. More tests
d80tb7 Jul 2, 2019
d2da787
formatting
d80tb7 Jul 2, 2019
7321141
fixed scalastyle errors
d80tb7 Jul 2, 2019
6bbe31c
updated grouped map to new args format
d80tb7 Jul 2, 2019
b444ff7
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 2, 2019
94be574
some code review fixes
d80tb7 Jul 11, 2019
9241639
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 11, 2019
3de551f
more code review fixes
d80tb7 Jul 11, 2019
300b53a
more code review fixes
d80tb7 Jul 11, 2019
7d161ba
fix comment on PandasCogroupSerializer
d80tb7 Jul 11, 2019
d1a6366
formatting
d80tb7 Jul 11, 2019
a201161
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 19, 2019
3e4bc95
python style fixes
d80tb7 Jul 19, 2019
307e664
added doc
d80tb7 Jul 19, 2019
7558b8d
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 23, 2019
19360c4
minor formatting
d80tb7 Jul 23, 2019
28493b4
a couple more unit tests
d80tb7 Jul 23, 2019
d6d11e4
minor formatting
d80tb7 Jul 23, 2019
a62a1e3
more doc
d80tb7 Jul 25, 2019
ec78284
added comment to cogroup func
d80tb7 Jul 25, 2019
1a9ff58
fixed python style
d80tb7 Jul 25, 2019
c0d2919
review comments
d80tb7 Aug 20, 2019
4cd5c70
review comments scala
d80tb7 Aug 20, 2019
e025375
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Aug 20, 2019
dd1ffaf
python formatting
d80tb7 Aug 20, 2019
733b592
review comments (mainly formatting)
d80tb7 Sep 8, 2019
51dcbdc
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Sep 8, 2019
1b966fd
couple more format changes
d80tb7 Sep 15, 2019
review comments (mainly formatting)
d80tb7 committed Sep 8, 2019
commit 733b59277b51d36e4640d3ea0ff3e097aa301294
7 changes: 2 additions & 5 deletions python/pyspark/serializers.py
@@ -403,12 +403,9 @@ def __repr__(self):

class CogroupUDFSerializer(ArrowStreamPandasUDFSerializer):

def __init__(self, timezone, safecheck, assign_cols_by_name):
super(CogroupUDFSerializer, self).__init__(timezone, safecheck, assign_cols_by_name)

def load_stream(self, stream):
"""
Deserialize Cogrouped ArrowRecordBatches to a tuple of Arrow tables and return as a two
Deserialize Cogrouped ArrowRecordBatches to a tuple of Arrow tables and yield as two
lists of pandas.Series.
"""
import pyarrow as pa
@@ -427,7 +424,7 @@ def load_stream(self, stream):

elif dataframes_in_group != 0:
raise ValueError(
'Invalid number of dataframes in group {0}'.format(dataframes_in_group))
'Invalid number of pandas.DataFrames in group {0}'.format(dataframes_in_group))


class BatchedSerializer(Serializer):
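For context, the protocol CogroupUDFSerializer.load_stream consumes is: an integer giving the number of DataFrames in the next cogroup (2 for a left/right pair, 0 as the end-of-stream marker), followed by one Arrow stream per DataFrame. A minimal sketch of that loop, assuming the read_int helper and the ArrowStreamSerializer / arrow_to_pandas machinery already present in pyspark.serializers (illustrative, not the verbatim PR code):

```python
def load_stream(self, stream):
    import pyarrow as pa
    dataframes_in_group = None
    while dataframes_in_group is None or dataframes_in_group > 0:
        dataframes_in_group = read_int(stream)
        if dataframes_in_group == 2:
            # Each side of the cogroup arrives as its own Arrow stream; convert
            # every column of each side into a pandas.Series.
            batches1 = list(ArrowStreamSerializer.load_stream(self, stream))
            batches2 = list(ArrowStreamSerializer.load_stream(self, stream))
            yield (
                [self.arrow_to_pandas(c) for c in pa.Table.from_batches(batches1).itercolumns()],
                [self.arrow_to_pandas(c) for c in pa.Table.from_batches(batches2).itercolumns()])
        elif dataframes_in_group != 0:
            raise ValueError(
                'Invalid number of pandas.DataFrames in group {0}'.format(dataframes_in_group))
```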
13 changes: 6 additions & 7 deletions python/pyspark/sql/cogroup.py
@@ -23,14 +23,13 @@

class CoGroupedData(object):
"""
A logical grouping of two :class:`GroupedData`,
created by :func:`GroupedData.cogroup`.
A logical grouping of two :class:`GroupedData`,
created by :func:`GroupedData.cogroup`.

.. note:: Experimental
.. note:: Experimental

.. versionadded:: 3.0

"""
.. versionadded:: 3.0
"""

def __init__(self, gd1, gd2):
self._gd1 = gd1
@@ -53,7 +52,7 @@ def apply(self, udf):

.. note:: This function requires a full shuffle. All the data of a cogroup will be loaded
into memory, so the user should be aware of the potential OOM risk if data is skewed
and certain goroups are too large to fit in memory.
and certain groups are too large to fit in memory.

.. note:: Experimental

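To make the API documented in cogroup.py concrete, a hedged usage sketch, assuming the COGROUPED_MAP pandas UDF type this PR introduces and an active SparkSession named spark (the data and schema are purely illustrative):

```python
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType

df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0)],
    ("time", "id", "v1"))
df2 = spark.createDataFrame(
    [(20000101, 1, "x"), (20000101, 2, "y")],
    ("time", "id", "v2"))

@pandas_udf("time int, id int, v1 double, v2 string", PandasUDFType.COGROUPED_MAP)
def asof_join(left, right):
    # Each invocation receives one cogroup: every row of df1 and df2 that shares an id.
    return pd.merge_asof(left, right, on="time", by="id")

df1.groupby("id").cogroup(df2.groupby("id")).apply(asof_join).show()
```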
7 changes: 1 addition & 6 deletions python/pyspark/sql/tests/test_pandas_udf_cogrouped_map.py
@@ -15,16 +15,11 @@
# limitations under the License.
#

import datetime
import unittest
import sys

from collections import OrderedDict
from decimal import Decimal

from pyspark.sql import Row
from pyspark.sql.functions import array, explode, col, lit, udf, sum, pandas_udf, PandasUDFType
from pyspark.sql.types import *
from pyspark.sql.types import DoubleType, StructType, StructField
from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
pandas_requirement_message, pyarrow_requirement_message
from pyspark.testing.utils import QuietTest
52 changes: 27 additions & 25 deletions python/pyspark/worker.py
@@ -311,11 +311,11 @@ def read_udfs(pickleSer, infile, eval_type):
"spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
.lower() == "true"

# Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
# pandas Series. See SPARK-27240.
if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
ser = CogroupUDFSerializer(timezone, safecheck, assign_cols_by_name)
else:
# Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
# pandas Series. See SPARK-27240.
df_for_struct = (eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF or
eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF or
eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
@@ -377,28 +377,30 @@ def map_batch(batch):
# profiling is not supported for UDF
return func, None, ser, ser

# Helper function to extract the key and value indexs from arg_offsets
# arg_offsets is a List containing the key and value
# indexes of columns of the DataFrames to be passed to the udf.
# It consists of n repeating groups where n is the number of
# DataFrames. Each group has the following format.
# group[0]: length of group
# group[1]: length of key indexes
# group[2.. group[1] +2]: key attributes
# group[group[1] +3 group[0]]: value attributes
# See BasePandasGroupExec.resolveArgOffsets for equivalent scala code
def extract_key_value_indexes():
def extract_key_value_indexes(grouped_arg_offsets):
"""
Helper function to extract the key and value indexes from arg_offsets for the grouped and
cogrouped pandas udfs. See BasePandasGroupExec.resolveArgOffsets for equivalent scala code.

:param grouped_arg_offsets: List containing the key and value indexes of columns of the
DataFrames to be passed to the udf. It consists of n repeating groups where n is the
number of DataFrames. Each group has the following format:
group[0]: length of group
group[1]: length of key indexes
group[2.. group[1] +2]: key attributes
group[group[1] +3 group[0]]: value attributes
"""
parsed = []
i = 0
while i < len(arg_offsets):
offsets_len = arg_offsets[i]
i += 1
offsets = arg_offsets[i: i + offsets_len]
idx = 0
while idx < len(grouped_arg_offsets):
offsets_len = grouped_arg_offsets[idx]
idx += 1
offsets = grouped_arg_offsets[idx: idx + offsets_len]
split_index = offsets[0] + 1
keys = offsets[1: split_index]
values = offsets[split_index:]
parsed.append([keys, values])
i += offsets_len
offset_keys = offsets[1: split_index]
offset_values = offsets[split_index:]
parsed.append([offset_keys, offset_values])
idx += offsets_len
return parsed

udfs = {}
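A small worked example of the offset encoding described in the docstring above, applied to the helper just defined (the concrete offsets are hypothetical):

```python
# Two DataFrames: the first has one key column at offset 0 and value columns at
# offsets 1 and 2; the second has one key column at offset 0 and a value at offset 1.
grouped_arg_offsets = [4, 1, 0, 1, 2,   # group length 4: 1 key index (0), values (1, 2)
                       3, 1, 0, 1]      # group length 3: 1 key index (0), value (1)
extract_key_value_indexes(grouped_arg_offsets)
# -> [[[0], [1, 2]], [[0], [1]]]
```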
@@ -417,7 +419,7 @@ def extract_key_value_indexes():
arg_offsets, udf = read_single_udf(
pickleSer, infile, eval_type, runner_conf, udf_index=0)
udfs['f'] = udf
parsed_offsets = extract_key_value_indexes()
parsed_offsets = extract_key_value_indexes(arg_offsets)
keys = ["a[%d]" % (o,) for o in parsed_offsets[0][0]]
vals = ["a[%d]" % (o, ) for o in parsed_offsets[0][1]]
mapper_str = "lambda a: f([%s], [%s])" % (", ".join(keys), ", ".join(vals))
@@ -428,9 +430,9 @@ def extract_key_value_indexes():
arg_offsets, udf = read_single_udf(
pickleSer, infile, eval_type, runner_conf, udf_index=0)
udfs['f'] = udf
parsed_offsets = extract_key_value_indexes()
parsed_offsets = extract_key_value_indexes(arg_offsets)
df1_keys = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][0]]
df1_vals = ["a[0][%d]" % (o, )for o in parsed_offsets[0][1]]
df1_vals = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][1]]
df2_keys = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][0]]
df2_vals = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][1]]
mapper_str = "lambda a: f([%s], [%s], [%s], [%s])" % (
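Continuing that worked example for the cogrouped branch: with parsed_offsets equal to [[[0], [1, 2]], [[0], [1]]], and assuming the remaining (truncated) format arguments are the joined key and value lists, the generated mapper source would be:

```python
# a is the (series_list_1, series_list_2) tuple yielded by CogroupUDFSerializer.
mapper_str = "lambda a: f([a[0][0]], [a[0][1], a[0][2]], [a[1][0]], [a[1][1]])"
```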
@@ -52,7 +52,7 @@ case class MapInPandas(
}

/**
* Flatmap cogroups using a udf: pandas.Dataframe, pandas.Dataframe -> pandas.Dataframe
* Flatmap cogroups using a udf: pandas.Dataframe, pandas.Dataframe -> pandas.Dataframe
* This is used by DataFrame.groupby().cogroup().apply().
*/
case class FlatMapCoGroupsInPandas(
Expand All @@ -66,7 +66,6 @@ case class FlatMapCoGroupsInPandas(
override val producedAttributes = AttributeSet(output)
}


trait BaseEvalPython extends UnaryNode {

def udfs: Seq[PythonUDF]
@@ -19,12 +19,9 @@ package org.apache.spark.sql.execution.python

import java.io._
import java.net._
import java.util.concurrent.atomic.AtomicBoolean

import scala.collection.JavaConverters._

import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter}
import org.apache.arrow.vector.ipc.ArrowStreamWriter

import org.apache.spark._
import org.apache.spark.api.python._
@@ -33,7 +30,6 @@ import org.apache.spark.sql.execution.arrow.ArrowWriter
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector}
import org.apache.spark.util.Utils

/**
@@ -40,7 +40,6 @@ abstract class BaseArrowPythonRunner[T](
argOffsets: Array[Array[Int]])
extends BasePythonRunner[T, ColumnarBatch](funcs, evalType, argOffsets) {


protected override def newReaderIterator(
stream: DataInputStream,
writerThread: WriterThread,
@@ -111,5 +110,3 @@
}
}
}


@@ -31,8 +31,10 @@ import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}
/**
* Base functionality for plans which execute grouped python udfs.
*/
abstract class BasePandasGroupExec(func: Expression,
output: Seq[Attribute]) extends SparkPlan {
abstract class BasePandasGroupExec(
Review comment (Member):
Hey, I think I am pretty against this refactoring. There are multiple duplicated codes in R vectorization too (which I added). I didn't intentionally yet refactor those. Plus, I don't think it's good idea to have both refactoring and feature implementation in one PR.

func: Expression,
output: Seq[Attribute])
extends SparkPlan {

protected val sessionLocalTimeZone = conf.sessionLocalTimeZone

@@ -81,8 +81,11 @@ class CogroupedArrowPythonRunner(
dataOut.writeInt(0)
}

def writeGroup(group: Iterator[InternalRow], schema: StructType, dataOut: DataOutputStream,
name: String) = {
def writeGroup(
group: Iterator[InternalRow],
schema: StructType,
dataOut: DataOutputStream,
name: String) = {
val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId)
val allocator = ArrowUtils.rootAllocator.newChildAllocator(
s"stdout writer for $pythonExec ($name)", 0, Long.MaxValue)
@@ -52,7 +52,7 @@ case class FlatMapCoGroupsInPandasExec(
output: Seq[Attribute],
left: SparkPlan,
right: SparkPlan)
extends BasePandasGroupExec(func, output) with BinaryExecNode{
extends BasePandasGroupExec(func, output) with BinaryExecNode {

override def outputPartitioning: Partitioning = left.outputPartitioning

@@ -72,23 +72,26 @@ case class FlatMapCoGroupsInPandasExec(
val (leftDedup, leftArgOffsets) = resolveArgOffsets(left, leftGroup)
val (rightDedup, rightArgOffsets) = resolveArgOffsets(right, rightGroup)

// Map cogrouped rows to ArrowPythonRunner results, Only execute if partition is not empty
left.execute().zipPartitions(right.execute()) { (leftData, rightData) =>
if (leftData.isEmpty && rightData.isEmpty) Iterator.empty else {

val leftGrouped = groupAndProject(leftData, leftGroup, left.output, leftDedup)
val rightGrouped = groupAndProject(rightData, rightGroup, right.output, rightDedup)
val data = new CoGroupedIterator(leftGrouped, rightGrouped, leftGroup)
.map{case (_, l, r) => (l, r)}
val leftGrouped = groupAndProject(leftData, leftGroup, left.output, leftDedup)
val rightGrouped = groupAndProject(rightData, rightGroup, right.output, rightDedup)
val data = new CoGroupedIterator(leftGrouped, rightGrouped, leftGroup)
.map { case (_, l, r) => (l, r) }

val runner = new CogroupedArrowPythonRunner(
chainedFunc,
PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
Array(leftArgOffsets ++ rightArgOffsets),
StructType.fromAttributes(leftDedup),
StructType.fromAttributes(rightDedup),
sessionLocalTimeZone,
pythonRunnerConf)
val runner = new CogroupedArrowPythonRunner(
chainedFunc,
PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
Array(leftArgOffsets ++ rightArgOffsets),
StructType.fromAttributes(leftDedup),
StructType.fromAttributes(rightDedup),
sessionLocalTimeZone,
pythonRunnerConf)

executePython(data, runner)
executePython(data, runner)
}
}
}
}
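Conceptually, FlatMapCoGroupsInPandasExec groups and projects each side, pairs up groups that share a key via CoGroupedIterator, and feeds each pair to the Python runner. A rough pandas-only analogue of that per-partition behaviour (purely illustrative; the names and helpers here are made up and this is not the Spark execution path):

```python
import pandas as pd

def cogroup_apply(left, right, keys, func):
    # Group each side by the key columns, then pair up groups that share a key,
    # substituting an empty frame when one side has no rows for that key.
    left_groups = {k: g for k, g in left.groupby(keys)}
    right_groups = {k: g for k, g in right.groupby(keys)}
    results = []
    for key in sorted(set(left_groups) | set(right_groups)):
        l = left_groups.get(key, left.iloc[0:0])
        r = right_groups.get(key, right.iloc[0:0])
        results.append(func(l, r))
    return pd.concat(results, ignore_index=True)
```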
@@ -17,19 +17,14 @@

package org.apache.spark.sql.execution.python

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.TaskContext
import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
import org.apache.spark.api.python.PythonEvalType
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning}
import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.ArrowUtils
import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch}


/**
* Physical node for [[org.apache.spark.sql.catalyst.plans.logical.FlatMapGroupsInPandas]]
Expand All @@ -53,7 +48,7 @@ case class FlatMapGroupsInPandasExec(
func: Expression,
output: Seq[Attribute],
child: SparkPlan)
extends BasePandasGroupExec(func, output) with UnaryExecNode {
extends BasePandasGroupExec(func, output) with UnaryExecNode {

override def outputPartitioning: Partitioning = child.outputPartitioning
