Run map-in-batch mapper inside the barrier stage; default is_barrier to False

Signed-off-by: Weichen Xu <[email protected]>
apache · WeichenXu123 · Mar 22, 2023 · Mar 22, 2023 · Mar 22, 2023 · Mar 23, 2023
commit e95fd9554c319fbd60827fb92cc584a765e994f8
diff --git a/...nect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/...nect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
@@ -485,12 +485,14 @@ class SparkConnectPlanner(val session: SparkSession) {
         logical.MapInPandas(
           pythonUdf,
           pythonUdf.dataType.asInstanceOf[StructType].toAttributes,
-          transformRelation(rel.getInput))
+          transformRelation(rel.getInput),
+          false)
       case PythonEvalType.SQL_MAP_ARROW_ITER_UDF =>
         logical.PythonMapInArrow(
           pythonUdf,
           pythonUdf.dataType.asInstanceOf[StructType].toAttributes,
-          transformRelation(rel.getInput))
+          transformRelation(rel.getInput),
+          false)
       case _ =>
         throw InvalidPlanInput(s"Function with EvalType: ${pythonUdf.evalType} is not supported")
     }

diff --git a/python/pyspark/sql/pandas/map_ops.py b/python/pyspark/sql/pandas/map_ops.py
@@ -32,7 +32,9 @@ class PandasMapOpsMixin:
     """
 
     def mapInPandas(
-        self, func: "PandasMapIterFunction", schema: Union[StructType, str], is_barrier: bool
+        self,
+        func: "PandasMapIterFunction", schema: Union[StructType, str],
+        is_barrier: bool = False
     ) -> "DataFrame":
         """
         Maps an iterator of batches in the current :class:`DataFrame` using a Python native
@@ -60,6 +62,7 @@ def mapInPandas(
         schema : :class:`pyspark.sql.types.DataType` or str
             the return type of the `func` in PySpark. The value can be either a
             :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
+        is_barrier : Use barrier mode execution if True.
 
         Examples
         --------
@@ -97,7 +100,9 @@ def mapInPandas(
         return DataFrame(jdf, self.sparkSession)
 
     def mapInArrow(
-        self, func: "ArrowMapIterFunction", schema: Union[StructType, str], is_barrier: bool
+        self,
+        func: "ArrowMapIterFunction", schema: Union[StructType, str],
+        is_barrier: bool = False
     ) -> "DataFrame":
         """
         Maps an iterator of batches in the current :class:`DataFrame` using a Python native
@@ -122,6 +127,7 @@ def mapInArrow(
         schema : :class:`pyspark.sql.types.DataType` or str
             the return type of the `func` in PySpark. The value can be either a
             :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
+        is_barrier : Use barrier mode execution if True.
 
         Examples
         --------

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInBatchExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/MapInBatchExec.scala
@@ -52,7 +52,7 @@ trait MapInBatchExec extends UnaryExecNode with PythonSQLMetrics {
   override def outputPartitioning: Partitioning = child.outputPartitioning
 
   override protected def doExecute(): RDD[InternalRow] = {
-    val resultRDD = child.execute().mapPartitionsInternal { inputIter =>
+    def mapper(inputIter: Iterator[InternalRow]): Iterator[InternalRow] = {
       // Single function with one struct.
       val argOffsets = Array(Array(0))
       val chainedFunc = Seq(ChainedPythonFunctions(Seq(pythonFunction)))
@@ -92,10 +92,11 @@ trait MapInBatchExec extends UnaryExecNode with PythonSQLMetrics {
         flattenedBatch.rowIterator.asScala
       }.map(unsafeProj)
     }
+
     if (isBarrier) {
-      resultRDD.barrier().mapPartitions(iter => iter)
+      child.execute().barrier().mapPartitions(mapper)
     } else {
-      resultRDD
+      child.execute().mapPartitionsInternal(mapper)
     }
   }
 }