Address comments
icexelloss committed Jun 12, 2018
commit 328b2c4e09502a66939d47d6967ceea7ceab6c8c
3 changes: 2 additions & 1 deletion python/pyspark/sql/functions.py
@@ -2613,7 +2613,8 @@ def pandas_udf(f=None, returnType=None, functionType=None):
>>> @pandas_udf("double", PandasUDFType.GROUPED_AGG) # doctest: +SKIP
Member:
So we don't have PandasUDFType.WINDOW_AGG, and a pandas UDF defined as PandasUDFType.GROUPED_AGG can be used with both groupby and Window?

Contributor Author:
Yes, exactly. The idea is that the producer can define a grouped aggregate UDF, such as a weighted mean, and the consumer can use it with both groupby and window, similar to how SQL aggregate functions work.
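For illustration, a minimal sketch of that dual usage (assuming an active SparkSession and a df with columns id and v; the plain mean below stands in for a weighted mean):

from pyspark.sql import Window
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("double", PandasUDFType.GROUPED_AGG)
def my_mean(v):
    # Aggregates a pandas Series to a single scalar.
    return v.mean()

# Consumer 1: groupby aggregation.
df.groupby('id').agg(my_mean(df['v']).alias('mean_v')).show()

# Consumer 2: window aggregation over an unbounded frame.
w = Window.partitionBy('id') \
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df.withColumn('mean_v', my_mean(df['v']).over(w)).show()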

... def mean_udf(v):
... return v.mean()
- >>> w = Window.partitionBy('id')
+ >>> w = Window.partitionBy('id') \\
+ ...     .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
>>> df.withColumn('mean_v', mean_udf(df['v']).over(w)).show() # doctest: +SKIP
+---+----+------+
| id| v|mean_v|
2 changes: 1 addition & 1 deletion python/pyspark/sql/tests.py
@@ -5577,7 +5577,7 @@ def test_multiple_udfs(self):

result1 = df.withColumn('mean_v', self.pandas_agg_mean_udf(df['v']).over(w)) \
.withColumn('max_v', self.pandas_agg_max_udf(df['v']).over(w)) \
-     .withColumn('min_w', self.pandas_agg_min_udf(df['w']).over(w)) \
+     .withColumn('min_w', self.pandas_agg_min_udf(df['w']).over(w))

expected1 = df.withColumn('mean_v', mean(df['v']).over(w)) \
.withColumn('max_v', max(df['v']).over(w)) \
4 changes: 4 additions & 0 deletions python/pyspark/worker.py
@@ -129,6 +129,10 @@ def wrapped(*series):


def wrap_window_agg_pandas_udf(f, return_type):
+ # This is similar to grouped_agg_pandas_udf; the only difference
+ # is that window_agg_pandas_udf needs to repeat the return value
+ # to match the window length, whereas grouped_agg_pandas_udf just
+ # returns the scalar value.
arrow_return_type = to_arrow_type(return_type)

def wrapped(*series):
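For intuition, a minimal sketch of the repeat behavior described in the comment above (hypothetical names; the real wrapper also converts results to the Arrow return type):

import pandas as pd

def wrap_window_agg_sketch(f):
    def wrapped(*series):
        scalar = f(*series)   # aggregate the window's rows to one value
        n = len(series[0])    # the window length
        # Repeat the scalar so every row in the window gets the value,
        # unlike the grouped-agg wrapper, which returns the scalar as-is.
        return pd.Series([scalar]).repeat(n).reset_index(drop=True)
    return wrapped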
@@ -116,7 +116,7 @@ trait CheckAnalysis extends PredicateHelper {
case _ @ WindowExpression(_: PythonUDF,
WindowSpecDefinition(_, _, frame: SpecifiedWindowFrame))
if !frame.isUnbounded =>
failAnalysis(s"Only unbounded window frame is supported with Pandas UDFs.")
failAnalysis("Only unbounded window frame is supported with Pandas UDFs.")

case w @ WindowExpression(e, s) =>
// Only allow window functions with an aggregate expression or an offset window
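From the Python side, a hedged sketch of what this check rejects (reusing the hypothetical my_mean and df from the example above):

w_bounded = Window.partitionBy('id').rowsBetween(-1, 1)  # a bounded frame
# Expected to fail analysis with:
# "Only unbounded window frame is supported with Pandas UDFs."
df.withColumn('mean_v', my_mean(df['v']).over(w_bounded))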
@@ -39,10 +39,9 @@ object PythonUDF {
e.asInstanceOf[PythonUDF].evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF
}

- def isWindowPandasUDF(e: Expression): Boolean = {
-   e.isInstanceOf[PythonUDF] &&
-     e.asInstanceOf[PythonUDF].evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF
- }
+ // This is currently the same as GroupedAggPandasUDF, but we might support new types
+ // in the future, e.g. an N -> N transform.
+ def isWindowPandasUDF(e: Expression): Boolean = isGroupedAggPandasUDF(e)
}

/**
@@ -313,7 +313,7 @@ object WindowFunctionType {
case udf: PythonUDF if PythonUDF.isWindowPandasUDF(udf) => Python
}

- // Normally a window expression would either have either a SQL window function, a SQL
+ // Normally a window expression would either have a SQL window function, a SQL
// aggregate function or a python window UDF. However, sometimes the optimizer will replace
// the window function if the value of the window function can be predetermined.
// For example, for query:
@@ -279,7 +279,7 @@ object PhysicalAggregation {
*/
object PhysicalWindow {
// windowFunctionType, windowExpression, partitionSpec, orderSpec, child
- type ReturnType =
+ private type ReturnType =
(WindowFunctionType, Seq[NamedExpression], Seq[Expression], Seq[SortOrder], LogicalPlan)

def unapply(a: Any): Option[ReturnType] = a match {
@@ -439,6 +439,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
WindowFunctionType.Python, windowExprs, partitionSpec, orderSpec, child) =>
execution.python.WindowInPandasExec(
windowExprs, partitionSpec, orderSpec, planLater(child)) :: Nil
Member:
Tiny nit: I would add a newline below.

Contributor Author:
Added.


case _ => Nil
}
}
@@ -80,7 +80,7 @@ case class WindowInPandasExec(
* @return the final resulting projection.
*/
private[this] def createResultProjection(expressions: Seq[Expression]): UnsafeProjection = {
- val references = expressions.zipWithIndex.map{ case (e, i) =>
+ val references = expressions.zipWithIndex.map { case (e, i) =>
// Results of window expressions will be on the right side of child's output
BoundReference(child.output.size + i, e.dataType, e.nullable)
}