add description

apache · felixcheung · Jan 30, 2017 · Jan 30, 2017 · Jan 30, 2017 · Feb 1, 2017
commit bf2373f260a2af4a8841c0b440e86979de9c98e0
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
@@ -686,6 +686,13 @@ setMethod("storageLevel",
 #' the current partitions. If a larger number of partitions is requested, it will stay at the
 #' current number of partitions.
 #'
+#' However, if you're doing a drastic coalesce on a SparkDataFrame, e.g. to numPartitions = 1,
+#' this may result in your computation taking place on fewer nodes than
+#' you like (e.g. one node in the case of numPartitions = 1). To avoid this,
+#' call \code{repartition}. This will add a shuffle step, but means the
+#' current upstream partitions will be executed in parallel (per whatever
+#' the current partitioning is).
+#'
 #' @param numPartitions the number of partitions to use.
 #'
 #' @family SparkDataFrame functions

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
@@ -518,6 +518,13 @@ def coalesce(self, numPartitions):
         claim 10 of the current partitions. If a larger number of partitions is requested,
         it will stay at the current number of partitions.
 
+        However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,
+        this may result in your computation taking place on fewer nodes than
+        you like (e.g. one node in the case of numPartitions = 1). To avoid this,
+        you can call repartition(). This will add a shuffle step, but means the
+        current upstream partitions will be executed in parallel (per whatever
+        the current partitioning is).
+
         >>> df.coalesce(1).rdd.getNumPartitions()
         1
         """

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -2435,6 +2435,13 @@ class Dataset[T] private[sql](
    * the 100 new partitions will claim 10 of the current partitions.  If a larger number of
    * partitions is requested, it will stay at the current number of partitions.
    *
+   * However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,
+   * this may result in your computation taking place on fewer nodes than
+   * you like (e.g. one node in the case of numPartitions = 1). To avoid this,
+   * you can call repartition. This will add a shuffle step, but means the
+   * current upstream partitions will be executed in parallel (per whatever
+   * the current partitioning is).
+   *
    * @group typedrel
    * @since 1.6.0
    */

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala
@@ -543,6 +543,13 @@ case class UnionExec(children: Seq[SparkPlan]) extends SparkPlan {
  * if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of
  * the 100 new partitions will claim 10 of the current partitions.  If a larger number of partitions
  * is requested, it will stay at the current number of partitions.
+ *
+ * However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,
+ * this may result in your computation taking place on fewer nodes than
+ * you like (e.g. one node in the case of numPartitions = 1). To avoid this,
+ * see ShuffleExchange. This will add a shuffle step, but means the
+ * current upstream partitions will be executed in parallel (per whatever
+ * the current partitioning is).
  */
 case class CoalesceExec(numPartitions: Int, child: SparkPlan) extends UnaryExecNode {
   override def output: Seq[Attribute] = child.output