[SPARK-26024][SQL]: Update documentation for repartitionByRange #23025
Changes from all commits:

- b47b6d0
- 5a50282
- 5bce520
- 654fed9
- f829dfe
- 7ca4821
```diff
@@ -732,6 +732,11 @@ def repartitionByRange(self, numPartitions, *cols):
         At least one partition-by expression must be specified.
         When no explicit sort order is specified, "ascending nulls first" is assumed.
+
+        Note that due to performance reasons this method uses sampling to estimate the ranges.
```
> **Member:** Besides Python, we also have

> **Contributor (author):** Oh right, I missed it! Pushed.
```diff
+        Hence, the output may not be consistent, since sampling can return different values.
+        The sample size can be controlled by the config
+        `spark.sql.execution.rangeExchange.sampleSizePerPartition`.
+
         >>> df.repartitionByRange(2, "age").rdd.getNumPartitions()
         2
         >>> df.show()
```
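The nondeterminism the new docstring warns about comes from the sampling step: range partitioning first samples the data to choose split points, so different samples can yield different partition bounds. A minimal plain-Python sketch of the idea (not Spark's actual implementation; `range_bounds` and its parameters are illustrative):

```python
import random

def range_bounds(rows, num_partitions, sample_size, seed):
    """Estimate range-partition split points from a random sample (simplified sketch)."""
    sample = sorted(random.Random(seed).sample(rows, sample_size))
    # Pick num_partitions - 1 split points at evenly spaced positions in the sorted sample.
    step = len(sample) / num_partitions
    return [sample[int(step * i)] for i in range(1, num_partitions)]

rows = list(range(1000))
# Different samples (seeds) can produce different bounds, hence inconsistent output.
print(range_bounds(rows, 4, sample_size=20, seed=1))
print(range_bounds(rows, 4, sample_size=20, seed=2))
```

This is only meant to show why the doctest above asserts the partition *count* (always 2) rather than partition *contents*, which can vary between runs.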
```diff
@@ -2789,6 +2789,12 @@ class Dataset[T] private[sql](
   * When no explicit sort order is specified, "ascending nulls first" is assumed.
   * Note, the rows are not sorted in each partition of the resulting Dataset.
   *
+  * Note that due to performance reasons this method uses sampling to estimate the ranges.
+  * Hence, the output may not be consistent, since sampling can return different values.
+  * The sample size can be controlled by the config
+  * `spark.sql.execution.rangeExchange.sampleSizePerPartition`.
```
> **Contributor:** It's not a parameter but a config. So I'd like to propose

> **Contributor (author):** @cloud-fan the sentence has been changed according to your suggestion (in both Spark & PySpark).
```diff
+  *
   * @group typedrel
   * @since 2.3.0
   */
@@ -2813,6 +2819,11 @@ class Dataset[T] private[sql](
   * When no explicit sort order is specified, "ascending nulls first" is assumed.
   * Note, the rows are not sorted in each partition of the resulting Dataset.
   *
+  * Note that due to performance reasons this method uses sampling to estimate the ranges.
+  * Hence, the output may not be consistent, since sampling can return different values.
+  * The sample size can be controlled by the config
+  * `spark.sql.execution.rangeExchange.sampleSizePerPartition`.
+  *
   * @group typedrel
   * @since 2.3.0
   */
```
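The config named in the doc text controls how many rows are sampled per partition; a larger sample makes the estimated range bounds more stable from run to run. A rough plain-Python illustration of that trade-off (again a sketch, not Spark code; `estimate_split` is a made-up helper):

```python
import random
import statistics

def estimate_split(rows, sample_size, seed):
    """Median of a random sample: the simplest estimate of a 2-way range split."""
    sample = sorted(random.Random(seed).sample(rows, sample_size))
    return sample[len(sample) // 2]

rows = list(range(10_000))
for sample_size in (10, 1000):
    # Re-estimate the split point across many runs; larger samples vary less.
    splits = [estimate_split(rows, sample_size, seed) for seed in range(50)]
    print(sample_size, round(statistics.stdev(splits), 1))
```

Raising `spark.sql.execution.rangeExchange.sampleSizePerPartition` trades extra sampling work for more consistent partition boundaries, which is exactly the behavior the added doc sentence describes.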
> **Contributor:** This won't be formatted correctly in the R doc, because an "empty line" is significant there. L769 should be removed to ensure it is in the description.
> **Contributor (author):** I see. What about on 761? I see several docs around here with empty lines (829, 831 below). Are those different? These comments are secondary, but I guess they belong in the public docs as much as anything.
> **Contributor:** 761 is significant also, but correct.
>
> Essentially, you want the "important" part of the description on top and not in the "detail" section, because the detail section is easily missed.
>
> This is the most common pattern in this code base. There is another, where multiple functions are documented together as a group, e.g. the collection SQL functions (in functions.R). Other, finer control is possible as well but is not used in this code base today.
>
> Similarly, L829 is good; L831 is a bit fuzzy. I'd personally prefer it without L831, to keep the whole text in the description section of the doc. Generally, if the doc text starts with "Note that", I'm OK with it being in the "detail" section.
> **Contributor:** @felixcheung have a look at #23167
> **Contributor:** @felixcheung Thanks, I did not know about this strict doc formatting rule in R.
>
> @srowen Thanks for taking care of the fix!