Skip to content

Commit 6032268

Browse files
committed
Merge remote-tracking branch 'upstream/master' into native-ddl
Conflicts: sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
2 parents 53d02f0 + c7fccb5 commit 6032268

File tree

297 files changed

+3805
-1856
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

297 files changed

+3805
-1856
lines changed

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ exportMethods("%in%",
111111
"add_months",
112112
"alias",
113113
"approxCountDistinct",
114+
"approxQuantile",
114115
"array_contains",
115116
"asc",
116117
"ascii",

R/pkg/R/generics.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,13 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
6767
# @export
6868
setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
6969

70+
# @rdname statfunctions
71+
# @export
72+
setGeneric("approxQuantile",
73+
function(x, col, probabilities, relativeError) {
74+
standardGeneric("approxQuantile")
75+
})
76+
7077
# @rdname distinct
7178
# @export
7279
setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })

R/pkg/R/stats.R

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,45 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
130130
collect(dataFrame(sct))
131131
})
132132

133+
#' approxQuantile
134+
#'
135+
#' Calculates the approximate quantiles of a numerical column of a DataFrame.
136+
#'
137+
#' The result of this algorithm has the following deterministic bound:
138+
#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error
139+
#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank
140+
#' of `x` is close to (p * N). More precisely,
141+
#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
142+
#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
143+
#' optimizations). The algorithm was first presented in [[http://dx.doi.org/10.1145/375663.375670
144+
#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
145+
#'
146+
#' @param x A SparkSQL DataFrame.
147+
#' @param col The name of the numerical column.
148+
#' @param probabilities A numeric vector of quantile probabilities. Each must belong to [0, 1].
149+
#' For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
150+
#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
151+
#' the exact quantiles are computed, which could be very expensive.
152+
#' Note that values greater than 1 are accepted but give the same result as 1.
153+
#' @return The approximate quantiles at the given probabilities.
154+
#'
155+
#' @rdname statfunctions
156+
#' @name approxQuantile
157+
#' @export
158+
#' @examples
159+
#' \dontrun{
160+
#' df <- jsonFile(sqlContext, "/path/to/file.json")
161+
#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
162+
#' }
163+
setMethod("approxQuantile",
164+
signature(x = "DataFrame", col = "character",
165+
probabilities = "numeric", relativeError = "numeric"),
166+
function(x, col, probabilities, relativeError) {
167+
statFunctions <- callJMethod(x@sdf, "stat")
168+
callJMethod(statFunctions, "approxQuantile", col,
169+
as.list(probabilities), relativeError)
170+
})
171+
133172
#' sampleBy
134173
#'
135174
#' Returns a stratified sample without replacement based on the fraction given on each stratum.

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1785,6 +1785,14 @@ test_that("sampleBy() on a DataFrame", {
17851785
expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
17861786
})
17871787

1788+
test_that("approxQuantile() on a DataFrame", {
1789+
l <- lapply(c(0:99), function(i) { i })
1790+
df <- createDataFrame(sqlContext, l, "key")
1791+
quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
1792+
expect_equal(quantiles[[1]], 50)
1793+
expect_equal(quantiles[[2]], 80)
1794+
})
1795+
17881796
test_that("SQL error message is returned from JVM", {
17891797
retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
17901798
expect_equal(grepl("Table not found: blah", retError), TRUE)

network/common/src/main/java/org/apache/spark/network/TransportContext.java renamed to common/network-common/src/main/java/org/apache/spark/network/TransportContext.java

File renamed without changes.

network/common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java renamed to common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java

File renamed without changes.

network/common/src/main/java/org/apache/spark/network/buffer/LazyFileRegion.java renamed to common/network-common/src/main/java/org/apache/spark/network/buffer/LazyFileRegion.java

File renamed without changes.

network/common/src/main/java/org/apache/spark/network/buffer/ManagedBuffer.java renamed to common/network-common/src/main/java/org/apache/spark/network/buffer/ManagedBuffer.java

File renamed without changes.

network/common/src/main/java/org/apache/spark/network/buffer/NettyManagedBuffer.java renamed to common/network-common/src/main/java/org/apache/spark/network/buffer/NettyManagedBuffer.java

File renamed without changes.

0 commit comments

Comments
 (0)