Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 40 additions & 4 deletions R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -2930,7 +2930,7 @@ setMethod("saveAsTable",
invisible(callJMethod(write, "saveAsTable", tableName))
})

#' summary
#' describe
#'
#' Computes statistics for numeric and string columns.
#' If no columns are given, this function computes statistics for all numerical or string columns.
Expand All @@ -2941,7 +2941,7 @@ setMethod("saveAsTable",
#' @return A SparkDataFrame.
#' @family SparkDataFrame functions
#' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method
#' @rdname summary
#' @rdname describe
#' @name describe
#' @export
#' @examples
Expand All @@ -2953,6 +2953,7 @@ setMethod("saveAsTable",
#' describe(df, "col1")
#' describe(df, "col1", "col2")
#' }
#' @seealso See \link{summary} for expanded statistics and control over which statistics to compute.
#' @note describe(SparkDataFrame, character) since 1.4.0
setMethod("describe",
signature(x = "SparkDataFrame", col = "character"),
Expand All @@ -2962,7 +2963,7 @@ setMethod("describe",
dataFrame(sdf)
})

#' @rdname summary
#' @rdname describe
#' @name describe
#' @aliases describe,SparkDataFrame-method
#' @note describe(SparkDataFrame) since 1.4.0
Expand All @@ -2973,15 +2974,50 @@ setMethod("describe",
dataFrame(sdf)
})

#' summary
#'
#' Computes specified statistics for numeric and string columns. Available statistics are:
#' \itemize{
#' \item count
#' \item mean
#' \item stddev
#' \item min
#' \item max
#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75%")
#' }
#' If no statistics are given, this function computes count, mean, stddev, min,
#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
#' This function is meant for exploratory data analysis, as we make no guarantee about the
#' backward compatibility of the schema of the resulting Dataset. If you want to
#' programmatically compute summary statistics, use the \code{agg} function instead.
#'
#'
#' @param object a SparkDataFrame to be summarized.
#' @param ... (optional) statistics to be computed for all columns.
#' @return A SparkDataFrame.
#' @family SparkDataFrame functions
#' @rdname summary
#' @name summary
#' @aliases summary,SparkDataFrame-method
#' @export
#' @examples
#'\dontrun{
#' sparkR.session()
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' summary(df)
#' summary(df, "min", "25%", "75%", "max")
#' summary(select(df, "age", "height"))
#' }
#' @note summary(SparkDataFrame) since 1.5.0
#' @note The statistics provided by \code{summary} were changed in 2.3.0; use \link{describe} for the previous defaults.
#' @seealso \link{describe}
setMethod("summary",
          signature(object = "SparkDataFrame"),
          function(object, ...) {
            # Gather the statistics passed through `...` into a list; an empty
            # list is forwarded when the caller names no statistics.
            statisticsList <- list(...)
            # Invoke the "summary" method on the underlying `object@sdf` handle
            # via callJMethod. The former `describe(object)` call was dropped:
            # its result was never used, so it only added redundant work.
            sdf <- callJMethod(object@sdf, "summary", statisticsList)
            # Wrap the returned handle with dataFrame() for the caller.
            dataFrame(sdf)
          })


Expand Down
2 changes: 1 addition & 1 deletion R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect")
# @export
setGeneric(
  "getNumPartitions",
  function(x) {
    # S4 generic: dispatch to the method registered for the class of `x`.
    standardGeneric("getNumPartitions")
  }
)

#' @rdname summary
#' @rdname describe
#' @export
setGeneric(
  "describe",
  function(x, col, ...) {
    # S4 generic: dispatch to the method registered for the classes of the
    # arguments.
    standardGeneric("describe")
  }
)

Expand Down
19 changes: 13 additions & 6 deletions R/pkg/tests/fulltests/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -2489,7 +2489,7 @@ test_that("read/write text files - compression option", {
unlink(textPath)
})

test_that("describe() and summarize() on a DataFrame", {
test_that("describe() and summary() on a DataFrame", {
df <- read.json(jsonPath)
stats <- describe(df, "age")
expect_equal(collect(stats)[1, "summary"], "count")
Expand All @@ -2500,8 +2500,15 @@ test_that("describe() and summarize() on a DataFrame", {
expect_equal(collect(stats)[5, "age"], "30")

stats2 <- summary(df)
expect_equal(collect(stats2)[4, "summary"], "min")
expect_equal(collect(stats2)[5, "age"], "30")
expect_equal(collect(stats2)[5, "summary"], "25%")
expect_equal(collect(stats2)[5, "age"], "30.0")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this mean this change the output of summary(df) call?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes


stats3 <- summary(df, "min", "max", "55.1%")

expect_equal(collect(stats3)[1, "summary"], "min")
expect_equal(collect(stats3)[2, "summary"], "max")
expect_equal(collect(stats3)[3, "summary"], "55.1%")
expect_equal(collect(stats3)[3, "age"], "30.0")

# SPARK-16425: SparkR summary() fails on column of type logical
df <- withColumn(df, "boolean", df$age == 30)
Expand Down Expand Up @@ -2734,15 +2741,15 @@ test_that("attach() on a DataFrame", {
expected_age <- data.frame(age = c(NA, 30, 19))
expect_equal(head(age), expected_age)
stat <- summary(age)
expect_equal(collect(stat)[5, "age"], "30")
expect_equal(collect(stat)[8, "age"], "30")
age <- age$age + 1
expect_is(age, "Column")
rm(age)
stat2 <- summary(age)
expect_equal(collect(stat2)[5, "age"], "30")
expect_equal(collect(stat2)[8, "age"], "30")
detach("df")
stat3 <- summary(df[, "age", drop = F])
expect_equal(collect(stat3)[5, "age"], "30")
expect_equal(collect(stat3)[8, "age"], "30")
expect_error(age)
})

Expand Down