-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21584][SQL][SparkR] Update R method for summary to call new implementation #18786
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2930,7 +2930,7 @@ setMethod("saveAsTable", | |
| invisible(callJMethod(write, "saveAsTable", tableName)) | ||
| }) | ||
|
|
||
| #' summary | ||
| #' describe | ||
| #' | ||
| #' Computes statistics for numeric and string columns. | ||
| #' If no columns are given, this function computes statistics for all numerical or string columns. | ||
|
|
@@ -2941,7 +2941,7 @@ setMethod("saveAsTable", | |
| #' @return A SparkDataFrame. | ||
| #' @family SparkDataFrame functions | ||
| #' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method | ||
| #' @rdname summary | ||
| #' @rdname describe | ||
| #' @name describe | ||
| #' @export | ||
| #' @examples | ||
|
|
@@ -2953,6 +2953,7 @@ setMethod("saveAsTable", | |
| #' describe(df, "col1") | ||
| #' describe(df, "col1", "col2") | ||
| #' } | ||
| #' @seealso Use \code{\link{summary}} for expanded statistics and control over which statistics to compute. | ||
| #' @note describe(SparkDataFrame, character) since 1.4.0 | ||
| setMethod("describe", | ||
| signature(x = "SparkDataFrame", col = "character"), | ||
|
|
@@ -2962,7 +2963,7 @@ setMethod("describe", | |
| dataFrame(sdf) | ||
| }) | ||
|
|
||
| #' @rdname summary | ||
| #' @rdname describe | ||
| #' @name describe | ||
| #' @aliases describe,SparkDataFrame-method | ||
| #' @note describe(SparkDataFrame) since 1.4.0 | ||
|
|
@@ -2973,15 +2974,51 @@ setMethod("describe", | |
| dataFrame(sdf) | ||
| }) | ||
|
|
||
| #' summary | ||
| #' | ||
| #' Computes specified statistics for numeric and string columns. | ||
|
||
| #' | ||
| #' Available statistics are: | ||
| #' | ||
| #' - count | ||
| #' - mean | ||
| #' - stddev | ||
| #' - min | ||
| #' - max | ||
|
||
| #' - arbitrary approximate percentiles specified as a percentage (e.g., 75%) | ||
|
||
| #' | ||
| #' If no statistics are given, this function computes count, mean, stddev, min, | ||
| #' approximate quartiles (percentiles at 25%, 50%, and 75%), and max. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, don't use empty line - like #' - the 2nd paragraph after such empty line becomes the "details" section in the doc as formatted by roxygen2 |
||
| #' | ||
| #' This function is meant for exploratory data analysis, as we make no guarantee about the | ||
| #' backward compatibility of the schema of the resulting Dataset. If you want to | ||
| #' programmatically compute summary statistics, use the `agg` function instead. | ||
|
||
| #' | ||
| #' | ||
| #' @param object a SparkDataFrame to be summarized. | ||
| #' @param ... (optional) statistics to be computed for all columns. | ||
| #' @rdname summary | ||
| #' @name summary | ||
| #' @aliases summary,SparkDataFrame-method | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should have a
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should have a |
||
| #' @export | ||
| #' @examples | ||
| #'\dontrun{ | ||
| #' sparkR.session() | ||
| #' path <- "path/to/file.json" | ||
| #' df <- read.json(path) | ||
| #' summary(df) | ||
| #' summary(df, "min", "25%", "75%", "max") | ||
| #' summary(select(df, "age", "height")) | ||
| #' } | ||
| #' @note summary(SparkDataFrame) since 1.5.0 | ||
| #' @note The statistics provided by \code{summary} were changed in 2.3.0; use \code{\link{describe}} for the previous defaults. | ||
| #' @seealso \code{\link{describe}} | ||
|
||
| setMethod("summary", | ||
| signature(object = "SparkDataFrame"), | ||
| function(object, ...) { | ||
| describe(object) | ||
| statisticsList <- list(...) | ||
| sdf <- callJMethod(object@sdf, "summary", statisticsList) | ||
| dataFrame(sdf) | ||
| }) | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2489,7 +2489,7 @@ test_that("read/write text files - compression option", { | |
| unlink(textPath) | ||
| }) | ||
|
|
||
| test_that("describe() and summarize() on a DataFrame", { | ||
| test_that("describe() and summary() on a DataFrame", { | ||
| df <- read.json(jsonPath) | ||
| stats <- describe(df, "age") | ||
| expect_equal(collect(stats)[1, "summary"], "count") | ||
|
|
@@ -2500,8 +2500,15 @@ test_that("describe() and summarize() on a DataFrame", { | |
| expect_equal(collect(stats)[5, "age"], "30") | ||
|
|
||
| stats2 <- summary(df) | ||
| expect_equal(collect(stats2)[4, "summary"], "min") | ||
| expect_equal(collect(stats2)[5, "age"], "30") | ||
| expect_equal(collect(stats2)[5, "summary"], "25%") | ||
| expect_equal(collect(stats2)[5, "age"], "30.0") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this mean this change the output of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes |
||
|
|
||
| stats3 <- summary(df, "min", "max", "55.1%") | ||
|
|
||
| expect_equal(collect(stats3)[1, "summary"], "min") | ||
| expect_equal(collect(stats3)[2, "summary"], "max") | ||
| expect_equal(collect(stats3)[3, "summary"], "55.1%") | ||
| expect_equal(collect(stats3)[3, "age"], "30.0") | ||
|
|
||
| # SPARK-16425: SparkR summary() fails on column of type logical | ||
| df <- withColumn(df, "boolean", df$age == 30) | ||
|
|
@@ -2734,15 +2741,15 @@ test_that("attach() on a DataFrame", { | |
| expected_age <- data.frame(age = c(NA, 30, 19)) | ||
| expect_equal(head(age), expected_age) | ||
| stat <- summary(age) | ||
| expect_equal(collect(stat)[5, "age"], "30") | ||
| expect_equal(collect(stat)[8, "age"], "30") | ||
| age <- age$age + 1 | ||
| expect_is(age, "Column") | ||
| rm(age) | ||
| stat2 <- summary(age) | ||
| expect_equal(collect(stat2)[5, "age"], "30") | ||
| expect_equal(collect(stat2)[8, "age"], "30") | ||
| detach("df") | ||
| stat3 <- summary(df[, "age", drop = F]) | ||
| expect_equal(collect(stat3)[5, "age"], "30") | ||
| expect_equal(collect(stat3)[8, "age"], "30") | ||
| expect_error(age) | ||
| }) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ues -> Use? Or should we say `See` here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also, I think `\link{summary}` is sufficient, no need for `\code`