-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-26227][R] from_[csv|json] should accept schema_of_[csv|json] in R API #23184
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -202,8 +202,9 @@ NULL | |||
| #' \itemize{ | ||||
| #' \item \code{from_json}: a structType object to use as the schema to use | ||||
| #' when parsing the JSON string. Since Spark 2.3, the DDL-formatted string is | ||||
| #' also supported for the schema. | ||||
| #' \item \code{from_csv}: a DDL-formatted string | ||||
| #' also supported for the schema. Since Spark 3.0, \code{schema_of_json} or | ||||
| #' the DDL-formatted string literal can also be accepted. | ||||
| #' \item \code{from_csv}: a structType object, DDL-formatted string or \code{schema_of_csv} | ||||
| #' } | ||||
| #' @param ... additional argument(s). | ||||
| #' \itemize{ | ||||
|
|
@@ -2254,40 +2255,54 @@ setMethod("date_format", signature(y = "Column", x = "character"), | |||
| column(jc) | ||||
| }) | ||||
|
|
||||
| setClassUnion("characterOrstructTypeOrColumn", c("character", "structType", "Column")) | ||||
|
|
||||
| #' @details | ||||
| #' \code{from_json}: Parses a column containing a JSON string into a Column of \code{structType} | ||||
| #' with the specified \code{schema} or array of \code{structType} if \code{as.json.array} is set | ||||
| #' to \code{TRUE}. If the string is unparseable, the Column will contain the value NA. | ||||
| #' | ||||
| #' @rdname column_collection_functions | ||||
| #' @param as.json.array indicating if input string is JSON array of objects or a single object. | ||||
| #' @aliases from_json from_json,Column,characterOrstructType-method | ||||
| #' @aliases from_json from_json,Column,characterOrstructTypeOrColumn-method | ||||
| #' @examples | ||||
| #' | ||||
| #' \dontrun{ | ||||
| #' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") | ||||
| #' df2 <- mutate(df2, d2 = to_json(df2$d, dateFormat = 'dd/MM/yyyy')) | ||||
| #' schema <- structType(structField("date", "string")) | ||||
| #' head(select(df2, from_json(df2$d2, schema, dateFormat = 'dd/MM/yyyy'))) | ||||
|
|
||||
| #' df2 <- sql("SELECT named_struct('name', 'Bob') as people") | ||||
| #' df2 <- mutate(df2, people_json = to_json(df2$people)) | ||||
| #' schema <- structType(structField("name", "string")) | ||||
| #' head(select(df2, from_json(df2$people_json, schema))) | ||||
| #' head(select(df2, from_json(df2$people_json, "name STRING")))} | ||||
| #' head(select(df2, from_json(df2$people_json, "name STRING"))) | ||||
| #' head(select(df2, from_json(df2$people_json, schema_of_json(head(df2)$people_json))))} | ||||
| #' @note from_json since 2.2.0 | ||||
| setMethod("from_json", signature(x = "Column", schema = "characterOrstructType"), | ||||
| setMethod("from_json", signature(x = "Column", schema = "characterOrstructTypeOrColumn"), | ||||
| function(x, schema, as.json.array = FALSE, ...) { | ||||
| if (is.character(schema)) { | ||||
| schema <- structType(schema) | ||||
| jschema <- structType(schema)$jobj | ||||
| } else if (class(schema) == "structType") { | ||||
| jschema <- schema$jobj | ||||
| } else { | ||||
| jschema <- schema@jc | ||||
| } | ||||
|
|
||||
| if (as.json.array) { | ||||
| jschema <- callJStatic("org.apache.spark.sql.types.DataTypes", | ||||
| "createArrayType", | ||||
| schema$jobj) | ||||
| } else { | ||||
| jschema <- schema$jobj | ||||
| # This case is R-specifically different. Unlike Scala and Python side, | ||||
|
||||
| # R side has 'as.json.array' option to indicate if the schema should be | ||||
| # treated as struct or element type of array in order to make it more | ||||
| # R-friendly. | ||||
| if (class(schema) == "Column") { | ||||
| jschema <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", | ||||
| "createArrayType", | ||||
| jschema) | ||||
| } else { | ||||
| jschema <- callJStatic("org.apache.spark.sql.types.DataTypes", | ||||
| "createArrayType", | ||||
| jschema) | ||||
| } | ||||
| } | ||||
| options <- varargsToStrEnv(...) | ||||
| jc <- callJStatic("org.apache.spark.sql.functions", | ||||
|
|
@@ -2328,22 +2343,27 @@ setMethod("schema_of_json", signature(x = "characterOrColumn"), | |||
| #' If the string is unparseable, the Column will contain the value NA. | ||||
| #' | ||||
| #' @rdname column_collection_functions | ||||
| #' @aliases from_csv from_csv,Column,character-method | ||||
| #' @aliases from_csv from_csv,Column,characterOrstructTypeOrColumn-method | ||||
| #' @examples | ||||
| #' | ||||
| #' \dontrun{ | ||||
| #' df <- sql("SELECT 'Amsterdam,2018' as csv") | ||||
| #' csv <- "Amsterdam,2018" | ||||
| #' df <- sql(paste0("SELECT '", csv, "' as csv")) | ||||
| #' schema <- "city STRING, year INT" | ||||
| #' head(select(df, from_csv(df$csv, schema)))} | ||||
| #' head(select(df, from_csv(df$csv, schema))) | ||||
| #' head(select(df, from_csv(df$csv, structType(schema)))) | ||||
| #' head(select(df, from_csv(df$csv, schema_of_csv(csv))))} | ||||
| #' @note from_csv since 3.0.0 | ||||
| setMethod("from_csv", signature(x = "Column", schema = "characterOrColumn"), | ||||
| setMethod("from_csv", signature(x = "Column", schema = "characterOrstructTypeOrColumn"), | ||||
| function(x, schema, ...) { | ||||
| if (class(schema) == "Column") { | ||||
| jschema <- schema@jc | ||||
| } else if (is.character(schema)) { | ||||
| if (class(schema) == "structType") { | ||||
| schema <- callJMethod(schema$jobj, "toDDL") | ||||
| } | ||||
|
|
||||
| if (is.character(schema)) { | ||||
| jschema <- callJStatic("org.apache.spark.sql.functions", "lit", schema) | ||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, why in the case for from_json, if schema is character is …
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, yea, that looks a bit confusing. It's a similar reason. Fortunately, …
|
||||
| } else { | ||||
| stop("schema argument should be a column or character") | ||||
| jschema <- schema@jc | ||||
| } | ||||
| options <- varargsToStrEnv(...) | ||||
| jc <- callJStatic("org.apache.spark.sql.functions", | ||||
|
|
||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should probably try to pull all the setClassUnion calls into one place (to avoid conflict or duplication).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yup, I agree. Would you mind if I do this separately? I roughly checked by `grep` and it looks:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes