[SPARK-24187][R][SQL]Adding array_join function to SparkR

apache · huaxingao · May 13, 2018 · May 13, 2018 · May 14, 2018 · Jun 3, 2018
commit 0c6ca7da2009ddbcfce49d3639223058a6f9d818
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -201,6 +201,7 @@ exportMethods("%<=>%",
               "approxCountDistinct",
               "approxQuantile",
               "array_contains",
+              "array_join",
               "array_max",
               "array_min",
               "array_position",

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
@@ -222,6 +222,9 @@ NULL
 #' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
 #' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
-#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))}
+#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
+#' head(select(tmp3, element_at(tmp3$v3, "Valiant")))
+#' tmp4 <- mutate(df, v4 = create_array(df$model, df$model))
+#' head(select(tmp4, array_join(tmp4$v4, "#"), array_join(tmp4$v4, "#", "NULL")))}
 NULL
 
 #' Window functions for Column operations
@@ -3006,6 +3009,28 @@ setMethod("array_contains",
             column(jc)
           })
 
+#' @details
+#' \code{array_join}: Concatenates the elements of column using the delimiter.
+#' Null values are replaced with \code{null_replacement} if set, otherwise they are ignored.
+#'
+#' @param delimiter character(s) to use to concatenate the elements of column.
+#' @param null_replacement string used in place of Null values; NA or NULL ignores them.
+#' @rdname column_collection_functions
+#' @aliases array_join array_join,Column-method
+#' @note array_join since 2.4.0
+setMethod("array_join",
+         signature(x = "Column"),
+         function(x, delimiter, null_replacement = NA) {
+           # is.na(NULL) is logical(0) and would make `if` fail, so test for NULL first.
+           jc <- if (is.null(null_replacement) || is.na(null_replacement)) {
+             callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter)
+           } else {
+             callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter,
+                         as.character(null_replacement))
+           }
+           column(jc)
+         })
+
 #' @details
 #' \code{array_max}: Returns the maximum value of the array.
 #'

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
@@ -757,6 +757,10 @@ setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCoun
 #' @name NULL
 setGeneric("array_contains", function(x, value) { standardGeneric("array_contains") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_join", function(x, delimiter, ...) { standardGeneric("array_join") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("array_max", function(x) { standardGeneric("array_max") })

diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1518,6 +1518,16 @@ test_that("column functions", {
   result <- collect(select(df, arrays_overlap(df[[1]], df[[2]])))[[1]]
   expect_equal(result, c(TRUE, FALSE, NA))
 
+  # Test array_join()
+  df <- createDataFrame(list(list(list("Hello", "World!"))))
+  result <- collect(select(df, array_join(df[[1]], "#")))[[1]]
+  expect_equal(result, "Hello#World!")
+  df2 <- createDataFrame(list(list(list("Hello", NA, "World!"))))
+  result <- collect(select(df2, array_join(df2[[1]], "#", "Beautiful")))[[1]]
+  expect_equal(result, "Hello#Beautiful#World!")
+  result <- collect(select(df2, array_join(df2[[1]], "#")))[[1]]
+  expect_equal(result, "Hello#World!")
+
   # Test array_sort() and sort_array()
   df <- createDataFrame(list(list(list(2L, 1L, 3L, NA)), list(list(NA, 6L, 5L, NA, 4L))))