Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ exportMethods("arrange",
"transform",
"union",
"unionAll",
"unionByName",
"unique",
"unpersist",
"where",
Expand Down
38 changes: 36 additions & 2 deletions R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -2683,7 +2683,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
#' @rdname union
#' @name union
#' @aliases union,SparkDataFrame,SparkDataFrame-method
#' @seealso \link{rbind}
#' @seealso \link{rbind} \link{unionByName}
#' @export
#' @examples
#'\dontrun{
Expand Down Expand Up @@ -2714,6 +2714,40 @@ setMethod("unionAll",
union(x, y)
})

#' Return a new SparkDataFrame containing the union of rows, matched by column names
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
#' and another SparkDataFrame. This is different from \code{union} function, and both
#' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
#' into account. Input SparkDataFrames can have different data types in the schema.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#' This function resolves columns by name (not by position).
#'
#' @param x A SparkDataFrame
#' @param y A SparkDataFrame
#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
#' @rdname unionByName
#' @name unionByName
#' @aliases unionByName,SparkDataFrame,SparkDataFrame-method
#' @seealso \link{rbind} \link{union}
#' @export
#' @examples
#'\dontrun{
#' sparkR.session()
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
#' head(unionByName(df1, df2))
#' }
#' @note unionByName since 2.3.0
setMethod("unionByName",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
dataFrame(unioned)
})

#' Union two or more SparkDataFrames
#'
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
Expand All @@ -2730,7 +2764,7 @@ setMethod("unionAll",
#' @aliases rbind,SparkDataFrame-method
#' @rdname rbind
#' @name rbind
#' @seealso \link{union}
#' @seealso \link{union} \link{unionByName}
#' @export
#' @examples
#'\dontrun{
Expand Down
4 changes: 4 additions & 0 deletions R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,10 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
#' @export
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

#' @rdname unionByName
#' @export
setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })

#' @rdname unpersist
#' @export
setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })
Expand Down
9 changes: 8 additions & 1 deletion R/pkg/tests/fulltests/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -2255,7 +2255,7 @@ test_that("isLocal()", {
expect_false(isLocal(df))
})

test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataFrame", {
df <- read.json(jsonPath)

lines <- c("{\"name\":\"Bob\", \"age\":24}",
Expand All @@ -2271,6 +2271,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
expect_equal(first(unioned)$name, "Michael")
expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)

df1 <- select(df2, "age", "name")
unioned1 <- arrange(unionByName(df1, df), df1$age)
expect_is(unioned, "SparkDataFrame")
expect_equal(count(unioned), 6)
# Here, we test if 'Michael' in df is correctly mapped to the same name.
expect_equal(first(unioned)$name, "Michael")

unioned2 <- arrange(rbind(unioned, df, df2), df$age)
expect_is(unioned2, "SparkDataFrame")
expect_equal(count(unioned2), 12)
Expand Down
28 changes: 25 additions & 3 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,7 +1290,7 @@ def union(self, other):
""" Return a new :class:`DataFrame` containing union of rows in this and another frame.

This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
(that does deduplication of elements), use this function followed by a distinct.
(that does deduplication of elements), use this function followed by :func:`distinct`.

Also as standard in SQL, this function resolves columns by position (not by name).
"""
Expand All @@ -1301,14 +1301,36 @@ def unionAll(self, other):
""" Return a new :class:`DataFrame` containing union of rows in this and another frame.

This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
(that does deduplication of elements), use this function followed by a distinct.
(that does deduplication of elements), use this function followed by :func:`distinct`.

Also as standard in SQL, this function resolves columns by position (not by name).

.. note:: Deprecated in 2.0, use union instead.
.. note:: Deprecated in 2.0, use :func:`union` instead.
"""
return self.union(other)

@since(2.3)
def unionByName(self, other):
""" Returns a new :class:`DataFrame` containing union of rows in this and another frame.

This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
union (that does deduplication of elements), use this function followed by :func:`distinct`.

The difference between this function and :func:`union` is that this function
resolves columns by name (not by position):

>>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
>>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
>>> df1.unionByName(df2).show()
+----+----+----+
|col0|col1|col2|
+----+----+----+
| 1| 2| 3|
| 6| 4| 5|
+----+----+----+
"""
return DataFrame(self._jdf.unionByName(other._jdf), self.sql_ctx)

@since(1.3)
def intersect(self, other):
""" Return a new :class:`DataFrame` containing rows only in
Expand Down