Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ exportMethods("arrange",
"transform",
"union",
"unionAll",
"unionByName",
"unique",
"unpersist",
"where",
Expand Down
38 changes: 36 additions & 2 deletions R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -2683,7 +2683,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
#' @rdname union
#' @name union
#' @aliases union,SparkDataFrame,SparkDataFrame-method
#' @seealso \link{rbind}
#' @seealso \link{rbind} \link{unionByName}
#' @export
#' @examples
#'\dontrun{
Expand Down Expand Up @@ -2714,6 +2714,40 @@ setMethod("unionAll",
union(x, y)
})

#' Return a new SparkDataFrame containing the union of rows, matched by column names
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'd suggest a slightly different title - this one uses the same words as union

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I just addressed both comments.

#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
#' and another SparkDataFrame. This is different from \code{union} function, and both
#' \code{UNION ALL} and \code{UNION DISTINCT} in SQL as column positions are not taken
#' into account.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd list union() here too

#' Input SparkDataFrames can have different data types in the schema.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#' This function resolves columns by name (not by position).
#'
#' @param x A SparkDataFrame
#' @param y A SparkDataFrame
#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
#' @rdname unionByName
#' @name unionByName
#' @aliases unionByName,SparkDataFrame,SparkDataFrame-method
#' @seealso \link{rbind} \link{union}
#' @export
#' @examples
#'\dontrun{
#' sparkR.session()
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
#' head(unionByName(df1, df2))
#' }
#' @note unionByName since 2.3.0
setMethod("unionByName",
          signature(x = "SparkDataFrame", y = "SparkDataFrame"),
          function(x, y) {
            # Delegate to the JVM DataFrame's by-name union and wrap the
            # resulting Java object back into a SparkDataFrame.
            dataFrame(callJMethod(x@sdf, "unionByName", y@sdf))
          })

#' Union two or more SparkDataFrames
#'
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
Expand All @@ -2730,7 +2764,7 @@ setMethod("unionAll",
#' @aliases rbind,SparkDataFrame-method
#' @rdname rbind
#' @name rbind
#' @seealso \link{union}
#' @seealso \link{union} \link{unionByName}
#' @export
#' @examples
#'\dontrun{
Expand Down
4 changes: 4 additions & 0 deletions R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,10 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
#' @export
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

#' @rdname unionByName
#' @export
setGeneric("unionByName",
           function(x, y) {
             standardGeneric("unionByName")
           })

#' @rdname unpersist
#' @export
setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })
Expand Down
9 changes: 8 additions & 1 deletion R/pkg/tests/fulltests/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -2255,7 +2255,7 @@ test_that("isLocal()", {
expect_false(isLocal(df))
})

test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataFrame", {
df <- read.json(jsonPath)

lines <- c("{\"name\":\"Bob\", \"age\":24}",
Expand All @@ -2271,6 +2271,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
expect_equal(first(unioned)$name, "Michael")
expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)

df1 <- select(df2, "age", "name")
unioned1 <- arrange(unionByName(df1, df), df1$age)
# Fix: assert on unioned1 (the by-name union just computed), not on the
# earlier positional-union result 'unioned' — otherwise unionByName is
# never actually exercised by these expectations.
expect_is(unioned1, "SparkDataFrame")
expect_equal(count(unioned1), 6)
# Here, we test if 'Michael' in df is correctly mapped to the same name.
expect_equal(first(unioned1)$name, "Michael")

unioned2 <- arrange(rbind(unioned, df, df2), df$age)
expect_is(unioned2, "SparkDataFrame")
expect_equal(count(unioned2), 12)
Expand Down
28 changes: 25 additions & 3 deletions python/pyspark/sql/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,7 +1290,7 @@ def union(self, other):
""" Return a new :class:`DataFrame` containing union of rows in this and another frame.

This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
(that does deduplication of elements), use this function followed by a distinct.
(that does deduplication of elements), use this function followed by :func:`distinct`.

Also as standard in SQL, this function resolves columns by position (not by name).
"""
Expand All @@ -1301,14 +1301,36 @@ def unionAll(self, other):
""" Return a new :class:`DataFrame` containing union of rows in this and another frame.

This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
(that does deduplication of elements), use this function followed by a distinct.
(that does deduplication of elements), use this function followed by :func:`distinct`.

Also as standard in SQL, this function resolves columns by position (not by name).

.. note:: Deprecated in 2.0, use union instead.
.. note:: Deprecated in 2.0, use :func:`union` instead.
"""
return self.union(other)

@since(2.3)
def unionByName(self, other):
    """ Returns a new :class:`DataFrame` containing union of rows in this and another frame.

    This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
    union (that does deduplication of elements), use this function followed by :func:`distinct`.

    The difference between this function and :func:`union` is that this function
    resolves columns by name (not by position):

    >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
    >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
    >>> df1.unionByName(df2).show()
    +----+----+----+
    |col0|col1|col2|
    +----+----+----+
    |   1|   2|   3|
    |   6|   4|   5|
    +----+----+----+
    """
    # Delegate to the JVM-side DataFrame and wrap the result in a new
    # Python DataFrame bound to the same SQL context.
    unioned_jdf = self._jdf.unionByName(other._jdf)
    return DataFrame(unioned_jdf, self.sql_ctx)

@since(1.3)
def intersect(self, other):
""" Return a new :class:`DataFrame` containing rows only in
Expand Down