From 0aa9ca776b1e058ef8b3193ba981dff1eef4d11f Mon Sep 17 00:00:00 2001 From: felixcheung Date: Fri, 13 Nov 2015 18:15:46 -0800 Subject: [PATCH 01/10] Add masked method tests for table, sample, lag, summary --- R/pkg/inst/tests/test_sparkSQL.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index d9a94faff7ac..eef028e1cf8a 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -433,6 +433,10 @@ test_that("table() returns a new DataFrame", { expect_is(tabledf, "DataFrame") expect_equal(count(tabledf), 3) dropTempTable(sqlContext, "table1") + + # Test base::table is working + a <- letters[1:3] + expect_equal(class(table(a, sample(a))), "table") }) test_that("toRDD() returns an RRDD", { @@ -673,6 +677,9 @@ test_that("sample on a DataFrame", { # Also test sample_frac sampled3 <- sample_frac(df, FALSE, 0.1, 0) # set seed for predictable result expect_true(count(sampled3) < 3) + + # Test base::sample is working + expect_equal(length(sample(1:12)), 12) }) test_that("select operators", { @@ -888,6 +895,9 @@ test_that("column functions", { expect_equal(result, list(list(3L, 2L, 1L), list(6L, 5L, 4L))) result <- collect(select(df, sort_array(df[[1]])))[[1]] expect_equal(result, list(list(1L, 2L, 3L), list(4L, 5L, 6L))) + + # Test that stats::lag is working + expect_equal(length(lag(ldeaths, 12)), 72) }) # test_that("column binary mathfunctions", { @@ -1365,6 +1375,9 @@ test_that("describe() and summarize() on a DataFrame", { stats2 <- summary(df) expect_equal(collect(stats2)[4, "name"], "Andy") expect_equal(collect(stats2)[5, "age"], "30") + + # Test base::summary is working + expect_equal(length(summary(attenu, digits = 4)), 35) }) test_that("dropna() and na.omit() on a DataFrame", { From 7bbaf7b08e43b923bb05e9b52bffa1d304a33481 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Fri, 13 Nov 2015 21:23:30 -0800 Subject: [PATCH 02/10] Add tests for bind, intersect, cov --- R/pkg/inst/tests/test_sparkSQL.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index eef028e1cf8a..dd2716677d5b 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1294,6 +1294,12 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", { expect_is(unioned, "DataFrame") expect_equal(count(intersected), 1) expect_equal(first(intersected)$name, "Andy") + + # Test base::rbind is working + expect_equal(length(rbind(1:4, c = 2, a = 10, 10, deparse.level = 0)), 16) + + # Test base::intersect is working + expect_equal(length(intersect(1:20, 3:23)), 18) }) test_that("withColumn() and withColumnRenamed()", { @@ -1523,6 +1529,9 @@ test_that("cov() and corr() on a DataFrame", { expect_true(abs(result - 1.0) < 1e-12) result <- corr(df, "singles", "doubles", "pearson") expect_true(abs(result - 1.0) < 1e-12) + + # Test stats::cov is working + expected_true(abs(max(cov(swiss)) - 1739.295) < 1e-3) }) test_that("freqItems() on a DataFrame", { From 4bb3b66377694fcf9d64dfaee0b17bc9ba67431a Mon Sep 17 00:00:00 2001 From: felixcheung Date: Fri, 13 Nov 2015 21:37:05 -0800 Subject: [PATCH 03/10] Add test for subset, filter, na.omit --- R/pkg/inst/tests/test_sparkSQL.R | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index dd2716677d5b..08c58b30f238 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -760,6 +760,9 @@ test_that("subsetting", { df6 <- subset(df, df$age %in% c(30), c(1,2)) expect_equal(count(df6), 1) expect_equal(columns(df6), c("name", "age")) + + # Test base::subset is working + expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68) }) test_that("selectExpr() on a DataFrame", { @@ -1096,7 +1099,7 @@ test_that("group by, agg functions", { gd3_local <- collect(agg(gd3, var(df8$age))) expect_equal(162, gd3_local[gd3_local$name == "Justin",][1, 2]) - # make sure base:: or stats::sd, var are working + # Test stats::sd, stats::var are working expect_true(abs(sd(1:2) - 0.7071068) < 1e-6) expect_true(abs(var(1:5, 1:5) - 2.5) < 1e-6) @@ -1148,6 +1151,9 @@ test_that("filter() on a DataFrame", { expect_equal(count(filtered5), 1) filtered6 <- where(df, df$age %in% c(19, 30)) expect_equal(count(filtered6), 2) + + # Test stats::filter is working + expect_true(is.ts(filter(1:100, rep(1, 3)))) }) test_that("join() and merge() on a DataFrame", { @@ -1467,6 +1473,9 @@ test_that("dropna() and na.omit() on a DataFrame", { expect_identical(expected, actual) actual <- collect(na.omit(df, minNonNulls = 3, cols = c("name", "age", "height"))) expect_identical(expected, actual) + + # Test stats::na.omit is working + expect_equal(nrow(na.omit(data.frame(x = c(0, 10, NA)))), 2) }) test_that("fillna() on a DataFrame", { From 5071af9bac7bbdf86fc25ed5090c5ff95bf1a5f7 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Sat, 14 Nov 2015 21:21:22 -0800 Subject: [PATCH 04/10] update tests --- R/pkg/inst/tests/test_sparkSQL.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 08c58b30f238..b93bf75e4f25 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -435,8 +435,8 @@ test_that("table() returns a new DataFrame", { dropTempTable(sqlContext, "table1") # Test base::table is working - a <- letters[1:3] - expect_equal(class(table(a, sample(a))), "table") + #a <- letters[1:3] + #expect_equal(class(table(a, sample(a))), "table") }) test_that("toRDD() returns an RRDD", { @@ -679,7 +679,7 @@ test_that("sample on a DataFrame", { expect_true(count(sampled3) < 3) # Test base::sample is working - expect_equal(length(sample(1:12)), 12) + #expect_equal(length(sample(1:12)), 12) }) test_that("select operators", { @@ -762,7 +762,7 @@ test_that("subsetting", { expect_equal(columns(df6), c("name", "age")) # Test base::subset is working - expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68) + #expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68) }) test_that("selectExpr() on a DataFrame", { @@ -1153,7 +1153,7 @@ test_that("filter() on a DataFrame", { expect_equal(count(filtered6), 2) # Test stats::filter is working - expect_true(is.ts(filter(1:100, rep(1, 3)))) + #expect_true(is.ts(filter(1:100, rep(1, 3)))) }) test_that("join() and merge() on a DataFrame", { @@ -1540,7 +1540,7 @@ test_that("cov() and corr() on a DataFrame", { expect_true(abs(result - 1.0) < 1e-12) # Test stats::cov is working - expected_true(abs(max(cov(swiss)) - 1739.295) < 1e-3) + #expect_true(abs(max(cov(swiss)) - 1739.295) < 1e-3) }) test_that("freqItems() on a DataFrame", { From c598a6dd1bb9ff7ed6c07e56d7e1a4b9bd0b2938 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 17 Nov 2015 16:33:33 -0800 Subject: [PATCH 05/10] doc update --- docs/sparkr.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/docs/sparkr.md b/docs/sparkr.md index a744b76be746..1ede0de255d4 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -351,3 +351,46 @@ summary(model) ##Sepal_Width 0.404655 {% endhighlight %} + +# R Function Name Conflicts + +When loading and attaching a new package in R, it is possible to have a name [conflict](https://stat.ethz.ch/R-manual/R-devel/library/base/html/library.html), where a +function is masking another function. + +The following functions are masked by the SparkR package: + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Masked functionHow to Access
cov in package:stats
stats::cov(x, y = NULL, use = "everything",
+           method = c("pearson", "kendall", "spearman"))
filter in package:stats
stats::filter(x, filter, method = c("convolution", "recursive"),
+              sides = 2, circular = FALSE, init)
lag in package:stats
stats::lag(x, ...)
sample in package:basebase::sample(x, size, replace = FALSE, prob = NULL)
subset in package:basebase::subset(x, subset, select, drop = FALSE, ...)
table in package:base
base::table(...,
+            exclude = if (useNA == "no") c(NA, NaN),
+            useNA = c("no", "ifany", "always"),
+            dnn = list.names(...), deparse.level = 1)
+ +You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-manual/R-devel/library/base/html/search.html) + From 7311cb5cb3ff1224419b14984c905585f7e03485 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 17 Nov 2015 19:32:46 -0800 Subject: [PATCH 06/10] fix lag and subset --- R/pkg/R/functions.R | 2 +- R/pkg/R/generics.R | 4 ++-- R/pkg/inst/tests/test_sparkSQL.R | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index ff0f438045c1..25a1f2210149 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2204,7 +2204,7 @@ setMethod("denseRank", #' @export #' @examples \dontrun{lag(df$c)} setMethod("lag", - signature(x = "characterOrColumn", offset = "numeric", defaultValue = "ANY"), + signature(x = "characterOrColumn"), function(x, offset, defaultValue = NULL) { col <- if (class(x) == "Column") { x@jc diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 0dcd05438222..71004a05ba61 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -539,7 +539,7 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) # @rdname subset # @export -setGeneric("subset", function(x, subset, select, ...) { standardGeneric("subset") }) +setGeneric("subset", function(x, ...) { standardGeneric("subset") }) #' @rdname agg #' @export @@ -790,7 +790,7 @@ setGeneric("kurtosis", function(x) { standardGeneric("kurtosis") }) #' @rdname lag #' @export -setGeneric("lag", function(x, offset, defaultValue = NULL) { standardGeneric("lag") }) +setGeneric("lag", function(x, ...) { standardGeneric("lag") }) #' @rdname last #' @export diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index b93bf75e4f25..3f4f319fe745 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -762,7 +762,7 @@ test_that("subsetting", { expect_equal(columns(df6), c("name", "age")) # Test base::subset is working - #expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68) + expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68) }) test_that("selectExpr() on a DataFrame", { From 2d81394f726f180d47618bdd92b42b427b77e1be Mon Sep 17 00:00:00 2001 From: felixcheung Date: Tue, 17 Nov 2015 23:14:02 -0800 Subject: [PATCH 07/10] fix filter --- R/pkg/R/DataFrame.R | 14 +++++++------- R/pkg/R/generics.R | 6 +++++- R/pkg/inst/tests/test_sparkSQL.R | 2 +- docs/sparkr.md | 13 ------------- 4 files changed, 13 insertions(+), 22 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 34177e3cdd94..7a4e54fade36 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1441,7 +1441,7 @@ setMethod("orderBy", #' Filter the rows of a DataFrame according to a given condition. #' #' @param x A DataFrame to be sorted. -#' @param condition The condition to filter on. This may either be a Column expression +#' @param filter The condition to filter on. This may either be a Column expression #' or a string containing a SQL statement #' @return A DataFrame containing only the rows that meet the condition. #' @family DataFrame functions @@ -1459,12 +1459,12 @@ setMethod("orderBy", #' filter(df, df$col2 != "abcdefg") #' } setMethod("filter", - signature(x = "DataFrame", condition = "characterOrColumn"), - function(x, condition) { - if (class(condition) == "Column") { - condition <- condition@jc + signature(x = "DataFrame", filter = "characterOrColumn"), + function(x, filter) { + if (class(filter) == "Column") { + filter <- filter@jc } - sdf <- callJMethod(x@sdf, "filter", condition) + sdf <- callJMethod(x@sdf, "filter", filter) dataFrame(sdf) }) @@ -2152,7 +2152,7 @@ setMethod("with", }) #' Returns the column types of a DataFrame. -#' +#' #' @name coltypes #' @title Get column types of a DataFrame #' @family dataframe_funcs diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 71004a05ba61..cfcdfb5a9277 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -433,7 +433,11 @@ setGeneric("fillna", function(x, value, cols = NULL) { standardGeneric("fillna") #' @rdname filter #' @export -setGeneric("filter", function(x, condition) { standardGeneric("filter") }) +setGeneric("filter", + function(x, filter, method = c("convolution", "recursive"), sides = 2, + circular = FALSE, init) { + standardGeneric("filter") + }) #' @rdname groupBy #' @export diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 3f4f319fe745..98a1c31749f8 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1153,7 +1153,7 @@ test_that("filter() on a DataFrame", { expect_equal(count(filtered6), 2) # Test stats::filter is working - #expect_true(is.ts(filter(1:100, rep(1, 3)))) + expect_true(is.ts(filter(1:100, rep(1, 3)))) }) test_that("join() and merge() on a DataFrame", { diff --git a/docs/sparkr.md b/docs/sparkr.md index 1ede0de255d4..e04d2e51826d 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -366,23 +366,10 @@ The following functions are masked by the SparkR package:
stats::cov(x, y = NULL, use = "everything",
            method = c("pearson", "kendall", "spearman"))
- - filter in package:stats -
stats::filter(x, filter, method = c("convolution", "recursive"),
-              sides = 2, circular = FALSE, init)
- - - lag in package:stats -
stats::lag(x, ...)
- sample in package:base base::sample(x, size, replace = FALSE, prob = NULL) - - subset in package:base - base::subset(x, subset, select, drop = FALSE, ...) - table in package:base
base::table(...,

From 70c806a9d222b5e939a931966799b949ae5fe89d Mon Sep 17 00:00:00 2001
From: felixcheung 
Date: Wed, 18 Nov 2015 21:04:32 -0800
Subject: [PATCH 08/10] revert filter, add test for predict

---
 R/pkg/R/DataFrame.R           | 12 ++++++------
 R/pkg/R/generics.R            |  6 +-----
 R/pkg/inst/tests/test_mllib.R |  5 +++++
 docs/sparkr.md                |  7 ++++++-
 4 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 7a4e54fade36..06b0108b1389 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1441,7 +1441,7 @@ setMethod("orderBy",
 #' Filter the rows of a DataFrame according to a given condition.
 #'
 #' @param x A DataFrame to be sorted.
-#' @param filter The condition to filter on. This may either be a Column expression
+#' @param condition The condition to filter on. This may either be a Column expression
 #' or a string containing a SQL statement
 #' @return A DataFrame containing only the rows that meet the condition.
 #' @family DataFrame functions
@@ -1459,12 +1459,12 @@ setMethod("orderBy",
 #' filter(df, df$col2 != "abcdefg")
 #' }
 setMethod("filter",
-          signature(x = "DataFrame", filter = "characterOrColumn"),
-          function(x, filter) {
-            if (class(filter) == "Column") {
-              filter <- filter@jc
+          signature(x = "DataFrame", condition = "characterOrColumn"),
+          function(x, condition) {
+            if (class(condition) == "Column") {
+              condition <- condition@jc
             }
-            sdf <- callJMethod(x@sdf, "filter", filter)
+            sdf <- callJMethod(x@sdf, "filter", condition)
             dataFrame(sdf)
           })
 
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index cfcdfb5a9277..71004a05ba61 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -433,11 +433,7 @@ setGeneric("fillna", function(x, value, cols = NULL) { standardGeneric("fillna")
 
 #' @rdname filter
 #' @export
-setGeneric("filter",
-           function(x, filter, method = c("convolution", "recursive"), sides = 2,
-            circular = FALSE, init) {
-              standardGeneric("filter")
-            })
+setGeneric("filter", function(x, condition) { standardGeneric("filter") })
 
 #' @rdname groupBy
 #' @export
diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
index d497ad8c9daa..023925956f84 100644
--- a/R/pkg/inst/tests/test_mllib.R
+++ b/R/pkg/inst/tests/test_mllib.R
@@ -31,6 +31,11 @@ test_that("glm and predict", {
   model <- glm(Sepal_Width ~ Sepal_Length, training, family = "gaussian")
   prediction <- predict(model, test)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+
+  # Test stats::predict is working
+  x <- rnorm(15)
+  y <- x + rnorm(15)
+  expect_equal(15, predict(lm(y ~ x)))
 })
 
 test_that("glm should work with long formula", {
diff --git a/docs/sparkr.md b/docs/sparkr.md
index e04d2e51826d..cfb9b41350f4 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -286,7 +286,7 @@ head(teenagers)
 
 # Machine Learning
 
-SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'. 
+SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', ':', '+', and '-'.
 
 The [summary()](api/R/summary.html) function gives the summary of a model produced by [glm()](api/R/glm.html).
 
@@ -366,6 +366,11 @@ The following functions are masked by the SparkR package:
     
stats::cov(x, y = NULL, use = "everything",
            method = c("pearson", "kendall", "spearman"))
+ + filter in package:stats +
stats::filter(x, filter, method = c("convolution", "recursive"),
+              sides = 2, circular = FALSE, init)
+ sample in package:base base::sample(x, size, replace = FALSE, prob = NULL) From dd3f83c6fd7b04c676bbf4ff56692f6567a6d723 Mon Sep 17 00:00:00 2001 From: felixcheung Date: Wed, 18 Nov 2015 22:05:54 -0800 Subject: [PATCH 09/10] fix busted test --- R/pkg/inst/tests/test_mllib.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R index 023925956f84..e0667e5e22c1 100644 --- a/R/pkg/inst/tests/test_mllib.R +++ b/R/pkg/inst/tests/test_mllib.R @@ -35,7 +35,7 @@ test_that("glm and predict", { # Test stats::predict is working x <- rnorm(15) y <- x + rnorm(15) - expect_equal(15, predict(lm(y ~ x))) + expect_equal(length(predict(lm(y ~ x))), 15) }) test_that("glm should work with long formula", { From 71f3be732408310d537a01daa75627704585d4bd Mon Sep 17 00:00:00 2001 From: felixcheung Date: Wed, 18 Nov 2015 22:39:54 -0800 Subject: [PATCH 10/10] disable filter test --- R/pkg/inst/tests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 98a1c31749f8..3f4f319fe745 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1153,7 +1153,7 @@ test_that("filter() on a DataFrame", { expect_equal(count(filtered6), 2) # Test stats::filter is working - expect_true(is.ts(filter(1:100, rep(1, 3)))) + #expect_true(is.ts(filter(1:100, rep(1, 3)))) }) test_that("join() and merge() on a DataFrame", {