
Commit 0d841e2

Merge remote-tracking branch 'upstream/master' into filestreamsource-option
Conflicts: sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala
2 parents: 2771d71 + 095862a

212 files changed: +5157 -2198 lines


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@
 /lib/
 R-unit-tests.log
 R/unit-tests.out
+R/cran-check.out
 build/*.jar
 build/apache-maven*
 build/scala*

.travis.yml

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ notifications:
 # 5. Run maven install before running lint-java.
 install:
   - export MAVEN_SKIP_RC=1
-  - build/mvn -T 4 -q -DskipTests -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install
+  - build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install

 # 6. Run lint-java.
 script:

LICENSE

Lines changed: 1 addition & 1 deletion

@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.1 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.10.3 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ exportMethods("glm",
               "summary",
               "spark.kmeans",
               "fitted",
+              "spark.mlp",
               "spark.naiveBayes",
               "spark.survreg",
               "spark.lda",

R/pkg/R/DataFrame.R

Lines changed: 2 additions & 2 deletions

@@ -212,9 +212,9 @@ setMethod("showDF",

 #' show
 #'
-#' Print the SparkDataFrame column names and types
+#' Print class and type information of a Spark object.
 #'
-#' @param object a SparkDataFrame.
+#' @param object a Spark object. Can be a SparkDataFrame, Column, GroupedData, WindowSpec.
 #'
 #' @family SparkDataFrame functions
 #' @rdname show
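Since `show` is now documented for several Spark object types, a minimal usage sketch may help; the calls below are illustrative assumptions based on the broadened @param text, not part of this diff:

df <- createDataFrame(mtcars)
show(df)                        # SparkDataFrame: class, column names and types
show(df$mpg)                    # Column
show(groupBy(df, "am"))         # GroupedData
show(windowPartitionBy("am"))   # WindowSpec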

R/pkg/R/WindowSpec.R

Lines changed: 12 additions & 0 deletions

@@ -203,6 +203,18 @@ setMethod("rangeBetween",
 #' @aliases over,Column,WindowSpec-method
 #' @family colum_func
 #' @export
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#'
+#' # Partition by am (transmission) and order by hp (horsepower)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#'
+#' # Rank on hp within each partition
+#' out <- select(df, over(rank(), ws), df$hp, df$am)
+#'
+#' # Lead mpg values by 1 row on the partition-and-ordered table
+#' out <- select(df, over(lead(df$mpg), ws), df$mpg, df$hp, df$am)
+#' }
 #' @note over since 2.0.0
 setMethod("over",
           signature(x = "Column", window = "WindowSpec"),

R/pkg/R/functions.R

Lines changed: 60 additions & 18 deletions

@@ -3121,9 +3121,9 @@ setMethod("ifelse",
 #' @aliases cume_dist,missing-method
 #' @export
 #' @examples \dontrun{
-#' df <- createDataFrame(iris)
-#' ws <- orderBy(windowPartitionBy("Species"), "Sepal_Length")
-#' out <- select(df, over(cume_dist(), ws), df$Sepal_Length, df$Species)
+#' df <- createDataFrame(mtcars)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#' out <- select(df, over(cume_dist(), ws), df$hp, df$am)
 #' }
 #' @note cume_dist since 1.6.0
 setMethod("cume_dist",
@@ -3148,7 +3148,11 @@ setMethod("cume_dist",
 #' @family window_funcs
 #' @aliases dense_rank,missing-method
 #' @export
-#' @examples \dontrun{dense_rank()}
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#' out <- select(df, over(dense_rank(), ws), df$hp, df$am)
+#' }
 #' @note dense_rank since 1.6.0
 setMethod("dense_rank",
           signature("missing"),
@@ -3168,18 +3172,26 @@ setMethod("dense_rank",
 #' @param x the column as a character string or a Column to compute on.
 #' @param offset the number of rows back from the current row from which to obtain a value.
 #'               If not specified, the default is 1.
-#' @param defaultValue default to use when the offset row does not exist.
+#' @param defaultValue (optional) default to use when the offset row does not exist.
 #' @param ... further arguments to be passed to or from other methods.
 #' @rdname lag
 #' @name lag
 #' @aliases lag,characterOrColumn-method
 #' @family window_funcs
 #' @export
-#' @examples \dontrun{lag(df$c)}
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#'
+#' # Partition by am (transmission) and order by hp (horsepower)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#'
+#' # Lag mpg values by 1 row on the partition-and-ordered table
+#' out <- select(df, over(lag(df$mpg), ws), df$mpg, df$hp, df$am)
+#' }
 #' @note lag since 1.6.0
 setMethod("lag",
           signature(x = "characterOrColumn"),
-          function(x, offset, defaultValue = NULL) {
+          function(x, offset = 1, defaultValue = NULL) {
             col <- if (class(x) == "Column") {
               x@jc
             } else {
@@ -3194,25 +3206,35 @@ setMethod("lag",
 #' lead
 #'
 #' Window function: returns the value that is \code{offset} rows after the current row, and
-#' NULL if there is less than \code{offset} rows after the current row. For example,
-#' an \code{offset} of one will return the next row at any given point in the window partition.
+#' \code{defaultValue} if there is less than \code{offset} rows after the current row.
+#' For example, an \code{offset} of one will return the next row at any given point
+#' in the window partition.
 #'
 #' This is equivalent to the \code{LEAD} function in SQL.
 #'
-#' @param x Column to compute on
-#' @param offset Number of rows to offset
-#' @param defaultValue (Optional) default value to use
+#' @param x the column as a character string or a Column to compute on.
+#' @param offset the number of rows after the current row from which to obtain a value.
+#'               If not specified, the default is 1.
+#' @param defaultValue (optional) default to use when the offset row does not exist.
 #'
 #' @rdname lead
 #' @name lead
 #' @family window_funcs
 #' @aliases lead,characterOrColumn,numeric-method
 #' @export
-#' @examples \dontrun{lead(df$c)}
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#'
+#' # Partition by am (transmission) and order by hp (horsepower)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#'
+#' # Lead mpg values by 1 row on the partition-and-ordered table
+#' out <- select(df, over(lead(df$mpg), ws), df$mpg, df$hp, df$am)
+#' }
 #' @note lead since 1.6.0
 setMethod("lead",
           signature(x = "characterOrColumn", offset = "numeric", defaultValue = "ANY"),
-          function(x, offset, defaultValue = NULL) {
+          function(x, offset = 1, defaultValue = NULL) {
             col <- if (class(x) == "Column") {
               x@jc
             } else {
@@ -3239,7 +3261,15 @@ setMethod("lead",
 #' @aliases ntile,numeric-method
 #' @family window_funcs
 #' @export
-#' @examples \dontrun{ntile(1)}
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#'
+#' # Partition by am (transmission) and order by hp (horsepower)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#'
+#' # Get ntile group id (1-4) for hp
+#' out <- select(df, over(ntile(4), ws), df$hp, df$am)
+#' }
 #' @note ntile since 1.6.0
 setMethod("ntile",
           signature(x = "numeric"),
@@ -3263,7 +3293,11 @@ setMethod("ntile",
 #' @family window_funcs
 #' @aliases percent_rank,missing-method
 #' @export
-#' @examples \dontrun{percent_rank()}
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#' out <- select(df, over(percent_rank(), ws), df$hp, df$am)
+#' }
 #' @note percent_rank since 1.6.0
 setMethod("percent_rank",
           signature("missing"),
@@ -3288,7 +3322,11 @@ setMethod("percent_rank",
 #' @family window_funcs
 #' @aliases rank,missing-method
 #' @export
-#' @examples \dontrun{rank()}
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#' out <- select(df, over(rank(), ws), df$hp, df$am)
+#' }
 #' @note rank since 1.6.0
 setMethod("rank",
           signature(x = "missing"),
@@ -3321,7 +3359,11 @@ setMethod("rank",
 #' @aliases row_number,missing-method
 #' @family window_funcs
 #' @export
-#' @examples \dontrun{row_number()}
+#' @examples \dontrun{
+#' df <- createDataFrame(mtcars)
+#' ws <- orderBy(windowPartitionBy("am"), "hp")
+#' out <- select(df, over(row_number(), ws), df$hp, df$am)
+#' }
 #' @note row_number since 1.6.0
 setMethod("row_number",
           signature("missing"),

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions

@@ -1330,6 +1330,10 @@ setGeneric("spark.kmeans", function(data, formula, ...) { standardGeneric("spark
 #' @export
setGeneric("fitted")

+#' @rdname spark.mlp
+#' @export
+setGeneric("spark.mlp", function(data, ...) { standardGeneric("spark.mlp") })
+
 #' @rdname spark.naiveBayes
 #' @export
 setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("spark.naiveBayes") })

R/pkg/R/install.R

Lines changed: 44 additions & 20 deletions

@@ -70,9 +70,9 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,
                           localDir = NULL, overwrite = FALSE) {
   version <- paste0("spark-", packageVersion("SparkR"))
   hadoopVersion <- tolower(hadoopVersion)
-  hadoopVersionName <- hadoop_version_name(hadoopVersion)
+  hadoopVersionName <- hadoopVersionName(hadoopVersion)
   packageName <- paste(version, "bin", hadoopVersionName, sep = "-")
-  localDir <- ifelse(is.null(localDir), spark_cache_path(),
+  localDir <- ifelse(is.null(localDir), sparkCachePath(),
                      normalizePath(localDir, mustWork = FALSE))

   if (is.na(file.info(localDir)$isdir)) {
@@ -88,12 +88,14 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,

   # can use dir.exists(packageLocalDir) under R 3.2.0 or later
   if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) {
-    fmt <- "Spark %s for Hadoop %s is found, and SPARK_HOME set to %s"
+    fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s"
     msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion),
                    packageLocalDir)
     message(msg)
     Sys.setenv(SPARK_HOME = packageLocalDir)
     return(invisible(packageLocalDir))
+  } else {
+    message("Spark not found in the cache directory. Installation will start.")
   }

   packageLocalPath <- paste0(packageLocalDir, ".tgz")
@@ -102,7 +104,7 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,
   if (tarExists && !overwrite) {
     message("tar file found.")
   } else {
-    robust_download_tar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath)
+    robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath)
   }

   message(sprintf("Installing to %s", localDir))
@@ -116,33 +118,37 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,
   invisible(packageLocalDir)
 }

-robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) {
+robustDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) {
   # step 1: use user-provided url
   if (!is.null(mirrorUrl)) {
     msg <- sprintf("Use user-provided mirror site: %s.", mirrorUrl)
     message(msg)
-    success <- direct_download_tar(mirrorUrl, version, hadoopVersion,
+    success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
                                  packageName, packageLocalPath)
-    if (success) return()
+    if (success) {
+      return()
+    } else {
+      message(paste0("Unable to download from mirrorUrl: ", mirrorUrl))
+    }
   } else {
-    message("Mirror site not provided.")
+    message("MirrorUrl not provided.")
   }

   # step 2: use url suggested from apache website
-  message("Looking for site suggested from apache website...")
-  mirrorUrl <- get_preferred_mirror(version, packageName)
+  message("Looking for preferred site from apache website...")
+  mirrorUrl <- getPreferredMirror(version, packageName)
   if (!is.null(mirrorUrl)) {
-    success <- direct_download_tar(mirrorUrl, version, hadoopVersion,
+    success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
                                  packageName, packageLocalPath)
     if (success) return()
   } else {
-    message("Unable to find suggested mirror site.")
+    message("Unable to find preferred mirror site.")
   }

   # step 3: use backup option
   message("To use backup site...")
-  mirrorUrl <- default_mirror_url()
-  success <- direct_download_tar(mirrorUrl, version, hadoopVersion,
+  mirrorUrl <- defaultMirrorUrl()
+  success <- directDownloadTar(mirrorUrl, version, hadoopVersion,
                                packageName, packageLocalPath)
   if (success) {
     return(packageLocalPath)
@@ -155,7 +161,7 @@ robust_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName,
   }
 }

-get_preferred_mirror <- function(version, packageName) {
+getPreferredMirror <- function(version, packageName) {
   jsonUrl <- paste0("http://www.apache.org/dyn/closer.cgi?path=",
                     file.path("spark", version, packageName),
                     ".tgz&as_json=1")
@@ -175,10 +181,10 @@ get_preferred_mirror <- function(version, packageName) {
   mirrorPreferred
 }

-direct_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) {
+directDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) {
   packageRemotePath <- paste0(
     file.path(mirrorUrl, version, packageName), ".tgz")
-  fmt <- paste("Downloading Spark %s for Hadoop %s from:\n- %s")
+  fmt <- "Downloading %s for Hadoop %s from:\n- %s"
   msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion),
                  packageRemotePath)
   message(msg)
@@ -192,11 +198,11 @@ direct_download_tar <- function(mirrorUrl, version, hadoopVersion, packageName,
   !isFail
 }

-default_mirror_url <- function() {
+defaultMirrorUrl <- function() {
   "http://www-us.apache.org/dist/spark"
 }

-hadoop_version_name <- function(hadoopVersion) {
+hadoopVersionName <- function(hadoopVersion) {
   if (hadoopVersion == "without") {
     "without-hadoop"
   } else if (grepl("^[0-9]+\\.[0-9]+$", hadoopVersion, perl = TRUE)) {
@@ -208,7 +214,7 @@ hadoop_version_name <- function(hadoopVersion) {

 # The implementation refers to appdirs package: https://pypi.python.org/pypi/appdirs and
 # adapt to Spark context
-spark_cache_path <- function() {
+sparkCachePath <- function() {
   if (.Platform$OS.type == "windows") {
     winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
     if (is.na(winAppPath)) {
@@ -231,3 +237,21 @@ spark_cache_path <- function() {
   }
   normalizePath(path, mustWork = FALSE)
 }
+
+
+installInstruction <- function(mode) {
+  if (mode == "remote") {
+    paste0("Connecting to a remote Spark master. ",
+           "Please make sure Spark package is also installed in this machine.\n",
+           "- If there is one, set the path in sparkHome parameter or ",
+           "environment variable SPARK_HOME.\n",
+           "- If not, you may run install.spark function to do the job. ",
+           "Please make sure the Spark and the Hadoop versions ",
+           "match the versions on the cluster. ",
+           "SparkR package is compatible with Spark ", packageVersion("SparkR"), ".",
+           "If you need further help, ",
+           "contact the administrators of the cluster.")
+  } else {
+    stop(paste0("No instruction found for ", mode, " mode."))
+  }
+}
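For context, a minimal sketch of driving the installer with the defaults visible in the hunk headers (`hadoopVersion = "2.7"`, cache directory resolved by `sparkCachePath`); the mirror URL shown is an illustrative assumption:

library(SparkR)
# default: Hadoop 2.7 build, cached under sparkCachePath()
install.spark()
# or pin a mirror, which robustDownloadTar() tries first
install.spark(mirrorUrl = "http://archive.apache.org/dist/spark", overwrite = TRUE)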
