Commit 5f8e70d

Merge remote-tracking branch 'upstream/master' into native-ddl

Conflicts:
    sql/hive/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala

2 parents: 0eb1687 + 07f92ef

104 files changed (+2235, -1025 lines)


.github/PULL_REQUEST_TEMPLATE

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 (Please fill in changes proposed in this fix)
 
 
-## How was the this patch tested?
+## How was this patch tested?
 
 (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests)
 

R/pkg/NAMESPACE

Lines changed: 3 additions & 1 deletion
@@ -13,7 +13,9 @@ export("print.jobj")
 # MLlib integration
 exportMethods("glm",
               "predict",
-              "summary")
+              "summary",
+              "kmeans",
+              "fitted")
 
 # Job group lifecycle management methods
 export("setJobGroup",

R/pkg/R/generics.R

Lines changed: 8 additions & 0 deletions
@@ -1160,3 +1160,11 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") })
 #' @rdname rbind
 #' @export
 setGeneric("rbind", signature = "...")
+
+#' @rdname kmeans
+#' @export
+setGeneric("kmeans")
+
+#' @rdname fitted
+#' @export
+setGeneric("fitted")

R/pkg/R/mllib.R

Lines changed: 70 additions & 4 deletions
@@ -104,11 +104,11 @@ setMethod("predict", signature(object = "PipelineModel"),
 setMethod("summary", signature(object = "PipelineModel"),
           function(object, ...) {
             modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                    "getModelName", object@model)
+                                     "getModelName", object@model)
             features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                   "getModelFeatures", object@model)
+                                    "getModelFeatures", object@model)
             coefficients <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
-                                       "getModelCoefficients", object@model)
+                                        "getModelCoefficients", object@model)
             if (modelName == "LinearRegressionModel") {
               devianceResiduals <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
                                                "getModelDevianceResiduals", object@model)
@@ -119,10 +119,76 @@ setMethod("summary", signature(object = "PipelineModel"),
               colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
               rownames(coefficients) <- unlist(features)
               return(list(devianceResiduals = devianceResiduals, coefficients = coefficients))
-            } else {
+            } else if (modelName == "LogisticRegressionModel") {
               coefficients <- as.matrix(unlist(coefficients))
               colnames(coefficients) <- c("Estimate")
               rownames(coefficients) <- unlist(features)
               return(list(coefficients = coefficients))
+            } else if (modelName == "KMeansModel") {
+              modelSize <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                       "getKMeansModelSize", object@model)
+              cluster <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                     "getKMeansCluster", object@model, "classes")
+              k <- unlist(modelSize)[1]
+              size <- unlist(modelSize)[-1]
+              coefficients <- t(matrix(coefficients, ncol = k))
+              colnames(coefficients) <- unlist(features)
+              rownames(coefficients) <- 1:k
+              return(list(coefficients = coefficients, size = size, cluster = dataFrame(cluster)))
+            } else {
+              stop(paste("Unsupported model", modelName, sep = " "))
+            }
+          })
+
+#' Fit a k-means model
+#'
+#' Fit a k-means model, similarly to R's kmeans().
+#'
+#' @param x DataFrame for training
+#' @param centers Number of centers
+#' @param iter.max Maximum iteration number
+#' @param algorithm Algorithm choosen to fit the model
+#' @return A fitted k-means model
+#' @rdname kmeans
+#' @export
+#' @examples
+#'\dontrun{
+#' model <- kmeans(x, centers = 2, algorithm="random")
+#'}
+setMethod("kmeans", signature(x = "DataFrame"),
+          function(x, centers, iter.max = 10, algorithm = c("random", "k-means||")) {
+            columnNames <- as.array(colnames(x))
+            algorithm <- match.arg(algorithm)
+            model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers", "fitKMeans", x@sdf,
+                                 algorithm, iter.max, centers, columnNames)
+            return(new("PipelineModel", model = model))
+          })
+
+#' Get fitted result from a model
+#'
+#' Get fitted result from a model, similarly to R's fitted().
+#'
+#' @param object A fitted MLlib model
+#' @return DataFrame containing fitted values
+#' @rdname fitted
+#' @export
+#' @examples
+#'\dontrun{
+#' model <- kmeans(trainingData, 2)
+#' fitted.model <- fitted(model)
+#' showDF(fitted.model)
+#'}
+setMethod("fitted", signature(object = "PipelineModel"),
+          function(object, method = c("centers", "classes"), ...) {
+            modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                     "getModelName", object@model)
+
+            if (modelName == "KMeansModel") {
+              method <- match.arg(method)
+              fittedResult <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
+                                          "getKMeansCluster", object@model, method)
+              return(dataFrame(fittedResult))
+            } else {
+              stop(paste("Unsupported model", modelName, sep = " "))
             }
           })
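The SparkR kmeans() wrapper above goes through the JVM backend (SparkRWrappers.fitKMeans) rather than calling spark.ml directly from R. As a rough illustration only, the sketch below shows a roughly equivalent flow on the Scala side using the public spark.ml KMeans estimator; the input data, the VectorAssembler step, and the spark-shell sqlContext are assumptions for this sketch, not part of this commit.

// Illustrative only: a rough Scala-side analogue of the SparkR kmeans()/fitted()
// wrappers above, written against the public spark.ml API. The input DataFrame,
// its column names, and the spark-shell `sqlContext` are assumptions.
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.VectorAssembler

val training = sqlContext.createDataFrame(Seq(
  (5.1, 3.5), (4.9, 3.0), (6.2, 3.4), (5.9, 3.0)
)).toDF("sepalLength", "sepalWidth")

// The R wrapper passes the DataFrame's column names; here they are assembled
// explicitly into a single "features" vector column.
val assembler = new VectorAssembler()
  .setInputCols(training.columns)
  .setOutputCol("features")

val kmeans = new KMeans()
  .setK(2)                // centers = 2 in the R API
  .setMaxIter(10)         // iter.max = 10
  .setInitMode("random")  // algorithm = "random"

val model = kmeans.fit(assembler.transform(training))

// transform() appends an integer "prediction" column with the cluster
// assignment, which is what fitted()/predict() expose back to R.
model.transform(assembler.transform(training)).select("prediction").show()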

R/pkg/inst/tests/testthat/test_mllib.R

Lines changed: 28 additions & 0 deletions
@@ -113,3 +113,31 @@ test_that("summary works on base GLM models", {
   baseSummary <- summary(baseModel)
   expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
 })
+
+test_that("kmeans", {
+  newIris <- iris
+  newIris$Species <- NULL
+  training <- suppressWarnings(createDataFrame(sqlContext, newIris))
+
+  # Cache the DataFrame here to work around the bug SPARK-13178.
+  cache(training)
+  take(training, 1)
+
+  model <- kmeans(x = training, centers = 2)
+  sample <- take(select(predict(model, training), "prediction"), 1)
+  expect_equal(typeof(sample$prediction), "integer")
+  expect_equal(sample$prediction, 1)
+
+  # Test stats::kmeans is working
+  statsModel <- kmeans(x = newIris, centers = 2)
+  expect_equal(sort(unique(statsModel$cluster)), c(1, 2))
+
+  # Test fitted works on KMeans
+  fitted.model <- fitted(model)
+  expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))
+
+  # Test summary works on KMeans
+  summary.model <- summary(model)
+  cluster <- summary.model$cluster
+  expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
+})

core/src/main/resources/org/apache/spark/ui/static/historypage-template.html

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@
       <tbody>
       {{#applications}}
         <tr>
-          <td class="rowGroupColumn"><a href="/history/{{id}}/{{num}}/jobs/">{{id}}</a></td>
+          <td class="rowGroupColumn"><span title="{{id}}"><a href="/history/{{id}}/{{num}}/jobs/">{{id}}</a></span></td>
          <td class="rowGroupColumn">{{name}}</td>
          {{#attempts}}
          <td class="attemptIDSpan"><a href="/history/{{id}}/{{attemptId}}/jobs/">{{attemptId}}</a></td>

core/src/main/resources/org/apache/spark/ui/static/historypage.js

Lines changed: 32 additions & 1 deletion
@@ -37,6 +37,22 @@ function formatDuration(milliseconds) {
   return hours.toFixed(1) + " h";
 }
 
+function makeIdNumeric(id) {
+  var strs = id.split("_");
+  if (strs.length < 3) {
+    return id;
+  }
+  var appSeqNum = strs[2];
+  var resl = strs[0] + "_" + strs[1] + "_";
+  var diff = 10 - appSeqNum.length;
+  while (diff > 0) {
+    resl += "0"; // padding 0 before the app sequence number to make sure it has 10 characters
+    diff--;
+  }
+  resl += appSeqNum;
+  return resl;
+}
+
 function formatDate(date) {
   return date.split(".")[0].replace("T", " ");
 }
@@ -62,6 +78,21 @@ jQuery.extend( jQuery.fn.dataTableExt.oSort, {
   }
 } );
 
+jQuery.extend( jQuery.fn.dataTableExt.oSort, {
+    "appid-numeric-pre": function ( a ) {
+        var x = a.match(/title="*(-?[0-9a-zA-Z\-\_]+)/)[1];
+        return makeIdNumeric(x);
+    },
+
+    "appid-numeric-asc": function ( a, b ) {
+        return ((a < b) ? -1 : ((a > b) ? 1 : 0));
+    },
+
+    "appid-numeric-desc": function ( a, b ) {
+        return ((a < b) ? 1 : ((a > b) ? -1 : 0));
+    }
+} );
+
 $(document).ajaxStop($.unblockUI);
 $(document).ajaxStart(function(){
   $.blockUI({ message: '<h3>Loading history summary...</h3>'});
@@ -109,7 +140,7 @@ $(document).ready(function() {
     var selector = "#history-summary-table";
     var conf = {
       "columns": [
-        {name: 'first'},
+        {name: 'first', type: "appid-numeric"},
        {name: 'second'},
        {name: 'third'},
        {name: 'fourth'},
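For reference, the appid-numeric sorter above keys on the zero-padded ID produced by makeIdNumeric(). The sketch below restates that normalization in Scala (a hypothetical helper, not part of this commit), assuming YARN-style IDs of the form application_<clusterTimestamp>_<sequenceNumber>.

// Hypothetical Scala restatement of makeIdNumeric() above: left-pad the
// application sequence number to 10 characters so plain string comparison
// orders "..._7" before "..._12".
def makeIdNumeric(id: String): String = {
  val parts = id.split("_")
  if (parts.length < 3) {
    id
  } else {
    val appSeqNum = parts(2)
    // "0" * n yields the empty string for n <= 0, so long sequence numbers pass through.
    parts(0) + "_" + parts(1) + "_" + ("0" * (10 - appSeqNum.length)) + appSeqNum
  }
}

val ids = Seq("application_1455555555555_12", "application_1455555555555_7")
println(ids.sortBy(makeIdNumeric))  // List(application_..._7, application_..._12)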

core/src/main/scala/org/apache/spark/SparkConf.scala

Lines changed: 25 additions & 0 deletions
@@ -503,6 +503,31 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
         set("spark.executor.instances", value)
       }
     }
+
+    if (contains("spark.master") && get("spark.master").startsWith("yarn-")) {
+      val warning = s"spark.master ${get("spark.master")} is deprecated in Spark 2.0+, please " +
+        "instead use \"yarn\" with specified deploy mode."
+
+      get("spark.master") match {
+        case "yarn-cluster" =>
+          logWarning(warning)
+          set("spark.master", "yarn")
+          set("spark.submit.deployMode", "cluster")
+        case "yarn-client" =>
+          logWarning(warning)
+          set("spark.master", "yarn")
+          set("spark.submit.deployMode", "client")
+        case _ => // Any other unexpected master will be checked when creating scheduler backend.
+      }
+    }
+
+    if (contains("spark.submit.deployMode")) {
+      get("spark.submit.deployMode") match {
+        case "cluster" | "client" =>
+        case e => throw new SparkException("spark.submit.deployMode can only be \"cluster\" or " +
+          "\"client\".")
+      }
+    }
   }
 
   /**
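In user terms, the check added above rewrites the legacy yarn-client / yarn-cluster master URLs into master "yarn" plus an explicit spark.submit.deployMode, and rejects any other deploy-mode value. A minimal sketch of that translation from the application side (illustrative only; the property names mirror the code above):

// Illustrative only: how a legacy conf maps onto the rewritten form, assuming
// the translation added above runs when the conf is validated at startup.
import org.apache.spark.SparkConf

// Legacy style: the deploy mode is folded into the master URL.
val legacy = new SparkConf()
  .setMaster("yarn-cluster")
  .setAppName("example")

// Equivalent form after the rewrite (logged with a deprecation warning):
val current = new SparkConf()
  .setMaster("yarn")
  .setAppName("example")
  .set("spark.submit.deployMode", "cluster")

// Any value other than "cluster" or "client" for spark.submit.deployMode is
// rejected with a SparkException by the second check above.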

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 12 additions & 18 deletions
@@ -237,6 +237,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   def jars: Seq[String] = _jars
   def files: Seq[String] = _files
   def master: String = _conf.get("spark.master")
+  def deployMode: String = _conf.getOption("spark.submit.deployMode").getOrElse("client")
   def appName: String = _conf.get("spark.app.name")
 
   private[spark] def isEventLogEnabled: Boolean = _conf.getBoolean("spark.eventLog.enabled", false)
@@ -375,10 +376,8 @@
     }
 
     // System property spark.yarn.app.id must be set if user code ran by AM on a YARN cluster
-    // yarn-standalone is deprecated, but still supported
-    if ((master == "yarn-cluster" || master == "yarn-standalone") &&
-        !_conf.contains("spark.yarn.app.id")) {
-      throw new SparkException("Detected yarn-cluster mode, but isn't running on a cluster. " +
+    if (master == "yarn" && deployMode == "cluster" && !_conf.contains("spark.yarn.app.id")) {
+      throw new SparkException("Detected yarn cluster mode, but isn't running on a cluster. " +
         "Deployment to YARN is not supported directly by SparkContext. Please use spark-submit.")
     }
 
@@ -414,7 +413,7 @@
       }
     }
 
-    if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")
+    if (master == "yarn" && deployMode == "client") System.setProperty("SPARK_YARN_MODE", "true")
 
     // "_jobProgressListener" should be set up before creating SparkEnv because when creating
     // "SparkEnv", some messages will be posted to "listenerBus" and we should not miss them.
@@ -491,7 +490,7 @@
       HeartbeatReceiver.ENDPOINT_NAME, new HeartbeatReceiver(this))
 
     // Create and start the scheduler
-    val (sched, ts) = SparkContext.createTaskScheduler(this, master)
+    val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode)
     _schedulerBackend = sched
     _taskScheduler = ts
     _dagScheduler = new DAGScheduler(this)
@@ -1590,10 +1589,8 @@
         key = uri.getScheme match {
           // A JAR file which exists only on the driver node
           case null | "file" =>
-            // yarn-standalone is deprecated, but still supported
-            if (SparkHadoopUtil.get.isYarnMode() &&
-                (master == "yarn-standalone" || master == "yarn-cluster")) {
-              // In order for this to work in yarn-cluster mode the user must specify the
+            if (master == "yarn" && deployMode == "cluster") {
+              // In order for this to work in yarn cluster mode the user must specify the
               // --addJars option to the client to upload the file into the distributed cache
               // of the AM to make it show up in the current working directory.
               val fileName = new Path(uri.getPath).getName()
@@ -2319,7 +2316,8 @@
    */
   private def createTaskScheduler(
       sc: SparkContext,
-      master: String): (SchedulerBackend, TaskScheduler) = {
+      master: String,
+      deployMode: String): (SchedulerBackend, TaskScheduler) = {
     import SparkMasterRegex._
 
     // When running locally, don't try to re-execute tasks on failure.
@@ -2381,11 +2379,7 @@
        }
        (backend, scheduler)
 
-      case "yarn-standalone" | "yarn-cluster" =>
-        if (master == "yarn-standalone") {
-          logWarning(
-            "\"yarn-standalone\" is deprecated as of Spark 1.0. Use \"yarn-cluster\" instead.")
-        }
+      case "yarn" if deployMode == "cluster" =>
        val scheduler = try {
          val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
          val cons = clazz.getConstructor(classOf[SparkContext])
@@ -2410,7 +2404,7 @@
        scheduler.initialize(backend)
        (backend, scheduler)
 
-      case "yarn-client" =>
+      case "yarn" if deployMode == "client" =>
        val scheduler = try {
          val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnScheduler")
          val cons = clazz.getConstructor(classOf[SparkContext])
@@ -2451,7 +2445,7 @@
      case zkUrl if zkUrl.startsWith("zk://") =>
        logWarning("Master URL for a multi-master Mesos cluster managed by ZooKeeper should be " +
          "in the form mesos://zk://host:port. Current Master URL will stop working in Spark 2.0.")
-        createTaskScheduler(sc, "mesos://" + zkUrl)
+        createTaskScheduler(sc, "mesos://" + zkUrl, deployMode)
 
      case _ =>
        throw new SparkException("Could not parse Master URL: '" + master + "'")
