Closed
Changes from 1 commit
Commits (32)
ac2d65e
Change spark.local.dir -> SPARK_LOCAL_DIRS
pwendell Mar 31, 2014
0faa3b6
Stash of adding config options in submit script and YARN
pwendell Apr 1, 2014
6eaf7d0
executorJavaOpts
pwendell Apr 1, 2014
4982331
Remove SPARK_LIBRARY_PATH
pwendell Apr 1, 2014
1f75238
SPARK_JAVA_OPTS --> SPARK_MASTER_OPTS for master settings
pwendell Apr 1, 2014
84cc5e5
Small clean-up
pwendell Apr 1, 2014
5b0ba8e
Don't ship executor envs
pwendell Apr 2, 2014
7cc70e4
Clean up terminology inside of spark-env script
pwendell Apr 2, 2014
761ebcd
Library path and classpath for drivers
pwendell Apr 2, 2014
437aed1
Small fix
pwendell Apr 2, 2014
46555c1
Review feedback and import clean-ups
pwendell Apr 13, 2014
b72d183
Review feedback for spark env file
pwendell Apr 13, 2014
ace4ead
Responses to review feedback.
pwendell Apr 13, 2014
b08893b
Additional improvements.
pwendell Apr 13, 2014
afc9ed8
Cleaning up line limits and two compile errors.
pwendell Apr 14, 2014
4ee6f9d
Making YARN doc changes consistent
pwendell Apr 14, 2014
c2a2909
Test compile fixes
pwendell Apr 14, 2014
be42f35
Handle case where SPARK_HOME is not set
pwendell Apr 15, 2014
e83cd8f
Changes to allow re-use of test applications
pwendell Apr 15, 2014
308f1f6
Properly escape quotes and other clean-up for YARN
pwendell Apr 15, 2014
fda0301
Note
pwendell Apr 15, 2014
ffa00fe
Review feedback
pwendell Apr 18, 2014
a762901
Fixing test failures
pwendell Apr 18, 2014
d50c388
Merge remote-tracking branch 'apache/master' into config-cleanup
pwendell Apr 18, 2014
a56b125
Responses to Tom's review
pwendell Apr 18, 2014
af0adf7
Automatically add user jar
pwendell Apr 18, 2014
b16e6a2
Cleanup of spark-submit script and Scala quick start guide
pwendell Apr 20, 2014
af09e3e
Mention config file in docs and clean-up docs
pwendell Apr 21, 2014
0086939
Minor style fixes
pwendell Apr 21, 2014
b4b496c
spark-defaults.properties -> spark-defaults.conf
pwendell Apr 21, 2014
a006464
Moving properties file template.
pwendell Apr 21, 2014
127f301
Improvements to testing
pwendell Apr 21, 2014
Stash of adding config options in submit script and YARN
pwendell committed Apr 13, 2014
commit 0faa3b6ff0e0b37b18fdde3d6a6110459ffd8f28
36 changes: 30 additions & 6 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -17,11 +17,13 @@

package org.apache.spark.deploy

import java.io.{PrintStream, File}
import java.io.{FileInputStream, PrintStream, File}
import java.net.URL
import java.util.Properties

import org.apache.spark.executor.ExecutorURLClassLoader

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.collection.mutable.Map
@@ -108,6 +110,21 @@ object SparkSubmit {
val sysProps = new HashMap[String, String]()
var childMainClass = ""

// Load system properties by default from the file, if present
if (appArgs.verbose) printStream.println(s"Using properties file: ${appArgs.propertiesFile}")
Option(appArgs.propertiesFile).map { filename =>
Contributor: foreach

val file = new File(filename)
getDefaultProperties(file).foreach { case (k, v) =>
if (k.startsWith("spark")) {
sysProps(k) = v
if (appArgs.verbose) printStream.println(s"Adding default property: $k=$v")
}
else {
printWarning(s"Ignoring non-spark config property: $k=$v")
}
}
}
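The "foreach" comment above points out that the result of `Option(...).map { ... }` is discarded, so `foreach` states the intent more clearly. A minimal sketch of the same block with that change (behavior unchanged):

```scala
// Same logic as the block above, using foreach since only the side effects matter.
Option(appArgs.propertiesFile).foreach { filename =>
  val file = new File(filename)
  getDefaultProperties(file).foreach { case (k, v) =>
    if (k.startsWith("spark")) {
      sysProps(k) = v
      if (appArgs.verbose) printStream.println(s"Adding default property: $k=$v")
    } else {
      printWarning(s"Ignoring non-spark config property: $k=$v")
    }
  }
}
```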

if (clusterManager == MESOS && deployOnCluster) {
printErrorAndExit("Mesos does not support running the driver on the cluster")
}
@@ -191,11 +208,11 @@
sysProps: Map[String, String], childMainClass: String, verbose: Boolean = false) {

if (verbose) {
System.err.println(s"Main class:\n$childMainClass")
System.err.println(s"Arguments:\n${childArgs.mkString("\n")}")
System.err.println(s"System properties:\n${sysProps.mkString("\n")}")
System.err.println(s"Classpath elements:\n${childClasspath.mkString("\n")}")
System.err.println("\n")
printStream.println(s"Main class:\n$childMainClass")
printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
printStream.println(s"System properties:\n${sysProps.mkString("\n")}")
printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}")
printStream.println("\n")
}

val loader = new ExecutorURLClassLoader(new Array[URL](0),
@@ -224,6 +241,13 @@ object SparkSubmit {
val url = localJarFile.getAbsoluteFile.toURI.toURL
loader.addURL(url)
}

private def getDefaultProperties(file: File): Seq[(String, String)] = {
val inputStream = new FileInputStream(file)
val properties = new Properties()
properties.load(inputStream)
properties.stringPropertyNames().toSeq.map(k => (k, properties(k)))
}
Contributor: Would be good to add a try catch here (or just throw a nice exception)

}
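Picking up the try/catch suggestion above, a rough sketch of a more defensive getDefaultProperties. The exception type and messages are assumptions for illustration, not the code this PR merged:

```scala
// Hypothetical hardening: fail with a clear message and always close the stream.
// Assumes an added `import org.apache.spark.SparkException`.
private def getDefaultProperties(file: File): Seq[(String, String)] = {
  require(file.exists(), s"Properties file $file does not exist")
  val inputStream = new FileInputStream(file)
  try {
    val properties = new Properties()
    properties.load(inputStream)
    properties.stringPropertyNames().toSeq.map(k => (k, properties.getProperty(k)))
  } catch {
    case e: Exception =>
      throw new SparkException(s"Failed to load properties file $file", e)
  } finally {
    inputStream.close()
  }
}
```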

private[spark] class OptionAssigner(val value: String,
SparkSubmitArguments.scala
@@ -18,6 +18,7 @@
package org.apache.spark.deploy

import scala.collection.mutable.ArrayBuffer
import java.io.File

/**
* Parses and encapsulates arguments from the spark-submit script.
@@ -28,6 +29,7 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
var executorMemory: String = null
var executorCores: String = null
var totalExecutorCores: String = null
var propertiesFile: String = null
var driverMemory: String = null
var driverCores: String = null
var supervise: Boolean = false
@@ -49,6 +51,15 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
if (args.length == 0) printUsageAndExit(-1)
if (primaryResource == null) SparkSubmit.printErrorAndExit("Must specify a primary resource")
if (mainClass == null) SparkSubmit.printErrorAndExit("Must specify a main class with --class")
if (propertiesFile == null) {
val sparkHome = sys.env("SPARK_HOME") // defined via `spark-class`
val sep = File.separator
val defaultPath = s"${sparkHome}${sep}conf${sep}spark-defaults.properties"
val file = new File(defaultPath)
if (file.exists()) {
propertiesFile = file.getAbsolutePath
}
}
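For context, the defaults file read here is a plain java.util.Properties file containing spark.* keys (non-spark keys are ignored with a warning, as shown in the SparkSubmit changes above). The keys below are only illustrative values a user might set, not properties defined by this commit:

```properties
# Hypothetical conf/spark-defaults.properties
spark.master            spark://master:7077
spark.executor.memory   2g
spark.eventLog.enabled  true
```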

override def toString = {
s"""Parsed arguments:
@@ -57,8 +68,9 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
| executorMemory $executorMemory
| executorCores $executorCores
| totalExecutorCores $totalExecutorCores
| propertiesFile $propertiesFile
| driverMemory $driverMemory
| drivercores $driverCores
| driverCores $driverCores
| supervise $supervise
| queue $queue
| numExecutors $numExecutors
@@ -122,6 +134,10 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
driverCores = value
parseOpts(tail)

case ("--properties-file") :: value :: tail =>
Contributor: this isn't being printed in the usage of the spark-submit script.

propertiesFile = value
parseOpts(tail)
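On the usage-text comment above: a hypothetical entry for the spark-submit help output could look like the following; the wording and the surrounding usage-string structure are assumptions, not taken from this PR:

```scala
// Hypothetical fragment to add to the usage message in printUsageAndExit (wording assumed).
val propertiesFileUsage =
  """|  --properties-file FILE       Path to a file from which default Spark properties are read.
     |                               Falls back to conf/spark-defaults.properties when present.
     |""".stripMargin
```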

case ("--supervise") :: tail =>
supervise = true
parseOpts(tail)
ClientBase.scala
@@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.util.{Records, Apps}
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.Utils
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.deploy.ExecutorLauncher
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment


@@ -340,8 +341,19 @@ trait ClientBase extends Logging {
JAVA_OPTS += " -XX:CMSIncrementalDutyCycle=10 "
}

if (env.isDefinedAt("SPARK_JAVA_OPTS")) {
JAVA_OPTS += " " + env("SPARK_JAVA_OPTS")

Contributor: Removing support for this is going to fail too many jobs which are currently run via cron; this is going to make things very messy.

Contributor Author: @mridulm we could add this back to make it backwards-compatible and give a warning. Would that make sense?

Can you give examples of what people are setting in SPARK_JAVA_OPTS? Just curious how people are using it. Also, what does cron have to do with it?

Contributor: Warning the user would be great, just not removing support for it :-)
I don't have my scripts at home, but these are used to specify application-specific defines (which won't start with the 'spark' prefix), etc., currently iirc. There are no other ways to do it right now.

Contributor: cron as in periodically run via oozie or just normal cron.
So not manually triggered, and failure of those jobs won't even be noticed for a while (and only after they have already impacted other things).

Contributor Author: Okay, sounds good. If you have examples of what values this is being used for, that would be helpful (e.g. are they setting GC settings, some application-specific system properties, or what?).

Contributor: application-specific defines, -X* config values, etc.

Contributor Author: Thinking a bit more, we have two options here:

(a) make a backwards-incompatible change here, and people have to rewrite their jobs
(b) continue supporting shipping SPARK_JAVA_OPTS from the driver to the executors for the entire 1.X family of Spark releases (i.e. probably years).

I guess we can do (a), but I might give a loud error message here so that users change this.

Contributor Author: Hm, actually I'm not so sure. The existing behavior is really confusing because it means that if SPARK_JAVA_OPTS is set on the executors and the driver... the behavior is basically undefined. It might be worth it to bite the bullet here rather than continue to support this unpredictable behavior for a long time.

if (args.amClass == classOf[ExecutorLauncher].getName) {
// If we are being launched in client mode, forward the spark-conf options
// onto the executor launcher
for ((k, v) <- sparkConf.getAll) {
JAVA_OPTS += s"-D$k=$v"
}
} else {
// If we are being launched in standalone mode, capture and forward any spark
// system properties (e.g. set by spark-class).
for ((k, v) <- sys.props.filterKeys(_.startsWith("spark"))) {
JAVA_OPTS += s"-D$k=$v"
}
}
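One option floated in the thread above is to keep SPARK_JAVA_OPTS working for a while but warn loudly when it is used. Purely as an illustration of that idea, and not what the PR ultimately did, a backwards-compatibility shim at this point might look like:

```scala
// Hypothetical shim: honor SPARK_JAVA_OPTS if still set, but steer users toward the
// new configuration mechanisms. The warning text is an assumption.
sys.env.get("SPARK_JAVA_OPTS").foreach { opts =>
  logWarning("SPARK_JAVA_OPTS is deprecated; set JVM options via spark-submit " +
    "or a Spark defaults properties file instead.")
  JAVA_OPTS += " " + opts
}
```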

if (!localResources.contains(ClientBase.LOG4J_PROP)) {
ExecutorRunnableUtil.scala
@@ -57,8 +57,10 @@ trait ExecutorRunnableUtil extends Logging {
// Set the JVM memory
val executorMemoryString = executorMemory + "m"
JAVA_OPTS += "-Xms" + executorMemoryString + " -Xmx" + executorMemoryString + " "
if (env.isDefinedAt("SPARK_JAVA_OPTS")) {
JAVA_OPTS += env("SPARK_JAVA_OPTS") + " "

/* Pass on Spark properties to the driver. */
for ((k, v) <- sys.props.filterKeys(_.startsWith("spark"))) {
JAVA_OPTS += s"-D$k=$v"
}

JAVA_OPTS += " -Djava.io.tmpdir=" +
YarnClientClusterScheduler.scala
@@ -29,7 +29,7 @@ import org.apache.spark.util.Utils
*/
private[spark] class YarnClientClusterScheduler(sc: SparkContext, conf: Configuration) extends TaskSchedulerImpl(sc) {

def this(sc: SparkContext) = this(sc, new Configuration())
def this(sc: SparkContext) = this(sc, sc.getConf)
Contributor: Maybe I'm missing something here, but doesn't sc.getConf return a SparkConf, not a Hadoop Configuration?
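If the reviewer's reading is right, sc.getConf returns a SparkConf while this auxiliary constructor expects a Hadoop Configuration, so the new line would not type-check as written. Purely as a sketch of a type-correct alternative (a guess at the intent, not necessarily the fix that was adopted):

```scala
// SparkContext#hadoopConfiguration is a Hadoop Configuration derived from the SparkConf,
// so it matches the parameter type expected by the primary constructor.
def this(sc: SparkContext) = this(sc, sc.hadoopConfiguration)
```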


// By default, rack is unknown
override def getRackForHost(hostPort: String): Option[String] = {
YarnClientSchedulerBackend.scala
@@ -19,7 +19,7 @@ package org.apache.spark.scheduler.cluster

import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState}
import org.apache.spark.{SparkException, Logging, SparkContext}
import org.apache.spark.deploy.yarn.{Client, ClientArguments}
import org.apache.spark.deploy.yarn.{Client, ClientArguments, ExecutorLauncher}
import org.apache.spark.scheduler.TaskSchedulerImpl

import scala.collection.mutable.ArrayBuffer
@@ -54,7 +54,7 @@ private[spark] class YarnClientSchedulerBackend(
"--class", "notused",
"--jar", null,
"--args", hostport,
"--am-class", "org.apache.spark.deploy.yarn.ExecutorLauncher"
"--am-class", classOf[ExecutorLauncher].getName
)

// process any optional arguments, given either as environment variables
ExecutorLauncher.scala
@@ -237,7 +237,7 @@ class ExecutorLauncher(args: ApplicationMasterArguments, conf: Configuration, sp
}

def finishApplicationMaster(status: FinalApplicationStatus) {
logInfo("finish ApplicationMaster with " + status)
logInfo("finish ApplicationEMaster with " + status)
amClient.unregisterApplicationMaster(status, "" /* appMessage */ , "" /* appTrackingUrl */)
}
