25 commits
fb5b9ed
initial architecture for PySpark w/o dockerfile work
ifilonenko Apr 16, 2018
b7b3db0
included entrypoint logic
ifilonenko Apr 17, 2018
98cef8c
satisfying integration tests
ifilonenko Apr 18, 2018
dc670dc
end-to-end working pyspark
ifilonenko Apr 18, 2018
eabe4b9
Merge pull request #1 from ifilonenko/py-spark
ifilonenko Apr 18, 2018
8d3debb
resolved comments and fixed --pyfiles issue and allowed for python2 o…
ifilonenko May 2, 2018
91e2a2c
Merge pull request #2 from ifilonenko/py-spark
ifilonenko May 2, 2018
5761ee8
Merge branch 'master' of https://github.com/ifilonenko/spark
ifilonenko May 2, 2018
98cc044
restructured step based pattern to resolve comments
ifilonenko May 7, 2018
678d381
Merge pull request #3 from ifilonenko/py-spark
ifilonenko May 7, 2018
bf738dc
resolved comments
ifilonenko May 8, 2018
c59068d
Merge pull request #4 from ifilonenko/py-spark
ifilonenko May 8, 2018
0344f90
resolving style issues
ifilonenko May 9, 2018
306f3ed
Merge pull request #5 from ifilonenko/py-spark
ifilonenko May 9, 2018
f2fc53e
resolved commits
ifilonenko May 13, 2018
6f66d60
merge conflicts
ifilonenko May 13, 2018
914ff75
resolve rebase conflicts
ifilonenko May 11, 2018
d400607
import statements refactoring
ifilonenko May 13, 2018
72953a3
Merge branch 'py-spark'
ifilonenko May 13, 2018
7bedeb6
resolved comments
ifilonenko May 31, 2018
1801e96
merge conflicts
ifilonenko May 31, 2018
24a704e
setIfMissing
ifilonenko Jun 1, 2018
6a6d69d
added e2e tests on --py-files and inclusion of docs on config values
ifilonenko Jun 7, 2018
ab92913
style issues
ifilonenko Jun 7, 2018
a61d897
resolve comments on docs and addition of unit test
ifilonenko Jun 8, 2018
23 changes: 17 additions & 6 deletions bin/docker-image-tool.sh
@@ -63,12 +63,20 @@ function build {
if [ ! -d "$IMG_PATH" ]; then
error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark."
fi

local DOCKERFILE=${DOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
local BINDING_BUILD_ARGS=(
--build-arg
base_img=$(image_ref spark)
)
local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
local PYDOCKERFILE=${PYDOCKERFILE:-"$IMG_PATH/spark/bindings/python/Dockerfile"}

docker build "${BUILD_ARGS[@]}" \
-t $(image_ref spark) \
-f "$DOCKERFILE" .
-f "$BASEDOCKERFILE" .

docker build "${BINDING_BUILD_ARGS[@]}" \
-t $(image_ref spark-py) \
-f "$PYDOCKERFILE" .
}

function push {
@@ -86,7 +94,8 @@ Commands:
push Push a pre-built image to a registry. Requires a repository address to be provided.

Options:
-f file Dockerfile to build. By default builds the Dockerfile shipped with Spark.
-f file Dockerfile to build for JVM based Jobs. By default builds the Dockerfile shipped with Spark.
-p file Dockerfile with Python baked in. By default builds the Dockerfile shipped with Spark.
Contributor:
One (future concern) is how we would handle the overlay with both Python and R at the same time.

-r repo Repository address.
-t tag Tag to apply to the built image, or to identify the image to be pushed.
-m Use minikube's Docker daemon.
@@ -116,12 +125,14 @@ fi

REPO=
TAG=
DOCKERFILE=
BASEDOCKERFILE=
PYDOCKERFILE=
while getopts f:mr:t: option
do
case "${option}"
in
f) DOCKERFILE=${OPTARG};;
f) BASEDOCKERFILE=${OPTARG};;
p) PYDOCKERFILE=${OPTARG};;
r) REPO=${OPTARG};;
t) TAG=${OPTARG};;
m)
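For reference, a usage sketch of the updated script. The repository, tag, and Python Dockerfile path are placeholders, and this assumes -p is accepted by the option parser (the getopts spec shown above still reads f:mr:t:):

# Builds both the JVM image (<repo>/spark) and the Python-binding image
# (<repo>/spark-py) in one invocation; names and paths are illustrative.
./bin/docker-image-tool.sh \
  -r docker.io/myrepo \
  -t my-tag \
  -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile \
  build
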
14 changes: 10 additions & 4 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -285,8 +285,6 @@ private[spark] class SparkSubmit extends Logging {
case (STANDALONE, CLUSTER) if args.isR =>
error("Cluster deploy mode is currently not supported for R " +
"applications on standalone clusters.")
case (KUBERNETES, _) if args.isPython =>
error("Python applications are currently not supported for Kubernetes.")
case (KUBERNETES, _) if args.isR =>
error("R applications are currently not supported for Kubernetes.")
case (LOCAL, CLUSTER) =>
@@ -694,9 +692,17 @@ private[spark] class SparkSubmit extends Logging {
if (isKubernetesCluster) {
childMainClass = KUBERNETES_CLUSTER_SUBMIT_CLASS
if (args.primaryResource != SparkLauncher.NO_RESOURCE) {
childArgs ++= Array("--primary-java-resource", args.primaryResource)
if (args.isPython) {
Contributor:
This logic appears to be duplicated from YARN; would it make sense to factor this out into a common function?

Contributor:
We chatted about this offline, and while it's close it's not exactly the same, so we can deal with minor parts of duplication for now.

childArgs ++= Array("--primary-py-file", args.primaryResource)
childArgs ++= Array("--main-class", "org.apache.spark.deploy.PythonRunner")
if (args.pyFiles != null) {
childArgs ++= Array("--other-py-files", args.pyFiles)
}
} else {
childArgs ++= Array("--primary-java-resource", args.primaryResource)
childArgs ++= Array("--main-class", args.mainClass)
}
}
childArgs ++= Array("--main-class", args.mainClass)
if (args.childArgs != null) {
args.childArgs.foreach { arg =>
childArgs += ("--arg", arg)
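As an illustration of the branch added above: with a .py primary resource, SparkSubmit now passes --primary-py-file, forces the main class to org.apache.spark.deploy.PythonRunner, and forwards --py-files as --other-py-files, while JAR submissions keep the --primary-java-resource path. A hedged cluster-mode example (the master URL, image name, and file paths are placeholders):

./bin/spark-submit \
  --master k8s://https://<api-server>:6443 \
  --deploy-mode cluster \
  --name my-pyspark-app \
  --conf spark.kubernetes.container.image=docker.io/myrepo/spark-py:my-tag \
  --py-files local:///opt/spark/deps/helpers.py \
  local:///opt/spark/app/main.py
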
16 changes: 16 additions & 0 deletions docs/running-on-kubernetes.md
@@ -624,4 +624,20 @@ specific to Spark on Kubernetes.
<code>spark.kubernetes.executor.secrets.ENV_VAR=spark-secret:key</code>.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.memoryOverheadFactor</code></td>
<td><code>0.1</code></td>
<td>
This sets the Memory Overhead Factor that will allocate memory to non-JVM jobs which in the case of JVM tasks will default to 0.10 and 0.40 for non-JVM jobs.
Contributor:
I think we can maybe improve this documentation a little bit. It's not so much how much memory is set aside for non-JVM jobs, it's how much memory is set aside for non-JVM memory, including off-heap allocations, non-JVM jobs (like Python or R), and system processes.

This is done as non-JVM tasks need more non-JVM heap space and such tasks commonly fail with "Memory Overhead Exceeded" errors. This preempts this error with
a higher default.
</td>
</tr>
<tr>
<td><code>spark.kubernetes.pyspark.pythonversion</code></td>
<td><code>"2"</code></td>
<td>
This sets the major Python version of the docker image used to run the driver and executor containers. Can either be 2 or 3.
</td>
</tr>
</table>
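A rough worked example for the first property, assuming the overhead is computed as max(factor × driver memory, 384 MiB) as it is for other resource managers: a 4g driver with the non-JVM default factor of 0.4 would get roughly max(0.4 × 4096, 384) = 1638 MiB of overhead added to its pod memory request. Both properties can be overridden at submission time; the values below are illustrative:

# Illustrative overrides; remaining submission arguments omitted.
./bin/spark-submit \
  --conf spark.kubernetes.memoryOverheadFactor=0.2 \
  --conf spark.kubernetes.pyspark.pythonversion=3 \
  <other arguments>
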
32 changes: 32 additions & 0 deletions examples/src/main/python/py_container_checks.py
@@ -0,0 +1,32 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys


def version_check(python_env, major_python_version):
"""
These are various tests to test the Python container image.
This file will be distributed via --py-files in the e2e tests.
"""
env_version = os.environ.get('PYSPARK_PYTHON')
print("Python runtime version check is: " +
str(sys.version_info[0] == major_python_version))

print("Python environment version check is: " +
str(env_version == python_env))
38 changes: 38 additions & 0 deletions examples/src/main/python/pyfiles.py
@@ -0,0 +1,38 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

import sys

from pyspark.sql import SparkSession


if __name__ == "__main__":
"""
Usage: pyfiles [major_python_version]
"""
spark = SparkSession \
.builder \
.appName("PyFilesTest") \
.getOrCreate()

from py_container_checks import version_check
# Begin of Python container checks
version_check(sys.argv[1], 2 if sys.argv[1] == "python" else 3)

spark.stop()
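These two example files are meant to be used together in the end-to-end tests: py_container_checks.py is shipped via --py-files and imported by pyfiles.py, whose single argument names the interpreter the container is expected to run ("python" for 2, anything else, e.g. "python3", for 3). A hedged invocation against a Python 3 image (image name, master URL, and in-image paths are assumptions):

./bin/spark-submit \
  --master k8s://https://<api-server>:6443 \
  --deploy-mode cluster \
  --conf spark.kubernetes.container.image=docker.io/myrepo/spark-py:my-tag \
  --conf spark.kubernetes.pyspark.pythonversion=3 \
  --py-files local:///opt/spark/examples/src/main/python/py_container_checks.py \
  local:///opt/spark/examples/src/main/python/pyfiles.py python3
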
@@ -117,6 +117,28 @@ private[spark] object Config extends Logging {
.stringConf
.createWithDefault("spark")

val KUBERNETES_PYSPARK_PY_FILES =
ConfigBuilder("spark.kubernetes.python.pyFiles")
.doc("The PyFiles that are distributed via client arguments")
.internal()
.stringConf
.createOptional

val KUBERNETES_PYSPARK_MAIN_APP_RESOURCE =
ConfigBuilder("spark.kubernetes.python.mainAppResource")
.doc("The main app resource for pyspark jobs")
.internal()
.stringConf
.createOptional

val KUBERNETES_PYSPARK_APP_ARGS =
ConfigBuilder("spark.kubernetes.python.appArgs")
.doc("The app arguments for PySpark Jobs")
.internal()
.stringConf
.createOptional


val KUBERNETES_ALLOCATION_BATCH_SIZE =
ConfigBuilder("spark.kubernetes.allocation.batch.size")
.doc("Number of pods to launch at once in each round of executor allocation.")
@@ -154,6 +176,24 @@ private[spark] object Config extends Logging {
.checkValue(interval => interval > 0, s"Logging interval must be a positive time value.")
.createWithDefaultString("1s")

val MEMORY_OVERHEAD_FACTOR =
ConfigBuilder("spark.kubernetes.memoryOverheadFactor")
.doc("This sets the Memory Overhead Factor that will allocate memory to non-JVM jobs " +
"which in the case of JVM tasks will default to 0.10 and 0.40 for non-JVM jobs")
Contributor:
+1 to this, thanks for adding this.

.doubleConf
.checkValue(mem_overhead => mem_overhead >= 0 && mem_overhead < 1,
"Ensure that memory overhead is a double between 0 --> 1.0")
.createWithDefault(0.1)

val PYSPARK_MAJOR_PYTHON_VERSION =
ConfigBuilder("spark.kubernetes.pyspark.pythonversion")
Member:
Sorry for leaving a comment on an ancient PR, but I couldn't hold it. Why did we add a configuration to control the Python version instead of using the existing PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON?

Doing this in a configuration breaks or disables many things, for example PEX (https://medium.com/criteo-labs/packaging-code-with-pex-a-pyspark-example-9057f9f144f3), which requires setting PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON manually.

Member (@HyukjinKwon, Dec 10, 2020):
cc @dongjoon-hyun too, FYI. Conda / virtualenv support enabled by #30486 wouldn't work in Kubernetes because of this.

Contributor:
@HyukjinKwon sounds reasonable to include support for that; we just need to agree on a policy for which takes precedence.

.doc("This sets the major Python version. Either 2 or 3. (Python2 or Python3)")
.stringConf
.checkValue(pv => List("2", "3").contains(pv),
"Ensure that major Python version is either Python2 or Python3")
.createWithDefault("2")
Comment:
Am I reading this right that the default is Python 2? Is there a reason for that? Thanks!

Contributor (Author):
No particular reason. I just thought that the major version should default to 2.

Comment:
There is only ~18 months of support left for Python 2. Python 3 has been around for 10 years and, unless there's a good reason, I think it should be the default.

Contributor (Author):
I am willing to do that: thoughts @holdenk?

Contributor:
I'm fine with either as the default. While Py2 is officially EOL, I think we'll still see PySpark Py2 apps for a while after.

val KUBERNETES_AUTH_SUBMISSION_CONF_PREFIX =
"spark.kubernetes.authenticate.submission"

@@ -71,9 +71,14 @@ private[spark] object Constants {
val SPARK_CONF_FILE_NAME = "spark.properties"
val SPARK_CONF_PATH = s"$SPARK_CONF_DIR_INTERNAL/$SPARK_CONF_FILE_NAME"

// BINDINGS
val ENV_PYSPARK_PRIMARY = "PYSPARK_PRIMARY"
val ENV_PYSPARK_FILES = "PYSPARK_FILES"
val ENV_PYSPARK_ARGS = "PYSPARK_APP_ARGS"
val ENV_PYSPARK_MAJOR_PYTHON_VERSION = "PYSPARK_MAJOR_PYTHON_VERSION"

// Miscellaneous
val KUBERNETES_MASTER_INTERNAL_URL = "https://kubernetes.default.svc"
val DRIVER_CONTAINER_NAME = "spark-kubernetes-driver"
val MEMORY_OVERHEAD_FACTOR = 0.10
val MEMORY_OVERHEAD_MIN_MIB = 384L
}
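These environment variables are presumably consumed by the Python image's entrypoint, which is not part of this excerpt; the fragment below is only a hypothetical sketch of how a container entrypoint could use them:

# Hypothetical entrypoint fragment (not from this PR): pick the interpreter
# from PYSPARK_MAJOR_PYTHON_VERSION and expose the primary resource and
# its dependencies to the driver command.
if [ -n "$PYSPARK_PRIMARY" ]; then
  if [ "$PYSPARK_MAJOR_PYTHON_VERSION" = "3" ]; then
    export PYSPARK_PYTHON=python3
  else
    export PYSPARK_PYTHON=python
  fi
  # PYSPARK_FILES would feed --py-files, and PYSPARK_APP_ARGS the trailing
  # application arguments.
fi
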
@@ -16,14 +16,17 @@
*/
package org.apache.spark.deploy.k8s

import scala.collection.mutable

import io.fabric8.kubernetes.api.model.{LocalObjectReference, LocalObjectReferenceBuilder, Pod}

import org.apache.spark.SparkConf
import org.apache.spark.deploy.k8s.Config._
import org.apache.spark.deploy.k8s.Constants._
import org.apache.spark.deploy.k8s.submit.{JavaMainAppResource, MainAppResource}
import org.apache.spark.deploy.k8s.submit._
import org.apache.spark.internal.config.ConfigEntry


private[spark] sealed trait KubernetesRoleSpecificConf

/*
@@ -55,7 +58,8 @@ private[spark] case class KubernetesConf[T <: KubernetesRoleSpecificConf](
roleAnnotations: Map[String, String],
roleSecretNamesToMountPaths: Map[String, String],
roleSecretEnvNamesToKeyRefs: Map[String, String],
roleEnvs: Map[String, String]) {
roleEnvs: Map[String, String],
sparkFiles: Seq[String]) {

def namespace(): String = sparkConf.get(KUBERNETES_NAMESPACE)

@@ -64,10 +68,14 @@ private[spark] case class KubernetesConf[T <: KubernetesRoleSpecificConf](
.map(str => str.split(",").toSeq)
.getOrElse(Seq.empty[String])

def sparkFiles(): Seq[String] = sparkConf
.getOption("spark.files")
.map(str => str.split(",").toSeq)
.getOrElse(Seq.empty[String])
def pyFiles(): Option[String] = sparkConf
.get(KUBERNETES_PYSPARK_PY_FILES)

def pySparkMainResource(): Option[String] = sparkConf
Contributor:
This seems redundant with the driver-specific spark conf's MainAppResource. Perhaps remove the need to specify this thing twice?

Contributor (Author):
I need to parse out the MainAppResource, which I thought we should be doing only once; as such, I thought it would be cleaner to do this.

.get(KUBERNETES_PYSPARK_MAIN_APP_RESOURCE)

def pySparkPythonVersion(): String = sparkConf
.get(PYSPARK_MAJOR_PYTHON_VERSION)

def imagePullPolicy(): String = sparkConf.get(CONTAINER_IMAGE_PULL_POLICY)

@@ -102,17 +110,30 @@ private[spark] object KubernetesConf {
appId: String,
mainAppResource: Option[MainAppResource],
mainClass: String,
appArgs: Array[String]): KubernetesConf[KubernetesDriverSpecificConf] = {
appArgs: Array[String],
maybePyFiles: Option[String]): KubernetesConf[KubernetesDriverSpecificConf] = {
val sparkConfWithMainAppJar = sparkConf.clone()
val additionalFiles = mutable.ArrayBuffer.empty[String]
mainAppResource.foreach {
case JavaMainAppResource(res) =>
val previousJars = sparkConf
.getOption("spark.jars")
.map(_.split(","))
.getOrElse(Array.empty)
if (!previousJars.contains(res)) {
sparkConfWithMainAppJar.setJars(previousJars ++ Seq(res))
}
case JavaMainAppResource(res) =>
val previousJars = sparkConf
.getOption("spark.jars")
.map(_.split(","))
.getOrElse(Array.empty)
if (!previousJars.contains(res)) {
sparkConfWithMainAppJar.setJars(previousJars ++ Seq(res))
}
// The function of this outer match is to account for multiple nonJVM
// bindings that will all have increased MEMORY_OVERHEAD_FACTOR to 0.4
case nonJVM: NonJVMResource =>
Contributor:
Why can't we just match PythonMainAppResource immediately here - why the two layers of matching?

Contributor (Author):
Because the R step should have the same default MemoryOverhead, as should all NonJVMResources.

Contributor:
Maybe worth a comment then? Especially since R support isn't integrated right now, it's perhaps not super clear to folks why this is being done.

Contributor:
Maybe add a comment? Since R isn't currently integrated, this could be a bit difficult to infer.

nonJVM match {
case PythonMainAppResource(res) =>
additionalFiles += res
maybePyFiles.foreach{maybePyFiles =>
additionalFiles.appendAll(maybePyFiles.split(","))}
Contributor:
Not for this PR or JIRA, but later maybe we should normalize our parsing of input files in a way that allows escape characters, and share the logic between YARN/K8s/Mesos/standalone. What do y'all think? Possible follow-up JIRA: https://issues.apache.org/jira/browse/SPARK-24184

sparkConfWithMainAppJar.set(KUBERNETES_PYSPARK_MAIN_APP_RESOURCE, res)
}
sparkConfWithMainAppJar.setIfMissing(MEMORY_OVERHEAD_FACTOR, 0.4)
Contributor:
Do we want to set this in the JVM case?

Contributor (Author):
This is set later in BaseDriverStep.

}

val driverCustomLabels = KubernetesUtils.parsePrefixedKeyValuePairs(
Expand All @@ -135,6 +156,11 @@ private[spark] object KubernetesConf {
val driverEnvs = KubernetesUtils.parsePrefixedKeyValuePairs(
sparkConf, KUBERNETES_DRIVER_ENV_PREFIX)

val sparkFiles = sparkConf
.getOption("spark.files")
.map(str => str.split(",").toSeq)
.getOrElse(Seq.empty[String]) ++ additionalFiles

KubernetesConf(
sparkConfWithMainAppJar,
KubernetesDriverSpecificConf(mainAppResource, mainClass, appName, appArgs),
Expand All @@ -144,7 +170,8 @@ private[spark] object KubernetesConf {
driverAnnotations,
driverSecretNamesToMountPaths,
driverSecretEnvNamesToKeyRefs,
driverEnvs)
driverEnvs,
sparkFiles)
}

def createExecutorConf(
@@ -186,6 +213,7 @@ private[spark] object KubernetesConf {
executorAnnotations,
executorMountSecrets,
executorEnvSecrets,
executorEnv)
executorEnv,
Seq.empty[String])
}
}
@@ -52,7 +52,7 @@ private[spark] object KubernetesUtils {
}
}

private def resolveFileUri(uri: String): String = {
def resolveFileUri(uri: String): String = {
val fileUri = Utils.resolveURI(uri)
val fileScheme = Option(fileUri.getScheme).getOrElse("file")
fileScheme match {