[SPARK-26015][K8S] Set a default UID for Spark on K8S Images
Adds USER directives to the Dockerfiles; the UID is configurable via a build
argument for easy customisation. A -u flag is added to
bin/docker-image-tool.sh to make setting it easy.
rvesse committed Nov 29, 2018
commit 26697fc545ba816dedc789b186a26c9d8636f4e6
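With this change a custom UID can be supplied at image build time. A hypothetical invocation (repository and tag are placeholders):

    # Build the Spark images with the main process running as UID 1000
    ./bin/docker-image-tool.sh -r myrepo -t mytag -u 1000 build

If -u is omitted, the images fall back to the default UID of 185 baked into the Dockerfiles via the spark_uid build argument.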
15 changes: 12 additions & 3 deletions bin/docker-image-tool.sh
@@ -132,6 +132,11 @@ function build {
SPARK_ROOT="$CTX_DIR/base"
fi

# If a custom SPARK_UID was set add it to build arguments
if [ -n "$SPARK_UID" ]; then
BUILD_ARGS+=(--build-arg spark_uid=$SPARK_UID)
fi

# Verify that the Docker image content directory is present
if [ ! -d "$SPARK_ROOT/kubernetes/dockerfiles" ]; then
error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark."
@@ -207,8 +212,10 @@ Options:
-t tag Tag to apply to the built image, or to identify the image to be pushed.
-m Use minikube's Docker daemon.
-n Build docker image with --no-cache
-u uid UID to use in the USER directive to set the user the main Spark process runs as inside the
resulting container
-b arg Build arg to build or push the image. For multiple build args, this option needs to
be used separately for each build arg.
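Based on the help text above, -u is a convenience wrapper; the same effect should be achievable with the generic -b option, e.g.:

    ./bin/docker-image-tool.sh -r myrepo -t mytag -b spark_uid=1000 build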

Using minikube when building images will do so directly into minikube's Docker daemon.
There is no need to push the images into minikube in that case, they'll be automatically
@@ -243,7 +250,8 @@ PYDOCKERFILE=
RDOCKERFILE=
NOCACHEARG=
BUILD_PARAMS=
SPARK_UID=
while getopts f:p:R:mr:t:nb:u: option
do
case "${option}"
in
@@ -263,6 +271,7 @@ do
fi
eval $(minikube docker-env)
;;
u) SPARK_UID=${OPTARG};;
esac
done

resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile
@@ -17,6 +17,8 @@

FROM openjdk:8-alpine

ARG spark_uid=185

# Before building the docker image, first build and make a Spark distribution following
# the instructions in http://spark.apache.org/docs/latest/building-spark.html.
# If this docker file is being used in the context of building your images from a Spark
@@ -47,5 +49,9 @@ COPY data /opt/spark/data
ENV SPARK_HOME /opt/spark

WORKDIR /opt/spark/work-dir
RUN chmod g+w /opt/spark/work-dir

ENTRYPOINT [ "/opt/entrypoint.sh" ]

# Specify the User that the actual main process will run as
USER ${spark_uid}
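Since spark_uid is an ARG consumed by the USER directive, the default of 185 can also be overridden when calling docker build directly from an unpacked distribution (image name is a placeholder):

    docker build --build-arg spark_uid=1000 \
      -t my-spark:dev \
      -f kubernetes/dockerfiles/spark/Dockerfile .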
resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/R/Dockerfile
@@ -16,8 +16,14 @@
#

ARG base_img
ARG spark_uid=185

FROM $base_img
WORKDIR /

# Reset to root to run installation tasks
USER 0

RUN mkdir ${SPARK_HOME}/R

RUN apk add --no-cache R R-dev
@@ -27,3 +33,6 @@ ENV R_HOME /usr/lib/R

WORKDIR /opt/spark/work-dir
ENTRYPOINT [ "/opt/entrypoint.sh" ]

# Specify the User that the actual main process will run as
USER ${spark_uid}
resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile
@@ -16,8 +16,14 @@
#

ARG base_img
ARG spark_uid=185

FROM $base_img
WORKDIR /

# Reset to root to run installation tasks
USER 0

RUN mkdir ${SPARK_HOME}/python
# TODO: Investigate running both pip and pip3 via virtualenvs
RUN apk add --no-cache python && \
@@ -37,3 +43,6 @@ ENV PYTHONPATH ${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4

WORKDIR /opt/spark/work-dir
ENTRYPOINT [ "/opt/entrypoint.sh" ]

# Specify the User that the actual main process will run as
USER ${spark_uid}
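A quick way to verify that the USER directive took effect is to run id in place of the normal entrypoint (image name is a placeholder):

    # Expect uid=185 unless spark_uid was overridden at build time
    docker run --rm --entrypoint id my-spark-py:dev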
resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
@@ -30,6 +30,10 @@ set -e
# If there is no passwd entry for the container UID, attempt to create one
if [ -z "$uidentry" ] ; then
if [ -w /etc/passwd ] ; then
# TODO Should we allow providing an environment variable to set the desired username?
# SPARK_USER_NAME is the obvious candidate here, but we only propagate this to the
# pods when using Hadoop, so we'd need to move that to a feature step that
# always runs e.g. Basic(Driver|Executor)FeatureStep
echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd
else
echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID"
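For context, the check above depends on variables set near the top of entrypoint.sh; a sketch of that preamble (not part of this diff):

    # Determine the runtime UID/GID and look up any existing passwd entry
    myuid=$(id -u)
    mygid=$(id -g)
    uidentry=$(getent passwd $myuid)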