diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index fd37990..c1d0c56 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -91,10 +91,12 @@ jobs:
             scala) SUFFIX=ubuntu ;;
           esac
+          BASE_IMAGE_TAG=${{ inputs.spark }}-scala${{ inputs.scala }}-java${{ inputs.java }}-ubuntu
           TAG=scala${{ inputs.scala }}-java${{ inputs.java }}-$SUFFIX
           IMAGE_NAME=spark
           IMAGE_PATH=${{ inputs.spark }}/$TAG
+          BASE_IMAGE_PATH=${{ inputs.spark }}/scala${{ inputs.scala }}-java${{ inputs.java }}-ubuntu
           if [ "${{ inputs.build }}" == "true" ]; then
             # Use the local registry to build and test
             REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
@@ -105,6 +107,7 @@ jobs:
             TEST_REPO=${{ inputs.repository }}
             UNIQUE_IMAGE_TAG=${{ inputs.image-tag }}
           fi
+          BASE_IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$BASE_IMAGE_TAG
           IMAGE_URL=$TEST_REPO/$IMAGE_NAME:$UNIQUE_IMAGE_TAG

           PUBLISH_REPO=${{ inputs.repository }}
@@ -116,8 +119,12 @@ jobs:
           echo "TEST_REPO=${TEST_REPO}" >> $GITHUB_ENV
           # Image name: spark
           echo "IMAGE_NAME=${IMAGE_NAME}" >> $GITHUB_ENV
+          # Base image dockerfile path: 3.3.0/scala2.12-java11-ubuntu
+          echo "BASE_IMAGE_PATH=${BASE_IMAGE_PATH}" >> $GITHUB_ENV
           # Image dockerfile path: 3.3.0/scala2.12-java11-python3-ubuntu
           echo "IMAGE_PATH=${IMAGE_PATH}" >> $GITHUB_ENV
+          # Base image URL: spark:3.3.0-scala2.12-java11-ubuntu
+          echo "BASE_IMAGE_URL=${BASE_IMAGE_URL}" >> $GITHUB_ENV
           # Image URL: ghcr.io/apache/spark-docker/spark:3.3.0-scala2.12-java11-python3-ubuntu
           echo "IMAGE_URL=${IMAGE_URL}" >> $GITHUB_ENV
@@ -132,6 +139,9 @@
           echo "IMAGE_PATH: "${IMAGE_PATH}
           echo "IMAGE_URL: "${IMAGE_URL}
+          echo "BASE_IMAGE_PATH: "${BASE_IMAGE_PATH}
+          echo "BASE_IMAGE_URL: "${BASE_IMAGE_URL}
+
           echo "PUBLISH_REPO:"${PUBLISH_REPO}
           echo "PUBLISH_IMAGE_URL:"${PUBLISH_IMAGE_URL}
@@ -146,10 +156,20 @@
           # This required by local registry
           driver-opts: network=host

+      - name: Build - Build the base image
+        if: ${{ inputs.build }}
+        uses: docker/build-push-action@v3
+        with:
+          context: ${{ env.BASE_IMAGE_PATH }}
+          tags: ${{ env.BASE_IMAGE_URL }}
+          platforms: linux/amd64,linux/arm64
+          push: true
+
       - name: Build - Build and push test image
         if: ${{ inputs.build }}
         uses: docker/build-push-action@v3
         with:
+          build-args: BASE_IMAGE=${{ env.BASE_IMAGE_URL }}
           context: ${{ env.IMAGE_PATH }}
           tags: ${{ env.IMAGE_URL }}
           platforms: linux/amd64,linux/arm64
diff --git a/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile
index 4f62e8d..86337c5 100644
--- a/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile
+++ b/3.4.0/scala2.12-java11-python3-r-ubuntu/Dockerfile
@@ -14,73 +14,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
# -FROM eclipse-temurin:11-jre-focal - -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark +ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu +FROM $BASE_IMAGE RUN set -ex && \ apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ apt install -y python3 python3-pip && \ apt install -y r-base r-base-dev && \ - mkdir -p /opt/spark && \ - mkdir /opt/spark/python && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ rm -rf /var/cache/apt/* && \ rm -rf /var/lib/apt/lists/* -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \ - GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1 - -RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv python/pyspark /opt/spark/python/pyspark/; \ - mv python/lib /opt/spark/python/lib/; \ - mv R /opt/spark/; \ - cd ..; \ - rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ - -ENV SPARK_HOME /opt/spark ENV R_HOME /usr/lib/R - -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh - -ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh deleted file mode 100644 index 4bb1557..0000000 --- a/3.4.0/scala2.12-java11-python3-r-ubuntu/entrypoint.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -if ! [ -z ${PYSPARK_PYTHON+x} ]; then - export PYSPARK_PYTHON -fi -if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then - export PYSPARK_DRIVER_PYTHON -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! [ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... - CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile index 2be0cb4..540805f 100644 --- a/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile +++ b/3.4.0/scala2.12-java11-python3-ubuntu/Dockerfile @@ -14,70 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -FROM eclipse-temurin:11-jre-focal - -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark +ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu +FROM $BASE_IMAGE RUN set -ex && \ apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ apt install -y python3 python3-pip && \ - mkdir -p /opt/spark && \ - mkdir /opt/spark/python && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ rm -rf /var/cache/apt/* && \ rm -rf /var/lib/apt/lists/* - -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \ - GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1 - -RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv python/pyspark /opt/spark/python/pyspark/; \ - mv python/lib /opt/spark/python/lib/; \ - cd ..; \ - rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ - -ENV SPARK_HOME /opt/spark - -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh - -ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh deleted file mode 100644 index 4bb1557..0000000 --- a/3.4.0/scala2.12-java11-python3-ubuntu/entrypoint.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -if ! [ -z ${PYSPARK_PYTHON+x} ]; then - export PYSPARK_PYTHON -fi -if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then - export PYSPARK_DRIVER_PYTHON -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! [ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... - CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile index 22fe82b..c65c2ce 100644 --- a/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile +++ b/3.4.0/scala2.12-java11-r-ubuntu/Dockerfile @@ -14,69 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -FROM eclipse-temurin:11-jre-focal - -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark +ARG BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu +FROM $BASE_IMAGE RUN set -ex && \ apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ apt install -y r-base r-base-dev && \ - mkdir -p /opt/spark && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ rm -rf /var/cache/apt/* && \ rm -rf /var/lib/apt/lists/* -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz.asc \ - GPG_KEY=CC68B3D16FE33A766705160BA7E57908C7A4E1B1 - -RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv R /opt/spark/; \ - cd ..; \ - rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ - -ENV SPARK_HOME /opt/spark ENV R_HOME /usr/lib/R - -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh - -ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh deleted file mode 100644 index 159d539..0000000 --- a/3.4.0/scala2.12-java11-r-ubuntu/entrypoint.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! [ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... 
- CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.4.0/scala2.12-java11-ubuntu/Dockerfile b/3.4.0/scala2.12-java11-ubuntu/Dockerfile index 4e3df64..997b8d3 100644 --- a/3.4.0/scala2.12-java11-ubuntu/Dockerfile +++ b/3.4.0/scala2.12-java11-ubuntu/Dockerfile @@ -26,6 +26,7 @@ RUN set -ex && \ ln -s /lib /lib64 && \ apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ mkdir -p /opt/spark && \ + mkdir /opt/spark/python && \ mkdir -p /opt/spark/examples && \ mkdir -p /opt/spark/work-dir && \ touch /opt/spark/RELEASE && \ @@ -64,6 +65,9 @@ RUN set -ex; \ mv examples /opt/spark/; \ mv kubernetes/tests /opt/spark/; \ mv data /opt/spark/; \ + mv python/pyspark /opt/spark/python/pyspark/; \ + mv python/lib /opt/spark/python/lib/; \ + mv R /opt/spark/; \ cd ..; \ rm -rf "$SPARK_TMP"; diff --git a/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh b/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh index 159d539..4bb1557 100644 --- a/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh +++ b/3.4.0/scala2.12-java11-ubuntu/entrypoint.sh @@ -45,6 +45,13 @@ if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" fi +if ! [ -z ${PYSPARK_PYTHON+x} ]; then + export PYSPARK_PYTHON +fi +if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then + export PYSPARK_DRIVER_PYTHON +fi + # If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. # It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. 
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then diff --git a/Dockerfile.template b/Dockerfile.template index 4819cb2..5fe4f25 100644 --- a/Dockerfile.template +++ b/Dockerfile.template @@ -25,16 +25,8 @@ RUN set -ex && \ apt-get update && \ ln -s /lib /lib64 && \ apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ - {%- if HAVE_PY %} - apt install -y python3 python3-pip && \ - {%- endif %} - {%- if HAVE_R %} - apt install -y r-base r-base-dev && \ - {%- endif %} mkdir -p /opt/spark && \ - {%- if HAVE_PY %} mkdir /opt/spark/python && \ - {%- endif %} mkdir -p /opt/spark/examples && \ mkdir -p /opt/spark/work-dir && \ touch /opt/spark/RELEASE && \ @@ -73,22 +65,15 @@ RUN set -ex; \ mv examples /opt/spark/; \ mv kubernetes/tests /opt/spark/; \ mv data /opt/spark/; \ - {%- if HAVE_PY %} mv python/pyspark /opt/spark/python/pyspark/; \ mv python/lib /opt/spark/python/lib/; \ - {%- endif %} - {%- if HAVE_R %} mv R /opt/spark/; \ - {%- endif %} cd ..; \ rm -rf "$SPARK_TMP"; COPY entrypoint.sh /opt/ ENV SPARK_HOME /opt/spark -{%- if HAVE_R %} -ENV R_HOME /usr/lib/R -{%- endif %} WORKDIR /opt/spark/work-dir RUN chmod g+w /opt/spark/work-dir diff --git a/add-dockerfiles.sh b/add-dockerfiles.sh index 1683f33..7dcd7b0 100755 --- a/add-dockerfiles.sh +++ b/add-dockerfiles.sh @@ -48,6 +48,11 @@ for TAG in $TAGS; do OPTS+=" --spark-version $VERSION" mkdir -p $VERSION/$TAG - python3 tools/template.py $OPTS -f entrypoint.sh.template > $VERSION/$TAG/entrypoint.sh - python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile + + if [ "$TAG" == "scala2.12-java11-ubuntu" ]; then + python3 tools/template.py $OPTS > $VERSION/$TAG/Dockerfile + python3 tools/template.py $OPTS -f entrypoint.sh.template > $VERSION/$TAG/entrypoint.sh + else + python3 tools/template.py $OPTS -f r-python.template > $VERSION/$TAG/Dockerfile + fi done diff --git a/entrypoint.sh.template b/entrypoint.sh.template index dd56d84..4bb1557 100644 --- a/entrypoint.sh.template +++ b/entrypoint.sh.template @@ -44,7 +44,6 @@ readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" fi -{%- if HAVE_PY %} if ! [ -z ${PYSPARK_PYTHON+x} ]; then export PYSPARK_PYTHON @@ -52,7 +51,6 @@ fi if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then export PYSPARK_DRIVER_PYTHON fi -{%- endif %} # If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. # It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. diff --git a/r-python.template b/r-python.template new file mode 100644 index 0000000..fec4e70 --- /dev/null +++ b/r-python.template @@ -0,0 +1,33 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+ARG BASE_IMAGE=spark:{{ SPARK_VERSION }}-scala{{ SCALA_VERSION }}-java{{ JAVA_VERSION }}-ubuntu
+FROM $BASE_IMAGE
+
+RUN set -ex && \
+    apt-get update && \
+    {%- if HAVE_PY %}
+    apt install -y python3 python3-pip && \
+    {%- endif %}
+    {%- if HAVE_R %}
+    apt install -y r-base r-base-dev && \
+    {%- endif %}
+    rm -rf /var/cache/apt/* && \
+    rm -rf /var/lib/apt/lists/*
+{%- if HAVE_R %}
+
+ENV R_HOME /usr/lib/R
+{%- endif %}
diff --git a/tools/template.py b/tools/template.py
index 693182b..cb74cc3 100755
--- a/tools/template.py
+++ b/tools/template.py
@@ -50,6 +50,20 @@ def parse_opts():
         default="3.3.0",
     )

+    parser.add_argument(
+        "-j",
+        "--java-version",
+        help="The Java version of Dockerfile.",
+        default="11",
+    )
+
+    parser.add_argument(
+        "-s",
+        "--scala-version",
+        help="The Scala version of Dockerfile.",
+        default="2.12",
+    )
+
     parser.add_argument(
         "-i",
         "--image",
@@ -88,6 +102,8 @@ def main():
             HAVE_R=opts.sparkr,
             SPARK_VERSION=opts.spark_version,
             SPARK_GPG_KEY=GPG_KEY_DICT.get(opts.spark_version),
+            JAVA_VERSION=opts.java_version,
+            SCALA_VERSION=opts.scala_version,
         )
     )
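
Usage sketch (illustrative, not part of the patch): with this layout the python3/R Dockerfiles start from ARG BASE_IMAGE / FROM $BASE_IMAGE, so the Scala/Java Ubuntu base image must exist before any derived image is built. Locally the ordering mirrors the two workflow build steps above; the tags here just match the defaults declared in the Dockerfiles, and this builds for the host architecture only (the workflow uses buildx for amd64/arm64):

    # Build the Scala 2.12 / Java 11 Ubuntu base image first
    docker build -t spark:3.4.0-scala2.12-java11-ubuntu 3.4.0/scala2.12-java11-ubuntu

    # Then build a derived image on top of it by passing BASE_IMAGE as a build arg
    docker build --build-arg BASE_IMAGE=spark:3.4.0-scala2.12-java11-ubuntu \
        -t spark:3.4.0-scala2.12-java11-python3-ubuntu 3.4.0/scala2.12-java11-python3-ubuntu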