From 07b46ae431386546c9c770673764394b687a3646 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 21 Aug 2023 18:39:37 +0800 Subject: [PATCH 1/7] Add Apache Spark 3.3.3 Dockerfiles --- .github/workflows/build_3.3.3.yaml | 43 +++++++ .github/workflows/publish.yml | 1 + .github/workflows/test.yml | 1 + .../Dockerfile | 86 +++++++++++++ .../entrypoint.sh | 114 ++++++++++++++++++ .../Dockerfile | 83 +++++++++++++ .../entrypoint.sh | 114 ++++++++++++++++++ 3.3.3/scala2.12-java11-r-ubuntu/Dockerfile | 82 +++++++++++++ 3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh | 107 ++++++++++++++++ 3.3.3/scala2.12-java11-ubuntu/Dockerfile | 79 ++++++++++++ 3.3.3/scala2.12-java11-ubuntu/entrypoint.sh | 107 ++++++++++++++++ tools/template.py | 2 + versions.json | 28 +++++ 13 files changed, 847 insertions(+) create mode 100644 .github/workflows/build_3.3.3.yaml create mode 100644 3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile create mode 100644 3.3.3/scala2.12-java11-python3-r-ubuntu/entrypoint.sh create mode 100644 3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile create mode 100644 3.3.3/scala2.12-java11-python3-ubuntu/entrypoint.sh create mode 100644 3.3.3/scala2.12-java11-r-ubuntu/Dockerfile create mode 100644 3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh create mode 100644 3.3.3/scala2.12-java11-ubuntu/Dockerfile create mode 100644 3.3.3/scala2.12-java11-ubuntu/entrypoint.sh diff --git a/.github/workflows/build_3.3.3.yaml b/.github/workflows/build_3.3.3.yaml new file mode 100644 index 0000000..75d3948 --- /dev/null +++ b/.github/workflows/build_3.3.3.yaml @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build and Test (3.3.3)" + +on: + pull_request: + branches: + - 'master' + paths: + - '3.3.3/**' + - '.github/workflows/build_3.3.3.yaml' + - '.github/workflows/main.yml' + +jobs: + run-build: + strategy: + matrix: + image-type: ["all", "python", "scala", "r"] + name: Run + secrets: inherit + uses: ./.github/workflows/main.yml + with: + spark: 3.3.3 + scala: 2.12 + java: 11 + image-type: ${{ matrix.image-type }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 1138a9f..d213ada 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -30,6 +30,7 @@ on: options: - 3.4.1 - 3.4.0 + - 3.3.3 - 3.3.2 - 3.3.1 - 3.3.0 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4136f1c..4f0f741 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,6 +30,7 @@ on: options: - 3.4.1 - 3.4.0 + - 3.3.3 - 3.3.2 - 3.3.1 - 3.3.0 diff --git a/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile new file mode 100644 index 0000000..aa4d5ad --- /dev/null +++ b/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile @@ -0,0 +1,86 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +FROM eclipse-temurin:11-jre-focal + +ARG spark_uid=185 + +RUN groupadd --system --gid=${spark_uid} spark && \ + useradd --system --uid=${spark_uid} --gid=spark spark + +RUN set -ex && \ + apt-get update && \ + ln -s /lib /lib64 && \ + apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ + apt install -y python3 python3-pip && \ + apt install -y r-base r-base-dev && \ + mkdir -p /opt/spark && \ + mkdir /opt/spark/python && \ + mkdir -p /opt/spark/examples && \ + mkdir -p /opt/spark/work-dir && \ + touch /opt/spark/RELEASE && \ + chown -R spark:spark /opt/spark && \ + rm /bin/sh && \ + ln -sv /bin/bash /bin/sh && \ + echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ + chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +# Install Apache Spark +# https://downloads.apache.org/spark/KEYS +ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz \ + SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz.asc \ + GPG_KEY=F6468A4FF8377B4F1C07BC2AA077F928A0BF68D8 + +RUN set -ex; \ + export SPARK_TMP="$(mktemp -d)"; \ + cd $SPARK_TMP; \ + wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ + wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ + export GNUPGHOME="$(mktemp -d)"; \ + gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ + gpg --batch --verify spark.tgz.asc spark.tgz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" spark.tgz.asc; \ + \ + tar -xf spark.tgz --strip-components=1; \ + chown -R spark:spark .; \ + mv jars /opt/spark/; \ + mv bin /opt/spark/; \ + mv sbin /opt/spark/; \ + mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ + mv examples /opt/spark/; \ + mv kubernetes/tests /opt/spark/; \ + mv data /opt/spark/; \ + mv python/pyspark /opt/spark/python/pyspark/; \ + mv python/lib /opt/spark/python/lib/; \ + mv R /opt/spark/; \ + cd ..; \ + rm -rf "$SPARK_TMP"; + +COPY entrypoint.sh /opt/ + +ENV SPARK_HOME /opt/spark +ENV R_HOME /usr/lib/R + +WORKDIR /opt/spark/work-dir +RUN chmod g+w /opt/spark/work-dir +RUN chmod a+x /opt/decom.sh +RUN chmod a+x /opt/entrypoint.sh + +ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.3.3/scala2.12-java11-python3-r-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-python3-r-ubuntu/entrypoint.sh new file mode 100644 index 0000000..4bb1557 --- /dev/null +++ b/3.3.3/scala2.12-java11-python3-r-ubuntu/entrypoint.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Check whether there is a passwd entry for the container UID +myuid=$(id -u) +mygid=$(id -g) +# turn off -e for getent because it will return error code in anonymous uid case +set +e +uidentry=$(getent passwd $myuid) +set -e + +# If there is no passwd entry for the container UID, attempt to create one +if [ -z "$uidentry" ] ; then + if [ -w /etc/passwd ] ; then + echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd + else + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + fi +fi + +if [ -z "$JAVA_HOME" ]; then + JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') +fi + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt +readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt + +if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" +fi + +if ! [ -z ${PYSPARK_PYTHON+x} ]; then + export PYSPARK_PYTHON +fi +if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then + export PYSPARK_DRIVER_PYTHON +fi + +# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. +# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. +if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then + export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" +fi + +if ! [ -z ${HADOOP_CONF_DIR+x} ]; then + SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; +fi + +if ! [ -z ${SPARK_CONF_DIR+x} ]; then + SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; +elif ! [ -z ${SPARK_HOME+x} ]; then + SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; +fi + +case "$1" in + driver) + shift 1 + CMD=( + "$SPARK_HOME/bin/spark-submit" + --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" + --deploy-mode client + "$@" + ) + ;; + executor) + shift 1 + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms$SPARK_EXECUTOR_MEMORY + -Xmx$SPARK_EXECUTOR_MEMORY + -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" + org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend + --driver-url $SPARK_DRIVER_URL + --executor-id $SPARK_EXECUTOR_ID + --cores $SPARK_EXECUTOR_CORES + --app-id $SPARK_APPLICATION_ID + --hostname $SPARK_EXECUTOR_POD_IP + --resourceProfileId $SPARK_RESOURCE_PROFILE_ID + --podName $SPARK_EXECUTOR_POD_NAME + ) + ;; + + *) + # Non-spark-on-k8s command provided, proceeding in pass-through mode... + CMD=("$@") + ;; +esac + +# Switch to spark if no USER specified (root by default) otherwise use USER directly +switch_spark_if_root() { + if [ $(id -u) -eq 0 ]; then + echo gosu spark + fi +} + +# Execute the container CMD under tini for better hygiene +exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile new file mode 100644 index 0000000..32d8950 --- /dev/null +++ b/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +FROM eclipse-temurin:11-jre-focal + +ARG spark_uid=185 + +RUN groupadd --system --gid=${spark_uid} spark && \ + useradd --system --uid=${spark_uid} --gid=spark spark + +RUN set -ex && \ + apt-get update && \ + ln -s /lib /lib64 && \ + apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ + apt install -y python3 python3-pip && \ + mkdir -p /opt/spark && \ + mkdir /opt/spark/python && \ + mkdir -p /opt/spark/examples && \ + mkdir -p /opt/spark/work-dir && \ + touch /opt/spark/RELEASE && \ + chown -R spark:spark /opt/spark && \ + rm /bin/sh && \ + ln -sv /bin/bash /bin/sh && \ + echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ + chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +# Install Apache Spark +# https://downloads.apache.org/spark/KEYS +ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz \ + SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz.asc \ + GPG_KEY=F6468A4FF8377B4F1C07BC2AA077F928A0BF68D8 + +RUN set -ex; \ + export SPARK_TMP="$(mktemp -d)"; \ + cd $SPARK_TMP; \ + wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ + wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ + export GNUPGHOME="$(mktemp -d)"; \ + gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ + gpg --batch --verify spark.tgz.asc spark.tgz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" spark.tgz.asc; \ + \ + tar -xf spark.tgz --strip-components=1; \ + chown -R spark:spark .; \ + mv jars /opt/spark/; \ + mv bin /opt/spark/; \ + mv sbin /opt/spark/; \ + mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ + mv examples /opt/spark/; \ + mv kubernetes/tests /opt/spark/; \ + mv data /opt/spark/; \ + mv python/pyspark /opt/spark/python/pyspark/; \ + mv python/lib /opt/spark/python/lib/; \ + cd ..; \ + rm -rf "$SPARK_TMP"; + +COPY entrypoint.sh /opt/ + +ENV SPARK_HOME /opt/spark + +WORKDIR /opt/spark/work-dir +RUN chmod g+w /opt/spark/work-dir +RUN chmod a+x /opt/decom.sh +RUN chmod a+x /opt/entrypoint.sh + +ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.3.3/scala2.12-java11-python3-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-python3-ubuntu/entrypoint.sh new file mode 100644 index 0000000..4bb1557 --- /dev/null +++ b/3.3.3/scala2.12-java11-python3-ubuntu/entrypoint.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Check whether there is a passwd entry for the container UID +myuid=$(id -u) +mygid=$(id -g) +# turn off -e for getent because it will return error code in anonymous uid case +set +e +uidentry=$(getent passwd $myuid) +set -e + +# If there is no passwd entry for the container UID, attempt to create one +if [ -z "$uidentry" ] ; then + if [ -w /etc/passwd ] ; then + echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd + else + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + fi +fi + +if [ -z "$JAVA_HOME" ]; then + JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') +fi + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt +readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt + +if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" +fi + +if ! [ -z ${PYSPARK_PYTHON+x} ]; then + export PYSPARK_PYTHON +fi +if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then + export PYSPARK_DRIVER_PYTHON +fi + +# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. +# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. +if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then + export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" +fi + +if ! [ -z ${HADOOP_CONF_DIR+x} ]; then + SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; +fi + +if ! [ -z ${SPARK_CONF_DIR+x} ]; then + SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; +elif ! [ -z ${SPARK_HOME+x} ]; then + SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; +fi + +case "$1" in + driver) + shift 1 + CMD=( + "$SPARK_HOME/bin/spark-submit" + --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" + --deploy-mode client + "$@" + ) + ;; + executor) + shift 1 + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms$SPARK_EXECUTOR_MEMORY + -Xmx$SPARK_EXECUTOR_MEMORY + -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" + org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend + --driver-url $SPARK_DRIVER_URL + --executor-id $SPARK_EXECUTOR_ID + --cores $SPARK_EXECUTOR_CORES + --app-id $SPARK_APPLICATION_ID + --hostname $SPARK_EXECUTOR_POD_IP + --resourceProfileId $SPARK_RESOURCE_PROFILE_ID + --podName $SPARK_EXECUTOR_POD_NAME + ) + ;; + + *) + # Non-spark-on-k8s command provided, proceeding in pass-through mode... 
+ CMD=("$@") + ;; +esac + +# Switch to spark if no USER specified (root by default) otherwise use USER directly +switch_spark_if_root() { + if [ $(id -u) -eq 0 ]; then + echo gosu spark + fi +} + +# Execute the container CMD under tini for better hygiene +exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile new file mode 100644 index 0000000..effda7d --- /dev/null +++ b/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile @@ -0,0 +1,82 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +FROM eclipse-temurin:11-jre-focal + +ARG spark_uid=185 + +RUN groupadd --system --gid=${spark_uid} spark && \ + useradd --system --uid=${spark_uid} --gid=spark spark + +RUN set -ex && \ + apt-get update && \ + ln -s /lib /lib64 && \ + apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ + apt install -y r-base r-base-dev && \ + mkdir -p /opt/spark && \ + mkdir -p /opt/spark/examples && \ + mkdir -p /opt/spark/work-dir && \ + touch /opt/spark/RELEASE && \ + chown -R spark:spark /opt/spark && \ + rm /bin/sh && \ + ln -sv /bin/bash /bin/sh && \ + echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ + chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +# Install Apache Spark +# https://downloads.apache.org/spark/KEYS +ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz \ + SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz.asc \ + GPG_KEY=F6468A4FF8377B4F1C07BC2AA077F928A0BF68D8 + +RUN set -ex; \ + export SPARK_TMP="$(mktemp -d)"; \ + cd $SPARK_TMP; \ + wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ + wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ + export GNUPGHOME="$(mktemp -d)"; \ + gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ + gpg --batch --verify spark.tgz.asc spark.tgz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" spark.tgz.asc; \ + \ + tar -xf spark.tgz --strip-components=1; \ + chown -R spark:spark .; \ + mv jars /opt/spark/; \ + mv bin /opt/spark/; \ + mv sbin /opt/spark/; \ + mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ + mv examples /opt/spark/; \ + mv kubernetes/tests /opt/spark/; \ + mv data /opt/spark/; \ + mv R /opt/spark/; \ + cd ..; \ + rm -rf "$SPARK_TMP"; + +COPY entrypoint.sh /opt/ + +ENV SPARK_HOME /opt/spark +ENV R_HOME /usr/lib/R + +WORKDIR /opt/spark/work-dir +RUN chmod g+w /opt/spark/work-dir +RUN chmod a+x /opt/decom.sh +RUN chmod a+x /opt/entrypoint.sh + +ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git 
a/3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh new file mode 100644 index 0000000..159d539 --- /dev/null +++ b/3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Check whether there is a passwd entry for the container UID +myuid=$(id -u) +mygid=$(id -g) +# turn off -e for getent because it will return error code in anonymous uid case +set +e +uidentry=$(getent passwd $myuid) +set -e + +# If there is no passwd entry for the container UID, attempt to create one +if [ -z "$uidentry" ] ; then + if [ -w /etc/passwd ] ; then + echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd + else + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + fi +fi + +if [ -z "$JAVA_HOME" ]; then + JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') +fi + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt +readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt + +if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" +fi + +# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. +# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. +if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then + export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" +fi + +if ! [ -z ${HADOOP_CONF_DIR+x} ]; then + SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; +fi + +if ! [ -z ${SPARK_CONF_DIR+x} ]; then + SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; +elif ! [ -z ${SPARK_HOME+x} ]; then + SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; +fi + +case "$1" in + driver) + shift 1 + CMD=( + "$SPARK_HOME/bin/spark-submit" + --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" + --deploy-mode client + "$@" + ) + ;; + executor) + shift 1 + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms$SPARK_EXECUTOR_MEMORY + -Xmx$SPARK_EXECUTOR_MEMORY + -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" + org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend + --driver-url $SPARK_DRIVER_URL + --executor-id $SPARK_EXECUTOR_ID + --cores $SPARK_EXECUTOR_CORES + --app-id $SPARK_APPLICATION_ID + --hostname $SPARK_EXECUTOR_POD_IP + --resourceProfileId $SPARK_RESOURCE_PROFILE_ID + --podName $SPARK_EXECUTOR_POD_NAME + ) + ;; + + *) + # Non-spark-on-k8s command provided, proceeding in pass-through mode... 
+ CMD=("$@") + ;; +esac + +# Switch to spark if no USER specified (root by default) otherwise use USER directly +switch_spark_if_root() { + if [ $(id -u) -eq 0 ]; then + echo gosu spark + fi +} + +# Execute the container CMD under tini for better hygiene +exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.3.3/scala2.12-java11-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-ubuntu/Dockerfile new file mode 100644 index 0000000..7b7698a --- /dev/null +++ b/3.3.3/scala2.12-java11-ubuntu/Dockerfile @@ -0,0 +1,79 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +FROM eclipse-temurin:11-jre-focal + +ARG spark_uid=185 + +RUN groupadd --system --gid=${spark_uid} spark && \ + useradd --system --uid=${spark_uid} --gid=spark spark + +RUN set -ex && \ + apt-get update && \ + ln -s /lib /lib64 && \ + apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ + mkdir -p /opt/spark && \ + mkdir -p /opt/spark/examples && \ + mkdir -p /opt/spark/work-dir && \ + touch /opt/spark/RELEASE && \ + chown -R spark:spark /opt/spark && \ + rm /bin/sh && \ + ln -sv /bin/bash /bin/sh && \ + echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ + chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +# Install Apache Spark +# https://downloads.apache.org/spark/KEYS +ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz \ + SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz.asc \ + GPG_KEY=F6468A4FF8377B4F1C07BC2AA077F928A0BF68D8 + +RUN set -ex; \ + export SPARK_TMP="$(mktemp -d)"; \ + cd $SPARK_TMP; \ + wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ + wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ + export GNUPGHOME="$(mktemp -d)"; \ + gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ + gpg --batch --verify spark.tgz.asc spark.tgz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" spark.tgz.asc; \ + \ + tar -xf spark.tgz --strip-components=1; \ + chown -R spark:spark .; \ + mv jars /opt/spark/; \ + mv bin /opt/spark/; \ + mv sbin /opt/spark/; \ + mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ + mv examples /opt/spark/; \ + mv kubernetes/tests /opt/spark/; \ + mv data /opt/spark/; \ + cd ..; \ + rm -rf "$SPARK_TMP"; + +COPY entrypoint.sh /opt/ + +ENV SPARK_HOME /opt/spark + +WORKDIR /opt/spark/work-dir +RUN chmod g+w /opt/spark/work-dir +RUN chmod a+x /opt/decom.sh +RUN chmod a+x /opt/entrypoint.sh + +ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh new file mode 100644 index 
0000000..159d539 --- /dev/null +++ b/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Check whether there is a passwd entry for the container UID +myuid=$(id -u) +mygid=$(id -g) +# turn off -e for getent because it will return error code in anonymous uid case +set +e +uidentry=$(getent passwd $myuid) +set -e + +# If there is no passwd entry for the container UID, attempt to create one +if [ -z "$uidentry" ] ; then + if [ -w /etc/passwd ] ; then + echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd + else + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + fi +fi + +if [ -z "$JAVA_HOME" ]; then + JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') +fi + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt +readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt + +if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" +fi + +# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. +# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. +if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then + export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" +fi + +if ! [ -z ${HADOOP_CONF_DIR+x} ]; then + SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; +fi + +if ! [ -z ${SPARK_CONF_DIR+x} ]; then + SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; +elif ! [ -z ${SPARK_HOME+x} ]; then + SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; +fi + +case "$1" in + driver) + shift 1 + CMD=( + "$SPARK_HOME/bin/spark-submit" + --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" + --deploy-mode client + "$@" + ) + ;; + executor) + shift 1 + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms$SPARK_EXECUTOR_MEMORY + -Xmx$SPARK_EXECUTOR_MEMORY + -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" + org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend + --driver-url $SPARK_DRIVER_URL + --executor-id $SPARK_EXECUTOR_ID + --cores $SPARK_EXECUTOR_CORES + --app-id $SPARK_APPLICATION_ID + --hostname $SPARK_EXECUTOR_POD_IP + --resourceProfileId $SPARK_RESOURCE_PROFILE_ID + --podName $SPARK_EXECUTOR_POD_NAME + ) + ;; + + *) + # Non-spark-on-k8s command provided, proceeding in pass-through mode... 
+ CMD=("$@") + ;; +esac + +# Switch to spark if no USER specified (root by default) otherwise use USER directly +switch_spark_if_root() { + if [ $(id -u) -eq 0 ]; then + echo gosu spark + fi +} + +# Execute the container CMD under tini for better hygiene +exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/tools/template.py b/tools/template.py index cdc167c..d305e62 100755 --- a/tools/template.py +++ b/tools/template.py @@ -28,6 +28,8 @@ "3.3.1": "86727D43E73A415F67A0B1A14E68B3E6CD473653", # issuer "viirya@apache.org" "3.3.2": "C56349D886F2B01F8CAE794C653C2301FEA493EE", + # issuer "yumwang@apache.org" + "3.3.3": "F6468A4FF8377B4F1C07BC2AA077F928A0BF68D8", # issuer "xinrong@apache.org" "3.4.0": "CC68B3D16FE33A766705160BA7E57908C7A4E1B1", # issuer "dongjoon@apache.org" diff --git a/versions.json b/versions.json index 3ef0101..647eb09 100644 --- a/versions.json +++ b/versions.json @@ -60,6 +60,34 @@ "3.4.0-scala2.12-java11-python3-r-ubuntu" ] }, + { + "path": "3.3.3/scala2.12-java11-python3-ubuntu", + "tags": [ + "3.3.2-scala2.12-java11-python3-ubuntu", + "3.3.2-python3", + "3.3.2" + ] + }, + { + "path": "3.3.3/scala2.12-java11-r-ubuntu", + "tags": [ + "3.3.3-scala2.12-java11-r-ubuntu", + "3.3.3-r" + ] + }, + { + "path": "3.3.3/scala2.12-java11-ubuntu", + "tags": [ + "3.3.3-scala2.12-java11-ubuntu", + "3.3.3-scala" + ] + }, + { + "path": "3.3.3/scala2.12-java11-python3-r-ubuntu", + "tags": [ + "3.3.3-scala2.12-java11-python3-r-ubuntu" + ] + }, { "path": "3.3.1/scala2.12-java11-python3-ubuntu", "tags": [ From f7d5e010af78d7ec128151c34d9c4f8165df950c Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 22 Aug 2023 11:33:07 +0800 Subject: [PATCH 2/7] Try hkps://keyserver.ubuntu.com --- 3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile | 2 +- 3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile | 2 +- 3.3.3/scala2.12-java11-r-ubuntu/Dockerfile | 2 +- 3.3.3/scala2.12-java11-ubuntu/Dockerfile | 2 +- versions.json | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile index aa4d5ad..5260bbb 100644 --- a/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile @@ -52,7 +52,7 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + # gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ diff --git a/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile index 32d8950..f2f52a5 100644 --- a/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile @@ -51,7 +51,7 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + # gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ diff --git a/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile index effda7d..aea4c27 100644 --- 
a/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile @@ -50,7 +50,7 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + # gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ diff --git a/3.3.3/scala2.12-java11-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-ubuntu/Dockerfile index 7b7698a..e8cb0ec 100644 --- a/3.3.3/scala2.12-java11-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-ubuntu/Dockerfile @@ -49,7 +49,7 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + # gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ diff --git a/versions.json b/versions.json index 647eb09..1442bfe 100644 --- a/versions.json +++ b/versions.json @@ -63,9 +63,9 @@ { "path": "3.3.3/scala2.12-java11-python3-ubuntu", "tags": [ - "3.3.2-scala2.12-java11-python3-ubuntu", - "3.3.2-python3", - "3.3.2" + "3.3.3-scala2.12-java11-python3-ubuntu", + "3.3.3-python3", + "3.3.3" ] }, { From 6161e1ccd8ba8ef0273e6ba22c9537c8fca809e9 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 22 Aug 2023 11:49:58 +0800 Subject: [PATCH 3/7] fix --- 3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile | 2 +- 3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile | 2 +- 3.3.3/scala2.12-java11-r-ubuntu/Dockerfile | 2 +- 3.3.3/scala2.12-java11-ubuntu/Dockerfile | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile index 5260bbb..b2735e9 100644 --- a/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile @@ -52,7 +52,7 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - # gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY"; \ gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ diff --git a/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile index f2f52a5..a7c8d1d 100644 --- a/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile @@ -51,7 +51,7 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - # gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY"; \ gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ diff --git a/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile index aea4c27..7d6cb34 100644 --- a/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile +++ 
b/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile @@ -50,7 +50,7 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - # gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY"; \ gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ diff --git a/3.3.3/scala2.12-java11-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-ubuntu/Dockerfile index e8cb0ec..31bf002 100644 --- a/3.3.3/scala2.12-java11-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-ubuntu/Dockerfile @@ -49,7 +49,7 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - # gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY"; \ gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ From ba3468842ce842edbd2166d3f6f8b92ff102c7e1 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 22 Aug 2023 13:15:18 +0800 Subject: [PATCH 4/7] try --- .../Dockerfile | 71 ++--------- .../entrypoint.sh | 114 ------------------ .../Dockerfile | 69 +---------- .../entrypoint.sh | 114 ------------------ 3.3.3/scala2.12-java11-r-ubuntu/Dockerfile | 66 +--------- 3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh | 107 ---------------- 3.3.3/scala2.12-java11-ubuntu/Dockerfile | 38 +++--- 7 files changed, 38 insertions(+), 541 deletions(-) delete mode 100644 3.3.3/scala2.12-java11-python3-r-ubuntu/entrypoint.sh delete mode 100644 3.3.3/scala2.12-java11-python3-ubuntu/entrypoint.sh delete mode 100644 3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh diff --git a/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile index b2735e9..3d8599a 100644 --- a/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-python3-r-ubuntu/Dockerfile @@ -14,73 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -FROM eclipse-temurin:11-jre-focal +FROM spark:3.3.3-scala2.12-java11-ubuntu -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark - -RUN set -ex && \ - apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ - apt install -y python3 python3-pip && \ - apt install -y r-base r-base-dev && \ - mkdir -p /opt/spark && \ - mkdir /opt/spark/python && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ - rm -rf /var/cache/apt/* && \ - rm -rf /var/lib/apt/lists/* - -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz.asc \ - GPG_KEY=F6468A4FF8377B4F1C07BC2AA077F928A0BF68D8 +USER root RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY"; \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv python/pyspark /opt/spark/python/pyspark/; \ - mv python/lib /opt/spark/python/lib/; \ - mv R /opt/spark/; \ - cd ..; \ - rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ + apt-get update; \ + apt-get install -y python3 python3-pip; \ + apt-get install -y r-base r-base-dev; \ + rm -rf /var/lib/apt/lists/* -ENV SPARK_HOME /opt/spark ENV R_HOME /usr/lib/R -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh - -ENTRYPOINT [ "/opt/entrypoint.sh" ] +USER spark diff --git a/3.3.3/scala2.12-java11-python3-r-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-python3-r-ubuntu/entrypoint.sh deleted file mode 100644 index 4bb1557..0000000 --- a/3.3.3/scala2.12-java11-python3-r-ubuntu/entrypoint.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -if ! [ -z ${PYSPARK_PYTHON+x} ]; then - export PYSPARK_PYTHON -fi -if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then - export PYSPARK_DRIVER_PYTHON -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! [ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... - CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile index a7c8d1d..68afdd5 100644 --- a/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-python3-ubuntu/Dockerfile @@ -14,70 +14,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -FROM eclipse-temurin:11-jre-focal +FROM spark:3.3.3-scala2.12-java11-ubuntu -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark - -RUN set -ex && \ - apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ - apt install -y python3 python3-pip && \ - mkdir -p /opt/spark && \ - mkdir /opt/spark/python && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ - rm -rf /var/cache/apt/* && \ - rm -rf /var/lib/apt/lists/* - -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz.asc \ - GPG_KEY=F6468A4FF8377B4F1C07BC2AA077F928A0BF68D8 +USER root RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY"; \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv python/pyspark /opt/spark/python/pyspark/; \ - mv python/lib /opt/spark/python/lib/; \ - cd ..; \ - rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ - -ENV SPARK_HOME /opt/spark - -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh + apt-get update; \ + apt-get install -y python3 python3-pip; \ + rm -rf /var/lib/apt/lists/* -ENTRYPOINT [ "/opt/entrypoint.sh" ] +USER spark diff --git a/3.3.3/scala2.12-java11-python3-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-python3-ubuntu/entrypoint.sh deleted file mode 100644 index 4bb1557..0000000 --- a/3.3.3/scala2.12-java11-python3-ubuntu/entrypoint.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -if ! [ -z ${PYSPARK_PYTHON+x} ]; then - export PYSPARK_PYTHON -fi -if ! [ -z ${PYSPARK_DRIVER_PYTHON+x} ]; then - export PYSPARK_DRIVER_PYTHON -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! [ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... - CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile index 7d6cb34..b89d87f 100644 --- a/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-r-ubuntu/Dockerfile @@ -14,69 +14,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -FROM eclipse-temurin:11-jre-focal +FROM spark:3.3.3-scala2.12-java11-ubuntu -ARG spark_uid=185 - -RUN groupadd --system --gid=${spark_uid} spark && \ - useradd --system --uid=${spark_uid} --gid=spark spark - -RUN set -ex && \ - apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ - apt install -y r-base r-base-dev && \ - mkdir -p /opt/spark && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ - rm -rf /var/cache/apt/* && \ - rm -rf /var/lib/apt/lists/* - -# Install Apache Spark -# https://downloads.apache.org/spark/KEYS -ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz \ - SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz.asc \ - GPG_KEY=F6468A4FF8377B4F1C07BC2AA077F928A0BF68D8 +USER root RUN set -ex; \ - export SPARK_TMP="$(mktemp -d)"; \ - cd $SPARK_TMP; \ - wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ - wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ - export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY"; \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ - gpg --batch --verify spark.tgz.asc spark.tgz; \ - gpgconf --kill all; \ - rm -rf "$GNUPGHOME" spark.tgz.asc; \ - \ - tar -xf spark.tgz --strip-components=1; \ - chown -R spark:spark .; \ - mv jars /opt/spark/; \ - mv bin /opt/spark/; \ - mv sbin /opt/spark/; \ - mv kubernetes/dockerfiles/spark/decom.sh /opt/; \ - mv examples /opt/spark/; \ - mv kubernetes/tests /opt/spark/; \ - mv data /opt/spark/; \ - mv R /opt/spark/; \ - cd ..; \ - rm -rf "$SPARK_TMP"; - -COPY entrypoint.sh /opt/ + apt-get update; \ + apt-get install -y r-base r-base-dev; \ + rm -rf /var/lib/apt/lists/* -ENV SPARK_HOME /opt/spark ENV R_HOME /usr/lib/R -WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh - -ENTRYPOINT [ "/opt/entrypoint.sh" ] +USER spark diff --git a/3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh deleted file mode 100644 index 159d539..0000000 --- a/3.3.3/scala2.12-java11-r-ubuntu/entrypoint.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -if [ -z "$JAVA_HOME" ]; then - JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') -fi - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt - -if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" -fi - -# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. -# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. -if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then - export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" -fi - -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; -fi - -if ! [ -z ${SPARK_CONF_DIR+x} ]; then - SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! [ -z ${SPARK_HOME+x} ]; then - SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; -fi - -case "$1" in - driver) - shift 1 - CMD=( - "$SPARK_HOME/bin/spark-submit" - --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" - --deploy-mode client - "$@" - ) - ;; - executor) - shift 1 - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" - org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME - ) - ;; - - *) - # Non-spark-on-k8s command provided, proceeding in pass-through mode... 
- CMD=("$@") - ;; -esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" diff --git a/3.3.3/scala2.12-java11-ubuntu/Dockerfile b/3.3.3/scala2.12-java11-ubuntu/Dockerfile index 31bf002..1bc6900 100644 --- a/3.3.3/scala2.12-java11-ubuntu/Dockerfile +++ b/3.3.3/scala2.12-java11-ubuntu/Dockerfile @@ -21,20 +21,17 @@ ARG spark_uid=185 RUN groupadd --system --gid=${spark_uid} spark && \ useradd --system --uid=${spark_uid} --gid=spark spark -RUN set -ex && \ - apt-get update && \ - ln -s /lib /lib64 && \ - apt install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu && \ - mkdir -p /opt/spark && \ - mkdir -p /opt/spark/examples && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - chown -R spark:spark /opt/spark && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \ - rm -rf /var/cache/apt/* && \ +RUN set -ex; \ + apt-get update; \ + apt-get install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu libnss-wrapper; \ + mkdir -p /opt/spark; \ + mkdir /opt/spark/python; \ + mkdir -p /opt/spark/examples; \ + mkdir -p /opt/spark/work-dir; \ + chmod g+w /opt/spark/work-dir; \ + touch /opt/spark/RELEASE; \ + chown -R spark:spark /opt/spark; \ + echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su; \ rm -rf /var/lib/apt/lists/* # Install Apache Spark @@ -49,8 +46,8 @@ RUN set -ex; \ wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \ wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \ export GNUPGHOME="$(mktemp -d)"; \ - gpg --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY"; \ - gpg --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ + gpg --batch --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \ + gpg --batch --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \ gpg --batch --verify spark.tgz.asc spark.tgz; \ gpgconf --kill all; \ rm -rf "$GNUPGHOME" spark.tgz.asc; \ @@ -64,6 +61,10 @@ RUN set -ex; \ mv examples /opt/spark/; \ mv kubernetes/tests /opt/spark/; \ mv data /opt/spark/; \ + mv python/pyspark /opt/spark/python/pyspark/; \ + mv python/lib /opt/spark/python/lib/; \ + mv R /opt/spark/; \ + chmod a+x /opt/decom.sh; \ cd ..; \ rm -rf "$SPARK_TMP"; @@ -72,8 +73,7 @@ COPY entrypoint.sh /opt/ ENV SPARK_HOME /opt/spark WORKDIR /opt/spark/work-dir -RUN chmod g+w /opt/spark/work-dir -RUN chmod a+x /opt/decom.sh -RUN chmod a+x /opt/entrypoint.sh + +USER spark ENTRYPOINT [ "/opt/entrypoint.sh" ] From efbff7c42ff199e9ff9031e2e694d418c7fec23c Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 22 Aug 2023 13:34:10 +0800 Subject: [PATCH 5/7] Empty-Commit From 8d32172d47c69929a2f64f39979f40b3fdd35225 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 22 Aug 2023 13:43:42 +0800 Subject: [PATCH 6/7] fix --- 3.3.3/scala2.12-java11-ubuntu/entrypoint.sh | 101 ++++++++++++-------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh index 159d539..2e3d2a8 100644 --- a/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh +++ b/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh @@ -15,52 +15,75 @@ # See the License for the specific 
language governing permissions and # limitations under the License. # +# Prevent any errors from being silently ignored +set -eo pipefail -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -# turn off -e for getent because it will return error code in anonymous uid case -set +e -uidentry=$(getent passwd $myuid) -set -e - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:${SPARK_USER_NAME:-anonymous uid}:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi +attempt_setup_fake_passwd_entry() { + # Check whether there is a passwd entry for the container UID + local myuid; myuid="$(id -u)" + # If there is no passwd entry for the container UID, attempt to fake one + # You can also refer to the https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523 + # It's to resolve OpenShift random UID case. + # See also: https://github.com/docker-library/postgres/pull/448 + if ! getent passwd "$myuid" &> /dev/null; then + local wrapper + for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do + if [ -s "$wrapper" ]; then + NSS_WRAPPER_PASSWD="$(mktemp)" + NSS_WRAPPER_GROUP="$(mktemp)" + export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP + local mygid; mygid="$(id -g)" + printf 'spark:x:%s:%s:${SPARK_USER_NAME:-anonymous uid}:%s:/bin/false\n' "$myuid" "$mygid" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD" + printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP" + break + fi + done + fi +} if [ -z "$JAVA_HOME" ]; then JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}') fi SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sort -t_ -k4 -n | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt +for v in "${!SPARK_JAVA_OPT_@}"; do + SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" ) +done if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH" fi +if ! [ -z "${PYSPARK_PYTHON+x}" ]; then + export PYSPARK_PYTHON +fi +if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then + export PYSPARK_DRIVER_PYTHON +fi + # If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor. # It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s. if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)" fi -if ! [ -z ${HADOOP_CONF_DIR+x} ]; then +if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH"; fi -if ! [ -z ${SPARK_CONF_DIR+x} ]; then +if ! [ -z "${SPARK_CONF_DIR+x}" ]; then SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH"; -elif ! [ -z ${SPARK_HOME+x} ]; then +elif ! 
[ -z "${SPARK_HOME+x}" ]; then SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH"; fi +# Switch to spark if no USER specified (root by default) otherwise use USER directly +switch_spark_if_root() { + if [ $(id -u) -eq 0 ]; then + echo gosu spark + fi +} + case "$1" in driver) shift 1 @@ -70,38 +93,34 @@ case "$1" in --deploy-mode client "$@" ) + attempt_setup_fake_passwd_entry + # Execute the container CMD under tini for better hygiene + exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" ;; executor) shift 1 CMD=( ${JAVA_HOME}/bin/java "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY + -Xms"$SPARK_EXECUTOR_MEMORY" + -Xmx"$SPARK_EXECUTOR_MEMORY" -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH" org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - --resourceProfileId $SPARK_RESOURCE_PROFILE_ID - --podName $SPARK_EXECUTOR_POD_NAME + --driver-url "$SPARK_DRIVER_URL" + --executor-id "$SPARK_EXECUTOR_ID" + --cores "$SPARK_EXECUTOR_CORES" + --app-id "$SPARK_APPLICATION_ID" + --hostname "$SPARK_EXECUTOR_POD_IP" + --resourceProfileId "$SPARK_RESOURCE_PROFILE_ID" + --podName "$SPARK_EXECUTOR_POD_NAME" ) + attempt_setup_fake_passwd_entry + # Execute the container CMD under tini for better hygiene + exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" ;; *) # Non-spark-on-k8s command provided, proceeding in pass-through mode... - CMD=("$@") + exec "$@" ;; esac - -# Switch to spark if no USER specified (root by default) otherwise use USER directly -switch_spark_if_root() { - if [ $(id -u) -eq 0 ]; then - echo gosu spark - fi -} - -# Execute the container CMD under tini for better hygiene -exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}" From cd14211e781c5239a60b11116c4bac024e291063 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 22 Aug 2023 13:51:59 +0800 Subject: [PATCH 7/7] fix --- 3.3.3/scala2.12-java11-ubuntu/entrypoint.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 3.3.3/scala2.12-java11-ubuntu/entrypoint.sh diff --git a/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh b/3.3.3/scala2.12-java11-ubuntu/entrypoint.sh old mode 100644 new mode 100755