Skip to content

Commit e2fecda

Browse files
authored
Add solution (#133)
* Start working on sol nb * Finish update to Spark 3.5.2, download 2022 & 2023 data, install Python-is-Python3 so we can run the example. * Add solution * Ignore incubator-gluten * Update workshop NB * Install pyarrow & pyiceberg for folks who want to poke around at the parquet files. * Update solutions * More update * Forward the Spark UI for ze query plans. * Update solution * Update solutions * Re-enable cross build, use launch script * Make exec * Let's make a slimmed down container for folks who need it. * Fix spark home slim down mini some more eh wait we don't need a root user install of scala. oops we do need it * Tag mini image separately * Stack them * Avoid pip cache dir * Don't keep the spark tarball in the image * Separate out build from run * shell check fixes
1 parent b3db591 commit e2fecda

File tree

10 files changed

+1231
-74
lines changed

10 files changed

+1231
-74
lines changed

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,4 +103,9 @@ incubator-glutten/*
103103
project/build.sbt
104104
coursier
105105
# Magic file we use for build tracking
106-
oldhash
106+
oldhash
107+
# ignore ipynb checkpoints
108+
.ipynb_checkpoints/
109+
110+
# ignore accel
111+
incubator-gluten/

Dockerfile

Lines changed: 3 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,7 @@
1-
# Open JDK11, Spark 3.X and the latest JDKs get a little spicy
2-
FROM azul/zulu-openjdk:11-latest
1+
ARG base
2+
FROM $base
33

4-
RUN apt-get -qq update && \
5-
apt-get -qq -y upgrade && \
6-
apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop && \
7-
locale-gen en_US.UTF-8 && \
8-
apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \
9-
echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
10-
echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
11-
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
12-
chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
13-
apt-get update && \
14-
apt-get -qq -y install sbt && \
15-
rm -rf /var/lib/apt/lists/*
16-
17-
RUN curl -Lo coursier https://git.io/coursier-cli
18-
RUN chmod +x coursier
19-
# ensure the JAR of the CLI is in the coursier cache, in the image
20-
RUN ./coursier --help
21-
RUN pip install jupyter
22-
RUN ./coursier bootstrap \
23-
-r jitpack \
24-
-i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \
25-
sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \
26-
--default=true --sources \
27-
-o almond && \
28-
./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
29-
RUN chmod a+xr almond coursier
30-
RUN ./coursier launch almond --scala 2.13.8 -- --install
31-
# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3
32-
#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8
33-
RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb
34-
35-
RUN adduser dev
36-
RUN adduser dev sudo
37-
RUN echo 'dev:dev' | chpasswd
38-
RUN mkdir -p ~dev
39-
RUN cp ./coursier ~dev/
40-
RUN echo "color_prompt=yes" >> ~dev/.bashrc
41-
RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc
42-
RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.1-bin-hadoop3" >> ~dev/.bashrc
43-
RUN chown -R dev ~dev
44-
USER dev
45-
# Kernels are installed in user so we need to run as the user
46-
RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
47-
RUN ./coursier launch almond --scala 2.13.8 -- --install
484
USER root
49-
50-
RUN mkdir /high-performance-spark-examples
51-
RUN chown -R dev /high-performance-spark-examples
52-
WORKDIR /high-performance-spark-examples
53-
# Increase the chance of caching by copying just the env setup file first.
54-
COPY --chown=dev:dev env_setup.sh ./
55-
# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
56-
# Also downloads some test data
57-
RUN SCALA_VERSION=2.13 ./env_setup.sh
58-
RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
59-
# Note: We need to use /home in the COPY otherwise no happy pandas
60-
COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
61-
RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
62-
RUN git clone https://github.com/holdenk/spark-upgrade.git
63-
RUN chown -R dev /high-performance-spark-examples
64-
ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
65-
RUN chown -R dev /high-performance-spark-examples
5+
RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro
666
USER dev
67-
RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
687
RUN sbt clean compile
69-
CMD ["jupyter-lab", "--ip", "0.0.0.0", "--port", "8877"]
70-

Dockerfile-mini

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Open JDK11, Spark 3.X and the latest JDKs get a little spicy
2+
FROM azul/zulu-openjdk:11-latest
3+
4+
RUN apt-get -qq update && \
5+
apt-get -qq -y upgrade && \
6+
apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop python-is-python3 && \
7+
locale-gen en_US.UTF-8 && \
8+
apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \
9+
echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
10+
echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
11+
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
12+
chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
13+
apt-get update && \
14+
apt-get -qq -y install sbt && \
15+
rm -rf /var/lib/apt/lists/*
16+
17+
RUN curl -Lo coursier https://git.io/coursier-cli
18+
RUN chmod +x coursier
19+
# ensure the JAR of the CLI is in the coursier cache, in the image
20+
RUN ./coursier --help
21+
RUN pip install --no-cache-dir jupyter
22+
# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3
23+
#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8
24+
RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb
25+
26+
RUN ./coursier bootstrap \
27+
-r jitpack \
28+
-i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \
29+
sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \
30+
--default=true --sources \
31+
-o almond && \
32+
./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
33+
34+
35+
RUN adduser dev
36+
RUN adduser dev sudo
37+
RUN echo 'dev:dev' | chpasswd
38+
RUN mkdir -p ~dev
39+
RUN cp ./coursier ~dev/
40+
RUN echo "color_prompt=yes" >> ~dev/.bashrc
41+
RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc
42+
RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.2-bin-hadoop3" >> ~dev/.bashrc
43+
RUN chown -R dev ~dev
44+
USER dev
45+
# Kernels are installed in user so we need to run as the user
46+
RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
47+
USER root
48+
49+
RUN mkdir -p /high-performance-spark-examples
50+
RUN mkdir -p /high-performance-spark-examples/warehouse
51+
RUN chown -R dev /high-performance-spark-examples
52+
WORKDIR /high-performance-spark-examples
53+
# Increase the chance of caching by copying just the env setup file first.
54+
COPY --chown=dev:dev env_setup.sh ./
55+
# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
56+
# Also downloads some test data
57+
RUN SCALA_VERSION=2.13 ./env_setup.sh && rm *.tgz
58+
RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
59+
# Note: We need to use /home in the COPY otherwise no happy pandas
60+
COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
61+
RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
62+
RUN chown -R dev /high-performance-spark-examples
63+
ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
64+
RUN git clone https://github.com/holdenk/spark-upgrade.git
65+
RUN chown -R dev /high-performance-spark-examples
66+
USER dev
67+
RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
68+
CMD ["/high-performance-spark-examples/misc/container_launch.sh"]
69+

build_container.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ else
1515
git archive -o myapp.tar --format=tar HEAD
1616
echo "$hash" > oldhash
1717
fi
18-
IMAGE=holdenk/hps:0.1
19-
docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push
18+
VERSION=${VERSION:-0.4}
19+
IMAGE=${IMAGE:-holdenk/hps:$VERSION}
20+
MINI_IMAGE=${MINI_IMAGE:-holdenk/hps-mini:$VERSION}
21+
docker buildx build --platform=linux/amd64,linux/arm64 -t "${MINI_IMAGE}" -f Dockerfile-mini . --push
22+
docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push --build-arg base="${MINI_IMAGE}"
2023
#docker buildx build --platform=linux/amd64 -t "${IMAGE}" . --push

env_setup.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ set -ex
44

55
# Download Spark and iceberg if not present
66
SPARK_MAJOR=${SPARK_MAJOR:-"3.5"}
7-
SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.1"}
7+
SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.2"}
88
SCALA_VERSION=${SCALA_VERSION:-"2.13"}
99
HADOOP_VERSION="3"
1010
SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
@@ -55,4 +55,10 @@ mkdir -p ./data/fetched/
5555
if [ ! -f ./data/fetched/2021 ]; then
5656
wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021
5757
fi
58+
if [ ! -f ./data/fetched/2022 ]; then
59+
wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2022" -O ./data/fetched/2022
60+
fi
61+
if [ ! -f ./data/fetched/2023 ]; then
62+
wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2023" -O ./data/fetched/2023
63+
fi
5864

0 commit comments

Comments
 (0)