diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..b5e1d28e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,269 @@ +name: CI +on: + pull_request: + push: + +jobs: + test: + strategy: + fail-fast: false + matrix: + include: + - java: 17 + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Sync the current branch with the latest + if: github.repository != 'high-performance-spark/high-performance-spark-examples' + id: sync-branch + run: | + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/} + git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD || echo "no merge needed." + git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" || echo "no merge needed." + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ matrix.java }} + cache: sbt + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Scala Build and Test + run: sbt clean package +test + + python-test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + + - name: Run tox + run: | + cd python; tox + + run-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Run sql examples + run: ./run_sql_examples.sh + + # run-gluten-sql-examples: + # runs-on: ubuntu-latest + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # - name: Cache Spark and friends + # uses: actions/cache@v4 + # with: + # path: | + # spark*.tgz + # iceberg*.jar + # key: spark-artifacts + # - name: Setup JDK + # uses: actions/setup-java@v4 + # with: + # distribution: temurin + # java-version: 17 + # - name: Add sbt + # uses: sbt/setup-sbt@v1 + # - name: Cache Maven packages + # uses: actions/cache@v4 + # with: + # path: ~/.m2 + # key: ${{ runner.os }}-m2-gluten + # - name: Cache Data + # uses: actions/cache@v4 + # with: + # path: | + # data/fetched/* + # key: data-fetched + # - name: Run gluten + # run: | + # cd accelerators; ./gluten_spark_34_ex.sh + + run-comet-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Cache Maven packages + uses: actions/cache@v4 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-comet + + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Setup comet + run: | + cd accelerators; SPARK_MAJOR=3.5 
./setup_comet.sh + + - name: Run comet + run: | + cd accelerators; ./comet_ex.sh + + run-target-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + + - name: Cache Accel + uses: actions/cache@v4 + with: + path: | + accelerators/*.jar + key: accelerators-artifacts + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Run the target validator example + run: | + cd target-validator; ./runme.sh + + run-pyspark-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + cache: sbt + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Run PySpark examples + run: ./run_pyspark_examples.sh + + style: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Shellcheck + run: | + sudo apt-get update + sudo apt-get install -y shellcheck + shellcheck -e SC2317,SC1091,SC2034,SC2164 $(find -name "*.sh") + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + cache: sbt + - name: Add sbt + uses: sbt/setup-sbt@v1 + - name: scala style + run: + sbt scalastyle diff --git a/.gitignore b/.gitignore index 4a8e38ca..30685846 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ lib_managed/ src_managed/ project/boot/ project/plugins/project/ +.bsp # Scala-IDE specific .scala_dependencies @@ -23,11 +24,88 @@ project/plugins/project/ *~ sbt/*launch*.jar +# VSCode specific +.vscode +.history + +# Metals +.metals +.bloop +metals.sbt + # python *.pyc +.tox +.bsp + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +# scala stuff +.metals # native *.o *.so *.so.0.0.0 -*.so.0 \ No newline at end of file +*.so.0 + +# Spark files +*.tgz +iceberg-spark-runtime-*.jar +spark-*-bin-hadoop*/ + +# Warehouse +spark-warehouse/ +warehouse/ +metastore_db/ + +# Misc internal stuff +sql/*.sql.out +python/examples/*.py.out +data/fetched/* +spark_expectations_sample_rules.json + +# more python +pyspark_venv.tar.gz +pyspark_venv/ + +# accel stuff +accelerators/*.jar +accelerators/arrow-datafusion-comet +# ignore gluten +gluten +gluten*.jar +spark-3*hadoop*/ +spark-3*hadoop*.tgz +accelerators/incubator-gluten +# ignore the temporary myapp from the dockerbuild +myapp.tar +# ignore glutten +incubator-glutten/* +# ignore nested build file. 
+project/build.sbt +coursier +# Magic file we use for build tracking +oldhash +# ignore ipynb checkpoints +.ipynb_checkpoints/ + +# ignore accel +incubator-gluten/ diff --git a/.jvmopts b/.jvmopts new file mode 100644 index 00000000..694a6c7d --- /dev/null +++ b/.jvmopts @@ -0,0 +1,4 @@ + -Xms4096M + -Xmx8096M + -Xss2M + -XX:MaxMetaspaceSize=4024M \ No newline at end of file diff --git a/.scalafix.conf b/.scalafix.conf new file mode 100644 index 00000000..8697e8ff --- /dev/null +++ b/.scalafix.conf @@ -0,0 +1,31 @@ +UnionRewrite.deprecatedMethod { + "unionAll" = "union" +} + +OrganizeImports { + blankLines = Auto, + groups = [ + "re:javax?\\." + "scala." + "org.apache.spark." + "*" + ], + removeUnused = false +} + +rules = [ + DisableSyntax, + SparkAutoUpgrade, + MigrateHiveContext, + MigrateToSparkSessionBuilder, + MigrateDeprecatedDataFrameReaderFuns, + AccumulatorUpgrade, + onFailureFix, + ExecutorPluginWarn, + UnionRewrite, + GroupByKeyWarn, + GroupByKeyRewrite, + MetadataWarnQQ, + ScalaTestExtendsFix, + ScalaTestImportChange +] \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 58147d3d..00000000 --- a/.travis.yml +++ /dev/null @@ -1,64 +0,0 @@ -language: scala -sudo: false -cache: - directories: - - $HOME/.ivy2 - - $HOME/spark - - $HOME/.cache/pip - - $HOME/.pip-cache - - $HOME/.sbt/launchers - - $HOME/perl5 -scala: - - 2.11.6 -jdk: - - oraclejdk8 -r: - - release -addons: - apt: - sources: - - ubuntu-toolchain-r-test - - ppa:marutter/rdev - packages: - - gfortran - - gcc - - binutils - - python-pip - - python-pandas - - python-numpy - - gfortran - - cmake - - perl - - cpanminus - - r-base - - libcurl4-gnutls-dev - - libxml2-dev - - libssl-dev - - r-base-dev - - axel -r_packages: - - Imap -before_install: - - # Setup Python - - pip install --user codecov unittest2 nose pep8 pylint - - # Setup perl - - cpanm --force --local-lib $HOME/perl5 --quite --notest Pithub || cat ~/.cpanm/build.log - - cd ./src/main/perl; cpanm --local-lib $HOME/perl5 --force --quiet --installdeps --notest .; cd ../../../ - - PATH="$HOME/perl5/bin${PATH:+:${PATH}}"; export PATH; - - PERL5LIB=":$HOME/perl5/lib/perl5${PERL5LIB:+:${PERL5LIB}}"; export PERL5LIB; - - PERL_LOCAL_LIB_ROOT="$HOME/perl5${PERL_LOCAL_LIB_ROOT:+:${PERL_LOCAL_LIB_ROOT}}"; export PERL_LOCAL_LIB_ROOT; - - PERL_MB_OPT="--install_base \"$HOME/perl5\""; export PERL_MB_OPT; - - PERL_MM_OPT="INSTALL_BASE=$HOME/perl5"; export PERL_MM_OPT; -script: - - "export SPARK_CONF_DIR=./log4j/" - - sbt clean coverage compile package assembly test || (rm -rf ~/.ivy2 ~/.m2 && sbt clean coverage compile package test) - - "[ -f spark] || mkdir spark && cd spark && axel http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz && cd .." 
- - "tar -xf ./spark/spark-2.1.0-bin-hadoop2.7.tgz" - - "export SPARK_HOME=`pwd`/spark-2.1.0-bin-hadoop2.7" - - "export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH" - - "PYSPARK_SUBMIT_ARGS='--jars ./target/examples-assembly-0.0.1.jar pyspark-shell' nosetests --with-doctest --doctest-options=+ELLIPSIS --logging-level=INFO --detailed-errors --verbosity=2 --with-coverage --cover-html-dir=./htmlcov" - - # $SPARK_HOME/bin/spark-submit ./src/main/r/wc.R $SPARK_HOME/README.md - - # $SPARK_HOME/bin/spark-submit ./src/main/r/dapply.R -after_success: - - sbt coverageReport || sbt update coverageReport - - codecov \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..c4feed87 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +ARG base +FROM $base + +USER root +RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro fastavro +USER dev +RUN sbt clean compile diff --git a/Dockerfile-mini b/Dockerfile-mini new file mode 100644 index 00000000..b9e7ddf8 --- /dev/null +++ b/Dockerfile-mini @@ -0,0 +1,69 @@ +# Open JDK11, Spark 3.X and the latest JDKs get a little spicy +FROM azul/zulu-openjdk:11-latest + +RUN apt-get -qq update && \ + apt-get -qq -y upgrade && \ + apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop python-is-python3 && \ + locale-gen en_US.UTF-8 && \ + apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \ + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \ + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \ + curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \ + chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \ + apt-get update && \ + apt-get -qq -y install sbt && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -Lo coursier https://git.io/coursier-cli +RUN chmod +x coursier +# ensure the JAR of the CLI is in the coursier cache, in the image +RUN ./coursier --help +RUN pip install --no-cache-dir jupyter +# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3 +#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8 +RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb + +RUN ./coursier bootstrap \ + -r jitpack \ + -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \ + sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \ + --default=true --sources \ + -o almond && \ + ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" + + +RUN adduser dev +RUN adduser dev sudo +RUN echo 'dev:dev' | chpasswd +RUN mkdir -p ~dev +RUN cp ./coursier ~dev/ +RUN echo "color_prompt=yes" >> ~dev/.bashrc +RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc +RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.2-bin-hadoop3" >> ~dev/.bashrc +RUN chown -R dev ~dev +USER dev +# Kernels are installed in user so we need to run as the user +RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" 
+USER root
+
+RUN mkdir -p /high-performance-spark-examples
+RUN mkdir -p /high-performance-spark-examples/warehouse
+RUN chown -R dev /high-performance-spark-examples
+WORKDIR /high-performance-spark-examples
+# Increase the chance of caching by copying just the env setup file first.
+COPY --chown=dev:dev env_setup.sh ./
+# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
+# Also downloads some test data
+RUN SCALA_VERSION=2.13 ./env_setup.sh && rm *.tgz
+RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
+# Note: We need to use /home in the COPY otherwise no happy pandas
+COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
+RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
+RUN chown -R dev /high-performance-spark-examples
+ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
+RUN git clone https://github.com/holdenk/spark-upgrade.git
+RUN chown -R dev /high-performance-spark-examples
+USER dev
+RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
+CMD ["/high-performance-spark-examples/misc/container_launch.sh"]
+
diff --git a/README.md b/README.md
index 551928fd..b230d384 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 # high-performance-spark-examples
 Examples for High Performance Spark
 
+We are in the process of updating this for Spark 4 (some parts that depend on external libraries like Iceberg, Comet, etc. are still 3.X) and the 2nd edition of our book!
+
 # Building
 
 Most of the examples can be built with sbt, the C and Fortran components depend on gcc, g77, and cmake.
@@ -8,3 +10,25 @@ Most of the examples can be built with sbt, the C and Fortran components depend
 # Tests
 
 The full test suite depends on having the C and Fortran components built as well as a local R installation available.
+
+The most "accurate" way of seeing how we run the tests is to look at the .github workflows.
+
+# History Server
+
+The history server can be a great way to figure out what's going on.
+
+By default the history server writes to `/tmp/spark-events`, so if that directory is not already set up you'll need to create it:
+
+`mkdir -p /tmp/spark-events`
+
+The scripts for running the examples generally run with the event log enabled.
+
+You can set SPARK_EVENTLOG=true before running the Scala tests and you'll get the history server too!
+
+e.g.
+
+`SPARK_EVENTLOG=true sbt test`
+
+If you want to run just a specific test you can use [testOnly](https://www.scala-sbt.org/1.x/docs/Testing.html).
+
+Then, to view the history server, launch it with `${SPARK_HOME}/sbin/start-history-server.sh` and you [can go to your local history server](http://localhost:18080/).
diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh
new file mode 100644
index 00000000..3563f0eb
--- /dev/null
+++ b/accelerators/comet_env_setup.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+SPARK_EXTRA="
+--jars ${COMET_JAR} \
+--driver-class-path ${COMET_JAR} \
+--conf spark.comet.enabled=true \
+--conf spark.comet.exec.enabled=true \
+--conf spark.comet.exec.all.enabled=true \
+--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
+--conf spark.comet.exec.shuffle.enabled=true \
+--conf spark.comet.columnar.shuffle.enabled=true"
+# Instead of --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions we set
+# EXTRA_EXTENSIONS so it can be appended to iceberg
+if [ -z "$EXTRA_EXTENSIONS" ]; then
+  EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions"
+else
+  EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions,$EXTRA_EXTENSIONS"
+fi
+export EXTRA_EXTENSIONS
+export SPARK_EXTRA
diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh
new file mode 100755
index 00000000..268a4dcb
--- /dev/null
+++ b/accelerators/comet_ex.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -ex
+
+# If you change this update the workflow version too.
+SPARK_MAJOR=${SPARK_MAJOR:-3.5}
+SPARK_VERSION=${SPARK_MAJOR}.1
+export SPARK_MAJOR
+export SPARK_VERSION
+
+source setup_comet.sh
+pushd ..
+source ./env_setup.sh
+popd
+source comet_env_setup.sh
+pushd ..
+USE_COMET="true" ./run_sql_examples.sh diff --git a/accelerators/gluten_config.properties b/accelerators/gluten_config.properties new file mode 100644 index 00000000..eab39465 --- /dev/null +++ b/accelerators/gluten_config.properties @@ -0,0 +1,5 @@ +spark.plugins=io.glutenproject.GlutenPlugin +spark.memory.offHeap.enabled=true +spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager +# This static allocation is one of the hardest part of using Gluten +spark.memory.offHeap.size=20g diff --git a/accelerators/gluten_env_setup.sh b/accelerators/gluten_env_setup.sh new file mode 100755 index 00000000..6bda6ecd --- /dev/null +++ b/accelerators/gluten_env_setup.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Check if we gluten and gluten UDFs present +GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so +NATIVE_LIB_DIR=$(pwd)/../native/src/ +NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" +GLUTEN_HOME=incubator-gluten +source /etc/lsb-release +if [ -n "$GLUTEN_JAR_PATH" ]; then + GLUTEN_EXISTS="true" + GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ + --jars ${GLUTEN_JAR_PATH}" +fi +if [ -f "${NATIVE_LIB_PATH}" ]; then + if [ "$GLUTEN_EXISTS" == "true" ]; then + GLUTEN_UDF_EXISTS="true" + GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ + --conf spark.jars=${GLUTEN_JAR_PATH} \ + --conf spark.gluten.loadLibFromJar=true \ + --files ${NATIVE_LIB_PATH} \ + --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" + fi +fi +SPARK_EXTRA=GLUTEN_SPARK_EXTRA + +export SPARK_EXTRA +export GLUTEN_UDF_EXISTS +export GLUTEN_EXISTS diff --git a/accelerators/gluten_spark_34_ex.sh b/accelerators/gluten_spark_34_ex.sh new file mode 100755 index 00000000..0f98ab8e --- /dev/null +++ b/accelerators/gluten_spark_34_ex.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "${SCRIPT_DIR}" +source "${SCRIPT_DIR}/setup_gluten_spark34.sh" + +export SPARK_HOME +PATH="$(pwd)/${SPARK_DIR}/bin:$PATH" +export PATH +"${SPARK_HOME}/bin/spark-sql" --master local[5] \ + --conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --jars "${GLUTEN_JAR}" \ + --conf spark.eventLog.enabled=true \ + -e "SELECT 1" + +source gluten_env_setup.sh +cd .. +./run_sql_examples.sh || echo "Expected to fail" diff --git a/accelerators/install_rust_if_needed.sh b/accelerators/install_rust_if_needed.sh new file mode 100644 index 00000000..76826e8e --- /dev/null +++ b/accelerators/install_rust_if_needed.sh @@ -0,0 +1,9 @@ +#!/bin/bash +if [ -f "$HOME/.cargo/env" ]; then + source "$HOME/.cargo/env" +fi + +if ! 
command -v cargo; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source "$HOME/.cargo/env" +fi diff --git a/accelerators/run_gluten.sh b/accelerators/run_gluten.sh new file mode 100755 index 00000000..34ddb3b1 --- /dev/null +++ b/accelerators/run_gluten.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +"${SPARK_HOME}/bin/spark-shell" --master local --jars "${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar" --spark-properties=gluten_config.properties diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh new file mode 100755 index 00000000..ed89a0d8 --- /dev/null +++ b/accelerators/setup_comet.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -ex +source install_rust_if_needed.sh + +if command -v protoc >/dev/null 2>&1; then + echo "protoc already installed" +else + sudo apt-get install -y protobuf-compiler +fi + +if [ -z "${SPARK_MAJOR}" ]; then + echo "Need a spark major version specified." + exit 1 +else + echo "Building comet for Spark ${SPARK_MAJOR}" +fi + +#tag::build[] +# If we don't have fusion checked out do it +if [ ! -d arrow-datafusion-comet ]; then + git clone https://github.com/apache/arrow-datafusion-comet.git +fi + +# Build JAR if not present +if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then + cd arrow-datafusion-comet + make clean release PROFILES="-Pspark-${SPARK_MAJOR} -Pscala-2.13" + cd .. +fi +COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)" +export COMET_JAR +#end::build[] diff --git a/accelerators/setup_gluten_deps.sh b/accelerators/setup_gluten_deps.sh new file mode 100755 index 00000000..6472390c --- /dev/null +++ b/accelerators/setup_gluten_deps.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -ex + +sudo apt-get update +#tag::gluten_deps[] +sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential \ + llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev \ + libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev \ + libsodium-dev libsnappy-dev nasm +sudo apt install -y libunwind-dev +sudo apt-get install -y libgoogle-glog-dev +sudo apt-get -y install docker-compose +sudo apt-get install -y libre2-9 || sudo apt-get install -y libre2-10 +#end::gluten_deps[] diff --git a/accelerators/setup_gluten_from_src.sh b/accelerators/setup_gluten_from_src.sh new file mode 100755 index 00000000..4788e05f --- /dev/null +++ b/accelerators/setup_gluten_from_src.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -ex + +# Setup deps +source setup_gluten_deps.sh + +# Try gluten w/clickhouse +#if [ ! -d gluten ]; then +# git clone https://github.com/oap-project/gluten.git +# cd gluten +# bash ./ep/build-clickhouse/src/build_clickhouse.sh +#fi + +# Build gluten +if [ ! -d gluten ]; then + # We need Spark 3.5 w/scala212 + git clone git@github.com:holdenk/gluten.git || git clone https://github.com/holdenk/gluten.git + cd gluten + git checkout add-spark35-scala213-hack + ./dev/builddeps-veloxbe.sh + mvn clean package -Pbackends-velox -Pspark-3.5 -DskipTests + cd .. 
+fi diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh new file mode 100755 index 00000000..0cbfbc19 --- /dev/null +++ b/accelerators/setup_gluten_spark34.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +mkdir -p /tmp/spark-events +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ACCEL_JARS=${SCRIPT_DIR} +SPARK_MAJOR_VERSION=3.4 +SCALA_VERSION=${SCALA_VERSION:-"2.12"} + +set -ex + +# Note: this does not work on Ubuntu 23, only on 22 +# You might get something like: +# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 + + +SPARK_VERSION=3.4.2 +SPARK_MAJOR=3.4 +HADOOP_VERSION=3 +SPARK_DIR="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_FILE="${SPARK_DIR}.tgz" + +export SPARK_MAJOR +export SPARK_VERSION + +source setup_gluten_deps.sh + +cd .. +source /etc/lsb-release +# Pre-baked only +if [ "$DISTRIB_RELEASE" == "20.04" ]; then + source ./env_setup.sh + cd "${SCRIPT_DIR}" + + GLUTEN_JAR="gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" + GLUTEN_JAR_PATH="${SCRIPT_DIR}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" + + if [ ! -f "${GLUTEN_JAR_PATH}" ]; then + wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" || unset GLUTEN_JAR_PATH + fi + +fi +# Rather than if/else we fall through to build if wget fails because major version is not supported. +if [ -z "$GLUTEN_JAR_PATH" ]; then + #tag::build_gluten[] + if [ ! -d incubator-gluten ]; then + git clone https://github.com/apache/incubator-gluten.git + fi + cd incubator-gluten + sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON + mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests + GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar" + #end::build_gluten[] +fi + +export GLUTEN_JAR_PATH + diff --git a/build.sbt b/build.sbt index 35b1508e..f5c04850 100644 --- a/build.sbt +++ b/build.sbt @@ -1,97 +1,125 @@ -organization := "com.highperformancespark" - -name := "examples" - -publishMavenStyle := true - -version := "0.0.1" - -scalaVersion := "2.11.6" -scalaVersion in ThisBuild := "2.11.6" -ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) } - -crossScalaVersions := Seq("2.11.6") - -javacOptions ++= Seq("-source", "1.8", "-target", "1.8") - -//tag::sparkVersion[] -sparkVersion := "2.2.0" -//end::sparkVersion[] - -//tag::sparkComponents[] -sparkComponents ++= Seq("core") -//end::sparkComponents[] -//tag::sparkExtraComponents[] -sparkComponents ++= Seq("streaming", "mllib") -//end::sparkExtraComponents[] -//tag::addSQLHiveComponent[] -sparkComponents ++= Seq("sql", "hive", "hive-thriftserver", "hive-thriftserver") -//end::addSQLHiveComponent[] +lazy val root = (project in file(".")) + .aggregate(core, native) -parallelExecution in Test := false -fork := true - -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") +organization := "com.highperformancespark" -// additional libraries -libraryDependencies ++= Seq( - "org.scalatest" %% "scalatest" % "3.0.1", - "org.scalacheck" %% "scalacheck" % "1.13.4", - "junit" % "junit" % "4.12", - "junit" % "junit" % "4.11", - "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.7.2", - "com.novocode" % "junit-interface" % "0.11" % "test->default", - //tag::scalaLogging[] - "com.typesafe.scala-logging" %% 
"scala-logging" % "3.5.0", - //end::scalaLogging[] - "org.codehaus.jackson" % "jackson-core-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-core-asl" % "1.9.13", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.9.13", - "net.java.dev.jna" % "jna" % "4.2.2") +//tag::addSparkScalaFix[] +// Needs to be commented out post-upgrade because of Scala versions. +//ThisBuild / scalafixDependencies += +// "com.holdenkarau" %% "spark-scalafix-rules-2.4.8" % "0.1.5" +//ThisBuild / scalafixDependencies += +// "com.github.liancheng" %% "organize-imports" % "0.6.0" +//end::addSparkScalaFix[] + +lazy val V = _root_.scalafix.sbt.BuildInfo + +scalaVersion := "2.13.13" +addCompilerPlugin(scalafixSemanticdb) +scalacOptions ++= List( + "-Yrangepos", + "-P:semanticdb:synthetics:on" +) -scalacOptions ++= Seq("-deprecation", "-unchecked") +name := "examples" -pomIncludeRepository := { x => false } +publishMavenStyle := true +version := "0.0.1" resolvers ++= Seq( - "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", - "Spray Repository" at "http://repo.spray.cc/", + "JBoss Repository" at "https://repository.jboss.org/nexus/content/repositories/releases/", "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", - "Akka Repository" at "http://repo.akka.io/releases/", - "Twitter4J Repository" at "http://twitter4j.org/maven2/", "Apache HBase" at "https://repository.apache.org/content/repositories/releases", - "Twitter Maven Repo" at "http://maven.twttr.com/", + "Twitter Maven Repo" at "https://maven.twttr.com/", "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", - "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", - "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", - "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", + "Typesafe repository" at "https://repo.typesafe.com/typesafe/releases/", + "Second Typesafe repo" at "https://repo.typesafe.com/typesafe/maven-releases/", + "Mesosphere Public Repository" at "https://downloads.mesosphere.io/maven", Resolver.sonatypeRepo("public"), - Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), - "jodersky" at "https://dl.bintray.com/jodersky/maven/" + Resolver.mavenLocal ) licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) -mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => - { - case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard - case m if m.startsWith("META-INF") => MergeStrategy.discard - case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first - case PathList("org", "apache", xs @ _*) => MergeStrategy.first - case PathList("org", "jboss", xs @ _*) => MergeStrategy.first - case "log4j.properties" => MergeStrategy.discard - case "about.html" => MergeStrategy.rename - case "reference.conf" => MergeStrategy.concat - case _ => MergeStrategy.first +def specialOptions = { + // We only need these extra props for JRE>17 + if (sys.props("java.specification.version") > "1.17") { + Seq( + "base/java.lang", "base/java.lang.invoke", "base/java.lang.reflect", "base/java.io", "base/java.net", "base/java.nio", + "base/java.util", "base/java.util.concurrent", "base/java.util.concurrent.atomic", + "base/sun.nio.ch", "base/sun.nio.cs", "base/sun.security.action", + "base/sun.util.calendar", 
"security.jgss/sun.security.krb5", + ).map("--add-opens=java." + _ + "=ALL-UNNAMED") + } else { + Seq() } } -// JNI -enablePlugins(JniNative) +val sparkVersion = settingKey[String]("Spark version") +val sparkTestingVersion = settingKey[String]("Spark testing base version without Spark version part") + + +// Core (non-JNI bits) + +lazy val core = (project in file("core")) // regular scala code with @native methods + .dependsOn(native % Runtime) + .settings(javah / target := (native / nativeCompile / sourceDirectory).value / "include") + .settings(scalaVersion := "2.13.13") + .settings(sbtJniCoreScope := Compile) + .settings( + scalaVersion := "2.13.8", + javacOptions ++= Seq("-source", "17", "-target", "17"), + parallelExecution in Test := false, + fork := true, + javaOptions ++= Seq("-Xms4048M", "-Xmx4048M", "-Djna.nosys=true"), + Test / javaOptions ++= specialOptions, + // 2.4.5 is the highest version we have with the old spark-testing-base deps + sparkVersion := System.getProperty("sparkVersion", "4.0.0"), + sparkTestingVersion := "2.1.2", + // additional libraries + libraryDependencies ++= Seq( + "org.apache.spark" %% "spark-core" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-streaming" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-sql" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-hive" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-hive-thriftserver" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-yarn" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-mllib" % sparkVersion.value % Provided, + "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_${sparkTestingVersion.value}" % Test, + //tag::scalaLogging[] + "com.typesafe.scala-logging" %% "scala-logging" % "3.9.4", + //end::scalaLogging[] + "net.java.dev.jna" % "jna" % "5.12.1"), + scalacOptions ++= Seq("-deprecation", "-unchecked"), + pomIncludeRepository := { x => false }, + resolvers += Resolver.mavenLocal + ) + +// JNI Magic! +lazy val native = (project in file("native")) // native code and build script + .settings(nativeCompile / sourceDirectory := sourceDirectory.value) + .settings(scalaVersion := "2.13.13") + .enablePlugins(JniNative) // JniNative needs to be explicitly enabled + +//tag::xmlVersionConflict[] +// See https://github.com/scala/bug/issues/12632 +ThisBuild / libraryDependencySchemes ++= Seq( + "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always +) +//end::xmlVersionConflict[] + +assemblyMergeStrategy in assembly := { + case x => MergeStrategy.first +} -sourceDirectory in nativeCompile := sourceDirectory.value +assemblyMergeStrategy in native := { + case x => MergeStrategy.first +} + +assemblyMergeStrategy in core := { + case x => MergeStrategy.first +} diff --git a/build_container.sh b/build_container.sh new file mode 100755 index 00000000..691ae67d --- /dev/null +++ b/build_container.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -ex + +cp .git/index /tmp/git_index +export GIT_INDEX_FILE=/tmp/git_index +git add -u +hash=$(git write-tree) +unset GIT_INDEX_FILE +oldhash=$(cat oldhash || true) +if [ "$hash" = "$oldhash" ] && [ -f myapp.tar ]; then + echo "Skipping making tar since we match." 
+else + echo "Making tar since no match" + git archive -o myapp.tar --format=tar HEAD + echo "$hash" > oldhash +fi +VERSION=${VERSION:-0.5} +IMAGE=${IMAGE:-holdenk/hps:$VERSION} +MINI_IMAGE=${MINI_IMAGE:-holdenk/hps-mini:$VERSION} +docker buildx build --platform=linux/amd64,linux/arm64 -t "${MINI_IMAGE}" -f Dockerfile-mini . --push +docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push --build-arg base="${MINI_IMAGE}" +#docker buildx build --platform=linux/amd64 -t "${IMAGE}" . --push diff --git a/build_windows.sbt b/build_windows.sbt deleted file mode 100644 index b698ab9a..00000000 --- a/build_windows.sbt +++ /dev/null @@ -1,91 +0,0 @@ -organization := "com.highperformancespark" - -name := "examples" - -publishMavenStyle := true - -version := "0.0.1" - -scalaVersion := "2.11.6" -scalaVersion in ThisBuild := "2.11.6" -ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) } - -crossScalaVersions := Seq("2.11.6") - -javacOptions ++= Seq("-source", "1.8", "-target", "1.8") - -//tag::sparkVersion[] -sparkVersion := "2.2.0" -//end::sparkVersion[] - -//tag::sparkComponents[] -sparkComponents ++= Seq("core") -//end::sparkComponents[] -//tag::sparkExtraComponents[] -sparkComponents ++= Seq("streaming", "mllib") -//end::sparkExtraComponents[] -//tag::addSQLHiveComponent[] -sparkComponents ++= Seq("sql", "hive", "hive-thriftserver", "hive-thriftserver") -//end::addSQLHiveComponent[] - -parallelExecution in Test := false - -fork := true - -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") - -// additional libraries -libraryDependencies ++= Seq( - "org.scalatest" %% "scalatest" % "3.0.1", - "org.scalacheck" %% "scalacheck" % "1.13.4", - "junit" % "junit" % "4.12", - "junit" % "junit" % "4.11", - "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.7.2", - "com.novocode" % "junit-interface" % "0.11" % "test->default", - //tag::sacalLogging[] - "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0", - //end::scalaLogging[] - "org.codehaus.jackson" % "jackson-core-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-core-asl" % "1.9.13", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.9.13", - "net.java.dev.jna" % "jna" % "4.2.2") - - -scalacOptions ++= Seq("-deprecation", "-unchecked") - -pomIncludeRepository := { x => false } - -resolvers ++= Seq( - "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", - "Spray Repository" at "http://repo.spray.cc/", - "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", - "Akka Repository" at "http://repo.akka.io/releases/", - "Twitter4J Repository" at "http://twitter4j.org/maven2/", - "Apache HBase" at "https://repository.apache.org/content/repositories/releases", - "Twitter Maven Repo" at "http://maven.twttr.com/", - "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", - "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", - "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", - "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", - "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", - Resolver.sonatypeRepo("public"), - Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), - "jodersky" at "https://dl.bintray.com/jodersky/maven/" -) - -licenses := Seq("Apache License 2.0" -> 
url("http://www.apache.org/licenses/LICENSE-2.0.html")) - -mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => - { - case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard - case m if m.startsWith("META-INF") => MergeStrategy.discard - case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first - case PathList("org", "apache", xs @ _*) => MergeStrategy.first - case PathList("org", "jboss", xs @ _*) => MergeStrategy.first - case "log4j.properties" => MergeStrategy.discard - case "about.html" => MergeStrategy.rename - case "reference.conf" => MergeStrategy.concat - case _ => MergeStrategy.first - } -} diff --git a/c b/c new file mode 100644 index 00000000..cb4d93b6 --- /dev/null +++ b/c @@ -0,0 +1,2 @@ +bloop + diff --git a/src/main/java/com/highperformancespark/examples/JavaInterop.java b/core/src/main/java/com/highperformancespark/examples/JavaInterop.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/JavaInterop.java rename to core/src/main/java/com/highperformancespark/examples/JavaInterop.java diff --git a/src/main/java/com/highperformancespark/examples/WordCount.java b/core/src/main/java/com/highperformancespark/examples/WordCount.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/WordCount.java rename to core/src/main/java/com/highperformancespark/examples/WordCount.java diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java similarity index 85% rename from src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java index 950f9e5c..62b32e06 100644 --- a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java +++ b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java @@ -4,10 +4,9 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Column; import org.apache.spark.sql.*; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.expressions.Window; import org.apache.spark.sql.expressions.WindowSpec; -import org.apache.spark.sql.hive.HiveContext; import java.util.HashMap; import java.util.Map; @@ -16,39 +15,23 @@ public class JavaHappyPandas { - /** - * Creates SQLContext with an existing SparkContext. - */ - public static SQLContext sqlContext(JavaSparkContext jsc) { - SQLContext sqlContext = new SQLContext(jsc); - return sqlContext; - } - - /** - * Creates HiveContext with an existing SparkContext. - */ - public static HiveContext hiveContext(JavaSparkContext jsc) { - HiveContext hiveContext = new HiveContext(jsc); - return hiveContext; - } - /** * Illustrate loading some JSON data. 
   */
-  public static Dataset<Row> loadDataSimple(JavaSparkContext jsc, SQLContext sqlContext, String path) {
-    Dataset<Row> df1 = sqlContext.read().json(path);
+  public static Dataset<Row> loadDataSimple(JavaSparkContext jsc, SparkSession session, String path) {
+    Dataset<Row> df1 = session.read().json(path);
 
-    Dataset<Row> df2 = sqlContext.read().format("json").option("samplingRatio", "1.0").load(path);
+    Dataset<Row> df2 = session.read().format("json").option("samplingRatio", "1.0").load(path);
 
     JavaRDD<String> jsonRDD = jsc.textFile(path);
-    Dataset<Row> df3 = sqlContext.read().json(jsonRDD);
+    Dataset<Row> df3 = session.read().json(jsonRDD);
 
     return df1;
   }
 
-  public static Dataset<Row> jsonLoadFromRDD(SQLContext sqlContext, JavaRDD<String> input) {
+  public static Dataset<Row> jsonLoadFromRDD(SparkSession session, JavaRDD<String> input) {
     JavaRDD<String> rdd = input.filter(e -> e.contains("panda"));
-    Dataset<Row> df = sqlContext.read().json(rdd);
+    Dataset<Row> df = session.read().json(rdd);
     return df;
   }
 
@@ -147,10 +130,10 @@ public static Dataset<Row> minMeanSizePerZip(Dataset<Row> pandas) {
   }
 
   public static Dataset<Row> simpleSqlExample(Dataset<Row> pandas) {
-    SQLContext sqlContext = pandas.sqlContext();
+    SparkSession session = SparkSession.builder().getOrCreate();
     pandas.registerTempTable("pandas");
-    Dataset<Row> miniPandas = sqlContext.sql("SELECT * FROM pandas WHERE pandaSize < 12");
+    Dataset<Row> miniPandas = session.sql("SELECT * FROM pandas WHERE pandaSize < 12");
     return miniPandas;
   }
 
diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java
rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java
diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
diff --git a/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java b/core/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java
rename to core/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java
diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java
rename to core/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java
diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java
rename to core/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java
diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java
rename to
core/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaPandas.java rename to core/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java rename to core/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java diff --git a/src/main/julia/setup.jl b/core/src/main/julia/setup.jl similarity index 100% rename from src/main/julia/setup.jl rename to core/src/main/julia/setup.jl diff --git a/src/main/julia/wc.jl b/core/src/main/julia/wc.jl similarity index 100% rename from src/main/julia/wc.jl rename to core/src/main/julia/wc.jl diff --git a/src/main/perl/Changes b/core/src/main/perl/Changes similarity index 100% rename from src/main/perl/Changes rename to core/src/main/perl/Changes diff --git a/src/main/perl/MANIFEST b/core/src/main/perl/MANIFEST similarity index 100% rename from src/main/perl/MANIFEST rename to core/src/main/perl/MANIFEST diff --git a/src/main/perl/Makefile.PL b/core/src/main/perl/Makefile.PL similarity index 100% rename from src/main/perl/Makefile.PL rename to core/src/main/perl/Makefile.PL diff --git a/src/main/perl/README b/core/src/main/perl/README similarity index 100% rename from src/main/perl/README rename to core/src/main/perl/README diff --git a/src/main/perl/ghinfo.pl b/core/src/main/perl/ghinfo.pl similarity index 100% rename from src/main/perl/ghinfo.pl rename to core/src/main/perl/ghinfo.pl diff --git a/src/main/perl/ignore.txt b/core/src/main/perl/ignore.txt similarity index 100% rename from src/main/perl/ignore.txt rename to core/src/main/perl/ignore.txt diff --git a/src/main/perl/lib/HighPerformanceSpark/Examples.pm b/core/src/main/perl/lib/HighPerformanceSpark/Examples.pm similarity index 100% rename from src/main/perl/lib/HighPerformanceSpark/Examples.pm rename to core/src/main/perl/lib/HighPerformanceSpark/Examples.pm diff --git a/src/main/perl/t/00-load.t b/core/src/main/perl/t/00-load.t similarity index 100% rename from src/main/perl/t/00-load.t rename to core/src/main/perl/t/00-load.t diff --git a/src/main/perl/t/manifest.t b/core/src/main/perl/t/manifest.t similarity index 100% rename from src/main/perl/t/manifest.t rename to core/src/main/perl/t/manifest.t diff --git a/src/main/perl/t/pod-coverage.t b/core/src/main/perl/t/pod-coverage.t similarity index 100% rename from src/main/perl/t/pod-coverage.t rename to core/src/main/perl/t/pod-coverage.t diff --git a/src/main/perl/t/pod.t b/core/src/main/perl/t/pod.t similarity index 100% rename from src/main/perl/t/pod.t rename to core/src/main/perl/t/pod.t diff --git a/src/main/perl/xt/boilerplate.t b/core/src/main/perl/xt/boilerplate.t similarity index 100% rename from src/main/perl/xt/boilerplate.t rename to core/src/main/perl/xt/boilerplate.t diff --git a/src/main/r/dapply.R b/core/src/main/r/dapply.R similarity index 100% rename from src/main/r/dapply.R rename to core/src/main/r/dapply.R diff --git a/src/main/r/wc.R b/core/src/main/r/wc.R similarity index 100% rename from src/main/r/wc.R rename to core/src/main/r/wc.R diff 
--git a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala similarity index 79% rename from src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala index 8aeb8ebc..def3e088 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -6,19 +6,16 @@ package com.highperformancespark.examples.dataframe import org.apache.spark._ import org.apache.spark.rdd.RDD -//tag::sparkSQLImports[] -import org.apache.spark.sql.{Dataset, DataFrame, SparkSession, Row} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.expressions._ import org.apache.spark.sql.functions._ -//end::sparkSQLImports[] - -//tag::legacySparkSQLImports[] -import org.apache.spark.sql.SQLContext -//end::legacySparkSQLImports[] -//tag::legacySparkHiveImports[] -import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver._ +import org.apache.spark.sql.Encoders //end::legacySparkHiveImports[] object HappyPandas { @@ -29,7 +26,7 @@ object HappyPandas { def sparkSession(): SparkSession = { //tag::createSparkSession[] val session = SparkSession.builder() - .enableHiveSupport() + //.enableHiveSupport() -- try disabling this .getOrCreate() // Import the implicits, unlike in core Spark the implicits are defined // on the context. @@ -38,12 +35,15 @@ object HappyPandas { session } + val session = sparkSession() + import session.implicits._ + /** * Creates SQLContext with an existing SparkContext. */ def sqlContext(sc: SparkContext): SQLContext = { //tag::createSQLContext[] - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext // Import the implicits, unlike in core Spark the implicits are defined // on the context. import sqlContext.implicits._ @@ -54,9 +54,9 @@ object HappyPandas { /** * Creates HiveContext Spark with an existing SparkContext using hive. */ - def hiveContext(sc: SparkContext): HiveContext = { + def hiveContext(sc: SparkContext): SQLContext = { //tag::createHiveContext[] - val hiveContext = new HiveContext(sc) + val hiveContext = SparkSession.builder.enableHiveSupport().getOrCreate().sqlContext // Import the implicits, unlike in core Spark the implicits are defined // on the context. 
import hiveContext.implicits._ @@ -78,7 +78,7 @@ object HappyPandas { //end::loadPandaJSONComplex[] val jsonRDD = sc.textFile(path) //tag::loadPandaJsonRDD[] - val df3 = session.read.json(jsonRDD) + val df3 = session.read.json(session.createDataset(jsonRDD)(Encoders.STRING)) //end::loadPandaJSONRDD[] df1 } @@ -86,7 +86,7 @@ object HappyPandas { def jsonLoadFromRDD(session: SparkSession, input: RDD[String]): DataFrame = { //tag::loadPandaJSONRDD[] val rdd: RDD[String] = input.filter(_.contains("panda")) - val df = session.read.json(rdd) + val df = session.read.json(session.createDataset(rdd)(Encoders.STRING)) //end::loadPandaJSONRDD[] df } @@ -113,8 +113,8 @@ object HappyPandas { */ def happyPandasPercentage(pandaInfo: DataFrame): DataFrame = { pandaInfo.select( - pandaInfo("place"), - (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy") + $"place", + ($"happyPandas" / $"totalPandas").as("percentHappy") ) } @@ -126,9 +126,9 @@ object HappyPandas { * @return Returns a DataFrame of pandaId and integer value for pandaType. */ def encodePandaType(pandaInfo: DataFrame): DataFrame = { - pandaInfo.select(pandaInfo("id"), - (when(pandaInfo("pt") === "giant", 0). - when(pandaInfo("pt") === "red", 1). + pandaInfo.select($"id", + (when($"pt" === "giant", 0). + when($"pt" === "red", 1). otherwise(2)).as("encodedType") ) } @@ -138,7 +138,7 @@ object HappyPandas { * Gets places with happy pandas more than minHappinessBound. */ def minHappyPandas(pandaInfo: DataFrame, minHappyPandas: Int): DataFrame = { - pandaInfo.filter(pandaInfo("happyPandas") >= minHappyPandas) + pandaInfo.filter($"happyPandas" >= minHappyPandas) } /** @@ -158,7 +158,7 @@ object HappyPandas { RawPanda(id, zip, pt, happy, attrs.toArray) }} pandaInfo.select( - (pandaInfo("attributes")(0) / pandaInfo("attributes")(1)) + ($"attributes"(0) / $"attributes"(1)) .as("squishyness")) //end::selectExplode[] } @@ -167,6 +167,7 @@ object HappyPandas { * Find pandas that are sad */ def sadPandas(pandaInfo: DataFrame): DataFrame = { + // This one is our intentional non $ example //tag::simpleFilter[] pandaInfo.filter(pandaInfo("happy") !== true) //end::simpleFilter[] @@ -178,7 +179,7 @@ object HappyPandas { def happyFuzzyPandas(pandaInfo: DataFrame): DataFrame = { //tag::complexFilter[] pandaInfo.filter( - pandaInfo("happy").and(pandaInfo("attributes")(0) > pandaInfo("attributes")(1)) + $"happy".and($"attributes"(0) > $"attributes"(1)) ) //end::complexFilter[] } @@ -187,7 +188,7 @@ object HappyPandas { * Gets places that contains happy pandas more than unhappy pandas. */ def happyPandasPlaces(pandaInfo: DataFrame): DataFrame = { - pandaInfo.filter(pandaInfo("happyPandas") >= pandaInfo("totalPandas") / 2) + pandaInfo.filter($"happyPandas" >= $"totalPandas" / 2) } @@ -258,7 +259,7 @@ object HappyPandas { miniPandas } - def startJDBCServer(hiveContext: HiveContext): Unit = { + def startJDBCServer(hiveContext: SQLContext): Unit = { //tag::startJDBC[] hiveContext.setConf("hive.server2.thrift.port", "9090") HiveThriftServer2.startWithContext(hiveContext) @@ -314,27 +315,52 @@ object HappyPandas { //end::rightouterJoin[] //tag::leftsemiJoin[] - // Left semi join explicit + // Left semi join explicit. + // Here we're explicit about which DF which col comes from given + // the shared name. 
df1.join(df2, df1("name") === df2("name"), "left_semi") //end::leftsemiJoin[] } + + def badComplexJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + df1.joinWith(df2, regexp(df1("name"), df2("name"))).alias("regexp join") + } + + + //tag::badJoinMagic[] + def badJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + val session = df1.sparkSession + val sle = session.udf.register("strLenEq", (s: String, s2: String) => s.length() == s2.length()) + df1.joinWith(df2, sle(df1("name"), df2("name"))).alias("strlenEqJoin") + } + //end::badJoinMagic[] + + //tag::okJoin[] + def okJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + val session = df1.sparkSession + val sl = session.udf.register("strLen", (s: String) => s.length()) + df1.joinWith(df2, sl(df1("name")) === sl(df2("name"))).alias("strlenJoin") + } + //end::okJoin[] + /** * Cut the lineage of a DataFrame which has too long a query plan. */ def cutLineage(df: DataFrame): DataFrame = { - val sqlCtx = df.sqlContext + val session = SparkSession.builder.getOrCreate() + import session.implicits._ //tag::cutLineage[] val rdd = df.rdd rdd.cache() - sqlCtx.createDataFrame(rdd, df.schema) + session.createDataFrame(rdd, df.schema) //end::cutLineage[] } // Self join def selfJoin(df: DataFrame): DataFrame = { - val sqlCtx = df.sqlContext - import sqlCtx.implicits._ + val session = SparkSession.builder.getOrCreate() + import session.implicits._ //tag::selfJoin[] val joined = df.as("a").join(df.as("b")).where($"a.name" === $"b.name") //end::selfJoin[] diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala similarity index 93% rename from src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala index 82be10fc..54ca5342 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala @@ -88,11 +88,21 @@ case class LoadSave(sc: SparkContext, session: SparkSession) { } //end::saveAppend[] + def upsertPandas(input: DataFrame): Unit = { + //tag::upsert[] + input.mergeInto("pandaInfo", $"source.id" === $"target.id") + .whenMatched() // Note you can override the general match condition above if desired + .updateAll() + .whenNotMatched() + .insertAll() + //end::upsert[] + } + def createJDBC() = { - //tag::createJDBC[] session.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", "table", new Properties) + //tag::createJDBC[] session.read.format("jdbc") .option("url", "jdbc:dialect:serverName") .option("dbtable", "table").load() @@ -100,10 +110,10 @@ case class LoadSave(sc: SparkContext, session: SparkSession) { } def writeJDBC(df: DataFrame) = { - //tag::writeJDBC[] df.write.jdbc("jdbc:dialect:serverName;user=user;password=pass", "table", new Properties) + //tag::writeJDBC[] df.write.format("jdbc") .option("url", "jdbc:dialect:serverName") .option("user", "user") diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala similarity index 92% rename from src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala rename to 
core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala index 2ccdd10f..b74e1cbb 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala @@ -9,10 +9,9 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.expressions._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -// Additional imports for using HiveContext import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.thriftserver._ +import org.apache.spark.sql.types._ case class MiniPandaInfo(zip: String, size: Double) @@ -68,9 +67,10 @@ class MixedDataset(sqlCtx: SQLContext) { //tag::maxPandaSizePerZipScala[] def maxPandaSizePerZipScala(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { - ds.groupByKey(rp => rp.zip).mapGroups{ case (g, iter) => + def groupMapFun(g: String, iter: Iterator[RawPanda]): (String, Double) = { (g, iter.map(_.attributes(2)).reduceLeft(Math.max(_, _))) } + ds.groupByKey(rp => rp.zip).mapGroups(groupMapFun) } //end::maxPandaSizePerZipScala[] @@ -89,7 +89,7 @@ class MixedDataset(sqlCtx: SQLContext) { Dataset[(RawPanda, CoffeeShop)] = { //tag::joinWith[] val result: Dataset[(RawPanda, CoffeeShop)] = pandas.joinWith(coffeeShops, - $"zip" === $"zip") + pandas("zip") === coffeeShops("zip")) //end::joinWith[] result } @@ -100,8 +100,8 @@ class MixedDataset(sqlCtx: SQLContext) { def selfJoin(pandas: Dataset[RawPanda]): Dataset[(RawPanda, RawPanda)] = { //tag::selfJoin[] - val result: Dataset[(RawPanda, RawPanda)] = pandas.joinWith(pandas, - $"zip" === $"zip") + val result: Dataset[(RawPanda, RawPanda)] = pandas.as("l").joinWith(pandas.as("r"), + $"l.zip" === $"r.zip") //end::selfJoin[] result } diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala new file mode 100644 index 00000000..8e482bfc --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala @@ -0,0 +1,28 @@ +/** + * Extension for the SparkSession to allow us to plug in a custom optimizer + */ + +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql.catalyst.optimizer._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreePattern._ +import org.apache.spark.sql.catalyst.expressions.{And, IsNotNull} + +object NullabilityFilterOptimizer extends Rule[LogicalPlan] { + + def apply(plan: LogicalPlan): LogicalPlan = { + plan.transform { + case p @ Project(projectList, projChild) => + val children = projectList.flatMap(_.children) + // If there are no null intolerant children don't worry about it + if (children.isEmpty) { + p + } else { + val filterCond = children.map(IsNotNull(_)).reduceLeft(And) + Project(projectList, Filter(filterCond, projChild)) + } + } + } +} diff 
--git a/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala similarity index 92% rename from src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala index b1d64dc7..c7cf0cae 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala @@ -14,7 +14,7 @@ case class RawPanda(id: Long, zip: String, pt: String, happy: Boolean, attributes: Array[Double]) { override def equals(o: Any) = o match { case other: RawPanda => (id == other.id && pt == other.pt && - happy == other.happy && attributes.deep == other.attributes.deep) + happy == other.happy && attributes.sameElements(other.attributes)) case _ => false } override def hashCode(): Int = { diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala new file mode 100644 index 00000000..14e2072f --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala @@ -0,0 +1,15 @@ +/** + * Extension for the SparkSession to allow us to plug in a custom optimizer + */ + +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql.{SparkSessionExtensions, SparkSessionExtensionsProvider} + +class SQLExtension extends SparkSessionExtensionsProvider { + override def apply(extensions: SparkSessionExtensions): Unit = { + // There are _many different_ types of rules you can inject, here we're focused on + // making things go fast so our sample is an optimizer rule (AQE rules could also make sense). 
+ extensions.injectOptimizerRule(session => NullabilityFilterOptimizer) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala diff --git a/src/main/scala/com/high-performance-spark-examples/errors/throws.scala b/core/src/main/scala/com/high-performance-spark-examples/errors/throws.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/errors/throws.scala rename to core/src/main/scala/com/high-performance-spark-examples/errors/throws.scala diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala similarity index 96% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala index 9f8ec9d3..afcdeb85 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala @@ -1,12 +1,13 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable + import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row import org.apache.spark.storage.StorageLevel -import scala.collection.mutable.MutableList -import scala.collection.{Map, mutable} - object GoldilocksGroupByKey { //tag::groupByKey[] def findRankStatistics( @@ -252,7 +253,7 @@ object GoldilocksFirstTry { // to sort the partitionsColumnsFreq array by the partition index (the // first value in the tuple). 
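Editor's note: the GoldilocksFirstTry hunks that follow replace MutableList with ListBuffer and re-type the result of mapValues as collection.MapView — in Scala 2.13, MutableList is gone and mapValues returns a lazy view rather than a strict Map. A standalone sketch of the mapValues change (names here are mine, not from this change):

```scala
object MapValuesSketch {
  def main(args: Array[String]): Unit = {
    val targets: List[(Int, Long)] = List((0, 3L), (0, 8L), (1, 5L))
    // Scala 2.13: mapValues yields a lazy MapView rather than a strict Map.
    val lazyView: scala.collection.MapView[Int, List[Long]] =
      targets.groupBy(_._1).view.mapValues(_.map(_._2))
    // Call .toMap when a strict, materialized Map is actually needed.
    val strict: Map[Int, List[Long]] = lazyView.toMap
    println(strict)
  }
}
```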
partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq) => - val relevantIndexList = new MutableList[(Int, Long)]() + val relevantIndexList = new mutable.ListBuffer[(Int, Long)]() columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => val runningTotalCol = runningTotal(colIndex) @@ -291,8 +292,8 @@ object GoldilocksFirstTry { (partitionIndex : Int, valueColumnPairs : Iterator[(Double, Int)]) => { val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 if (targetsInThisPart.nonEmpty) { - val columnsRelativeIndex: Map[Int, List[Long]] = - targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsRelativeIndex: collection.MapView[Int, List[Long]] = + targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) val columnsInThisPart = targetsInThisPart.map(_._1).distinct val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala index 92cb44fd..71a66afa 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala @@ -1,12 +1,12 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable.ArrayBuffer + import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import scala.collection.Map -import scala.collection.mutable.ArrayBuffer - //tag::colIndex_partition[] class ColumnIndexPartition(override val numPartitions: Int) extends Partitioner { diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala similarity index 98% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala index 2b3adc10..2097d021 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala @@ -1,12 +1,13 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel -import scala.collection.mutable.ArrayBuffer -import scala.collection.{Map, mutable} - object GoldilocksWithHashMap { @@ -173,7 +174,7 @@ object GoldilocksWithHashMap { val runningTotal = Array.fill[Long](numOfColumns)(0) partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq)=> - val relevantIndexList = new mutable.MutableList[(Int, Long)]() + val relevantIndexList = new mutable.ListBuffer[(Int, Long)]() columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => val runningTotalCol = runningTotal(colIndex) @@ -302,7 +303,7 @@ object FindTargetsSubRoutine extends 
Serializable { def withArrayBuffer(valueColumnPairsIter : Iterator[((Double, Int), Long)], targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { - val columnsRelativeIndex: Predef.Map[Int, List[Long]] = + val columnsRelativeIndex: collection.MapView[Int, List[Long]] = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) // The column indices of the pairs that are desired rank statistics that live in diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala similarity index 93% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala index a60a39fc..d7024aea 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala @@ -1,11 +1,11 @@ package com.highperformancespark.examples.goldilocks -import org.apache.spark.HashPartitioner -import org.apache.spark.rdd.RDD - import scala.collection.Map import scala.reflect.ClassTag +import org.apache.spark.HashPartitioner +import org.apache.spark.rdd.RDD + object RDDJoinExamples { /* For Example, suppose we have one RDD with some data in the form (Panda id, score) @@ -95,7 +95,7 @@ object RDDJoinExamples { } /** - * Performs a broad cast hash join for two RDDs. + * Performs a broadcast hash join for two RDDs. * @param bigRDD - the first rdd, should be the larger RDD * @param smallRDD - the small rdd, should be small enough to fit in memory * @tparam K - The type of the key @@ -103,8 +103,8 @@ object RDDJoinExamples { * @tparam V2 - The type of the values for the second array * @return */ - //tag::coreBroadCast[] - def manualBroadCastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, + //tag::coreBroadcast[] + def manualBroadcastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, V2 : ClassTag](bigRDD : RDD[(K, V1)], smallRDD : RDD[(K, V2)])= { val smallRDDLocal: Map[K, V2] = smallRDD.collectAsMap() @@ -113,11 +113,13 @@ object RDDJoinExamples { iter.flatMap{ case (k,v1 ) => smallRDDLocalBcast.value.get(k) match { + // Note: You could switch this to a left join by changing the empty seq + // to instead return Seq(k, Seq.empty[(V1, V2)]) case None => Seq.empty[(K, (V1, V2))] case Some(v2) => Seq((k, (v1, v2))) } } }, preservesPartitioning = true) } - //end:coreBroadCast[] + //end::coreBroadcast[] } diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala similarity index 98% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala index 2b73ba45..b4e08738 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala @@ -3,7 +3,8 @@ package com.highperformancespark.examples.goldilocks import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag -import org.apache.spark.{HashPartitioner, Partitioner} +import org.apache.spark.HashPartitioner +import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD object PandaSecondarySort { diff --git 
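Editor's note: the comment added to manualBroadcastHashJoin above hints at a left-join variant. One way to write it (a sketch only, with its own names; not code from this change) keeps every key from the big side and wraps the small-side lookup in an Option:

```scala
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

object ManualBroadcastLeftJoinSketch {
  def manualBroadcastLeftHashJoin[K : ClassTag, V1 : ClassTag, V2 : ClassTag](
      bigRDD: RDD[(K, V1)],
      smallRDD: RDD[(K, V2)]): RDD[(K, (V1, Option[V2]))] = {
    // Ship the small side to every executor once, as in the inner-join version.
    val smallRDDLocal = smallRDD.collectAsMap()
    val smallRDDLocalBcast = bigRDD.sparkContext.broadcast(smallRDDLocal)
    bigRDD.mapPartitions(iter =>
      // Keys missing from the small side are kept with None instead of being dropped.
      iter.map { case (k, v1) => (k, (v1, smallRDDLocalBcast.value.get(k))) },
      preservesPartitioning = true)
  }
}
```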
a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala similarity index 97% rename from src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala index 2b87a7e3..9fdef436 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -1,22 +1,20 @@ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ import org.apache.spark.ml._ import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg._ -//tag::extraImports[] import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ //end::extraImports[] //tag::basicPipelineSetup[] diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala similarity index 97% rename from src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 13e937f6..ee34ed77 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -1,22 +1,20 @@ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ import org.apache.spark.ml._ import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg._ -//tag::extraImports[] import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ //end::extraImports[] case class LabeledToken(label: Double, index: Integer) @@ -40,7 +38,7 @@ class SimpleNaiveBayes(val uid: String) // Note this estimator assumes they start at 0 and go to numClasses val numClasses = 
getNumClasses(ds) // Get the number of features by peaking at the first row - val numFeatures: Integer = ds.select(col($(featuresCol))).head + val numFeatures: Integer = ds.select(col($(featuresCol))).head() .get(0).asInstanceOf[Vector].size // Determine the number of records for each class val groupedByLabel = ds.select(col($(labelCol)).as[Double]).groupByKey(x => x) diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala similarity index 95% rename from src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala index 9117c74e..7f63ef8d 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala @@ -1,30 +1,22 @@ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} - import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -//tag::basicImport[] import org.apache.spark.ml._ -import org.apache.spark.ml.feature._ import org.apache.spark.ml.classification._ -//end::basicImport[] -//tag::renameImport[] +import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg.{Vector => SparkVector} -//end::renameImport[] import org.apache.spark.ml.param._ import org.apache.spark.ml.tuning._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ object SimplePipeline { def constructAndSetParams(df: DataFrame) = { - val sqlCtx = df.sqlContext //tag::constructSetParams[] val hashingTF = new HashingTF() hashingTF.setInputCol("input") @@ -33,7 +25,6 @@ object SimplePipeline { } def constructSimpleTransformer(df: DataFrame) = { - val sqlCtx = df.sqlContext //tag::simpleTransformer[] val hashingTF = new HashingTF() // We don't set the output column here so the default output column of @@ -69,7 +60,6 @@ object SimplePipeline { } def constructSimpleEstimator(df: DataFrame) = { - val sqlCtx = df.sqlContext //tag::simpleNaiveBayes[] val nb = new NaiveBayes() nb.setLabelCol("happy") diff --git a/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala similarity index 74% rename from src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala rename to core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala index ddbc9d65..3fab009e 100644 --- a/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala @@ -1,21 +1,17 @@ package com.highperformancespark.examples.mllib -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map import org.apache.spark._ -import org.apache.spark.rdd.RDD -//tag::imports[] -import 
com.github.fommil.netlib.BLAS.{getInstance => blas} +import org.apache.spark.mllib.classification.LogisticRegressionModel +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import org.apache.spark.mllib.feature._ import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, - LogisticRegressionModel} -// Rename Vector to SparkVector to avoid conflicts with Scala's Vector class import org.apache.spark.mllib.linalg.{Vector => SparkVector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.feature._ +import org.apache.spark.rdd.RDD + +import com.highperformancespark.examples.dataframe._ //end::imports[] object GoldilocksMLlib { @@ -97,42 +93,6 @@ object GoldilocksMLlib { } //end::trainScaler[] - //tag::word2vecSimple[] - def word2vec(sc: SparkContext, rdd: RDD[String]): RDD[SparkVector] = { - // Tokenize our data - val tokenized = rdd.map(_.split(" ").toIterable) - // Construct our word2vec model - val wv = new Word2Vec() - val wvm = wv.fit(tokenized) - val wvmb = sc.broadcast(wvm) - // WVM can now transform single words - println(wvm.transform("panda")) - // Vector size is 100 - we use this to build a transformer on top of WVM that - // works on sentences. - val vectorSize = 100 - // The transform function works on a per-word basis, but we have - // sentences as input. - tokenized.map{words => - // If there is nothing in the sentence output a null vector - if (words.isEmpty) { - Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) - } else { - // If there are sentences construct a running sum of the - // vectors for each word - val sum = Array[Double](vectorSize) - words.foreach { word => - blas.daxpy( - vectorSize, 1.0, wvmb.value.transform(word).toArray, 1, sum, 1) - } - // Then scale it by the number of words - blas.dscal(sum.length, 1.0 / words.size, sum, 1) - // And wrap it in a Spark vector - Vectors.dense(sum) - } - } - } - //end::word2vecSimple[] - //tag::hashingTFPreserve[] def toVectorPerserving(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { val ht = new HashingTF() diff --git a/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala b/core/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala b/core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala similarity index 96% rename from src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala index 40eb61fa..ca6d65c4 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala @@ -16,8 +16,9 @@ */ package com.highperformancespark.examples.ffi +import org.apache.spark.SparkContext +import org.apache.spark.SparkFiles import org.apache.spark.rdd._ -import org.apache.spark.{SparkContext, SparkFiles} object PipeExample { //tag::pipeExample[] diff --git a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala b/core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala similarity index 
86% rename from src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala index 485c73d8..16aa779e 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala @@ -1,10 +1,12 @@ package com.highperformancespark.examples.ffi object StandAlone { + // $COVERAGE-OFF$ def main(args: Array[String]) { //tag::systemLoadLibrary[] System.loadLibrary("highPerformanceSpark0") //end::systemLoadLibrary[] println(new SumJNI().sum(Array(1,2,3))) } + // $COVERAGE-ON$ } diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala similarity index 85% rename from src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala index ed0caafb..65de6c2f 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala @@ -1,6 +1,6 @@ package com.highperformancespark.examples.ffi -import ch.jodersky.jni.nativeLoader +import com.github.sbt.jni.nativeLoader //tag::sumJNIDecorator[] @nativeLoader("high-performance-spark0") diff --git a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala similarity index 86% rename from src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala rename to core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala index b6e59ae1..5a06ff63 100644 --- a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala @@ -16,18 +16,23 @@ */ package com.highperformancespark.examples.perf -import com.highperformancespark.examples.dataframe.RawPanda -import com.highperformancespark.examples.tools._ - +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext import org.apache.spark.rdd._ -import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.sql.{SparkSession, DataFrame, Dataset, Row} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types._ +import com.highperformancespark.examples.dataframe.RawPanda +import com.highperformancespark.examples.tools._ + /** * A simple performance test to compare a simple sort between DataFrame, and RDD */ object SimplePerfTest { + // $COVERAGE-OFF$ def 
main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("simple-perf-test") val sparkSession = SparkSession.builder().enableHiveSupport().getOrCreate() @@ -59,9 +64,9 @@ object SimplePerfTest { println(dataFrameTimeings.map(_._2).mkString(",")) } - def testOnRDD(rdd: RDD[(Int, Double)]) = { - rdd.map{case (x, y) => (x, (y, 1))} - .reduceByKey{case (x, y) => (x._1 + y._1, x._2 + y._2)}.count() + def testOnRDD(rdd: RDD[(Int, Double)]): Long = { + val kvc: RDD[(Int, (Double , Int))] = rdd.map{case (x, y) => (x, (y, 1))} + kvc.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)).count() } def groupOnRDD(rdd: RDD[(Int, Double)]) = { @@ -81,4 +86,5 @@ object SimplePerfTest { println(s"Time ${t1 - t0}ns") (result, t1 - t0) } + // $COVERAGE-ON$ } diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala b/core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala similarity index 97% rename from src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala rename to core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala index 2fa173ca..2cde7b2e 100644 --- a/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala @@ -5,15 +5,14 @@ package com.highperformancespark.examples.streaming import scala.reflect.ClassTag -import org.apache.hadoop.io.{LongWritable, Text} -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat - import org.apache.spark._ import org.apache.spark.rdd.RDD - -//tag::DStreamImports[] import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream._ + +import org.apache.hadoop.io.LongWritable +import org.apache.hadoop.io.Text +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat //end::DStreamImports[] object DStreamExamples { diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala b/core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala similarity index 89% rename from src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala rename to core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala index f773a2e7..0c50469e 100644 --- a/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala @@ -5,6 +5,7 @@ import scala.concurrent.duration._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.streaming._ +import org.apache.spark.sql.streaming.Trigger object Structured { @@ -21,7 +22,7 @@ object Structured { // Write out the result as parquet format("parquet"). // Specify the interval at which new data will be picked up - trigger(ProcessingTime(1.second)). + trigger(Trigger.ProcessingTime(1.second)). 
queryName("pandas").start() //end::writeComplete[] } diff --git a/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala b/core/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala rename to core/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala diff --git a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala similarity index 50% rename from src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala index 02287ae9..ffc7d838 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala @@ -2,21 +2,20 @@ package com.highperformancespark.examples.tools import scala.collection.immutable.HashSet -import com.highperformancespark.examples.dataframe.RawPanda - import org.apache.spark._ import org.apache.spark.rdd.RDD +import com.highperformancespark.examples.dataframe.RawPanda //tag::loggerImport[] -import com.typesafe.scalalogging.LazyLogging +import org.apache.logging.log4j.LogManager //end::loggerImport[] -object FilterInvalidPandas extends LazyLogging { +object FilterInvalidPandas { def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = { //tag::broadcast[] - val invalid = HashSet() ++ invalidPandas + val invalid: HashSet[Long] = HashSet() ++ invalidPandas val invalidBroadcast = sc.broadcast(invalid) input.filter{panda => !invalidBroadcast.value.contains(panda.id)} //end::broadcast[] @@ -25,11 +24,12 @@ object FilterInvalidPandas extends LazyLogging { def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = { //tag::broadcastAndLog[] - val invalid = HashSet() ++ invalidPandas + val invalid: HashSet[Long] = HashSet() ++ invalidPandas val invalidBroadcast = sc.broadcast(invalid) def keepPanda(pandaId: Long) = { + val logger = LogManager.getLogger("fart based logs") if (invalidBroadcast.value.contains(pandaId)) { - logger.debug(s"Invalid panda ${pandaId} discovered") + logger.debug("hi") false } else { true @@ -39,3 +39,24 @@ object FilterInvalidPandas extends LazyLogging { //end::broadcastAndLog[] } } + +//tag::broadcastAndLogClass[] +class AltLog() { + lazy val logger = LogManager.getLogger("fart based logs") + def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], + input: RDD[RawPanda]) = { + val invalid: HashSet[Long] = HashSet() ++ invalidPandas + val invalidBroadcast = sc.broadcast(invalid) + def keepPanda(pandaId: Long) = { + val logger = LogManager.getLogger("fart based logs") + if (invalidBroadcast.value.contains(pandaId)) { + logger.debug("hi") + false + } else { + true + } + } + input.filter{panda => keepPanda(panda.id)} + } +} +//end::broadcastAndLogClass[] diff --git a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala rename to 
core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala index da4fd384..586ee3b6 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala @@ -1,12 +1,12 @@ package com.highperformancespark.examples.tools -import com.highperformancespark.examples.dataframe.RawPanda - import org.apache.spark._ +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row -import org.apache.spark.mllib.random.RandomRDDs -import org.apache.spark.mllib.linalg.Vector + +import com.highperformancespark.examples.dataframe.RawPanda object GenerateScalingData { /** diff --git a/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala new file mode 100644 index 00000000..21b7afa7 --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala @@ -0,0 +1,41 @@ +package com.highperformancespark.examples.gpu + +import org.apache.spark.sql.SparkSession +import org.apache.spark.resource._ +import org.apache.spark.resource.ResourceProfileBuilder +import org.apache.spark.TaskContext + +object GPUResourceProfileExample { + def main(args: Array[String]): Unit = { + val spark = SparkSession.builder() + .appName("GPUResourceProfileExample") + .getOrCreate() + run(spark) + } + + def run(spark: SparkSession) = { + val sc = spark.sparkContext + //tag::gpuResourceProfileExample[] + // Create a resource profile requesting 2 NVIDIA GPUs per executor and 1 per task + val gpuResourceProfile = new ResourceProfileBuilder() + .require(new ExecutorResourceRequests().resource( + "gpu", 2, vendor="nvidia", + discoveryScript="/opt/spark/bin/getGpusResources.sh" // See sample in Spark repo + )) + .require(new TaskResourceRequests().resource("gpu", 1)) + .build() + + // Use resource profile to run on a machine with GPUs. + val rdd = sc.parallelize(1 to 4, 4) + .withResources(gpuResourceProfile) + .map { i => + // Do some special GPU stuff here my friend + i + } + //end::gpuResourceProfileExample[] + + rdd.collect().foreach(println) + + spark.stop() + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala similarity index 98% rename from src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala rename to core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala index 298a7c3f..30684411 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala @@ -1,5 +1,5 @@ +import scala.reflect.ClassTag import scala.util.Random -import scala.reflect.{ClassTag} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD diff --git a/core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala new file mode 100644 index 00000000..f58cdbb9 --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -0,0 +1,126 @@ +/** + * Illustrates how to use Spark accumulators. 
Note that most of these examples + * are "dangerous" in that they may not return consistent results. + */ +package com.highperformancespark.examples.transformations + +import java.{lang => jl} + +import scala.collection.mutable.HashSet + +import org.apache.spark._ +import org.apache.spark.rdd._ +import org.apache.spark.util.AccumulatorV2 + +import com.highperformancespark.examples.dataframe.RawPanda +object Accumulators { + /** + * Compute the total fuzzyness with an accumulator while generating + * an id and zip pair for sorting. + */ + //tag::sumFuzzyAcc[] + def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): + (RDD[(String, Long)], Double) = { + // Create an accumulator with the initial value of 0.0 + val acc = sc.doubleAccumulator + val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} + // accumulator still has zero value + // Note: This example is dangerous since the transformation may be + // evaluated multiple times. + transformed.count() // force evaluation + (transformed, acc.value) + } + //end::sumFuzzyAcc[] + + /** + * Compute the max fuzzyness with an accumulator while generating an + * id and zip pair for sorting. + */ + //tag::maxFuzzyAcc[] + def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): + (RDD[(String, Long)], Double) = { + class MaxDoubleParam extends AccumulatorV2[jl.Double, jl.Double] { + var _value = Double.MinValue + override def isZero(): Boolean = { + _value == Double.MinValue + } + override def reset() = { + _value = Double.MinValue + } + + override def add(r1: jl.Double): Unit = { + _value = Math.max(r1, _value) + } + + def add(r1: Double): Unit = { + _value = Math.max(r1, _value) + } + + def copy(): MaxDoubleParam = { + val newAcc = new MaxDoubleParam() + newAcc._value = _value + newAcc + } + + override def merge(other: AccumulatorV2[jl.Double, jl.Double]): Unit = other match { + case o: MaxDoubleParam => + _value = Math.max(_value, o._value) + case _ => + throw new UnsupportedOperationException( + s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") + } + + override def value: jl.Double = _value + } + // Create an accumulator with the initial value of Double.MinValue + val acc = new MaxDoubleParam() + sc.register(acc) + val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} + // accumulator still has Double.MinValue + // Note: This example is dangerous since the transformation may be + // evaluated multiple times. 
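Editor's note: both accumulator examples in this new file carry the same warning — updates made inside a transformation can be applied more than once if the stage is recomputed. When the value matters, a safer pattern is to do the update inside an action, where Spark applies each task's contribution exactly once. A minimal sketch (assuming a live SparkContext and the repo's RawPanda; not part of this change):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import com.highperformancespark.examples.dataframe.RawPanda

object FuzzyNessInAction {
  // Because the update happens inside an action (foreach), restarted tasks
  // do not double count their accumulator contributions.
  def totalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): Double = {
    val acc = sc.doubleAccumulator("fuzzyness")
    rdd.foreach(panda => acc.add(panda.attributes(0)))
    acc.value
  }
}
```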
+ transformed.count() // force evaluation + (transformed, acc.value) + } + //end::maxFuzzyAcc[] + + //tag::uniquePandaAcc[] + def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { + class UniqParam extends AccumulatorV2[Long, HashSet[Long]] { + val _values = new HashSet[Long] + override def isZero() = _values.isEmpty + + override def copy(): UniqParam = { + val nacc = new UniqParam + nacc._values ++= _values + nacc + } + + override def reset(): Unit = { + _values.clear() + } + + override def merge(other: AccumulatorV2[Long, HashSet[Long]]): Unit = other match { + case o: UniqParam => + _values ++= o._values + case _ => + throw new UnsupportedOperationException( + s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") + } + + override def value: HashSet[Long] = _values + // For adding new values + override def add(t: Long) = { + _values += t + } + } + // Create an accumulator with the initial value of Double.MinValue + val acc = new UniqParam() + sc.register(acc) + val transformed = rdd.map{x => acc.add(x.id); (x.zip, x.id)} + // accumulator still has zero values + transformed.count() // force evaluation + acc.value + } + //end::uniquePandaAcc[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala similarity index 99% rename from src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala index 948df496..aca85410 100644 --- a/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala @@ -6,15 +6,13 @@ */ package com.highperformancespark.examples.transformations -import com.highperformancespark.examples.dataframe.RawPanda +import scala.collection.mutable.HashSet import org.apache.spark._ -//tag::import[] -import org.apache.spark.util.AccumulatorV2 -//end::import[] import org.apache.spark.rdd._ +import org.apache.spark.util.AccumulatorV2 -import scala.collection.mutable.HashSet +import com.highperformancespark.examples.dataframe.RawPanda object NewAccumulators { /** * Compute the total fuzzyness with an accumulator while generating diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala diff --git a/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala b/core/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala rename 
to core/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala diff --git a/src/test/java/com/highperformancespark/examples/JavaInteropTest.java b/core/src/test/java/com/highperformancespark/examples/JavaInteropTest.java similarity index 100% rename from src/test/java/com/highperformancespark/examples/JavaInteropTest.java rename to core/src/test/java/com/highperformancespark/examples/JavaInteropTest.java diff --git a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java b/core/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java similarity index 98% rename from src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java rename to core/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java index d6bec37c..284397f9 100644 --- a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java +++ b/core/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java @@ -3,7 +3,7 @@ import com.highperformancespark.examples.objects.JavaPandaInfo; import com.highperformancespark.examples.objects.JavaPandas; import com.highperformancespark.examples.objects.JavaRawPanda; -import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; +//import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -16,6 +16,8 @@ import static org.junit.Assert.*; +// Temporarily disable until we upgrade to Spark 3.3 +/* public class JavaHappyPandasTest extends JavaDataFrameSuiteBase { String toronto = "toronto"; String sandiego = "san diego"; @@ -149,3 +151,4 @@ public void simpleSQLExample() { } } +*/ diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala similarity index 86% rename from src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index 3fb10a53..854fc4e2 100644 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -4,17 +4,24 @@ */ package com.highperformancespark.examples.dataframe -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} -import com.holdenkarau.spark.testing._ -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.scalatest.Matchers._ -import org.scalatest.FunSuite - import scala.collection.mutable import scala.util.Random -class HappyPandasTest extends FunSuite with DataFrameSuiteBase { +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.{SQLContext, SparkSession} +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { + + override def appName: String = "happyPandasTest" + val toronto = "toronto" val sandiego = "san diego" val virginia = "virginia" @@ -44,6 +51,30 @@ class 
HappyPandasTest extends FunSuite with DataFrameSuiteBase { rez.foreach{x => assert(x(0) == x(1))} } + test("bad regexp join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.badComplexJoin(df1, df2).collect() + } + + test("bad udf join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.badJoin(df1, df2).collect() + } + + test("ok udf join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.okJoin(df1, df2).collect() + } + test("simple explode test") { val inputDF = sqlContext.createDataFrame(pandaPlaces) val pandaInfo = sqlContext.createDataFrame(rawPandaList) @@ -64,7 +95,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { val expectedDf = createDF(expectedList, ("place", StringType), ("percentHappy", DoubleType)) - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPercentage(inputDF) assertDataFrameApproximateEquals(expectedDf, resultDF, 1E-5) @@ -72,7 +103,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { //end::approxEqualDataFrames[] test("verify approx by hand") { - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPercentage(inputDF) val resultRows = resultDF.collect() @@ -90,7 +121,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { } test("test encode Panda type") { - val inputDF = sqlContext.createDataFrame(rawPandaList) + val inputDF = spark.createDataFrame(rawPandaList) val resultDF = HappyPandas.encodePandaType(inputDF) val expectedRows = List(Row(10L, 0), Row(11L, 1)) @@ -103,7 +134,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { //tag::exactEqualDataFrames[] test("verify exact equality") { // test minHappyPandas - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val result = HappyPandas.minHappyPandas(inputDF, 2) val resultRows = result.collect() @@ -113,12 +144,12 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { //end::exactEqualDataFrames[] test("test happyPandasPlaces") { - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPlaces(inputDF) val expectedRows = List(PandaInfo(toronto, "giant", 1, 2), PandaInfo(sandiego, "red", 2, 3)) - val expectedDF = sqlContext.createDataFrame(expectedRows) + val expectedDF = spark.createDataFrame(expectedRows) assertDataFrameEquals(expectedDF, resultDF) } @@ -230,7 +261,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { .flatMap(zipPandas => { val pandas = zipPandas._2 val length = pandas.size - 1 - val result = new mutable.MutableList[Row] + val result = new mutable.ListBuffer[Row] for (i <- 0 to length) { var totalSum = 0 diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala new file mode 100644 index 00000000..cbd79adc --- /dev/null +++ 
b/core/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala @@ -0,0 +1,180 @@ +/** + * Checks basic Dataset magics + */ +package com.highperformancespark.examples.dataframe + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +class MixedDatasetSuite extends AnyFunSuite + with DataFrameSuiteBase + with DatasetSuiteBase + with RDDComparisons { + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9, 20.0)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.7, 30.0))) + + test("happy panda sums") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val result = mixedDS.happyPandaSums(inputDS) + assert(result === (2.0 +- 0.001)) + } + + test("basic select") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val squishy = mixedDS.squishyPandas(inputDS).collect() + assert(squishy(0)._2 === true) + } + + test("funquery") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val summedAttrs = mixedDS.funMap(inputDS).collect() + assert(summedAttrs(0) === 21.9 +- 0.001) + assert(summedAttrs(1) === 31.7 +- 0.001) + } + + test("max pandas size per zip") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val bigPandas = mixedDS.maxPandaSizePerZip(inputDS).collect() + assert(bigPandas.size === 1) + assert(bigPandas(0)._2 === 30.0 +- 0.00001) + } + + test("max pandas size per zip scala version") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val bigPandas = mixedDS.maxPandaSizePerZipScala(inputDS).collect() + assert(bigPandas.size === 1) + assert(bigPandas(0)._2 === 30.0 +- 0.00001) + } + + test("union pandas") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val happyPandas = sqlCtx.createDataset(rawPandaList.take(1)) + val sadPandas = sqlCtx.createDataset(rawPandaList.drop(1)) + val mixedDS = new MixedDataset(sqlCtx) + val unionPandas = mixedDS.unionPandas(happyPandas, sadPandas).collect + assert(unionPandas.toSet == rawPandaList.toSet) + } + + test("typed query") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val typedResult = mixedDS.typedQueryExample(inputDS) + assert(typedResult.collect().toList == rawPandaList.map(_.attributes(0))) + } + + test("join different dataset") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val pandaDS = 
sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val rawCoffeeShop = List( + CoffeeShop("94110", "Starbucks"), + CoffeeShop("98765", "Caribou") + ) + val coffeeShopDS = sqlCtx.createDataFrame(rawCoffeeShop).as[CoffeeShop] + val mixedDS = new MixedDataset(sqlCtx) + val joinResult = mixedDS.joinSample(pandaDS, coffeeShopDS) + val expected = for { + panda <- rawPandaList + coffeeShop <- rawCoffeeShop + if (panda.zip == coffeeShop.zip) + } yield (panda, coffeeShop) + assert(joinResult.collect().toSet == expected.toSet) + } + + test("self join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val selfJoinResult = mixedDS.selfJoin(inputDS) + val expected = for { + left <- rawPandaList + right <- rawPandaList + if (left.zip == right.zip) + } yield (left, right) + assert(selfJoinResult.collect().toSet == expected.toSet) + } + + test("convert an RDD to DS") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val result = mixedDS.fromRDD(rdd) + val expected = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + assertDatasetEquals(expected, result) + } + + test("convert a Dataset to an RDD") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val dataset = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val result = mixedDS.toRDD(dataset) + val expected = sc.parallelize(rawPandaList) + assertRDDEquals(expected, result) + } + + test("convert a Dataset to a DataFrame") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val dataset = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val result = mixedDS.toDF(dataset) + val expected = sqlCtx.createDataFrame(rawPandaList) + assertDataFrameEquals(expected, result) + } + + + test("convert a DataFrame to a DataSset") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val dataframe = sqlCtx.createDataFrame(rawPandaList) + val result = mixedDS.fromDF(dataframe) + val expected = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + assertDatasetEquals(expected, result) + } + +} diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala new file mode 100644 index 00000000..17215ab2 --- /dev/null +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala @@ -0,0 +1,48 @@ +/** + * Happy Panda Example for DataFrames. + * Computes the % of happy pandas. Very contrived. 
+ */ +package com.highperformancespark.examples.dataframe + +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.{SQLContext, SparkSession} +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +case class ExtraMagic( + place: String, + pandaType: String, + happyPandas: Integer, + totalPandas: Integer, + extraInfo: Integer) + + +class PandaPlaceFilterPushdown extends AnyFunSuite with DataFrameSuiteBase { + + override def appName: String = "pandaPlaceFilterPushdown" + + val basicList = List( + ExtraMagic("a", "b", 1, 2, 3), + ExtraMagic("toronto", "b", 1, 2, 3), + ) + + test("simpleFilterTest") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(basicList) + val restrictedDF = inputDF.select($"place", $"pandaType", $"happyPandas", $"totalPandas") + val switched = inputDF.as[PandaInfo] + // Note if we write the filter with functional syntax it does not push down. + val filtered = switched.filter($"place" === "a") + assert(filtered.count() === 1) + } +} diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala new file mode 100644 index 00000000..91408fd7 --- /dev/null +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala @@ -0,0 +1,49 @@ +/** + * Happy Panda Example for DataFrames. + * Computes the % of happy pandas. Very contrived. 
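Editor's note: the new PandaPlaceFilterPushdown test above observes that writing the filter with the functional (typed lambda) syntax does not push down. The difference, sketched here outside this change with hypothetical names and a placeholder path, is that a Column expression stays visible to Catalyst while a Scala closure is opaque:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical record type for this sketch only.
case class Place(place: String, pandaType: String)

object PushdownContrastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("pushdown-contrast-sketch")
      .getOrCreate()
    import spark.implicits._

    // Placeholder path; any columnar source (Parquet, ORC, ...) shows the effect.
    val ds = spark.read.parquet("/tmp/places.parquet").as[Place]

    // Column expression: Catalyst sees the predicate, so it can appear as a
    // PushedFilters entry on the scan in the physical plan.
    ds.filter($"place" === "a").explain()

    // Typed lambda: the function is opaque to Catalyst, so rows are deserialized
    // and filtered after the scan; nothing is pushed down.
    ds.filter(p => p.place == "a").explain()

    spark.stop()
  }
}
```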
+ */ +package com.highperformancespark.examples.dataframe + +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import org.apache.spark.sql.execution.ExplainMode +import org.apache.spark.sql.types.IntegerType +import org.apache.spark.sql.functions.{lower, rand} +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +class SQLExtensionTest extends AnyFunSuite with ScalaDataFrameSuiteBase { + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.9))) + + override def conf: SparkConf = { + val initialConf = super.conf + initialConf.set( + "spark.sql.extensions", + "com.highperformancespark.examples.dataframe.SQLExtension") + } + + def explainToString(df: DataFrame): String = { + df.queryExecution.explainString(ExplainMode.fromString("extended")) + } + + test("Magic") { + import spark.implicits._ + val inputDF = spark.createDataFrame(rawPandaList) + spark.sql("DROP TABLE IF EXISTS farts") + inputDF.write.saveAsTable("farts") + val testDF = spark.read.table("farts") + val explained: String = explainToString(testDF.select($"zip".cast(IntegerType))) + explained should include ("isnotnull(zip#") + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala similarity index 86% rename from src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala index 2b54ce75..8a6ba097 100644 --- a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala @@ -1,10 +1,9 @@ package com.highperformancespark.examples.errors import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class ThrowsSuite extends FunSuite with SharedSparkContext { +class ThrowsSuite extends AnyFunSuite with SharedSparkContext { test("inner throw & outer throw should both throw SparkExceptions exceptions") { intercept[org.apache.spark.SparkException] { Throws.throwInner(sc) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala similarity index 96% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala index 97082841..4067fcba 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala @@ -1,10 +1,11 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.rdd.RDD -import org.scalatest.FunSuite -class EvaluationTests extends FunSuite with SharedSparkContext { +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite + +class 
EvaluationTests extends AnyFunSuite with SharedSparkContext { val doubleList = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0) val keyValuePairs = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0).zipWithIndex val path = "target/testResults" diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala similarity index 89% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala index 53884778..2e7fea8c 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala @@ -1,15 +1,21 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext +import scala.collection.immutable.IndexedSeq + import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{StructType, DoubleType, StructField} -import org.apache.spark.sql.{Row, SQLContext, DataFrame} -import org.scalatest.FunSuite +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.types.DoubleType +import org.apache.spark.sql.types.StructField +import org.apache.spark.sql.types.StructType -import scala.collection.immutable.IndexedSeq +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite +import org.apache.spark.sql.SparkSession -class GoldilocksLargeTests extends FunSuite with SharedSparkContext{ +class GoldilocksLargeTests extends AnyFunSuite with SharedSparkContext{ def testGoldilocksImplementations( @@ -47,7 +53,7 @@ class GoldilocksLargeTests extends FunSuite with SharedSparkContext{ } test("Goldilocks on local data solution "){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val testRanks = List(3L, 8L) val (smallTestData, result) = DataCreationUtils.createLocalTestData(5, 10, testRanks) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala similarity index 81% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala index 69dcc5e8..ea0a16af 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala @@ -1,11 +1,12 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.rdd.RDD -import org.scalatest.FunSuite + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite -class JoinTest extends FunSuite with SharedSparkContext { +class JoinTest extends AnyFunSuite with SharedSparkContext { test("Hash join"){ val keySet = "a, b, c, d, e, f, g".split(",") val smallRDD = sc.parallelize(keySet.map(letter => (letter, letter.hashCode))) @@ -13,7 +14,7 @@ class JoinTest extends FunSuite with SharedSparkContext { sc.parallelize(keySet.flatMap{ letter => Range(1, 50).map(i => (letter, 
letter.hashCode() / i.toDouble))}) val result: RDD[(String, (Double, Int))] = - RDDJoinExamples.manualBroadCastHashJoin( + RDDJoinExamples.manualBroadcastHashJoin( largeRDD, smallRDD) val nativeJoin: RDD[(String, (Double, Int))] = largeRDD.join(smallRDD) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala similarity index 87% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala index 131f3111..92130165 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala @@ -1,12 +1,16 @@ package com.highperformancespark.examples.goldilocks import org.apache.spark._ -import org.apache.spark.sql.{Row, SQLContext} -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext + +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite +import org.apache.spark.sql.SparkSession // tag::MAGIC_PANDA[] -class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { +class QuantileOnlyArtisanalTest extends AnyFunSuite with BeforeAndAfterAll { @transient private var _sc: SparkContext = _ def sc: SparkContext = _sc @@ -31,15 +35,15 @@ class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { 3 -> Set(6.0, 7.0)) test("Goldilocks naive Solution"){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val whileLoopSolution = GoldilocksWhileLoop.findRankStatistics( input, List(2L, 3L)).mapValues(_.toSet) val inputAsKeyValuePairs = GoldilocksGroupByKey.mapToKeyValuePairs(input) val groupByKeySolution = GoldilocksGroupByKey.findRankStatistics( inputAsKeyValuePairs, List(2L,3L)).mapValues(_.toSet) - assert(whileLoopSolution == expectedResult) - assert(groupByKeySolution == expectedResult) + assert(whileLoopSolution.toMap == expectedResult) + assert(groupByKeySolution.toMap == expectedResult) } override def afterAll() { @@ -56,7 +60,7 @@ class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { // We don't need the rest of the tests included. 
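The JoinTest hunk above compares `RDDJoinExamples.manualBroadcastHashJoin` (note the corrected camel case) against Spark's built-in `join`. A minimal sketch of the manual broadcast hash join pattern that test exercises, assuming only the signature visible in the test; the object and variable names here are illustrative, not the book's actual implementation:

```scala
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

object ManualBroadcastJoinSketch {
  // Collect the small side to the driver, broadcast it, and probe it per
  // partition of the large side -- no shuffle of the large RDD.
  def manualBroadcastHashJoin[K: ClassTag, V1: ClassTag, V2: ClassTag](
      bigRDD: RDD[(K, V1)],
      smallRDD: RDD[(K, V2)]): RDD[(K, (V1, V2))] = {
    val smallAsMap = bigRDD.sparkContext.broadcast(smallRDD.collectAsMap())
    bigRDD.mapPartitions { iter =>
      iter.flatMap { case (k, v1) =>
        smallAsMap.value.get(k).map(v2 => (k, (v1, v2)))
      }
    }
  }
}
```

This only pays off when the small side fits comfortably in memory on the driver and on every executor; otherwise a regular shuffle join is the safer default.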
class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { test("Goldilocks first try ") { - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondAndThird = GoldilocksFirstTry.findRankStatistics( input, targetRanks = List(2L, 3L)) @@ -112,7 +116,7 @@ class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { test("GoldiLocks With Hashmap ") { - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondAndThird = GoldilocksWithHashMap.findRankStatistics( input, targetRanks = List(2L, 3L)) @@ -127,12 +131,12 @@ class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { } test("Goldilocks Secondary Sort"){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondarySortSolution = GoldilocksWithHashMap.findRankStatistics( input, targetRanks = List(2L, 3L)).mapValues(_.toSet) - assert(secondarySortSolution == expectedResult) + assert(secondarySortSolution.toMap == expectedResult) } test("Secondary Sort"){ diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala similarity index 96% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala index 4ac03e77..2ff69cbf 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala @@ -1,14 +1,15 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext +import scala.reflect.ClassTag + import org.apache.spark.rdd.RDD -import org.scalatest.FunSuite -import scala.reflect.ClassTag +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite -class SortingTests extends FunSuite with SharedSparkContext { +class SortingTests extends AnyFunSuite with SharedSparkContext { test("Test Sort by two keys"){ diff --git a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala similarity index 89% rename from src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala rename to core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala index 3b9159c1..940d2231 100644 --- a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -3,13 +3,14 @@ */ package com.highperformancespark.examples.ml -import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.sql.Dataset -import org.scalatest.FunSuite + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.scalatest.funsuite.AnyFunSuite case class TestRow(id: Int, inputColumn: String) -class CustomPipelineSuite extends FunSuite with DataFrameSuiteBase { +class CustomPipelineSuite extends AnyFunSuite with DataFrameSuiteBase { val d = List( TestRow(0, "a"), TestRow(1, "b"), diff --git 
a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala similarity index 75% rename from src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala rename to core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 1fa296a0..7a893107 100644 --- a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -3,21 +3,24 @@ */ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} - -import com.holdenkarau.spark.testing._ - import org.apache.spark.ml._ import org.apache.spark.ml.feature._ import org.apache.spark.ml.param._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext} -import org.scalatest.Matchers._ -import org.scalatest.FunSuite + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ case class MiniPanda(happy: Double, fuzzy: Double, old: Double) -class SimpleNaiveBayesSuite extends FunSuite with DataFrameSuiteBase { +class SimpleNaiveBayesSuite extends AnyFunSuite with DataFrameSuiteBase { val miniPandasList = List( MiniPanda(1.0, 1.0, 1.0), MiniPanda(1.0, 1.0, 0.0), diff --git a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala similarity index 89% rename from src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala index fa551a50..05b70e8e 100644 --- a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala @@ -3,16 +3,13 @@ */ package com.highperformancespark.examples.mllib -import com.highperformancespark.examples.dataframe.RawPanda +import org.apache.spark.mllib.linalg.{Vector => SparkVector} +import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - - -import org.apache.spark.mllib.linalg.{Vector => SparkVector} - -class GoldilocksMLlibSuite extends FunSuite with SharedSparkContext { +class GoldilocksMLlibSuite extends AnyFunSuite with SharedSparkContext { val rps = List( RawPanda(1L, "94110", "giant", true, Array(0.0, 0.0)), RawPanda(2L, "94110", "giant", false, Array(0.0, 3.0)), diff --git a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala b/core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala similarity index 83% rename from src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala rename to core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala index 724ddaa3..0b0ed361 100644 --- 
a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -4,13 +4,14 @@ package com.highperformancespark.examples.ffi import com.holdenkarau.spark.testing._ -import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Arbitrary +import org.scalacheck.Gen import org.scalacheck.Prop.forAll -import org.scalatest.FunSuite -import org.scalatest.prop.Checkers -import org.scalatest.Matchers._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ +import org.scalatestplus.scalacheck.Checkers -class NativeExampleSuite extends FunSuite +class NativeExampleSuite extends AnyFunSuite with SharedSparkContext with Checkers with RDDComparisons { test("local sum") { diff --git a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala similarity index 61% rename from src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala index 4b1f0324..aa45fe1e 100644 --- a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala @@ -4,12 +4,12 @@ package com.highperformancespark.examples.ffi import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite -import org.scalatest.prop.Checkers -import org.scalatest.Matchers._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ +import org.scalatestplus.scalacheck.Checkers -class PipeExampleSuite extends FunSuite with SharedSparkContext with Checkers { +class PipeExampleSuite extends AnyFunSuite with SharedSparkContext with Checkers { ignore("commentors on a pr") { val rdd = sc.parallelize(List(12883)) val expected = (12883, List("SparkQA", "srowen")) diff --git a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala similarity index 86% rename from src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala index b5a3d440..871e2aa9 100644 --- a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala @@ -5,14 +5,14 @@ */ package com.highperformancespark.examples.streaming +import java.lang.Thread + import org.apache.spark.streaming._ -import java.lang.Thread import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class DStreamExamplesSuite extends FunSuite with SharedSparkContext { +class DStreamExamplesSuite extends AnyFunSuite with SharedSparkContext { test("simple set up") { val ssc = DStreamExamples.makeStreamingContext(sc) val inputStream = DStreamExamples.fileAPIExample(ssc, "./") diff --git a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala similarity index 88% rename from src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala rename to 
core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala index ca364d14..a0afb64b 100644 --- a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala @@ -3,14 +3,14 @@ */ package com.highperformancespark.examples.tokenize +import java.lang.Thread + import org.apache.spark.streaming._ -import java.lang.Thread import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class SampleTokenizeSuite extends FunSuite with SharedSparkContext { +class SampleTokenizeSuite extends AnyFunSuite with SharedSparkContext { val input = List("hi holden", "I like coffee") val expected = List("hi", "holden", "I", "like", "coffee") diff --git a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala similarity index 58% rename from src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala index 897a8d39..545b7898 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala @@ -4,12 +4,10 @@ package com.highperformancespark.examples.tools import com.highperformancespark.examples.dataframe.RawPanda - import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class FilterInvalidPandasSuite extends FunSuite with SharedSparkContext { +class FilterInvalidPandasSuite extends AnyFunSuite with SharedSparkContext { test("simple filter") { val invalidPandas = List(1L, 2L) val inputPandas = List( @@ -23,4 +21,16 @@ class FilterInvalidPandasSuite extends FunSuite with SharedSparkContext { assert(result1.collect() === result2.collect()) assert(result1.count() === 1) } + + test("alt log") { + val invalidPandas = List(1L, 2L) + val inputPandas = List( + RawPanda(1L, "94110", "giant", true, Array(0.0)), + RawPanda(3L, "94110", "giant", true, Array(0.0))) + val input = sc.parallelize(inputPandas) + val al = new AltLog() + val result1 = + al.filterInvalidPandasWithLogs(sc, invalidPandas, input) + assert(result1.count() === 1) + } } diff --git a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala similarity index 91% rename from src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala index 15f60d12..1d761601 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala @@ -4,12 +4,10 @@ package com.highperformancespark.examples.tools import com.highperformancespark.examples.dataframe.RawPanda - import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class GeneratescalaingDataSuite extends FunSuite with SharedSparkContext { +class GeneratescalaingDataSuite extends AnyFunSuite with SharedSparkContext { // The number 
of entries depends somewhat on the partition split because we // zip multiple separate RDDs so its more of a "request" test("expected num entries") { diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala similarity index 57% rename from src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala rename to core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala index 5eb995f2..48991e0d 100644 --- a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -3,13 +3,13 @@ */ package com.highperformancespark.examples.transformations -import com.highperformancespark.examples.dataframe.RawPanda +import scala.collection.immutable.HashSet +import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class AccumulatorsTest extends FunSuite with SharedSparkContext { +class AccumulatorsTest extends AnyFunSuite with SharedSparkContext { test("accumulator max should function") { val input = sc.parallelize(1.to(100)).map(x => RawPanda(1L, "1", "red", true, Array(x.toDouble))) @@ -23,4 +23,17 @@ class AccumulatorsTest extends FunSuite with SharedSparkContext { val (_, sum) = Accumulators.computeTotalFuzzyNess(sc, input) assert(sum === 5050.0) } + + test("accumulator unique should function") { + val input1 = sc.parallelize(1 to 100).map(x => + RawPanda(1L, "1", "red", true, Array(x.toDouble)) + ) + + val input2 = sc.parallelize(1 to 100).map(x => + RawPanda(2L, "2", "blude", false, Array(x.toDouble)) + ) + + val set = Accumulators.uniquePandas(sc, input1 ++ input2) + assert(set == HashSet(2, 1)) + } } diff --git a/core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala new file mode 100644 index 00000000..68eab956 --- /dev/null +++ b/core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala @@ -0,0 +1,63 @@ +package com.highperformancespark.examples.wordcount + + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite + +class WordCountTest extends AnyFunSuite with SharedSparkContext { + test("word count with Stop Words Removed"){ + val wordRDD = sc.parallelize(Seq( + "How happy was the panda? 
You ask.", + "Panda is the most happy panda in all the #$!?ing land!")) + + val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") + val illegalTokens: Array[Char] = "#$%?!.".toCharArray + + val wordCounts = WordCount.withStopWordsFiltered( + wordRDD, illegalTokens, stopWords) + val wordCountsAsMap = wordCounts.collectAsMap() + assert(!wordCountsAsMap.contains("the")) + assert(!wordCountsAsMap.contains("?")) + assert(!wordCountsAsMap.contains("#$!?ing")) + assert(wordCountsAsMap.contains("ing")) + assert(wordCountsAsMap.get("panda").get.equals(3)) + } + + test("word count with simple counting") { + val wordRDD = sc.parallelize( + Seq( + "a b c d", + "b c d e" + ) + ) + val wordCounts = WordCount.simpleWordCount(wordRDD) + + val wordCountsAsMap = wordCounts.collectAsMap() + + for (character <- 'a' to 'e') { + assert(wordCountsAsMap.contains(character.toString)) + } + for (character <- 'b' to 'd') { + assert(wordCountsAsMap.get(character.toString).get == 2) + } + } + + test("word count with bad idea") { + val wordRDD = sc.parallelize( + Seq( + "a b c d", + "b c d e" + ) + ) + val wordCounts = WordCount.badIdea(wordRDD) + + val wordCountsAsMap = wordCounts.collectAsMap() + + for (character <- 'a' to 'e') { + assert(wordCountsAsMap.contains(character.toString)) + } + for (character <- 'b' to 'd') { + assert(wordCountsAsMap.get(character.toString).get == 2) + } + } +} diff --git a/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala b/core/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala similarity index 100% rename from src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala rename to core/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala diff --git a/data/project.csv b/data/project.csv new file mode 100644 index 00000000..69210101 --- /dev/null +++ b/data/project.csv @@ -0,0 +1,5 @@ +creator,projectname,stars +holdenk,spark-upgrade,17 +krisnova,rust-nova,71 +kbendick,MongoMart,6 +mateiz,spark,36600 \ No newline at end of file diff --git a/env_setup.sh b/env_setup.sh new file mode 100755 index 00000000..f31f427d --- /dev/null +++ b/env_setup.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +set -ex + +# Download Spark and iceberg if not present +SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} +SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.3"} +SCALA_VERSION=${SCALA_VERSION:-"2.13"} +HADOOP_VERSION="3" +SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" +if [ "$SCALA_VERSION" = "2.13" ]; then + SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3-scala2.13.tgz" + SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala2.13" +fi +ICEBERG_VERSION=${ICEBERG_VERSION:-"1.9.2"} +if [ ! -f "${SPARK_FILE}" ]; then + SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" + SPARK_ARCHIVE_DIST_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" + if command -v axel &> /dev/null + then + (axel --quiet "$SPARK_DIST_URL" || axel --quiet "$SPARK_ARCHIVE_DIST_URL") & + else + (wget --quiet "$SPARK_DIST_URL" || wget --quiet "$SPARK_ARCHIVE_DIST_URL") & + fi +fi +# Download Icberg if not present +ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar" +if [ ! 
-f "${ICEBERG_FILE}" ]; then + wget --quiet "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & +fi +wait +sleep 1 +# Setup the env +if [ ! -d "${SPARK_PATH}" ]; then + tar -xf "${SPARK_FILE}" +fi + +SPARK_HOME="${SPARK_PATH}" +export SPARK_HOME + +if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then + # Delete the old JAR first. + rm "${SPARK_PATH}/jars/iceberg-spark-runtime*.jar" || echo "No old version to delete." + cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" +fi + +# Set up for running pyspark and friends +export PATH="${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH}" + +# Make sure we have a history directory +mkdir -p /tmp/spark-events + +mkdir -p ./data/fetched/ +if [ ! -f ./data/fetched/2021 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021 +fi +if [ ! -f ./data/fetched/2022 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2022" -O ./data/fetched/2022 +fi +if [ ! -f ./data/fetched/2023 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2023" -O ./data/fetched/2023 +fi + diff --git a/high_performance_pyspark/__init__.py b/high_performance_pyspark/__init__.py index 7741593d..3f79c0dd 100644 --- a/high_performance_pyspark/__init__.py +++ b/high_performance_pyspark/__init__.py @@ -22,4 +22,3 @@ import os import sys - diff --git a/iceberg-workshop-solutions/Workshop-Template.ipynb b/iceberg-workshop-solutions/Workshop-Template.ipynb new file mode 100644 index 00000000..472a9c3c --- /dev/null +++ b/iceberg-workshop-solutions/Workshop-Template.ipynb @@ -0,0 +1,552 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "34577ad3-822f-4370-bcba-56b9fcec3196", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.spark.sql._\n", + "import scala.sys.process._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d3141ec-7779-467a-9f76-2e51030fd1c7", + "metadata": {}, + "outputs": [], + "source": [ + "// So now we need to configure Spark to use Iceberg\n", + "// See https://iceberg.apache.org/docs/1.6.0/spark-configuration/ & https://iceberg.apache.org/docs/1.6.0/spark-getting-started/\n", + "// We'll use the \"hadoop\" (aka file) catalog & /high-performance-spark-examples/warehouse for the location\n", + "val spark = (SparkSession.builder.master(\"local[*]\")\n", + " // Setup the extensions\n", + " // You'll want to configure Iceberg here as discussed above\n", + " // If you want to match the solution you'll want to configure the Iceberg catalog to be \"local.\"\n", + " .getOrCreate()\n", + " )\n", + "import spark._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c53080aa-a9d6-45f9-968b-8e052e7fa963", + "metadata": {}, + "outputs": [], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88d54d05-0c49-4268-9b65-8c72679cb0f7", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sparkContext.uiWebUrl.get" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "270730c9-9787-407c-ba22-f0cee1f67f53", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the current data\n", + "val df = spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(\"/high-performance-spark-examples/data/fetched/2021\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "87ca6359-86bc-42a4-93dd-4fc64496b145", + "metadata": {}, + "outputs": [], + "source": [ + "// Drop existing table if present & create new table\n", + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bdeb3eb-b725-409b-ab3a-409d0e8309ae", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out\n", + "df.write.saveAsTable(\"local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554c6036-0c6b-4e3c-a9e1-7251c608b48f", + "metadata": {}, + "outputs": [], + "source": [ + "\"ls /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb541fbf-4a79-402d-a6b2-e999106e9a18", + "metadata": {}, + "outputs": [], + "source": [ + "\"cat /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/v1.metadata.json\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90149834-27a2-45a3-aa8a-dae2162da854", + "metadata": {}, + "outputs": [], + "source": [ + "// Iceberg Java SDK time imports\n", + "import java.util.HashMap\n", + "import java.util.Map\n", + "\n", + "import org.apache.iceberg.Table\n", + "import org.apache.iceberg.catalog.TableIdentifier\n", + "import org.apache.iceberg.hadoop.HadoopCatalog\n", + "\n", + "\n", + "// And to handle java types\n", + "import scala.jdk.CollectionConverters._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf56bc6-d420-474c-b3a8-ded03b23eff8", + "metadata": {}, + "outputs": [], + "source": [ + "// Create a local Iceberg Catalog client. Here we're using the \"hadoop catalog\"\n", + "// The spark hadoop conf can be got from: spark.sparkContext.hadoopConfiguration\n", + "// Here we make the Catalog, it's kind of funky. Spark also has methods which return tables but they're Spark tables so\n", + "// which aren't the type we want\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/hadoop/HadoopCatalog.html\n", + "val catalog = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c55dc276-035f-40d4-9a47-bd4698f2519d", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to load the table. To do that we need to make a TableIdentifier of the same table we wrote to. Note it'll just be\n", + "// the table name no need for the \"local\" prefix.\n", + "// See https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/catalog/TableIdentifier.html\n", + "val name = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ea4b1cc-bd1b-42b4-bdbe-27625b461db9", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the table\n", + "val table = catalog.loadTable(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd1c6add-d465-4b81-9c34-6c8f40197ab2", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to get the snapshots from the table. There are a few different ways we can do this:\n", + "// 1) Using the Iceberg Table API (see https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html)\n", + "// 2) Using the Iceberg + Spark SQL special query interface https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html\n", + "val snapshots = ..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a96986d-b3a5-49ad-aeac-a492bf3fc8e6", + "metadata": {}, + "outputs": [], + "source": [ + "val snapshot = snapshots(0).snapshotId()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c6cb85-ff64-405f-ae6a-7e3c917ac12a", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotQuery = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f93516ad-3ae9-4bb6-989f-7c127f82143c", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotId = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15e67d1b-1e9e-45a0-af94-1c9c79e03d54", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4829752b-dc30-49db-93ae-911f1c2743c1", + "metadata": {}, + "outputs": [], + "source": [ + "// And the files!\n", + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f262d890-0818-410a-aec8-2986a04ae16e", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets take a quick look and see\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7369bcc8-a738-48dc-a475-55885d4460cc", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DELETE FROM local.uk_gender_pay_data WHERE isnull(responsibleperson)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d279f3-f2a5-4ddf-a56f-d473b0c28b97", + "metadata": {}, + "outputs": [], + "source": [ + "// Make sure the data is gone\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b6902ef-b742-466d-b4c8-d6830ff67cf4", + "metadata": {}, + "outputs": [], + "source": [ + "// Yay! 
ok now lets travel back in time\n", + "// We can do this with SQL or with a read option\n", + "// SQL: https://iceberg.apache.org/docs/nightly/spark-queries/#sql" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e899a1-d2cd-4e25-b142-e69fb9ca6774", + "metadata": {}, + "outputs": [], + "source": [ + "// DF: https://iceberg.apache.org/docs/nightly/spark-queries/#dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8884a8a2-bbb7-47b1-85f6-744c60612dcb", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f53692b-f14a-4df7-8069-147eca8da0cd", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data_postcode\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8deb38c3-1e64-4eba-ac80-c75d5674258b", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out partitioned to do this we'll want to use the SQL interface so we can use the truncate function\n", + "// since the regular Scala API doesn't support partioning by things besides raw keys.\n", + "https://iceberg.apache.org/docs/1.5.1/spark-ddl/#partitioned-by" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e87e6b08-5c0e-4356-a0ee-7245b7d7790b", + "metadata": {}, + "outputs": [], + "source": [ + "// Inspect the files again. This should look familiar ish\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data_postcode.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71569b2e-7def-42a4-bf3e-69ee9667a41d", + "metadata": {}, + "outputs": [], + "source": [ + "// Add some more data, we've got 2022, 2023 , & 2024\n", + "// Make sure to use the append mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5be0f8c7-2926-4bf6-bc9d-c02a15648e83", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb6c4b7d-f8d1-41f7-b014-c6434bbb6d48", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_metadata_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/metadata/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfb116e4-c66d-4027-80a3-e7de9ad62ee0", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586bdb3c-19f0-4a63-b87f-d181e8c44c06", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22351ea4-8cb7-43c2-b205-4554d0b15aca", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.spark.actions.SparkActions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928f9da9-d65b-4d53-b818-82a27f8171a2", + "metadata": {}, + "outputs": [], + "source": [ + "// So far the logging has been... 
verbose but interesting, but the next stages it's actually too much\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "807193d8-8ff5-4a9c-b6ae-510ee0bb2f84", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok let's try and compact things down a little bit.\n", + "// You should look at SparkActions & use the rewrite data files operation.\n", + "// Consider specifying rewrite-all to true to force rewrites\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/spark/actions/SparkActions.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e4a013d-1af5-4dd8-82c1-5115905f3feb", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c40db89-7ce1-40ed-a111-1395e5b75a0a", + "metadata": {}, + "outputs": [], + "source": [ + "// Interesting. Note it _added_ a new file but the old files are all still there. That's kind of expected/ok since if we look at the\n", + "// files actually currently used it's just the new one\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9198c74b-87d5-42b0-9987-587095848282", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the old snapshots but keep the latest one.\n", + "// This produces _so much logging_ by default that running in the NB would be slow (that's why we set the log level to error)\n", + "// Here your going to want to use the expireSnapshots action.\n", + "// Note: if you _just set_ retainLast it will keep all snapshots, retain last is like a safety mechanism that keeps the last K\n", + "// snapshots. To get rid of everything except the last expire everything older than right now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be51d1ca-a105-407f-ac3c-41c0f9258891", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_and_expired_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18825715-ced8-401f-b7b3-9ea682d38757", + "metadata": {}, + "outputs": [], + "source": [ + "// Table is in an inconsistent state here, this is not \"good\" but YOLO\n", + "// spark.sql(\"REFRESH local.uk_gender_pay_data\").show()\n", + "// spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa8644af-5604-4147-8546-f65e749b8253", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1983f2-2fe7-4e43-a78e-40fd1c7577fd", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the orphaned files\n", + "SparkActions.get().deleteOrphanFiles(table).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73b3e2ca-555b-467c-a253-d96aab32e27b", + "metadata": {}, + "outputs": [], + "source": [ + "val cleaned_and_compacted_file_list = \"ls ../warehouse/uk_gender_pay_data/data/\".!!" 
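As with the catalog cells earlier, the maintenance exercises above (time travel, compaction with `rewriteDataFiles`, snapshot expiration) are filled in by the solution notebook that follows. Roughly, assuming the `table` handle and `snapshot` id from the earlier cells:

```scala
import org.apache.iceberg.spark.actions.SparkActions

// Time travel: SQL VERSION AS OF, or the snapshot-id read option.
spark.sql(s"SELECT * FROM local.uk_gender_pay_data VERSION AS OF $snapshot").show()
spark.read.option("snapshot-id", snapshot.toString).table("local.uk_gender_pay_data").show()

// Compaction: rewrite the data files, forcing a rewrite of everything.
SparkActions.get()
  .rewriteDataFiles(table)
  .option("target-file-size-bytes", (512L * 1024L * 1024L).toString)
  .option("rewrite-all", "true")
  .execute()

// Expire everything older than "now"; retainLast(1) is the safety net that
// keeps at least the most recent snapshot around.
SparkActions.get()
  .expireSnapshots(table)
  .expireOlderThan(System.currentTimeMillis())
  .retainLast(1)
  .execute()
```

The template's `deleteOrphanFiles` cell just above then cleans up files left on disk that no retained snapshot references.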
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "921d0c02-1fb7-43ec-ac0a-b5d1c3a40c3d", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8ff0a3-8c6e-4d67-8afb-d1541c7e6dbd", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets go take a look at a quick side-by-side test\n", + "//cd /high-performance-spark-examples/spark-upgrade/;./e2e_demo/scala/run_demo.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d47a57-3bfa-484a-90ed-0231a17a7205", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok, let's try branching! Note: requires very recent Iceberg, so if you're doing this elsewhere might not be a party\n", + "// Relevant docs: https://iceberg.apache.org/docs/nightly/spark-ddl/#branching-and-tagging-ddl\n", + "// https://iceberg.apache.org/docs/nightly/spark-queries/#sql" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala", + "name": "scala2.13" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".sc", + "mimetype": "text/x-scala", + "name": "scala", + "nbconvert_exporter": "script", + "version": "2.13.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/iceberg-workshop-solutions/Workshop.ipynb b/iceberg-workshop-solutions/Workshop.ipynb new file mode 100644 index 00000000..ebd12637 --- /dev/null +++ b/iceberg-workshop-solutions/Workshop.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "34577ad3-822f-4370-bcba-56b9fcec3196", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.spark.sql._\n", + "import scala.sys.process._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d3141ec-7779-467a-9f76-2e51030fd1c7", + "metadata": {}, + "outputs": [], + "source": [ + "// So now we need to configure Spark to use Iceberg\n", + "// See https://iceberg.apache.org/docs/1.6.0/spark-configuration/ & https://iceberg.apache.org/docs/1.6.0/spark-getting-started/\n", + "// We'll use the \"hadoop\" (aka file) catalog & /high-performance-spark-examples/warehouse for the location\n", + "val spark = (SparkSession.builder.master(\"local[*]\")\n", + " // Setup the extensions\n", + " .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", + " .config(\"spark.sql.catalog.local\", \"org.apache.iceberg.spark.SparkCatalog\")\n", + " .config(\"spark.sql.catalog.local.type\", \"hadoop\")\n", + " .config(\"spark.sql.catalog.local.warehouse\", \"/high-performance-spark-examples/warehouse\")\n", + " .getOrCreate()\n", + " )\n", + "import spark._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecbdf4a8-3f16-4242-9d89-0ce7835b49e7", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sparkContext.uiWebUrl.get" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "270730c9-9787-407c-ba22-f0cee1f67f53", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the current data\n", + "val df = spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(\"/high-performance-spark-examples/data/fetched/2021\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87ca6359-86bc-42a4-93dd-4fc64496b145", + "metadata": {}, + "outputs": [], + "source": [ + "// Drop existing table 
if present & create new table\n", + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bdeb3eb-b725-409b-ab3a-409d0e8309ae", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out\n", + "df.write.saveAsTable(\"local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554c6036-0c6b-4e3c-a9e1-7251c608b48f", + "metadata": {}, + "outputs": [], + "source": [ + "\"ls /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb541fbf-4a79-402d-a6b2-e999106e9a18", + "metadata": {}, + "outputs": [], + "source": [ + "\"cat /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/v1.metadata.json\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90149834-27a2-45a3-aa8a-dae2162da854", + "metadata": {}, + "outputs": [], + "source": [ + "// Java SDK time imports\n", + "import java.util.HashMap\n", + "import java.util.Map\n", + "\n", + "import org.apache.iceberg.Table\n", + "import org.apache.iceberg.catalog.TableIdentifier\n", + "import org.apache.iceberg.hadoop.HadoopCatalog\n", + "\n", + "\n", + "// And to handle java types\n", + "import scala.jdk.CollectionConverters._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf56bc6-d420-474c-b3a8-ded03b23eff8", + "metadata": {}, + "outputs": [], + "source": [ + "// Create a local Iceberg Catalog client. Here we're using the \"hadoop catalog\"\n", + "// The spark hadoop conf can be got from: spark.sparkContext.hadoopConfiguration\n", + "// Here we make the Catalog, it's kind of funky. Spark also has methods which return tables but they're Spark tables so\n", + "// which aren't the type we want\n", + "val catalog = new HadoopCatalog(spark.sparkContext.hadoopConfiguration, \"/high-performance-spark-examples/warehouse\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c55dc276-035f-40d4-9a47-bd4698f2519d", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to load the table. To do that we need to make a TableIdentifier of the same table we wrote to. Note it'll just be\n", + "// the table name no need for the \"local\" prefix.\n", + "// See https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/catalog/TableIdentifier.html\n", + "val name = TableIdentifier.of(\"uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ea4b1cc-bd1b-42b4-bdbe-27625b461db9", + "metadata": {}, + "outputs": [], + "source": [ + "val table = catalog.loadTable(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd1c6add-d465-4b81-9c34-6c8f40197ab2", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to get the snapshots from the table. 
There are a few different ways we can do this:\n", + "// 1) Using the Iceberg Table API (see https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html)\n", + "// 2) Using the Iceberg + Spark SQL special query interface https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html\n", + "val snapshots = table.snapshots().asScala.toList\n", + "snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a96986d-b3a5-49ad-aeac-a492bf3fc8e6", + "metadata": {}, + "outputs": [], + "source": [ + "val snapshot = snapshots(0).snapshotId()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c6cb85-ff64-405f-ae6a-7e3c917ac12a", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotQuery = spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\")\n", + "altSnapshotQuery.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f93516ad-3ae9-4bb6-989f-7c127f82143c", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotId = spark.sql(\"SELECT snapshot_id FROM local.uk_gender_pay_data.snapshots\").collect()(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15e67d1b-1e9e-45a0-af94-1c9c79e03d54", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d94eb4db-bf03-49be-865a-e80c0613d526", + "metadata": {}, + "outputs": [], + "source": [ + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4829752b-dc30-49db-93ae-911f1c2743c1", + "metadata": {}, + "outputs": [], + "source": [ + "// And the files!\n", + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f262d890-0818-410a-aec8-2986a04ae16e", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets take a quick look and see\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7369bcc8-a738-48dc-a475-55885d4460cc", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DELETE FROM local.uk_gender_pay_data WHERE isnull(responsibleperson)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d279f3-f2a5-4ddf-a56f-d473b0c28b97", + "metadata": {}, + "outputs": [], + "source": [ + "// Make sure the data is gone\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b6902ef-b742-466d-b4c8-d6830ff67cf4", + "metadata": {}, + "outputs": [], + "source": [ + "// Yay! 
ok now lets travel back in time\n", + "// We can do this with SQL or with a read option\n", + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data VERSION AS OF ${snapshot} WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e899a1-d2cd-4e25-b142-e69fb9ca6774", + "metadata": {}, + "outputs": [], + "source": [ + "// Or the same with option + DF syntax\n", + "spark.read.option(\"snapshot-id\", f\"${snapshot}\").table(\"local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8884a8a2-bbb7-47b1-85f6-744c60612dcb", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f53692b-f14a-4df7-8069-147eca8da0cd", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data_postcode\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8deb38c3-1e64-4eba-ac80-c75d5674258b", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out partitioned\n", + "df.registerTempTable(\"temp_table\")\n", + "// We could use the table write semantics but we can't do truncate() on that\n", + "spark.sql(\"CREATE TABLE local.uk_gender_pay_data_postcode USING iceberg PARTITIONED BY (truncate(1, PostCode)) AS select * from temp_table\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e87e6b08-5c0e-4356-a0ee-7245b7d7790b", + "metadata": {}, + "outputs": [], + "source": [ + "// Inspect the files again. This should look familiar ish\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data_postcode.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71569b2e-7def-42a4-bf3e-69ee9667a41d", + "metadata": {}, + "outputs": [], + "source": [ + "val year_dfs = 2022.to(2023).map(r => spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(s\"/high-performance-spark-examples/data/fetched/${r}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c4441cf-fd65-4a29-94fb-6d3aa927f6b1", + "metadata": {}, + "outputs": [], + "source": [ + "List(\"local.uk_gender_pay_data\", \"local.uk_gender_pay_data_postcode\").foreach(table => year_dfs.foreach(df => df.write.mode(\"append\").saveAsTable(table)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5be0f8c7-2926-4bf6-bc9d-c02a15648e83", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb6c4b7d-f8d1-41f7-b014-c6434bbb6d48", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_metadata_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/metadata/\".!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfb116e4-c66d-4027-80a3-e7de9ad62ee0", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586bdb3c-19f0-4a63-b87f-d181e8c44c06", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22351ea4-8cb7-43c2-b205-4554d0b15aca", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.spark.actions.SparkActions\n", + "// Iceberg actions\n", + "import org.apache.iceberg.actions.Action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928f9da9-d65b-4d53-b818-82a27f8171a2", + "metadata": {}, + "outputs": [], + "source": [ + "// So far the logging has been... verbose but interesting, but the next stages it's actually too much\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "807193d8-8ff5-4a9c-b6ae-510ee0bb2f84", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok let's try and compact things down a little bit.\n", + "// You should look at SparkActions & use the rewrite data files operation.\n", + "// Consider specifying rewrite-all to true to force rewrites\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/spark/actions/SparkActions.html\n", + "SparkActions.get().rewriteDataFiles(table).option(\"target-file-size-bytes\", (512L*1024L*1024L).toString).option(\"rewrite-all\", \"true\").execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e4a013d-1af5-4dd8-82c1-5115905f3feb", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c40db89-7ce1-40ed-a111-1395e5b75a0a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9198c74b-87d5-42b0-9987-587095848282", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the old snapshots but keep the latest one.\n", + "// This produces _so much logging_ by default that running in the NB would be slow (that's why we set the log level to error)\n", + "// Here your going to want to use the expireSnapshots action.\n", + "// Note: if you _just set_ retainLast it will keep all snapshots, retain last is like a safety mechanism that keeps the last K\n", + "// snapshots. To get rid of everything except the last expire everything older than right now.\n", + "SparkActions.get().expireSnapshots(table).expireOlderThan(System.currentTimeMillis()).retainLast(1).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be51d1ca-a105-407f-ac3c-41c0f9258891", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_and_expired_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" 
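
The compaction and snapshot-expiry steps above go through the SparkActions Java API; Iceberg exposes the same maintenance as SQL stored procedures, which is convenient from spark-sql or a pure-SQL scheduler. A minimal sketch against the same table (procedure and option names follow the Iceberg documentation; the 512 MB target size mirrors the action above):

// Compact data files via the rewrite_data_files procedure.
spark.sql("""
  CALL local.system.rewrite_data_files(
    table => 'local.uk_gender_pay_data',
    options => map('target-file-size-bytes', '536870912', 'rewrite-all', 'true'))
""").show()

// Expire everything older than right now, keeping at least the latest snapshot.
val cutoff = new java.sql.Timestamp(System.currentTimeMillis())
spark.sql(s"""
  CALL local.system.expire_snapshots(
    table => 'local.uk_gender_pay_data',
    older_than => TIMESTAMP '$cutoff',
    retain_last => 1)
""").show()
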
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18825715-ced8-401f-b7b3-9ea682d38757", + "metadata": {}, + "outputs": [], + "source": [ + "// Table is in an inconsistent state here, this is not \"good\"\n", + "spark.sql(\"REFRESH local.uk_gender_pay_data\").show()\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa8644af-5604-4147-8546-f65e749b8253", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1983f2-2fe7-4e43-a78e-40fd1c7577fd", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the orphaned files\n", + "SparkActions.get().deleteOrphanFiles(table).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73b3e2ca-555b-467c-a253-d96aab32e27b", + "metadata": {}, + "outputs": [], + "source": [ + "val cleaned_and_compacted_file_list = \"ls ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "921d0c02-1fb7-43ec-ac0a-b5d1c3a40c3d", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8ff0a3-8c6e-4d67-8afb-d1541c7e6dbd", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets go take a look at a quick side-by-side test\n", + "//cd /high-performance-spark-examples/spark-upgrade/;./e2e_demo/scala/run_demo.sh\n", + "//That'll be easier to run in a terminal than the .!! trick we've been doing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d47a57-3bfa-484a-90ed-0231a17a7205", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok, let's try branching! Note: requires very recent Iceberg, so if you're doing this elsewhere might not be a party\n", + "// Relevant docs: https://iceberg.apache.org/docs/nightly/spark-ddl/#branching-and-tagging-ddl\n", + "// https://iceberg.apache.org/docs/nightly/spark-queries/#sql\n", + "spark.sql(\"ALTER TABLE local.uk_gender_pay_data CREATE BRANCH IF NOT EXISTS `new-software-branch`\")\n", + "spark.sql(\"DELETE FROM local.uk_gender_pay_data.`branch_new-software-branch` WHERE isnull(DueDate)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "128591e9-fc12-4791-8797-901ce2f1c6b7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala", + "name": "scala2.13" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".sc", + "mimetype": "text/x-scala", + "name": "scala", + "nbconvert_exporter": "script", + "version": "2.13.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/migration/sql.sh b/migration/sql.sh new file mode 100644 index 00000000..3d94f07e --- /dev/null +++ b/migration/sql.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +pip install sqlfluff +python -m pip install 'sqlfluff-plugin-sparksql-upgrade @ git+https://github.com/holdenk/spark-upgrade#subdirectory=sql' + +sqlfluff rules |grep -i spark +sqlfluff fix --dialect sparksql farts.sql diff --git a/misc/container_launch.sh b/misc/container_launch.sh new file mode 100755 index 00000000..31f0edbb --- /dev/null +++ b/misc/container_launch.sh @@ -0,0 +1,5 @@ +#!/bin/bash +if [ ! 
-f /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb ]; then + cp /high-performance-spark-examples/iceberg-workshop-solutions/Workshop-Template.ipynb /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb +fi +jupyter-lab --ip 0.0.0.0 --port 8877 diff --git a/misc/kernel.json b/misc/kernel.json new file mode 100644 index 00000000..5812f16a --- /dev/null +++ b/misc/kernel.json @@ -0,0 +1,19 @@ +{ + "argv": [ + "java", + "-cp", + "/home/dev/.local/share/jupyter/kernels/scala2.13/launcher.jar:.:/high-performance-spark-examples/:/high-performance-spark-examples/target/scala-2.13/home/dev/.local/share/jupyter/kernels/scala/launcher.jar:/high-performance-spark-examples/spark-3.5.2-bin-hadoop3-scala2.13/jars/*", + "coursier.bootstrap.launcher.Launcher", + "--log", + "info", + "--metabrowse", + "--id", + "scala2.13", + "--display-name", + "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "--connection-file", + "{connection_file}" + ], + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala" +} diff --git a/native/src/CMakeLists.txt b/native/src/CMakeLists.txt new file mode 100644 index 00000000..e9766458 --- /dev/null +++ b/native/src/CMakeLists.txt @@ -0,0 +1,71 @@ +################################################################ +# A minimal CMake file that is compatible with sbt-jni # +# # +# All settings required by sbt-jni have been marked so, please # +# add/modify/remove settings to build your specific library. # +################################################################ + +cmake_minimum_required(VERSION 3.12) + +option(SBT "Set if invoked from sbt-jni" OFF) + +# Define project and related variables +# (required by sbt-jni) please use semantic versioning +# +project (high-performance-spark) +enable_language(Fortran) +set(PROJECT_VERSION_MAJOR 0) +set(PROJECT_VERSION_MINOR 0) +set(PROJECT_VERSION_PATCH 0) + +set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) + +#tag::velox[] +set (GLUTEN_LIB_NAME ${PROJECT_NAME}-gluten-${PROJECT_VERSION_MAJOR}) +# For gluten+velox, you can leave out if not using gluten +set(GLUTEN_HOME ../../gluten) +set(CMAKE_FIND_DEBUG_MODE TRUE) +find_library(VELOX_LIBRARY NAMES velox HINTS + ${GLUTEN_HOME}/cpp/build/releases NO_DEFAULT_PATH) +# End gluten specific + +if(VELOX_LIBRARY) + file(GLOB GLUTEN_UDF_FILES + "./c/gluten/*.cpp") + add_library(${GLUTEN_LIB_NAME} SHARED ${GLUTEN_UDF_FILES}) + target_include_directories(${GLUTEN_LIB_NAME} PRIVATE ${GLUTEN_HOME}/cpp ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) + target_link_libraries(${GLUTEN_LIB_NAME} PRIVATE ${VELOX_LIBRARY}) +else() + message(WARNING "Velox library not found. Specific path not added.") +endif() +#end::velox[] + +# Setup JNI +find_package(JNI REQUIRED) +if (JNI_FOUND) + message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") +endif() + +# Include directories +include_directories(.) +include_directories(include) +include_directories(${JNI_INCLUDE_DIRS}) + +# Sources +file(GLOB LIB_SRC + "*.c" + "*.f95" + "*.f*" + "*.cc" + "*.cpp" + "./c/*.c" + "./c/*.cpp" + "./fortran/*.f95" + "./fortran/*.f*" +) + +# Setup installation targets +# (required by sbt-jni) major version should always be appended to library name +# +add_library(${LIB_NAME} SHARED ${LIB_SRC}) +install(TARGETS ${LIB_NAME} LIBRARY DESTINATION .) 
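
The CMake file above builds the JNI shared library that sbt-jni packages; the Scala half is a class with native method declarations whose generated JNI header the C sources have to match (the renamed com_highperformancespark_examples_ffi_SumJNI.h). A minimal, hypothetical sketch of such a wrapper — the repo's real SumJNI signature and sbt-jni loader annotation may differ from what is shown here:

package com.highperformancespark.examples.ffi

// Hypothetical wrapper for illustration only; the repo's actual entry point is
// SumJNI, and sbt-jni's loader normally extracts and loads the library from the
// jar. Plain System.loadLibrary is used here instead, which expects
// libhigh-performance-spark0.so on java.library.path (the name comes from
// ${PROJECT_NAME}${PROJECT_VERSION_MAJOR} in the CMakeLists.txt above).
object NativeSumSketch {
  System.loadLibrary("high-performance-spark0")

  // Declared native; the JNI header generated from this declaration is what
  // the C implementation (e.g. a sum_wrapper.c-style stub) has to match.
  @native def sum(values: Array[Int]): Int
}
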
diff --git a/native/src/c/gluten/GlutenUDF.cpp b/native/src/c/gluten/GlutenUDF.cpp new file mode 100644 index 00000000..14019f4a --- /dev/null +++ b/native/src/c/gluten/GlutenUDF.cpp @@ -0,0 +1,82 @@ +// Filename MyUDF.cpp + +#include +#include +#include + + +namespace { +using namespace facebook::velox; + +template +class PlusConstantFunction : public exec::VectorFunction { + public: + explicit PlusConstantFunction(int32_t addition) : addition_(addition) {} + + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& /* outputType */, + exec::EvalCtx& context, + VectorPtr& result) const override { + using nativeType = typename TypeTraits::NativeType; + VELOX_CHECK_EQ(args.size(), 1); + + auto& arg = args[0]; + + // The argument may be flat or constant. + VELOX_CHECK(arg->isFlatEncoding() || arg->isConstantEncoding()); + + BaseVector::ensureWritable(rows, createScalarType(), context.pool(), result); + + auto* flatResult = result->asFlatVector(); + auto* rawResult = flatResult->mutableRawValues(); + + flatResult->clearNulls(rows); + + if (arg->isConstantEncoding()) { + auto value = arg->as>()->valueAt(0); + rows.applyToSelected([&](auto row) { rawResult[row] = value + addition_; }); + } else { + auto* rawInput = arg->as>()->rawValues(); + + rows.applyToSelected([&](auto row) { rawResult[row] = rawInput[row] + addition_; }); + } + } + + private: + const int32_t addition_; +}; + +static std::vector> integerSignatures() { + // integer -> integer + return {exec::FunctionSignatureBuilder().returnType("integer").argumentType("integer").build()}; +} + +static std::vector> bigintSignatures() { + // bigint -> bigint + return {exec::FunctionSignatureBuilder().returnType("bigint").argumentType("bigint").build()}; +} + +} // namespace + +const int kNumMyUdf = 2; +gluten::UdfEntry myUdf[kNumMyUdf] = {{"myudf1", "integer"}, {"myudf2", "bigint"}}; + +DEFINE_GET_NUM_UDF { + return kNumMyUdf; +} + +DEFINE_GET_UDF_ENTRIES { + for (auto i = 0; i < kNumMyUdf; ++i) { + udfEntries[i] = myUdf[i]; + } +} + +DEFINE_REGISTER_UDF { + facebook::velox::exec::registerVectorFunction( + "myudf1", integerSignatures(), std::make_unique>(5)); + facebook::velox::exec::registerVectorFunction( + "myudf2", bigintSignatures(), std::make_unique>(5)); + std::cout << "registered myudf1, myudf2" << std::endl; +} diff --git a/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h b/native/src/c/include/com_highperformancespark_examples_ffi_SumJNI.h similarity index 100% rename from src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h rename to native/src/c/include/com_highperformancespark_examples_ffi_SumJNI.h diff --git a/src/main/c/sum.c b/native/src/c/sum.c similarity index 100% rename from src/main/c/sum.c rename to native/src/c/sum.c diff --git a/src/main/c/sum.h b/native/src/c/sum.h similarity index 100% rename from src/main/c/sum.h rename to native/src/c/sum.h diff --git a/src/main/c/sum_wrapper.c b/native/src/c/sum_wrapper.c similarity index 100% rename from src/main/c/sum_wrapper.c rename to native/src/c/sum_wrapper.c diff --git a/src/main/c/sumf_wrapper.c b/native/src/c/sumf_wrapper.c similarity index 100% rename from src/main/c/sumf_wrapper.c rename to native/src/c/sumf_wrapper.c diff --git a/src/main/fortran/sumf.f95 b/native/src/fortran/sumf.f95 similarity index 100% rename from src/main/fortran/sumf.f95 rename to native/src/fortran/sumf.f95 diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 00000000..04267b14 --- /dev/null +++ 
b/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.9.9 diff --git a/project/plugins.sbt b/project/plugins.sbt index 26c430ed..8cfbf42a 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,22 +1,27 @@ -addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") +addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/" -//tag::addSparkPackagesPlugin[] -resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.2") -addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.5") -//end::addSparkPackagesPlugin[] +addDependencyTreePlugin -//addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") - -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.0") +//tag::scalaFix[] +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.12.1") +//end::scalaFix[] //tag::sbtJNIPlugin[] -addSbtPlugin("ch.jodersky" %% "sbt-jni" % "1.0.0-RC3") +addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.7.0") //end::sbtJNIPlugin[] -addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") +//tag::xmlVersionConflict[] +// See https://github.com/scala/bug/issues/12632 +ThisBuild / libraryDependencySchemes ++= Seq( + "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always +) +//end::xmlVersionConflict[] + +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0") diff --git a/python/.flake8 b/python/.flake8 new file mode 100644 index 00000000..79a16af7 --- /dev/null +++ b/python/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 \ No newline at end of file diff --git a/python/README.md b/python/README.md new file mode 100644 index 00000000..3cf58309 --- /dev/null +++ b/python/README.md @@ -0,0 +1 @@ +Python examples for High Performance Spark diff --git a/high_performance_pyspark/SQLLineage.py b/python/examples/SQLLineage.py similarity index 58% rename from high_performance_pyspark/SQLLineage.py rename to python/examples/SQLLineage.py index 121f0b40..26bd0c4a 100644 --- a/high_performance_pyspark/SQLLineage.py +++ b/python/examples/SQLLineage.py @@ -1,3 +1,13 @@ +from pyspark.sql import DataFrame, Row +from pyspark.sql.session import SparkSession +import sys + +global df +global sc +global rdd +global spark + + """ >>> df = rdd.toDF() >>> df2 = cutLineage(df) @@ -7,20 +17,12 @@ True """ -global df -global sc -global rdd -global spark - -from pyspark.context import SparkContext -from pyspark.sql import DataFrame, Row -from pyspark.sql.session import SparkSession # tag::cutLineage[] def cutLineage(df): """ Cut the lineage of a DataFrame - used for iterative algorithms - + .. 
Note: This uses internal members and may break between versions >>> df = rdd.toDF() >>> cutDf = cutLineage(df) @@ -30,43 +32,48 @@ def cutLineage(df): jRDD = df._jdf.toJavaRDD() jSchema = df._jdf.schema() jRDD.cache() - sqlCtx = df.sql_ctx - try: - javaSqlCtx = sqlCtx._jsqlContext - except: - javaSqlCtx = sqlCtx._ssql_ctx - newJavaDF = javaSqlCtx.createDataFrame(jRDD, jSchema) - newDF = DataFrame(newJavaDF, sqlCtx) + session = df.sparkSession + javaSparkSession = session._jsparkSession + newJavaDF = javaSparkSession.createDataFrame(jRDD, jSchema) + newDF = DataFrame(newJavaDF, session) return newDF + + # end::cutLineage[] + def _setupTest(): globs = globals() - spark = SparkSession.builder \ - .master("local[4]") \ - .getOrCreate() + spark = SparkSession.builder.master("local[4]").getOrCreate() sc = spark._sc sc.setLogLevel("ERROR") - globs['sc'] = sc - globs['spark'] = spark - globs['rdd'] = rdd = sc.parallelize( - [Row(field1=1, field2="row1"), - Row(field1=2, field2="row2"), - Row(field1=3, field2="row3")]) + globs["sc"] = sc + globs["spark"] = spark + globs["rdd"] = sc.parallelize( + [ + Row(field1=1, field2="row1"), + Row(field1=2, field2="row2"), + Row(field1=3, field2="row3"), + ] + ) return globs + def _test(): """ Run the tests. """ import doctest + globs = _setupTest() - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS + ) + globs["sc"].stop() if failure_count: exit(-1) -import sys + if __name__ == "__main__": _test() # Hack to support running in nose diff --git a/python/examples/__init__.py b/python/examples/__init__.py new file mode 100644 index 00000000..80db2c40 --- /dev/null +++ b/python/examples/__init__.py @@ -0,0 +1 @@ +__version__ = 0.2 diff --git a/high_performance_pyspark/bad_pyspark.py b/python/examples/bad_pyspark.py similarity index 81% rename from high_performance_pyspark/bad_pyspark.py rename to python/examples/bad_pyspark.py index 46741dc9..083fbdd6 100644 --- a/high_performance_pyspark/bad_pyspark.py +++ b/python/examples/bad_pyspark.py @@ -1,10 +1,11 @@ # This script triggers a number of different PySpark errors -from pyspark import * from pyspark.sql.session import SparkSession +import sys global sc + def nonExistentInput(sc): """ Attempt to load non existent input @@ -18,6 +19,7 @@ def nonExistentInput(sc): failedRdd.count() # end::nonExistent[] + def throwOuter(sc): """ Attempt to load non existant input @@ -33,6 +35,7 @@ def throwOuter(sc): transform2.count() # end::throwOuter[] + def throwInner(sc): """ Attempt to load non existant input @@ -48,6 +51,7 @@ def throwInner(sc): transform2.count() # end::throwInner[] + # tag::rewrite[] def add1(x): """ @@ -57,6 +61,7 @@ def add1(x): """ return x + 1 + def divZero(x): """ Divide by zero (cause an error) @@ -67,6 +72,7 @@ def divZero(x): """ return x / 0 + def throwOuter2(sc): """ Attempt to load non existant input @@ -80,6 +86,7 @@ def throwOuter2(sc): transform2 = transform1.map(divZero) transform2.count() + def throwInner2(sc): """ Attempt to load non existant input @@ -92,8 +99,11 @@ def throwInner2(sc): transform1 = data.map(divZero) transform2 = transform1.map(add1) transform2.count() + + # end::rewrite[] + def throwInner3(sc): """ Attempt to load non existant input @@ -102,14 +112,17 @@ def throwInner3(sc): """ data = sc.parallelize(range(10)) rejectedCount = sc.accumulator(0) + def loggedDivZero(x): import logging + try: return [x / 0] except Exception 
as e: rejectedCount.add(1) logging.warning("Error found " + repr(e)) return [] + transform1 = data.flatMap(loggedDivZero) transform2 = transform1.map(add1) transform2.count() @@ -118,45 +131,51 @@ def loggedDivZero(x): def runOutOfMemory(sc): """ - Run out of memory on the workers. - In standalone modes results in a memory error, but in YARN may trigger YARN container - overhead errors. - >>> runOutOfMemory(sc) + Run out of memory on the workers from a skewed shuffle. + >>> runOutOfMemory(sc) # doctest: +SKIP Traceback (most recent call last): ... Py4JJavaError:... """ # tag::worker_oom[] - data = sc.parallelize(range(10)) - def generate_too_much(itr): - return range(10000000000000) - itr = data.flatMap(generate_too_much) - itr.count() + data = sc.parallelize(range(10000)) + + def generate_too_much(i: int): + return list(map(lambda v: (i % 2, v), range(100000 * i))) + + bad = data.flatMap(generate_too_much).groupByKey() + bad.count() # end::worker_oom[] + def _setupTest(): globs = globals() - spark = SparkSession.builder \ - .master("local[4]") \ - .getOrCreate() + spark = SparkSession.builder.master("local[4]").getOrCreate() sc = spark._sc - globs['sc'] = sc + globs["sc"] = sc return globs - + + def _test(): """ - Run the tests. - Note this will print a lot of error message to stderr since we don't capture the JVM sub process - stdout/stderr for doctests. + Run the tests. + Note this will print a lot of error message to stderr since we don't + capture the JVM sub process stdout/stderr for doctests. """ import doctest - globs = setupTest() - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() + + globs = _setupTest() + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS + ) + print("All tests done, stopping Spark context.") + globs["sc"].stop() if failure_count: exit(-1) + else: + exit(0) + -import sys if __name__ == "__main__": _test() # Hack to support running in nose diff --git a/python/examples/bad_pyspark.py.fail b/python/examples/bad_pyspark.py.fail new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/dual_write.py b/python/examples/dual_write.py new file mode 100644 index 00000000..94f27157 --- /dev/null +++ b/python/examples/dual_write.py @@ -0,0 +1,22 @@ +import asyncactions # noqa # pylint: disable=unused-import + + +class DualWriteExample: + def do_write(self, df, p1, p2): + """ + Apply two concrete actions to a DataFrame in parallel. + A common use case is two views of the same data, normally + one with sensitive data and one scrubbed/clean. + """ + # First we "persist" it (you can also checkpoint or choose a different + # level of persistence. + df.persist() + df.count() + # Create the distinct "safe" view. + df1 = df.select("times") + # Start the async actions + async1 = df1.write.mode("append").format("parquet").saveAsync(p1) + async2 = df.write.mode("append").format("parquet").saveAsync(p2) + # Block until the writes are both finished. 
+ async1.result() + async2.result() diff --git a/python/examples/load_previous_run_data.py b/python/examples/load_previous_run_data.py new file mode 100644 index 00000000..d9277682 --- /dev/null +++ b/python/examples/load_previous_run_data.py @@ -0,0 +1,31 @@ +import os +import tempfile + + +class LoadPreviousRunData(object): + def __init__(self, session): + self.session = session + + def find_oldest_id(self, local_path): + """Find the oldest Spark job since it's probably not being updated.""" + directories = os.listdir(local_path) + return min(directories, key=lambda x: os.path.getmtime(f"{local_path}/{x}")) + + def do_magic(self): + local_path = "/tmp/spark-events" + event_log_path = f"file://{local_path}" + application_id = self.find_oldest_id(local_path) + return self.load_json_records(event_log_path, application_id) + + # tag::load[] + def load_json_records(self, event_log_path, application_id): + print(f"Loading {application_id}") + full_log_path = f"{event_log_path}/{application_id}" + df = self.session.read.json(full_log_path) + special_events = df.filter( + (df["Event"] == "SparkListenerExecutorAdded") + | (df["Event"] == "SparkListenerJobEnd") + ) + special_events.show() + + # end::load[] diff --git a/python/examples/pandera_ex.py b/python/examples/pandera_ex.py new file mode 100644 index 00000000..f3afa7c9 --- /dev/null +++ b/python/examples/pandera_ex.py @@ -0,0 +1,52 @@ +from pyspark.sql.session import SparkSession + +# tag::pandera_imports[] +import pandera.pyspark as pa +import pyspark.sql.types as T + +# end::pandera_imports[] + + +# tag::simple_data_schema[] +class ProjectDataSchema(pa.DataFrameModel): + # Note str_length is currently broken :/ + creator: T.StringType() = pa.Field(str_length={"min_value": 1}) + projectname: T.StringType() = pa.Field() + stars: T.IntegerType() = pa.Field(ge=0) + + +# end::simple_data_schema[] + + +# tag::gender_data[] +class GenderData(pa.DataFrameModel): + MaleBonusPercent: T.DoubleType() = pa.Field(nullable=True, le=5) + FemaleBonusPercent: T.DoubleType() = pa.Field(nullable=True) + CompanyNumber: T.IntegerType() = pa.Field() + + +# end::gender_data[] + +if __name__ == "__main__": + spark = SparkSession.builder.master("local[4]").getOrCreate() + # Make sure to make + # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" + # available as ./data/2021 + uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + + # tag::validate_gender_data[] + validated_df = GenderData(uk_df) + # Print out the errors. You may wish to exit with an error condition. + if validated_df.pandera.errors != {}: + print(validated_df.pandera.errors) + # sys.exit(1) + # end::validate_gender_data[] + + # tag::validate_project_data[] + project_data = spark.read.csv("./data/project.csv", header=True, inferSchema=True) + validated_df = ProjectDataSchema(project_data) + # Print out the errors. You may wish to exit with an error condition. 
+ if validated_df.pandera.errors != {}: + print(validated_df.pandera.errors) + # sys.exit(1) + # end::validate_project_data[] diff --git a/high_performance_pyspark/simple_perf.py b/python/examples/simple_perf.py similarity index 61% rename from high_performance_pyspark/simple_perf.py rename to python/examples/simple_perf.py index 773ad3e0..1c725255 100644 --- a/high_performance_pyspark/simple_perf.py +++ b/python/examples/simple_perf.py @@ -1,14 +1,19 @@ # When running this example make sure to include the built Scala jar : -# $SPARK_HOME/bin/pyspark --jars ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar +# +# $SPARK_HOME/bin/pyspark --jars \ +# ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar +# # This example illustrates how to interface Scala and Python code, but caution # should be taken as it depends on many private members that may change in # future releases of Spark. -from pyspark.sql.types import * -from pyspark.sql import * +from pyspark.sql.types import StructType, IntegerType, DoubleType, StructField +from pyspark.sql import DataFrame, SparkSession +import sys import timeit import time + def generate_scale_data(sqlCtx, rows, numCols): """ Generate scale data for the performance test. @@ -28,14 +33,7 @@ def generate_scale_data(sqlCtx, rows, numCols): """ # tag::javaInterop[] sc = sqlCtx._sc - # Get the SQL Context, 2.1, 2.0 and pre-2.0 syntax - yay internals :p - try: - try: - javaSqlCtx = sqlCtx._jsqlContext - except: - javaSqlCtx = sqlCtx._ssql_ctx - except: - javaSqlCtx = sqlCtx._jwrapped + javaSparkSession = sqlCtx._jsparkSession jsc = sc._jsc scalasc = jsc.sc() gateway = sc._gateway @@ -45,21 +43,17 @@ def generate_scale_data(sqlCtx, rows, numCols): # This returns a Java RDD of Rows - normally it would better to # return a DataFrame directly, but for illustration we will work # with an RDD of Rows. - java_rdd = (gateway.jvm.com.highperformancespark.examples. - tools.GenerateScalingData. - generateMiniScaleRows(scalasc, rows, numCols)) + java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData.generateMiniScaleRows( + scalasc, rows, numCols + ) # Schemas are serialized to JSON and sent back and forth # Construct a Python Schema and turn it into a Java Schema - schema = StructType([ - StructField("zip", IntegerType()), - StructField("fuzzyness", DoubleType())]) - # 2.1 / pre-2.1 - try: - jschema = javaSqlCtx.parseDataType(schema.json()) - except: - jschema = sqlCtx._jsparkSession.parseDataType(schema.json()) + schema = StructType( + [StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())] + ) + jschema = javaSparkSession.parseDataType(schema.json()) # Convert the Java RDD to Java DataFrame - java_dataframe = javaSqlCtx.createDataFrame(java_rdd, jschema) + java_dataframe = javaSparkSession.createDataFrame(java_rdd, jschema) # Wrap the Java DataFrame into a Python DataFrame python_dataframe = DataFrame(java_dataframe, sqlCtx) # Convert the Python DataFrame into an RDD @@ -67,19 +61,25 @@ def generate_scale_data(sqlCtx, rows, numCols): return (python_dataframe, pairRDD) # end::javaInterop[] + def runOnDF(df): result = df.groupBy("zip").avg("fuzzyness").count() return result + def runOnRDD(rdd): - result = rdd.map(lambda (x, y): (x, (y, 1))). \ - reduceByKey(lambda x, y: (x[0] + y [0], x[1] + y[1])). 
\ - count() + result = ( + rdd.map(lambda xy: (xy[0], (xy[1], 1))) + .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) + .count() + ) return result + def groupOnRDD(rdd): return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count() + def run(sc, sqlCtx, scalingFactor, size): """ Run the simple perf test printing the results to stdout. @@ -98,17 +98,30 @@ def run(sc, sqlCtx, scalingFactor, size): """ (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size) input_rdd.cache().count() - rddTimeings = timeit.repeat(stmt=lambda: runOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') - groupTimeings = timeit.repeat(stmt=lambda: groupOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + rddTimeings = timeit.repeat( + stmt=lambda: runOnRDD(input_rdd), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) + groupTimeings = timeit.repeat( + stmt=lambda: groupOnRDD(input_rdd), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) input_df.cache().count() - dfTimeings = timeit.repeat(stmt=lambda: runOnDF(input_df), repeat=10, number=1, timer=time.time, setup='gc.enable()') - print "RDD:" - print rddTimeings - print "group:" - print groupTimeings - print "df:" - print dfTimeings - print "yay" + dfTimeings = timeit.repeat( + stmt=lambda: runOnDF(input_df), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) + print(f"RDD: {rddTimeings}, group: {groupTimeings}, df: {dfTimeings}") + def parseArgs(args): """ @@ -123,15 +136,15 @@ def parseArgs(args): if __name__ == "__main__": - """ Usage: simple_perf_test scalingFactor size """ - import sys - from pyspark import SparkContext - from pyspark.sql import SQLContext - (scalingFactor, size) = parseArgs(sys.argv) - session = SparkSession.appName("SimplePythonPerf").builder.getOrCreate() + + scalingFactor = 1 + size = 1 + if len(sys.argv) > 2: + (scalingFactor, size) = parseArgs(sys.argv) + session = SparkSession.builder.appName("SimplePythonPerf").getOrCreate() sc = session._sc run(sc, session, scalingFactor, size) diff --git a/python/examples/spark_expectations_example.py b/python/examples/spark_expectations_example.py new file mode 100644 index 00000000..003bb158 --- /dev/null +++ b/python/examples/spark_expectations_example.py @@ -0,0 +1,111 @@ +from pyspark import SparkFiles +from pyspark.sql import * +from spark_expectations.core.expectations import ( + SparkExpectations, + WrappedDataFrameWriter, +) + +spark = SparkSession.builder.master("local[4]").getOrCreate() +sc = spark.sparkContext +sc.setLogLevel("ERROR") + +# tag::global_setup[] +se_conf = { + "se_notifications_enable_email": False, + "se_notifications_email_smtp_host": "mailhost.example.com", + "se_notifications_email_smtp_port": 25, + "se_notifications_email_from": "timbit@example.com", + "se_notifications_email_subject": "spark expectations - data quality - notifications", + "se_notifications_on_fail": True, + "se_notifications_on_error_drop_exceeds_threshold_breach": True, + "se_notifications_on_error_drop_threshold": 15, +} +# end::global_setup[] + + +# tag::setup_and_load[] +from spark_expectations.config.user_config import Constants as user_config + +spark.sql("DROP TABLE IF EXISTS local.magic_validation") +spark.sql( + """ +create table local.magic_validation ( + product_id STRING, + table_name STRING, + rule_type STRING, + rule STRING, + column_name STRING, + expectation STRING, + action_if_failed STRING, + tag STRING, + description STRING, + 
enable_for_source_dq_validation BOOLEAN, + enable_for_target_dq_validation BOOLEAN, + is_active BOOLEAN, + enable_error_drop_alert BOOLEAN, + error_drop_threshold INT +)""" +) +# Reminder: addFile does not handle directories well. +rule_file = "spark_expectations_sample_rules.json" +sc.addFile(rule_file) +df = spark.read.json(SparkFiles.get(rule_file)) +print(df) +df.write.option("byname", "true").mode("append").saveAsTable("local.magic_validation") +spark.read.table("local.magic_validation").show() + +# Can be used to point to your desired metastore. +se_writer = WrappedDataFrameWriter().mode("append").format("iceberg") + +rule_df = spark.sql("select * from local.magic_validation") + +se: SparkExpectations = SparkExpectations( + rules_df=rule_df, # See if we can replace this with the DF we wrote out. + product_id="pay", # We will only apply rules matching this product id + stats_table="local.dq_stats", + stats_table_writer=se_writer, + target_and_error_table_writer=se_writer, + stats_streaming_options={user_config.se_enable_streaming: False}, +) +# end::setup_and_load[] +rule_df.show(truncate=200) + + +# tag::run_validation_row[] +@se.with_expectations( + user_conf=se_conf, + write_to_table=False, # If set to true SE will write to the target table. + target_and_error_table_writer=se_writer, + # target_table is used to create the error table (e.g. here local.fake_table_name_error) + # and filter the rules on top of the global product filter. + target_table="local.fake_table_name", +) +def load_data(): + raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") + return uk_df + + +# data = load_data() +# end::run_validation_row[] + + +# tag::run_validation_complex[] +@se.with_expectations( + user_conf=se_conf, + write_to_table=True, # If set to true SE will write to the target table. + target_and_error_table_writer=se_writer, + # target_table is used to create the error table (e.g. here local.fake_table_name_error) + # and filter the rules on top of the global product filter. 
+ target_table="local.3rd_fake", +) +def load_data2(): + raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") + return uk_df + + +data = load_data2() +# end::run_validation_complex[] + +spark.sql("SELECT * FROM local.3rd_fake_error").show(truncate=300) diff --git a/python/examples/spark_expectations_example.py.fail b/python/examples/spark_expectations_example.py.fail new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/test_dual_write.py b/python/examples/test_dual_write.py new file mode 100644 index 00000000..e68eb2b1 --- /dev/null +++ b/python/examples/test_dual_write.py @@ -0,0 +1,27 @@ +import os +import tempfile + +# tag::test[] +from sparktestingbase.sqltestcase import SQLTestCase +from pyspark.sql.functions import current_timestamp +from pyspark.sql.types import Row +from .dual_write import DualWriteExample + + +class DualWriteTest(SQLTestCase): + def test_always_passes(self): + self.assertTrue(True) + + def test_actual_dual_write(self): + tempdir = tempfile.mkdtemp() + p1 = os.path.join(tempdir, "data1") + p2 = os.path.join(tempdir, "data2") + df = self.sqlCtx.createDataFrame([Row("timbit"), Row("farted")], ["names"]) + combined = df.withColumn("times", current_timestamp()) + DualWriteExample().do_write(combined, p1, p2) + df1 = self.sqlCtx.read.format("parquet").load(p1) + df2 = self.sqlCtx.read.format("parquet").load(p2) + self.assertDataFrameEqual(df2.select("times"), df1, 0.1) + + +# end::test[] diff --git a/python/examples/test_dual_write_new.py b/python/examples/test_dual_write_new.py new file mode 100644 index 00000000..e8b6df52 --- /dev/null +++ b/python/examples/test_dual_write_new.py @@ -0,0 +1,39 @@ +import os +import tempfile + +# tag::test[] +import unittest +from pyspark.sql import SparkSession +from pyspark.sql.functions import current_timestamp +from pyspark.sql.types import Row +from pyspark.testing.utils import assertDataFrameEqual +from .dual_write import DualWriteExample + + +class DualWriteTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.spark = SparkSession.builder.appName( + "Testing PySpark Example" + ).getOrCreate() + + @classmethod + def tearDownClass(cls): + cls.spark.stop() + + def test_always_passes(self): + self.assertTrue(True) + + def test_actual_dual_write(self): + tempdir = tempfile.mkdtemp() + p1 = os.path.join(tempdir, "data1") + p2 = os.path.join(tempdir, "data2") + df = self.spark.createDataFrame([Row("timbit"), Row("farted")], ["names"]) + combined = df.withColumn("times", current_timestamp()) + DualWriteExample().do_write(combined, p1, p2) + df1 = self.spark.read.format("parquet").load(p1) + df2 = self.spark.read.format("parquet").load(p2) + assertDataFrameEqual(df2.select("times"), df1, 0.1) + + +# end::test[] diff --git a/python/examples/test_load_previous_run_data.py b/python/examples/test_load_previous_run_data.py new file mode 100644 index 00000000..1f0ca313 --- /dev/null +++ b/python/examples/test_load_previous_run_data.py @@ -0,0 +1,15 @@ +from pyspark.sql.session import SparkSession +import os +import tempfile + +from sparktestingbase.sqltestcase import SQLTestCase +from .load_previous_run_data import LoadPreviousRunData + + +class TestLoadPreviousRunData(SQLTestCase): + def test_do_magic(self): + lprd = LoadPreviousRunData(self.session) + try: + lprd.do_magic() + except FileNotFoundError: + print("No previous jobs") diff --git a/python/examples/udf.py b/python/examples/udf.py new file 
mode 100644 index 00000000..f0d6a605 --- /dev/null +++ b/python/examples/udf.py @@ -0,0 +1,73 @@ +# This script triggers a number of different PySpark errors + +from pyspark.sql.session import SparkSession +from pyspark.sql.functions import pandas_udf, udf +from typing import Iterator +import sys +import pandas as pd + +global sc + + +# We need the session before we can use @udf +spark = SparkSession.builder.master("local[4]").getOrCreate() + + +# tag::simple_udf[] +@udf("long") +def classic_add1(e: int) -> int: + return e + 1 + + +# end::simple_udf[] + + +# tag::agg_new_udf[] +@pandas_udf("long") +def pandas_sum(s: pd.Series) -> int: + return s.sum() + + +# end::agg_new_udf[] + + +# tag::new_udf[] +@pandas_udf("long") +def pandas_add1(s: pd.Series) -> pd.Series: + # Vectorized operation on all of the elems in series at once + return s + 1 + + +# end::new_udf[] + + +# tag::complex_udf[] +@pandas_udf("long") +def pandas_nested_add1(d: pd.DataFrame) -> pd.Series: + # Takes a struct and returns the age elem + 1, if we wanted + # to update (e.g. return struct) we could update d and return it instead. + return d["age"] + 1 + + +# end::complex_udf[] + + +# tag::batches_of_batches_udf[] +@pandas_udf("long") +def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]: + my_db_connection = None # Expensive setup logic goes here + for s in t: + # Do something with your setup logic + if my_db_connection is None: + # Vectorized operation on all of the elems in series at once + yield s + 1 + + +# end::batches_of_batches_udf[] + + +if __name__ == "__main__": + # Make sure to make + # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" + # available as ./data/2021 + uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 00000000..38b11847 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,7 @@ +[build-system] +requires = ["setuptools >= 58.0"] +build-backend = "setuptools.build_meta" + +[[tool.mypy.overrides]] +module = "examples" +ignore_missing_imports = true diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 00000000..6654dc9d --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,11 @@ +spark-testing-base +pandas +pyarrow +pyspark==3.5.0 +pyspark-asyncactions +pandera +pandera[pyspark] +spark-expectations>=1.0 +venv-pack +requests +numpy<2.0 diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 00000000..64c8931c --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,39 @@ +[metadata] +name = examples +version = attr: examples.__version__ +author = Holden and Anya +author_email = your@email.address +url = https://github.com/high-performance-spark/high-performance-spark-examples +description = Python Examples for High Performance Spark +long_description = file: README.md +long_description_content_type = text/markdown +keywords = example, setuptools, pyspark +license = BSD 3-Clause License +classifiers = + License :: OSI Approved :: BSD License + Programming Language :: Python :: 3 + +[options] +packages = find: +zip_safe = True +include_package_data = True +install_requires = + pandas >= 1.4.1 + PyYAML >= 6.0 + typer + mypy + pyspark + pyspark-asyncactions + + +[options.entry_points] +console_scripts = + my-example-utility = example.example_module:main + +[options.extras_require] +dev = + black>=22.1.0 + flake8>=4.0.1 + +[options.package_data] +* = README.md \ No newline at end of file diff --git 
a/python/tox.ini b/python/tox.ini new file mode 100644 index 00000000..2aa2d4d2 --- /dev/null +++ b/python/tox.ini @@ -0,0 +1,75 @@ +[tox] +passenv = * +isolated_build = True +requires = tox-conda +envlist = + isort + py310 + black + mypy + flake8 + +skip_missing_interpeters = true + +[gh-actions] +python = +# 3.9: py39 +# We need a new version of PySpark w/3.10 support. + 3.10: py310 + +[testenv] +setenv = + DJANGO_SETTINGS_MODULE=fighthealthinsurance.settings + PYTHONPATH={toxinidir} + DJANGO_CONFIGURATION=Dev +passenv = * +extras = + tests + coverage +deps = + pytest + isort==4.3.21 + pyspark==3.5.0 + flake8 + spark-testing-base>=0.11.1 + mypy + -rrequirements.txt +commands = + pytest examples \ + {posargs} +allowlist_externals = pytest + +[testenv:isort] +extras = tests +skipsdist = True +commands = isort --check-only --diff examples +allowlist_externals = isort + +[testenv:black] +extras = tests +skipsdist = True +commands = black --check examples +allowlist_externals = black +deps = + black + -rrequirements.txt + +[testenv:flake8] +extras = tests +skipsdist = True +commands = flake8 --ignore=F403,E402,F401,F405,W503,E265 examples +allowlist_externals = flake8 + +[testenv:mypy] +extras = tests +passenv = * +deps = + pytest + mypy + -rrequirements.txt +setenv = + {[testenv]setenv} + MYPYPATH={toxinidir} +commands = + mypy -m examples +allowlist_externals = mypy \ No newline at end of file diff --git a/run_container.sh b/run_container.sh new file mode 100755 index 00000000..0efe1f60 --- /dev/null +++ b/run_container.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -ex +VERSION=${VERSION:-0.5} +IMAGE=${IMAGE:-holdenk/hps:$VERSION} +export VERSION +export IMAGE +docker image pull "$IMAGE" +mkdir -p warehouse +mkdir -p iceberg-workshop +docker container run --mount type=bind,source="$(pwd)"/warehouse,target=/high-performance-spark-examples/warehouse --mount type=bind,source="$(pwd)/iceberg-workshop",target=/high-performance-spark-examples/iceberg-workshop -p 8877:8877 -p 4040:4040 -it "${IMAGE}" # /bin/bash diff --git a/run_pyspark_examples.sh b/run_pyspark_examples.sh new file mode 100755 index 00000000..7e0818e4 --- /dev/null +++ b/run_pyspark_examples.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# shellcheck disable=SC1091,SC2034 + +source env_setup.sh + +set -ex + +set -o pipefail + +#tag::package_venv[] +if [ ! -d pyspark_venv ]; then + python -m venv pyspark_venv +fi + +source pyspark_venv/bin/activate +pip install -r ./python/requirements.txt + +if [ ! -f pyspark_venv.tar.gz ]; then + venv-pack -o pyspark_venv.tar.gz +fi + + +# Set in local and client mode where the driver uses the Python present +# (requires that you have activated the venv as we did above) +PYSPARK_DRIVER_PYTHON=python +export PYSPARK_DRIVER_PYTHON +export PYTHON_PATH=./environment/bin/python +#end::package_venv[] + +# Some hack for our json magic +cat se*.json > spark_expectations_sample_rules.json + +function check_fail () { + local ex="$1" + local code="$2" + if [ -f "${ex}.fail" ]; then + echo "ok"; + else + exit "$code" + fi +} + +EXAMPLE_JAR="./core/target/scala-2.13/core-assembly-0.1.0-SNAPSHOT.jar" + +pip install setuptools + +# Iceberg JAR not yet available for Spark 4. +if [ ! -f "${EXAMPLE_JAR}" ]; then + rm ./core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala # temp hack no merge in Spark 3. + sbt core/assembly -DsparkVersion="${SPARK_VERSION}" +fi + +if [ ! -f "${EXAMPLE_JAR}" ]; then + echo "Can't find sample jar?!?" 
+ exit 1 +fi + +function run_example () { + local ex="$1" + # shellcheck disable=SC2046 + spark-submit \ + --master local[5] \ + --conf spark.eventLog.enabled=true \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --archives pyspark_venv.tar.gz#environment \ + --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ + $(cat "${ex}.conf" || echo "") \ + --name "${ex}" \ + --jars "${EXAMPLE_JAR}" \ + "${ex}" 2>&1 | tee -a "${ex}.out" || check_fail "$ex" $? +} + +if [ $# -eq 1 ]; then + run_example "python/examples/$1" +else + for ex in python/examples/*.py; do + if [[ "$ex" =~ test.* ]]; then + echo "Skipping ex $ex as it is a test and covered by our tests." + else + echo "Running $ex" + run_example "$ex" + fi + done +fi diff --git a/run_sql_examples.sh b/run_sql_examples.sh new file mode 100755 index 00000000..946abf4c --- /dev/null +++ b/run_sql_examples.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -ex +set -o pipefail + +source env_setup.sh + +# You might want to set SPARK_EXTRA to do things like log more info + +function run_example () { + local sql_file="$1" + local extra="$2" + EXTENSIONS=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + if [ -n "$EXTRA_EXTENSIONS" ]; then + EXTENSIONS="$EXTENSIONS,$EXTRA_EXTENSIONS" + fi + # shellcheck disable=SC2046,SC2086 + ${SPARK_HOME}/bin/spark-sql --master local[5] \ + --conf spark.eventLog.enabled=true \ + --conf spark.sql.extensions=$EXTENSIONS \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ + ${extra} ${SPARK_EXTRA} \ + $(cat "${sql_file}.conf" || echo "") \ + --name "${sql_file}" \ + -f "${sql_file}" 2>&1 | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" +} + + +# If you want to look at them +# ${SPARK_PATH}/sbin/start-history-server.sh + +if [ $# -eq 1 ]; then + if [[ "$1" != *"gluten_only"* ]]; then + run_example "sql/$1" + else + echo "Processing gluten ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + fi +else + # For each SQL + for sql_file in sql/*.sql; do + if [[ "$sql_file" != *"_only"* ]]; then + echo "Processing ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + elif [[ "$sql_file" != *"gluten_only"* && "$GLUTEN_EXISTS" == "true" ]]; then + echo "Processing gluten ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + elif [[ "$sql_file" != *"gluten_udf_only"* && "$GLUTEN_UDF_EXISTS" == "true" ]]; then + echo "Processing gluten UDF ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + else + echo "Skipping $sql_file since we did not find gluten and this is restricted example." + fi + done +fi diff --git a/sbt/sbt b/sbt/sbt deleted file mode 100755 index aac1085a..00000000 --- a/sbt/sbt +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This script launches sbt for this project. If present it uses the system -# version of sbt. If there is no system version of sbt it attempts to download -# sbt locally. -SBT_VERSION=0.13.9 -URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar -URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar -JAR=sbt/sbt-launch-${SBT_VERSION}.jar - -# Download sbt launch jar if it hasn't been downloaded yet -if [ ! -f ${JAR} ]; then - # Download - printf "Attempting to fetch sbt\n" - set -x - JAR_DL=${JAR}.part - if hash wget 2>/dev/null; then - (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} - elif hash axel 2>/dev/null; then - (axel ${URL1} -o ${JAR_DL} || axel ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR} - else - printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" - exit -1 - fi -fi -if [ ! -f ${JAR} ]; then - # We failed to download - printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" - exit -1 -fi -printf "Launching sbt from ${JAR}\n" -java \ - -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ - -jar ${JAR} \ - "$@" diff --git a/sbt/sbt.bat b/sbt/sbt.bat deleted file mode 100644 index 0f7a3e9a..00000000 --- a/sbt/sbt.bat +++ /dev/null @@ -1,95 +0,0 @@ -@REM SBT launcher script -@REM -@REM Environment: -@REM JAVA_HOME - location of a JDK home dir (mandatory) -@REM SBT_OPTS - JVM options (optional) -@REM Configuration: -@REM sbtconfig.txt found in the SBT_HOME. - -@REM ZOMG! We need delayed expansion to build up CFG_OPTS later -@setlocal enabledelayedexpansion - -@echo off -set SBT_HOME=%~dp0 - -rem FIRST we load the config file of extra options. -set FN=%SBT_HOME%\..\conf\sbtconfig.txt -set CFG_OPTS= -FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO ( - set DO_NOT_REUSE_ME=%%i - rem ZOMG (Part #2) WE use !! here to delay the expansion of - rem CFG_OPTS, otherwise it remains "" for this loop. - set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! -) - -rem poor man's jenv (which is not available on Windows) -IF DEFINED JAVA_HOMES ( - IF EXIST .java-version FOR /F %%A IN (.java-version) DO ( - SET JAVA_HOME=%JAVA_HOMES%\%%A - SET JDK_HOME=%JAVA_HOMES%\%%A - ) -) -rem must set PATH or wrong javac is used for java projects -IF DEFINED JAVA_HOME SET "PATH=%JAVA_HOME%\bin;%PATH%" - -rem users can set JAVA_OPTS via .jvmopts (sbt-extras style) -IF EXIST .jvmopts FOR /F %%A IN (.jvmopts) DO ( - SET JAVA_OPTS=%%A !JAVA_OPTS! 
-) - -rem We use the value of the JAVACMD environment variable if defined -set _JAVACMD=%JAVACMD% - -if "%_JAVACMD%"=="" ( - if not "%JAVA_HOME%"=="" ( - if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" - ) -) - -if "%_JAVACMD%"=="" set _JAVACMD=java - -rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. -set _JAVA_OPTS=%JAVA_OPTS% -if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% - -:args_loop -if "%~1" == "" goto args_end - -if "%~1" == "-jvm-debug" ( - set JVM_DEBUG=true - set /a JVM_DEBUG_PORT=5005 2>nul >nul -) else if "!JVM_DEBUG!" == "true" ( - set /a JVM_DEBUG_PORT=%1 2>nul >nul - if not "%~1" == "!JVM_DEBUG_PORT!" ( - set SBT_ARGS=!SBT_ARGS! %1 - ) -) else ( - set SBT_ARGS=!SBT_ARGS! %1 -) - -shift -goto args_loop -:args_end - -if defined JVM_DEBUG_PORT ( - set _JAVA_OPTS=!_JAVA_OPTS! -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=!JVM_DEBUG_PORT! -) - -call :run %SBT_ARGS% - -if ERRORLEVEL 1 goto error -goto end - -:run - -"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %* -goto :eof - -:error -@endlocal -exit /B 1 - - -:end -@endlocal -exit /B 0 diff --git a/se_complex.json b/se_complex.json new file mode 100644 index 00000000..f073e640 --- /dev/null +++ b/se_complex.json @@ -0,0 +1,2 @@ +{"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} +{"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "query_dq", "rule": "history", "column_name": "MaleBonusPercent", "expectation": "(select count(*) from 3rd_fake_view) > (select input_count from local.dq_stats WHERE table_name='local.3rd_fake' LIMIT 1)", "action_if_failed": "fail", "tag": "", "description": "We should always have more records than before", "enable_for_source_dq_validation": false, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} diff --git a/se_simple.json b/se_simple.json new file mode 100644 index 00000000..72d9b866 --- /dev/null +++ b/se_simple.json @@ -0,0 +1 @@ +{"product_id": "pay", "table_name": "local.fake_table_name", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. 
Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} diff --git a/sql/gluten_only_nonpartitioned_table_join.sql b/sql/gluten_only_nonpartitioned_table_join.sql new file mode 100644 index 00000000..572437c5 --- /dev/null +++ b/sql/gluten_only_nonpartitioned_table_join.sql @@ -0,0 +1,12 @@ +CREATE TABLE IF NOT EXISTS local.udevelopers ( + username string, + firstname string, + lastname string) +USING iceberg; +CREATE TABLE IF NOT EXISTS local.uprojects ( + creator string, + uprojectname string) +USING iceberg; +INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); +INSERT INTO local.uprojects VALUES("krisnova", "aurae"); +SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; diff --git a/sql/iceberg-schema-evolution-gotcha-possibility.sql b/sql/iceberg-schema-evolution-gotcha-possibility.sql new file mode 100644 index 00000000..99b9fd60 --- /dev/null +++ b/sql/iceberg-schema-evolution-gotcha-possibility.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS local.udevelopers_sorted; +CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( + username string, + firstname string, + lastname string) +USING ICEBERG; +INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; +ALTER TABLE local.udevelopers_sorted RENAME COLUMN lastname TO deprecated_lastname; +SELECT * FROM local.udevelopers_sorted; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; +ALTER TABLE local.udevelopers_sorted DROP COLUMN deprecated_lastname; +SELECT * FROM local.udevelopers_sorted; + diff --git a/sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail b/sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail new file mode 100644 index 00000000..e69de29b diff --git a/sql/iceberg-schema-evolution-gotcha-workaround.sql b/sql/iceberg-schema-evolution-gotcha-workaround.sql new file mode 100644 index 00000000..5b57afb2 --- /dev/null +++ b/sql/iceberg-schema-evolution-gotcha-workaround.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS local.udevelopers_sorted; +CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( + username string, + firstname string, + lastname string) +USING ICEBERG; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; +INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); +SELECT * FROM local.udevelopers_sorted; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; +-- Hack, add it to identifier fields so we can do a "partial" drop where it stays in the schema and we don't +-- corrupt the metadata. 
+ALTER TABLE local.udevelopers_sorted ADD PARTITION FIELD lastname;
+ALTER TABLE local.udevelopers_sorted DROP PARTITION FIELD lastname;
+SELECT * FROM local.udevelopers_sorted;
diff --git a/sql/nonpartitioned_table_join.sql b/sql/nonpartitioned_table_join.sql
new file mode 100644
index 00000000..572437c5
--- /dev/null
+++ b/sql/nonpartitioned_table_join.sql
@@ -0,0 +1,12 @@
+CREATE TABLE IF NOT EXISTS local.udevelopers (
+  username string,
+  firstname string,
+  lastname string)
+USING iceberg;
+CREATE TABLE IF NOT EXISTS local.uprojects (
+  creator string,
+  uprojectname string)
+USING iceberg;
+INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova");
+INSERT INTO local.uprojects VALUES("krisnova", "aurae");
+SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username;
diff --git a/sql/nonpartitioned_table_join.sql.conf b/sql/nonpartitioned_table_join.sql.conf
new file mode 100644
index 00000000..ece26ce0
--- /dev/null
+++ b/sql/nonpartitioned_table_join.sql.conf
@@ -0,0 +1,7 @@
+ --conf spark.sql.sources.v2.bucketing.enabled=true
+ --conf spark.sql.iceberg.planning.preserve-data-grouping=true
+ --conf spark.sql.requireAllClusterKeysForCoPartition=false
+
+ --conf spark.sql.adaptive.enabled=false
+ --conf spark.sql.autoBroadcastJoinThreshold=-1
+ --conf spark.sql.shuffle.partitions=4
diff --git a/sql/partioned_table_join.sql b/sql/partioned_table_join.sql
new file mode 100644
index 00000000..1f6dac31
--- /dev/null
+++ b/sql/partioned_table_join.sql
@@ -0,0 +1,14 @@
+CREATE TABLE IF NOT EXISTS local.developers (
+  username string,
+  firstname string,
+  lastname string)
+USING iceberg
+PARTITIONED BY (username);
+CREATE TABLE IF NOT EXISTS local.projects (
+  creator string,
+  projectname string)
+USING iceberg
+PARTITIONED BY (creator);
+INSERT INTO local.developers VALUES("krisnova", "Kris", "Nova");
+INSERT INTO local.projects VALUES("krisnova", "aurae");
+SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username;
diff --git a/sql/partioned_table_join.sql.conf b/sql/partioned_table_join.sql.conf
new file mode 100644
index 00000000..ece26ce0
--- /dev/null
+++ b/sql/partioned_table_join.sql.conf
@@ -0,0 +1,7 @@
+ --conf spark.sql.sources.v2.bucketing.enabled=true
+ --conf spark.sql.iceberg.planning.preserve-data-grouping=true
+ --conf spark.sql.requireAllClusterKeysForCoPartition=false
+
+ --conf spark.sql.adaptive.enabled=false
+ --conf spark.sql.autoBroadcastJoinThreshold=-1
+ --conf spark.sql.shuffle.partitions=4
diff --git a/sql/wap.sql b/sql/wap.sql
new file mode 100644
index 00000000..6665c22b
--- /dev/null
+++ b/sql/wap.sql
@@ -0,0 +1,19 @@
+DROP TABLE IF EXISTS local.wap_projects;
+CREATE TABLE local.wap_projects (
+  creator string,
+  projectname string)
+USING iceberg
+PARTITIONED BY (creator);
+ALTER TABLE local.wap_projects SET TBLPROPERTIES (
+  'write.wap.enabled' = 'true'
+);
+-- We need a first commit, see https://github.com/apache/iceberg/issues/8849
+INSERT INTO local.wap_projects VALUES("holdenk", "spark");
+ALTER TABLE local.wap_projects DROP BRANCH IF EXISTS `audit-branch`;
+ALTER TABLE local.wap_projects CREATE BRANCH `audit-branch`;
+SET spark.wap.branch = 'audit-branch';
+INSERT INTO local.wap_projects VALUES("krisnova", "aurae");
+SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator is NULL;
+SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator == "krisnova";
+CALL
local.system.remove_orphan_files(table => 'local.wap_projects'); +CALL local.system.fast_forward("local.wap_projects", "main", "audit-branch"); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt deleted file mode 100644 index e88b326a..00000000 --- a/src/CMakeLists.txt +++ /dev/null @@ -1,74 +0,0 @@ -################################################################ -# A minimal CMake file that is compatible with sbt-jni # -# # -# All settings required by sbt-jni have been marked so, please # -# add/modify/remove settings to build your specific library. # -################################################################ - -cmake_minimum_required(VERSION 2.6) - -# Define project and related variables -# -project (high-performance-spark) - -# Enable fortan -enable_language (Fortran) -include(FortranCInterface) - - -# FFLAGS depend on the compiler -get_filename_component (Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME) - - -# Set versions and library name -# (required by sbt-jni) please use semantic versioning -# -set (VERSION_MAJOR 0) -set (VERSION_MINOR 0) -set (VERSION_PATCH 0) -# (required by sbt-jni) major version will always be appended to library name -set (LIB_NAME ${CMAKE_PROJECT_NAME}${VERSION_MAJOR}) - -# Command-line options -# -# (set by sbt-jni) -set (LIB_INSTALL_DIR lib CACHE PATH "Path in which to install libraries (equivalent to Autoconf --libdir).") -# (set by sbt-jni) -set (LIB_ENABLE_MINOR_VERSIONS ON CACHE BOOLEAN "Build libraries with minor and patch versions appended.") - -# Setup JNI -find_package(JNI REQUIRED) -if (JNI_FOUND) - message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") -endif() - -# Include directories -include_directories(.) -include_directories(./main/c) -include_directories(./main/c/include) -include_directories(${JNI_INCLUDE_DIRS}) - -# Setup main shared library -file(GLOB LIB_SRC - "*.c" - "*.cpp" - "./main/c/*.c" - "./main/c/*.cpp" - "./main/fortran/*.f*" -) -add_library(${LIB_NAME} SHARED ${LIB_SRC}) - -# By default, in a regular build, minor and patch versions are added to the generated files. -# When built through sbt-jni however, LIB_ENABLE_MINOR_VERSIONS is deactivated and only a -# major-versioned library file is built. -if (LIB_ENABLE_MINOR_VERSIONS) - set_target_properties( - ${LIB_NAME} - PROPERTIES - VERSION 0.${VERSION_MINOR}.${VERSION_PATCH} # major version always 0, it is included in library name - SOVERSION 0 - ) -endif() - -# Installation targets -install(TARGETS ${LIB_NAME} LIBRARY DESTINATION ${LIB_INSTALL_DIR}) diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala deleted file mode 100644 index bddc84b4..00000000 --- a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Illustrates how to use Spark accumulators. Note that most of these examples - * are "dangerous" in that they may not return consistent results. - */ -package com.highperformancespark.examples.transformations - -import com.highperformancespark.examples.dataframe.RawPanda - -import org.apache.spark._ -import org.apache.spark.rdd._ - -import scala.collection.mutable.HashSet -object Accumulators { - /** - * Compute the total fuzzyness with an accumulator while generating - * an id and zip pair for sorting. 
- */ - //tag::sumFuzzyAcc[] - def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): - (RDD[(String, Long)], Double) = { - // Create an accumulator with the initial value of 0.0 - val acc = sc.accumulator(0.0) - val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} - // accumulator still has zero value - // Note: This example is dangerous since the transformation may be - // evaluated multiple times. - transformed.count() // force evaluation - (transformed, acc.value) - } - //end::sumFuzzyAcc[] - - /** - * Compute the max fuzzyness with an accumulator while generating an - * id and zip pair for sorting. - */ - //tag::maxFuzzyAcc[] - def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): - (RDD[(String, Long)], Double) = { - object MaxDoubleParam extends AccumulatorParam[Double] { - override def zero(initValue: Double) = initValue - override def addInPlace(r1: Double, r2: Double): Double = { - Math.max(r1, r2) - } - } - // Create an accumulator with the initial value of Double.MinValue - val acc = sc.accumulator(Double.MinValue)(MaxDoubleParam) - val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} - // accumulator still has Double.MinValue - // Note: This example is dangerous since the transformation may be - // evaluated multiple times. - transformed.count() // force evaluation - (transformed, acc.value) - } - //end::maxFuzzyAcc[] - - //tag::uniquePandaAcc[] - def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { - object UniqParam extends AccumulableParam[HashSet[Long], Long] { - override def zero(initValue: HashSet[Long]) = initValue - // For adding new values - override def addAccumulator(r: HashSet[Long], t: Long): HashSet[Long] = { - r += t - r - } - // For merging accumulators - override def addInPlace(r1: HashSet[Long], r2: HashSet[Long]): - HashSet[Long] = { - r1 ++ r2 - } - } - // Create an accumulator with the initial value of Double.MinValue - val acc = sc.accumulable(new HashSet[Long]())(UniqParam) - val transformed = rdd.map{x => acc += x.id; (x.zip, x.id)} - // accumulator still has Double.MinValue - transformed.count() // force evaluation - acc.value - } - //end::uniquePandaAcc[] -} diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala deleted file mode 100644 index 6571ceef..00000000 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Checks basic Dataset magics - */ -package com.highperformancespark.examples.dataframe - -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} -import com.holdenkarau.spark.testing._ -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.scalatest.Matchers._ -import org.scalatest.FunSuite - -import scala.collection.mutable -import scala.util.Random - -class MixedDatasetSuite extends FunSuite with DataFrameSuiteBase { - - val rawPandaList = List( - RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9, 20.0)), - RawPanda(11L, "94110", "red", true, Array(1.0, 0.7, 30.0))) - - test("happy panda sums") { - val sqlCtx = sqlContext - import sqlCtx.implicits._ - val mixedDS = new MixedDataset(sqlCtx) - val inputDF = sqlCtx.createDataFrame(rawPandaList) - val inputDS = inputDF.as[RawPanda] - val result = mixedDS.happyPandaSums(inputDS) - assert(result === (2.0 +- 0.001)) - } - - test("basic 
select") { - val sqlCtx = sqlContext - import sqlCtx.implicits._ - val inputDF = sqlCtx.createDataFrame(rawPandaList) - val inputDS = inputDF.as[RawPanda] - val mixedDS = new MixedDataset(sqlCtx) - val squishy = mixedDS.squishyPandas(inputDS).collect() - assert(squishy(0)._2 === true) - } - - test("funquery") { - val sqlCtx = sqlContext - import sqlCtx.implicits._ - val inputDF = sqlCtx.createDataFrame(rawPandaList) - val inputDS = inputDF.as[RawPanda] - val mixedDS = new MixedDataset(sqlCtx) - val summedAttrs = mixedDS.funMap(inputDS).collect() - assert(summedAttrs(0) === 21.9 +- 0.001) - assert(summedAttrs(1) === 31.7 +- 0.001) - } - - test("max pandas size per zip") { - val sqlCtx = sqlContext - import sqlCtx.implicits._ - val inputDF = sqlCtx.createDataFrame(rawPandaList) - val inputDS = inputDF.as[RawPanda] - val mixedDS = new MixedDataset(sqlCtx) - val bigPandas = mixedDS.maxPandaSizePerZip(inputDS).collect() - assert(bigPandas.size === 1) - assert(bigPandas(0)._2 === 30.0 +- 0.00001) - } -} diff --git a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala deleted file mode 100644 index 4fd8ad52..00000000 --- a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala +++ /dev/null @@ -1,25 +0,0 @@ -package com.highperformancespark.examples.wordcount - - -import com.holdenkarau.spark.testing.SharedSparkContext -import org.scalatest.FunSuite - -class WordCountTest extends FunSuite with SharedSparkContext { - test("word count with Stop Words Removed"){ - val wordRDD = sc.parallelize(Seq( - "How happy was the panda? You ask.", - "Panda is the most happy panda in all the #$!?ing land!")) - - val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") - val illegalTokens: Array[Char] = "#$%?!.".toCharArray - - val wordCounts = WordCount.withStopWordsFiltered( - wordRDD, illegalTokens, stopWords) - val wordCountsAsMap = wordCounts.collectAsMap() - assert(!wordCountsAsMap.contains("the")) - assert(!wordCountsAsMap.contains("?")) - assert(!wordCountsAsMap.contains("#$!?ing")) - assert(wordCountsAsMap.contains("ing")) - assert(wordCountsAsMap.get("panda").get.equals(3)) - } -} diff --git a/target-validator/ex.yaml b/target-validator/ex.yaml new file mode 100644 index 00000000..ce8b4925 --- /dev/null +++ b/target-validator/ex.yaml @@ -0,0 +1,31 @@ +detailedErrors: true +numKeyCols: 4 +# We might have a large number of errors so just show the first 5 +numErrorsToReport: 5 + +email: + smtpHost: smtp.example.com + subject: Data Validation Summary + from: data-validator-no-reply@example.com + to: + - professor-timbit@example.com + +tables: + - db: gender_paygaps + table: uk + # Columns that taken together uniquely specifies each row (think of groupBy) + keyColumns: + - CompanyNumber + - EmployerId + - CompanyLinkToGPGInfo + - ResponsiblePerson + # Used to filter + condition: MaleBonusPercent >= FemaleBonusPercent + checks: + # We expect at least 500 records + - type: rowCount + minNumRows: 500 + # We don't expect more than 1% not companies in the dataset. 
+      - type: nullCheck
+        column: CompanyNumber
+        threshold: 0.01
diff --git a/target-validator/runme.sh b/target-validator/runme.sh
new file mode 100755
index 00000000..b6236dd7
--- /dev/null
+++ b/target-validator/runme.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# shellcheck disable=SC1091,SC2034
+
+source ../env_setup.sh
+set -ex
+export SPARK_VERSION="${SPARK_VERSION:-3.4.1}"
+
+# Disabled for now until the upstream data-validator folks accept the PR; nested builds are slow.
+exit 0
+
+git clone git@github.com:holdenk/data-validator.git || git clone https://github.com/holdenk/data-validator.git
+cd data-validator
+git checkout upgrade-to-modern-spark
+sbt -Dspark="${SPARK_VERSION}" clean assembly
+JAR_PATH="$(pwd)/target/scala-2.12/data-validator-assembly-${SPARK_VERSION}_0.15.0.jar"
+export JAR_PATH
+cd ..
+"${SPARK_HOME}/bin/spark-submit" --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected."
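+# A non-zero exit from spark-submit is tolerated here: the "|| echo" keeps `set -e` from aborting the script, since the failure is anticipated.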