diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..b5e1d28e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,269 @@ +name: CI +on: + pull_request: + push: + +jobs: + test: + strategy: + fail-fast: false + matrix: + include: + - java: 17 + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Sync the current branch with the latest + if: github.repository != 'high-performance-spark/high-performance-spark-examples' + id: sync-branch + run: | + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/} + git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD || echo "no merge needed." + git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" || echo "no merge needed." + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ matrix.java }} + cache: sbt + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Scala Build and Test + run: sbt clean package +test + + python-test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + + - name: Run tox + run: | + cd python; tox + + run-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Run sql examples + run: ./run_sql_examples.sh + + # run-gluten-sql-examples: + # runs-on: ubuntu-latest + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # - name: Cache Spark and friends + # uses: actions/cache@v4 + # with: + # path: | + # spark*.tgz + # iceberg*.jar + # key: spark-artifacts + # - name: Setup JDK + # uses: actions/setup-java@v4 + # with: + # distribution: temurin + # java-version: 17 + # - name: Add sbt + # uses: sbt/setup-sbt@v1 + # - name: Cache Maven packages + # uses: actions/cache@v4 + # with: + # path: ~/.m2 + # key: ${{ runner.os }}-m2-gluten + # - name: Cache Data + # uses: actions/cache@v4 + # with: + # path: | + # data/fetched/* + # key: data-fetched + # - name: Run gluten + # run: | + # cd accelerators; ./gluten_spark_34_ex.sh + + run-comet-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Cache Maven packages + uses: actions/cache@v4 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-comet + + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Setup comet + run: | + cd accelerators; SPARK_MAJOR=3.5 
./setup_comet.sh + + - name: Run comet + run: | + cd accelerators; ./comet_ex.sh + + run-target-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + + - name: Cache Accel + uses: actions/cache@v4 + with: + path: | + accelerators/*.jar + key: accelerators-artifacts + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Run the target validator example + run: | + cd target-validator; ./runme.sh + + run-pyspark-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + cache: sbt + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Run PySpark examples + run: ./run_pyspark_examples.sh + + style: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Shellcheck + run: | + sudo apt-get update + sudo apt-get install -y shellcheck + shellcheck -e SC2317,SC1091,SC2034,SC2164 $(find -name "*.sh") + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + cache: sbt + - name: Add sbt + uses: sbt/setup-sbt@v1 + - name: scala style + run: + sbt scalastyle diff --git a/.gitignore b/.gitignore index 4a8e38ca..30685846 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ lib_managed/ src_managed/ project/boot/ project/plugins/project/ +.bsp # Scala-IDE specific .scala_dependencies @@ -23,11 +24,88 @@ project/plugins/project/ *~ sbt/*launch*.jar +# VSCode specific +.vscode +.history + +# Metals +.metals +.bloop +metals.sbt + # python *.pyc +.tox +.bsp + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +# scala stuff +.metals # native *.o *.so *.so.0.0.0 -*.so.0 \ No newline at end of file +*.so.0 + +# Spark files +*.tgz +iceberg-spark-runtime-*.jar +spark-*-bin-hadoop*/ + +# Warehouse +spark-warehouse/ +warehouse/ +metastore_db/ + +# Misc internal stuff +sql/*.sql.out +python/examples/*.py.out +data/fetched/* +spark_expectations_sample_rules.json + +# more python +pyspark_venv.tar.gz +pyspark_venv/ + +# accel stuff +accelerators/*.jar +accelerators/arrow-datafusion-comet +# ignore gluten +gluten +gluten*.jar +spark-3*hadoop*/ +spark-3*hadoop*.tgz +accelerators/incubator-gluten +# ignore the temporary myapp from the dockerbuild +myapp.tar +# ignore glutten +incubator-glutten/* +# ignore nested build file. 
+project/build.sbt +coursier +# Magic file we use for build tracking +oldhash +# ignore ipynb checkpoints +.ipynb_checkpoints/ + +# ignore accel +incubator-gluten/ diff --git a/.jvmopts b/.jvmopts new file mode 100644 index 00000000..694a6c7d --- /dev/null +++ b/.jvmopts @@ -0,0 +1,4 @@ + -Xms4096M + -Xmx8096M + -Xss2M + -XX:MaxMetaspaceSize=4024M \ No newline at end of file diff --git a/.scalafix.conf b/.scalafix.conf new file mode 100644 index 00000000..8697e8ff --- /dev/null +++ b/.scalafix.conf @@ -0,0 +1,31 @@ +UnionRewrite.deprecatedMethod { + "unionAll" = "union" +} + +OrganizeImports { + blankLines = Auto, + groups = [ + "re:javax?\\." + "scala." + "org.apache.spark." + "*" + ], + removeUnused = false +} + +rules = [ + DisableSyntax, + SparkAutoUpgrade, + MigrateHiveContext, + MigrateToSparkSessionBuilder, + MigrateDeprecatedDataFrameReaderFuns, + AccumulatorUpgrade, + onFailureFix, + ExecutorPluginWarn, + UnionRewrite, + GroupByKeyWarn, + GroupByKeyRewrite, + MetadataWarnQQ, + ScalaTestExtendsFix, + ScalaTestImportChange +] \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 58147d3d..00000000 --- a/.travis.yml +++ /dev/null @@ -1,64 +0,0 @@ -language: scala -sudo: false -cache: - directories: - - $HOME/.ivy2 - - $HOME/spark - - $HOME/.cache/pip - - $HOME/.pip-cache - - $HOME/.sbt/launchers - - $HOME/perl5 -scala: - - 2.11.6 -jdk: - - oraclejdk8 -r: - - release -addons: - apt: - sources: - - ubuntu-toolchain-r-test - - ppa:marutter/rdev - packages: - - gfortran - - gcc - - binutils - - python-pip - - python-pandas - - python-numpy - - gfortran - - cmake - - perl - - cpanminus - - r-base - - libcurl4-gnutls-dev - - libxml2-dev - - libssl-dev - - r-base-dev - - axel -r_packages: - - Imap -before_install: - - # Setup Python - - pip install --user codecov unittest2 nose pep8 pylint - - # Setup perl - - cpanm --force --local-lib $HOME/perl5 --quite --notest Pithub || cat ~/.cpanm/build.log - - cd ./src/main/perl; cpanm --local-lib $HOME/perl5 --force --quiet --installdeps --notest .; cd ../../../ - - PATH="$HOME/perl5/bin${PATH:+:${PATH}}"; export PATH; - - PERL5LIB=":$HOME/perl5/lib/perl5${PERL5LIB:+:${PERL5LIB}}"; export PERL5LIB; - - PERL_LOCAL_LIB_ROOT="$HOME/perl5${PERL_LOCAL_LIB_ROOT:+:${PERL_LOCAL_LIB_ROOT}}"; export PERL_LOCAL_LIB_ROOT; - - PERL_MB_OPT="--install_base \"$HOME/perl5\""; export PERL_MB_OPT; - - PERL_MM_OPT="INSTALL_BASE=$HOME/perl5"; export PERL_MM_OPT; -script: - - "export SPARK_CONF_DIR=./log4j/" - - sbt clean coverage compile package assembly test || (rm -rf ~/.ivy2 ~/.m2 && sbt clean coverage compile package test) - - "[ -f spark] || mkdir spark && cd spark && axel http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz && cd .." 
- - "tar -xf ./spark/spark-2.1.0-bin-hadoop2.7.tgz" - - "export SPARK_HOME=`pwd`/spark-2.1.0-bin-hadoop2.7" - - "export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH" - - "PYSPARK_SUBMIT_ARGS='--jars ./target/examples-assembly-0.0.1.jar pyspark-shell' nosetests --with-doctest --doctest-options=+ELLIPSIS --logging-level=INFO --detailed-errors --verbosity=2 --with-coverage --cover-html-dir=./htmlcov" - - # $SPARK_HOME/bin/spark-submit ./src/main/r/wc.R $SPARK_HOME/README.md - - # $SPARK_HOME/bin/spark-submit ./src/main/r/dapply.R -after_success: - - sbt coverageReport || sbt update coverageReport - - codecov \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..c4feed87 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +ARG base +FROM $base + +USER root +RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro fastavro +USER dev +RUN sbt clean compile diff --git a/Dockerfile-mini b/Dockerfile-mini new file mode 100644 index 00000000..b9e7ddf8 --- /dev/null +++ b/Dockerfile-mini @@ -0,0 +1,69 @@ +# Open JDK11, Spark 3.X and the latest JDKs get a little spicy +FROM azul/zulu-openjdk:11-latest + +RUN apt-get -qq update && \ + apt-get -qq -y upgrade && \ + apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop python-is-python3 && \ + locale-gen en_US.UTF-8 && \ + apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \ + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \ + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \ + curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \ + chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \ + apt-get update && \ + apt-get -qq -y install sbt && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -Lo coursier https://git.io/coursier-cli +RUN chmod +x coursier +# ensure the JAR of the CLI is in the coursier cache, in the image +RUN ./coursier --help +RUN pip install --no-cache-dir jupyter +# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3 +#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8 +RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb + +RUN ./coursier bootstrap \ + -r jitpack \ + -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \ + sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \ + --default=true --sources \ + -o almond && \ + ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" + + +RUN adduser dev +RUN adduser dev sudo +RUN echo 'dev:dev' | chpasswd +RUN mkdir -p ~dev +RUN cp ./coursier ~dev/ +RUN echo "color_prompt=yes" >> ~dev/.bashrc +RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc +RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.2-bin-hadoop3" >> ~dev/.bashrc +RUN chown -R dev ~dev +USER dev +# Kernels are installed in user so we need to run as the user +RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" 
+USER root
+
+RUN mkdir -p /high-performance-spark-examples
+RUN mkdir -p /high-performance-spark-examples/warehouse
+RUN chown -R dev /high-performance-spark-examples
+WORKDIR /high-performance-spark-examples
+# Increase the chance of caching by copying just the env setup file first.
+COPY --chown=dev:dev env_setup.sh ./
+# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
+# Also downloads some test data
+RUN SCALA_VERSION=2.13 ./env_setup.sh && rm *.tgz
+RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
+# Note: We need to use /home in the COPY otherwise no happy pandas
+COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
+RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
+RUN chown -R dev /high-performance-spark-examples
+ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
+RUN git clone https://github.com/holdenk/spark-upgrade.git
+RUN chown -R dev /high-performance-spark-examples
+USER dev
+RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
+CMD ["/high-performance-spark-examples/misc/container_launch.sh"]
+
diff --git a/README.md b/README.md
index 551928fd..b230d384 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 # high-performance-spark-examples
 Examples for High Performance Spark
 
+We are in the process of updating this for Spark 4 (some parts that depend on external libraries like Iceberg, Comet, etc. are still 3.X) and the 2nd edition of our book!
+
 # Building
 
 Most of the examples can be built with sbt, the C and Fortran components depend on gcc, g77, and cmake.
@@ -8,3 +10,25 @@ Most of the examples can be built with sbt, the C and Fortran components depend
 # Tests
 
 The full test suite depends on having the C and Fortran components built as well as a local R installation available.
+
+The most "accurate" way of seeing how we run the tests is to look at the .github workflows.
+
+# History Server
+
+The history server can be a great way to figure out what's going on.
+
+By default the history server writes to `/tmp/spark-events`, so if that directory is not already set up you'll need to create it:
+
+`mkdir -p /tmp/spark-events`
+
+The scripts for running the examples generally run with the event log enabled.
+
+You can set SPARK_EVENTLOG=true before running the Scala tests and you'll get the history server too!
+
+e.g.
+
+`SPARK_EVENTLOG=true sbt test`
+
+If you want to run just a specific test you can use [testOnly](https://www.scala-sbt.org/1.x/docs/Testing.html).
+
+Then, to view the history server, launch it with `${SPARK_HOME}/sbin/start-history-server.sh` and you [can go to your local history server](http://localhost:18080/).
diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh
new file mode 100644
index 00000000..3563f0eb
--- /dev/null
+++ b/accelerators/comet_env_setup.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+SPARK_EXTRA="
+--jars ${COMET_JAR} \
+--driver-class-path ${COMET_JAR} \
+--conf spark.comet.enabled=true \
+--conf spark.comet.exec.enabled=true \
+--conf spark.comet.exec.all.enabled=true \
+--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
+--conf spark.comet.exec.shuffle.enabled=true \
+--conf spark.comet.columnar.shuffle.enabled=true"
+# Instead of --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions we set
+# EXTRA_EXTENSIONS so it can be appended to iceberg
+if [ -z "$EXTRA_EXTENSIONS" ]; then
+  EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions"
+else
+  EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions,$EXTRA_EXTENSIONS"
+fi
+export EXTRA_EXTENSIONS
+export SPARK_EXTRA
diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh
new file mode 100755
index 00000000..268a4dcb
--- /dev/null
+++ b/accelerators/comet_ex.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -ex
+
+# If you change this update the workflow version too.
+SPARK_MAJOR=${SPARK_MAJOR:-3.5}
+SPARK_VERSION=${SPARK_MAJOR}.1
+export SPARK_MAJOR
+export SPARK_VERSION
+
+source setup_comet.sh
+pushd ..
+source ./env_setup.sh
+popd
+source comet_env_setup.sh
+pushd ..
+USE_COMET="true" ./run_sql_examples.sh diff --git a/accelerators/gluten_config.properties b/accelerators/gluten_config.properties new file mode 100644 index 00000000..eab39465 --- /dev/null +++ b/accelerators/gluten_config.properties @@ -0,0 +1,5 @@ +spark.plugins=io.glutenproject.GlutenPlugin +spark.memory.offHeap.enabled=true +spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager +# This static allocation is one of the hardest part of using Gluten +spark.memory.offHeap.size=20g diff --git a/accelerators/gluten_env_setup.sh b/accelerators/gluten_env_setup.sh new file mode 100755 index 00000000..6bda6ecd --- /dev/null +++ b/accelerators/gluten_env_setup.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Check if we gluten and gluten UDFs present +GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so +NATIVE_LIB_DIR=$(pwd)/../native/src/ +NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" +GLUTEN_HOME=incubator-gluten +source /etc/lsb-release +if [ -n "$GLUTEN_JAR_PATH" ]; then + GLUTEN_EXISTS="true" + GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ + --jars ${GLUTEN_JAR_PATH}" +fi +if [ -f "${NATIVE_LIB_PATH}" ]; then + if [ "$GLUTEN_EXISTS" == "true" ]; then + GLUTEN_UDF_EXISTS="true" + GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ + --conf spark.jars=${GLUTEN_JAR_PATH} \ + --conf spark.gluten.loadLibFromJar=true \ + --files ${NATIVE_LIB_PATH} \ + --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" + fi +fi +SPARK_EXTRA=GLUTEN_SPARK_EXTRA + +export SPARK_EXTRA +export GLUTEN_UDF_EXISTS +export GLUTEN_EXISTS diff --git a/accelerators/gluten_spark_34_ex.sh b/accelerators/gluten_spark_34_ex.sh new file mode 100755 index 00000000..0f98ab8e --- /dev/null +++ b/accelerators/gluten_spark_34_ex.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "${SCRIPT_DIR}" +source "${SCRIPT_DIR}/setup_gluten_spark34.sh" + +export SPARK_HOME +PATH="$(pwd)/${SPARK_DIR}/bin:$PATH" +export PATH +"${SPARK_HOME}/bin/spark-sql" --master local[5] \ + --conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --jars "${GLUTEN_JAR}" \ + --conf spark.eventLog.enabled=true \ + -e "SELECT 1" + +source gluten_env_setup.sh +cd .. +./run_sql_examples.sh || echo "Expected to fail" diff --git a/accelerators/install_rust_if_needed.sh b/accelerators/install_rust_if_needed.sh new file mode 100644 index 00000000..76826e8e --- /dev/null +++ b/accelerators/install_rust_if_needed.sh @@ -0,0 +1,9 @@ +#!/bin/bash +if [ -f "$HOME/.cargo/env" ]; then + source "$HOME/.cargo/env" +fi + +if ! 
command -v cargo; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source "$HOME/.cargo/env" +fi diff --git a/accelerators/run_gluten.sh b/accelerators/run_gluten.sh new file mode 100755 index 00000000..34ddb3b1 --- /dev/null +++ b/accelerators/run_gluten.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +"${SPARK_HOME}/bin/spark-shell" --master local --jars "${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar" --spark-properties=gluten_config.properties diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh new file mode 100755 index 00000000..ed89a0d8 --- /dev/null +++ b/accelerators/setup_comet.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -ex +source install_rust_if_needed.sh + +if command -v protoc >/dev/null 2>&1; then + echo "protoc already installed" +else + sudo apt-get install -y protobuf-compiler +fi + +if [ -z "${SPARK_MAJOR}" ]; then + echo "Need a spark major version specified." + exit 1 +else + echo "Building comet for Spark ${SPARK_MAJOR}" +fi + +#tag::build[] +# If we don't have fusion checked out do it +if [ ! -d arrow-datafusion-comet ]; then + git clone https://github.com/apache/arrow-datafusion-comet.git +fi + +# Build JAR if not present +if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then + cd arrow-datafusion-comet + make clean release PROFILES="-Pspark-${SPARK_MAJOR} -Pscala-2.13" + cd .. +fi +COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)" +export COMET_JAR +#end::build[] diff --git a/accelerators/setup_gluten_deps.sh b/accelerators/setup_gluten_deps.sh new file mode 100755 index 00000000..6472390c --- /dev/null +++ b/accelerators/setup_gluten_deps.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -ex + +sudo apt-get update +#tag::gluten_deps[] +sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential \ + llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev \ + libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev \ + libsodium-dev libsnappy-dev nasm +sudo apt install -y libunwind-dev +sudo apt-get install -y libgoogle-glog-dev +sudo apt-get -y install docker-compose +sudo apt-get install -y libre2-9 || sudo apt-get install -y libre2-10 +#end::gluten_deps[] diff --git a/accelerators/setup_gluten_from_src.sh b/accelerators/setup_gluten_from_src.sh new file mode 100755 index 00000000..4788e05f --- /dev/null +++ b/accelerators/setup_gluten_from_src.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -ex + +# Setup deps +source setup_gluten_deps.sh + +# Try gluten w/clickhouse +#if [ ! -d gluten ]; then +# git clone https://github.com/oap-project/gluten.git +# cd gluten +# bash ./ep/build-clickhouse/src/build_clickhouse.sh +#fi + +# Build gluten +if [ ! -d gluten ]; then + # We need Spark 3.5 w/scala212 + git clone git@github.com:holdenk/gluten.git || git clone https://github.com/holdenk/gluten.git + cd gluten + git checkout add-spark35-scala213-hack + ./dev/builddeps-veloxbe.sh + mvn clean package -Pbackends-velox -Pspark-3.5 -DskipTests + cd .. 
+fi diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh new file mode 100755 index 00000000..0cbfbc19 --- /dev/null +++ b/accelerators/setup_gluten_spark34.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +mkdir -p /tmp/spark-events +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ACCEL_JARS=${SCRIPT_DIR} +SPARK_MAJOR_VERSION=3.4 +SCALA_VERSION=${SCALA_VERSION:-"2.12"} + +set -ex + +# Note: this does not work on Ubuntu 23, only on 22 +# You might get something like: +# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 + + +SPARK_VERSION=3.4.2 +SPARK_MAJOR=3.4 +HADOOP_VERSION=3 +SPARK_DIR="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_FILE="${SPARK_DIR}.tgz" + +export SPARK_MAJOR +export SPARK_VERSION + +source setup_gluten_deps.sh + +cd .. +source /etc/lsb-release +# Pre-baked only +if [ "$DISTRIB_RELEASE" == "20.04" ]; then + source ./env_setup.sh + cd "${SCRIPT_DIR}" + + GLUTEN_JAR="gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" + GLUTEN_JAR_PATH="${SCRIPT_DIR}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" + + if [ ! -f "${GLUTEN_JAR_PATH}" ]; then + wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" || unset GLUTEN_JAR_PATH + fi + +fi +# Rather than if/else we fall through to build if wget fails because major version is not supported. +if [ -z "$GLUTEN_JAR_PATH" ]; then + #tag::build_gluten[] + if [ ! -d incubator-gluten ]; then + git clone https://github.com/apache/incubator-gluten.git + fi + cd incubator-gluten + sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON + mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests + GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar" + #end::build_gluten[] +fi + +export GLUTEN_JAR_PATH + diff --git a/build.sbt b/build.sbt index 35b1508e..f5c04850 100644 --- a/build.sbt +++ b/build.sbt @@ -1,97 +1,125 @@ -organization := "com.highperformancespark" - -name := "examples" - -publishMavenStyle := true - -version := "0.0.1" - -scalaVersion := "2.11.6" -scalaVersion in ThisBuild := "2.11.6" -ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) } - -crossScalaVersions := Seq("2.11.6") - -javacOptions ++= Seq("-source", "1.8", "-target", "1.8") - -//tag::sparkVersion[] -sparkVersion := "2.2.0" -//end::sparkVersion[] - -//tag::sparkComponents[] -sparkComponents ++= Seq("core") -//end::sparkComponents[] -//tag::sparkExtraComponents[] -sparkComponents ++= Seq("streaming", "mllib") -//end::sparkExtraComponents[] -//tag::addSQLHiveComponent[] -sparkComponents ++= Seq("sql", "hive", "hive-thriftserver", "hive-thriftserver") -//end::addSQLHiveComponent[] +lazy val root = (project in file(".")) + .aggregate(core, native) -parallelExecution in Test := false -fork := true - -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") +organization := "com.highperformancespark" -// additional libraries -libraryDependencies ++= Seq( - "org.scalatest" %% "scalatest" % "3.0.1", - "org.scalacheck" %% "scalacheck" % "1.13.4", - "junit" % "junit" % "4.12", - "junit" % "junit" % "4.11", - "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.7.2", - "com.novocode" % "junit-interface" % "0.11" % "test->default", - //tag::scalaLogging[] - "com.typesafe.scala-logging" %% 
"scala-logging" % "3.5.0", - //end::scalaLogging[] - "org.codehaus.jackson" % "jackson-core-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-core-asl" % "1.9.13", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.9.13", - "net.java.dev.jna" % "jna" % "4.2.2") +//tag::addSparkScalaFix[] +// Needs to be commented out post-upgrade because of Scala versions. +//ThisBuild / scalafixDependencies += +// "com.holdenkarau" %% "spark-scalafix-rules-2.4.8" % "0.1.5" +//ThisBuild / scalafixDependencies += +// "com.github.liancheng" %% "organize-imports" % "0.6.0" +//end::addSparkScalaFix[] + +lazy val V = _root_.scalafix.sbt.BuildInfo + +scalaVersion := "2.13.13" +addCompilerPlugin(scalafixSemanticdb) +scalacOptions ++= List( + "-Yrangepos", + "-P:semanticdb:synthetics:on" +) -scalacOptions ++= Seq("-deprecation", "-unchecked") +name := "examples" -pomIncludeRepository := { x => false } +publishMavenStyle := true +version := "0.0.1" resolvers ++= Seq( - "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", - "Spray Repository" at "http://repo.spray.cc/", + "JBoss Repository" at "https://repository.jboss.org/nexus/content/repositories/releases/", "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", - "Akka Repository" at "http://repo.akka.io/releases/", - "Twitter4J Repository" at "http://twitter4j.org/maven2/", "Apache HBase" at "https://repository.apache.org/content/repositories/releases", - "Twitter Maven Repo" at "http://maven.twttr.com/", + "Twitter Maven Repo" at "https://maven.twttr.com/", "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", - "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", - "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", - "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", + "Typesafe repository" at "https://repo.typesafe.com/typesafe/releases/", + "Second Typesafe repo" at "https://repo.typesafe.com/typesafe/maven-releases/", + "Mesosphere Public Repository" at "https://downloads.mesosphere.io/maven", Resolver.sonatypeRepo("public"), - Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), - "jodersky" at "https://dl.bintray.com/jodersky/maven/" + Resolver.mavenLocal ) licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) -mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => - { - case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard - case m if m.startsWith("META-INF") => MergeStrategy.discard - case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first - case PathList("org", "apache", xs @ _*) => MergeStrategy.first - case PathList("org", "jboss", xs @ _*) => MergeStrategy.first - case "log4j.properties" => MergeStrategy.discard - case "about.html" => MergeStrategy.rename - case "reference.conf" => MergeStrategy.concat - case _ => MergeStrategy.first +def specialOptions = { + // We only need these extra props for JRE>17 + if (sys.props("java.specification.version") > "1.17") { + Seq( + "base/java.lang", "base/java.lang.invoke", "base/java.lang.reflect", "base/java.io", "base/java.net", "base/java.nio", + "base/java.util", "base/java.util.concurrent", "base/java.util.concurrent.atomic", + "base/sun.nio.ch", "base/sun.nio.cs", "base/sun.security.action", + "base/sun.util.calendar", 
"security.jgss/sun.security.krb5", + ).map("--add-opens=java." + _ + "=ALL-UNNAMED") + } else { + Seq() } } -// JNI -enablePlugins(JniNative) +val sparkVersion = settingKey[String]("Spark version") +val sparkTestingVersion = settingKey[String]("Spark testing base version without Spark version part") + + +// Core (non-JNI bits) + +lazy val core = (project in file("core")) // regular scala code with @native methods + .dependsOn(native % Runtime) + .settings(javah / target := (native / nativeCompile / sourceDirectory).value / "include") + .settings(scalaVersion := "2.13.13") + .settings(sbtJniCoreScope := Compile) + .settings( + scalaVersion := "2.13.8", + javacOptions ++= Seq("-source", "17", "-target", "17"), + parallelExecution in Test := false, + fork := true, + javaOptions ++= Seq("-Xms4048M", "-Xmx4048M", "-Djna.nosys=true"), + Test / javaOptions ++= specialOptions, + // 2.4.5 is the highest version we have with the old spark-testing-base deps + sparkVersion := System.getProperty("sparkVersion", "4.0.0"), + sparkTestingVersion := "2.1.2", + // additional libraries + libraryDependencies ++= Seq( + "org.apache.spark" %% "spark-core" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-streaming" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-sql" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-hive" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-hive-thriftserver" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-yarn" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-mllib" % sparkVersion.value % Provided, + "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_${sparkTestingVersion.value}" % Test, + //tag::scalaLogging[] + "com.typesafe.scala-logging" %% "scala-logging" % "3.9.4", + //end::scalaLogging[] + "net.java.dev.jna" % "jna" % "5.12.1"), + scalacOptions ++= Seq("-deprecation", "-unchecked"), + pomIncludeRepository := { x => false }, + resolvers += Resolver.mavenLocal + ) + +// JNI Magic! +lazy val native = (project in file("native")) // native code and build script + .settings(nativeCompile / sourceDirectory := sourceDirectory.value) + .settings(scalaVersion := "2.13.13") + .enablePlugins(JniNative) // JniNative needs to be explicitly enabled + +//tag::xmlVersionConflict[] +// See https://github.com/scala/bug/issues/12632 +ThisBuild / libraryDependencySchemes ++= Seq( + "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always +) +//end::xmlVersionConflict[] + +assemblyMergeStrategy in assembly := { + case x => MergeStrategy.first +} -sourceDirectory in nativeCompile := sourceDirectory.value +assemblyMergeStrategy in native := { + case x => MergeStrategy.first +} + +assemblyMergeStrategy in core := { + case x => MergeStrategy.first +} diff --git a/build_container.sh b/build_container.sh new file mode 100755 index 00000000..691ae67d --- /dev/null +++ b/build_container.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -ex + +cp .git/index /tmp/git_index +export GIT_INDEX_FILE=/tmp/git_index +git add -u +hash=$(git write-tree) +unset GIT_INDEX_FILE +oldhash=$(cat oldhash || true) +if [ "$hash" = "$oldhash" ] && [ -f myapp.tar ]; then + echo "Skipping making tar since we match." 
+else + echo "Making tar since no match" + git archive -o myapp.tar --format=tar HEAD + echo "$hash" > oldhash +fi +VERSION=${VERSION:-0.5} +IMAGE=${IMAGE:-holdenk/hps:$VERSION} +MINI_IMAGE=${MINI_IMAGE:-holdenk/hps-mini:$VERSION} +docker buildx build --platform=linux/amd64,linux/arm64 -t "${MINI_IMAGE}" -f Dockerfile-mini . --push +docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push --build-arg base="${MINI_IMAGE}" +#docker buildx build --platform=linux/amd64 -t "${IMAGE}" . --push diff --git a/build_windows.sbt b/build_windows.sbt deleted file mode 100644 index b698ab9a..00000000 --- a/build_windows.sbt +++ /dev/null @@ -1,91 +0,0 @@ -organization := "com.highperformancespark" - -name := "examples" - -publishMavenStyle := true - -version := "0.0.1" - -scalaVersion := "2.11.6" -scalaVersion in ThisBuild := "2.11.6" -ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) } - -crossScalaVersions := Seq("2.11.6") - -javacOptions ++= Seq("-source", "1.8", "-target", "1.8") - -//tag::sparkVersion[] -sparkVersion := "2.2.0" -//end::sparkVersion[] - -//tag::sparkComponents[] -sparkComponents ++= Seq("core") -//end::sparkComponents[] -//tag::sparkExtraComponents[] -sparkComponents ++= Seq("streaming", "mllib") -//end::sparkExtraComponents[] -//tag::addSQLHiveComponent[] -sparkComponents ++= Seq("sql", "hive", "hive-thriftserver", "hive-thriftserver") -//end::addSQLHiveComponent[] - -parallelExecution in Test := false - -fork := true - -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") - -// additional libraries -libraryDependencies ++= Seq( - "org.scalatest" %% "scalatest" % "3.0.1", - "org.scalacheck" %% "scalacheck" % "1.13.4", - "junit" % "junit" % "4.12", - "junit" % "junit" % "4.11", - "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.7.2", - "com.novocode" % "junit-interface" % "0.11" % "test->default", - //tag::sacalLogging[] - "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0", - //end::scalaLogging[] - "org.codehaus.jackson" % "jackson-core-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-core-asl" % "1.9.13", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.9.13", - "net.java.dev.jna" % "jna" % "4.2.2") - - -scalacOptions ++= Seq("-deprecation", "-unchecked") - -pomIncludeRepository := { x => false } - -resolvers ++= Seq( - "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", - "Spray Repository" at "http://repo.spray.cc/", - "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", - "Akka Repository" at "http://repo.akka.io/releases/", - "Twitter4J Repository" at "http://twitter4j.org/maven2/", - "Apache HBase" at "https://repository.apache.org/content/repositories/releases", - "Twitter Maven Repo" at "http://maven.twttr.com/", - "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", - "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", - "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", - "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", - "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", - Resolver.sonatypeRepo("public"), - Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), - "jodersky" at "https://dl.bintray.com/jodersky/maven/" -) - -licenses := Seq("Apache License 2.0" -> 
url("http://www.apache.org/licenses/LICENSE-2.0.html")) - -mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => - { - case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard - case m if m.startsWith("META-INF") => MergeStrategy.discard - case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first - case PathList("org", "apache", xs @ _*) => MergeStrategy.first - case PathList("org", "jboss", xs @ _*) => MergeStrategy.first - case "log4j.properties" => MergeStrategy.discard - case "about.html" => MergeStrategy.rename - case "reference.conf" => MergeStrategy.concat - case _ => MergeStrategy.first - } -} diff --git a/c b/c new file mode 100644 index 00000000..cb4d93b6 --- /dev/null +++ b/c @@ -0,0 +1,2 @@ +bloop + diff --git a/src/main/java/com/highperformancespark/examples/JavaInterop.java b/core/src/main/java/com/highperformancespark/examples/JavaInterop.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/JavaInterop.java rename to core/src/main/java/com/highperformancespark/examples/JavaInterop.java diff --git a/src/main/java/com/highperformancespark/examples/WordCount.java b/core/src/main/java/com/highperformancespark/examples/WordCount.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/WordCount.java rename to core/src/main/java/com/highperformancespark/examples/WordCount.java diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java similarity index 85% rename from src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java index 950f9e5c..62b32e06 100644 --- a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java +++ b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java @@ -4,10 +4,9 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Column; import org.apache.spark.sql.*; -import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.expressions.Window; import org.apache.spark.sql.expressions.WindowSpec; -import org.apache.spark.sql.hive.HiveContext; import java.util.HashMap; import java.util.Map; @@ -16,39 +15,23 @@ public class JavaHappyPandas { - /** - * Creates SQLContext with an existing SparkContext. - */ - public static SQLContext sqlContext(JavaSparkContext jsc) { - SQLContext sqlContext = new SQLContext(jsc); - return sqlContext; - } - - /** - * Creates HiveContext with an existing SparkContext. - */ - public static HiveContext hiveContext(JavaSparkContext jsc) { - HiveContext hiveContext = new HiveContext(jsc); - return hiveContext; - } - /** * Illustrate loading some JSON data. 
   */
-  public static Dataset<Row> loadDataSimple(JavaSparkContext jsc, SQLContext sqlContext, String path) {
-    Dataset<Row> df1 = sqlContext.read().json(path);
+  public static Dataset<Row> loadDataSimple(JavaSparkContext jsc, SparkSession session, String path) {
+    Dataset<Row> df1 = session.read().json(path);
 
-    Dataset<Row> df2 = sqlContext.read().format("json").option("samplingRatio", "1.0").load(path);
+    Dataset<Row> df2 = session.read().format("json").option("samplingRatio", "1.0").load(path);
 
     JavaRDD<String> jsonRDD = jsc.textFile(path);
-    Dataset<Row> df3 = sqlContext.read().json(jsonRDD);
+    Dataset<Row> df3 = session.read().json(jsonRDD);
 
     return df1;
   }
 
-  public static Dataset<Row> jsonLoadFromRDD(SQLContext sqlContext, JavaRDD<String> input) {
+  public static Dataset<Row> jsonLoadFromRDD(SparkSession session, JavaRDD<String> input) {
     JavaRDD<String> rdd = input.filter(e -> e.contains("panda"));
-    Dataset<Row> df = sqlContext.read().json(rdd);
+    Dataset<Row> df = session.read().json(rdd);
     return df;
   }
 
@@ -147,10 +130,10 @@ public static Dataset<Row> minMeanSizePerZip(Dataset<Row> pandas) {
   }
 
   public static Dataset<Row> simpleSqlExample(Dataset<Row> pandas) {
-    SQLContext sqlContext = pandas.sqlContext();
+    SparkSession session = SparkSession.builder().getOrCreate();
     pandas.registerTempTable("pandas");
-    Dataset<Row> miniPandas = sqlContext.sql("SELECT * FROM pandas WHERE pandaSize < 12");
+    Dataset<Row> miniPandas = session.sql("SELECT * FROM pandas WHERE pandaSize < 12");
     return miniPandas;
   }
 
diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java
rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java
diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java
diff --git a/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java b/core/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java
rename to core/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java
diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java
rename to core/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java
diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java
rename to core/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java
diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java
similarity index 100%
rename from src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java
rename to
core/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaPandas.java rename to core/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java rename to core/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java diff --git a/src/main/julia/setup.jl b/core/src/main/julia/setup.jl similarity index 100% rename from src/main/julia/setup.jl rename to core/src/main/julia/setup.jl diff --git a/src/main/julia/wc.jl b/core/src/main/julia/wc.jl similarity index 100% rename from src/main/julia/wc.jl rename to core/src/main/julia/wc.jl diff --git a/src/main/perl/Changes b/core/src/main/perl/Changes similarity index 100% rename from src/main/perl/Changes rename to core/src/main/perl/Changes diff --git a/src/main/perl/MANIFEST b/core/src/main/perl/MANIFEST similarity index 100% rename from src/main/perl/MANIFEST rename to core/src/main/perl/MANIFEST diff --git a/src/main/perl/Makefile.PL b/core/src/main/perl/Makefile.PL similarity index 100% rename from src/main/perl/Makefile.PL rename to core/src/main/perl/Makefile.PL diff --git a/src/main/perl/README b/core/src/main/perl/README similarity index 100% rename from src/main/perl/README rename to core/src/main/perl/README diff --git a/src/main/perl/ghinfo.pl b/core/src/main/perl/ghinfo.pl similarity index 100% rename from src/main/perl/ghinfo.pl rename to core/src/main/perl/ghinfo.pl diff --git a/src/main/perl/ignore.txt b/core/src/main/perl/ignore.txt similarity index 100% rename from src/main/perl/ignore.txt rename to core/src/main/perl/ignore.txt diff --git a/src/main/perl/lib/HighPerformanceSpark/Examples.pm b/core/src/main/perl/lib/HighPerformanceSpark/Examples.pm similarity index 100% rename from src/main/perl/lib/HighPerformanceSpark/Examples.pm rename to core/src/main/perl/lib/HighPerformanceSpark/Examples.pm diff --git a/src/main/perl/t/00-load.t b/core/src/main/perl/t/00-load.t similarity index 100% rename from src/main/perl/t/00-load.t rename to core/src/main/perl/t/00-load.t diff --git a/src/main/perl/t/manifest.t b/core/src/main/perl/t/manifest.t similarity index 100% rename from src/main/perl/t/manifest.t rename to core/src/main/perl/t/manifest.t diff --git a/src/main/perl/t/pod-coverage.t b/core/src/main/perl/t/pod-coverage.t similarity index 100% rename from src/main/perl/t/pod-coverage.t rename to core/src/main/perl/t/pod-coverage.t diff --git a/src/main/perl/t/pod.t b/core/src/main/perl/t/pod.t similarity index 100% rename from src/main/perl/t/pod.t rename to core/src/main/perl/t/pod.t diff --git a/src/main/perl/xt/boilerplate.t b/core/src/main/perl/xt/boilerplate.t similarity index 100% rename from src/main/perl/xt/boilerplate.t rename to core/src/main/perl/xt/boilerplate.t diff --git a/src/main/r/dapply.R b/core/src/main/r/dapply.R similarity index 100% rename from src/main/r/dapply.R rename to core/src/main/r/dapply.R diff --git a/src/main/r/wc.R b/core/src/main/r/wc.R similarity index 100% rename from src/main/r/wc.R rename to core/src/main/r/wc.R diff 
--git a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala similarity index 79% rename from src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala index 8aeb8ebc..def3e088 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -6,19 +6,16 @@ package com.highperformancespark.examples.dataframe import org.apache.spark._ import org.apache.spark.rdd.RDD -//tag::sparkSQLImports[] -import org.apache.spark.sql.{Dataset, DataFrame, SparkSession, Row} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.expressions._ import org.apache.spark.sql.functions._ -//end::sparkSQLImports[] - -//tag::legacySparkSQLImports[] -import org.apache.spark.sql.SQLContext -//end::legacySparkSQLImports[] -//tag::legacySparkHiveImports[] -import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver._ +import org.apache.spark.sql.Encoders //end::legacySparkHiveImports[] object HappyPandas { @@ -29,7 +26,7 @@ object HappyPandas { def sparkSession(): SparkSession = { //tag::createSparkSession[] val session = SparkSession.builder() - .enableHiveSupport() + //.enableHiveSupport() -- try disabling this .getOrCreate() // Import the implicits, unlike in core Spark the implicits are defined // on the context. @@ -38,12 +35,15 @@ object HappyPandas { session } + val session = sparkSession() + import session.implicits._ + /** * Creates SQLContext with an existing SparkContext. */ def sqlContext(sc: SparkContext): SQLContext = { //tag::createSQLContext[] - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext // Import the implicits, unlike in core Spark the implicits are defined // on the context. import sqlContext.implicits._ @@ -54,9 +54,9 @@ object HappyPandas { /** * Creates HiveContext Spark with an existing SparkContext using hive. */ - def hiveContext(sc: SparkContext): HiveContext = { + def hiveContext(sc: SparkContext): SQLContext = { //tag::createHiveContext[] - val hiveContext = new HiveContext(sc) + val hiveContext = SparkSession.builder.enableHiveSupport().getOrCreate().sqlContext // Import the implicits, unlike in core Spark the implicits are defined // on the context. 
import hiveContext.implicits._ @@ -78,7 +78,7 @@ object HappyPandas { //end::loadPandaJSONComplex[] val jsonRDD = sc.textFile(path) //tag::loadPandaJsonRDD[] - val df3 = session.read.json(jsonRDD) + val df3 = session.read.json(session.createDataset(jsonRDD)(Encoders.STRING)) //end::loadPandaJSONRDD[] df1 } @@ -86,7 +86,7 @@ object HappyPandas { def jsonLoadFromRDD(session: SparkSession, input: RDD[String]): DataFrame = { //tag::loadPandaJSONRDD[] val rdd: RDD[String] = input.filter(_.contains("panda")) - val df = session.read.json(rdd) + val df = session.read.json(session.createDataset(rdd)(Encoders.STRING)) //end::loadPandaJSONRDD[] df } @@ -113,8 +113,8 @@ object HappyPandas { */ def happyPandasPercentage(pandaInfo: DataFrame): DataFrame = { pandaInfo.select( - pandaInfo("place"), - (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy") + $"place", + ($"happyPandas" / $"totalPandas").as("percentHappy") ) } @@ -126,9 +126,9 @@ object HappyPandas { * @return Returns a DataFrame of pandaId and integer value for pandaType. */ def encodePandaType(pandaInfo: DataFrame): DataFrame = { - pandaInfo.select(pandaInfo("id"), - (when(pandaInfo("pt") === "giant", 0). - when(pandaInfo("pt") === "red", 1). + pandaInfo.select($"id", + (when($"pt" === "giant", 0). + when($"pt" === "red", 1). otherwise(2)).as("encodedType") ) } @@ -138,7 +138,7 @@ object HappyPandas { * Gets places with happy pandas more than minHappinessBound. */ def minHappyPandas(pandaInfo: DataFrame, minHappyPandas: Int): DataFrame = { - pandaInfo.filter(pandaInfo("happyPandas") >= minHappyPandas) + pandaInfo.filter($"happyPandas" >= minHappyPandas) } /** @@ -158,7 +158,7 @@ object HappyPandas { RawPanda(id, zip, pt, happy, attrs.toArray) }} pandaInfo.select( - (pandaInfo("attributes")(0) / pandaInfo("attributes")(1)) + ($"attributes"(0) / $"attributes"(1)) .as("squishyness")) //end::selectExplode[] } @@ -167,6 +167,7 @@ object HappyPandas { * Find pandas that are sad */ def sadPandas(pandaInfo: DataFrame): DataFrame = { + // This one is our intentional non $ example //tag::simpleFilter[] pandaInfo.filter(pandaInfo("happy") !== true) //end::simpleFilter[] @@ -178,7 +179,7 @@ object HappyPandas { def happyFuzzyPandas(pandaInfo: DataFrame): DataFrame = { //tag::complexFilter[] pandaInfo.filter( - pandaInfo("happy").and(pandaInfo("attributes")(0) > pandaInfo("attributes")(1)) + $"happy".and($"attributes"(0) > $"attributes"(1)) ) //end::complexFilter[] } @@ -187,7 +188,7 @@ object HappyPandas { * Gets places that contains happy pandas more than unhappy pandas. */ def happyPandasPlaces(pandaInfo: DataFrame): DataFrame = { - pandaInfo.filter(pandaInfo("happyPandas") >= pandaInfo("totalPandas") / 2) + pandaInfo.filter($"happyPandas" >= $"totalPandas" / 2) } @@ -258,7 +259,7 @@ object HappyPandas { miniPandas } - def startJDBCServer(hiveContext: HiveContext): Unit = { + def startJDBCServer(hiveContext: SQLContext): Unit = { //tag::startJDBC[] hiveContext.setConf("hive.server2.thrift.port", "9090") HiveThriftServer2.startWithContext(hiveContext) @@ -314,27 +315,52 @@ object HappyPandas { //end::rightouterJoin[] //tag::leftsemiJoin[] - // Left semi join explicit + // Left semi join explicit. + // Here we're explicit about which DF which col comes from given + // the shared name. 
df1.join(df2, df1("name") === df2("name"), "left_semi") //end::leftsemiJoin[] } + + def badComplexJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + df1.joinWith(df2, regexp(df1("name"), df2("name"))).alias("regexp join") + } + + + //tag::badJoinMagic[] + def badJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + val session = df1.sparkSession + val sle = session.udf.register("strLenEq", (s: String, s2: String) => s.length() == s2.length()) + df1.joinWith(df2, sle(df1("name"), df2("name"))).alias("strlenEqJoin") + } + //end::badJoinMagic[] + + //tag::okJoin[] + def okJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + val session = df1.sparkSession + val sl = session.udf.register("strLen", (s: String) => s.length()) + df1.joinWith(df2, sl(df1("name")) === sl(df2("name"))).alias("strlenJoin") + } + //end::okJoin[] + /** * Cut the lineage of a DataFrame which has too long a query plan. */ def cutLineage(df: DataFrame): DataFrame = { - val sqlCtx = df.sqlContext + val session = SparkSession.builder.getOrCreate() + import session.implicits._ //tag::cutLineage[] val rdd = df.rdd rdd.cache() - sqlCtx.createDataFrame(rdd, df.schema) + session.createDataFrame(rdd, df.schema) //end::cutLineage[] } // Self join def selfJoin(df: DataFrame): DataFrame = { - val sqlCtx = df.sqlContext - import sqlCtx.implicits._ + val session = SparkSession.builder.getOrCreate() + import session.implicits._ //tag::selfJoin[] val joined = df.as("a").join(df.as("b")).where($"a.name" === $"b.name") //end::selfJoin[] diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala similarity index 93% rename from src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala index 82be10fc..54ca5342 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala @@ -88,11 +88,21 @@ case class LoadSave(sc: SparkContext, session: SparkSession) { } //end::saveAppend[] + def upsertPandas(input: DataFrame): Unit = { + //tag::upsert[] + input.mergeInto("pandaInfo", $"source.id" === $"target.id") + .whenMatched() // Note you can override the general match condition above if desired + .updateAll() + .whenNotMatched() + .insertAll() + //end::upsert[] + } + def createJDBC() = { - //tag::createJDBC[] session.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", "table", new Properties) + //tag::createJDBC[] session.read.format("jdbc") .option("url", "jdbc:dialect:serverName") .option("dbtable", "table").load() @@ -100,10 +110,10 @@ case class LoadSave(sc: SparkContext, session: SparkSession) { } def writeJDBC(df: DataFrame) = { - //tag::writeJDBC[] df.write.jdbc("jdbc:dialect:serverName;user=user;password=pass", "table", new Properties) + //tag::writeJDBC[] df.write.format("jdbc") .option("url", "jdbc:dialect:serverName") .option("user", "user") diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala similarity index 92% rename from src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala rename to 
core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala index 2ccdd10f..b74e1cbb 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala @@ -9,10 +9,9 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.expressions._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -// Additional imports for using HiveContext import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.thriftserver._ +import org.apache.spark.sql.types._ case class MiniPandaInfo(zip: String, size: Double) @@ -68,9 +67,10 @@ class MixedDataset(sqlCtx: SQLContext) { //tag::maxPandaSizePerZipScala[] def maxPandaSizePerZipScala(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { - ds.groupByKey(rp => rp.zip).mapGroups{ case (g, iter) => + def groupMapFun(g: String, iter: Iterator[RawPanda]): (String, Double) = { (g, iter.map(_.attributes(2)).reduceLeft(Math.max(_, _))) } + ds.groupByKey(rp => rp.zip).mapGroups(groupMapFun) } //end::maxPandaSizePerZipScala[] @@ -89,7 +89,7 @@ class MixedDataset(sqlCtx: SQLContext) { Dataset[(RawPanda, CoffeeShop)] = { //tag::joinWith[] val result: Dataset[(RawPanda, CoffeeShop)] = pandas.joinWith(coffeeShops, - $"zip" === $"zip") + pandas("zip") === coffeeShops("zip")) //end::joinWith[] result } @@ -100,8 +100,8 @@ class MixedDataset(sqlCtx: SQLContext) { def selfJoin(pandas: Dataset[RawPanda]): Dataset[(RawPanda, RawPanda)] = { //tag::selfJoin[] - val result: Dataset[(RawPanda, RawPanda)] = pandas.joinWith(pandas, - $"zip" === $"zip") + val result: Dataset[(RawPanda, RawPanda)] = pandas.as("l").joinWith(pandas.as("r"), + $"l.zip" === $"r.zip") //end::selfJoin[] result } diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala new file mode 100644 index 00000000..8e482bfc --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala @@ -0,0 +1,28 @@ +/** + * Extension for the SparkSession to allow us to plug in a custom optimizer + */ + +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql.catalyst.optimizer._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreePattern._ +import org.apache.spark.sql.catalyst.expressions.{And, IsNotNull} + +object NullabilityFilterOptimizer extends Rule[LogicalPlan] { + + def apply(plan: LogicalPlan): LogicalPlan = { + plan.transform { + case p @ Project(projectList, projChild) => + val children = projectList.flatMap(_.children) + // If there are no null intolerant children don't worry about it + if (children.isEmpty) { + p + } else { + val filterCond = children.map(IsNotNull(_)).reduceLeft(And) + Project(projectList, Filter(filterCond, projChild)) + } + } + } +} diff 
--git a/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala similarity index 92% rename from src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala index b1d64dc7..c7cf0cae 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala @@ -14,7 +14,7 @@ case class RawPanda(id: Long, zip: String, pt: String, happy: Boolean, attributes: Array[Double]) { override def equals(o: Any) = o match { case other: RawPanda => (id == other.id && pt == other.pt && - happy == other.happy && attributes.deep == other.attributes.deep) + happy == other.happy && attributes.sameElements(other.attributes)) case _ => false } override def hashCode(): Int = { diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala new file mode 100644 index 00000000..14e2072f --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala @@ -0,0 +1,15 @@ +/** + * Extension for the SparkSession to allow us to plug in a custom optimizer + */ + +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql.{SparkSessionExtensions, SparkSessionExtensionsProvider} + +class SQLExtension extends SparkSessionExtensionsProvider { + override def apply(extensions: SparkSessionExtensions): Unit = { + // There are _many different_ types of rules you can inject, here we're focused on + // making things go fast so our sample is an optimizer rule (AQE rules could also make sense). 
+ extensions.injectOptimizerRule(session => NullabilityFilterOptimizer) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala diff --git a/src/main/scala/com/high-performance-spark-examples/errors/throws.scala b/core/src/main/scala/com/high-performance-spark-examples/errors/throws.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/errors/throws.scala rename to core/src/main/scala/com/high-performance-spark-examples/errors/throws.scala diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala similarity index 96% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala index 9f8ec9d3..afcdeb85 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala @@ -1,12 +1,13 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable + import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row import org.apache.spark.storage.StorageLevel -import scala.collection.mutable.MutableList -import scala.collection.{Map, mutable} - object GoldilocksGroupByKey { //tag::groupByKey[] def findRankStatistics( @@ -252,7 +253,7 @@ object GoldilocksFirstTry { // to sort the partitionsColumnsFreq array by the partition index (the // first value in the tuple). 
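Editor's note: the GoldilocksFirstTry hunks that follow replace MutableList with ListBuffer and re-type the result of mapValues as collection.MapView — in Scala 2.13, MutableList is gone and mapValues returns a lazy view rather than a strict Map. A standalone sketch of the mapValues change (names here are mine, not from this change):

```scala
object MapValuesSketch {
  def main(args: Array[String]): Unit = {
    val targets: List[(Int, Long)] = List((0, 3L), (0, 8L), (1, 5L))
    // Scala 2.13: mapValues yields a lazy MapView rather than a strict Map.
    val lazyView: scala.collection.MapView[Int, List[Long]] =
      targets.groupBy(_._1).view.mapValues(_.map(_._2))
    // Call .toMap when a strict, materialized Map is actually needed.
    val strict: Map[Int, List[Long]] = lazyView.toMap
    println(strict)
  }
}
```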
partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq) => - val relevantIndexList = new MutableList[(Int, Long)]() + val relevantIndexList = new mutable.ListBuffer[(Int, Long)]() columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => val runningTotalCol = runningTotal(colIndex) @@ -291,8 +292,8 @@ object GoldilocksFirstTry { (partitionIndex : Int, valueColumnPairs : Iterator[(Double, Int)]) => { val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 if (targetsInThisPart.nonEmpty) { - val columnsRelativeIndex: Map[Int, List[Long]] = - targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsRelativeIndex: collection.MapView[Int, List[Long]] = + targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) val columnsInThisPart = targetsInThisPart.map(_._1).distinct val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala index 92cb44fd..71a66afa 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala @@ -1,12 +1,12 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable.ArrayBuffer + import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import scala.collection.Map -import scala.collection.mutable.ArrayBuffer - //tag::colIndex_partition[] class ColumnIndexPartition(override val numPartitions: Int) extends Partitioner { diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala similarity index 98% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala index 2b3adc10..2097d021 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala @@ -1,12 +1,13 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel -import scala.collection.mutable.ArrayBuffer -import scala.collection.{Map, mutable} - object GoldilocksWithHashMap { @@ -173,7 +174,7 @@ object GoldilocksWithHashMap { val runningTotal = Array.fill[Long](numOfColumns)(0) partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq)=> - val relevantIndexList = new mutable.MutableList[(Int, Long)]() + val relevantIndexList = new mutable.ListBuffer[(Int, Long)]() columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => val runningTotalCol = runningTotal(colIndex) @@ -302,7 +303,7 @@ object FindTargetsSubRoutine extends 
Serializable { def withArrayBuffer(valueColumnPairsIter : Iterator[((Double, Int), Long)], targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { - val columnsRelativeIndex: Predef.Map[Int, List[Long]] = + val columnsRelativeIndex: collection.MapView[Int, List[Long]] = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) // The column indices of the pairs that are desired rank statistics that live in diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala similarity index 93% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala index a60a39fc..d7024aea 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala @@ -1,11 +1,11 @@ package com.highperformancespark.examples.goldilocks -import org.apache.spark.HashPartitioner -import org.apache.spark.rdd.RDD - import scala.collection.Map import scala.reflect.ClassTag +import org.apache.spark.HashPartitioner +import org.apache.spark.rdd.RDD + object RDDJoinExamples { /* For Example, suppose we have one RDD with some data in the form (Panda id, score) @@ -95,7 +95,7 @@ object RDDJoinExamples { } /** - * Performs a broad cast hash join for two RDDs. + * Performs a broadcast hash join for two RDDs. * @param bigRDD - the first rdd, should be the larger RDD * @param smallRDD - the small rdd, should be small enough to fit in memory * @tparam K - The type of the key @@ -103,8 +103,8 @@ object RDDJoinExamples { * @tparam V2 - The type of the values for the second array * @return */ - //tag::coreBroadCast[] - def manualBroadCastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, + //tag::coreBroadcast[] + def manualBroadcastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, V2 : ClassTag](bigRDD : RDD[(K, V1)], smallRDD : RDD[(K, V2)])= { val smallRDDLocal: Map[K, V2] = smallRDD.collectAsMap() @@ -113,11 +113,13 @@ object RDDJoinExamples { iter.flatMap{ case (k,v1 ) => smallRDDLocalBcast.value.get(k) match { + // Note: You could switch this to a left join by changing the empty seq + // to instead return Seq(k, Seq.empty[(V1, V2)]) case None => Seq.empty[(K, (V1, V2))] case Some(v2) => Seq((k, (v1, v2))) } } }, preservesPartitioning = true) } - //end:coreBroadCast[] + //end::coreBroadcast[] } diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala similarity index 98% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala index 2b73ba45..b4e08738 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala @@ -3,7 +3,8 @@ package com.highperformancespark.examples.goldilocks import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag -import org.apache.spark.{HashPartitioner, Partitioner} +import org.apache.spark.HashPartitioner +import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD object PandaSecondarySort { diff --git 
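Editor's note: the comment added to manualBroadcastHashJoin above hints at a left-join variant. One way to write it (a sketch only, with its own names; not code from this change) keeps every key from the big side and wraps the small-side lookup in an Option:

```scala
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

object ManualBroadcastLeftJoinSketch {
  def manualBroadcastLeftHashJoin[K : ClassTag, V1 : ClassTag, V2 : ClassTag](
      bigRDD: RDD[(K, V1)],
      smallRDD: RDD[(K, V2)]): RDD[(K, (V1, Option[V2]))] = {
    // Ship the small side to every executor once, as in the inner-join version.
    val smallRDDLocal = smallRDD.collectAsMap()
    val smallRDDLocalBcast = bigRDD.sparkContext.broadcast(smallRDDLocal)
    bigRDD.mapPartitions(iter =>
      // Keys missing from the small side are kept with None instead of being dropped.
      iter.map { case (k, v1) => (k, (v1, smallRDDLocalBcast.value.get(k))) },
      preservesPartitioning = true)
  }
}
```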
a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala similarity index 97% rename from src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala index 2b87a7e3..9fdef436 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -1,22 +1,20 @@ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ import org.apache.spark.ml._ import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg._ -//tag::extraImports[] import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ //end::extraImports[] //tag::basicPipelineSetup[] diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala similarity index 97% rename from src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 13e937f6..ee34ed77 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -1,22 +1,20 @@ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ import org.apache.spark.ml._ import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg._ -//tag::extraImports[] import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ //end::extraImports[] case class LabeledToken(label: Double, index: Integer) @@ -40,7 +38,7 @@ class SimpleNaiveBayes(val uid: String) // Note this estimator assumes they start at 0 and go to numClasses val numClasses = 
getNumClasses(ds) // Get the number of features by peaking at the first row - val numFeatures: Integer = ds.select(col($(featuresCol))).head + val numFeatures: Integer = ds.select(col($(featuresCol))).head() .get(0).asInstanceOf[Vector].size // Determine the number of records for each class val groupedByLabel = ds.select(col($(labelCol)).as[Double]).groupByKey(x => x) diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala similarity index 95% rename from src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala index 9117c74e..7f63ef8d 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala @@ -1,30 +1,22 @@ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} - import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -//tag::basicImport[] import org.apache.spark.ml._ -import org.apache.spark.ml.feature._ import org.apache.spark.ml.classification._ -//end::basicImport[] -//tag::renameImport[] +import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg.{Vector => SparkVector} -//end::renameImport[] import org.apache.spark.ml.param._ import org.apache.spark.ml.tuning._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ object SimplePipeline { def constructAndSetParams(df: DataFrame) = { - val sqlCtx = df.sqlContext //tag::constructSetParams[] val hashingTF = new HashingTF() hashingTF.setInputCol("input") @@ -33,7 +25,6 @@ object SimplePipeline { } def constructSimpleTransformer(df: DataFrame) = { - val sqlCtx = df.sqlContext //tag::simpleTransformer[] val hashingTF = new HashingTF() // We don't set the output column here so the default output column of @@ -69,7 +60,6 @@ object SimplePipeline { } def constructSimpleEstimator(df: DataFrame) = { - val sqlCtx = df.sqlContext //tag::simpleNaiveBayes[] val nb = new NaiveBayes() nb.setLabelCol("happy") diff --git a/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala similarity index 74% rename from src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala rename to core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala index ddbc9d65..3fab009e 100644 --- a/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala @@ -1,21 +1,17 @@ package com.highperformancespark.examples.mllib -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map import org.apache.spark._ -import org.apache.spark.rdd.RDD -//tag::imports[] -import 
com.github.fommil.netlib.BLAS.{getInstance => blas} +import org.apache.spark.mllib.classification.LogisticRegressionModel +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import org.apache.spark.mllib.feature._ import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, - LogisticRegressionModel} -// Rename Vector to SparkVector to avoid conflicts with Scala's Vector class import org.apache.spark.mllib.linalg.{Vector => SparkVector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.feature._ +import org.apache.spark.rdd.RDD + +import com.highperformancespark.examples.dataframe._ //end::imports[] object GoldilocksMLlib { @@ -97,42 +93,6 @@ object GoldilocksMLlib { } //end::trainScaler[] - //tag::word2vecSimple[] - def word2vec(sc: SparkContext, rdd: RDD[String]): RDD[SparkVector] = { - // Tokenize our data - val tokenized = rdd.map(_.split(" ").toIterable) - // Construct our word2vec model - val wv = new Word2Vec() - val wvm = wv.fit(tokenized) - val wvmb = sc.broadcast(wvm) - // WVM can now transform single words - println(wvm.transform("panda")) - // Vector size is 100 - we use this to build a transformer on top of WVM that - // works on sentences. - val vectorSize = 100 - // The transform function works on a per-word basis, but we have - // sentences as input. - tokenized.map{words => - // If there is nothing in the sentence output a null vector - if (words.isEmpty) { - Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) - } else { - // If there are sentences construct a running sum of the - // vectors for each word - val sum = Array[Double](vectorSize) - words.foreach { word => - blas.daxpy( - vectorSize, 1.0, wvmb.value.transform(word).toArray, 1, sum, 1) - } - // Then scale it by the number of words - blas.dscal(sum.length, 1.0 / words.size, sum, 1) - // And wrap it in a Spark vector - Vectors.dense(sum) - } - } - } - //end::word2vecSimple[] - //tag::hashingTFPreserve[] def toVectorPerserving(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { val ht = new HashingTF() diff --git a/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala b/core/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala b/core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala similarity index 96% rename from src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala index 40eb61fa..ca6d65c4 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala @@ -16,8 +16,9 @@ */ package com.highperformancespark.examples.ffi +import org.apache.spark.SparkContext +import org.apache.spark.SparkFiles import org.apache.spark.rdd._ -import org.apache.spark.{SparkContext, SparkFiles} object PipeExample { //tag::pipeExample[] diff --git a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala b/core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala similarity index 
86% rename from src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala index 485c73d8..16aa779e 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala @@ -1,10 +1,12 @@ package com.highperformancespark.examples.ffi object StandAlone { + // $COVERAGE-OFF$ def main(args: Array[String]) { //tag::systemLoadLibrary[] System.loadLibrary("highPerformanceSpark0") //end::systemLoadLibrary[] println(new SumJNI().sum(Array(1,2,3))) } + // $COVERAGE-ON$ } diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala similarity index 85% rename from src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala index ed0caafb..65de6c2f 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala @@ -1,6 +1,6 @@ package com.highperformancespark.examples.ffi -import ch.jodersky.jni.nativeLoader +import com.github.sbt.jni.nativeLoader //tag::sumJNIDecorator[] @nativeLoader("high-performance-spark0") diff --git a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala similarity index 86% rename from src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala rename to core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala index b6e59ae1..5a06ff63 100644 --- a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala @@ -16,18 +16,23 @@ */ package com.highperformancespark.examples.perf -import com.highperformancespark.examples.dataframe.RawPanda -import com.highperformancespark.examples.tools._ - +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext import org.apache.spark.rdd._ -import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.sql.{SparkSession, DataFrame, Dataset, Row} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types._ +import com.highperformancespark.examples.dataframe.RawPanda +import com.highperformancespark.examples.tools._ + /** * A simple performance test to compare a simple sort between DataFrame, and RDD */ object SimplePerfTest { + // $COVERAGE-OFF$ def 
main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("simple-perf-test") val sparkSession = SparkSession.builder().enableHiveSupport().getOrCreate() @@ -59,9 +64,9 @@ object SimplePerfTest { println(dataFrameTimeings.map(_._2).mkString(",")) } - def testOnRDD(rdd: RDD[(Int, Double)]) = { - rdd.map{case (x, y) => (x, (y, 1))} - .reduceByKey{case (x, y) => (x._1 + y._1, x._2 + y._2)}.count() + def testOnRDD(rdd: RDD[(Int, Double)]): Long = { + val kvc: RDD[(Int, (Double , Int))] = rdd.map{case (x, y) => (x, (y, 1))} + kvc.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)).count() } def groupOnRDD(rdd: RDD[(Int, Double)]) = { @@ -81,4 +86,5 @@ object SimplePerfTest { println(s"Time ${t1 - t0}ns") (result, t1 - t0) } + // $COVERAGE-ON$ } diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala b/core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala similarity index 97% rename from src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala rename to core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala index 2fa173ca..2cde7b2e 100644 --- a/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala @@ -5,15 +5,14 @@ package com.highperformancespark.examples.streaming import scala.reflect.ClassTag -import org.apache.hadoop.io.{LongWritable, Text} -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat - import org.apache.spark._ import org.apache.spark.rdd.RDD - -//tag::DStreamImports[] import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream._ + +import org.apache.hadoop.io.LongWritable +import org.apache.hadoop.io.Text +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat //end::DStreamImports[] object DStreamExamples { diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala b/core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala similarity index 89% rename from src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala rename to core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala index f773a2e7..0c50469e 100644 --- a/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala @@ -5,6 +5,7 @@ import scala.concurrent.duration._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.streaming._ +import org.apache.spark.sql.streaming.Trigger object Structured { @@ -21,7 +22,7 @@ object Structured { // Write out the result as parquet format("parquet"). // Specify the interval at which new data will be picked up - trigger(ProcessingTime(1.second)). + trigger(Trigger.ProcessingTime(1.second)). 
queryName("pandas").start() //end::writeComplete[] } diff --git a/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala b/core/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala rename to core/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala diff --git a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala similarity index 50% rename from src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala index 02287ae9..ffc7d838 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala @@ -2,21 +2,20 @@ package com.highperformancespark.examples.tools import scala.collection.immutable.HashSet -import com.highperformancespark.examples.dataframe.RawPanda - import org.apache.spark._ import org.apache.spark.rdd.RDD +import com.highperformancespark.examples.dataframe.RawPanda //tag::loggerImport[] -import com.typesafe.scalalogging.LazyLogging +import org.apache.logging.log4j.LogManager //end::loggerImport[] -object FilterInvalidPandas extends LazyLogging { +object FilterInvalidPandas { def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = { //tag::broadcast[] - val invalid = HashSet() ++ invalidPandas + val invalid: HashSet[Long] = HashSet() ++ invalidPandas val invalidBroadcast = sc.broadcast(invalid) input.filter{panda => !invalidBroadcast.value.contains(panda.id)} //end::broadcast[] @@ -25,11 +24,12 @@ object FilterInvalidPandas extends LazyLogging { def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = { //tag::broadcastAndLog[] - val invalid = HashSet() ++ invalidPandas + val invalid: HashSet[Long] = HashSet() ++ invalidPandas val invalidBroadcast = sc.broadcast(invalid) def keepPanda(pandaId: Long) = { + val logger = LogManager.getLogger("fart based logs") if (invalidBroadcast.value.contains(pandaId)) { - logger.debug(s"Invalid panda ${pandaId} discovered") + logger.debug("hi") false } else { true @@ -39,3 +39,24 @@ object FilterInvalidPandas extends LazyLogging { //end::broadcastAndLog[] } } + +//tag::broadcastAndLogClass[] +class AltLog() { + lazy val logger = LogManager.getLogger("fart based logs") + def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], + input: RDD[RawPanda]) = { + val invalid: HashSet[Long] = HashSet() ++ invalidPandas + val invalidBroadcast = sc.broadcast(invalid) + def keepPanda(pandaId: Long) = { + val logger = LogManager.getLogger("fart based logs") + if (invalidBroadcast.value.contains(pandaId)) { + logger.debug("hi") + false + } else { + true + } + } + input.filter{panda => keepPanda(panda.id)} + } +} +//end::broadcastAndLogClass[] diff --git a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala rename to 
core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala index da4fd384..586ee3b6 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala @@ -1,12 +1,12 @@ package com.highperformancespark.examples.tools -import com.highperformancespark.examples.dataframe.RawPanda - import org.apache.spark._ +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row -import org.apache.spark.mllib.random.RandomRDDs -import org.apache.spark.mllib.linalg.Vector + +import com.highperformancespark.examples.dataframe.RawPanda object GenerateScalingData { /** diff --git a/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala new file mode 100644 index 00000000..21b7afa7 --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala @@ -0,0 +1,41 @@ +package com.highperformancespark.examples.gpu + +import org.apache.spark.sql.SparkSession +import org.apache.spark.resource._ +import org.apache.spark.resource.ResourceProfileBuilder +import org.apache.spark.TaskContext + +object GPUResourceProfileExample { + def main(args: Array[String]): Unit = { + val spark = SparkSession.builder() + .appName("GPUResourceProfileExample") + .getOrCreate() + run(spark) + } + + def run(spark: SparkSession) = { + val sc = spark.sparkContext + //tag::gpuResourceProfileExample[] + // Create a resource profile requesting 2 NVIDIA GPUs per executor and 1 per task + val gpuResourceProfile = new ResourceProfileBuilder() + .require(new ExecutorResourceRequests().resource( + "gpu", 2, vendor="nvidia", + discoveryScript="/opt/spark/bin/getGpusResources.sh" // See sample in Spark repo + )) + .require(new TaskResourceRequests().resource("gpu", 1)) + .build() + + // Use resource profile to run on a machine with GPUs. + val rdd = sc.parallelize(1 to 4, 4) + .withResources(gpuResourceProfile) + .map { i => + // Do some special GPU stuff here my friend + i + } + //end::gpuResourceProfileExample[] + + rdd.collect().foreach(println) + + spark.stop() + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala similarity index 98% rename from src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala rename to core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala index 298a7c3f..30684411 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala @@ -1,5 +1,5 @@ +import scala.reflect.ClassTag import scala.util.Random -import scala.reflect.{ClassTag} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD diff --git a/core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala new file mode 100644 index 00000000..f58cdbb9 --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -0,0 +1,126 @@ +/** + * Illustrates how to use Spark accumulators. 
Note that most of these examples + * are "dangerous" in that they may not return consistent results. + */ +package com.highperformancespark.examples.transformations + +import java.{lang => jl} + +import scala.collection.mutable.HashSet + +import org.apache.spark._ +import org.apache.spark.rdd._ +import org.apache.spark.util.AccumulatorV2 + +import com.highperformancespark.examples.dataframe.RawPanda +object Accumulators { + /** + * Compute the total fuzzyness with an accumulator while generating + * an id and zip pair for sorting. + */ + //tag::sumFuzzyAcc[] + def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): + (RDD[(String, Long)], Double) = { + // Create an accumulator with the initial value of 0.0 + val acc = sc.doubleAccumulator + val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} + // accumulator still has zero value + // Note: This example is dangerous since the transformation may be + // evaluated multiple times. + transformed.count() // force evaluation + (transformed, acc.value) + } + //end::sumFuzzyAcc[] + + /** + * Compute the max fuzzyness with an accumulator while generating an + * id and zip pair for sorting. + */ + //tag::maxFuzzyAcc[] + def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): + (RDD[(String, Long)], Double) = { + class MaxDoubleParam extends AccumulatorV2[jl.Double, jl.Double] { + var _value = Double.MinValue + override def isZero(): Boolean = { + _value == Double.MinValue + } + override def reset() = { + _value = Double.MinValue + } + + override def add(r1: jl.Double): Unit = { + _value = Math.max(r1, _value) + } + + def add(r1: Double): Unit = { + _value = Math.max(r1, _value) + } + + def copy(): MaxDoubleParam = { + val newAcc = new MaxDoubleParam() + newAcc._value = _value + newAcc + } + + override def merge(other: AccumulatorV2[jl.Double, jl.Double]): Unit = other match { + case o: MaxDoubleParam => + _value = Math.max(_value, o._value) + case _ => + throw new UnsupportedOperationException( + s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") + } + + override def value: jl.Double = _value + } + // Create an accumulator with the initial value of Double.MinValue + val acc = new MaxDoubleParam() + sc.register(acc) + val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} + // accumulator still has Double.MinValue + // Note: This example is dangerous since the transformation may be + // evaluated multiple times. 
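Editor's note: both accumulator examples in this new file carry the same warning — updates made inside a transformation can be applied more than once if the stage is recomputed. When the value matters, a safer pattern is to do the update inside an action, where Spark applies each task's contribution exactly once. A minimal sketch (assuming a live SparkContext and the repo's RawPanda; not part of this change):

```scala
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import com.highperformancespark.examples.dataframe.RawPanda

object FuzzyNessInAction {
  // Because the update happens inside an action (foreach), restarted tasks
  // do not double count their accumulator contributions.
  def totalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): Double = {
    val acc = sc.doubleAccumulator("fuzzyness")
    rdd.foreach(panda => acc.add(panda.attributes(0)))
    acc.value
  }
}
```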
+ transformed.count() // force evaluation + (transformed, acc.value) + } + //end::maxFuzzyAcc[] + + //tag::uniquePandaAcc[] + def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { + class UniqParam extends AccumulatorV2[Long, HashSet[Long]] { + val _values = new HashSet[Long] + override def isZero() = _values.isEmpty + + override def copy(): UniqParam = { + val nacc = new UniqParam + nacc._values ++= _values + nacc + } + + override def reset(): Unit = { + _values.clear() + } + + override def merge(other: AccumulatorV2[Long, HashSet[Long]]): Unit = other match { + case o: UniqParam => + _values ++= o._values + case _ => + throw new UnsupportedOperationException( + s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") + } + + override def value: HashSet[Long] = _values + // For adding new values + override def add(t: Long) = { + _values += t + } + } + // Create an accumulator with the initial value of Double.MinValue + val acc = new UniqParam() + sc.register(acc) + val transformed = rdd.map{x => acc.add(x.id); (x.zip, x.id)} + // accumulator still has zero values + transformed.count() // force evaluation + acc.value + } + //end::uniquePandaAcc[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala similarity index 99% rename from src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala index 948df496..aca85410 100644 --- a/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala @@ -6,15 +6,13 @@ */ package com.highperformancespark.examples.transformations -import com.highperformancespark.examples.dataframe.RawPanda +import scala.collection.mutable.HashSet import org.apache.spark._ -//tag::import[] -import org.apache.spark.util.AccumulatorV2 -//end::import[] import org.apache.spark.rdd._ +import org.apache.spark.util.AccumulatorV2 -import scala.collection.mutable.HashSet +import com.highperformancespark.examples.dataframe.RawPanda object NewAccumulators { /** * Compute the total fuzzyness with an accumulator while generating diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala diff --git a/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala b/core/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala rename 
to core/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala diff --git a/src/test/java/com/highperformancespark/examples/JavaInteropTest.java b/core/src/test/java/com/highperformancespark/examples/JavaInteropTest.java similarity index 100% rename from src/test/java/com/highperformancespark/examples/JavaInteropTest.java rename to core/src/test/java/com/highperformancespark/examples/JavaInteropTest.java diff --git a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java b/core/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java similarity index 98% rename from src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java rename to core/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java index d6bec37c..284397f9 100644 --- a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java +++ b/core/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java @@ -3,7 +3,7 @@ import com.highperformancespark.examples.objects.JavaPandaInfo; import com.highperformancespark.examples.objects.JavaPandas; import com.highperformancespark.examples.objects.JavaRawPanda; -import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; +//import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -16,6 +16,8 @@ import static org.junit.Assert.*; +// Temporarily disable until we upgrade to Spark 3.3 +/* public class JavaHappyPandasTest extends JavaDataFrameSuiteBase { String toronto = "toronto"; String sandiego = "san diego"; @@ -149,3 +151,4 @@ public void simpleSQLExample() { } } +*/ diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala similarity index 86% rename from src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index 3fb10a53..854fc4e2 100644 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -4,17 +4,24 @@ */ package com.highperformancespark.examples.dataframe -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} -import com.holdenkarau.spark.testing._ -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.scalatest.Matchers._ -import org.scalatest.FunSuite - import scala.collection.mutable import scala.util.Random -class HappyPandasTest extends FunSuite with DataFrameSuiteBase { +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.{SQLContext, SparkSession} +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { + + override def appName: String = "happyPandasTest" + val toronto = "toronto" val sandiego = "san diego" val virginia = "virginia" @@ -44,6 +51,30 @@ class 
HappyPandasTest extends FunSuite with DataFrameSuiteBase { rez.foreach{x => assert(x(0) == x(1))} } + test("bad regexp join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.badComplexJoin(df1, df2).collect() + } + + test("bad udf join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.badJoin(df1, df2).collect() + } + + test("ok udf join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.okJoin(df1, df2).collect() + } + test("simple explode test") { val inputDF = sqlContext.createDataFrame(pandaPlaces) val pandaInfo = sqlContext.createDataFrame(rawPandaList) @@ -64,7 +95,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { val expectedDf = createDF(expectedList, ("place", StringType), ("percentHappy", DoubleType)) - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPercentage(inputDF) assertDataFrameApproximateEquals(expectedDf, resultDF, 1E-5) @@ -72,7 +103,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { //end::approxEqualDataFrames[] test("verify approx by hand") { - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPercentage(inputDF) val resultRows = resultDF.collect() @@ -90,7 +121,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { } test("test encode Panda type") { - val inputDF = sqlContext.createDataFrame(rawPandaList) + val inputDF = spark.createDataFrame(rawPandaList) val resultDF = HappyPandas.encodePandaType(inputDF) val expectedRows = List(Row(10L, 0), Row(11L, 1)) @@ -103,7 +134,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { //tag::exactEqualDataFrames[] test("verify exact equality") { // test minHappyPandas - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val result = HappyPandas.minHappyPandas(inputDF, 2) val resultRows = result.collect() @@ -113,12 +144,12 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { //end::exactEqualDataFrames[] test("test happyPandasPlaces") { - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPlaces(inputDF) val expectedRows = List(PandaInfo(toronto, "giant", 1, 2), PandaInfo(sandiego, "red", 2, 3)) - val expectedDF = sqlContext.createDataFrame(expectedRows) + val expectedDF = spark.createDataFrame(expectedRows) assertDataFrameEquals(expectedDF, resultDF) } @@ -230,7 +261,7 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase { .flatMap(zipPandas => { val pandas = zipPandas._2 val length = pandas.size - 1 - val result = new mutable.MutableList[Row] + val result = new mutable.ListBuffer[Row] for (i <- 0 to length) { var totalSum = 0 diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala new file mode 100644 index 00000000..cbd79adc --- /dev/null +++ 
b/core/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala @@ -0,0 +1,180 @@ +/** + * Checks basic Dataset magics + */ +package com.highperformancespark.examples.dataframe + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +class MixedDatasetSuite extends AnyFunSuite + with DataFrameSuiteBase + with DatasetSuiteBase + with RDDComparisons { + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9, 20.0)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.7, 30.0))) + + test("happy panda sums") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val result = mixedDS.happyPandaSums(inputDS) + assert(result === (2.0 +- 0.001)) + } + + test("basic select") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val squishy = mixedDS.squishyPandas(inputDS).collect() + assert(squishy(0)._2 === true) + } + + test("funquery") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val summedAttrs = mixedDS.funMap(inputDS).collect() + assert(summedAttrs(0) === 21.9 +- 0.001) + assert(summedAttrs(1) === 31.7 +- 0.001) + } + + test("max pandas size per zip") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val bigPandas = mixedDS.maxPandaSizePerZip(inputDS).collect() + assert(bigPandas.size === 1) + assert(bigPandas(0)._2 === 30.0 +- 0.00001) + } + + test("max pandas size per zip scala version") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val bigPandas = mixedDS.maxPandaSizePerZipScala(inputDS).collect() + assert(bigPandas.size === 1) + assert(bigPandas(0)._2 === 30.0 +- 0.00001) + } + + test("union pandas") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val happyPandas = sqlCtx.createDataset(rawPandaList.take(1)) + val sadPandas = sqlCtx.createDataset(rawPandaList.drop(1)) + val mixedDS = new MixedDataset(sqlCtx) + val unionPandas = mixedDS.unionPandas(happyPandas, sadPandas).collect + assert(unionPandas.toSet == rawPandaList.toSet) + } + + test("typed query") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val typedResult = mixedDS.typedQueryExample(inputDS) + assert(typedResult.collect().toList == rawPandaList.map(_.attributes(0))) + } + + test("join different dataset") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val pandaDS = 
sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val rawCoffeeShop = List( + CoffeeShop("94110", "Starbucks"), + CoffeeShop("98765", "Caribou") + ) + val coffeeShopDS = sqlCtx.createDataFrame(rawCoffeeShop).as[CoffeeShop] + val mixedDS = new MixedDataset(sqlCtx) + val joinResult = mixedDS.joinSample(pandaDS, coffeeShopDS) + val expected = for { + panda <- rawPandaList + coffeeShop <- rawCoffeeShop + if (panda.zip == coffeeShop.zip) + } yield (panda, coffeeShop) + assert(joinResult.collect().toSet == expected.toSet) + } + + test("self join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val selfJoinResult = mixedDS.selfJoin(inputDS) + val expected = for { + left <- rawPandaList + right <- rawPandaList + if (left.zip == right.zip) + } yield (left, right) + assert(selfJoinResult.collect().toSet == expected.toSet) + } + + test("convert an RDD to DS") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val result = mixedDS.fromRDD(rdd) + val expected = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + assertDatasetEquals(expected, result) + } + + test("convert a Dataset to an RDD") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val dataset = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val result = mixedDS.toRDD(dataset) + val expected = sc.parallelize(rawPandaList) + assertRDDEquals(expected, result) + } + + test("convert a Dataset to a DataFrame") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val dataset = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val result = mixedDS.toDF(dataset) + val expected = sqlCtx.createDataFrame(rawPandaList) + assertDataFrameEquals(expected, result) + } + + + test("convert a DataFrame to a DataSset") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val dataframe = sqlCtx.createDataFrame(rawPandaList) + val result = mixedDS.fromDF(dataframe) + val expected = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + assertDatasetEquals(expected, result) + } + +} diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala new file mode 100644 index 00000000..17215ab2 --- /dev/null +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala @@ -0,0 +1,48 @@ +/** + * Happy Panda Example for DataFrames. + * Computes the % of happy pandas. Very contrived. 
+ */ +package com.highperformancespark.examples.dataframe + +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.{SQLContext, SparkSession} +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +case class ExtraMagic( + place: String, + pandaType: String, + happyPandas: Integer, + totalPandas: Integer, + extraInfo: Integer) + + +class PandaPlaceFilterPushdown extends AnyFunSuite with DataFrameSuiteBase { + + override def appName: String = "pandaPlaceFilterPushdown" + + val basicList = List( + ExtraMagic("a", "b", 1, 2, 3), + ExtraMagic("toronto", "b", 1, 2, 3), + ) + + test("simpleFilterTest") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(basicList) + val restrictedDF = inputDF.select($"place", $"pandaType", $"happyPandas", $"totalPandas") + val switched = inputDF.as[PandaInfo] + // Note if we write the filter with functional syntax it does not push down. + val filtered = switched.filter($"place" === "a") + assert(filtered.count() === 1) + } +} diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala new file mode 100644 index 00000000..91408fd7 --- /dev/null +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala @@ -0,0 +1,49 @@ +/** + * Happy Panda Example for DataFrames. + * Computes the % of happy pandas. Very contrived. 
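Editor's note: the new PandaPlaceFilterPushdown test above observes that writing the filter with the functional (typed lambda) syntax does not push down. The difference, sketched here outside this change with hypothetical names and a placeholder path, is that a Column expression stays visible to Catalyst while a Scala closure is opaque:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical record type for this sketch only.
case class Place(place: String, pandaType: String)

object PushdownContrastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("pushdown-contrast-sketch")
      .getOrCreate()
    import spark.implicits._

    // Placeholder path; any columnar source (Parquet, ORC, ...) shows the effect.
    val ds = spark.read.parquet("/tmp/places.parquet").as[Place]

    // Column expression: Catalyst sees the predicate, so it can appear as a
    // PushedFilters entry on the scan in the physical plan.
    ds.filter($"place" === "a").explain()

    // Typed lambda: the function is opaque to Catalyst, so rows are deserialized
    // and filtered after the scan; nothing is pushed down.
    ds.filter(p => p.place == "a").explain()

    spark.stop()
  }
}
```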
+ */ +package com.highperformancespark.examples.dataframe + +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import org.apache.spark.sql.execution.ExplainMode +import org.apache.spark.sql.types.IntegerType +import org.apache.spark.sql.functions.{lower, rand} +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +class SQLExtensionTest extends AnyFunSuite with ScalaDataFrameSuiteBase { + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.9))) + + override def conf: SparkConf = { + val initialConf = super.conf + initialConf.set( + "spark.sql.extensions", + "com.highperformancespark.examples.dataframe.SQLExtension") + } + + def explainToString(df: DataFrame): String = { + df.queryExecution.explainString(ExplainMode.fromString("extended")) + } + + test("Magic") { + import spark.implicits._ + val inputDF = spark.createDataFrame(rawPandaList) + spark.sql("DROP TABLE IF EXISTS farts") + inputDF.write.saveAsTable("farts") + val testDF = spark.read.table("farts") + val explained: String = explainToString(testDF.select($"zip".cast(IntegerType))) + explained should include ("isnotnull(zip#") + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala similarity index 86% rename from src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala index 2b54ce75..8a6ba097 100644 --- a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala @@ -1,10 +1,9 @@ package com.highperformancespark.examples.errors import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class ThrowsSuite extends FunSuite with SharedSparkContext { +class ThrowsSuite extends AnyFunSuite with SharedSparkContext { test("inner throw & outer throw should both throw SparkExceptions exceptions") { intercept[org.apache.spark.SparkException] { Throws.throwInner(sc) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala similarity index 96% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala index 97082841..4067fcba 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala @@ -1,10 +1,11 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.rdd.RDD -import org.scalatest.FunSuite -class EvaluationTests extends FunSuite with SharedSparkContext { +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite + +class 
EvaluationTests extends AnyFunSuite with SharedSparkContext { val doubleList = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0) val keyValuePairs = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0).zipWithIndex val path = "target/testResults" diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala similarity index 89% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala index 53884778..2e7fea8c 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala @@ -1,15 +1,21 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext +import scala.collection.immutable.IndexedSeq + import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{StructType, DoubleType, StructField} -import org.apache.spark.sql.{Row, SQLContext, DataFrame} -import org.scalatest.FunSuite +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.types.DoubleType +import org.apache.spark.sql.types.StructField +import org.apache.spark.sql.types.StructType -import scala.collection.immutable.IndexedSeq +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite +import org.apache.spark.sql.SparkSession -class GoldilocksLargeTests extends FunSuite with SharedSparkContext{ +class GoldilocksLargeTests extends AnyFunSuite with SharedSparkContext{ def testGoldilocksImplementations( @@ -47,7 +53,7 @@ class GoldilocksLargeTests extends FunSuite with SharedSparkContext{ } test("Goldilocks on local data solution "){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val testRanks = List(3L, 8L) val (smallTestData, result) = DataCreationUtils.createLocalTestData(5, 10, testRanks) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala similarity index 81% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala index 69dcc5e8..ea0a16af 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala @@ -1,11 +1,12 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.rdd.RDD -import org.scalatest.FunSuite + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite -class JoinTest extends FunSuite with SharedSparkContext { +class JoinTest extends AnyFunSuite with SharedSparkContext { test("Hash join"){ val keySet = "a, b, c, d, e, f, g".split(",") val smallRDD = sc.parallelize(keySet.map(letter => (letter, letter.hashCode))) @@ -13,7 +14,7 @@ class JoinTest extends FunSuite with SharedSparkContext { sc.parallelize(keySet.flatMap{ letter => Range(1, 50).map(i => (letter, 
letter.hashCode() / i.toDouble))}) val result: RDD[(String, (Double, Int))] = - RDDJoinExamples.manualBroadCastHashJoin( + RDDJoinExamples.manualBroadcastHashJoin( largeRDD, smallRDD) val nativeJoin: RDD[(String, (Double, Int))] = largeRDD.join(smallRDD) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala similarity index 87% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala index 131f3111..92130165 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala @@ -1,12 +1,16 @@ package com.highperformancespark.examples.goldilocks import org.apache.spark._ -import org.apache.spark.sql.{Row, SQLContext} -import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext + +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite +import org.apache.spark.sql.SparkSession // tag::MAGIC_PANDA[] -class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { +class QuantileOnlyArtisanalTest extends AnyFunSuite with BeforeAndAfterAll { @transient private var _sc: SparkContext = _ def sc: SparkContext = _sc @@ -31,15 +35,15 @@ class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { 3 -> Set(6.0, 7.0)) test("Goldilocks naive Solution"){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val whileLoopSolution = GoldilocksWhileLoop.findRankStatistics( input, List(2L, 3L)).mapValues(_.toSet) val inputAsKeyValuePairs = GoldilocksGroupByKey.mapToKeyValuePairs(input) val groupByKeySolution = GoldilocksGroupByKey.findRankStatistics( inputAsKeyValuePairs, List(2L,3L)).mapValues(_.toSet) - assert(whileLoopSolution == expectedResult) - assert(groupByKeySolution == expectedResult) + assert(whileLoopSolution.toMap == expectedResult) + assert(groupByKeySolution.toMap == expectedResult) } override def afterAll() { @@ -56,7 +60,7 @@ class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { // We don't need the rest of the tests included. 
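The JoinTest hunk above compares `RDDJoinExamples.manualBroadcastHashJoin` (note the corrected camel case) against Spark's built-in `join`. A minimal sketch of the manual broadcast hash join pattern that test exercises, assuming only the signature visible in the test; the object and variable names here are illustrative, not the book's actual implementation:

```scala
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

object ManualBroadcastJoinSketch {
  // Collect the small side to the driver, broadcast it, and probe it per
  // partition of the large side -- no shuffle of the large RDD.
  def manualBroadcastHashJoin[K: ClassTag, V1: ClassTag, V2: ClassTag](
      bigRDD: RDD[(K, V1)],
      smallRDD: RDD[(K, V2)]): RDD[(K, (V1, V2))] = {
    val smallAsMap = bigRDD.sparkContext.broadcast(smallRDD.collectAsMap())
    bigRDD.mapPartitions { iter =>
      iter.flatMap { case (k, v1) =>
        smallAsMap.value.get(k).map(v2 => (k, (v1, v2)))
      }
    }
  }
}
```

This only pays off when the small side fits comfortably in memory on the driver and on every executor; otherwise a regular shuffle join is the safer default.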
class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { test("Goldilocks first try ") { - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondAndThird = GoldilocksFirstTry.findRankStatistics( input, targetRanks = List(2L, 3L)) @@ -112,7 +116,7 @@ class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { test("GoldiLocks With Hashmap ") { - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondAndThird = GoldilocksWithHashMap.findRankStatistics( input, targetRanks = List(2L, 3L)) @@ -127,12 +131,12 @@ class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { } test("Goldilocks Secondary Sort"){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondarySortSolution = GoldilocksWithHashMap.findRankStatistics( input, targetRanks = List(2L, 3L)).mapValues(_.toSet) - assert(secondarySortSolution == expectedResult) + assert(secondarySortSolution.toMap == expectedResult) } test("Secondary Sort"){ diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala similarity index 96% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala index 4ac03e77..2ff69cbf 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala @@ -1,14 +1,15 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext +import scala.reflect.ClassTag + import org.apache.spark.rdd.RDD -import org.scalatest.FunSuite -import scala.reflect.ClassTag +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite -class SortingTests extends FunSuite with SharedSparkContext { +class SortingTests extends AnyFunSuite with SharedSparkContext { test("Test Sort by two keys"){ diff --git a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala similarity index 89% rename from src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala rename to core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala index 3b9159c1..940d2231 100644 --- a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -3,13 +3,14 @@ */ package com.highperformancespark.examples.ml -import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.sql.Dataset -import org.scalatest.FunSuite + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.scalatest.funsuite.AnyFunSuite case class TestRow(id: Int, inputColumn: String) -class CustomPipelineSuite extends FunSuite with DataFrameSuiteBase { +class CustomPipelineSuite extends AnyFunSuite with DataFrameSuiteBase { val d = List( TestRow(0, "a"), TestRow(1, "b"), diff --git 
a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala similarity index 75% rename from src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala rename to core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 1fa296a0..7a893107 100644 --- a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -3,21 +3,24 @@ */ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} - -import com.holdenkarau.spark.testing._ - import org.apache.spark.ml._ import org.apache.spark.ml.feature._ import org.apache.spark.ml.param._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext} -import org.scalatest.Matchers._ -import org.scalatest.FunSuite + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ case class MiniPanda(happy: Double, fuzzy: Double, old: Double) -class SimpleNaiveBayesSuite extends FunSuite with DataFrameSuiteBase { +class SimpleNaiveBayesSuite extends AnyFunSuite with DataFrameSuiteBase { val miniPandasList = List( MiniPanda(1.0, 1.0, 1.0), MiniPanda(1.0, 1.0, 0.0), diff --git a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala similarity index 89% rename from src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala index fa551a50..05b70e8e 100644 --- a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala @@ -3,16 +3,13 @@ */ package com.highperformancespark.examples.mllib -import com.highperformancespark.examples.dataframe.RawPanda +import org.apache.spark.mllib.linalg.{Vector => SparkVector} +import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - - -import org.apache.spark.mllib.linalg.{Vector => SparkVector} - -class GoldilocksMLlibSuite extends FunSuite with SharedSparkContext { +class GoldilocksMLlibSuite extends AnyFunSuite with SharedSparkContext { val rps = List( RawPanda(1L, "94110", "giant", true, Array(0.0, 0.0)), RawPanda(2L, "94110", "giant", false, Array(0.0, 3.0)), diff --git a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala b/core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala similarity index 83% rename from src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala rename to core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala index 724ddaa3..0b0ed361 100644 --- 
a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -4,13 +4,14 @@ package com.highperformancespark.examples.ffi import com.holdenkarau.spark.testing._ -import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Arbitrary +import org.scalacheck.Gen import org.scalacheck.Prop.forAll -import org.scalatest.FunSuite -import org.scalatest.prop.Checkers -import org.scalatest.Matchers._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ +import org.scalatestplus.scalacheck.Checkers -class NativeExampleSuite extends FunSuite +class NativeExampleSuite extends AnyFunSuite with SharedSparkContext with Checkers with RDDComparisons { test("local sum") { diff --git a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala similarity index 61% rename from src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala index 4b1f0324..aa45fe1e 100644 --- a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala @@ -4,12 +4,12 @@ package com.highperformancespark.examples.ffi import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite -import org.scalatest.prop.Checkers -import org.scalatest.Matchers._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ +import org.scalatestplus.scalacheck.Checkers -class PipeExampleSuite extends FunSuite with SharedSparkContext with Checkers { +class PipeExampleSuite extends AnyFunSuite with SharedSparkContext with Checkers { ignore("commentors on a pr") { val rdd = sc.parallelize(List(12883)) val expected = (12883, List("SparkQA", "srowen")) diff --git a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala similarity index 86% rename from src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala index b5a3d440..871e2aa9 100644 --- a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala @@ -5,14 +5,14 @@ */ package com.highperformancespark.examples.streaming +import java.lang.Thread + import org.apache.spark.streaming._ -import java.lang.Thread import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class DStreamExamplesSuite extends FunSuite with SharedSparkContext { +class DStreamExamplesSuite extends AnyFunSuite with SharedSparkContext { test("simple set up") { val ssc = DStreamExamples.makeStreamingContext(sc) val inputStream = DStreamExamples.fileAPIExample(ssc, "./") diff --git a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala similarity index 88% rename from src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala rename to 
core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala index ca364d14..a0afb64b 100644 --- a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala @@ -3,14 +3,14 @@ */ package com.highperformancespark.examples.tokenize +import java.lang.Thread + import org.apache.spark.streaming._ -import java.lang.Thread import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class SampleTokenizeSuite extends FunSuite with SharedSparkContext { +class SampleTokenizeSuite extends AnyFunSuite with SharedSparkContext { val input = List("hi holden", "I like coffee") val expected = List("hi", "holden", "I", "like", "coffee") diff --git a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala similarity index 58% rename from src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala index 897a8d39..545b7898 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala @@ -4,12 +4,10 @@ package com.highperformancespark.examples.tools import com.highperformancespark.examples.dataframe.RawPanda - import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class FilterInvalidPandasSuite extends FunSuite with SharedSparkContext { +class FilterInvalidPandasSuite extends AnyFunSuite with SharedSparkContext { test("simple filter") { val invalidPandas = List(1L, 2L) val inputPandas = List( @@ -23,4 +21,16 @@ class FilterInvalidPandasSuite extends FunSuite with SharedSparkContext { assert(result1.collect() === result2.collect()) assert(result1.count() === 1) } + + test("alt log") { + val invalidPandas = List(1L, 2L) + val inputPandas = List( + RawPanda(1L, "94110", "giant", true, Array(0.0)), + RawPanda(3L, "94110", "giant", true, Array(0.0))) + val input = sc.parallelize(inputPandas) + val al = new AltLog() + val result1 = + al.filterInvalidPandasWithLogs(sc, invalidPandas, input) + assert(result1.count() === 1) + } } diff --git a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala similarity index 91% rename from src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala index 15f60d12..1d761601 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala @@ -4,12 +4,10 @@ package com.highperformancespark.examples.tools import com.highperformancespark.examples.dataframe.RawPanda - import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class GeneratescalaingDataSuite extends FunSuite with SharedSparkContext { +class GeneratescalaingDataSuite extends AnyFunSuite with SharedSparkContext { // The number 
of entries depends somewhat on the partition split because we // zip multiple separate RDDs so its more of a "request" test("expected num entries") { diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala similarity index 57% rename from src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala rename to core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala index 5eb995f2..48991e0d 100644 --- a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -3,13 +3,13 @@ */ package com.highperformancespark.examples.transformations -import com.highperformancespark.examples.dataframe.RawPanda +import scala.collection.immutable.HashSet +import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite -import org.scalatest.FunSuite - -class AccumulatorsTest extends FunSuite with SharedSparkContext { +class AccumulatorsTest extends AnyFunSuite with SharedSparkContext { test("accumulator max should function") { val input = sc.parallelize(1.to(100)).map(x => RawPanda(1L, "1", "red", true, Array(x.toDouble))) @@ -23,4 +23,17 @@ class AccumulatorsTest extends FunSuite with SharedSparkContext { val (_, sum) = Accumulators.computeTotalFuzzyNess(sc, input) assert(sum === 5050.0) } + + test("accumulator unique should function") { + val input1 = sc.parallelize(1 to 100).map(x => + RawPanda(1L, "1", "red", true, Array(x.toDouble)) + ) + + val input2 = sc.parallelize(1 to 100).map(x => + RawPanda(2L, "2", "blude", false, Array(x.toDouble)) + ) + + val set = Accumulators.uniquePandas(sc, input1 ++ input2) + assert(set == HashSet(2, 1)) + } } diff --git a/core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala new file mode 100644 index 00000000..68eab956 --- /dev/null +++ b/core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala @@ -0,0 +1,63 @@ +package com.highperformancespark.examples.wordcount + + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.funsuite.AnyFunSuite + +class WordCountTest extends AnyFunSuite with SharedSparkContext { + test("word count with Stop Words Removed"){ + val wordRDD = sc.parallelize(Seq( + "How happy was the panda? 
You ask.", + "Panda is the most happy panda in all the #$!?ing land!")) + + val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") + val illegalTokens: Array[Char] = "#$%?!.".toCharArray + + val wordCounts = WordCount.withStopWordsFiltered( + wordRDD, illegalTokens, stopWords) + val wordCountsAsMap = wordCounts.collectAsMap() + assert(!wordCountsAsMap.contains("the")) + assert(!wordCountsAsMap.contains("?")) + assert(!wordCountsAsMap.contains("#$!?ing")) + assert(wordCountsAsMap.contains("ing")) + assert(wordCountsAsMap.get("panda").get.equals(3)) + } + + test("word count with simple counting") { + val wordRDD = sc.parallelize( + Seq( + "a b c d", + "b c d e" + ) + ) + val wordCounts = WordCount.simpleWordCount(wordRDD) + + val wordCountsAsMap = wordCounts.collectAsMap() + + for (character <- 'a' to 'e') { + assert(wordCountsAsMap.contains(character.toString)) + } + for (character <- 'b' to 'd') { + assert(wordCountsAsMap.get(character.toString).get == 2) + } + } + + test("word count with bad idea") { + val wordRDD = sc.parallelize( + Seq( + "a b c d", + "b c d e" + ) + ) + val wordCounts = WordCount.badIdea(wordRDD) + + val wordCountsAsMap = wordCounts.collectAsMap() + + for (character <- 'a' to 'e') { + assert(wordCountsAsMap.contains(character.toString)) + } + for (character <- 'b' to 'd') { + assert(wordCountsAsMap.get(character.toString).get == 2) + } + } +} diff --git a/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala b/core/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala similarity index 100% rename from src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala rename to core/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala diff --git a/data/project.csv b/data/project.csv new file mode 100644 index 00000000..69210101 --- /dev/null +++ b/data/project.csv @@ -0,0 +1,5 @@ +creator,projectname,stars +holdenk,spark-upgrade,17 +krisnova,rust-nova,71 +kbendick,MongoMart,6 +mateiz,spark,36600 \ No newline at end of file diff --git a/env_setup.sh b/env_setup.sh new file mode 100755 index 00000000..f31f427d --- /dev/null +++ b/env_setup.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +set -ex + +# Download Spark and iceberg if not present +SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} +SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.3"} +SCALA_VERSION=${SCALA_VERSION:-"2.13"} +HADOOP_VERSION="3" +SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" +if [ "$SCALA_VERSION" = "2.13" ]; then + SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3-scala2.13.tgz" + SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala2.13" +fi +ICEBERG_VERSION=${ICEBERG_VERSION:-"1.9.2"} +if [ ! -f "${SPARK_FILE}" ]; then + SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" + SPARK_ARCHIVE_DIST_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" + if command -v axel &> /dev/null + then + (axel --quiet "$SPARK_DIST_URL" || axel --quiet "$SPARK_ARCHIVE_DIST_URL") & + else + (wget --quiet "$SPARK_DIST_URL" || wget --quiet "$SPARK_ARCHIVE_DIST_URL") & + fi +fi +# Download Icberg if not present +ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar" +if [ ! 
-f "${ICEBERG_FILE}" ]; then + wget --quiet "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & +fi +wait +sleep 1 +# Setup the env +if [ ! -d "${SPARK_PATH}" ]; then + tar -xf "${SPARK_FILE}" +fi + +SPARK_HOME="${SPARK_PATH}" +export SPARK_HOME + +if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then + # Delete the old JAR first. + rm "${SPARK_PATH}/jars/iceberg-spark-runtime*.jar" || echo "No old version to delete." + cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" +fi + +# Set up for running pyspark and friends +export PATH="${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH}" + +# Make sure we have a history directory +mkdir -p /tmp/spark-events + +mkdir -p ./data/fetched/ +if [ ! -f ./data/fetched/2021 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021 +fi +if [ ! -f ./data/fetched/2022 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2022" -O ./data/fetched/2022 +fi +if [ ! -f ./data/fetched/2023 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2023" -O ./data/fetched/2023 +fi + diff --git a/high_performance_pyspark/__init__.py b/high_performance_pyspark/__init__.py index 7741593d..3f79c0dd 100644 --- a/high_performance_pyspark/__init__.py +++ b/high_performance_pyspark/__init__.py @@ -22,4 +22,3 @@ import os import sys - diff --git a/iceberg-workshop-solutions/Workshop-Template.ipynb b/iceberg-workshop-solutions/Workshop-Template.ipynb new file mode 100644 index 00000000..472a9c3c --- /dev/null +++ b/iceberg-workshop-solutions/Workshop-Template.ipynb @@ -0,0 +1,552 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "34577ad3-822f-4370-bcba-56b9fcec3196", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.spark.sql._\n", + "import scala.sys.process._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d3141ec-7779-467a-9f76-2e51030fd1c7", + "metadata": {}, + "outputs": [], + "source": [ + "// So now we need to configure Spark to use Iceberg\n", + "// See https://iceberg.apache.org/docs/1.6.0/spark-configuration/ & https://iceberg.apache.org/docs/1.6.0/spark-getting-started/\n", + "// We'll use the \"hadoop\" (aka file) catalog & /high-performance-spark-examples/warehouse for the location\n", + "val spark = (SparkSession.builder.master(\"local[*]\")\n", + " // Setup the extensions\n", + " // You'll want to configure Iceberg here as discussed above\n", + " // If you want to match the solution you'll want to configure the Iceberg catalog to be \"local.\"\n", + " .getOrCreate()\n", + " )\n", + "import spark._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c53080aa-a9d6-45f9-968b-8e052e7fa963", + "metadata": {}, + "outputs": [], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88d54d05-0c49-4268-9b65-8c72679cb0f7", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sparkContext.uiWebUrl.get" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "270730c9-9787-407c-ba22-f0cee1f67f53", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the current data\n", + "val df = spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(\"/high-performance-spark-examples/data/fetched/2021\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "87ca6359-86bc-42a4-93dd-4fc64496b145", + "metadata": {}, + "outputs": [], + "source": [ + "// Drop existing table if present & create new table\n", + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bdeb3eb-b725-409b-ab3a-409d0e8309ae", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out\n", + "df.write.saveAsTable(\"local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554c6036-0c6b-4e3c-a9e1-7251c608b48f", + "metadata": {}, + "outputs": [], + "source": [ + "\"ls /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb541fbf-4a79-402d-a6b2-e999106e9a18", + "metadata": {}, + "outputs": [], + "source": [ + "\"cat /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/v1.metadata.json\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90149834-27a2-45a3-aa8a-dae2162da854", + "metadata": {}, + "outputs": [], + "source": [ + "// Iceberg Java SDK time imports\n", + "import java.util.HashMap\n", + "import java.util.Map\n", + "\n", + "import org.apache.iceberg.Table\n", + "import org.apache.iceberg.catalog.TableIdentifier\n", + "import org.apache.iceberg.hadoop.HadoopCatalog\n", + "\n", + "\n", + "// And to handle java types\n", + "import scala.jdk.CollectionConverters._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf56bc6-d420-474c-b3a8-ded03b23eff8", + "metadata": {}, + "outputs": [], + "source": [ + "// Create a local Iceberg Catalog client. Here we're using the \"hadoop catalog\"\n", + "// The spark hadoop conf can be got from: spark.sparkContext.hadoopConfiguration\n", + "// Here we make the Catalog, it's kind of funky. Spark also has methods which return tables but they're Spark tables so\n", + "// which aren't the type we want\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/hadoop/HadoopCatalog.html\n", + "val catalog = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c55dc276-035f-40d4-9a47-bd4698f2519d", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to load the table. To do that we need to make a TableIdentifier of the same table we wrote to. Note it'll just be\n", + "// the table name no need for the \"local\" prefix.\n", + "// See https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/catalog/TableIdentifier.html\n", + "val name = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ea4b1cc-bd1b-42b4-bdbe-27625b461db9", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the table\n", + "val table = catalog.loadTable(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd1c6add-d465-4b81-9c34-6c8f40197ab2", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to get the snapshots from the table. There are a few different ways we can do this:\n", + "// 1) Using the Iceberg Table API (see https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html)\n", + "// 2) Using the Iceberg + Spark SQL special query interface https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html\n", + "val snapshots = ..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a96986d-b3a5-49ad-aeac-a492bf3fc8e6", + "metadata": {}, + "outputs": [], + "source": [ + "val snapshot = snapshots(0).snapshotId()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c6cb85-ff64-405f-ae6a-7e3c917ac12a", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotQuery = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f93516ad-3ae9-4bb6-989f-7c127f82143c", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotId = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15e67d1b-1e9e-45a0-af94-1c9c79e03d54", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4829752b-dc30-49db-93ae-911f1c2743c1", + "metadata": {}, + "outputs": [], + "source": [ + "// And the files!\n", + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f262d890-0818-410a-aec8-2986a04ae16e", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets take a quick look and see\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7369bcc8-a738-48dc-a475-55885d4460cc", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DELETE FROM local.uk_gender_pay_data WHERE isnull(responsibleperson)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d279f3-f2a5-4ddf-a56f-d473b0c28b97", + "metadata": {}, + "outputs": [], + "source": [ + "// Make sure the data is gone\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b6902ef-b742-466d-b4c8-d6830ff67cf4", + "metadata": {}, + "outputs": [], + "source": [ + "// Yay! 
ok now lets travel back in time\n", + "// We can do this with SQL or with a read option\n", + "// SQL: https://iceberg.apache.org/docs/nightly/spark-queries/#sql" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e899a1-d2cd-4e25-b142-e69fb9ca6774", + "metadata": {}, + "outputs": [], + "source": [ + "// DF: https://iceberg.apache.org/docs/nightly/spark-queries/#dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8884a8a2-bbb7-47b1-85f6-744c60612dcb", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f53692b-f14a-4df7-8069-147eca8da0cd", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data_postcode\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8deb38c3-1e64-4eba-ac80-c75d5674258b", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out partitioned to do this we'll want to use the SQL interface so we can use the truncate function\n", + "// since the regular Scala API doesn't support partioning by things besides raw keys.\n", + "https://iceberg.apache.org/docs/1.5.1/spark-ddl/#partitioned-by" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e87e6b08-5c0e-4356-a0ee-7245b7d7790b", + "metadata": {}, + "outputs": [], + "source": [ + "// Inspect the files again. This should look familiar ish\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data_postcode.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71569b2e-7def-42a4-bf3e-69ee9667a41d", + "metadata": {}, + "outputs": [], + "source": [ + "// Add some more data, we've got 2022, 2023 , & 2024\n", + "// Make sure to use the append mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5be0f8c7-2926-4bf6-bc9d-c02a15648e83", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb6c4b7d-f8d1-41f7-b014-c6434bbb6d48", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_metadata_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/metadata/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfb116e4-c66d-4027-80a3-e7de9ad62ee0", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586bdb3c-19f0-4a63-b87f-d181e8c44c06", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22351ea4-8cb7-43c2-b205-4554d0b15aca", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.spark.actions.SparkActions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928f9da9-d65b-4d53-b818-82a27f8171a2", + "metadata": {}, + "outputs": [], + "source": [ + "// So far the logging has been... 
verbose but interesting, but the next stages it's actually too much\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "807193d8-8ff5-4a9c-b6ae-510ee0bb2f84", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok let's try and compact things down a little bit.\n", + "// You should look at SparkActions & use the rewrite data files operation.\n", + "// Consider specifying rewrite-all to true to force rewrites\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/spark/actions/SparkActions.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e4a013d-1af5-4dd8-82c1-5115905f3feb", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c40db89-7ce1-40ed-a111-1395e5b75a0a", + "metadata": {}, + "outputs": [], + "source": [ + "// Interesting. Note it _added_ a new file but the old files are all still there. That's kind of expected/ok since if we look at the\n", + "// files actually currently used it's just the new one\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9198c74b-87d5-42b0-9987-587095848282", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the old snapshots but keep the latest one.\n", + "// This produces _so much logging_ by default that running in the NB would be slow (that's why we set the log level to error)\n", + "// Here your going to want to use the expireSnapshots action.\n", + "// Note: if you _just set_ retainLast it will keep all snapshots, retain last is like a safety mechanism that keeps the last K\n", + "// snapshots. To get rid of everything except the last expire everything older than right now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be51d1ca-a105-407f-ac3c-41c0f9258891", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_and_expired_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18825715-ced8-401f-b7b3-9ea682d38757", + "metadata": {}, + "outputs": [], + "source": [ + "// Table is in an inconsistent state here, this is not \"good\" but YOLO\n", + "// spark.sql(\"REFRESH local.uk_gender_pay_data\").show()\n", + "// spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa8644af-5604-4147-8546-f65e749b8253", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1983f2-2fe7-4e43-a78e-40fd1c7577fd", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the orphaned files\n", + "SparkActions.get().deleteOrphanFiles(table).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73b3e2ca-555b-467c-a253-d96aab32e27b", + "metadata": {}, + "outputs": [], + "source": [ + "val cleaned_and_compacted_file_list = \"ls ../warehouse/uk_gender_pay_data/data/\".!!" 
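As with the catalog cells earlier, the maintenance exercises above (time travel, compaction with `rewriteDataFiles`, snapshot expiration) are filled in by the solution notebook that follows. Roughly, assuming the `table` handle and `snapshot` id from the earlier cells:

```scala
import org.apache.iceberg.spark.actions.SparkActions

// Time travel: SQL VERSION AS OF, or the snapshot-id read option.
spark.sql(s"SELECT * FROM local.uk_gender_pay_data VERSION AS OF $snapshot").show()
spark.read.option("snapshot-id", snapshot.toString).table("local.uk_gender_pay_data").show()

// Compaction: rewrite the data files, forcing a rewrite of everything.
SparkActions.get()
  .rewriteDataFiles(table)
  .option("target-file-size-bytes", (512L * 1024L * 1024L).toString)
  .option("rewrite-all", "true")
  .execute()

// Expire everything older than "now"; retainLast(1) is the safety net that
// keeps at least the most recent snapshot around.
SparkActions.get()
  .expireSnapshots(table)
  .expireOlderThan(System.currentTimeMillis())
  .retainLast(1)
  .execute()
```

The template's `deleteOrphanFiles` cell just above then cleans up files left on disk that no retained snapshot references.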
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "921d0c02-1fb7-43ec-ac0a-b5d1c3a40c3d", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8ff0a3-8c6e-4d67-8afb-d1541c7e6dbd", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets go take a look at a quick side-by-side test\n", + "//cd /high-performance-spark-examples/spark-upgrade/;./e2e_demo/scala/run_demo.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d47a57-3bfa-484a-90ed-0231a17a7205", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok, let's try branching! Note: requires very recent Iceberg, so if you're doing this elsewhere might not be a party\n", + "// Relevant docs: https://iceberg.apache.org/docs/nightly/spark-ddl/#branching-and-tagging-ddl\n", + "// https://iceberg.apache.org/docs/nightly/spark-queries/#sql" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala", + "name": "scala2.13" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".sc", + "mimetype": "text/x-scala", + "name": "scala", + "nbconvert_exporter": "script", + "version": "2.13.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/iceberg-workshop-solutions/Workshop.ipynb b/iceberg-workshop-solutions/Workshop.ipynb new file mode 100644 index 00000000..ebd12637 --- /dev/null +++ b/iceberg-workshop-solutions/Workshop.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "34577ad3-822f-4370-bcba-56b9fcec3196", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.spark.sql._\n", + "import scala.sys.process._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d3141ec-7779-467a-9f76-2e51030fd1c7", + "metadata": {}, + "outputs": [], + "source": [ + "// So now we need to configure Spark to use Iceberg\n", + "// See https://iceberg.apache.org/docs/1.6.0/spark-configuration/ & https://iceberg.apache.org/docs/1.6.0/spark-getting-started/\n", + "// We'll use the \"hadoop\" (aka file) catalog & /high-performance-spark-examples/warehouse for the location\n", + "val spark = (SparkSession.builder.master(\"local[*]\")\n", + " // Setup the extensions\n", + " .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", + " .config(\"spark.sql.catalog.local\", \"org.apache.iceberg.spark.SparkCatalog\")\n", + " .config(\"spark.sql.catalog.local.type\", \"hadoop\")\n", + " .config(\"spark.sql.catalog.local.warehouse\", \"/high-performance-spark-examples/warehouse\")\n", + " .getOrCreate()\n", + " )\n", + "import spark._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecbdf4a8-3f16-4242-9d89-0ce7835b49e7", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sparkContext.uiWebUrl.get" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "270730c9-9787-407c-ba22-f0cee1f67f53", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the current data\n", + "val df = spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(\"/high-performance-spark-examples/data/fetched/2021\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87ca6359-86bc-42a4-93dd-4fc64496b145", + "metadata": {}, + "outputs": [], + "source": [ + "// Drop existing table 
if present & create new table\n", + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bdeb3eb-b725-409b-ab3a-409d0e8309ae", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out\n", + "df.write.saveAsTable(\"local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554c6036-0c6b-4e3c-a9e1-7251c608b48f", + "metadata": {}, + "outputs": [], + "source": [ + "\"ls /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb541fbf-4a79-402d-a6b2-e999106e9a18", + "metadata": {}, + "outputs": [], + "source": [ + "\"cat /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/v1.metadata.json\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90149834-27a2-45a3-aa8a-dae2162da854", + "metadata": {}, + "outputs": [], + "source": [ + "// Java SDK time imports\n", + "import java.util.HashMap\n", + "import java.util.Map\n", + "\n", + "import org.apache.iceberg.Table\n", + "import org.apache.iceberg.catalog.TableIdentifier\n", + "import org.apache.iceberg.hadoop.HadoopCatalog\n", + "\n", + "\n", + "// And to handle java types\n", + "import scala.jdk.CollectionConverters._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf56bc6-d420-474c-b3a8-ded03b23eff8", + "metadata": {}, + "outputs": [], + "source": [ + "// Create a local Iceberg Catalog client. Here we're using the \"hadoop catalog\"\n", + "// The spark hadoop conf can be got from: spark.sparkContext.hadoopConfiguration\n", + "// Here we make the Catalog, it's kind of funky. Spark also has methods which return tables but they're Spark tables so\n", + "// which aren't the type we want\n", + "val catalog = new HadoopCatalog(spark.sparkContext.hadoopConfiguration, \"/high-performance-spark-examples/warehouse\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c55dc276-035f-40d4-9a47-bd4698f2519d", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to load the table. To do that we need to make a TableIdentifier of the same table we wrote to. Note it'll just be\n", + "// the table name no need for the \"local\" prefix.\n", + "// See https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/catalog/TableIdentifier.html\n", + "val name = TableIdentifier.of(\"uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ea4b1cc-bd1b-42b4-bdbe-27625b461db9", + "metadata": {}, + "outputs": [], + "source": [ + "val table = catalog.loadTable(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd1c6add-d465-4b81-9c34-6c8f40197ab2", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to get the snapshots from the table. 
There are a few different ways we can do this:\n", + "// 1) Using the Iceberg Table API (see https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html)\n", + "// 2) Using the Iceberg + Spark SQL special query interface https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html\n", + "val snapshots = table.snapshots().asScala.toList\n", + "snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a96986d-b3a5-49ad-aeac-a492bf3fc8e6", + "metadata": {}, + "outputs": [], + "source": [ + "val snapshot = snapshots(0).snapshotId()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c6cb85-ff64-405f-ae6a-7e3c917ac12a", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotQuery = spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\")\n", + "altSnapshotQuery.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f93516ad-3ae9-4bb6-989f-7c127f82143c", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotId = spark.sql(\"SELECT snapshot_id FROM local.uk_gender_pay_data.snapshots\").collect()(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15e67d1b-1e9e-45a0-af94-1c9c79e03d54", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d94eb4db-bf03-49be-865a-e80c0613d526", + "metadata": {}, + "outputs": [], + "source": [ + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4829752b-dc30-49db-93ae-911f1c2743c1", + "metadata": {}, + "outputs": [], + "source": [ + "// And the files!\n", + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f262d890-0818-410a-aec8-2986a04ae16e", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets take a quick look and see\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7369bcc8-a738-48dc-a475-55885d4460cc", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DELETE FROM local.uk_gender_pay_data WHERE isnull(responsibleperson)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d279f3-f2a5-4ddf-a56f-d473b0c28b97", + "metadata": {}, + "outputs": [], + "source": [ + "// Make sure the data is gone\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b6902ef-b742-466d-b4c8-d6830ff67cf4", + "metadata": {}, + "outputs": [], + "source": [ + "// Yay! 
ok now lets travel back in time\n", + "// We can do this with SQL or with a read option\n", + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data VERSION AS OF ${snapshot} WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e899a1-d2cd-4e25-b142-e69fb9ca6774", + "metadata": {}, + "outputs": [], + "source": [ + "// Or the same with option + DF syntax\n", + "spark.read.option(\"snapshot-id\", f\"${snapshot}\").table(\"local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8884a8a2-bbb7-47b1-85f6-744c60612dcb", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f53692b-f14a-4df7-8069-147eca8da0cd", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data_postcode\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8deb38c3-1e64-4eba-ac80-c75d5674258b", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out partitioned\n", + "df.registerTempTable(\"temp_table\")\n", + "// We could use the table write semantics but we can't do truncate() on that\n", + "spark.sql(\"CREATE TABLE local.uk_gender_pay_data_postcode USING iceberg PARTITIONED BY (truncate(1, PostCode)) AS select * from temp_table\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e87e6b08-5c0e-4356-a0ee-7245b7d7790b", + "metadata": {}, + "outputs": [], + "source": [ + "// Inspect the files again. This should look familiar ish\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data_postcode.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71569b2e-7def-42a4-bf3e-69ee9667a41d", + "metadata": {}, + "outputs": [], + "source": [ + "val year_dfs = 2022.to(2023).map(r => spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(s\"/high-performance-spark-examples/data/fetched/${r}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c4441cf-fd65-4a29-94fb-6d3aa927f6b1", + "metadata": {}, + "outputs": [], + "source": [ + "List(\"local.uk_gender_pay_data\", \"local.uk_gender_pay_data_postcode\").foreach(table => year_dfs.foreach(df => df.write.mode(\"append\").saveAsTable(table)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5be0f8c7-2926-4bf6-bc9d-c02a15648e83", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb6c4b7d-f8d1-41f7-b014-c6434bbb6d48", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_metadata_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/metadata/\".!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfb116e4-c66d-4027-80a3-e7de9ad62ee0", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586bdb3c-19f0-4a63-b87f-d181e8c44c06", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22351ea4-8cb7-43c2-b205-4554d0b15aca", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.spark.actions.SparkActions\n", + "// Iceberg actions\n", + "import org.apache.iceberg.actions.Action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928f9da9-d65b-4d53-b818-82a27f8171a2", + "metadata": {}, + "outputs": [], + "source": [ + "// So far the logging has been... verbose but interesting, but the next stages it's actually too much\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "807193d8-8ff5-4a9c-b6ae-510ee0bb2f84", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok let's try and compact things down a little bit.\n", + "// You should look at SparkActions & use the rewrite data files operation.\n", + "// Consider specifying rewrite-all to true to force rewrites\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/spark/actions/SparkActions.html\n", + "SparkActions.get().rewriteDataFiles(table).option(\"target-file-size-bytes\", (512L*1024L*1024L).toString).option(\"rewrite-all\", \"true\").execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e4a013d-1af5-4dd8-82c1-5115905f3feb", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c40db89-7ce1-40ed-a111-1395e5b75a0a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9198c74b-87d5-42b0-9987-587095848282", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the old snapshots but keep the latest one.\n", + "// This produces _so much logging_ by default that running in the NB would be slow (that's why we set the log level to error)\n", + "// Here your going to want to use the expireSnapshots action.\n", + "// Note: if you _just set_ retainLast it will keep all snapshots, retain last is like a safety mechanism that keeps the last K\n", + "// snapshots. To get rid of everything except the last expire everything older than right now.\n", + "SparkActions.get().expireSnapshots(table).expireOlderThan(System.currentTimeMillis()).retainLast(1).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be51d1ca-a105-407f-ac3c-41c0f9258891", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_and_expired_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" 
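
The compaction and snapshot-expiry steps above go through the SparkActions Java API; Iceberg exposes the same maintenance as SQL stored procedures, which is convenient from spark-sql or a pure-SQL scheduler. A minimal sketch against the same table (procedure and option names follow the Iceberg documentation; the 512 MB target size mirrors the action above):

// Compact data files via the rewrite_data_files procedure.
spark.sql("""
  CALL local.system.rewrite_data_files(
    table => 'local.uk_gender_pay_data',
    options => map('target-file-size-bytes', '536870912', 'rewrite-all', 'true'))
""").show()

// Expire everything older than right now, keeping at least the latest snapshot.
val cutoff = new java.sql.Timestamp(System.currentTimeMillis())
spark.sql(s"""
  CALL local.system.expire_snapshots(
    table => 'local.uk_gender_pay_data',
    older_than => TIMESTAMP '$cutoff',
    retain_last => 1)
""").show()
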
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18825715-ced8-401f-b7b3-9ea682d38757", + "metadata": {}, + "outputs": [], + "source": [ + "// Table is in an inconsistent state here, this is not \"good\"\n", + "spark.sql(\"REFRESH local.uk_gender_pay_data\").show()\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa8644af-5604-4147-8546-f65e749b8253", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1983f2-2fe7-4e43-a78e-40fd1c7577fd", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the orphaned files\n", + "SparkActions.get().deleteOrphanFiles(table).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73b3e2ca-555b-467c-a253-d96aab32e27b", + "metadata": {}, + "outputs": [], + "source": [ + "val cleaned_and_compacted_file_list = \"ls ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "921d0c02-1fb7-43ec-ac0a-b5d1c3a40c3d", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8ff0a3-8c6e-4d67-8afb-d1541c7e6dbd", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets go take a look at a quick side-by-side test\n", + "//cd /high-performance-spark-examples/spark-upgrade/;./e2e_demo/scala/run_demo.sh\n", + "//That'll be easier to run in a terminal than the .!! trick we've been doing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d47a57-3bfa-484a-90ed-0231a17a7205", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok, let's try branching! Note: requires very recent Iceberg, so if you're doing this elsewhere might not be a party\n", + "// Relevant docs: https://iceberg.apache.org/docs/nightly/spark-ddl/#branching-and-tagging-ddl\n", + "// https://iceberg.apache.org/docs/nightly/spark-queries/#sql\n", + "spark.sql(\"ALTER TABLE local.uk_gender_pay_data CREATE BRANCH IF NOT EXISTS `new-software-branch`\")\n", + "spark.sql(\"DELETE FROM local.uk_gender_pay_data.`branch_new-software-branch` WHERE isnull(DueDate)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "128591e9-fc12-4791-8797-901ce2f1c6b7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala", + "name": "scala2.13" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".sc", + "mimetype": "text/x-scala", + "name": "scala", + "nbconvert_exporter": "script", + "version": "2.13.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/migration/sql.sh b/migration/sql.sh new file mode 100644 index 00000000..3d94f07e --- /dev/null +++ b/migration/sql.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +pip install sqlfluff +python -m pip install 'sqlfluff-plugin-sparksql-upgrade @ git+https://github.com/holdenk/spark-upgrade#subdirectory=sql' + +sqlfluff rules |grep -i spark +sqlfluff fix --dialect sparksql farts.sql diff --git a/misc/container_launch.sh b/misc/container_launch.sh new file mode 100755 index 00000000..31f0edbb --- /dev/null +++ b/misc/container_launch.sh @@ -0,0 +1,5 @@ +#!/bin/bash +if [ ! 
-f /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb ]; then + cp /high-performance-spark-examples/iceberg-workshop-solutions/Workshop-Template.ipynb /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb +fi +jupyter-lab --ip 0.0.0.0 --port 8877 diff --git a/misc/kernel.json b/misc/kernel.json new file mode 100644 index 00000000..5812f16a --- /dev/null +++ b/misc/kernel.json @@ -0,0 +1,19 @@ +{ + "argv": [ + "java", + "-cp", + "/home/dev/.local/share/jupyter/kernels/scala2.13/launcher.jar:.:/high-performance-spark-examples/:/high-performance-spark-examples/target/scala-2.13/home/dev/.local/share/jupyter/kernels/scala/launcher.jar:/high-performance-spark-examples/spark-3.5.2-bin-hadoop3-scala2.13/jars/*", + "coursier.bootstrap.launcher.Launcher", + "--log", + "info", + "--metabrowse", + "--id", + "scala2.13", + "--display-name", + "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "--connection-file", + "{connection_file}" + ], + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala" +} diff --git a/native/src/CMakeLists.txt b/native/src/CMakeLists.txt new file mode 100644 index 00000000..e9766458 --- /dev/null +++ b/native/src/CMakeLists.txt @@ -0,0 +1,71 @@ +################################################################ +# A minimal CMake file that is compatible with sbt-jni # +# # +# All settings required by sbt-jni have been marked so, please # +# add/modify/remove settings to build your specific library. # +################################################################ + +cmake_minimum_required(VERSION 3.12) + +option(SBT "Set if invoked from sbt-jni" OFF) + +# Define project and related variables +# (required by sbt-jni) please use semantic versioning +# +project (high-performance-spark) +enable_language(Fortran) +set(PROJECT_VERSION_MAJOR 0) +set(PROJECT_VERSION_MINOR 0) +set(PROJECT_VERSION_PATCH 0) + +set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) + +#tag::velox[] +set (GLUTEN_LIB_NAME ${PROJECT_NAME}-gluten-${PROJECT_VERSION_MAJOR}) +# For gluten+velox, you can leave out if not using gluten +set(GLUTEN_HOME ../../gluten) +set(CMAKE_FIND_DEBUG_MODE TRUE) +find_library(VELOX_LIBRARY NAMES velox HINTS + ${GLUTEN_HOME}/cpp/build/releases NO_DEFAULT_PATH) +# End gluten specific + +if(VELOX_LIBRARY) + file(GLOB GLUTEN_UDF_FILES + "./c/gluten/*.cpp") + add_library(${GLUTEN_LIB_NAME} SHARED ${GLUTEN_UDF_FILES}) + target_include_directories(${GLUTEN_LIB_NAME} PRIVATE ${GLUTEN_HOME}/cpp ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) + target_link_libraries(${GLUTEN_LIB_NAME} PRIVATE ${VELOX_LIBRARY}) +else() + message(WARNING "Velox library not found. Specific path not added.") +endif() +#end::velox[] + +# Setup JNI +find_package(JNI REQUIRED) +if (JNI_FOUND) + message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") +endif() + +# Include directories +include_directories(.) +include_directories(include) +include_directories(${JNI_INCLUDE_DIRS}) + +# Sources +file(GLOB LIB_SRC + "*.c" + "*.f95" + "*.f*" + "*.cc" + "*.cpp" + "./c/*.c" + "./c/*.cpp" + "./fortran/*.f95" + "./fortran/*.f*" +) + +# Setup installation targets +# (required by sbt-jni) major version should always be appended to library name +# +add_library(${LIB_NAME} SHARED ${LIB_SRC}) +install(TARGETS ${LIB_NAME} LIBRARY DESTINATION .) 
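
The CMake file above builds the JNI shared library that sbt-jni packages; the Scala half is a class with native method declarations whose generated JNI header the C sources have to match (the renamed com_highperformancespark_examples_ffi_SumJNI.h). A minimal, hypothetical sketch of such a wrapper — the repo's real SumJNI signature and sbt-jni loader annotation may differ from what is shown here:

package com.highperformancespark.examples.ffi

// Hypothetical wrapper for illustration only; the repo's actual entry point is
// SumJNI, and sbt-jni's loader normally extracts and loads the library from the
// jar. Plain System.loadLibrary is used here instead, which expects
// libhigh-performance-spark0.so on java.library.path (the name comes from
// ${PROJECT_NAME}${PROJECT_VERSION_MAJOR} in the CMakeLists.txt above).
object NativeSumSketch {
  System.loadLibrary("high-performance-spark0")

  // Declared native; the JNI header generated from this declaration is what
  // the C implementation (e.g. a sum_wrapper.c-style stub) has to match.
  @native def sum(values: Array[Int]): Int
}
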
diff --git a/native/src/c/gluten/GlutenUDF.cpp b/native/src/c/gluten/GlutenUDF.cpp new file mode 100644 index 00000000..14019f4a --- /dev/null +++ b/native/src/c/gluten/GlutenUDF.cpp @@ -0,0 +1,82 @@ +// Filename MyUDF.cpp + +#include +#include +#include + + +namespace { +using namespace facebook::velox; + +template +class PlusConstantFunction : public exec::VectorFunction { + public: + explicit PlusConstantFunction(int32_t addition) : addition_(addition) {} + + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& /* outputType */, + exec::EvalCtx& context, + VectorPtr& result) const override { + using nativeType = typename TypeTraits::NativeType; + VELOX_CHECK_EQ(args.size(), 1); + + auto& arg = args[0]; + + // The argument may be flat or constant. + VELOX_CHECK(arg->isFlatEncoding() || arg->isConstantEncoding()); + + BaseVector::ensureWritable(rows, createScalarType(), context.pool(), result); + + auto* flatResult = result->asFlatVector(); + auto* rawResult = flatResult->mutableRawValues(); + + flatResult->clearNulls(rows); + + if (arg->isConstantEncoding()) { + auto value = arg->as>()->valueAt(0); + rows.applyToSelected([&](auto row) { rawResult[row] = value + addition_; }); + } else { + auto* rawInput = arg->as>()->rawValues(); + + rows.applyToSelected([&](auto row) { rawResult[row] = rawInput[row] + addition_; }); + } + } + + private: + const int32_t addition_; +}; + +static std::vector> integerSignatures() { + // integer -> integer + return {exec::FunctionSignatureBuilder().returnType("integer").argumentType("integer").build()}; +} + +static std::vector> bigintSignatures() { + // bigint -> bigint + return {exec::FunctionSignatureBuilder().returnType("bigint").argumentType("bigint").build()}; +} + +} // namespace + +const int kNumMyUdf = 2; +gluten::UdfEntry myUdf[kNumMyUdf] = {{"myudf1", "integer"}, {"myudf2", "bigint"}}; + +DEFINE_GET_NUM_UDF { + return kNumMyUdf; +} + +DEFINE_GET_UDF_ENTRIES { + for (auto i = 0; i < kNumMyUdf; ++i) { + udfEntries[i] = myUdf[i]; + } +} + +DEFINE_REGISTER_UDF { + facebook::velox::exec::registerVectorFunction( + "myudf1", integerSignatures(), std::make_unique>(5)); + facebook::velox::exec::registerVectorFunction( + "myudf2", bigintSignatures(), std::make_unique>(5)); + std::cout << "registered myudf1, myudf2" << std::endl; +} diff --git a/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h b/native/src/c/include/com_highperformancespark_examples_ffi_SumJNI.h similarity index 100% rename from src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h rename to native/src/c/include/com_highperformancespark_examples_ffi_SumJNI.h diff --git a/src/main/c/sum.c b/native/src/c/sum.c similarity index 100% rename from src/main/c/sum.c rename to native/src/c/sum.c diff --git a/src/main/c/sum.h b/native/src/c/sum.h similarity index 100% rename from src/main/c/sum.h rename to native/src/c/sum.h diff --git a/src/main/c/sum_wrapper.c b/native/src/c/sum_wrapper.c similarity index 100% rename from src/main/c/sum_wrapper.c rename to native/src/c/sum_wrapper.c diff --git a/src/main/c/sumf_wrapper.c b/native/src/c/sumf_wrapper.c similarity index 100% rename from src/main/c/sumf_wrapper.c rename to native/src/c/sumf_wrapper.c diff --git a/src/main/fortran/sumf.f95 b/native/src/fortran/sumf.f95 similarity index 100% rename from src/main/fortran/sumf.f95 rename to native/src/fortran/sumf.f95 diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 00000000..04267b14 --- /dev/null +++ 
b/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.9.9 diff --git a/project/plugins.sbt b/project/plugins.sbt index 26c430ed..8cfbf42a 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,22 +1,27 @@ -addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") +addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/" -//tag::addSparkPackagesPlugin[] -resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.2") -addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.5") -//end::addSparkPackagesPlugin[] +addDependencyTreePlugin -//addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") - -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.0") +//tag::scalaFix[] +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.12.1") +//end::scalaFix[] //tag::sbtJNIPlugin[] -addSbtPlugin("ch.jodersky" %% "sbt-jni" % "1.0.0-RC3") +addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.7.0") //end::sbtJNIPlugin[] -addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") +//tag::xmlVersionConflict[] +// See https://github.com/scala/bug/issues/12632 +ThisBuild / libraryDependencySchemes ++= Seq( + "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always +) +//end::xmlVersionConflict[] + +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0") diff --git a/python/.flake8 b/python/.flake8 new file mode 100644 index 00000000..79a16af7 --- /dev/null +++ b/python/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 \ No newline at end of file diff --git a/python/README.md b/python/README.md new file mode 100644 index 00000000..3cf58309 --- /dev/null +++ b/python/README.md @@ -0,0 +1 @@ +Python examples for High Performance Spark diff --git a/high_performance_pyspark/SQLLineage.py b/python/examples/SQLLineage.py similarity index 58% rename from high_performance_pyspark/SQLLineage.py rename to python/examples/SQLLineage.py index 121f0b40..26bd0c4a 100644 --- a/high_performance_pyspark/SQLLineage.py +++ b/python/examples/SQLLineage.py @@ -1,3 +1,13 @@ +from pyspark.sql import DataFrame, Row +from pyspark.sql.session import SparkSession +import sys + +global df +global sc +global rdd +global spark + + """ >>> df = rdd.toDF() >>> df2 = cutLineage(df) @@ -7,20 +17,12 @@ True """ -global df -global sc -global rdd -global spark - -from pyspark.context import SparkContext -from pyspark.sql import DataFrame, Row -from pyspark.sql.session import SparkSession # tag::cutLineage[] def cutLineage(df): """ Cut the lineage of a DataFrame - used for iterative algorithms - + .. 
Note: This uses internal members and may break between versions >>> df = rdd.toDF() >>> cutDf = cutLineage(df) @@ -30,43 +32,48 @@ def cutLineage(df): jRDD = df._jdf.toJavaRDD() jSchema = df._jdf.schema() jRDD.cache() - sqlCtx = df.sql_ctx - try: - javaSqlCtx = sqlCtx._jsqlContext - except: - javaSqlCtx = sqlCtx._ssql_ctx - newJavaDF = javaSqlCtx.createDataFrame(jRDD, jSchema) - newDF = DataFrame(newJavaDF, sqlCtx) + session = df.sparkSession + javaSparkSession = session._jsparkSession + newJavaDF = javaSparkSession.createDataFrame(jRDD, jSchema) + newDF = DataFrame(newJavaDF, session) return newDF + + # end::cutLineage[] + def _setupTest(): globs = globals() - spark = SparkSession.builder \ - .master("local[4]") \ - .getOrCreate() + spark = SparkSession.builder.master("local[4]").getOrCreate() sc = spark._sc sc.setLogLevel("ERROR") - globs['sc'] = sc - globs['spark'] = spark - globs['rdd'] = rdd = sc.parallelize( - [Row(field1=1, field2="row1"), - Row(field1=2, field2="row2"), - Row(field1=3, field2="row3")]) + globs["sc"] = sc + globs["spark"] = spark + globs["rdd"] = sc.parallelize( + [ + Row(field1=1, field2="row1"), + Row(field1=2, field2="row2"), + Row(field1=3, field2="row3"), + ] + ) return globs + def _test(): """ Run the tests. """ import doctest + globs = _setupTest() - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS + ) + globs["sc"].stop() if failure_count: exit(-1) -import sys + if __name__ == "__main__": _test() # Hack to support running in nose diff --git a/python/examples/__init__.py b/python/examples/__init__.py new file mode 100644 index 00000000..80db2c40 --- /dev/null +++ b/python/examples/__init__.py @@ -0,0 +1 @@ +__version__ = 0.2 diff --git a/high_performance_pyspark/bad_pyspark.py b/python/examples/bad_pyspark.py similarity index 81% rename from high_performance_pyspark/bad_pyspark.py rename to python/examples/bad_pyspark.py index 46741dc9..083fbdd6 100644 --- a/high_performance_pyspark/bad_pyspark.py +++ b/python/examples/bad_pyspark.py @@ -1,10 +1,11 @@ # This script triggers a number of different PySpark errors -from pyspark import * from pyspark.sql.session import SparkSession +import sys global sc + def nonExistentInput(sc): """ Attempt to load non existent input @@ -18,6 +19,7 @@ def nonExistentInput(sc): failedRdd.count() # end::nonExistent[] + def throwOuter(sc): """ Attempt to load non existant input @@ -33,6 +35,7 @@ def throwOuter(sc): transform2.count() # end::throwOuter[] + def throwInner(sc): """ Attempt to load non existant input @@ -48,6 +51,7 @@ def throwInner(sc): transform2.count() # end::throwInner[] + # tag::rewrite[] def add1(x): """ @@ -57,6 +61,7 @@ def add1(x): """ return x + 1 + def divZero(x): """ Divide by zero (cause an error) @@ -67,6 +72,7 @@ def divZero(x): """ return x / 0 + def throwOuter2(sc): """ Attempt to load non existant input @@ -80,6 +86,7 @@ def throwOuter2(sc): transform2 = transform1.map(divZero) transform2.count() + def throwInner2(sc): """ Attempt to load non existant input @@ -92,8 +99,11 @@ def throwInner2(sc): transform1 = data.map(divZero) transform2 = transform1.map(add1) transform2.count() + + # end::rewrite[] + def throwInner3(sc): """ Attempt to load non existant input @@ -102,14 +112,17 @@ def throwInner3(sc): """ data = sc.parallelize(range(10)) rejectedCount = sc.accumulator(0) + def loggedDivZero(x): import logging + try: return [x / 0] except Exception 
as e: rejectedCount.add(1) logging.warning("Error found " + repr(e)) return [] + transform1 = data.flatMap(loggedDivZero) transform2 = transform1.map(add1) transform2.count() @@ -118,45 +131,51 @@ def loggedDivZero(x): def runOutOfMemory(sc): """ - Run out of memory on the workers. - In standalone modes results in a memory error, but in YARN may trigger YARN container - overhead errors. - >>> runOutOfMemory(sc) + Run out of memory on the workers from a skewed shuffle. + >>> runOutOfMemory(sc) # doctest: +SKIP Traceback (most recent call last): ... Py4JJavaError:... """ # tag::worker_oom[] - data = sc.parallelize(range(10)) - def generate_too_much(itr): - return range(10000000000000) - itr = data.flatMap(generate_too_much) - itr.count() + data = sc.parallelize(range(10000)) + + def generate_too_much(i: int): + return list(map(lambda v: (i % 2, v), range(100000 * i))) + + bad = data.flatMap(generate_too_much).groupByKey() + bad.count() # end::worker_oom[] + def _setupTest(): globs = globals() - spark = SparkSession.builder \ - .master("local[4]") \ - .getOrCreate() + spark = SparkSession.builder.master("local[4]").getOrCreate() sc = spark._sc - globs['sc'] = sc + globs["sc"] = sc return globs - + + def _test(): """ - Run the tests. - Note this will print a lot of error message to stderr since we don't capture the JVM sub process - stdout/stderr for doctests. + Run the tests. + Note this will print a lot of error message to stderr since we don't + capture the JVM sub process stdout/stderr for doctests. """ import doctest - globs = setupTest() - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() + + globs = _setupTest() + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS + ) + print("All tests done, stopping Spark context.") + globs["sc"].stop() if failure_count: exit(-1) + else: + exit(0) + -import sys if __name__ == "__main__": _test() # Hack to support running in nose diff --git a/python/examples/bad_pyspark.py.fail b/python/examples/bad_pyspark.py.fail new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/dual_write.py b/python/examples/dual_write.py new file mode 100644 index 00000000..94f27157 --- /dev/null +++ b/python/examples/dual_write.py @@ -0,0 +1,22 @@ +import asyncactions # noqa # pylint: disable=unused-import + + +class DualWriteExample: + def do_write(self, df, p1, p2): + """ + Apply two concrete actions to a DataFrame in parallel. + A common use case is two views of the same data, normally + one with sensitive data and one scrubbed/clean. + """ + # First we "persist" it (you can also checkpoint or choose a different + # level of persistence. + df.persist() + df.count() + # Create the distinct "safe" view. + df1 = df.select("times") + # Start the async actions + async1 = df1.write.mode("append").format("parquet").saveAsync(p1) + async2 = df.write.mode("append").format("parquet").saveAsync(p2) + # Block until the writes are both finished. 
+ async1.result() + async2.result() diff --git a/python/examples/load_previous_run_data.py b/python/examples/load_previous_run_data.py new file mode 100644 index 00000000..d9277682 --- /dev/null +++ b/python/examples/load_previous_run_data.py @@ -0,0 +1,31 @@ +import os +import tempfile + + +class LoadPreviousRunData(object): + def __init__(self, session): + self.session = session + + def find_oldest_id(self, local_path): + """Find the oldest Spark job since it's probably not being updated.""" + directories = os.listdir(local_path) + return min(directories, key=lambda x: os.path.getmtime(f"{local_path}/{x}")) + + def do_magic(self): + local_path = "/tmp/spark-events" + event_log_path = f"file://{local_path}" + application_id = self.find_oldest_id(local_path) + return self.load_json_records(event_log_path, application_id) + + # tag::load[] + def load_json_records(self, event_log_path, application_id): + print(f"Loading {application_id}") + full_log_path = f"{event_log_path}/{application_id}" + df = self.session.read.json(full_log_path) + special_events = df.filter( + (df["Event"] == "SparkListenerExecutorAdded") + | (df["Event"] == "SparkListenerJobEnd") + ) + special_events.show() + + # end::load[] diff --git a/python/examples/pandera_ex.py b/python/examples/pandera_ex.py new file mode 100644 index 00000000..f3afa7c9 --- /dev/null +++ b/python/examples/pandera_ex.py @@ -0,0 +1,52 @@ +from pyspark.sql.session import SparkSession + +# tag::pandera_imports[] +import pandera.pyspark as pa +import pyspark.sql.types as T + +# end::pandera_imports[] + + +# tag::simple_data_schema[] +class ProjectDataSchema(pa.DataFrameModel): + # Note str_length is currently broken :/ + creator: T.StringType() = pa.Field(str_length={"min_value": 1}) + projectname: T.StringType() = pa.Field() + stars: T.IntegerType() = pa.Field(ge=0) + + +# end::simple_data_schema[] + + +# tag::gender_data[] +class GenderData(pa.DataFrameModel): + MaleBonusPercent: T.DoubleType() = pa.Field(nullable=True, le=5) + FemaleBonusPercent: T.DoubleType() = pa.Field(nullable=True) + CompanyNumber: T.IntegerType() = pa.Field() + + +# end::gender_data[] + +if __name__ == "__main__": + spark = SparkSession.builder.master("local[4]").getOrCreate() + # Make sure to make + # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" + # available as ./data/2021 + uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + + # tag::validate_gender_data[] + validated_df = GenderData(uk_df) + # Print out the errors. You may wish to exit with an error condition. + if validated_df.pandera.errors != {}: + print(validated_df.pandera.errors) + # sys.exit(1) + # end::validate_gender_data[] + + # tag::validate_project_data[] + project_data = spark.read.csv("./data/project.csv", header=True, inferSchema=True) + validated_df = ProjectDataSchema(project_data) + # Print out the errors. You may wish to exit with an error condition. 
+ if validated_df.pandera.errors != {}: + print(validated_df.pandera.errors) + # sys.exit(1) + # end::validate_project_data[] diff --git a/high_performance_pyspark/simple_perf.py b/python/examples/simple_perf.py similarity index 61% rename from high_performance_pyspark/simple_perf.py rename to python/examples/simple_perf.py index 773ad3e0..1c725255 100644 --- a/high_performance_pyspark/simple_perf.py +++ b/python/examples/simple_perf.py @@ -1,14 +1,19 @@ # When running this example make sure to include the built Scala jar : -# $SPARK_HOME/bin/pyspark --jars ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar +# +# $SPARK_HOME/bin/pyspark --jars \ +# ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar +# # This example illustrates how to interface Scala and Python code, but caution # should be taken as it depends on many private members that may change in # future releases of Spark. -from pyspark.sql.types import * -from pyspark.sql import * +from pyspark.sql.types import StructType, IntegerType, DoubleType, StructField +from pyspark.sql import DataFrame, SparkSession +import sys import timeit import time + def generate_scale_data(sqlCtx, rows, numCols): """ Generate scale data for the performance test. @@ -28,14 +33,7 @@ def generate_scale_data(sqlCtx, rows, numCols): """ # tag::javaInterop[] sc = sqlCtx._sc - # Get the SQL Context, 2.1, 2.0 and pre-2.0 syntax - yay internals :p - try: - try: - javaSqlCtx = sqlCtx._jsqlContext - except: - javaSqlCtx = sqlCtx._ssql_ctx - except: - javaSqlCtx = sqlCtx._jwrapped + javaSparkSession = sqlCtx._jsparkSession jsc = sc._jsc scalasc = jsc.sc() gateway = sc._gateway @@ -45,21 +43,17 @@ def generate_scale_data(sqlCtx, rows, numCols): # This returns a Java RDD of Rows - normally it would better to # return a DataFrame directly, but for illustration we will work # with an RDD of Rows. - java_rdd = (gateway.jvm.com.highperformancespark.examples. - tools.GenerateScalingData. - generateMiniScaleRows(scalasc, rows, numCols)) + java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData.generateMiniScaleRows( + scalasc, rows, numCols + ) # Schemas are serialized to JSON and sent back and forth # Construct a Python Schema and turn it into a Java Schema - schema = StructType([ - StructField("zip", IntegerType()), - StructField("fuzzyness", DoubleType())]) - # 2.1 / pre-2.1 - try: - jschema = javaSqlCtx.parseDataType(schema.json()) - except: - jschema = sqlCtx._jsparkSession.parseDataType(schema.json()) + schema = StructType( + [StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())] + ) + jschema = javaSparkSession.parseDataType(schema.json()) # Convert the Java RDD to Java DataFrame - java_dataframe = javaSqlCtx.createDataFrame(java_rdd, jschema) + java_dataframe = javaSparkSession.createDataFrame(java_rdd, jschema) # Wrap the Java DataFrame into a Python DataFrame python_dataframe = DataFrame(java_dataframe, sqlCtx) # Convert the Python DataFrame into an RDD @@ -67,19 +61,25 @@ def generate_scale_data(sqlCtx, rows, numCols): return (python_dataframe, pairRDD) # end::javaInterop[] + def runOnDF(df): result = df.groupBy("zip").avg("fuzzyness").count() return result + def runOnRDD(rdd): - result = rdd.map(lambda (x, y): (x, (y, 1))). \ - reduceByKey(lambda x, y: (x[0] + y [0], x[1] + y[1])). 
\ - count() + result = ( + rdd.map(lambda xy: (xy[0], (xy[1], 1))) + .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) + .count() + ) return result + def groupOnRDD(rdd): return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count() + def run(sc, sqlCtx, scalingFactor, size): """ Run the simple perf test printing the results to stdout. @@ -98,17 +98,30 @@ def run(sc, sqlCtx, scalingFactor, size): """ (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size) input_rdd.cache().count() - rddTimeings = timeit.repeat(stmt=lambda: runOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') - groupTimeings = timeit.repeat(stmt=lambda: groupOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + rddTimeings = timeit.repeat( + stmt=lambda: runOnRDD(input_rdd), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) + groupTimeings = timeit.repeat( + stmt=lambda: groupOnRDD(input_rdd), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) input_df.cache().count() - dfTimeings = timeit.repeat(stmt=lambda: runOnDF(input_df), repeat=10, number=1, timer=time.time, setup='gc.enable()') - print "RDD:" - print rddTimeings - print "group:" - print groupTimeings - print "df:" - print dfTimeings - print "yay" + dfTimeings = timeit.repeat( + stmt=lambda: runOnDF(input_df), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) + print(f"RDD: {rddTimeings}, group: {groupTimeings}, df: {dfTimeings}") + def parseArgs(args): """ @@ -123,15 +136,15 @@ def parseArgs(args): if __name__ == "__main__": - """ Usage: simple_perf_test scalingFactor size """ - import sys - from pyspark import SparkContext - from pyspark.sql import SQLContext - (scalingFactor, size) = parseArgs(sys.argv) - session = SparkSession.appName("SimplePythonPerf").builder.getOrCreate() + + scalingFactor = 1 + size = 1 + if len(sys.argv) > 2: + (scalingFactor, size) = parseArgs(sys.argv) + session = SparkSession.builder.appName("SimplePythonPerf").getOrCreate() sc = session._sc run(sc, session, scalingFactor, size) diff --git a/python/examples/spark_expectations_example.py b/python/examples/spark_expectations_example.py new file mode 100644 index 00000000..003bb158 --- /dev/null +++ b/python/examples/spark_expectations_example.py @@ -0,0 +1,111 @@ +from pyspark import SparkFiles +from pyspark.sql import * +from spark_expectations.core.expectations import ( + SparkExpectations, + WrappedDataFrameWriter, +) + +spark = SparkSession.builder.master("local[4]").getOrCreate() +sc = spark.sparkContext +sc.setLogLevel("ERROR") + +# tag::global_setup[] +se_conf = { + "se_notifications_enable_email": False, + "se_notifications_email_smtp_host": "mailhost.example.com", + "se_notifications_email_smtp_port": 25, + "se_notifications_email_from": "timbit@example.com", + "se_notifications_email_subject": "spark expectations - data quality - notifications", + "se_notifications_on_fail": True, + "se_notifications_on_error_drop_exceeds_threshold_breach": True, + "se_notifications_on_error_drop_threshold": 15, +} +# end::global_setup[] + + +# tag::setup_and_load[] +from spark_expectations.config.user_config import Constants as user_config + +spark.sql("DROP TABLE IF EXISTS local.magic_validation") +spark.sql( + """ +create table local.magic_validation ( + product_id STRING, + table_name STRING, + rule_type STRING, + rule STRING, + column_name STRING, + expectation STRING, + action_if_failed STRING, + tag STRING, + description STRING, + 
enable_for_source_dq_validation BOOLEAN, + enable_for_target_dq_validation BOOLEAN, + is_active BOOLEAN, + enable_error_drop_alert BOOLEAN, + error_drop_threshold INT +)""" +) +# Reminder: addFile does not handle directories well. +rule_file = "spark_expectations_sample_rules.json" +sc.addFile(rule_file) +df = spark.read.json(SparkFiles.get(rule_file)) +print(df) +df.write.option("byname", "true").mode("append").saveAsTable("local.magic_validation") +spark.read.table("local.magic_validation").show() + +# Can be used to point to your desired metastore. +se_writer = WrappedDataFrameWriter().mode("append").format("iceberg") + +rule_df = spark.sql("select * from local.magic_validation") + +se: SparkExpectations = SparkExpectations( + rules_df=rule_df, # See if we can replace this with the DF we wrote out. + product_id="pay", # We will only apply rules matching this product id + stats_table="local.dq_stats", + stats_table_writer=se_writer, + target_and_error_table_writer=se_writer, + stats_streaming_options={user_config.se_enable_streaming: False}, +) +# end::setup_and_load[] +rule_df.show(truncate=200) + + +# tag::run_validation_row[] +@se.with_expectations( + user_conf=se_conf, + write_to_table=False, # If set to true SE will write to the target table. + target_and_error_table_writer=se_writer, + # target_table is used to create the error table (e.g. here local.fake_table_name_error) + # and filter the rules on top of the global product filter. + target_table="local.fake_table_name", +) +def load_data(): + raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") + return uk_df + + +# data = load_data() +# end::run_validation_row[] + + +# tag::run_validation_complex[] +@se.with_expectations( + user_conf=se_conf, + write_to_table=True, # If set to true SE will write to the target table. + target_and_error_table_writer=se_writer, + # target_table is used to create the error table (e.g. here local.fake_table_name_error) + # and filter the rules on top of the global product filter. 
+ target_table="local.3rd_fake", +) +def load_data2(): + raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") + return uk_df + + +data = load_data2() +# end::run_validation_complex[] + +spark.sql("SELECT * FROM local.3rd_fake_error").show(truncate=300) diff --git a/python/examples/spark_expectations_example.py.fail b/python/examples/spark_expectations_example.py.fail new file mode 100644 index 00000000..e69de29b diff --git a/python/examples/test_dual_write.py b/python/examples/test_dual_write.py new file mode 100644 index 00000000..e68eb2b1 --- /dev/null +++ b/python/examples/test_dual_write.py @@ -0,0 +1,27 @@ +import os +import tempfile + +# tag::test[] +from sparktestingbase.sqltestcase import SQLTestCase +from pyspark.sql.functions import current_timestamp +from pyspark.sql.types import Row +from .dual_write import DualWriteExample + + +class DualWriteTest(SQLTestCase): + def test_always_passes(self): + self.assertTrue(True) + + def test_actual_dual_write(self): + tempdir = tempfile.mkdtemp() + p1 = os.path.join(tempdir, "data1") + p2 = os.path.join(tempdir, "data2") + df = self.sqlCtx.createDataFrame([Row("timbit"), Row("farted")], ["names"]) + combined = df.withColumn("times", current_timestamp()) + DualWriteExample().do_write(combined, p1, p2) + df1 = self.sqlCtx.read.format("parquet").load(p1) + df2 = self.sqlCtx.read.format("parquet").load(p2) + self.assertDataFrameEqual(df2.select("times"), df1, 0.1) + + +# end::test[] diff --git a/python/examples/test_dual_write_new.py b/python/examples/test_dual_write_new.py new file mode 100644 index 00000000..e8b6df52 --- /dev/null +++ b/python/examples/test_dual_write_new.py @@ -0,0 +1,39 @@ +import os +import tempfile + +# tag::test[] +import unittest +from pyspark.sql import SparkSession +from pyspark.sql.functions import current_timestamp +from pyspark.sql.types import Row +from pyspark.testing.utils import assertDataFrameEqual +from .dual_write import DualWriteExample + + +class DualWriteTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.spark = SparkSession.builder.appName( + "Testing PySpark Example" + ).getOrCreate() + + @classmethod + def tearDownClass(cls): + cls.spark.stop() + + def test_always_passes(self): + self.assertTrue(True) + + def test_actual_dual_write(self): + tempdir = tempfile.mkdtemp() + p1 = os.path.join(tempdir, "data1") + p2 = os.path.join(tempdir, "data2") + df = self.spark.createDataFrame([Row("timbit"), Row("farted")], ["names"]) + combined = df.withColumn("times", current_timestamp()) + DualWriteExample().do_write(combined, p1, p2) + df1 = self.spark.read.format("parquet").load(p1) + df2 = self.spark.read.format("parquet").load(p2) + assertDataFrameEqual(df2.select("times"), df1, 0.1) + + +# end::test[] diff --git a/python/examples/test_load_previous_run_data.py b/python/examples/test_load_previous_run_data.py new file mode 100644 index 00000000..1f0ca313 --- /dev/null +++ b/python/examples/test_load_previous_run_data.py @@ -0,0 +1,15 @@ +from pyspark.sql.session import SparkSession +import os +import tempfile + +from sparktestingbase.sqltestcase import SQLTestCase +from .load_previous_run_data import LoadPreviousRunData + + +class TestLoadPreviousRunData(SQLTestCase): + def test_do_magic(self): + lprd = LoadPreviousRunData(self.session) + try: + lprd.do_magic() + except FileNotFoundError: + print("No previous jobs") diff --git a/python/examples/udf.py b/python/examples/udf.py new file 
mode 100644 index 00000000..f0d6a605 --- /dev/null +++ b/python/examples/udf.py @@ -0,0 +1,73 @@ +# This script triggers a number of different PySpark errors + +from pyspark.sql.session import SparkSession +from pyspark.sql.functions import pandas_udf, udf +from typing import Iterator +import sys +import pandas as pd + +global sc + + +# We need the session before we can use @udf +spark = SparkSession.builder.master("local[4]").getOrCreate() + + +# tag::simple_udf[] +@udf("long") +def classic_add1(e: int) -> int: + return e + 1 + + +# end::simple_udf[] + + +# tag::agg_new_udf[] +@pandas_udf("long") +def pandas_sum(s: pd.Series) -> int: + return s.sum() + + +# end::agg_new_udf[] + + +# tag::new_udf[] +@pandas_udf("long") +def pandas_add1(s: pd.Series) -> pd.Series: + # Vectorized operation on all of the elems in series at once + return s + 1 + + +# end::new_udf[] + + +# tag::complex_udf[] +@pandas_udf("long") +def pandas_nested_add1(d: pd.DataFrame) -> pd.Series: + # Takes a struct and returns the age elem + 1, if we wanted + # to update (e.g. return struct) we could update d and return it instead. + return d["age"] + 1 + + +# end::complex_udf[] + + +# tag::batches_of_batches_udf[] +@pandas_udf("long") +def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]: + my_db_connection = None # Expensive setup logic goes here + for s in t: + # Do something with your setup logic + if my_db_connection is None: + # Vectorized operation on all of the elems in series at once + yield s + 1 + + +# end::batches_of_batches_udf[] + + +if __name__ == "__main__": + # Make sure to make + # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" + # available as ./data/2021 + uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 00000000..38b11847 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,7 @@ +[build-system] +requires = ["setuptools >= 58.0"] +build-backend = "setuptools.build_meta" + +[[tool.mypy.overrides]] +module = "examples" +ignore_missing_imports = true diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 00000000..6654dc9d --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,11 @@ +spark-testing-base +pandas +pyarrow +pyspark==3.5.0 +pyspark-asyncactions +pandera +pandera[pyspark] +spark-expectations>=1.0 +venv-pack +requests +numpy<2.0 diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 00000000..64c8931c --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,39 @@ +[metadata] +name = examples +version = attr: examples.__version__ +author = Holden and Anya +author_email = your@email.address +url = https://github.com/high-performance-spark/high-performance-spark-examples +description = Python Examples for High Performance Spark +long_description = file: README.md +long_description_content_type = text/markdown +keywords = example, setuptools, pyspark +license = BSD 3-Clause License +classifiers = + License :: OSI Approved :: BSD License + Programming Language :: Python :: 3 + +[options] +packages = find: +zip_safe = True +include_package_data = True +install_requires = + pandas >= 1.4.1 + PyYAML >= 6.0 + typer + mypy + pyspark + pyspark-asyncactions + + +[options.entry_points] +console_scripts = + my-example-utility = example.example_module:main + +[options.extras_require] +dev = + black>=22.1.0 + flake8>=4.0.1 + +[options.package_data] +* = README.md \ No newline at end of file diff --git 
a/python/tox.ini b/python/tox.ini new file mode 100644 index 00000000..2aa2d4d2 --- /dev/null +++ b/python/tox.ini @@ -0,0 +1,75 @@ +[tox] +passenv = * +isolated_build = True +requires = tox-conda +envlist = + isort + py310 + black + mypy + flake8 + +skip_missing_interpeters = true + +[gh-actions] +python = +# 3.9: py39 +# We need a new version of PySpark w/3.10 support. + 3.10: py310 + +[testenv] +setenv = + DJANGO_SETTINGS_MODULE=fighthealthinsurance.settings + PYTHONPATH={toxinidir} + DJANGO_CONFIGURATION=Dev +passenv = * +extras = + tests + coverage +deps = + pytest + isort==4.3.21 + pyspark==3.5.0 + flake8 + spark-testing-base>=0.11.1 + mypy + -rrequirements.txt +commands = + pytest examples \ + {posargs} +allowlist_externals = pytest + +[testenv:isort] +extras = tests +skipsdist = True +commands = isort --check-only --diff examples +allowlist_externals = isort + +[testenv:black] +extras = tests +skipsdist = True +commands = black --check examples +allowlist_externals = black +deps = + black + -rrequirements.txt + +[testenv:flake8] +extras = tests +skipsdist = True +commands = flake8 --ignore=F403,E402,F401,F405,W503,E265 examples +allowlist_externals = flake8 + +[testenv:mypy] +extras = tests +passenv = * +deps = + pytest + mypy + -rrequirements.txt +setenv = + {[testenv]setenv} + MYPYPATH={toxinidir} +commands = + mypy -m examples +allowlist_externals = mypy \ No newline at end of file diff --git a/run_container.sh b/run_container.sh new file mode 100755 index 00000000..0efe1f60 --- /dev/null +++ b/run_container.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -ex +VERSION=${VERSION:-0.5} +IMAGE=${IMAGE:-holdenk/hps:$VERSION} +export VERSION +export IMAGE +docker image pull "$IMAGE" +mkdir -p warehouse +mkdir -p iceberg-workshop +docker container run --mount type=bind,source="$(pwd)"/warehouse,target=/high-performance-spark-examples/warehouse --mount type=bind,source="$(pwd)/iceberg-workshop",target=/high-performance-spark-examples/iceberg-workshop -p 8877:8877 -p 4040:4040 -it "${IMAGE}" # /bin/bash diff --git a/run_pyspark_examples.sh b/run_pyspark_examples.sh new file mode 100755 index 00000000..7e0818e4 --- /dev/null +++ b/run_pyspark_examples.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# shellcheck disable=SC1091,SC2034 + +source env_setup.sh + +set -ex + +set -o pipefail + +#tag::package_venv[] +if [ ! -d pyspark_venv ]; then + python -m venv pyspark_venv +fi + +source pyspark_venv/bin/activate +pip install -r ./python/requirements.txt + +if [ ! -f pyspark_venv.tar.gz ]; then + venv-pack -o pyspark_venv.tar.gz +fi + + +# Set in local and client mode where the driver uses the Python present +# (requires that you have activated the venv as we did above) +PYSPARK_DRIVER_PYTHON=python +export PYSPARK_DRIVER_PYTHON +export PYTHON_PATH=./environment/bin/python +#end::package_venv[] + +# Some hack for our json magic +cat se*.json > spark_expectations_sample_rules.json + +function check_fail () { + local ex="$1" + local code="$2" + if [ -f "${ex}.fail" ]; then + echo "ok"; + else + exit "$code" + fi +} + +EXAMPLE_JAR="./core/target/scala-2.13/core-assembly-0.1.0-SNAPSHOT.jar" + +pip install setuptools + +# Iceberg JAR not yet available for Spark 4. +if [ ! -f "${EXAMPLE_JAR}" ]; then + rm ./core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala # temp hack no merge in Spark 3. + sbt core/assembly -DsparkVersion="${SPARK_VERSION}" +fi + +if [ ! -f "${EXAMPLE_JAR}" ]; then + echo "Can't find sample jar?!?" 
+ exit 1 +fi + +function run_example () { + local ex="$1" + # shellcheck disable=SC2046 + spark-submit \ + --master local[5] \ + --conf spark.eventLog.enabled=true \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --archives pyspark_venv.tar.gz#environment \ + --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ + $(cat "${ex}.conf" || echo "") \ + --name "${ex}" \ + --jars "${EXAMPLE_JAR}" \ + "${ex}" 2>&1 | tee -a "${ex}.out" || check_fail "$ex" $? +} + +if [ $# -eq 1 ]; then + run_example "python/examples/$1" +else + for ex in python/examples/*.py; do + if [[ "$ex" =~ test.* ]]; then + echo "Skipping ex $ex as it is a test and covered by our tests." + else + echo "Running $ex" + run_example "$ex" + fi + done +fi diff --git a/run_sql_examples.sh b/run_sql_examples.sh new file mode 100755 index 00000000..946abf4c --- /dev/null +++ b/run_sql_examples.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -ex +set -o pipefail + +source env_setup.sh + +# You might want to set SPARK_EXTRA to do things like log more info + +function run_example () { + local sql_file="$1" + local extra="$2" + EXTENSIONS=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + if [ -n "$EXTRA_EXTENSIONS" ]; then + EXTENSIONS="$EXTENSIONS,$EXTRA_EXTENSIONS" + fi + # shellcheck disable=SC2046,SC2086 + ${SPARK_HOME}/bin/spark-sql --master local[5] \ + --conf spark.eventLog.enabled=true \ + --conf spark.sql.extensions=$EXTENSIONS \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ + ${extra} ${SPARK_EXTRA} \ + $(cat "${sql_file}.conf" || echo "") \ + --name "${sql_file}" \ + -f "${sql_file}" 2>&1 | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" +} + + +# If you want to look at them +# ${SPARK_PATH}/sbin/start-history-server.sh + +if [ $# -eq 1 ]; then + if [[ "$1" != *"gluten_only"* ]]; then + run_example "sql/$1" + else + echo "Processing gluten ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + fi +else + # For each SQL + for sql_file in sql/*.sql; do + if [[ "$sql_file" != *"_only"* ]]; then + echo "Processing ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + elif [[ "$sql_file" != *"gluten_only"* && "$GLUTEN_EXISTS" == "true" ]]; then + echo "Processing gluten ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + elif [[ "$sql_file" != *"gluten_udf_only"* && "$GLUTEN_UDF_EXISTS" == "true" ]]; then + echo "Processing gluten UDF ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + else + echo "Skipping $sql_file since we did not find gluten and this is restricted example." + fi + done +fi diff --git a/sbt/sbt b/sbt/sbt deleted file mode 100755 index aac1085a..00000000 --- a/sbt/sbt +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This script launches sbt for this project. If present it uses the system -# version of sbt. If there is no system version of sbt it attempts to download -# sbt locally. -SBT_VERSION=0.13.9 -URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar -URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar -JAR=sbt/sbt-launch-${SBT_VERSION}.jar - -# Download sbt launch jar if it hasn't been downloaded yet -if [ ! -f ${JAR} ]; then - # Download - printf "Attempting to fetch sbt\n" - set -x - JAR_DL=${JAR}.part - if hash wget 2>/dev/null; then - (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} - elif hash axel 2>/dev/null; then - (axel ${URL1} -o ${JAR_DL} || axel ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR} - else - printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" - exit -1 - fi -fi -if [ ! -f ${JAR} ]; then - # We failed to download - printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" - exit -1 -fi -printf "Launching sbt from ${JAR}\n" -java \ - -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ - -jar ${JAR} \ - "$@" diff --git a/sbt/sbt.bat b/sbt/sbt.bat deleted file mode 100644 index 0f7a3e9a..00000000 --- a/sbt/sbt.bat +++ /dev/null @@ -1,95 +0,0 @@ -@REM SBT launcher script -@REM -@REM Environment: -@REM JAVA_HOME - location of a JDK home dir (mandatory) -@REM SBT_OPTS - JVM options (optional) -@REM Configuration: -@REM sbtconfig.txt found in the SBT_HOME. - -@REM ZOMG! We need delayed expansion to build up CFG_OPTS later -@setlocal enabledelayedexpansion - -@echo off -set SBT_HOME=%~dp0 - -rem FIRST we load the config file of extra options. -set FN=%SBT_HOME%\..\conf\sbtconfig.txt -set CFG_OPTS= -FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO ( - set DO_NOT_REUSE_ME=%%i - rem ZOMG (Part #2) WE use !! here to delay the expansion of - rem CFG_OPTS, otherwise it remains "" for this loop. - set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! -) - -rem poor man's jenv (which is not available on Windows) -IF DEFINED JAVA_HOMES ( - IF EXIST .java-version FOR /F %%A IN (.java-version) DO ( - SET JAVA_HOME=%JAVA_HOMES%\%%A - SET JDK_HOME=%JAVA_HOMES%\%%A - ) -) -rem must set PATH or wrong javac is used for java projects -IF DEFINED JAVA_HOME SET "PATH=%JAVA_HOME%\bin;%PATH%" - -rem users can set JAVA_OPTS via .jvmopts (sbt-extras style) -IF EXIST .jvmopts FOR /F %%A IN (.jvmopts) DO ( - SET JAVA_OPTS=%%A !JAVA_OPTS! 
-) - -rem We use the value of the JAVACMD environment variable if defined -set _JAVACMD=%JAVACMD% - -if "%_JAVACMD%"=="" ( - if not "%JAVA_HOME%"=="" ( - if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" - ) -) - -if "%_JAVACMD%"=="" set _JAVACMD=java - -rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. -set _JAVA_OPTS=%JAVA_OPTS% -if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% - -:args_loop -if "%~1" == "" goto args_end - -if "%~1" == "-jvm-debug" ( - set JVM_DEBUG=true - set /a JVM_DEBUG_PORT=5005 2>nul >nul -) else if "!JVM_DEBUG!" == "true" ( - set /a JVM_DEBUG_PORT=%1 2>nul >nul - if not "%~1" == "!JVM_DEBUG_PORT!" ( - set SBT_ARGS=!SBT_ARGS! %1 - ) -) else ( - set SBT_ARGS=!SBT_ARGS! %1 -) - -shift -goto args_loop -:args_end - -if defined JVM_DEBUG_PORT ( - set _JAVA_OPTS=!_JAVA_OPTS! -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=!JVM_DEBUG_PORT! -) - -call :run %SBT_ARGS% - -if ERRORLEVEL 1 goto error -goto end - -:run - -"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %* -goto :eof - -:error -@endlocal -exit /B 1 - - -:end -@endlocal -exit /B 0 diff --git a/se_complex.json b/se_complex.json new file mode 100644 index 00000000..f073e640 --- /dev/null +++ b/se_complex.json @@ -0,0 +1,2 @@ +{"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} +{"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "query_dq", "rule": "history", "column_name": "MaleBonusPercent", "expectation": "(select count(*) from 3rd_fake_view) > (select input_count from local.dq_stats WHERE table_name='local.3rd_fake' LIMIT 1)", "action_if_failed": "fail", "tag": "", "description": "We should always have more records than before", "enable_for_source_dq_validation": false, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} diff --git a/se_simple.json b/se_simple.json new file mode 100644 index 00000000..72d9b866 --- /dev/null +++ b/se_simple.json @@ -0,0 +1 @@ +{"product_id": "pay", "table_name": "local.fake_table_name", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. 
Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} diff --git a/sql/gluten_only_nonpartitioned_table_join.sql b/sql/gluten_only_nonpartitioned_table_join.sql new file mode 100644 index 00000000..572437c5 --- /dev/null +++ b/sql/gluten_only_nonpartitioned_table_join.sql @@ -0,0 +1,12 @@ +CREATE TABLE IF NOT EXISTS local.udevelopers ( + username string, + firstname string, + lastname string) +USING iceberg; +CREATE TABLE IF NOT EXISTS local.uprojects ( + creator string, + uprojectname string) +USING iceberg; +INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); +INSERT INTO local.uprojects VALUES("krisnova", "aurae"); +SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; diff --git a/sql/iceberg-schema-evolution-gotcha-possibility.sql b/sql/iceberg-schema-evolution-gotcha-possibility.sql new file mode 100644 index 00000000..99b9fd60 --- /dev/null +++ b/sql/iceberg-schema-evolution-gotcha-possibility.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS local.udevelopers_sorted; +CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( + username string, + firstname string, + lastname string) +USING ICEBERG; +INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; +ALTER TABLE local.udevelopers_sorted RENAME COLUMN lastname TO deprecated_lastname; +SELECT * FROM local.udevelopers_sorted; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; +ALTER TABLE local.udevelopers_sorted DROP COLUMN deprecated_lastname; +SELECT * FROM local.udevelopers_sorted; + diff --git a/sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail b/sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail new file mode 100644 index 00000000..e69de29b diff --git a/sql/iceberg-schema-evolution-gotcha-workaround.sql b/sql/iceberg-schema-evolution-gotcha-workaround.sql new file mode 100644 index 00000000..5b57afb2 --- /dev/null +++ b/sql/iceberg-schema-evolution-gotcha-workaround.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS local.udevelopers_sorted; +CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( + username string, + firstname string, + lastname string) +USING ICEBERG; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; +INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); +SELECT * FROM local.udevelopers_sorted; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; +-- Hack, add it to identifier fields so we can do a "partial" drop where it stays in the schema and we don't +-- corrupt the metadata. 
+ALTER TABLE local.udevelopers_sorted ADD PARTITION FIELD lastname;
+ALTER TABLE local.udevelopers_sorted DROP PARTITION FIELD lastname;
+SELECT * FROM local.udevelopers_sorted;
diff --git a/sql/nonpartitioned_table_join.sql b/sql/nonpartitioned_table_join.sql
new file mode 100644
index 00000000..572437c5
--- /dev/null
+++ b/sql/nonpartitioned_table_join.sql
@@ -0,0 +1,12 @@
+CREATE TABLE IF NOT EXISTS local.udevelopers (
+  username string,
+  firstname string,
+  lastname string)
+USING iceberg;
+CREATE TABLE IF NOT EXISTS local.uprojects (
+  creator string,
+  uprojectname string)
+USING iceberg;
+INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova");
+INSERT INTO local.uprojects VALUES("krisnova", "aurae");
+SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username;
diff --git a/sql/nonpartitioned_table_join.sql.conf b/sql/nonpartitioned_table_join.sql.conf
new file mode 100644
index 00000000..ece26ce0
--- /dev/null
+++ b/sql/nonpartitioned_table_join.sql.conf
@@ -0,0 +1,7 @@
+ --conf spark.sql.sources.v2.bucketing.enabled=true
+ --conf spark.sql.iceberg.planning.preserve-data-grouping=true
+ --conf spark.sql.requireAllClusterKeysForCoPartition=false
+
+ --conf spark.sql.adaptive.enabled=false
+ --conf spark.sql.autoBroadcastJoinThreshold=-1
+ --conf spark.sql.shuffle.partitions=4
diff --git a/sql/partioned_table_join.sql b/sql/partioned_table_join.sql
new file mode 100644
index 00000000..1f6dac31
--- /dev/null
+++ b/sql/partioned_table_join.sql
@@ -0,0 +1,14 @@
+CREATE TABLE IF NOT EXISTS local.developers (
+  username string,
+  firstname string,
+  lastname string)
+USING iceberg
+PARTITIONED BY (username);
+CREATE TABLE IF NOT EXISTS local.projects (
+  creator string,
+  projectname string)
+USING iceberg
+PARTITIONED BY (creator);
+INSERT INTO local.developers VALUES("krisnova", "Kris", "Nova");
+INSERT INTO local.projects VALUES("krisnova", "aurae");
+SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username;
diff --git a/sql/partioned_table_join.sql.conf b/sql/partioned_table_join.sql.conf
new file mode 100644
index 00000000..ece26ce0
--- /dev/null
+++ b/sql/partioned_table_join.sql.conf
@@ -0,0 +1,7 @@
+ --conf spark.sql.sources.v2.bucketing.enabled=true
+ --conf spark.sql.iceberg.planning.preserve-data-grouping=true
+ --conf spark.sql.requireAllClusterKeysForCoPartition=false
+
+ --conf spark.sql.adaptive.enabled=false
+ --conf spark.sql.autoBroadcastJoinThreshold=-1
+ --conf spark.sql.shuffle.partitions=4
diff --git a/sql/wap.sql b/sql/wap.sql
new file mode 100644
index 00000000..6665c22b
--- /dev/null
+++ b/sql/wap.sql
@@ -0,0 +1,19 @@
+DROP TABLE IF EXISTS local.wap_projects;
+CREATE TABLE local.wap_projects (
+  creator string,
+  projectname string)
+USING iceberg
+PARTITIONED BY (creator);
+ALTER TABLE local.wap_projects SET TBLPROPERTIES (
+  'write.wap.enabled' = 'true'
+);
+-- We need a first commit, see https://github.com/apache/iceberg/issues/8849
+INSERT INTO local.wap_projects VALUES("holdenk", "spark");
+ALTER TABLE local.wap_projects DROP BRANCH IF EXISTS `audit-branch`;
+ALTER TABLE local.wap_projects CREATE BRANCH `audit-branch`;
+SET spark.wap.branch = 'audit-branch';
+INSERT INTO local.wap_projects VALUES("krisnova", "aurae");
+SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator is NULL;
+SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator == "krisnova";
+CALL
local.system.remove_orphan_files(table => 'local.wap_projects'); +CALL local.system.fast_forward("local.wap_projects", "main", "audit-branch"); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt deleted file mode 100644 index e88b326a..00000000 --- a/src/CMakeLists.txt +++ /dev/null @@ -1,74 +0,0 @@ -################################################################ -# A minimal CMake file that is compatible with sbt-jni # -# # -# All settings required by sbt-jni have been marked so, please # -# add/modify/remove settings to build your specific library. # -################################################################ - -cmake_minimum_required(VERSION 2.6) - -# Define project and related variables -# -project (high-performance-spark) - -# Enable fortan -enable_language (Fortran) -include(FortranCInterface) - - -# FFLAGS depend on the compiler -get_filename_component (Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME) - - -# Set versions and library name -# (required by sbt-jni) please use semantic versioning -# -set (VERSION_MAJOR 0) -set (VERSION_MINOR 0) -set (VERSION_PATCH 0) -# (required by sbt-jni) major version will always be appended to library name -set (LIB_NAME ${CMAKE_PROJECT_NAME}${VERSION_MAJOR}) - -# Command-line options -# -# (set by sbt-jni) -set (LIB_INSTALL_DIR lib CACHE PATH "Path in which to install libraries (equivalent to Autoconf --libdir).") -# (set by sbt-jni) -set (LIB_ENABLE_MINOR_VERSIONS ON CACHE BOOLEAN "Build libraries with minor and patch versions appended.") - -# Setup JNI -find_package(JNI REQUIRED) -if (JNI_FOUND) - message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") -endif() - -# Include directories -include_directories(.) -include_directories(./main/c) -include_directories(./main/c/include) -include_directories(${JNI_INCLUDE_DIRS}) - -# Setup main shared library -file(GLOB LIB_SRC - "*.c" - "*.cpp" - "./main/c/*.c" - "./main/c/*.cpp" - "./main/fortran/*.f*" -) -add_library(${LIB_NAME} SHARED ${LIB_SRC}) - -# By default, in a regular build, minor and patch versions are added to the generated files. -# When built through sbt-jni however, LIB_ENABLE_MINOR_VERSIONS is deactivated and only a -# major-versioned library file is built. -if (LIB_ENABLE_MINOR_VERSIONS) - set_target_properties( - ${LIB_NAME} - PROPERTIES - VERSION 0.${VERSION_MINOR}.${VERSION_PATCH} # major version always 0, it is included in library name - SOVERSION 0 - ) -endif() - -# Installation targets -install(TARGETS ${LIB_NAME} LIBRARY DESTINATION ${LIB_INSTALL_DIR}) diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala deleted file mode 100644 index bddc84b4..00000000 --- a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Illustrates how to use Spark accumulators. Note that most of these examples - * are "dangerous" in that they may not return consistent results. - */ -package com.highperformancespark.examples.transformations - -import com.highperformancespark.examples.dataframe.RawPanda - -import org.apache.spark._ -import org.apache.spark.rdd._ - -import scala.collection.mutable.HashSet -object Accumulators { - /** - * Compute the total fuzzyness with an accumulator while generating - * an id and zip pair for sorting. 
- */ - //tag::sumFuzzyAcc[] - def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): - (RDD[(String, Long)], Double) = { - // Create an accumulator with the initial value of 0.0 - val acc = sc.accumulator(0.0) - val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} - // accumulator still has zero value - // Note: This example is dangerous since the transformation may be - // evaluated multiple times. - transformed.count() // force evaluation - (transformed, acc.value) - } - //end::sumFuzzyAcc[] - - /** - * Compute the max fuzzyness with an accumulator while generating an - * id and zip pair for sorting. - */ - //tag::maxFuzzyAcc[] - def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): - (RDD[(String, Long)], Double) = { - object MaxDoubleParam extends AccumulatorParam[Double] { - override def zero(initValue: Double) = initValue - override def addInPlace(r1: Double, r2: Double): Double = { - Math.max(r1, r2) - } - } - // Create an accumulator with the initial value of Double.MinValue - val acc = sc.accumulator(Double.MinValue)(MaxDoubleParam) - val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} - // accumulator still has Double.MinValue - // Note: This example is dangerous since the transformation may be - // evaluated multiple times. - transformed.count() // force evaluation - (transformed, acc.value) - } - //end::maxFuzzyAcc[] - - //tag::uniquePandaAcc[] - def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { - object UniqParam extends AccumulableParam[HashSet[Long], Long] { - override def zero(initValue: HashSet[Long]) = initValue - // For adding new values - override def addAccumulator(r: HashSet[Long], t: Long): HashSet[Long] = { - r += t - r - } - // For merging accumulators - override def addInPlace(r1: HashSet[Long], r2: HashSet[Long]): - HashSet[Long] = { - r1 ++ r2 - } - } - // Create an accumulator with the initial value of Double.MinValue - val acc = sc.accumulable(new HashSet[Long]())(UniqParam) - val transformed = rdd.map{x => acc += x.id; (x.zip, x.id)} - // accumulator still has Double.MinValue - transformed.count() // force evaluation - acc.value - } - //end::uniquePandaAcc[] -} diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala deleted file mode 100644 index 6571ceef..00000000 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Checks basic Dataset magics - */ -package com.highperformancespark.examples.dataframe - -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} -import com.holdenkarau.spark.testing._ -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.scalatest.Matchers._ -import org.scalatest.FunSuite - -import scala.collection.mutable -import scala.util.Random - -class MixedDatasetSuite extends FunSuite with DataFrameSuiteBase { - - val rawPandaList = List( - RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9, 20.0)), - RawPanda(11L, "94110", "red", true, Array(1.0, 0.7, 30.0))) - - test("happy panda sums") { - val sqlCtx = sqlContext - import sqlCtx.implicits._ - val mixedDS = new MixedDataset(sqlCtx) - val inputDF = sqlCtx.createDataFrame(rawPandaList) - val inputDS = inputDF.as[RawPanda] - val result = mixedDS.happyPandaSums(inputDS) - assert(result === (2.0 +- 0.001)) - } - - test("basic 
select") { - val sqlCtx = sqlContext - import sqlCtx.implicits._ - val inputDF = sqlCtx.createDataFrame(rawPandaList) - val inputDS = inputDF.as[RawPanda] - val mixedDS = new MixedDataset(sqlCtx) - val squishy = mixedDS.squishyPandas(inputDS).collect() - assert(squishy(0)._2 === true) - } - - test("funquery") { - val sqlCtx = sqlContext - import sqlCtx.implicits._ - val inputDF = sqlCtx.createDataFrame(rawPandaList) - val inputDS = inputDF.as[RawPanda] - val mixedDS = new MixedDataset(sqlCtx) - val summedAttrs = mixedDS.funMap(inputDS).collect() - assert(summedAttrs(0) === 21.9 +- 0.001) - assert(summedAttrs(1) === 31.7 +- 0.001) - } - - test("max pandas size per zip") { - val sqlCtx = sqlContext - import sqlCtx.implicits._ - val inputDF = sqlCtx.createDataFrame(rawPandaList) - val inputDS = inputDF.as[RawPanda] - val mixedDS = new MixedDataset(sqlCtx) - val bigPandas = mixedDS.maxPandaSizePerZip(inputDS).collect() - assert(bigPandas.size === 1) - assert(bigPandas(0)._2 === 30.0 +- 0.00001) - } -} diff --git a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala deleted file mode 100644 index 4fd8ad52..00000000 --- a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala +++ /dev/null @@ -1,25 +0,0 @@ -package com.highperformancespark.examples.wordcount - - -import com.holdenkarau.spark.testing.SharedSparkContext -import org.scalatest.FunSuite - -class WordCountTest extends FunSuite with SharedSparkContext { - test("word count with Stop Words Removed"){ - val wordRDD = sc.parallelize(Seq( - "How happy was the panda? You ask.", - "Panda is the most happy panda in all the #$!?ing land!")) - - val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") - val illegalTokens: Array[Char] = "#$%?!.".toCharArray - - val wordCounts = WordCount.withStopWordsFiltered( - wordRDD, illegalTokens, stopWords) - val wordCountsAsMap = wordCounts.collectAsMap() - assert(!wordCountsAsMap.contains("the")) - assert(!wordCountsAsMap.contains("?")) - assert(!wordCountsAsMap.contains("#$!?ing")) - assert(wordCountsAsMap.contains("ing")) - assert(wordCountsAsMap.get("panda").get.equals(3)) - } -} diff --git a/target-validator/ex.yaml b/target-validator/ex.yaml new file mode 100644 index 00000000..ce8b4925 --- /dev/null +++ b/target-validator/ex.yaml @@ -0,0 +1,31 @@ +detailedErrors: true +numKeyCols: 4 +# We might have a large number of errors so just show the first 5 +numErrorsToReport: 5 + +email: + smtpHost: smtp.example.com + subject: Data Validation Summary + from: data-validator-no-reply@example.com + to: + - professor-timbit@example.com + +tables: + - db: gender_paygaps + table: uk + # Columns that taken together uniquely specifies each row (think of groupBy) + keyColumns: + - CompanyNumber + - EmployerId + - CompanyLinkToGPGInfo + - ResponsiblePerson + # Used to filter + condition: MaleBonusPercent >= FemaleBonusPercent + checks: + # We expect at least 500 records + - type: rowCount + minNumRows: 500 + # We don't expect more than 1% not companies in the dataset. 
+      - type: nullCheck
+        column: CompanyNumber
+        threshold: 0.01
diff --git a/target-validator/runme.sh b/target-validator/runme.sh
new file mode 100755
index 00000000..b6236dd7
--- /dev/null
+++ b/target-validator/runme.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# shellcheck disable=SC1091,SC2034
+
+source ../env_setup.sh
+set -ex
+export SPARK_VERSION="${SPARK_VERSION:-3.4.1}"
+
+# Disabled for now until the upstream data-validator folks accept the PR; nested builds are slow.
+exit 0
+
+git clone git@github.com:holdenk/data-validator.git || git clone https://github.com/holdenk/data-validator.git
+cd data-validator
+git checkout upgrade-to-modern-spark
+sbt -Dspark="${SPARK_VERSION}" clean assembly
+JAR_PATH="$(pwd)/target/scala-2.12/data-validator-assembly-${SPARK_VERSION}_0.15.0.jar"
+export JAR_PATH
+cd ..
+"${SPARK_HOME}/bin/spark-submit" --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected."
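+# A non-zero exit from spark-submit is tolerated here: the "|| echo" keeps `set -e` from aborting the script, since the failure is anticipated.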