From f02142bebf528437702ec8fa689c9c0263e96fe7 Mon Sep 17 00:00:00 2001 From: Jimin Hsieh Date: Tue, 20 Aug 2019 09:12:52 +0800 Subject: [PATCH 01/53] Add sbt version (#98) * Add `build.properties` file for IntelliJ * `MaxPermSize` was deprecated * Use the same Spark version as build file * Prevent the failure of installing Oracle JDK --- .travis.yml | 8 +++++--- build.sbt | 2 +- build_windows.sbt | 2 +- project/build.properties | 1 + 4 files changed, 8 insertions(+), 5 deletions(-) create mode 100644 project/build.properties diff --git a/.travis.yml b/.travis.yml index 58147d3..2630422 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,5 @@ +dist: trusty + language: scala sudo: false cache: @@ -52,9 +54,9 @@ before_install: script: - "export SPARK_CONF_DIR=./log4j/" - sbt clean coverage compile package assembly test || (rm -rf ~/.ivy2 ~/.m2 && sbt clean coverage compile package test) - - "[ -f spark] || mkdir spark && cd spark && axel http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz && cd .." - - "tar -xf ./spark/spark-2.1.0-bin-hadoop2.7.tgz" - - "export SPARK_HOME=`pwd`/spark-2.1.0-bin-hadoop2.7" + - "[ -f spark] || mkdir spark && cd spark && axel http://d3kbcqa49mib13.cloudfront.net/spark-2.2.0-bin-hadoop2.7.tgz && cd .." + - "tar -xf ./spark/spark-2.2.0-bin-hadoop2.7.tgz" + - "export SPARK_HOME=`pwd`/spark-2.2.0-bin-hadoop2.7" - "export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH" - "PYSPARK_SUBMIT_ARGS='--jars ./target/examples-assembly-0.0.1.jar pyspark-shell' nosetests --with-doctest --doctest-options=+ELLIPSIS --logging-level=INFO --detailed-errors --verbosity=2 --with-coverage --cover-html-dir=./htmlcov" - # $SPARK_HOME/bin/spark-submit ./src/main/r/wc.R $SPARK_HOME/README.md diff --git a/build.sbt b/build.sbt index 35b1508..28dbf2e 100644 --- a/build.sbt +++ b/build.sbt @@ -32,7 +32,7 @@ parallelExecution in Test := false fork := true -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") +javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") // additional libraries libraryDependencies ++= Seq( diff --git a/build_windows.sbt b/build_windows.sbt index b698ab9..7b68f4a 100644 --- a/build_windows.sbt +++ b/build_windows.sbt @@ -32,7 +32,7 @@ parallelExecution in Test := false fork := true -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") +javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") // additional libraries libraryDependencies ++= Seq( diff --git a/project/build.properties b/project/build.properties new file mode 100644 index 0000000..8e682c5 --- /dev/null +++ b/project/build.properties @@ -0,0 +1 @@ +sbt.version=0.13.18 From b4c5dafc7c27a40ef755326a2f459761b109ffb2 Mon Sep 17 00:00:00 2001 From: Jimin Hsieh <5125598+jiminhsieh@users.noreply.github.com> Date: Sat, 3 Dec 2022 00:21:50 +0800 Subject: [PATCH 02/53] Add unit tests (#99) * Add rest of unit test of WordCountTest * Add reset of unit tests of Accumlators * Ignore coverage of main method * Add rest of unit test of MixedDataset * Bump the version of sbt-scoverage to 1.6.0 * Fix warning from scalastyle --- project/plugins.sbt | 2 +- .../dataframe/MixedDataset.scala | 6 +- .../native/StandAlone.scala | 2 + .../perf/SimplePerfTest.scala | 2 + .../dataframe/MixedDatasetSuite.scala | 115 +++++++++++++++++- 
.../transformations/Accumulators.scala | 14 +++ .../wordcount/WordCountTest.scala | 38 ++++++ 7 files changed, 174 insertions(+), 5 deletions(-) diff --git a/project/plugins.sbt b/project/plugins.sbt index 26c430e..f3e7599 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -13,7 +13,7 @@ addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.5") //addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.0") +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") //tag::sbtJNIPlugin[] addSbtPlugin("ch.jodersky" %% "sbt-jni" % "1.0.0-RC3") diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala index 2ccdd10..e9049d9 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala @@ -89,7 +89,7 @@ class MixedDataset(sqlCtx: SQLContext) { Dataset[(RawPanda, CoffeeShop)] = { //tag::joinWith[] val result: Dataset[(RawPanda, CoffeeShop)] = pandas.joinWith(coffeeShops, - $"zip" === $"zip") + pandas("zip") === coffeeShops("zip")) //end::joinWith[] result } @@ -100,8 +100,8 @@ class MixedDataset(sqlCtx: SQLContext) { def selfJoin(pandas: Dataset[RawPanda]): Dataset[(RawPanda, RawPanda)] = { //tag::selfJoin[] - val result: Dataset[(RawPanda, RawPanda)] = pandas.joinWith(pandas, - $"zip" === $"zip") + val result: Dataset[(RawPanda, RawPanda)] = pandas.as("l").joinWith(pandas.as("r"), + $"l.zip" === $"r.zip") //end::selfJoin[] result } diff --git a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala b/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala index 485c73d..16aa779 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala +++ b/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala @@ -1,10 +1,12 @@ package com.highperformancespark.examples.ffi object StandAlone { + // $COVERAGE-OFF$ def main(args: Array[String]) { //tag::systemLoadLibrary[] System.loadLibrary("highPerformanceSpark0") //end::systemLoadLibrary[] println(new SumJNI().sum(Array(1,2,3))) } + // $COVERAGE-ON$ } diff --git a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala index b6e59ae..5859a78 100644 --- a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala +++ b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.types._ * A simple performance test to compare a simple sort between DataFrame, and RDD */ object SimplePerfTest { + // $COVERAGE-OFF$ def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("simple-perf-test") val sparkSession = SparkSession.builder().enableHiveSupport().getOrCreate() @@ -81,4 +82,5 @@ object SimplePerfTest { println(s"Time ${t1 - t0}ns") (result, t1 - t0) } + // $COVERAGE-ON$ } diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala index 6571cee..1dc5d4a 100644 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala @@ -10,10 +10,14 @@ 
import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.scalatest.Matchers._ import org.scalatest.FunSuite +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.Random -class MixedDatasetSuite extends FunSuite with DataFrameSuiteBase { +class MixedDatasetSuite extends FunSuite + with DataFrameSuiteBase + with DatasetSuiteBase + with RDDComparisons { val rawPandaList = List( RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9, 20.0)), @@ -60,4 +64,113 @@ class MixedDatasetSuite extends FunSuite with DataFrameSuiteBase { assert(bigPandas.size === 1) assert(bigPandas(0)._2 === 30.0 +- 0.00001) } + + test("max pandas size per zip scala version") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val bigPandas = mixedDS.maxPandaSizePerZipScala(inputDS).collect() + assert(bigPandas.size === 1) + assert(bigPandas(0)._2 === 30.0 +- 0.00001) + } + + test("union pandas") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val happyPandas = sqlCtx.createDataset(rawPandaList.take(1)) + val sadPandas = sqlCtx.createDataset(rawPandaList.drop(1)) + val mixedDS = new MixedDataset(sqlCtx) + val unionPandas = mixedDS.unionPandas(happyPandas, sadPandas).collect + assert(unionPandas.toSet == rawPandaList.toSet) + } + + test("typed query") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val typedResult = mixedDS.typedQueryExample(inputDS) + assert(typedResult.collect().toList == rawPandaList.map(_.attributes(0))) + } + + test("join different dataset") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val pandaDS = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val rawCoffeeShop = List( + CoffeeShop("94110", "Starbucks"), + CoffeeShop("98765", "Caribou") + ) + val coffeeShopDS = sqlCtx.createDataFrame(rawCoffeeShop).as[CoffeeShop] + val mixedDS = new MixedDataset(sqlCtx) + val joinResult = mixedDS.joinSample(pandaDS, coffeeShopDS) + val expected = for { + panda <- rawPandaList + coffeeShop <- rawCoffeeShop + if (panda.zip == coffeeShop.zip) + } yield (panda, coffeeShop) + assert(joinResult.collect().toSet == expected.toSet) + } + + test("self join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val selfJoinResult = mixedDS.selfJoin(inputDS) + val expected = for { + left <- rawPandaList + right <- rawPandaList + if (left.zip == right.zip) + } yield (left, right) + assert(selfJoinResult.collect().toSet == expected.toSet) + } + + test("convert an RDD to DS") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val result = mixedDS.fromRDD(rdd) + val expected = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + assertDatasetEquals(expected, result) + } + + test("convert a Dataset to an RDD") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val dataset = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val result = mixedDS.toRDD(dataset) + val expected = sc.parallelize(rawPandaList) + assertRDDEquals(expected, result) + } + + test("convert a Dataset to a 
DataFrame") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val rdd = sc.parallelize(rawPandaList) + val dataset = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + val result = mixedDS.toDF(dataset) + val expected = sqlCtx.createDataFrame(rawPandaList) + assertDataFrameEquals(expected, result) + } + + + test("convert a DataFrame to a DataSset") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val dataframe = sqlCtx.createDataFrame(rawPandaList) + val result = mixedDS.fromDF(dataframe) + val expected = sqlCtx.createDataFrame(rawPandaList).as[RawPanda] + assertDatasetEquals(expected, result) + } + } diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala index 5eb995f..4000f23 100644 --- a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -8,6 +8,7 @@ import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ import org.scalatest.FunSuite +import scala.collection.immutable.HashSet class AccumulatorsTest extends FunSuite with SharedSparkContext { test("accumulator max should function") { @@ -23,4 +24,17 @@ class AccumulatorsTest extends FunSuite with SharedSparkContext { val (_, sum) = Accumulators.computeTotalFuzzyNess(sc, input) assert(sum === 5050.0) } + + test("accumulator unique should function") { + val input1 = sc.parallelize(1 to 100).map(x => + RawPanda(1L, "1", "red", true, Array(x.toDouble)) + ) + + val input2 = sc.parallelize(1 to 100).map(x => + RawPanda(2L, "2", "blude", false, Array(x.toDouble)) + ) + + val set = Accumulators.uniquePandas(sc, input1 ++ input2) + assert(set == HashSet(2, 1)) + } } diff --git a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala index 4fd8ad5..3772a48 100644 --- a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala @@ -22,4 +22,42 @@ class WordCountTest extends FunSuite with SharedSparkContext { assert(wordCountsAsMap.contains("ing")) assert(wordCountsAsMap.get("panda").get.equals(3)) } + + test("word count with simple counting") { + val wordRDD = sc.parallelize( + Seq( + "a b c d", + "b c d e" + ) + ) + val wordCounts = WordCount.simpleWordCount(wordRDD) + + val wordCountsAsMap = wordCounts.collectAsMap() + + for (character <- 'a' to 'e') { + assert(wordCountsAsMap.contains(character.toString)) + } + for (character <- 'b' to 'd') { + assert(wordCountsAsMap.get(character.toString).get == 2) + } + } + + test("word count with bad idea") { + val wordRDD = sc.parallelize( + Seq( + "a b c d", + "b c d e" + ) + ) + val wordCounts = WordCount.badIdea(wordRDD) + + val wordCountsAsMap = wordCounts.collectAsMap() + + for (character <- 'a' to 'e') { + assert(wordCountsAsMap.contains(character.toString)) + } + for (character <- 'b' to 'd') { + assert(wordCountsAsMap.get(character.toString).get == 2) + } + } } From 18dd4488e9b75799fa071469cbb4216856d7c5ac Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 2 Dec 2022 08:25:05 -0800 Subject: [PATCH 03/53] Switch from Travis to GitHub actions. 
Fix test Fix matrix Fix sync --- .github/workflows/ci.yml | 31 +++++++++++++++++++ .travis.yml | 66 ---------------------------------------- 2 files changed, 31 insertions(+), 66 deletions(-) create mode 100644 .github/workflows/ci.yml delete mode 100644 .travis.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..89ad9bb --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,31 @@ +name: CI +on: + pull_request: + push: +jobs: + test: + strategy: + fail-fast: false + matrix: + include: + - java: 11 + - java: 8 + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Sync the current branch with the latest in spark-testing-base + if: github.repository != 'high-performance-spark/high-performance-spark-examples' + id: sync-branch + run: | + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/} + git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD || echo "no merge needed." + git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" || echo "no merge needed." + - name: Setup JDK + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: ${{ matrix.java }} + cache: sbt + - name: Build and Test + run: sbt clean +test diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 2630422..0000000 --- a/.travis.yml +++ /dev/null @@ -1,66 +0,0 @@ -dist: trusty - -language: scala -sudo: false -cache: - directories: - - $HOME/.ivy2 - - $HOME/spark - - $HOME/.cache/pip - - $HOME/.pip-cache - - $HOME/.sbt/launchers - - $HOME/perl5 -scala: - - 2.11.6 -jdk: - - oraclejdk8 -r: - - release -addons: - apt: - sources: - - ubuntu-toolchain-r-test - - ppa:marutter/rdev - packages: - - gfortran - - gcc - - binutils - - python-pip - - python-pandas - - python-numpy - - gfortran - - cmake - - perl - - cpanminus - - r-base - - libcurl4-gnutls-dev - - libxml2-dev - - libssl-dev - - r-base-dev - - axel -r_packages: - - Imap -before_install: - - # Setup Python - - pip install --user codecov unittest2 nose pep8 pylint - - # Setup perl - - cpanm --force --local-lib $HOME/perl5 --quite --notest Pithub || cat ~/.cpanm/build.log - - cd ./src/main/perl; cpanm --local-lib $HOME/perl5 --force --quiet --installdeps --notest .; cd ../../../ - - PATH="$HOME/perl5/bin${PATH:+:${PATH}}"; export PATH; - - PERL5LIB=":$HOME/perl5/lib/perl5${PERL5LIB:+:${PERL5LIB}}"; export PERL5LIB; - - PERL_LOCAL_LIB_ROOT="$HOME/perl5${PERL_LOCAL_LIB_ROOT:+:${PERL_LOCAL_LIB_ROOT}}"; export PERL_LOCAL_LIB_ROOT; - - PERL_MB_OPT="--install_base \"$HOME/perl5\""; export PERL_MB_OPT; - - PERL_MM_OPT="INSTALL_BASE=$HOME/perl5"; export PERL_MM_OPT; -script: - - "export SPARK_CONF_DIR=./log4j/" - - sbt clean coverage compile package assembly test || (rm -rf ~/.ivy2 ~/.m2 && sbt clean coverage compile package test) - - "[ -f spark] || mkdir spark && cd spark && axel http://d3kbcqa49mib13.cloudfront.net/spark-2.2.0-bin-hadoop2.7.tgz && cd .." 
- - "tar -xf ./spark/spark-2.2.0-bin-hadoop2.7.tgz" - - "export SPARK_HOME=`pwd`/spark-2.2.0-bin-hadoop2.7" - - "export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH" - - "PYSPARK_SUBMIT_ARGS='--jars ./target/examples-assembly-0.0.1.jar pyspark-shell' nosetests --with-doctest --doctest-options=+ELLIPSIS --logging-level=INFO --detailed-errors --verbosity=2 --with-coverage --cover-html-dir=./htmlcov" - - # $SPARK_HOME/bin/spark-submit ./src/main/r/wc.R $SPARK_HOME/README.md - - # $SPARK_HOME/bin/spark-submit ./src/main/r/dapply.R -after_success: - - sbt coverageReport || sbt update coverageReport - - codecov \ No newline at end of file From eab02654b24384fd23e0dad46ac6cd109ba615b3 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 3 Dec 2022 07:38:42 -0800 Subject: [PATCH 04/53] Start getting ready to apply to auto-upgrade rules Update sbt verison more build fixes jodersky jni change --- .scalafix.conf | 31 ++++++ README.md | 2 + build.sbt | 105 ++++++++---------- build_windows.sbt | 91 --------------- project/build.properties | 2 +- project/plugins.sbt | 21 ++-- .../native/SumJNI.scala | 2 +- 7 files changed, 94 insertions(+), 160 deletions(-) create mode 100644 .scalafix.conf delete mode 100644 build_windows.sbt diff --git a/.scalafix.conf b/.scalafix.conf new file mode 100644 index 0000000..69589c1 --- /dev/null +++ b/.scalafix.conf @@ -0,0 +1,31 @@ +UnionRewrite.deprecatedMethod { + "unionAll" = "union" +} + +OrganizeImports { + blankLines = Auto, + groups = [ + "re:javax?\\." + "scala." + "org.apache.spark." + "*" + ], + removeUnused = false +} + +rules = [ + OrganizeImports, + DisableSyntax, + SparkAutoUpgrade, + MigrateHiveContext, + MigrateToSparkSessionBuilder, + MigrateDeprecatedDataFrameReaderFuns, + AccumulatorUpgrade, + onFailureFix, + ExecutorPluginWarn, + UnionRewrite, + GroupByKeyWarn, + GroupByKeyRewrite, + MetadataWarnQQ, + ScalaTestImportChange +] \ No newline at end of file diff --git a/README.md b/README.md index 551928f..ad8c8f6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # high-performance-spark-examples Examples for High Performance Spark +We are in the progress of updata this for Spark 3.3+ and the 2ed edition of our book! + # Building Most of the examples can be built with sbt, the C and Fortran components depend on gcc, g77, and cmake. 
diff --git a/build.sbt b/build.sbt index 28dbf2e..e515752 100644 --- a/build.sbt +++ b/build.sbt @@ -1,55 +1,58 @@ organization := "com.highperformancespark" +//tag::addSparkScalaFix[] +ThisBuild / scalafixDependencies += + "com.holdenkarau" %% "spark-scalafix-rules-2.4.8" % "0.1.4" +ThisBuild / scalafixDependencies += + "com.github.liancheng" %% "organize-imports" % "0.6.0" +//end::addSparkScalaFix[] + +lazy val V = _root_.scalafix.sbt.BuildInfo + +scalaVersion := V.scala212 +addCompilerPlugin(scalafixSemanticdb) +scalacOptions ++= List( + "-Yrangepos", + "-P:semanticdb:synthetics:on" +) + + name := "examples" publishMavenStyle := true version := "0.0.1" -scalaVersion := "2.11.6" -scalaVersion in ThisBuild := "2.11.6" -ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) } - -crossScalaVersions := Seq("2.11.6") - javacOptions ++= Seq("-source", "1.8", "-target", "1.8") -//tag::sparkVersion[] -sparkVersion := "2.2.0" -//end::sparkVersion[] - -//tag::sparkComponents[] -sparkComponents ++= Seq("core") -//end::sparkComponents[] -//tag::sparkExtraComponents[] -sparkComponents ++= Seq("streaming", "mllib") -//end::sparkExtraComponents[] -//tag::addSQLHiveComponent[] -sparkComponents ++= Seq("sql", "hive", "hive-thriftserver", "hive-thriftserver") -//end::addSQLHiveComponent[] - parallelExecution in Test := false fork := true javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") +val sparkVersion = settingKey[String]("Spark version") +val sparkTestingVersion = settingKey[String]("Spark testing base version without Spark version part") + +// 2.4.5 is the highest version we have with the old spark-testing-base deps +sparkVersion := System.getProperty("sparkVersion", "2.4.5") +sparkTestingVersion := "0.14.0" + // additional libraries libraryDependencies ++= Seq( - "org.scalatest" %% "scalatest" % "3.0.1", - "org.scalacheck" %% "scalacheck" % "1.13.4", - "junit" % "junit" % "4.12", - "junit" % "junit" % "4.11", - "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.7.2", - "com.novocode" % "junit-interface" % "0.11" % "test->default", + "org.apache.spark" %% "spark-core" % sparkVersion.value, + "org.apache.spark" %% "spark-streaming" % sparkVersion.value, + "org.apache.spark" %% "spark-sql" % sparkVersion.value, + "org.apache.spark" %% "spark-hive" % sparkVersion.value, + "org.apache.spark" %% "spark-hive-thriftserver" % sparkVersion.value, + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value, + "org.apache.spark" %% "spark-yarn" % sparkVersion.value, + "org.apache.spark" %% "spark-mllib" % sparkVersion.value, + "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_${sparkTestingVersion.value}", //tag::scalaLogging[] - "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0", + "com.typesafe.scala-logging" %% "scala-logging" % "3.9.4", //end::scalaLogging[] - "org.codehaus.jackson" % "jackson-core-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-core-asl" % "1.9.13", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.9.13", - "net.java.dev.jna" % "jna" % "4.2.2") + "net.java.dev.jna" % "jna" % "5.12.1") scalacOptions ++= Seq("-deprecation", "-unchecked") @@ -57,41 +60,29 @@ scalacOptions ++= Seq("-deprecation", "-unchecked") pomIncludeRepository := { x => false } resolvers ++= Seq( - "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", - "Spray Repository" at "http://repo.spray.cc/", + "JBoss Repository" at 
"https://repository.jboss.org/nexus/content/repositories/releases/", "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", - "Akka Repository" at "http://repo.akka.io/releases/", - "Twitter4J Repository" at "http://twitter4j.org/maven2/", "Apache HBase" at "https://repository.apache.org/content/repositories/releases", - "Twitter Maven Repo" at "http://maven.twttr.com/", + "Twitter Maven Repo" at "https://maven.twttr.com/", "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", - "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", - "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", - "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", - Resolver.sonatypeRepo("public"), - Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), - "jodersky" at "https://dl.bintray.com/jodersky/maven/" + "Typesafe repository" at "https://repo.typesafe.com/typesafe/releases/", + "Second Typesafe repo" at "https://repo.typesafe.com/typesafe/maven-releases/", + "Mesosphere Public Repository" at "https://downloads.mesosphere.io/maven", + Resolver.sonatypeRepo("public") ) licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) -mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => - { - case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard - case m if m.startsWith("META-INF") => MergeStrategy.discard - case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first - case PathList("org", "apache", xs @ _*) => MergeStrategy.first - case PathList("org", "jboss", xs @ _*) => MergeStrategy.first - case "log4j.properties" => MergeStrategy.discard - case "about.html" => MergeStrategy.rename - case "reference.conf" => MergeStrategy.concat - case _ => MergeStrategy.first - } -} - // JNI enablePlugins(JniNative) sourceDirectory in nativeCompile := sourceDirectory.value + +//tag::xmlVersionConflict[] +// See https://github.com/scala/bug/issues/12632 +ThisBuild / libraryDependencySchemes ++= Seq( + "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always +) +//end::xmlVersionConflict[] diff --git a/build_windows.sbt b/build_windows.sbt deleted file mode 100644 index 7b68f4a..0000000 --- a/build_windows.sbt +++ /dev/null @@ -1,91 +0,0 @@ -organization := "com.highperformancespark" - -name := "examples" - -publishMavenStyle := true - -version := "0.0.1" - -scalaVersion := "2.11.6" -scalaVersion in ThisBuild := "2.11.6" -ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) } - -crossScalaVersions := Seq("2.11.6") - -javacOptions ++= Seq("-source", "1.8", "-target", "1.8") - -//tag::sparkVersion[] -sparkVersion := "2.2.0" -//end::sparkVersion[] - -//tag::sparkComponents[] -sparkComponents ++= Seq("core") -//end::sparkComponents[] -//tag::sparkExtraComponents[] -sparkComponents ++= Seq("streaming", "mllib") -//end::sparkExtraComponents[] -//tag::addSQLHiveComponent[] -sparkComponents ++= Seq("sql", "hive", "hive-thriftserver", "hive-thriftserver") -//end::addSQLHiveComponent[] - -parallelExecution in Test := false - -fork := true - -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") - -// additional libraries -libraryDependencies ++= Seq( - "org.scalatest" %% "scalatest" % "3.0.1", - "org.scalacheck" %% "scalacheck" % "1.13.4", - "junit" % "junit" % "4.12", - "junit" % "junit" % 
"4.11", - "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.7.2", - "com.novocode" % "junit-interface" % "0.11" % "test->default", - //tag::sacalLogging[] - "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0", - //end::scalaLogging[] - "org.codehaus.jackson" % "jackson-core-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", - "org.codehaus.jackson" % "jackson-core-asl" % "1.9.13", - "org.codehaus.jackson" % "jackson-mapper-asl" % "1.9.13", - "net.java.dev.jna" % "jna" % "4.2.2") - - -scalacOptions ++= Seq("-deprecation", "-unchecked") - -pomIncludeRepository := { x => false } - -resolvers ++= Seq( - "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", - "Spray Repository" at "http://repo.spray.cc/", - "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", - "Akka Repository" at "http://repo.akka.io/releases/", - "Twitter4J Repository" at "http://twitter4j.org/maven2/", - "Apache HBase" at "https://repository.apache.org/content/repositories/releases", - "Twitter Maven Repo" at "http://maven.twttr.com/", - "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", - "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", - "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", - "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", - "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", - Resolver.sonatypeRepo("public"), - Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), - "jodersky" at "https://dl.bintray.com/jodersky/maven/" -) - -licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) - -mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => - { - case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard - case m if m.startsWith("META-INF") => MergeStrategy.discard - case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first - case PathList("org", "apache", xs @ _*) => MergeStrategy.first - case PathList("org", "jboss", xs @ _*) => MergeStrategy.first - case "log4j.properties" => MergeStrategy.discard - case "about.html" => MergeStrategy.rename - case "reference.conf" => MergeStrategy.concat - case _ => MergeStrategy.first - } -} diff --git a/project/build.properties b/project/build.properties index 8e682c5..8b9a0b0 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=0.13.18 +sbt.version=1.8.0 diff --git a/project/plugins.sbt b/project/plugins.sbt index f3e7599..aa2b0a4 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,22 +1,23 @@ -addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") +addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/" -//tag::addSparkPackagesPlugin[] -resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") -addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.5") -//end::addSparkPackagesPlugin[] +addDependencyTreePlugin -//addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") - -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.0") +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.10.4") //tag::sbtJNIPlugin[] 
-addSbtPlugin("ch.jodersky" %% "sbt-jni" % "1.0.0-RC3") +addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.5.4") //end::sbtJNIPlugin[] -addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") +//tag::xmlVersionConflict[] +// See https://github.com/scala/bug/issues/12632 +ThisBuild / libraryDependencySchemes ++= Seq( + "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always +) +//end::xmlVersionConflict[] diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala b/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala index ed0caaf..65de6c2 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala +++ b/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala @@ -1,6 +1,6 @@ package com.highperformancespark.examples.ffi -import ch.jodersky.jni.nativeLoader +import com.github.sbt.jni.nativeLoader //tag::sumJNIDecorator[] @nativeLoader("high-performance-spark0") From 394e834c8d1c5bd3b0285ffc66e165618d2d0d79 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 3 Dec 2022 13:23:17 -0800 Subject: [PATCH 05/53] Update scalafix rules --- .scalafix.conf | 1 + build.sbt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.scalafix.conf b/.scalafix.conf index 69589c1..793a60d 100644 --- a/.scalafix.conf +++ b/.scalafix.conf @@ -27,5 +27,6 @@ rules = [ GroupByKeyWarn, GroupByKeyRewrite, MetadataWarnQQ, + ScalaTestExtendsFix, ScalaTestImportChange ] \ No newline at end of file diff --git a/build.sbt b/build.sbt index e515752..140195d 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,7 @@ organization := "com.highperformancespark" //tag::addSparkScalaFix[] ThisBuild / scalafixDependencies += - "com.holdenkarau" %% "spark-scalafix-rules-2.4.8" % "0.1.4" + "com.holdenkarau" %% "spark-scalafix-rules-2.4.8" % "0.1.5" ThisBuild / scalafixDependencies += "com.github.liancheng" %% "organize-imports" % "0.6.0" //end::addSparkScalaFix[] From 32b54c93c021bf70018939e6e0b945989e052b19 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 3 Dec 2022 13:23:33 -0800 Subject: [PATCH 06/53] Port accumulators, this is a semi-manual port Acc Changes -- semi manual --- .../transformations/Accumulators.scala | 51 +++++++++++++++---- .../transformations/NewAccumulators.scala | 8 ++- .../transformations/Accumulators.scala | 5 +- 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala index bddc84b..f4816d7 100644 --- a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -4,12 +4,15 @@ */ package com.highperformancespark.examples.transformations -import com.highperformancespark.examples.dataframe.RawPanda +import java.{lang => jl} + +import scala.collection.mutable.HashSet import org.apache.spark._ +import org.apache.spark.util.AccumulatorV2 import org.apache.spark.rdd._ -import scala.collection.mutable.HashSet +import com.highperformancespark.examples.dataframe.RawPanda object Accumulators { /** * Compute the total fuzzyness with an accumulator while generating @@ -19,8 +22,8 @@ object Accumulators { def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): (RDD[(String, Long)], Double) = { // Create an accumulator with the initial value of 0.0 - val acc = sc.accumulator(0.0) - val 
transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} + val acc = sc.doubleAccumulator + val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} // accumulator still has zero value // Note: This example is dangerous since the transformation may be // evaluated multiple times. @@ -36,15 +39,43 @@ object Accumulators { //tag::maxFuzzyAcc[] def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): (RDD[(String, Long)], Double) = { - object MaxDoubleParam extends AccumulatorParam[Double] { - override def zero(initValue: Double) = initValue - override def addInPlace(r1: Double, r2: Double): Double = { - Math.max(r1, r2) + class MaxDoubleParam extends AccumulatorV2[jl.Double, jl.Double] { + var _value = Double.MinValue + override def isZero(): Boolean = { + _value == Double.MinValue + } + override def reset() = { + _value = Double.MinValue + } + + override def add(r1: jl.Double): Unit = { + _value = Math.max(r1, _value) } + + def add(r1: Double): Unit = { + _value = Math.max(r1, _value) + } + + def copy(): MaxDoubleParam = { + val newAcc = new MaxDoubleParam() + newAcc._value = _value + newAcc + } + + override def merge(other: AccumulatorV2[jl.Double, jl.Double]): Unit = other match { + case o: MaxDoubleParam => + _value = Math.max(_value, o._value) + case _ => + throw new UnsupportedOperationException( + s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") + } + + override def value: jl.Double = _value } // Create an accumulator with the initial value of Double.MinValue - val acc = sc.accumulator(Double.MinValue)(MaxDoubleParam) - val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} + val acc = new MaxDoubleParam() + sc.register(acc) + val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} // accumulator still has Double.MinValue // Note: This example is dangerous since the transformation may be // evaluated multiple times. 
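The port above moves from the removed sc.accumulator / AccumulatorParam API to AccumulatorV2: built-in numeric accumulators come straight from the SparkContext, while custom ones are instantiated and registered explicitly. A minimal sketch of that usage pattern (not part of the patch; it assumes a live SparkContext named sc, an RDD[Double] named values, and the MaxDoubleParam class defined above being in scope):

    // Built-in accumulator: created and registered in one call.
    val total = sc.doubleAccumulator("totalFuzzyness")
    values.foreach(v => total.add(v))
    total.value  // read on the driver once the action has run

    // Custom accumulator: extend AccumulatorV2 and register it by hand.
    val max = new MaxDoubleParam()
    sc.register(max, "maxFuzzyness")
    values.foreach(v => max.add(v))
    max.value
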
diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala index 948df49..aca8541 100644 --- a/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala +++ b/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala @@ -6,15 +6,13 @@ */ package com.highperformancespark.examples.transformations -import com.highperformancespark.examples.dataframe.RawPanda +import scala.collection.mutable.HashSet import org.apache.spark._ -//tag::import[] -import org.apache.spark.util.AccumulatorV2 -//end::import[] import org.apache.spark.rdd._ +import org.apache.spark.util.AccumulatorV2 -import scala.collection.mutable.HashSet +import com.highperformancespark.examples.dataframe.RawPanda object NewAccumulators { /** * Compute the total fuzzyness with an accumulator while generating diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala index 4000f23..d043d38 100644 --- a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -3,12 +3,11 @@ */ package com.highperformancespark.examples.transformations -import com.highperformancespark.examples.dataframe.RawPanda +import scala.collection.immutable.HashSet +import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ - import org.scalatest.FunSuite -import scala.collection.immutable.HashSet class AccumulatorsTest extends FunSuite with SharedSparkContext { test("accumulator max should function") { From 8fb2494d583cc91c93f1db49f7529ebf1acae804 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 3 Dec 2022 13:37:06 -0800 Subject: [PATCH 07/53] Run scalafixAll OrganizeImports --- .scalafix.conf | 1 - .../dataframe/HappyPandas.scala | 13 ++++------ .../dataframe/MixedDataset.scala | 3 +-- .../goldilocks/GoldilocksFirstTry.scala | 10 ++++--- .../goldilocks/GoldilocksSecondarySort.scala | 6 ++--- .../goldilocks/GoldilocksWithHashMap.scala | 7 ++--- .../goldilocks/RDDJoinExamples.scala | 6 ++--- .../goldilocks/SecondarySort.scala | 3 ++- .../ml/CustomPipeline.scala | 21 ++++++++------- .../ml/SimpleNaiveBayes.scala | 21 ++++++++------- .../ml/SimplePipeline.scala | 26 +++++++++---------- .../mllib/GoldilocksMLlib.scala | 22 ++++++++-------- .../native/PipeExample.scala | 3 ++- .../perf/SimplePerfTest.scala | 14 ++++++---- .../streaming/DStream.scala | 9 +++---- .../tools/FilterInvalidPandas.scala | 4 +-- .../tools/GenerateScalingData.scala | 8 +++--- .../tools/SampleData.scala | 2 +- .../transformations/Accumulators.scala | 2 +- .../dataframe/HappyPandasTest.scala | 18 ++++++++----- .../dataframe/MixedDatasetSuite.scala | 18 ++++++++----- .../errors/ThrowsSuite.scala | 1 - .../goldilocks/EvaluationTests.scala | 3 ++- .../goldilocks/GoldilocksLargeTests.scala | 15 +++++++---- .../goldilocks/JoinTest.scala | 3 ++- .../QuantileOnlyArtisanalTest.scala | 7 +++-- .../goldilocks/SortingTests.scala | 7 ++--- .../ml/CustomPipeline.scala | 3 ++- .../ml/SimpleNaiveBayes.scala | 15 ++++++----- .../mllib/GoldilocksMLlibSuite.scala | 7 ++--- .../native/NativeExample.scala | 5 ++-- .../native/PipeExampleSuite.scala | 2 +- .../streaming/DStreamSuite.scala | 4 +-- .../tokenize/SampleTokenizeSuite.scala 
| 4 +-- .../tools/FilterInvalidPandasSuite.scala | 2 -- .../tools/GenerateScalingDataSuite.scala | 2 -- 36 files changed, 157 insertions(+), 140 deletions(-) diff --git a/.scalafix.conf b/.scalafix.conf index 793a60d..8697e8f 100644 --- a/.scalafix.conf +++ b/.scalafix.conf @@ -14,7 +14,6 @@ OrganizeImports { } rules = [ - OrganizeImports, DisableSyntax, SparkAutoUpgrade, MigrateHiveContext, diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala index 8aeb8eb..1a51290 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -6,17 +6,14 @@ package com.highperformancespark.examples.dataframe import org.apache.spark._ import org.apache.spark.rdd.RDD -//tag::sparkSQLImports[] -import org.apache.spark.sql.{Dataset, DataFrame, SparkSession, Row} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.expressions._ import org.apache.spark.sql.functions._ -//end::sparkSQLImports[] - -//tag::legacySparkSQLImports[] -import org.apache.spark.sql.SQLContext -//end::legacySparkSQLImports[] -//tag::legacySparkHiveImports[] import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver._ //end::legacySparkHiveImports[] diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala index e9049d9..8943ba7 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala @@ -9,10 +9,9 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.expressions._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -// Additional imports for using HiveContext import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.thriftserver._ +import org.apache.spark.sql.types._ case class MiniPandaInfo(zip: String, size: Double) diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala index 9f8ec9d..341364e 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala @@ -1,12 +1,14 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable +import scala.collection.mutable.MutableList + import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row import org.apache.spark.storage.StorageLevel -import scala.collection.mutable.MutableList -import scala.collection.{Map, mutable} - object GoldilocksGroupByKey { //tag::groupByKey[] def findRankStatistics( diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala 
b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala index 92cb44f..71a66af 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala @@ -1,12 +1,12 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable.ArrayBuffer + import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import scala.collection.Map -import scala.collection.mutable.ArrayBuffer - //tag::colIndex_partition[] class ColumnIndexPartition(override val numPartitions: Int) extends Partitioner { diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala index 2b3adc1..9dd365b 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala @@ -1,12 +1,13 @@ package com.highperformancespark.examples.goldilocks +import scala.collection.Map +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel -import scala.collection.mutable.ArrayBuffer -import scala.collection.{Map, mutable} - object GoldilocksWithHashMap { diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala index a60a39f..1fef85c 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala @@ -1,11 +1,11 @@ package com.highperformancespark.examples.goldilocks -import org.apache.spark.HashPartitioner -import org.apache.spark.rdd.RDD - import scala.collection.Map import scala.reflect.ClassTag +import org.apache.spark.HashPartitioner +import org.apache.spark.rdd.RDD + object RDDJoinExamples { /* For Example, suppose we have one RDD with some data in the form (Panda id, score) diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala index 2b73ba4..b4e0873 100644 --- a/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala @@ -3,7 +3,8 @@ package com.highperformancespark.examples.goldilocks import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag -import org.apache.spark.{HashPartitioner, Partitioner} +import org.apache.spark.HashPartitioner +import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD object PandaSecondarySort { diff --git a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala index 2b87a7e..bd84cc0 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala +++ b/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -1,22 +1,23 @@ package com.highperformancespark.examples.ml -import 
com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.MutableList import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ import org.apache.spark.ml._ import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg._ -//tag::extraImports[] import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ //end::extraImports[] //tag::basicPipelineSetup[] diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 13e937f..6b1e55e 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -1,22 +1,23 @@ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.MutableList import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ import org.apache.spark.ml._ import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg._ -//tag::extraImports[] import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ //end::extraImports[] case class LabeledToken(label: Double, index: Integer) diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala b/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala index 9117c74..d161563 100644 --- a/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala +++ b/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala @@ -1,26 +1,24 @@ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.MutableList import org.apache.spark._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ -//tag::basicImport[] import org.apache.spark.ml._ -import org.apache.spark.ml.feature._ import org.apache.spark.ml.classification._ -//end::basicImport[] -//tag::renameImport[] 
+import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg.{Vector => SparkVector} -//end::renameImport[] import org.apache.spark.ml.param._ import org.apache.spark.ml.tuning._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe._ object SimplePipeline { def constructAndSetParams(df: DataFrame) = { diff --git a/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala b/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala index ddbc9d6..cde64c7 100644 --- a/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala +++ b/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala @@ -1,21 +1,21 @@ package com.highperformancespark.examples.mllib -import com.highperformancespark.examples.dataframe._ - -import scala.collection.{Map, mutable} -import scala.collection.mutable.{ArrayBuffer, MutableList} +import scala.collection.Map +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.MutableList import org.apache.spark._ -import org.apache.spark.rdd.RDD -//tag::imports[] -import com.github.fommil.netlib.BLAS.{getInstance => blas} +import org.apache.spark.mllib.classification.LogisticRegressionModel +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS +import org.apache.spark.mllib.feature._ import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, - LogisticRegressionModel} -// Rename Vector to SparkVector to avoid conflicts with Scala's Vector class import org.apache.spark.mllib.linalg.{Vector => SparkVector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.feature._ +import org.apache.spark.rdd.RDD + +import com.github.fommil.netlib.BLAS.{getInstance => blas} +import com.highperformancespark.examples.dataframe._ //end::imports[] object GoldilocksMLlib { diff --git a/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala b/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala index 40eb61f..ca6d65c 100644 --- a/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala +++ b/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala @@ -16,8 +16,9 @@ */ package com.highperformancespark.examples.ffi +import org.apache.spark.SparkContext +import org.apache.spark.SparkFiles import org.apache.spark.rdd._ -import org.apache.spark.{SparkContext, SparkFiles} object PipeExample { //tag::pipeExample[] diff --git a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala index 5859a78..197c6b7 100644 --- a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala +++ b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala @@ -16,14 +16,18 @@ */ package com.highperformancespark.examples.perf -import com.highperformancespark.examples.dataframe.RawPanda -import com.highperformancespark.examples.tools._ - +import org.apache.spark.SparkConf +import org.apache.spark.SparkContext import org.apache.spark.rdd._ -import org.apache.spark.{SparkContext, SparkConf} -import org.apache.spark.sql.{SparkSession, DataFrame, Dataset, Row} +import 
org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types._ +import com.highperformancespark.examples.dataframe.RawPanda +import com.highperformancespark.examples.tools._ + /** * A simple performance test to compare a simple sort between DataFrame, and RDD */ diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala b/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala index 2fa173c..2cde7b2 100644 --- a/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala +++ b/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala @@ -5,15 +5,14 @@ package com.highperformancespark.examples.streaming import scala.reflect.ClassTag -import org.apache.hadoop.io.{LongWritable, Text} -import org.apache.hadoop.mapreduce.lib.input.TextInputFormat - import org.apache.spark._ import org.apache.spark.rdd.RDD - -//tag::DStreamImports[] import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream._ + +import org.apache.hadoop.io.LongWritable +import org.apache.hadoop.io.Text +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat //end::DStreamImports[] object DStreamExamples { diff --git a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala index 02287ae..bd51db2 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala +++ b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala @@ -2,12 +2,10 @@ package com.highperformancespark.examples.tools import scala.collection.immutable.HashSet -import com.highperformancespark.examples.dataframe.RawPanda - import org.apache.spark._ import org.apache.spark.rdd.RDD -//tag::loggerImport[] +import com.highperformancespark.examples.dataframe.RawPanda import com.typesafe.scalalogging.LazyLogging //end::loggerImport[] diff --git a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala b/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala index da4fd38..586ee3b 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala +++ b/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala @@ -1,12 +1,12 @@ package com.highperformancespark.examples.tools -import com.highperformancespark.examples.dataframe.RawPanda - import org.apache.spark._ +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row -import org.apache.spark.mllib.random.RandomRDDs -import org.apache.spark.mllib.linalg.Vector + +import com.highperformancespark.examples.dataframe.RawPanda object GenerateScalingData { /** diff --git a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala b/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala index 298a7c3..3068441 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala +++ b/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala @@ -1,5 +1,5 @@ +import scala.reflect.ClassTag import scala.util.Random -import scala.reflect.{ClassTag} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD diff --git 
a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala index f4816d7..636fd90 100644 --- a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -9,8 +9,8 @@ import java.{lang => jl} import scala.collection.mutable.HashSet import org.apache.spark._ -import org.apache.spark.util.AccumulatorV2 import org.apache.spark.rdd._ +import org.apache.spark.util.AccumulatorV2 import com.highperformancespark.examples.dataframe.RawPanda object Accumulators { diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index 3fb10a5..695dbcf 100644 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -4,16 +4,20 @@ */ package com.highperformancespark.examples.dataframe -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} -import com.holdenkarau.spark.testing._ -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.scalatest.Matchers._ -import org.scalatest.FunSuite - import scala.collection.mutable import scala.util.Random +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.FunSuite +import org.scalatest.Matchers._ + class HappyPandasTest extends FunSuite with DataFrameSuiteBase { val toronto = "toronto" val sandiego = "san diego" diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala index 1dc5d4a..44b066c 100644 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala @@ -3,17 +3,21 @@ */ package com.highperformancespark.examples.dataframe -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} -import com.holdenkarau.spark.testing._ -import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.scalatest.Matchers._ -import org.scalatest.FunSuite - import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.Random +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.FunSuite +import org.scalatest.Matchers._ + class MixedDatasetSuite extends FunSuite with DataFrameSuiteBase with DatasetSuiteBase diff --git a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala b/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala index 2b54ce7..c2b68f7 
100644 --- a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala @@ -1,7 +1,6 @@ package com.highperformancespark.examples.errors import com.holdenkarau.spark.testing._ - import org.scalatest.FunSuite class ThrowsSuite extends FunSuite with SharedSparkContext { diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala index 9708284..56c279e 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala @@ -1,7 +1,8 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.rdd.RDD + +import com.holdenkarau.spark.testing.SharedSparkContext import org.scalatest.FunSuite class EvaluationTests extends FunSuite with SharedSparkContext { diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala index 5388477..fed7d85 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala @@ -1,13 +1,18 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext +import scala.collection.immutable.IndexedSeq + import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{StructType, DoubleType, StructField} -import org.apache.spark.sql.{Row, SQLContext, DataFrame} -import org.scalatest.FunSuite +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.types.DoubleType +import org.apache.spark.sql.types.StructField +import org.apache.spark.sql.types.StructType -import scala.collection.immutable.IndexedSeq +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.FunSuite class GoldilocksLargeTests extends FunSuite with SharedSparkContext{ diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala index 69dcc5e..50315b8 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala @@ -1,7 +1,8 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.rdd.RDD + +import com.holdenkarau.spark.testing.SharedSparkContext import org.scalatest.FunSuite diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala index 131f311..c649548 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala @@ -1,8 +1,11 @@ package com.highperformancespark.examples.goldilocks import org.apache.spark._ -import org.apache.spark.sql.{Row, SQLContext} -import 
org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext + +import org.scalatest.BeforeAndAfterAll +import org.scalatest.FunSuite // tag::MAGIC_PANDA[] diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala index 4ac03e7..0f1af7b 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala @@ -1,11 +1,12 @@ package com.highperformancespark.examples.goldilocks -import com.holdenkarau.spark.testing.SharedSparkContext +import scala.reflect.ClassTag + import org.apache.spark.rdd.RDD -import org.scalatest.FunSuite -import scala.reflect.ClassTag +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.FunSuite class SortingTests extends FunSuite with SharedSparkContext { diff --git a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala index 3b9159c..2303bd9 100644 --- a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala +++ b/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -3,8 +3,9 @@ */ package com.highperformancespark.examples.ml -import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.sql.Dataset + +import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.scalatest.FunSuite case class TestRow(id: Int, inputColumn: String) diff --git a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 1fa296a..2fc3739 100644 --- a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -3,17 +3,20 @@ */ package com.highperformancespark.examples.ml -import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} - -import com.holdenkarau.spark.testing._ - import org.apache.spark.ml._ import org.apache.spark.ml.feature._ import org.apache.spark.ml.param._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext} -import org.scalatest.Matchers._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ import org.scalatest.FunSuite +import org.scalatest.Matchers._ case class MiniPanda(happy: Double, fuzzy: Double, old: Double) diff --git a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala b/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala index fa551a5..280d3e4 100644 --- a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala @@ -3,15 +3,12 @@ */ package com.highperformancespark.examples.mllib -import com.highperformancespark.examples.dataframe.RawPanda +import org.apache.spark.mllib.linalg.{Vector => SparkVector} +import com.highperformancespark.examples.dataframe.RawPanda 
import com.holdenkarau.spark.testing._ - import org.scalatest.FunSuite - -import org.apache.spark.mllib.linalg.{Vector => SparkVector} - class GoldilocksMLlibSuite extends FunSuite with SharedSparkContext { val rps = List( RawPanda(1L, "94110", "giant", true, Array(0.0, 0.0)), diff --git a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala index 724ddaa..c1dc58d 100644 --- a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala +++ b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -4,11 +4,12 @@ package com.highperformancespark.examples.ffi import com.holdenkarau.spark.testing._ -import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Arbitrary +import org.scalacheck.Gen import org.scalacheck.Prop.forAll import org.scalatest.FunSuite -import org.scalatest.prop.Checkers import org.scalatest.Matchers._ +import org.scalatest.prop.Checkers class NativeExampleSuite extends FunSuite with SharedSparkContext with Checkers with RDDComparisons { diff --git a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala b/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala index 4b1f032..ded5388 100644 --- a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala @@ -5,8 +5,8 @@ package com.highperformancespark.examples.ffi import com.holdenkarau.spark.testing._ import org.scalatest.FunSuite -import org.scalatest.prop.Checkers import org.scalatest.Matchers._ +import org.scalatest.prop.Checkers class PipeExampleSuite extends FunSuite with SharedSparkContext with Checkers { diff --git a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala b/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala index b5a3d44..878aeee 100644 --- a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala @@ -5,11 +5,11 @@ */ package com.highperformancespark.examples.streaming +import java.lang.Thread + import org.apache.spark.streaming._ -import java.lang.Thread import com.holdenkarau.spark.testing._ - import org.scalatest.FunSuite class DStreamExamplesSuite extends FunSuite with SharedSparkContext { diff --git a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala b/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala index ca364d1..bc8fd84 100644 --- a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala @@ -3,11 +3,11 @@ */ package com.highperformancespark.examples.tokenize +import java.lang.Thread + import org.apache.spark.streaming._ -import java.lang.Thread import com.holdenkarau.spark.testing._ - import org.scalatest.FunSuite class SampleTokenizeSuite extends FunSuite with SharedSparkContext { diff --git a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala b/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala index 897a8d3..b924cf4 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala +++ 
b/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala @@ -4,9 +4,7 @@ package com.highperformancespark.examples.tools import com.highperformancespark.examples.dataframe.RawPanda - import com.holdenkarau.spark.testing._ - import org.scalatest.FunSuite class FilterInvalidPandasSuite extends FunSuite with SharedSparkContext { diff --git a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala b/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala index 15f60d1..52933ce 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala @@ -4,9 +4,7 @@ package com.highperformancespark.examples.tools import com.highperformancespark.examples.dataframe.RawPanda - import com.holdenkarau.spark.testing._ - import org.scalatest.FunSuite class GeneratescalaingDataSuite extends FunSuite with SharedSparkContext { From e733c25c9676fe72d4bbecd59b2aeef1b2c005d3 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 3 Dec 2022 13:43:57 -0800 Subject: [PATCH 08/53] Upgrade! Yay! --- build.sbt | 4 ++-- .../dataframe/HappyPandas.scala | 14 +++++++------- .../examples/dataframe/JavaHappyPandasTest.java | 5 ++++- .../dataframe/HappyPandasTest.scala | 6 +++--- .../dataframe/MixedDatasetSuite.scala | 6 +++--- .../errors/ThrowsSuite.scala | 4 ++-- .../goldilocks/EvaluationTests.scala | 4 ++-- .../goldilocks/GoldilocksLargeTests.scala | 7 ++++--- .../goldilocks/JoinTest.scala | 4 ++-- .../goldilocks/QuantileOnlyArtisanalTest.scala | 13 +++++++------ .../goldilocks/SortingTests.scala | 4 ++-- .../ml/CustomPipeline.scala | 4 ++-- .../ml/SimpleNaiveBayes.scala | 6 +++--- .../mllib/GoldilocksMLlibSuite.scala | 4 ++-- .../native/NativeExample.scala | 8 ++++---- .../native/PipeExampleSuite.scala | 8 ++++---- .../streaming/DStreamSuite.scala | 4 ++-- .../tokenize/SampleTokenizeSuite.scala | 4 ++-- .../tools/FilterInvalidPandasSuite.scala | 4 ++-- .../tools/GenerateScalingDataSuite.scala | 4 ++-- .../transformations/Accumulators.scala | 4 ++-- .../wordcount/WordCountTest.scala | 4 ++-- 22 files changed, 65 insertions(+), 60 deletions(-) diff --git a/build.sbt b/build.sbt index 140195d..e60437b 100644 --- a/build.sbt +++ b/build.sbt @@ -35,8 +35,8 @@ val sparkVersion = settingKey[String]("Spark version") val sparkTestingVersion = settingKey[String]("Spark testing base version without Spark version part") // 2.4.5 is the highest version we have with the old spark-testing-base deps -sparkVersion := System.getProperty("sparkVersion", "2.4.5") -sparkTestingVersion := "0.14.0" +sparkVersion := System.getProperty("sparkVersion", "2.4.8") +sparkTestingVersion := "1.3.0" // additional libraries libraryDependencies ++= Seq( diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala index 1a51290..f9ce89e 100644 --- a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -14,8 +14,8 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.expressions._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.thriftserver._ +import 
org.apache.spark.sql.Encoders //end::legacySparkHiveImports[] object HappyPandas { @@ -40,7 +40,7 @@ object HappyPandas { */ def sqlContext(sc: SparkContext): SQLContext = { //tag::createSQLContext[] - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext // Import the implicits, unlike in core Spark the implicits are defined // on the context. import sqlContext.implicits._ @@ -51,9 +51,9 @@ object HappyPandas { /** * Creates HiveContext Spark with an existing SparkContext using hive. */ - def hiveContext(sc: SparkContext): HiveContext = { + def hiveContext(sc: SparkContext): SQLContext = { //tag::createHiveContext[] - val hiveContext = new HiveContext(sc) + val hiveContext = SparkSession.builder.enableHiveSupport().getOrCreate().sqlContext // Import the implicits, unlike in core Spark the implicits are defined // on the context. import hiveContext.implicits._ @@ -75,7 +75,7 @@ object HappyPandas { //end::loadPandaJSONComplex[] val jsonRDD = sc.textFile(path) //tag::loadPandaJsonRDD[] - val df3 = session.read.json(jsonRDD) + val df3 = session.read.json(session.createDataset(jsonRDD)(Encoders.STRING)) //end::loadPandaJSONRDD[] df1 } @@ -83,7 +83,7 @@ object HappyPandas { def jsonLoadFromRDD(session: SparkSession, input: RDD[String]): DataFrame = { //tag::loadPandaJSONRDD[] val rdd: RDD[String] = input.filter(_.contains("panda")) - val df = session.read.json(rdd) + val df = session.read.json(session.createDataset(rdd)(Encoders.STRING)) //end::loadPandaJSONRDD[] df } @@ -255,7 +255,7 @@ object HappyPandas { miniPandas } - def startJDBCServer(hiveContext: HiveContext): Unit = { + def startJDBCServer(hiveContext: SQLContext): Unit = { //tag::startJDBC[] hiveContext.setConf("hive.server2.thrift.port", "9090") HiveThriftServer2.startWithContext(hiveContext) diff --git a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java index d6bec37..284397f 100644 --- a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java +++ b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java @@ -3,7 +3,7 @@ import com.highperformancespark.examples.objects.JavaPandaInfo; import com.highperformancespark.examples.objects.JavaPandas; import com.highperformancespark.examples.objects.JavaRawPanda; -import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; +//import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -16,6 +16,8 @@ import static org.junit.Assert.*; +// Temporarily disable until we upgrade to Spark 3.3 +/* public class JavaHappyPandasTest extends JavaDataFrameSuiteBase { String toronto = "toronto"; String sandiego = "san diego"; @@ -149,3 +151,4 @@ public void simpleSQLExample() { } } +*/ diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index 695dbcf..7c5dbaa 100644 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -15,10 +15,10 @@ import org.apache.spark.sql.types._ import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo import com.highperformancespark.examples.dataframe.HappyPandas.Pandas import 
com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite -import org.scalatest.Matchers._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ -class HappyPandasTest extends FunSuite with DataFrameSuiteBase { +class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { val toronto = "toronto" val sandiego = "san diego" val virginia = "virginia" diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala index 44b066c..cbd79ad 100644 --- a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala @@ -15,10 +15,10 @@ import org.apache.spark.sql.types._ import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo import com.highperformancespark.examples.dataframe.HappyPandas.Pandas import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite -import org.scalatest.Matchers._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ -class MixedDatasetSuite extends FunSuite +class MixedDatasetSuite extends AnyFunSuite with DataFrameSuiteBase with DatasetSuiteBase with RDDComparisons { diff --git a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala b/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala index c2b68f7..8a6ba09 100644 --- a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala @@ -1,9 +1,9 @@ package com.highperformancespark.examples.errors import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class ThrowsSuite extends FunSuite with SharedSparkContext { +class ThrowsSuite extends AnyFunSuite with SharedSparkContext { test("inner throw & outer throw should both throw SparkExceptions exceptions") { intercept[org.apache.spark.SparkException] { Throws.throwInner(sc) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala index 56c279e..4067fcb 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala @@ -3,9 +3,9 @@ package com.highperformancespark.examples.goldilocks import org.apache.spark.rdd.RDD import com.holdenkarau.spark.testing.SharedSparkContext -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class EvaluationTests extends FunSuite with SharedSparkContext { +class EvaluationTests extends AnyFunSuite with SharedSparkContext { val doubleList = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0) val keyValuePairs = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0).zipWithIndex val path = "target/testResults" diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala index fed7d85..2e7fea8 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala @@ -12,9 +12,10 @@ import 
org.apache.spark.sql.types.StructField import org.apache.spark.sql.types.StructType import com.holdenkarau.spark.testing.SharedSparkContext -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite +import org.apache.spark.sql.SparkSession -class GoldilocksLargeTests extends FunSuite with SharedSparkContext{ +class GoldilocksLargeTests extends AnyFunSuite with SharedSparkContext{ def testGoldilocksImplementations( @@ -52,7 +53,7 @@ class GoldilocksLargeTests extends FunSuite with SharedSparkContext{ } test("Goldilocks on local data solution "){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val testRanks = List(3L, 8L) val (smallTestData, result) = DataCreationUtils.createLocalTestData(5, 10, testRanks) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala index 50315b8..d1729f8 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala @@ -3,10 +3,10 @@ package com.highperformancespark.examples.goldilocks import org.apache.spark.rdd.RDD import com.holdenkarau.spark.testing.SharedSparkContext -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class JoinTest extends FunSuite with SharedSparkContext { +class JoinTest extends AnyFunSuite with SharedSparkContext { test("Hash join"){ val keySet = "a, b, c, d, e, f, g".split(",") val smallRDD = sc.parallelize(keySet.map(letter => (letter, letter.hashCode))) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala index c649548..ed5f9b2 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala @@ -5,11 +5,12 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.SQLContext import org.scalatest.BeforeAndAfterAll -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite +import org.apache.spark.sql.SparkSession // tag::MAGIC_PANDA[] -class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { +class QuantileOnlyArtisanalTest extends AnyFunSuite with BeforeAndAfterAll { @transient private var _sc: SparkContext = _ def sc: SparkContext = _sc @@ -34,7 +35,7 @@ class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { 3 -> Set(6.0, 7.0)) test("Goldilocks naive Solution"){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val whileLoopSolution = GoldilocksWhileLoop.findRankStatistics( input, List(2L, 3L)).mapValues(_.toSet) @@ -59,7 +60,7 @@ class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { // We don't need the rest of the tests included. 
class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { test("Goldilocks first try ") { - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondAndThird = GoldilocksFirstTry.findRankStatistics( input, targetRanks = List(2L, 3L)) @@ -115,7 +116,7 @@ class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { test("GoldiLocks With Hashmap ") { - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondAndThird = GoldilocksWithHashMap.findRankStatistics( input, targetRanks = List(2L, 3L)) @@ -130,7 +131,7 @@ class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { } test("Goldilocks Secondary Sort"){ - val sqlContext = new SQLContext(sc) + val sqlContext = SparkSession.builder.getOrCreate().sqlContext val input = sqlContext.createDataFrame(inputList) val secondarySortSolution = GoldilocksWithHashMap.findRankStatistics( diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala index 0f1af7b..2ff69cb 100644 --- a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala @@ -6,10 +6,10 @@ import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import com.holdenkarau.spark.testing.SharedSparkContext -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class SortingTests extends FunSuite with SharedSparkContext { +class SortingTests extends AnyFunSuite with SharedSparkContext { test("Test Sort by two keys"){ diff --git a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala index 2303bd9..940d223 100644 --- a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala +++ b/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -6,11 +6,11 @@ package com.highperformancespark.examples.ml import org.apache.spark.sql.Dataset import com.holdenkarau.spark.testing.DataFrameSuiteBase -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite case class TestRow(id: Int, inputColumn: String) -class CustomPipelineSuite extends FunSuite with DataFrameSuiteBase { +class CustomPipelineSuite extends AnyFunSuite with DataFrameSuiteBase { val d = List( TestRow(0, "a"), TestRow(1, "b"), diff --git a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 2fc3739..7a89310 100644 --- a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -15,12 +15,12 @@ import org.apache.spark.sql.types._ import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo import com.highperformancespark.examples.dataframe.HappyPandas.Pandas import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite -import org.scalatest.Matchers._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ case class MiniPanda(happy: Double, fuzzy: Double, old: Double) -class SimpleNaiveBayesSuite extends FunSuite with 
DataFrameSuiteBase { +class SimpleNaiveBayesSuite extends AnyFunSuite with DataFrameSuiteBase { val miniPandasList = List( MiniPanda(1.0, 1.0, 1.0), MiniPanda(1.0, 1.0, 0.0), diff --git a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala b/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala index 280d3e4..05b70e8 100644 --- a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala @@ -7,9 +7,9 @@ import org.apache.spark.mllib.linalg.{Vector => SparkVector} import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class GoldilocksMLlibSuite extends FunSuite with SharedSparkContext { +class GoldilocksMLlibSuite extends AnyFunSuite with SharedSparkContext { val rps = List( RawPanda(1L, "94110", "giant", true, Array(0.0, 0.0)), RawPanda(2L, "94110", "giant", false, Array(0.0, 3.0)), diff --git a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala index c1dc58d..0b0ed36 100644 --- a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala +++ b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -7,11 +7,11 @@ import com.holdenkarau.spark.testing._ import org.scalacheck.Arbitrary import org.scalacheck.Gen import org.scalacheck.Prop.forAll -import org.scalatest.FunSuite -import org.scalatest.Matchers._ -import org.scalatest.prop.Checkers +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ +import org.scalatestplus.scalacheck.Checkers -class NativeExampleSuite extends FunSuite +class NativeExampleSuite extends AnyFunSuite with SharedSparkContext with Checkers with RDDComparisons { test("local sum") { diff --git a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala b/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala index ded5388..aa45fe1 100644 --- a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala @@ -4,12 +4,12 @@ package com.highperformancespark.examples.ffi import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite -import org.scalatest.Matchers._ -import org.scalatest.prop.Checkers +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ +import org.scalatestplus.scalacheck.Checkers -class PipeExampleSuite extends FunSuite with SharedSparkContext with Checkers { +class PipeExampleSuite extends AnyFunSuite with SharedSparkContext with Checkers { ignore("commentors on a pr") { val rdd = sc.parallelize(List(12883)) val expected = (12883, List("SparkQA", "srowen")) diff --git a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala b/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala index 878aeee..871e2aa 100644 --- a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala @@ -10,9 +10,9 @@ import java.lang.Thread import org.apache.spark.streaming._ import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite +import 
org.scalatest.funsuite.AnyFunSuite -class DStreamExamplesSuite extends FunSuite with SharedSparkContext { +class DStreamExamplesSuite extends AnyFunSuite with SharedSparkContext { test("simple set up") { val ssc = DStreamExamples.makeStreamingContext(sc) val inputStream = DStreamExamples.fileAPIExample(ssc, "./") diff --git a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala b/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala index bc8fd84..a0afb64 100644 --- a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala @@ -8,9 +8,9 @@ import java.lang.Thread import org.apache.spark.streaming._ import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class SampleTokenizeSuite extends FunSuite with SharedSparkContext { +class SampleTokenizeSuite extends AnyFunSuite with SharedSparkContext { val input = List("hi holden", "I like coffee") val expected = List("hi", "holden", "I", "like", "coffee") diff --git a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala b/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala index b924cf4..5373705 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala @@ -5,9 +5,9 @@ package com.highperformancespark.examples.tools import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class FilterInvalidPandasSuite extends FunSuite with SharedSparkContext { +class FilterInvalidPandasSuite extends AnyFunSuite with SharedSparkContext { test("simple filter") { val invalidPandas = List(1L, 2L) val inputPandas = List( diff --git a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala b/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala index 52933ce..1d76160 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala @@ -5,9 +5,9 @@ package com.highperformancespark.examples.tools import com.highperformancespark.examples.dataframe.RawPanda import com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class GeneratescalaingDataSuite extends FunSuite with SharedSparkContext { +class GeneratescalaingDataSuite extends AnyFunSuite with SharedSparkContext { // The number of entries depends somewhat on the partition split because we // zip multiple separate RDDs so its more of a "request" test("expected num entries") { diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala index d043d38..48991e0 100644 --- a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -7,9 +7,9 @@ import scala.collection.immutable.HashSet import com.highperformancespark.examples.dataframe.RawPanda import 
com.holdenkarau.spark.testing._ -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class AccumulatorsTest extends FunSuite with SharedSparkContext { +class AccumulatorsTest extends AnyFunSuite with SharedSparkContext { test("accumulator max should function") { val input = sc.parallelize(1.to(100)).map(x => RawPanda(1L, "1", "red", true, Array(x.toDouble))) diff --git a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala index 3772a48..68eab95 100644 --- a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala +++ b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala @@ -2,9 +2,9 @@ package com.highperformancespark.examples.wordcount import com.holdenkarau.spark.testing.SharedSparkContext -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite -class WordCountTest extends FunSuite with SharedSparkContext { +class WordCountTest extends AnyFunSuite with SharedSparkContext { test("word count with Stop Words Removed"){ val wordRDD = sc.parallelize(Seq( "How happy was the panda? You ask.", From 587ef100ccf0799de706a4e612f1430fc69c3730 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 3 Dec 2022 14:01:44 -0800 Subject: [PATCH 09/53] Manually port the rest of the accumulator. --- build.sbt | 2 +- .../streaming/Structured.scala | 3 +- .../transformations/Accumulators.scala | 42 +++++++++++++------ 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/build.sbt b/build.sbt index e60437b..19d497d 100644 --- a/build.sbt +++ b/build.sbt @@ -35,7 +35,7 @@ val sparkVersion = settingKey[String]("Spark version") val sparkTestingVersion = settingKey[String]("Spark testing base version without Spark version part") // 2.4.5 is the highest version we have with the old spark-testing-base deps -sparkVersion := System.getProperty("sparkVersion", "2.4.8") +sparkVersion := System.getProperty("sparkVersion", "3.3.0") sparkTestingVersion := "1.3.0" // additional libraries diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala b/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala index f773a2e..0c50469 100644 --- a/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala +++ b/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala @@ -5,6 +5,7 @@ import scala.concurrent.duration._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.streaming._ +import org.apache.spark.sql.streaming.Trigger object Structured { @@ -21,7 +22,7 @@ object Structured { // Write out the result as parquet format("parquet"). // Specify the interval at which new data will be picked up - trigger(ProcessingTime(1.second)). + trigger(Trigger.ProcessingTime(1.second)). 
queryName("pandas").start() //end::writeComplete[] } diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala index 636fd90..f58cdbb 100644 --- a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala +++ b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -86,23 +86,39 @@ object Accumulators { //tag::uniquePandaAcc[] def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { - object UniqParam extends AccumulableParam[HashSet[Long], Long] { - override def zero(initValue: HashSet[Long]) = initValue - // For adding new values - override def addAccumulator(r: HashSet[Long], t: Long): HashSet[Long] = { - r += t - r + class UniqParam extends AccumulatorV2[Long, HashSet[Long]] { + val _values = new HashSet[Long] + override def isZero() = _values.isEmpty + + override def copy(): UniqParam = { + val nacc = new UniqParam + nacc._values ++= _values + nacc } - // For merging accumulators - override def addInPlace(r1: HashSet[Long], r2: HashSet[Long]): - HashSet[Long] = { - r1 ++ r2 + + override def reset(): Unit = { + _values.clear() + } + + override def merge(other: AccumulatorV2[Long, HashSet[Long]]): Unit = other match { + case o: UniqParam => + _values ++= o._values + case _ => + throw new UnsupportedOperationException( + s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") + } + + override def value: HashSet[Long] = _values + // For adding new values + override def add(t: Long) = { + _values += t } } // Create an accumulator with the initial value of Double.MinValue - val acc = sc.accumulable(new HashSet[Long]())(UniqParam) - val transformed = rdd.map{x => acc += x.id; (x.zip, x.id)} - // accumulator still has Double.MinValue + val acc = new UniqParam() + sc.register(acc) + val transformed = rdd.map{x => acc.add(x.id); (x.zip, x.id)} + // accumulator still has zero values transformed.count() // force evaluation acc.value } From f353b0659d78d529fe7edb74886ad9f43e8dfab0 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 6 Dec 2022 07:07:02 -0800 Subject: [PATCH 10/53] Add a sample for SQL migration. --- migration/sql.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 migration/sql.sh diff --git a/migration/sql.sh b/migration/sql.sh new file mode 100644 index 0000000..57fba45 --- /dev/null +++ b/migration/sql.sh @@ -0,0 +1,5 @@ +pip install sqlfluff +python -m pip install 'sqlfluff-plugin-sparksql-upgrade @ git+https://github.com/holdenk/spark-upgrade#subdirectory=sql' + +sqlfluff rules |grep -i spark +sqlfluff fix --dialect sparksql farts.sql From 5c99cde327969da5c8d7979b1ba467207ec226c8 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 2 Feb 2023 22:41:01 -0800 Subject: [PATCH 11/53] Support running in JDK17. 
--- build.sbt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 19d497d..67ba92b 100644 --- a/build.sbt +++ b/build.sbt @@ -29,14 +29,21 @@ parallelExecution in Test := false fork := true -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") +javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-Djna.nosys=true") + +Test / javaOptions ++= Seq( + "base/java.lang", "base/java.lang.invoke", "base/java.lang.reflect", "base/java.io", "base/java.net", "base/java.nio", + "base/java.util", "base/java.util.concurrent", "base/java.util.concurrent.atomic", + "base/sun.nio.ch", "base/sun.nio.cs", "base/sun.security.action", + "base/sun.util.calendar", "security.jgss/sun.security.krb5", + ).map("--add-opens=java." + _ + "=ALL-UNNAMED") val sparkVersion = settingKey[String]("Spark version") val sparkTestingVersion = settingKey[String]("Spark testing base version without Spark version part") // 2.4.5 is the highest version we have with the old spark-testing-base deps sparkVersion := System.getProperty("sparkVersion", "3.3.0") -sparkTestingVersion := "1.3.0" +sparkTestingVersion := "1.4.0" // additional libraries libraryDependencies ++= Seq( From 91ba8750ba72fa9da71f5d6b4b092dd6956781db Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 2 Feb 2023 22:41:28 -0800 Subject: [PATCH 12/53] Fix filter panda example --- project/build.properties | 2 +- project/plugins.sbt | 2 ++ .../tools/FilterInvalidPandas.scala | 33 ++++++++++++++++--- .../tools/FilterInvalidPandasSuite.scala | 12 +++++++ 4 files changed, 43 insertions(+), 6 deletions(-) diff --git a/project/build.properties b/project/build.properties index 8b9a0b0..46e43a9 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.8.0 +sbt.version=1.8.2 diff --git a/project/plugins.sbt b/project/plugins.sbt index aa2b0a4..8ea77fc 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -9,7 +9,9 @@ addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") addDependencyTreePlugin +//tag::scalaFix[] addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.10.4") +//end::scalaFix[] //tag::sbtJNIPlugin[] addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.5.4") diff --git a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala index bd51db2..ffc7d83 100644 --- a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala +++ b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala @@ -6,15 +6,16 @@ import org.apache.spark._ import org.apache.spark.rdd.RDD import com.highperformancespark.examples.dataframe.RawPanda -import com.typesafe.scalalogging.LazyLogging +//tag::loggerImport[] +import org.apache.logging.log4j.LogManager //end::loggerImport[] -object FilterInvalidPandas extends LazyLogging { +object FilterInvalidPandas { def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = { //tag::broadcast[] - val invalid = HashSet() ++ invalidPandas + val invalid: HashSet[Long] = HashSet() ++ invalidPandas val invalidBroadcast = sc.broadcast(invalid) input.filter{panda => !invalidBroadcast.value.contains(panda.id)} //end::broadcast[] @@ -23,11 +24,12 @@ object FilterInvalidPandas extends LazyLogging { def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = { //tag::broadcastAndLog[] - val invalid = 
HashSet() ++ invalidPandas + val invalid: HashSet[Long] = HashSet() ++ invalidPandas val invalidBroadcast = sc.broadcast(invalid) def keepPanda(pandaId: Long) = { + val logger = LogManager.getLogger("fart based logs") if (invalidBroadcast.value.contains(pandaId)) { - logger.debug(s"Invalid panda ${pandaId} discovered") + logger.debug("hi") false } else { true @@ -37,3 +39,24 @@ object FilterInvalidPandas extends LazyLogging { //end::broadcastAndLog[] } } + +//tag::broadcastAndLogClass[] +class AltLog() { + lazy val logger = LogManager.getLogger("fart based logs") + def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], + input: RDD[RawPanda]) = { + val invalid: HashSet[Long] = HashSet() ++ invalidPandas + val invalidBroadcast = sc.broadcast(invalid) + def keepPanda(pandaId: Long) = { + val logger = LogManager.getLogger("fart based logs") + if (invalidBroadcast.value.contains(pandaId)) { + logger.debug("hi") + false + } else { + true + } + } + input.filter{panda => keepPanda(panda.id)} + } +} +//end::broadcastAndLogClass[] diff --git a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala b/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala index 5373705..545b789 100644 --- a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala +++ b/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala @@ -21,4 +21,16 @@ class FilterInvalidPandasSuite extends AnyFunSuite with SharedSparkContext { assert(result1.collect() === result2.collect()) assert(result1.count() === 1) } + + test("alt log") { + val invalidPandas = List(1L, 2L) + val inputPandas = List( + RawPanda(1L, "94110", "giant", true, Array(0.0)), + RawPanda(3L, "94110", "giant", true, Array(0.0))) + val input = sc.parallelize(inputPandas) + val al = new AltLog() + val result1 = + al.filterInvalidPandasWithLogs(sc, invalidPandas, input) + assert(result1.count() === 1) + } } From 6e76796c4fa70b4c629448a2acea199e627ed78c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 2 Feb 2023 23:11:29 -0800 Subject: [PATCH 13/53] Test JDK17 --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89ad9bb..7f31464 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,6 +8,7 @@ jobs: fail-fast: false matrix: include: + - java: 17 - java: 11 - java: 8 runs-on: ubuntu-latest From baf3ee7482629094ea3e26c539b3eddbc03c646a Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 2 Feb 2023 23:19:10 -0800 Subject: [PATCH 14/53] Try and add back JRE8 support. --- build.sbt | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/build.sbt b/build.sbt index 67ba92b..6d478bc 100644 --- a/build.sbt +++ b/build.sbt @@ -31,12 +31,21 @@ fork := true javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-Djna.nosys=true") -Test / javaOptions ++= Seq( - "base/java.lang", "base/java.lang.invoke", "base/java.lang.reflect", "base/java.io", "base/java.net", "base/java.nio", - "base/java.util", "base/java.util.concurrent", "base/java.util.concurrent.atomic", - "base/sun.nio.ch", "base/sun.nio.cs", "base/sun.security.action", - "base/sun.util.calendar", "security.jgss/sun.security.krb5", - ).map("--add-opens=java." 
+ _ + "=ALL-UNNAMED") +def specialOptions = { + // We only need these extra props for JRE>17 + if (sys.props("java.specification.version") > "1.17") { + Seq( + "base/java.lang", "base/java.lang.invoke", "base/java.lang.reflect", "base/java.io", "base/java.net", "base/java.nio", + "base/java.util", "base/java.util.concurrent", "base/java.util.concurrent.atomic", + "base/sun.nio.ch", "base/sun.nio.cs", "base/sun.security.action", + "base/sun.util.calendar", "security.jgss/sun.security.krb5", + ).map("--add-opens=java." + _ + "=ALL-UNNAMED") + } else { + Seq() + } +} + +Test / javaOptions ++= specialOptions val sparkVersion = settingKey[String]("Spark version") val sparkTestingVersion = settingKey[String]("Spark testing base version without Spark version part") From a72fb4d8153ef2f42c4b743f851820126934402a Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 2 Feb 2023 23:25:19 -0800 Subject: [PATCH 15/53] try packaging? idk why we aren't finding the so --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f31464..beaff8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,4 +29,4 @@ jobs: java-version: ${{ matrix.java }} cache: sbt - name: Build and Test - run: sbt clean +test + run: sbt clean package +test From 53fd99f91bfe1513a111c1df983650d6e668a3db Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 28 Mar 2023 19:28:37 -0700 Subject: [PATCH 16/53] Add our first Python example (for dual writes). --- .github/workflows/ci.yml | 4 +- .gitignore | 24 +++++++++- python/README.md | 1 + python/examples/__init__.py | 1 + python/examples/dual_write.py | 22 ++++++++++ python/examples/test_dual_write.py | 23 ++++++++++ python/pyproject.toml | 7 +++ python/requirements.txt | 5 +++ python/setup.cfg | 39 +++++++++++++++++ python/tox.ini | 70 ++++++++++++++++++++++++++++++ 10 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 python/README.md create mode 100644 python/examples/__init__.py create mode 100644 python/examples/dual_write.py create mode 100644 python/examples/test_dual_write.py create mode 100644 python/pyproject.toml create mode 100644 python/requirements.txt create mode 100644 python/setup.cfg create mode 100644 python/tox.ini diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index beaff8f..9c4fcb5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,5 +28,7 @@ jobs: distribution: temurin java-version: ${{ matrix.java }} cache: sbt - - name: Build and Test + - name: Scala Build and Test run: sbt clean package +test + - name: Python Build and Test + run: cd python; tox diff --git a/.gitignore b/.gitignore index 4a8e38c..b48ebd7 100644 --- a/.gitignore +++ b/.gitignore @@ -25,9 +25,31 @@ sbt/*launch*.jar # python *.pyc +.tox +.bsp + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST # native *.o *.so *.so.0.0.0 -*.so.0 \ No newline at end of file +*.so.0 diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..3cf5830 --- /dev/null +++ b/python/README.md @@ -0,0 +1 @@ +Python examples for High Performance Spark diff --git a/python/examples/__init__.py b/python/examples/__init__.py new file mode 100644 index 0000000..80db2c4 --- /dev/null +++ b/python/examples/__init__.py @@ -0,0 +1 @@ +__version__ = 0.2 diff --git 
a/python/examples/dual_write.py b/python/examples/dual_write.py new file mode 100644 index 0000000..94f2715 --- /dev/null +++ b/python/examples/dual_write.py @@ -0,0 +1,22 @@ +import asyncactions # noqa # pylint: disable=unused-import + + +class DualWriteExample: + def do_write(self, df, p1, p2): + """ + Apply two concrete actions to a DataFrame in parallel. + A common use case is two views of the same data, normally + one with sensitive data and one scrubbed/clean. + """ + # First we "persist" it (you can also checkpoint or choose a different + # level of persistence. + df.persist() + df.count() + # Create the distinct "safe" view. + df1 = df.select("times") + # Start the async actions + async1 = df1.write.mode("append").format("parquet").saveAsync(p1) + async2 = df.write.mode("append").format("parquet").saveAsync(p2) + # Block until the writes are both finished. + async1.result() + async2.result() diff --git a/python/examples/test_dual_write.py b/python/examples/test_dual_write.py new file mode 100644 index 0000000..d85e7f9 --- /dev/null +++ b/python/examples/test_dual_write.py @@ -0,0 +1,23 @@ +import os +import tempfile + +from sparktestingbase.sqltestcase import SQLTestCase +from pyspark.sql.functions import current_timestamp +from pyspark.sql.types import Row +from .dual_write import DualWriteExample + + +class DualWriteTest(SQLTestCase): + def test_always_passes(self): + self.assertTrue(True) + + def test_actual_dual_write(self): + tempdir = tempfile.mkdtemp() + p1 = os.path.join(tempdir, "data1") + p2 = os.path.join(tempdir, "data2") + df = self.sqlCtx.createDataFrame([Row("timbit"), Row("farted")], ["names"]) + combined = df.withColumn("times", current_timestamp()) + DualWriteExample().do_write(combined, p1, p2) + df1 = self.sqlCtx.read.format("parquet").load(p1) + df2 = self.sqlCtx.read.format("parquet").load(p2) + self.assertDataFrameEqual(df2.select("times"), df1, 0.1) diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000..38b1184 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,7 @@ +[build-system] +requires = ["setuptools >= 58.0"] +build-backend = "setuptools.build_meta" + +[[tool.mypy.overrides]] +module = "examples" +ignore_missing_imports = true diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000..2947477 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,5 @@ +spark-testing-base +pandas +pyarrow +pyspark +pyspark-asyncactions diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 0000000..64c8931 --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,39 @@ +[metadata] +name = examples +version = attr: examples.__version__ +author = Holden and Anya +author_email = your@email.address +url = https://github.com/high-performance-spark/high-performance-spark-examples +description = Python Examples for High Performance Spark +long_description = file: README.md +long_description_content_type = text/markdown +keywords = example, setuptools, pyspark +license = BSD 3-Clause License +classifiers = + License :: OSI Approved :: BSD License + Programming Language :: Python :: 3 + +[options] +packages = find: +zip_safe = True +include_package_data = True +install_requires = + pandas >= 1.4.1 + PyYAML >= 6.0 + typer + mypy + pyspark + pyspark-asyncactions + + +[options.entry_points] +console_scripts = + my-example-utility = example.example_module:main + +[options.extras_require] +dev = + black>=22.1.0 + flake8>=4.0.1 + +[options.package_data] +* = README.md \ No 
newline at end of file diff --git a/python/tox.ini b/python/tox.ini new file mode 100644 index 0000000..e4da796 --- /dev/null +++ b/python/tox.ini @@ -0,0 +1,70 @@ +[tox] +passenv = * +isolated_build = True +requires = tox-conda +envlist = + isort + py39 + black + mypy + flake8 + +skip_missing_interpeters = true + +[gh-actions] +python = + 3.9: py39 +# We need a new version of PySpark w/3.10 support. +# 3.10: py310 + +[testenv] +setenv = + DJANGO_SETTINGS_MODULE=fighthealthinsurance.settings + PYTHONPATH={toxinidir} + DJANGO_CONFIGURATION=Dev +passenv = * +extras = + tests + coverage +deps = + pytest + isort==4.3.21 + pyspark + flake8 + spark-testing-base + -rrequirements.txt +commands = + pytest examples \ + {posargs} +allowlist_externals = pytest + +[testenv:isort] +extras = tests +skipsdist = True +commands = isort --check-only --diff examples +allowlist_externals = isort + +[testenv:black] +extras = tests +skipsdist = True +commands = black --check examples +allowlist_externals = black + +[testenv:flake8] +extras = tests +skipsdist = True +commands = flake8 examples +allowlist_externals = flake8 + +[testenv:mypy] +extras = tests +passenv = * +deps = + pytest + -rrequirements.txt +setenv = + {[testenv]setenv} + MYPYPATH={toxinidir} +commands = + mypy -m examples +allowlist_externals = mypy \ No newline at end of file From e6e357ba46f7e772da237eb1ee00f64868440085 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 28 Mar 2023 19:30:15 -0700 Subject: [PATCH 17/53] Fix style --- python/.flake8 | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 python/.flake8 diff --git a/python/.flake8 b/python/.flake8 new file mode 100644 index 0000000..79a16af --- /dev/null +++ b/python/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 \ No newline at end of file From 8ced69262bbc32bbbc943b1433289a2e041df74d Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 28 Mar 2023 19:36:09 -0700 Subject: [PATCH 18/53] Add py action. --- .github/workflows/ci.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9c4fcb5..7aa3603 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,5 +30,17 @@ jobs: cache: sbt - name: Scala Build and Test run: sbt clean package +test - - name: Python Build and Test - run: cd python; tox + python-test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Run tox + run: | + cd python; tox From 7ba42b209b2ba6ab282f13b902bef181dab4e6ec Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 26 Aug 2023 18:03:37 -0700 Subject: [PATCH 19/53] Get the JNI examples working with modern versions of the tools. * Try and debug native builds some more * Slightly adjust CMakeLists.txt and build.sbt * Enable fortran because life is sad. * JDK11+ seems reasonable for "high performance" Fix the library we are loading. 
--------- Co-authored-by: Grigory Pomadchin --- .github/workflows/ci.yml | 1 - .gitignore | 10 ++ build.sbt | 107 +++++++++--------- .../examples/JavaInterop.java | 0 .../examples/WordCount.java | 0 .../examples/dataframe/JavaHappyPandas.java | 0 .../examples/dataframe/JavaLoadSave.java | 0 .../examples/dataframe/JavaUDFs.java | 0 .../examples/ffi/SumJNIJava.java | 0 .../examples/objects/JavaCoffeeShop.java | 0 .../examples/objects/JavaPandaInfo.java | 0 .../examples/objects/JavaPandaPlace.java | 0 .../examples/objects/JavaPandas.java | 0 .../examples/objects/JavaRawPanda.java | 0 {src => core/src}/main/julia/setup.jl | 0 {src => core/src}/main/julia/wc.jl | 0 {src => core/src}/main/perl/Changes | 0 {src => core/src}/main/perl/MANIFEST | 0 {src => core/src}/main/perl/Makefile.PL | 0 {src => core/src}/main/perl/README | 0 {src => core/src}/main/perl/ghinfo.pl | 0 {src => core/src}/main/perl/ignore.txt | 0 .../perl/lib/HighPerformanceSpark/Examples.pm | 0 {src => core/src}/main/perl/t/00-load.t | 0 {src => core/src}/main/perl/t/manifest.t | 0 {src => core/src}/main/perl/t/pod-coverage.t | 0 {src => core/src}/main/perl/t/pod.t | 0 {src => core/src}/main/perl/xt/boilerplate.t | 0 {src => core/src}/main/r/dapply.R | 0 {src => core/src}/main/r/wc.R | 0 .../dataframe/HappyPandas.scala | 0 .../dataframe/LoadSave.scala | 0 .../dataframe/MixedDataset.scala | 0 .../dataframe/MixedDataset.scala_back | 0 .../dataframe/RawPandas.scala | 0 .../dataframe/RegularSQL.scala | 0 .../dataframe/UDFs.scala | 0 .../errors/throws.scala | 0 .../goldilocks/GoldilocksFirstTry.scala | 0 .../goldilocks/GoldilocksSecondarySort.scala | 0 .../goldilocks/GoldilocksWithHashMap.scala | 0 .../goldilocks/RDDJoinExamples.scala | 0 .../goldilocks/SecondarySort.scala | 0 .../ml/CustomPipeline.scala | 0 .../ml/SimpleExport.scala | 0 .../ml/SimpleNaiveBayes.scala | 0 .../ml/SimplePipeline.scala | 0 .../mllib/GoldilocksMLlib.scala | 0 .../native/NativeExample.scala | 0 .../native/PipeExample.scala | 0 .../native/StandAlone.scala | 0 .../native/SumFJNA.scala | 0 .../native/SumJNA.scala | 0 .../native/SumJNI.scala | 0 .../perf/SimplePerfTest.scala | 0 .../streaming/DStream.scala | 0 .../streaming/Structured.scala | 0 .../tokenize/SampleTokenize.scala | 0 .../tools/FilterInvalidPandas.scala | 0 .../tools/GenerateScalingData.scala | 0 .../tools/SampleData.scala | 0 .../transformations/Accumulators.scala | 0 .../transformations/NarrowAndWide.scala | 0 .../transformations/NewAccumulators.scala | 0 .../transformations/SmartAggregations.scala | 0 .../wordcount/WordCount.scala | 0 .../examples/JavaInteropTest.java | 0 .../dataframe/JavaHappyPandasTest.java | 0 .../dataframe/HappyPandasTest.scala | 0 .../dataframe/MixedDatasetSuite.scala | 0 .../errors/ThrowsSuite.scala | 0 .../goldilocks/EvaluationTests.scala | 0 .../goldilocks/GoldilocksLargeTests.scala | 0 .../goldilocks/JoinTest.scala | 0 .../QuantileOnlyArtisanalTest.scala | 0 .../goldilocks/SortingTests.scala | 0 .../ml/CustomPipeline.scala | 0 .../ml/SimpleNaiveBayes.scala | 0 .../mllib/GoldilocksMLlibSuite.scala | 0 .../native/NativeExample.scala | 0 .../native/PipeExampleSuite.scala | 0 .../streaming/DStreamSuite.scala | 0 .../tokenize/SampleTokenizeSuite.scala | 0 .../tools/FilterInvalidPandasSuite.scala | 0 .../tools/GenerateScalingDataSuite.scala | 0 .../transformations/Accumulators.scala | 0 .../wordcount/WordCountTest.scala | 0 .../examples/JavaInteropHelper.scala | 0 native/src/CMakeLists.txt | 50 ++++++++ ...highperformancespark_examples_ffi_SumJNI.h | 0 {src/main => 
native/src}/c/sum.c | 0 {src/main => native/src}/c/sum.h | 0 {src/main => native/src}/c/sum_wrapper.c | 0 {src/main => native/src}/c/sumf_wrapper.c | 0 {src/main => native/src}/fortran/sumf.f95 | 0 src/CMakeLists.txt | 74 ------------ 96 files changed, 115 insertions(+), 127 deletions(-) rename {src => core/src}/main/java/com/highperformancespark/examples/JavaInterop.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/WordCount.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/objects/JavaPandas.java (100%) rename {src => core/src}/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java (100%) rename {src => core/src}/main/julia/setup.jl (100%) rename {src => core/src}/main/julia/wc.jl (100%) rename {src => core/src}/main/perl/Changes (100%) rename {src => core/src}/main/perl/MANIFEST (100%) rename {src => core/src}/main/perl/Makefile.PL (100%) rename {src => core/src}/main/perl/README (100%) rename {src => core/src}/main/perl/ghinfo.pl (100%) rename {src => core/src}/main/perl/ignore.txt (100%) rename {src => core/src}/main/perl/lib/HighPerformanceSpark/Examples.pm (100%) rename {src => core/src}/main/perl/t/00-load.t (100%) rename {src => core/src}/main/perl/t/manifest.t (100%) rename {src => core/src}/main/perl/t/pod-coverage.t (100%) rename {src => core/src}/main/perl/t/pod.t (100%) rename {src => core/src}/main/perl/xt/boilerplate.t (100%) rename {src => core/src}/main/r/dapply.R (100%) rename {src => core/src}/main/r/wc.R (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/errors/throws.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala 
(100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/native/NativeExample.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/native/PipeExample.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/native/StandAlone.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/native/SumJNA.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/native/SumJNI.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/streaming/DStream.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/streaming/Structured.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/tools/SampleData.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala (100%) rename {src => core/src}/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala (100%) rename {src => core/src}/test/java/com/highperformancespark/examples/JavaInteropTest.java (100%) rename {src => core/src}/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala 
(100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/native/NativeExample.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala (100%) rename {src => core/src}/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala (100%) rename {src => core/src}/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala (100%) create mode 100644 native/src/CMakeLists.txt rename {src/main => native/src}/c/include/com_highperformancespark_examples_ffi_SumJNI.h (100%) rename {src/main => native/src}/c/sum.c (100%) rename {src/main => native/src}/c/sum.h (100%) rename {src/main => native/src}/c/sum_wrapper.c (100%) rename {src/main => native/src}/c/sumf_wrapper.c (100%) rename {src/main => native/src}/fortran/sumf.f95 (100%) delete mode 100644 src/CMakeLists.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7aa3603..bb9113a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,6 @@ jobs: include: - java: 17 - java: 11 - - java: 8 runs-on: ubuntu-latest steps: - name: Checkout diff --git a/.gitignore b/.gitignore index b48ebd7..8b2b139 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ lib_managed/ src_managed/ project/boot/ project/plugins/project/ +.bsp # Scala-IDE specific .scala_dependencies @@ -23,6 +24,15 @@ project/plugins/project/ *~ sbt/*launch*.jar +# VSCode specific +.vscode +.history + +# Metals +.metals +.bloop +metals.sbt + # python *.pyc .tox diff --git a/build.sbt b/build.sbt index 6d478bc..671da8b 100644 --- a/build.sbt +++ b/build.sbt @@ -1,3 +1,7 @@ +lazy val root = (project in file(".")) + .aggregate(core, native) + + organization := "com.highperformancespark" //tag::addSparkScalaFix[] @@ -22,14 +26,20 @@ name := "examples" publishMavenStyle := true version := "0.0.1" +resolvers ++= Seq( + "JBoss Repository" at "https://repository.jboss.org/nexus/content/repositories/releases/", + "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", + "Apache HBase" at "https://repository.apache.org/content/repositories/releases", + "Twitter Maven Repo" at "https://maven.twttr.com/", + "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", + "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", + "Typesafe repository" at "https://repo.typesafe.com/typesafe/releases/", + "Second Typesafe repo" at "https://repo.typesafe.com/typesafe/maven-releases/", + "Mesosphere Public 
Repository" at "https://downloads.mesosphere.io/maven", + Resolver.sonatypeRepo("public") +) -javacOptions ++= Seq("-source", "1.8", "-target", "1.8") - -parallelExecution in Test := false - -fork := true - -javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-Djna.nosys=true") +licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) def specialOptions = { // We only need these extra props for JRE>17 @@ -45,56 +55,49 @@ def specialOptions = { } } -Test / javaOptions ++= specialOptions val sparkVersion = settingKey[String]("Spark version") val sparkTestingVersion = settingKey[String]("Spark testing base version without Spark version part") -// 2.4.5 is the highest version we have with the old spark-testing-base deps -sparkVersion := System.getProperty("sparkVersion", "3.3.0") -sparkTestingVersion := "1.4.0" - -// additional libraries -libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-core" % sparkVersion.value, - "org.apache.spark" %% "spark-streaming" % sparkVersion.value, - "org.apache.spark" %% "spark-sql" % sparkVersion.value, - "org.apache.spark" %% "spark-hive" % sparkVersion.value, - "org.apache.spark" %% "spark-hive-thriftserver" % sparkVersion.value, - "org.apache.spark" %% "spark-catalyst" % sparkVersion.value, - "org.apache.spark" %% "spark-yarn" % sparkVersion.value, - "org.apache.spark" %% "spark-mllib" % sparkVersion.value, - "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_${sparkTestingVersion.value}", - //tag::scalaLogging[] - "com.typesafe.scala-logging" %% "scala-logging" % "3.9.4", - //end::scalaLogging[] - "net.java.dev.jna" % "jna" % "5.12.1") - - -scalacOptions ++= Seq("-deprecation", "-unchecked") - -pomIncludeRepository := { x => false } - -resolvers ++= Seq( - "JBoss Repository" at "https://repository.jboss.org/nexus/content/repositories/releases/", - "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", - "Apache HBase" at "https://repository.apache.org/content/repositories/releases", - "Twitter Maven Repo" at "https://maven.twttr.com/", - "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", - "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", - "Typesafe repository" at "https://repo.typesafe.com/typesafe/releases/", - "Second Typesafe repo" at "https://repo.typesafe.com/typesafe/maven-releases/", - "Mesosphere Public Repository" at "https://downloads.mesosphere.io/maven", - Resolver.sonatypeRepo("public") -) - -licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) - -// JNI - -enablePlugins(JniNative) -sourceDirectory in nativeCompile := sourceDirectory.value +// Core (non-JNI bits) + +lazy val core = (project in file("core")) // regular scala code with @native methods + .dependsOn(native % Runtime) + .settings(javah / target := (native / nativeCompile / sourceDirectory).value / "include") + .settings(sbtJniCoreScope := Compile) + .settings( + javacOptions ++= Seq("-source", "1.8", "-target", "1.8"), + parallelExecution in Test := false, + fork := true, + javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-Djna.nosys=true"), + Test / javaOptions ++= specialOptions, + // 2.4.5 is the highest version we have with the old spark-testing-base deps + sparkVersion := System.getProperty("sparkVersion", "3.3.0"), + sparkTestingVersion := "1.4.0", + // additional libraries + libraryDependencies ++= Seq( + "org.apache.spark" %% "spark-core" % sparkVersion.value, + "org.apache.spark" %% 
"spark-streaming" % sparkVersion.value, + "org.apache.spark" %% "spark-sql" % sparkVersion.value, + "org.apache.spark" %% "spark-hive" % sparkVersion.value, + "org.apache.spark" %% "spark-hive-thriftserver" % sparkVersion.value, + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value, + "org.apache.spark" %% "spark-yarn" % sparkVersion.value, + "org.apache.spark" %% "spark-mllib" % sparkVersion.value, + "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_${sparkTestingVersion.value}", + //tag::scalaLogging[] + "com.typesafe.scala-logging" %% "scala-logging" % "3.9.4", + //end::scalaLogging[] + "net.java.dev.jna" % "jna" % "5.12.1"), + scalacOptions ++= Seq("-deprecation", "-unchecked"), + pomIncludeRepository := { x => false }, + ) + +// JNI Magic! +lazy val native = (project in file("native")) // native code and build script + .settings(nativeCompile / sourceDirectory := sourceDirectory.value) + .enablePlugins(JniNative) // JniNative needs to be explicitly enabled //tag::xmlVersionConflict[] // See https://github.com/scala/bug/issues/12632 diff --git a/src/main/java/com/highperformancespark/examples/JavaInterop.java b/core/src/main/java/com/highperformancespark/examples/JavaInterop.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/JavaInterop.java rename to core/src/main/java/com/highperformancespark/examples/JavaInterop.java diff --git a/src/main/java/com/highperformancespark/examples/WordCount.java b/core/src/main/java/com/highperformancespark/examples/WordCount.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/WordCount.java rename to core/src/main/java/com/highperformancespark/examples/WordCount.java diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java rename to core/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java diff --git a/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java b/core/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java rename to core/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java rename to 
core/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java rename to core/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java rename to core/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaPandas.java rename to core/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java b/core/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java similarity index 100% rename from src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java rename to core/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java diff --git a/src/main/julia/setup.jl b/core/src/main/julia/setup.jl similarity index 100% rename from src/main/julia/setup.jl rename to core/src/main/julia/setup.jl diff --git a/src/main/julia/wc.jl b/core/src/main/julia/wc.jl similarity index 100% rename from src/main/julia/wc.jl rename to core/src/main/julia/wc.jl diff --git a/src/main/perl/Changes b/core/src/main/perl/Changes similarity index 100% rename from src/main/perl/Changes rename to core/src/main/perl/Changes diff --git a/src/main/perl/MANIFEST b/core/src/main/perl/MANIFEST similarity index 100% rename from src/main/perl/MANIFEST rename to core/src/main/perl/MANIFEST diff --git a/src/main/perl/Makefile.PL b/core/src/main/perl/Makefile.PL similarity index 100% rename from src/main/perl/Makefile.PL rename to core/src/main/perl/Makefile.PL diff --git a/src/main/perl/README b/core/src/main/perl/README similarity index 100% rename from src/main/perl/README rename to core/src/main/perl/README diff --git a/src/main/perl/ghinfo.pl b/core/src/main/perl/ghinfo.pl similarity index 100% rename from src/main/perl/ghinfo.pl rename to core/src/main/perl/ghinfo.pl diff --git a/src/main/perl/ignore.txt b/core/src/main/perl/ignore.txt similarity index 100% rename from src/main/perl/ignore.txt rename to core/src/main/perl/ignore.txt diff --git a/src/main/perl/lib/HighPerformanceSpark/Examples.pm b/core/src/main/perl/lib/HighPerformanceSpark/Examples.pm similarity index 100% rename from src/main/perl/lib/HighPerformanceSpark/Examples.pm rename to core/src/main/perl/lib/HighPerformanceSpark/Examples.pm diff --git a/src/main/perl/t/00-load.t b/core/src/main/perl/t/00-load.t similarity index 100% rename from src/main/perl/t/00-load.t rename to core/src/main/perl/t/00-load.t diff --git a/src/main/perl/t/manifest.t b/core/src/main/perl/t/manifest.t similarity index 100% rename from src/main/perl/t/manifest.t rename to core/src/main/perl/t/manifest.t diff --git a/src/main/perl/t/pod-coverage.t b/core/src/main/perl/t/pod-coverage.t 
similarity index 100% rename from src/main/perl/t/pod-coverage.t rename to core/src/main/perl/t/pod-coverage.t diff --git a/src/main/perl/t/pod.t b/core/src/main/perl/t/pod.t similarity index 100% rename from src/main/perl/t/pod.t rename to core/src/main/perl/t/pod.t diff --git a/src/main/perl/xt/boilerplate.t b/core/src/main/perl/xt/boilerplate.t similarity index 100% rename from src/main/perl/xt/boilerplate.t rename to core/src/main/perl/xt/boilerplate.t diff --git a/src/main/r/dapply.R b/core/src/main/r/dapply.R similarity index 100% rename from src/main/r/dapply.R rename to core/src/main/r/dapply.R diff --git a/src/main/r/wc.R b/core/src/main/r/wc.R similarity index 100% rename from src/main/r/wc.R rename to core/src/main/r/wc.R diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala rename to core/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala diff --git a/src/main/scala/com/high-performance-spark-examples/errors/throws.scala 
b/core/src/main/scala/com/high-performance-spark-examples/errors/throws.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/errors/throws.scala rename to core/src/main/scala/com/high-performance-spark-examples/errors/throws.scala diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala rename to core/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala diff --git a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala 
b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala rename to core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala diff --git a/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala rename to core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala b/core/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala b/core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala b/core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala b/core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala rename to core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala diff --git a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala rename to core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala b/core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala rename to 
core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala b/core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala rename to core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala diff --git a/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala b/core/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala rename to core/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala diff --git a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala rename to core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala diff --git a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala rename to core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala diff --git a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala rename to core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala b/core/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala similarity index 100% rename from 
src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala rename to core/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala diff --git a/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala b/core/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala similarity index 100% rename from src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala rename to core/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala diff --git a/src/test/java/com/highperformancespark/examples/JavaInteropTest.java b/core/src/test/java/com/highperformancespark/examples/JavaInteropTest.java similarity index 100% rename from src/test/java/com/highperformancespark/examples/JavaInteropTest.java rename to core/src/test/java/com/highperformancespark/examples/JavaInteropTest.java diff --git a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java b/core/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java similarity index 100% rename from src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java rename to core/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala diff --git a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala similarity index 100% rename from 
src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala rename to core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala diff --git a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala rename to core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala diff --git a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala rename to core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala diff --git a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala diff --git a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala b/core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala rename to core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala diff --git a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala diff --git a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala diff --git a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala similarity index 100% rename from 
src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala diff --git a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala diff --git a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala b/core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala rename to core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala rename to core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala diff --git a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala similarity index 100% rename from src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala rename to core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala diff --git a/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala b/core/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala similarity index 100% rename from src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala rename to core/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala diff --git a/native/src/CMakeLists.txt b/native/src/CMakeLists.txt new file mode 100644 index 0000000..04acf78 --- /dev/null +++ b/native/src/CMakeLists.txt @@ -0,0 +1,50 @@ +################################################################ +# A minimal CMake file that is compatible with sbt-jni # +# # +# All settings required by sbt-jni have been marked so, please # +# add/modify/remove settings to build your specific library. # +################################################################ + +cmake_minimum_required(VERSION 3.12) + +option(SBT "Set if invoked from sbt-jni" OFF) + +# Define project and related variables +# (required by sbt-jni) please use semantic versioning +# +project (high-performance-spark) +enable_language(Fortran) +set(PROJECT_VERSION_MAJOR 0) +set(PROJECT_VERSION_MINOR 0) +set(PROJECT_VERSION_PATCH 0) + +# Setup JNI +find_package(JNI REQUIRED) +if (JNI_FOUND) + message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") +endif() + +# Include directories +include_directories(.) 
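+# include/ holds the javah-generated JNI headers (build.sbt points the sbt-jni javah target here)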
+include_directories(include) +include_directories(${JNI_INCLUDE_DIRS}) + +# Sources +file(GLOB LIB_SRC + "*.c" + "*.f95" + "*.f*" + "*.cc" + "*.cpp" + "./c/*.c" + "./c/*.cpp" + "./fortran/*.f95" + "./fortran/*.f*" +) + +# Setup installation targets +# (required by sbt-jni) major version should always be appended to library name +# +set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) +add_library(${LIB_NAME} SHARED ${LIB_SRC}) +install(TARGETS ${LIB_NAME} LIBRARY DESTINATION .) diff --git a/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h b/native/src/c/include/com_highperformancespark_examples_ffi_SumJNI.h similarity index 100% rename from src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h rename to native/src/c/include/com_highperformancespark_examples_ffi_SumJNI.h diff --git a/src/main/c/sum.c b/native/src/c/sum.c similarity index 100% rename from src/main/c/sum.c rename to native/src/c/sum.c diff --git a/src/main/c/sum.h b/native/src/c/sum.h similarity index 100% rename from src/main/c/sum.h rename to native/src/c/sum.h diff --git a/src/main/c/sum_wrapper.c b/native/src/c/sum_wrapper.c similarity index 100% rename from src/main/c/sum_wrapper.c rename to native/src/c/sum_wrapper.c diff --git a/src/main/c/sumf_wrapper.c b/native/src/c/sumf_wrapper.c similarity index 100% rename from src/main/c/sumf_wrapper.c rename to native/src/c/sumf_wrapper.c diff --git a/src/main/fortran/sumf.f95 b/native/src/fortran/sumf.f95 similarity index 100% rename from src/main/fortran/sumf.f95 rename to native/src/fortran/sumf.f95 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt deleted file mode 100644 index e88b326..0000000 --- a/src/CMakeLists.txt +++ /dev/null @@ -1,74 +0,0 @@ -################################################################ -# A minimal CMake file that is compatible with sbt-jni # -# # -# All settings required by sbt-jni have been marked so, please # -# add/modify/remove settings to build your specific library. # -################################################################ - -cmake_minimum_required(VERSION 2.6) - -# Define project and related variables -# -project (high-performance-spark) - -# Enable fortan -enable_language (Fortran) -include(FortranCInterface) - - -# FFLAGS depend on the compiler -get_filename_component (Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME) - - -# Set versions and library name -# (required by sbt-jni) please use semantic versioning -# -set (VERSION_MAJOR 0) -set (VERSION_MINOR 0) -set (VERSION_PATCH 0) -# (required by sbt-jni) major version will always be appended to library name -set (LIB_NAME ${CMAKE_PROJECT_NAME}${VERSION_MAJOR}) - -# Command-line options -# -# (set by sbt-jni) -set (LIB_INSTALL_DIR lib CACHE PATH "Path in which to install libraries (equivalent to Autoconf --libdir).") -# (set by sbt-jni) -set (LIB_ENABLE_MINOR_VERSIONS ON CACHE BOOLEAN "Build libraries with minor and patch versions appended.") - -# Setup JNI -find_package(JNI REQUIRED) -if (JNI_FOUND) - message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") -endif() - -# Include directories -include_directories(.) -include_directories(./main/c) -include_directories(./main/c/include) -include_directories(${JNI_INCLUDE_DIRS}) - -# Setup main shared library -file(GLOB LIB_SRC - "*.c" - "*.cpp" - "./main/c/*.c" - "./main/c/*.cpp" - "./main/fortran/*.f*" -) -add_library(${LIB_NAME} SHARED ${LIB_SRC}) - -# By default, in a regular build, minor and patch versions are added to the generated files. 
-# When built through sbt-jni however, LIB_ENABLE_MINOR_VERSIONS is deactivated and only a -# major-versioned library file is built. -if (LIB_ENABLE_MINOR_VERSIONS) - set_target_properties( - ${LIB_NAME} - PROPERTIES - VERSION 0.${VERSION_MINOR}.${VERSION_PATCH} # major version always 0, it is included in library name - SOVERSION 0 - ) -endif() - -# Installation targets -install(TARGETS ${LIB_NAME} LIBRARY DESTINATION ${LIB_INSTALL_DIR}) From 3c48c54171b3cd8c9c8e4f59325eb817babfc804 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 28 Aug 2023 11:02:28 -0700 Subject: [PATCH 20/53] Fix Python CI errors (#109) * Run black for Python formatting. * ooh some Python 2 code was still around... my bad. * Add missing deps for python tox --- high_performance_pyspark/SQLLineage.py | 34 +++++++++----- high_performance_pyspark/__init__.py | 1 - high_performance_pyspark/bad_pyspark.py | 36 +++++++++++---- high_performance_pyspark/simple_perf.py | 59 +++++++++++++++++-------- python/tox.ini | 3 ++ 5 files changed, 93 insertions(+), 40 deletions(-) diff --git a/high_performance_pyspark/SQLLineage.py b/high_performance_pyspark/SQLLineage.py index 121f0b4..c9cedf2 100644 --- a/high_performance_pyspark/SQLLineage.py +++ b/high_performance_pyspark/SQLLineage.py @@ -20,7 +20,7 @@ def cutLineage(df): """ Cut the lineage of a DataFrame - used for iterative algorithms - + .. Note: This uses internal members and may break between versions >>> df = rdd.toDF() >>> cutDf = cutLineage(df) @@ -38,35 +38,45 @@ def cutLineage(df): newJavaDF = javaSqlCtx.createDataFrame(jRDD, jSchema) newDF = DataFrame(newJavaDF, sqlCtx) return newDF + + # end::cutLineage[] + def _setupTest(): globs = globals() - spark = SparkSession.builder \ - .master("local[4]") \ - .getOrCreate() + spark = SparkSession.builder.master("local[4]").getOrCreate() sc = spark._sc sc.setLogLevel("ERROR") - globs['sc'] = sc - globs['spark'] = spark - globs['rdd'] = rdd = sc.parallelize( - [Row(field1=1, field2="row1"), - Row(field1=2, field2="row2"), - Row(field1=3, field2="row3")]) + globs["sc"] = sc + globs["spark"] = spark + globs["rdd"] = rdd = sc.parallelize( + [ + Row(field1=1, field2="row1"), + Row(field1=2, field2="row2"), + Row(field1=3, field2="row3"), + ] + ) return globs + def _test(): """ Run the tests. 
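+    Doctests run against the local[4] SparkSession created in _setupTest().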
""" import doctest + globs = _setupTest() - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS + ) + globs["sc"].stop() if failure_count: exit(-1) + import sys + if __name__ == "__main__": _test() # Hack to support running in nose diff --git a/high_performance_pyspark/__init__.py b/high_performance_pyspark/__init__.py index 7741593..3f79c0d 100644 --- a/high_performance_pyspark/__init__.py +++ b/high_performance_pyspark/__init__.py @@ -22,4 +22,3 @@ import os import sys - diff --git a/high_performance_pyspark/bad_pyspark.py b/high_performance_pyspark/bad_pyspark.py index 46741dc..1e529ae 100644 --- a/high_performance_pyspark/bad_pyspark.py +++ b/high_performance_pyspark/bad_pyspark.py @@ -5,6 +5,7 @@ global sc + def nonExistentInput(sc): """ Attempt to load non existent input @@ -18,6 +19,7 @@ def nonExistentInput(sc): failedRdd.count() # end::nonExistent[] + def throwOuter(sc): """ Attempt to load non existant input @@ -33,6 +35,7 @@ def throwOuter(sc): transform2.count() # end::throwOuter[] + def throwInner(sc): """ Attempt to load non existant input @@ -48,6 +51,7 @@ def throwInner(sc): transform2.count() # end::throwInner[] + # tag::rewrite[] def add1(x): """ @@ -57,6 +61,7 @@ def add1(x): """ return x + 1 + def divZero(x): """ Divide by zero (cause an error) @@ -67,6 +72,7 @@ def divZero(x): """ return x / 0 + def throwOuter2(sc): """ Attempt to load non existant input @@ -80,6 +86,7 @@ def throwOuter2(sc): transform2 = transform1.map(divZero) transform2.count() + def throwInner2(sc): """ Attempt to load non existant input @@ -92,8 +99,11 @@ def throwInner2(sc): transform1 = data.map(divZero) transform2 = transform1.map(add1) transform2.count() + + # end::rewrite[] + def throwInner3(sc): """ Attempt to load non existant input @@ -102,14 +112,17 @@ def throwInner3(sc): """ data = sc.parallelize(range(10)) rejectedCount = sc.accumulator(0) + def loggedDivZero(x): import logging + try: return [x / 0] except Exception as e: rejectedCount.add(1) logging.warning("Error found " + repr(e)) return [] + transform1 = data.flatMap(loggedDivZero) transform2 = transform1.map(add1) transform2.count() @@ -128,35 +141,42 @@ def runOutOfMemory(sc): """ # tag::worker_oom[] data = sc.parallelize(range(10)) + def generate_too_much(itr): return range(10000000000000) + itr = data.flatMap(generate_too_much) itr.count() # end::worker_oom[] + def _setupTest(): globs = globals() - spark = SparkSession.builder \ - .master("local[4]") \ - .getOrCreate() + spark = SparkSession.builder.master("local[4]").getOrCreate() sc = spark._sc - globs['sc'] = sc + globs["sc"] = sc return globs - + + def _test(): """ - Run the tests. + Run the tests. Note this will print a lot of error message to stderr since we don't capture the JVM sub process stdout/stderr for doctests. 
""" import doctest + globs = setupTest() - (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) - globs['sc'].stop() + (failure_count, test_count) = doctest.testmod( + globs=globs, optionflags=doctest.ELLIPSIS + ) + globs["sc"].stop() if failure_count: exit(-1) + import sys + if __name__ == "__main__": _test() # Hack to support running in nose diff --git a/high_performance_pyspark/simple_perf.py b/high_performance_pyspark/simple_perf.py index 773ad3e..1fb7f86 100644 --- a/high_performance_pyspark/simple_perf.py +++ b/high_performance_pyspark/simple_perf.py @@ -9,6 +9,7 @@ import timeit import time + def generate_scale_data(sqlCtx, rows, numCols): """ Generate scale data for the performance test. @@ -45,14 +46,14 @@ def generate_scale_data(sqlCtx, rows, numCols): # This returns a Java RDD of Rows - normally it would better to # return a DataFrame directly, but for illustration we will work # with an RDD of Rows. - java_rdd = (gateway.jvm.com.highperformancespark.examples. - tools.GenerateScalingData. - generateMiniScaleRows(scalasc, rows, numCols)) + java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData.generateMiniScaleRows( + scalasc, rows, numCols + ) # Schemas are serialized to JSON and sent back and forth # Construct a Python Schema and turn it into a Java Schema - schema = StructType([ - StructField("zip", IntegerType()), - StructField("fuzzyness", DoubleType())]) + schema = StructType( + [StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())] + ) # 2.1 / pre-2.1 try: jschema = javaSqlCtx.parseDataType(schema.json()) @@ -67,19 +68,25 @@ def generate_scale_data(sqlCtx, rows, numCols): return (python_dataframe, pairRDD) # end::javaInterop[] + def runOnDF(df): result = df.groupBy("zip").avg("fuzzyness").count() return result + def runOnRDD(rdd): - result = rdd.map(lambda (x, y): (x, (y, 1))). \ - reduceByKey(lambda x, y: (x[0] + y [0], x[1] + y[1])). \ - count() + result = ( + rdd.map(lambda x, y: (x, (y, 1))) + .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) + .count() + ) return result + def groupOnRDD(rdd): return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count() + def run(sc, sqlCtx, scalingFactor, size): """ Run the simple perf test printing the results to stdout. 
@@ -98,17 +105,30 @@ def run(sc, sqlCtx, scalingFactor, size): """ (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size) input_rdd.cache().count() - rddTimeings = timeit.repeat(stmt=lambda: runOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') - groupTimeings = timeit.repeat(stmt=lambda: groupOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + rddTimeings = timeit.repeat( + stmt=lambda: runOnRDD(input_rdd), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) + groupTimeings = timeit.repeat( + stmt=lambda: groupOnRDD(input_rdd), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) input_df.cache().count() - dfTimeings = timeit.repeat(stmt=lambda: runOnDF(input_df), repeat=10, number=1, timer=time.time, setup='gc.enable()') - print "RDD:" - print rddTimeings - print "group:" - print groupTimeings - print "df:" - print dfTimeings - print "yay" + dfTimeings = timeit.repeat( + stmt=lambda: runOnDF(input_df), + repeat=10, + number=1, + timer=time.time, + setup="gc.enable()", + ) + print(f"RDD: {rddTimeings}, group: {groupTimeings}, df: {dfTimeings}") + def parseArgs(args): """ @@ -130,6 +150,7 @@ def parseArgs(args): import sys from pyspark import SparkContext from pyspark.sql import SQLContext + (scalingFactor, size) = parseArgs(sys.argv) session = SparkSession.appName("SimplePythonPerf").builder.getOrCreate() sc = session._sc diff --git a/python/tox.ini b/python/tox.ini index e4da796..3162d94 100644 --- a/python/tox.ini +++ b/python/tox.ini @@ -49,6 +49,9 @@ extras = tests skipsdist = True commands = black --check examples allowlist_externals = black +deps = + black + -rrequirements.txt [testenv:flake8] extras = tests From 64420150e0022fd60f607b2c56e580fedbb8aed5 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 28 Aug 2023 18:04:55 -0700 Subject: [PATCH 21/53] Start adding SQL examples to illustrate partioned/non-partioned joins (#110) * Start working on adding SQL examples + thing to run them for CI Work on script to run the sql examples. Add missing steps Add missing runson * Add partitioned/non-partitioned join examples. * Fix cache etc. Fix shellcheck Fix cache etc. * Rename run ex * Use java 17 for style. * Checkout needed. 
* Fix shellcheck * Try and fix cache of spark DL * Fix style with run_sql_examples --- .github/workflows/ci.yml | 35 ++++++++++++++++- .gitignore | 15 ++++++++ migration/sql.sh | 2 + run_sql_examples.sh | 52 ++++++++++++++++++++++++++ sql/nonpartitioned_table_join.sql | 12 ++++++ sql/nonpartitioned_table_join.sql.conf | 7 ++++ sql/partioned_table_join.sql | 14 +++++++ sql/partioned_table_join.sql.conf | 7 ++++ 8 files changed, 143 insertions(+), 1 deletion(-) create mode 100755 run_sql_examples.sh create mode 100644 sql/nonpartitioned_table_join.sql create mode 100644 sql/nonpartitioned_table_join.sql.conf create mode 100644 sql/partioned_table_join.sql create mode 100644 sql/partioned_table_join.sql.conf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bb9113a..faa6e1a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 - - name: Sync the current branch with the latest in spark-testing-base + - name: Sync the current branch with the latest if: github.repository != 'high-performance-spark/high-performance-spark-examples' id: sync-branch run: | @@ -43,3 +43,36 @@ jobs: - name: Run tox run: | cd python; tox + run-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Cache Spark and friends + uses: actions/cache@v3 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Run sql examples + run: + ./run_sql_examples.sh + style: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Shellcheck + run: | + sudo apt-get install -y shellcheck + shellcheck $(find -name "*.sh") + - name: Setup JDK + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: 17 + cache: sbt + - name: scala + run: + sbt scalastyle diff --git a/.gitignore b/.gitignore index 8b2b139..077c1d3 100644 --- a/.gitignore +++ b/.gitignore @@ -57,9 +57,24 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +# scala stuff +.metals # native *.o *.so *.so.0.0.0 *.so.0 + +# Spark files +*.tgz +iceberg-spark-runtime-*.jar +spark-*-bin-hadoop*/ + +# Warehouse +spark-warehouse/ +warehouse/ +metastore_db/ + +# Misc internal stuff +sql/*.sql.out \ No newline at end of file diff --git a/migration/sql.sh b/migration/sql.sh index 57fba45..3d94f07 100644 --- a/migration/sql.sh +++ b/migration/sql.sh @@ -1,3 +1,5 @@ +#!/bin/bash + pip install sqlfluff python -m pip install 'sqlfluff-plugin-sparksql-upgrade @ git+https://github.com/holdenk/spark-upgrade#subdirectory=sql' diff --git a/run_sql_examples.sh b/run_sql_examples.sh new file mode 100755 index 0000000..7803542 --- /dev/null +++ b/run_sql_examples.sh @@ -0,0 +1,52 @@ +#!/bin/bash +set -ex + +# Download Spark and iceberg if not present +SPARK_MAJOR="3.4" +SPARK_VERSION=3.4.1 +HADOOP_VERSION="3" +SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" +ICEBERG_VERSION="1.3.1" +if [ ! -f "${SPARK_FILE}" ]; then + wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" & +fi +# Download Icberg if not present +ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_2.13-${ICEBERG_VERSION}.jar" +if [ ! -f "${ICEBERG_FILE}" ]; then + wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_2.13/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & +fi +wait +# Setup the env +if [ ! 
-d "${SPARK_PATH}" ]; then + tar -xf ${SPARK_FILE} +fi +if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then + cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" +fi + +# Set up for running pyspark and friends +export PATH=${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH} + +# Make sure we have a history directory +mkdir -p /tmp/spark-events + +# We use `` for mid multi-line command comments. (see https://stackoverflow.com/questions/9522631/how-to-put-a-line-comment-for-a-multi-line-command). +# For each SQL +for sql_file in sql/*.sql; do + echo "Processing ${sql_file}" + # shellcheck disable=SC2046 + spark-sql --master local[5] \ + --conf spark.eventLog.enabled=true \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ + $(cat "${sql_file}.conf" || echo "") \ + --name "${sql_file}" \ + -f "${sql_file}" | tee -a "${sql_file}.out" +done + +# If you want to look at them +# ${SPARK_PATH}/sbin/start-history-server.sh diff --git a/sql/nonpartitioned_table_join.sql b/sql/nonpartitioned_table_join.sql new file mode 100644 index 0000000..572437c --- /dev/null +++ b/sql/nonpartitioned_table_join.sql @@ -0,0 +1,12 @@ +CREATE TABLE IF NOT EXISTS local.udevelopers ( + username string, + firstname string, + lastname string) +USING iceberg; +CREATE TABLE IF NOT EXISTS local.uprojects ( + creator string, + uprojectname string) +USING iceberg; +INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); +INSERT INTO local.uprojects VALUES("krisnova", "aurae"); +SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; diff --git a/sql/nonpartitioned_table_join.sql.conf b/sql/nonpartitioned_table_join.sql.conf new file mode 100644 index 0000000..ece26ce --- /dev/null +++ b/sql/nonpartitioned_table_join.sql.conf @@ -0,0 +1,7 @@ + --conf spark.sql.sources.v2.bucketing.enabled=true + --conf spark.sql.iceberg.planning.preserve-data-grouping=true + --conf spark.sql.requireAllClusterKeysForCoPartition=false + + --conf spark.sql.adaptive.enabled=false + --conf spark.sql.autoBroadcastJoinThreshold=-1 + --conf spark.sql.shuffle.partitions=4 diff --git a/sql/partioned_table_join.sql b/sql/partioned_table_join.sql new file mode 100644 index 0000000..1f6dac3 --- /dev/null +++ b/sql/partioned_table_join.sql @@ -0,0 +1,14 @@ +CREATE TABLE IF NOT EXISTS local.developers ( + username string, + firstname string, + lastname string) +USING iceberg +PARTITIONED BY (username); +CREATE TABLE IF NOT EXISTS local.projects ( + creator string, + projectname string) +USING iceberg +PARTITIONED BY (creator); +INSERT INTO local.developers VALUES("krisnova", "Kris", "Nova"); +INSERT INTO local.projects VALUES("krisnova", "aurae"); +SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username; diff --git a/sql/partioned_table_join.sql.conf b/sql/partioned_table_join.sql.conf new file mode 100644 index 0000000..ece26ce --- /dev/null +++ b/sql/partioned_table_join.sql.conf @@ -0,0 +1,7 @@ + --conf spark.sql.sources.v2.bucketing.enabled=true + --conf spark.sql.iceberg.planning.preserve-data-grouping=true + --conf spark.sql.requireAllClusterKeysForCoPartition=false + + --conf spark.sql.adaptive.enabled=false + --conf 
spark.sql.autoBroadcastJoinThreshold=-1 + --conf spark.sql.shuffle.partitions=4 From 79acfc151ddb4d56c0b9d56124064706880536df Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 28 Aug 2023 18:19:29 -0700 Subject: [PATCH 22/53] Depend on user installing sbt. --- sbt/sbt | 52 ----------------------------- sbt/sbt.bat | 95 ----------------------------------------------------- 2 files changed, 147 deletions(-) delete mode 100755 sbt/sbt delete mode 100644 sbt/sbt.bat diff --git a/sbt/sbt b/sbt/sbt deleted file mode 100755 index aac1085..0000000 --- a/sbt/sbt +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This script launches sbt for this project. If present it uses the system -# version of sbt. If there is no system version of sbt it attempts to download -# sbt locally. -SBT_VERSION=0.13.9 -URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar -URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar -JAR=sbt/sbt-launch-${SBT_VERSION}.jar - -# Download sbt launch jar if it hasn't been downloaded yet -if [ ! -f ${JAR} ]; then - # Download - printf "Attempting to fetch sbt\n" - set -x - JAR_DL=${JAR}.part - if hash wget 2>/dev/null; then - (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} - elif hash axel 2>/dev/null; then - (axel ${URL1} -o ${JAR_DL} || axel ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR} - else - printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" - exit -1 - fi -fi -if [ ! -f ${JAR} ]; then - # We failed to download - printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" - exit -1 -fi -printf "Launching sbt from ${JAR}\n" -java \ - -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ - -jar ${JAR} \ - "$@" diff --git a/sbt/sbt.bat b/sbt/sbt.bat deleted file mode 100644 index 0f7a3e9..0000000 --- a/sbt/sbt.bat +++ /dev/null @@ -1,95 +0,0 @@ -@REM SBT launcher script -@REM -@REM Environment: -@REM JAVA_HOME - location of a JDK home dir (mandatory) -@REM SBT_OPTS - JVM options (optional) -@REM Configuration: -@REM sbtconfig.txt found in the SBT_HOME. - -@REM ZOMG! We need delayed expansion to build up CFG_OPTS later -@setlocal enabledelayedexpansion - -@echo off -set SBT_HOME=%~dp0 - -rem FIRST we load the config file of extra options. -set FN=%SBT_HOME%\..\conf\sbtconfig.txt -set CFG_OPTS= -FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO ( - set DO_NOT_REUSE_ME=%%i - rem ZOMG (Part #2) WE use !! 
here to delay the expansion of - rem CFG_OPTS, otherwise it remains "" for this loop. - set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! -) - -rem poor man's jenv (which is not available on Windows) -IF DEFINED JAVA_HOMES ( - IF EXIST .java-version FOR /F %%A IN (.java-version) DO ( - SET JAVA_HOME=%JAVA_HOMES%\%%A - SET JDK_HOME=%JAVA_HOMES%\%%A - ) -) -rem must set PATH or wrong javac is used for java projects -IF DEFINED JAVA_HOME SET "PATH=%JAVA_HOME%\bin;%PATH%" - -rem users can set JAVA_OPTS via .jvmopts (sbt-extras style) -IF EXIST .jvmopts FOR /F %%A IN (.jvmopts) DO ( - SET JAVA_OPTS=%%A !JAVA_OPTS! -) - -rem We use the value of the JAVACMD environment variable if defined -set _JAVACMD=%JAVACMD% - -if "%_JAVACMD%"=="" ( - if not "%JAVA_HOME%"=="" ( - if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" - ) -) - -if "%_JAVACMD%"=="" set _JAVACMD=java - -rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. -set _JAVA_OPTS=%JAVA_OPTS% -if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% - -:args_loop -if "%~1" == "" goto args_end - -if "%~1" == "-jvm-debug" ( - set JVM_DEBUG=true - set /a JVM_DEBUG_PORT=5005 2>nul >nul -) else if "!JVM_DEBUG!" == "true" ( - set /a JVM_DEBUG_PORT=%1 2>nul >nul - if not "%~1" == "!JVM_DEBUG_PORT!" ( - set SBT_ARGS=!SBT_ARGS! %1 - ) -) else ( - set SBT_ARGS=!SBT_ARGS! %1 -) - -shift -goto args_loop -:args_end - -if defined JVM_DEBUG_PORT ( - set _JAVA_OPTS=!_JAVA_OPTS! -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=!JVM_DEBUG_PORT! -) - -call :run %SBT_ARGS% - -if ERRORLEVEL 1 goto error -goto end - -:run - -"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %* -goto :eof - -:error -@endlocal -exit /B 1 - - -:end -@endlocal -exit /B 0 From 5ff04dcd132e5a7caa6e2ff4a2455b608f0a13a3 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 4 Sep 2023 17:51:17 -0700 Subject: [PATCH 23/53] Unify python examples and update run scripts (#111) * Unify the PySpark examples (idk why they were seperated) and add a script to run them same as the SQL ex. * Try and make run more flex * Change how we trigger OOMing. * Skip doctest of OOM since it puts SparkContext into a bad state. 
* Add a quote and disable SC2046 --- .github/workflows/ci.yml | 15 +++++++++ .gitignore | 3 +- env_setup.sh | 33 +++++++++++++++++++ .../examples}/SQLLineage.py | 29 ++++++++-------- .../examples}/bad_pyspark.py | 25 +++++++------- .../examples}/simple_perf.py | 26 ++++----------- run_pyspark_examples.sh | 20 +++++++++++ run_sql_examples.sh | 30 +---------------- 8 files changed, 102 insertions(+), 79 deletions(-) create mode 100644 env_setup.sh rename {high_performance_pyspark => python/examples}/SQLLineage.py (86%) rename {high_performance_pyspark => python/examples}/bad_pyspark.py (89%) rename {high_performance_pyspark => python/examples}/simple_perf.py (86%) create mode 100755 run_pyspark_examples.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index faa6e1a..6cbfb77 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,6 +58,21 @@ jobs: - name: Run sql examples run: ./run_sql_examples.sh + run-pyspark-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Cache Spark and friends + uses: actions/cache@v3 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Run PySpark examples + run: + ./run_pyspark_examples.sh style: runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore index 077c1d3..447436a 100644 --- a/.gitignore +++ b/.gitignore @@ -77,4 +77,5 @@ warehouse/ metastore_db/ # Misc internal stuff -sql/*.sql.out \ No newline at end of file +sql/*.sql.out +python/examples/*.py.out \ No newline at end of file diff --git a/env_setup.sh b/env_setup.sh new file mode 100644 index 0000000..5d06aa5 --- /dev/null +++ b/env_setup.sh @@ -0,0 +1,33 @@ +#!/bin/bash + + +# Download Spark and iceberg if not present +SPARK_MAJOR="3.4" +SPARK_VERSION=3.4.1 +HADOOP_VERSION="3" +SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" +ICEBERG_VERSION="1.3.1" +if [ ! -f "${SPARK_FILE}" ]; then + wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" & +fi +# Download Icberg if not present +ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_2.13-${ICEBERG_VERSION}.jar" +if [ ! -f "${ICEBERG_FILE}" ]; then + wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_2.13/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & +fi +wait +# Setup the env +if [ ! -d "${SPARK_PATH}" ]; then + tar -xf ${SPARK_FILE} +fi +if [ ! 
-f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then + cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" +fi + +# Set up for running pyspark and friends +export PATH=${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH} + +# Make sure we have a history directory +mkdir -p /tmp/spark-events + diff --git a/high_performance_pyspark/SQLLineage.py b/python/examples/SQLLineage.py similarity index 86% rename from high_performance_pyspark/SQLLineage.py rename to python/examples/SQLLineage.py index c9cedf2..71b7209 100644 --- a/high_performance_pyspark/SQLLineage.py +++ b/python/examples/SQLLineage.py @@ -1,3 +1,13 @@ +from pyspark.sql import DataFrame, Row +from pyspark.sql.session import SparkSession +import sys + +global df +global sc +global rdd +global spark + + """ >>> df = rdd.toDF() >>> df2 = cutLineage(df) @@ -7,14 +17,6 @@ True """ -global df -global sc -global rdd -global spark - -from pyspark.context import SparkContext -from pyspark.sql import DataFrame, Row -from pyspark.sql.session import SparkSession # tag::cutLineage[] def cutLineage(df): @@ -31,11 +33,8 @@ def cutLineage(df): jSchema = df._jdf.schema() jRDD.cache() sqlCtx = df.sql_ctx - try: - javaSqlCtx = sqlCtx._jsqlContext - except: - javaSqlCtx = sqlCtx._ssql_ctx - newJavaDF = javaSqlCtx.createDataFrame(jRDD, jSchema) + javaSparkSession = sqlCtx._jSparkSession + newJavaDF = javaSparkSession.createDataFrame(jRDD, jSchema) newDF = DataFrame(newJavaDF, sqlCtx) return newDF @@ -50,7 +49,7 @@ def _setupTest(): sc.setLogLevel("ERROR") globs["sc"] = sc globs["spark"] = spark - globs["rdd"] = rdd = sc.parallelize( + globs["rdd"] = sc.parallelize( [ Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), @@ -75,8 +74,6 @@ def _test(): exit(-1) -import sys - if __name__ == "__main__": _test() # Hack to support running in nose diff --git a/high_performance_pyspark/bad_pyspark.py b/python/examples/bad_pyspark.py similarity index 89% rename from high_performance_pyspark/bad_pyspark.py rename to python/examples/bad_pyspark.py index 1e529ae..8b224d5 100644 --- a/high_performance_pyspark/bad_pyspark.py +++ b/python/examples/bad_pyspark.py @@ -1,7 +1,7 @@ # This script triggers a number of different PySpark errors -from pyspark import * from pyspark.sql.session import SparkSession +import sys global sc @@ -131,22 +131,20 @@ def loggedDivZero(x): def runOutOfMemory(sc): """ - Run out of memory on the workers. - In standalone modes results in a memory error, but in YARN may trigger YARN container - overhead errors. - >>> runOutOfMemory(sc) + Run out of memory on the workers from a skewed shuffle. + >>> runOutOfMemory(sc) # doctest: +SKIP Traceback (most recent call last): ... Py4JJavaError:... 
""" # tag::worker_oom[] - data = sc.parallelize(range(10)) + data = sc.parallelize(range(10000)) - def generate_too_much(itr): - return range(10000000000000) + def generate_too_much(i: int): + return list(map(lambda v: (i % 2, v), range(100000 * i))) - itr = data.flatMap(generate_too_much) - itr.count() + bad = data.flatMap(generate_too_much).groupByKey() + bad.count() # end::worker_oom[] @@ -166,17 +164,18 @@ def _test(): """ import doctest - globs = setupTest() + globs = _setupTest() (failure_count, test_count) = doctest.testmod( globs=globs, optionflags=doctest.ELLIPSIS ) + print("All tests done, stopping Spark context.") globs["sc"].stop() if failure_count: exit(-1) + else: + exit(0) -import sys - if __name__ == "__main__": _test() # Hack to support running in nose diff --git a/high_performance_pyspark/simple_perf.py b/python/examples/simple_perf.py similarity index 86% rename from high_performance_pyspark/simple_perf.py rename to python/examples/simple_perf.py index 1fb7f86..3ec452e 100644 --- a/high_performance_pyspark/simple_perf.py +++ b/python/examples/simple_perf.py @@ -4,8 +4,9 @@ # should be taken as it depends on many private members that may change in # future releases of Spark. -from pyspark.sql.types import * -from pyspark.sql import * +from pyspark.sql.types import StructType, IntegerType, DoubleType, StructField +from pyspark.sql import DataFrame, SparkSession +import sys import timeit import time @@ -29,14 +30,7 @@ def generate_scale_data(sqlCtx, rows, numCols): """ # tag::javaInterop[] sc = sqlCtx._sc - # Get the SQL Context, 2.1, 2.0 and pre-2.0 syntax - yay internals :p - try: - try: - javaSqlCtx = sqlCtx._jsqlContext - except: - javaSqlCtx = sqlCtx._ssql_ctx - except: - javaSqlCtx = sqlCtx._jwrapped + javaSparkSession = sqlCtx._jSparkSession jsc = sc._jsc scalasc = jsc.sc() gateway = sc._gateway @@ -54,13 +48,9 @@ def generate_scale_data(sqlCtx, rows, numCols): schema = StructType( [StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())] ) - # 2.1 / pre-2.1 - try: - jschema = javaSqlCtx.parseDataType(schema.json()) - except: - jschema = sqlCtx._jsparkSession.parseDataType(schema.json()) + jschema = javaSparkSession.parseDataType(schema.json()) # Convert the Java RDD to Java DataFrame - java_dataframe = javaSqlCtx.createDataFrame(java_rdd, jschema) + java_dataframe = javaSparkSession.createDataFrame(java_rdd, jschema) # Wrap the Java DataFrame into a Python DataFrame python_dataframe = DataFrame(java_dataframe, sqlCtx) # Convert the Python DataFrame into an RDD @@ -143,13 +133,9 @@ def parseArgs(args): if __name__ == "__main__": - """ Usage: simple_perf_test scalingFactor size """ - import sys - from pyspark import SparkContext - from pyspark.sql import SQLContext (scalingFactor, size) = parseArgs(sys.argv) session = SparkSession.appName("SimplePythonPerf").builder.getOrCreate() diff --git a/run_pyspark_examples.sh b/run_pyspark_examples.sh new file mode 100755 index 0000000..d76f0bd --- /dev/null +++ b/run_pyspark_examples.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +source env_setup.sh + +pip install -r ./python/requirements.txt + +for ex in python/examples/*.py; do + # shellcheck disable=SC2046 + spark-submit \ + --master local[5] \ + --conf spark.eventLog.enabled=true \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf 
"spark.sql.catalog.local.warehouse=$PWD/warehouse" \ + $(cat "${ex}.conf" || echo "") \ + --name "${ex}" \ + "${ex}" 2>&1 | tee -a "${ex}.out" +done diff --git a/run_sql_examples.sh b/run_sql_examples.sh index 7803542..ffc92d8 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -1,35 +1,7 @@ #!/bin/bash set -ex -# Download Spark and iceberg if not present -SPARK_MAJOR="3.4" -SPARK_VERSION=3.4.1 -HADOOP_VERSION="3" -SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" -SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" -ICEBERG_VERSION="1.3.1" -if [ ! -f "${SPARK_FILE}" ]; then - wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" & -fi -# Download Icberg if not present -ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_2.13-${ICEBERG_VERSION}.jar" -if [ ! -f "${ICEBERG_FILE}" ]; then - wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_2.13/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & -fi -wait -# Setup the env -if [ ! -d "${SPARK_PATH}" ]; then - tar -xf ${SPARK_FILE} -fi -if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then - cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" -fi - -# Set up for running pyspark and friends -export PATH=${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH} - -# Make sure we have a history directory -mkdir -p /tmp/spark-events +source env_setup.sh # We use `` for mid multi-line command comments. (see https://stackoverflow.com/questions/9522631/how-to-put-a-line-comment-for-a-multi-line-command). # For each SQL From 4e109c473eb249bb00e2217b612b6d893e94675b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 21 Sep 2023 10:30:47 -0700 Subject: [PATCH 24/53] Add some data validation examples (#112) Add some data validation examples including: pandera Nike's spark-expectations basic WAP Target's data-validator Consisting of: * Add pandera req * Start adding a pandera example * Formatting * Format * Formatting * Add an example for the target data validator. * Flesh out the target example and add it to CI. * Start working on adding spark expectations from the Nike folks. * Update rule * Play around with spark expectations * Update the sample rule * Fix mismatched scala versions * Install IcebergSparkSessionExtensions * Start adding a pure SQL WAP example. * Switch to using Session over legacy context. * Format python ex * Style cleanup * Hmmm clone dv as well * Comment out not yet working FF in SQL * Fix examples * Skip CI on target data validator for now (nested build issue and it should go away once PR is merged anyways). * SparkSession imports are good. 
--- .github/workflows/ci.yml | 33 ++++++ .gitignore | 7 +- .../dataframe/HappyPandasTest.scala | 14 +-- data/project.csv | 5 + env_setup.sh | 12 +- python/examples/bad_pyspark.py | 4 +- python/examples/pandera_ex.py | 52 +++++++++ python/examples/simple_perf.py | 5 +- python/examples/spark_expectations_example.py | 105 ++++++++++++++++++ .../spark_expectations_sample_rules.json | 1 + python/requirements.txt | 6 + python/tox.ini | 2 +- run_pyspark_examples.sh | 40 ++++++- run_sql_examples.sh | 22 +++- sql/wap.sql | 14 +++ target-validator/ex.yaml | 31 ++++++ target-validator/runme.sh | 18 +++ 17 files changed, 350 insertions(+), 21 deletions(-) create mode 100644 data/project.csv create mode 100644 python/examples/pandera_ex.py create mode 100644 python/examples/spark_expectations_example.py create mode 100644 python/examples/spark_expectations_sample_rules.json create mode 100644 sql/wap.sql create mode 100644 target-validator/ex.yaml create mode 100755 target-validator/runme.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6cbfb77..9392efc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,9 +55,36 @@ jobs: spark*.tgz iceberg*.jar key: spark-artifacts + - name: Cache Data + uses: actions/cache@v3 + with: + path: | + data/fetched/* + key: data-fetched - name: Run sql examples run: ./run_sql_examples.sh + run-target-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Cache Spark and friends + uses: actions/cache@v3 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Cache Data + uses: actions/cache@v3 + with: + path: | + data/fetched/* + key: data-fetched + - name: Run the target validator example + run: + cd target-validator; ./runme.sh run-pyspark-examples: runs-on: ubuntu-latest steps: @@ -70,6 +97,12 @@ jobs: spark*.tgz iceberg*.jar key: spark-artifacts + - name: Cache Data + uses: actions/cache@v3 + with: + path: | + data/fetched/* + key: data-fetched - name: Run PySpark examples run: ./run_pyspark_examples.sh diff --git a/.gitignore b/.gitignore index 447436a..21afd2c 100644 --- a/.gitignore +++ b/.gitignore @@ -78,4 +78,9 @@ metastore_db/ # Misc internal stuff sql/*.sql.out -python/examples/*.py.out \ No newline at end of file +python/examples/*.py.out +data/fetched/* + +# more python +pyspark_venv.tar.gz +pyspark_venv/ diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index 7c5dbaa..5621ee2 100644 --- a/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -9,7 +9,7 @@ import scala.util.Random import org.apache.spark.sql.DataFrame import org.apache.spark.sql.Row -import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.{SQLContext, SparkSession} import org.apache.spark.sql.types._ import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo @@ -68,7 +68,7 @@ class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { val expectedDf = createDF(expectedList, ("place", StringType), ("percentHappy", DoubleType)) - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPercentage(inputDF) assertDataFrameApproximateEquals(expectedDf, resultDF, 1E-5) @@ -76,7 +76,7 @@ class 
HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { //end::approxEqualDataFrames[] test("verify approx by hand") { - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPercentage(inputDF) val resultRows = resultDF.collect() @@ -94,7 +94,7 @@ class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { } test("test encode Panda type") { - val inputDF = sqlContext.createDataFrame(rawPandaList) + val inputDF = spark.createDataFrame(rawPandaList) val resultDF = HappyPandas.encodePandaType(inputDF) val expectedRows = List(Row(10L, 0), Row(11L, 1)) @@ -107,7 +107,7 @@ class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { //tag::exactEqualDataFrames[] test("verify exact equality") { // test minHappyPandas - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val result = HappyPandas.minHappyPandas(inputDF, 2) val resultRows = result.collect() @@ -117,12 +117,12 @@ class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { //end::exactEqualDataFrames[] test("test happyPandasPlaces") { - val inputDF = sqlContext.createDataFrame(pandaInfoList) + val inputDF = spark.createDataFrame(pandaInfoList) val resultDF = HappyPandas.happyPandasPlaces(inputDF) val expectedRows = List(PandaInfo(toronto, "giant", 1, 2), PandaInfo(sandiego, "red", 2, 3)) - val expectedDF = sqlContext.createDataFrame(expectedRows) + val expectedDF = spark.createDataFrame(expectedRows) assertDataFrameEquals(expectedDF, resultDF) } diff --git a/data/project.csv b/data/project.csv new file mode 100644 index 0000000..6921010 --- /dev/null +++ b/data/project.csv @@ -0,0 +1,5 @@ +creator,projectname,stars +holdenk,spark-upgrade,17 +krisnova,rust-nova,71 +kbendick,MongoMart,6 +mateiz,spark,36600 \ No newline at end of file diff --git a/env_setup.sh b/env_setup.sh index 5d06aa5..80f34cb 100644 --- a/env_setup.sh +++ b/env_setup.sh @@ -4,6 +4,7 @@ # Download Spark and iceberg if not present SPARK_MAJOR="3.4" SPARK_VERSION=3.4.1 +SCALA_VERSION="2.12" HADOOP_VERSION="3" SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" @@ -12,15 +13,18 @@ if [ ! -f "${SPARK_FILE}" ]; then wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" & fi # Download Icberg if not present -ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_2.13-${ICEBERG_VERSION}.jar" +ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar" if [ ! -f "${ICEBERG_FILE}" ]; then - wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_2.13/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & + wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & fi wait # Setup the env if [ ! -d "${SPARK_PATH}" ]; then tar -xf ${SPARK_FILE} fi + +export SPARK_HOME="${SPARK_PATH}" + if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" fi @@ -31,3 +35,7 @@ export PATH=${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/s # Make sure we have a history directory mkdir -p /tmp/spark-events +mkdir -p ./data/fetched/ +if [ ! 
-f ./data/fetched/2021 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021 +fi diff --git a/python/examples/bad_pyspark.py b/python/examples/bad_pyspark.py index 8b224d5..083fbdd 100644 --- a/python/examples/bad_pyspark.py +++ b/python/examples/bad_pyspark.py @@ -159,8 +159,8 @@ def _setupTest(): def _test(): """ Run the tests. - Note this will print a lot of error message to stderr since we don't capture the JVM sub process - stdout/stderr for doctests. + Note this will print a lot of error message to stderr since we don't + capture the JVM sub process stdout/stderr for doctests. """ import doctest diff --git a/python/examples/pandera_ex.py b/python/examples/pandera_ex.py new file mode 100644 index 0000000..78155ce --- /dev/null +++ b/python/examples/pandera_ex.py @@ -0,0 +1,52 @@ +from pyspark.sql.session import SparkSession + +# tags::pandera_imports[] +import pandera.pyspark as pa +import pyspark.sql.types as T + +# end::pandera_imports[] + + +# tag::simple_data_schema[] +class ProjectDataSchema(pa.DataFrameModel): + # Note str_length is currently broken :/ + creator: T.StringType() = pa.Field(str_length={"min_value": 1}) + projectname: T.StringType() = pa.Field() + stars: T.IntegerType() = pa.Field(ge=0) + + +# end::simple_data_schema[] + + +# tag::gender_data[] +class GenderData(pa.DataFrameModel): + MaleBonusPercent: T.DoubleType() = pa.Field(nullable=True, le=5) + FemaleBonusPercent: T.DoubleType() = pa.Field(nullable=True) + CompanyNumber: T.IntegerType() = pa.Field() + + +# end::gender_data[] + +if __name__ == "__main__": + spark = SparkSession.builder.master("local[4]").getOrCreate() + # Make sure to make + # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" + # available as ./data/2021 + uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + + # tag::validate_gender_data[] + validated_df = GenderData(uk_df) + # Print out the errors. You may wish to exit with an error condition. + if validated_df.pandera.errors != {}: + print(validated_df.pandera.errors) + # sys.exit(1) + # end::validate_gender_data[] + + # tag::validate_project_data[] + project_data = spark.read.csv("./data/project.csv", header=True, inferSchema=True) + validated_df = ProjectDataSchema(project_data) + # Print out the errors. You may wish to exit with an error condition. + if validated_df.pandera.errors != {}: + print(validated_df.pandera.errors) + # sys.exit(1) + # end::validate_project_data[] diff --git a/python/examples/simple_perf.py b/python/examples/simple_perf.py index 3ec452e..30ba2c7 100644 --- a/python/examples/simple_perf.py +++ b/python/examples/simple_perf.py @@ -1,5 +1,8 @@ # When running this example make sure to include the built Scala jar : -# $SPARK_HOME/bin/pyspark --jars ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar +# +# $SPARK_HOME/bin/pyspark --jars \ +# ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar +# # This example illustrates how to interface Scala and Python code, but caution # should be taken as it depends on many private members that may change in # future releases of Spark. 
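A note on pandera_ex.py above: the example prints the pandera error dictionary but leaves the hard failure commented out. A minimal sketch of the gate a CI job might use instead, assuming the same validated_df returned by the schemas in that file (raising an exception instead of exiting works just as well; the function name is illustrative):

import sys


def fail_on_validation_errors(validated_df):
    # pandera's PySpark accessor collects schema and data errors in a dict;
    # an empty dict means every check passed.
    errors = validated_df.pandera.errors
    if errors != {}:
        print(errors)
        sys.exit(1)
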
diff --git a/python/examples/spark_expectations_example.py b/python/examples/spark_expectations_example.py new file mode 100644 index 0000000..dab1202 --- /dev/null +++ b/python/examples/spark_expectations_example.py @@ -0,0 +1,105 @@ +from pyspark import SparkFiles +from pyspark.sql import * +from spark_expectations.core.expectations import SparkExpectations + +spark = SparkSession.builder.master("local[4]").getOrCreate() +sc = spark.sparkContext + +# tag::global_setup[] +from spark_expectations.config.user_config import * + +se_global_spark_Conf = { + "se_notifications_enable_email": False, + "se_notifications_email_smtp_host": "mailhost.example.com", + "se_notifications_email_smtp_port": 25, + "se_notifications_email_from": "timbit@example.com", + "se_notifications_email_subject": "spark expectations - data quality - notifications", + "se_notifications_on_fail": True, + "se_notifications_on_error_drop_exceeds_threshold_breach": True, + "se_notifications_on_error_drop_threshold": 15, + "se_enable_streaming": False, # Required or tries to publish to kafka. +} +# end::gloabl_setup[] + + +# tag::setup_and_load[] +spark.sql("DROP TABLE IF EXISTS local.magic_validation") +spark.sql( + """ +create table local.magic_validation ( + product_id STRING, + table_name STRING, + rule_type STRING, + rule STRING, + column_name STRING, + expectation STRING, + action_if_failed STRING, + tag STRING, + description STRING, + enable_for_source_dq_validation BOOLEAN, + enable_for_target_dq_validation BOOLEAN, + is_active BOOLEAN, + enable_error_drop_alert BOOLEAN, + error_drop_threshold INT +)""" +) +spark.sql( + """ +create table if not exists local.pay_stats ( + product_id STRING, + table_name STRING, + input_count LONG, + error_count LONG, + output_count LONG, + output_percentage FLOAT, + success_percentage FLOAT, + error_percentage FLOAT, + source_agg_dq_results array>, + final_agg_dq_results array>, + source_query_dq_results array>, + final_query_dq_results array>, + row_dq_res_summary array>, + row_dq_error_threshold array>, + dq_status map, + dq_run_time map, + dq_rules map>, + meta_dq_run_id STRING, + meta_dq_run_date DATE, + meta_dq_run_datetime TIMESTAMP +);""" +) +rule_file = "./spark_expectations_sample_rules.json" +sc.addFile(rule_file) +df = spark.read.json(SparkFiles.get(rule_file)) +print(df) +df.write.option("byname", "true").mode("append").saveAsTable("local.magic_validation") +spark.read.table("local.magic_validation").show() +se: SparkExpectations = SparkExpectations( + product_id="pay", debugger=True # Used to filter which rules we apply +) +# end::setup_and_load[] + + +# tag::run_validation[] +# Only row data quality checking +@se.with_expectations( + se.reader.get_rules_from_table( + product_rules_table="local.magic_validation", + target_table_name="local.bonuses", + dq_stats_table_name="local.pay_stats", + ), + write_to_table=False, + row_dq=True, + # This does not work currently (Iceberg) + spark_conf={"format": "iceberg"}, + options={"format": "iceberg"}, + options_error_table={"format": "iceberg"}, +) +def load_data(): + raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") + return uk_df + + +data = load_data() +# end::run_validation[] diff --git a/python/examples/spark_expectations_sample_rules.json b/python/examples/spark_expectations_sample_rules.json new file mode 100644 index 0000000..fc24e69 --- /dev/null +++ b/python/examples/spark_expectations_sample_rules.json @@ -0,0 +1 
@@ +{"product_id": "pay", "table_name": "local.bonuses", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} diff --git a/python/requirements.txt b/python/requirements.txt index 2947477..9957d0f 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,3 +3,9 @@ pandas pyarrow pyspark pyspark-asyncactions +pandera +pandera[pyspark] +spark-expectations +venv-pack +delta-spark +requests diff --git a/python/tox.ini b/python/tox.ini index 3162d94..4c90b8f 100644 --- a/python/tox.ini +++ b/python/tox.ini @@ -56,7 +56,7 @@ deps = [testenv:flake8] extras = tests skipsdist = True -commands = flake8 examples +commands = flake8 --ignore=F403,E402,F401,F405 examples allowlist_externals = flake8 [testenv:mypy] diff --git a/run_pyspark_examples.sh b/run_pyspark_examples.sh index d76f0bd..5ce8912 100755 --- a/run_pyspark_examples.sh +++ b/run_pyspark_examples.sh @@ -1,20 +1,54 @@ #!/bin/bash +# shellcheck disable=SC1091,SC2034 source env_setup.sh +set -ex + +set -o pipefail + +#tag::package_venv[] +if [ ! -d pyspark_venv ]; then + python -m venv pyspark_venv +fi + +source pyspark_venv/bin/activate pip install -r ./python/requirements.txt -for ex in python/examples/*.py; do +if [ ! -f pyspark_venv.tar.gz ]; then + venv-pack -o pyspark_venv.tar.gz +fi + + +# Set in local and client mode where the driver uses the Python present +# (requires that you have activated the venv as we did above) +PYSPARK_DRIVER_PYTHON=python +export PYSPARK_DRIVER_PYTHON +export PYTHON_PATH=./environment/bin/python +#end::package_venv[] + +function run_example () { + local ex="$1" # shellcheck disable=SC2046 spark-submit \ --master local[5] \ --conf spark.eventLog.enabled=true \ --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.spark_catalog.type=hive \ --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ --conf spark.sql.catalog.local.type=hadoop \ + --archives pyspark_venv.tar.gz#environment \ --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ $(cat "${ex}.conf" || echo "") \ --name "${ex}" \ - "${ex}" 2>&1 | tee -a "${ex}.out" -done + "${ex}" 2>&1 | tee -a "${ex}.out" || echo "ok" +} + +if [ $# -eq 1 ]; then + run_example "python/examples/$1" +else + for ex in python/examples/*.py; do + run_example "$ex" + done +fi diff --git a/run_sql_examples.sh b/run_sql_examples.sh index ffc92d8..e09f0d0 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -1,15 +1,17 @@ #!/bin/bash set -ex +set -o pipefail source env_setup.sh # We use `` for mid multi-line command comments. (see https://stackoverflow.com/questions/9522631/how-to-put-a-line-comment-for-a-multi-line-command). 
-# For each SQL -for sql_file in sql/*.sql; do - echo "Processing ${sql_file}" + +function run_example () { + local sql_file="$1" # shellcheck disable=SC2046 spark-sql --master local[5] \ --conf spark.eventLog.enabled=true \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ --conf spark.sql.catalog.spark_catalog.type=hive \ --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ @@ -18,7 +20,19 @@ for sql_file in sql/*.sql; do $(cat "${sql_file}.conf" || echo "") \ --name "${sql_file}" \ -f "${sql_file}" | tee -a "${sql_file}.out" -done +} + # If you want to look at them # ${SPARK_PATH}/sbin/start-history-server.sh + +if [ $# -eq 1 ]; then + run_example "sql/$1" +else + # For each SQL + for sql_file in sql/*.sql; do + echo "Processing ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + done +fi diff --git a/sql/wap.sql b/sql/wap.sql new file mode 100644 index 0000000..ac513eb --- /dev/null +++ b/sql/wap.sql @@ -0,0 +1,14 @@ +CREATE TABLE IF NOT EXISTS local.projects ( + creator string, + projectname string) +USING iceberg +PARTITIONED BY (creator); +ALTER TABLE local.projects SET TBLPROPERTIES ( + 'write.wap.enabled''true' +); +ALTER TABLE local.projects CREATE BRANCH IF NOT EXISTS `audit`; +SET spark.wap.branch = 'branch'; +INSERT INTO local.projects VALUES("krisnova", "aurae"); +SELECT count(*) FROM local.projects VERSION AS OF 'audit' WHERE creator is NULL; +-- This does not work until we upgrade to 3.5 + Iceberg 1.4. +-- CALL local.system.fastForward("local.projects", "main", "audit-branch"); diff --git a/target-validator/ex.yaml b/target-validator/ex.yaml new file mode 100644 index 0000000..ce8b492 --- /dev/null +++ b/target-validator/ex.yaml @@ -0,0 +1,31 @@ +detailedErrors: true +numKeyCols: 4 +# We might have a large number of errors so just show the first 5 +numErrorsToReport: 5 + +email: + smtpHost: smtp.example.com + subject: Data Validation Summary + from: data-validator-no-reply@example.com + to: + - professor-timbit@example.com + +tables: + - db: gender_paygaps + table: uk + # Columns that taken together uniquely specifies each row (think of groupBy) + keyColumns: + - CompanyNumber + - EmployerId + - CompanyLinkToGPGInfo + - ResponsiblePerson + # Used to filter + condition: MaleBonusPercent >= FemaleBonusPercent + checks: + # We expect at least 500 records + - type: rowCount + minNumRows: 500 + # We don't expect more than 1% not companies in the dataset. + - type: nullCheck + column: CompanyNumber + threshold: 0.01 diff --git a/target-validator/runme.sh b/target-validator/runme.sh new file mode 100755 index 0000000..52ebe14 --- /dev/null +++ b/target-validator/runme.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# shellcheck disable=SC1091,SC2034 + +source ../env_setup.sh +set -ex +export SPARK_VERSION="${SPARK_VERSION:-3.4.1}" + +# Disable for now until the target folks agree on the PR nested builds are slow. +exit 0 + +git clone git@github.com:holdenk/data-validator.git || git clone https://github.com/holdenk/data-validator.git +cd data-validator +git checkout upgrade-to-modern-spark +sbt -Dspark="${SPARK_VERSION}" clean assembly +JAR_PATH="$(pwd)/target/scala-2.12/data-validator-assembly-${SPARK_VERSION}_0.15.0.jar" +export JAR_PATH +cd .. +spark-submit --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected." 
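The write-audit-publish example in sql/wap.sql above stops short of the publish step because the fast-forward procedure needs a newer Spark/Iceberg pairing than the 3.4.1/1.3.1 combination pinned in env_setup.sh. Below is a hedged PySpark sketch of the full loop, not a definitive implementation: the table, branch name, and audit check mirror wap.sql, the session configuration mirrors the --conf flags in run_sql_examples.sh (master and warehouse path are illustrative, and the Iceberg runtime jar fetched by env_setup.sh is assumed to be on the classpath), and the publish call follows the commented-out call in wap.sql but assumes Iceberg 1.4+, where the procedure is exposed as system.fast_forward, so check the release you run:

from pyspark.sql import SparkSession

# Session wired up like the run scripts.
spark = (
    SparkSession.builder
    .master("local[4]")
    .config("spark.sql.extensions",
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/tmp/warehouse")
    .getOrCreate()
)

# Table and branch as in sql/wap.sql.
spark.sql(
    "CREATE TABLE IF NOT EXISTS local.projects "
    "(creator string, projectname string) USING iceberg PARTITIONED BY (creator)"
)
spark.sql("ALTER TABLE local.projects SET TBLPROPERTIES ('write.wap.enabled'='true')")
spark.sql("ALTER TABLE local.projects CREATE BRANCH IF NOT EXISTS `audit`")

# Write: stage the insert on the audit branch instead of main.
spark.conf.set("spark.wap.branch", "audit")
spark.sql("INSERT INTO local.projects VALUES ('krisnova', 'aurae')")

# Audit: the staged rows are only visible when reading the audit branch.
bad_rows = spark.sql(
    "SELECT count(*) AS c FROM local.projects VERSION AS OF 'audit' "
    "WHERE creator IS NULL"
).first()["c"]

# Publish: fast-forward main onto the audited branch once the check passes.
if bad_rows == 0:
    spark.sql("CALL local.system.fast_forward('local.projects', 'main', 'audit')")
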
From f35b345791cf05055223158882b3ed4964be4e6a Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 21 Sep 2023 14:26:55 -0700 Subject: [PATCH 25/53] Iceberg sort order restriction (#113) * Add pipefail * Add the iceberg schema evolution gotcha I'm looking into. * Finish the repro, cleanup the run script. * Add workaround * Add expected to fail --- run_sql_examples.sh | 4 +--- ...iceberg-schema-evolution-gotcha-possibility.sql | 13 +++++++++++++ ...olution-gotcha-possibility.sql.expected_to_fail | 0 sql/iceberg-schema-evolution-gotcha-workaround.sql | 14 ++++++++++++++ 4 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 sql/iceberg-schema-evolution-gotcha-possibility.sql create mode 100644 sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail create mode 100644 sql/iceberg-schema-evolution-gotcha-workaround.sql diff --git a/run_sql_examples.sh b/run_sql_examples.sh index e09f0d0..c054b31 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -4,8 +4,6 @@ set -o pipefail source env_setup.sh -# We use `` for mid multi-line command comments. (see https://stackoverflow.com/questions/9522631/how-to-put-a-line-comment-for-a-multi-line-command). - function run_example () { local sql_file="$1" # shellcheck disable=SC2046 @@ -19,7 +17,7 @@ function run_example () { --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ $(cat "${sql_file}.conf" || echo "") \ --name "${sql_file}" \ - -f "${sql_file}" | tee -a "${sql_file}.out" + -f "${sql_file}" | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" } diff --git a/sql/iceberg-schema-evolution-gotcha-possibility.sql b/sql/iceberg-schema-evolution-gotcha-possibility.sql new file mode 100644 index 0000000..2038423 --- /dev/null +++ b/sql/iceberg-schema-evolution-gotcha-possibility.sql @@ -0,0 +1,13 @@ +DROP TABLE IF EXISTS local.udevelopers_sorted; +CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( + username string, + firstname string, + lastname string) +USING ICEBERG; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; +ALTER TABLE local.udevelopers_sorted RENAME COLUMN lastname TO deprecated_lastname; +SELECT * FROM local.udevelopers_sorted; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; +ALTER TABLE local.udevelopers_sorted DROP COLUMN lastname; +SELECT * FROM local.udevelopers_sorted; + diff --git a/sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail b/sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail new file mode 100644 index 0000000..e69de29 diff --git a/sql/iceberg-schema-evolution-gotcha-workaround.sql b/sql/iceberg-schema-evolution-gotcha-workaround.sql new file mode 100644 index 0000000..9b3674d --- /dev/null +++ b/sql/iceberg-schema-evolution-gotcha-workaround.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS local.udevelopers_sorted; +CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( + username string, + firstname string, + lastname string) +USING ICEBERG; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; +SELECT * FROM local.udevelopers_sorted; +ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; +-- Hack, add it to identifier fields so we can do a "partial" drop where it stays in the schema and we don't +-- corrupt the metadata. 
+ALTER TABLE local.udevelopers_sorted ADD PARTITION FIELD lastname; +ALTER TABLE local.udevelopers_sorted DROP PARTITION FIELD lastname; +SELECT * FROM local.udevelopers_sorted; From cd1983f98cc5a9940f842da740e566a2b5133964 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 24 Sep 2023 22:27:06 -0700 Subject: [PATCH 26/53] Allow running different versions more easily, fix repro fo Iceberg issue. --- env_setup.sh | 8 ++++---- sql/iceberg-schema-evolution-gotcha-possibility.sql | 3 ++- sql/iceberg-schema-evolution-gotcha-workaround.sql | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/env_setup.sh b/env_setup.sh index 80f34cb..41e2839 100644 --- a/env_setup.sh +++ b/env_setup.sh @@ -2,13 +2,13 @@ # Download Spark and iceberg if not present -SPARK_MAJOR="3.4" -SPARK_VERSION=3.4.1 -SCALA_VERSION="2.12" +SPARK_MAJOR=${SPARK_MAJOR:-"3.4"} +SPARK_VERSION=${SPARK_VERSION:-"3.4.1"} +SCALA_VERSION=${SCALA_VERSION:-"2.12"} HADOOP_VERSION="3" SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" -ICEBERG_VERSION="1.3.1" +ICEBERG_VERSION=${ICEBERG_VERSION:-"1.3.1"} if [ ! -f "${SPARK_FILE}" ]; then wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" & fi diff --git a/sql/iceberg-schema-evolution-gotcha-possibility.sql b/sql/iceberg-schema-evolution-gotcha-possibility.sql index 2038423..99b9fd6 100644 --- a/sql/iceberg-schema-evolution-gotcha-possibility.sql +++ b/sql/iceberg-schema-evolution-gotcha-possibility.sql @@ -4,10 +4,11 @@ CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( firstname string, lastname string) USING ICEBERG; +INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; ALTER TABLE local.udevelopers_sorted RENAME COLUMN lastname TO deprecated_lastname; SELECT * FROM local.udevelopers_sorted; ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; -ALTER TABLE local.udevelopers_sorted DROP COLUMN lastname; +ALTER TABLE local.udevelopers_sorted DROP COLUMN deprecated_lastname; SELECT * FROM local.udevelopers_sorted; diff --git a/sql/iceberg-schema-evolution-gotcha-workaround.sql b/sql/iceberg-schema-evolution-gotcha-workaround.sql index 9b3674d..5b57afb 100644 --- a/sql/iceberg-schema-evolution-gotcha-workaround.sql +++ b/sql/iceberg-schema-evolution-gotcha-workaround.sql @@ -5,6 +5,7 @@ CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( lastname string) USING ICEBERG; ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; +INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); SELECT * FROM local.udevelopers_sorted; ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; -- Hack, add it to identifier fields so we can do a "partial" drop where it stays in the schema and we don't From 0559b2f17ef1b405b3ccc8543b0beda3db832078 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 24 Sep 2023 22:33:04 -0700 Subject: [PATCH 27/53] Style fix --- env_setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/env_setup.sh b/env_setup.sh index 41e2839..a79213e 100644 --- a/env_setup.sh +++ b/env_setup.sh @@ -20,7 +20,7 @@ fi wait # Setup the env if [ ! -d "${SPARK_PATH}" ]; then - tar -xf ${SPARK_FILE} + tar -xf "${SPARK_FILE}" fi export SPARK_HOME="${SPARK_PATH}" @@ -30,7 +30,7 @@ if [ ! 
-f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then fi # Set up for running pyspark and friends -export PATH=${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH} +export PATH="${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH}" # Make sure we have a history directory mkdir -p /tmp/spark-events From 9d5395cdba58856d6d9c8254b382ffeaaec9b563 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 8 Oct 2023 21:35:36 -0700 Subject: [PATCH 28/53] Spark expectations upgrade (#116) * Update to SE 1.0 * Break up the rules into seperatefiles. * Get SparkExpectations complex rules present WIP * Style fixes --- .gitignore | 1 + python/examples/spark_expectations_example.py | 102 ++++++++++-------- .../spark_expectations_sample_rules.json | 1 - python/requirements.txt | 4 +- run_pyspark_examples.sh | 3 + se_complex.json | 1 + se_simple.json | 1 + 7 files changed, 63 insertions(+), 50 deletions(-) delete mode 100644 python/examples/spark_expectations_sample_rules.json create mode 100644 se_complex.json create mode 100644 se_simple.json diff --git a/.gitignore b/.gitignore index 21afd2c..7b3e525 100644 --- a/.gitignore +++ b/.gitignore @@ -80,6 +80,7 @@ metastore_db/ sql/*.sql.out python/examples/*.py.out data/fetched/* +spark_expectations_sample_rules.json # more python pyspark_venv.tar.gz diff --git a/python/examples/spark_expectations_example.py b/python/examples/spark_expectations_example.py index dab1202..7b03df4 100644 --- a/python/examples/spark_expectations_example.py +++ b/python/examples/spark_expectations_example.py @@ -1,14 +1,16 @@ from pyspark import SparkFiles from pyspark.sql import * -from spark_expectations.core.expectations import SparkExpectations +from spark_expectations.core.expectations import ( + SparkExpectations, + WrappedDataFrameWriter, +) spark = SparkSession.builder.master("local[4]").getOrCreate() sc = spark.sparkContext +sc.setLogLevel("ERROR") # tag::global_setup[] -from spark_expectations.config.user_config import * - -se_global_spark_Conf = { +se_conf = { "se_notifications_enable_email": False, "se_notifications_email_smtp_host": "mailhost.example.com", "se_notifications_email_smtp_port": 25, @@ -17,12 +19,13 @@ "se_notifications_on_fail": True, "se_notifications_on_error_drop_exceeds_threshold_breach": True, "se_notifications_on_error_drop_threshold": 15, - "se_enable_streaming": False, # Required or tries to publish to kafka. } # end::gloabl_setup[] # tag::setup_and_load[] +from spark_expectations.config.user_config import Constants as user_config + spark.sql("DROP TABLE IF EXISTS local.magic_validation") spark.sql( """ @@ -43,57 +46,39 @@ error_drop_threshold INT )""" ) -spark.sql( - """ -create table if not exists local.pay_stats ( - product_id STRING, - table_name STRING, - input_count LONG, - error_count LONG, - output_count LONG, - output_percentage FLOAT, - success_percentage FLOAT, - error_percentage FLOAT, - source_agg_dq_results array>, - final_agg_dq_results array>, - source_query_dq_results array>, - final_query_dq_results array>, - row_dq_res_summary array>, - row_dq_error_threshold array>, - dq_status map, - dq_run_time map, - dq_rules map>, - meta_dq_run_id STRING, - meta_dq_run_date DATE, - meta_dq_run_datetime TIMESTAMP -);""" -) -rule_file = "./spark_expectations_sample_rules.json" +# Reminder: addFile does not handle directories well. 
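# The flow from here: ship the combined rules JSON to the executors with addFile, read it back
# through SparkFiles.get, and append it into local.magic_validation, the table the
# SparkExpectations instance below reads its rules from (filtered to product_id "pay").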
+rule_file = "spark_expectations_sample_rules.json" sc.addFile(rule_file) df = spark.read.json(SparkFiles.get(rule_file)) print(df) df.write.option("byname", "true").mode("append").saveAsTable("local.magic_validation") spark.read.table("local.magic_validation").show() + +# Can be used to point to your desired metastore. +se_writer = WrappedDataFrameWriter().mode("append").format("iceberg") + +rule_df = spark.sql("select * from local.magic_validation") + se: SparkExpectations = SparkExpectations( - product_id="pay", debugger=True # Used to filter which rules we apply + rules_df=rule_df, # See if we can replace this with the DF we wrote out. + product_id="pay", # We will only apply rules matching this product id + stats_table="local.dq_stats", + stats_table_writer=se_writer, + target_and_error_table_writer=se_writer, + stats_streaming_options={user_config.se_enable_streaming: False}, ) # end::setup_and_load[] +rule_df.show(truncate=200) -# tag::run_validation[] -# Only row data quality checking +# tag::run_validation_row[] @se.with_expectations( - se.reader.get_rules_from_table( - product_rules_table="local.magic_validation", - target_table_name="local.bonuses", - dq_stats_table_name="local.pay_stats", - ), - write_to_table=False, - row_dq=True, - # This does not work currently (Iceberg) - spark_conf={"format": "iceberg"}, - options={"format": "iceberg"}, - options_error_table={"format": "iceberg"}, + user_conf=se_conf, + write_to_table=False, # If set to true SE will write to the target table. + target_and_error_table_writer=se_writer, + # target_table is used to create the error table (e.g. here local.fake_table_name_error) + # and filter the rules on top of the global product filter. + target_table="local.fake_table_name", ) def load_data(): raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) @@ -101,5 +86,28 @@ def load_data(): return uk_df -data = load_data() -# end::run_validation[] +# data = load_data() +# end::run_validation_row[] + + +# tag::run_validation_complex[] +@se.with_expectations( + user_conf=se_conf, + write_to_table=True, # If set to true SE will write to the target table. + target_and_error_table_writer=se_writer, + # target_table is used to create the error table (e.g. here local.fake_table_name_error) + # and filter the rules on top of the global product filter. + target_table="local.3rd_fake", +) +def load_data2(): + raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) + uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") + return uk_df + + +data = load_data2() +# end::run_validation_complex[] + +spark.sql("SELECT table_name, error_percentage, * FROM local.dq_stats").show( + truncate=300 +) diff --git a/python/examples/spark_expectations_sample_rules.json b/python/examples/spark_expectations_sample_rules.json deleted file mode 100644 index fc24e69..0000000 --- a/python/examples/spark_expectations_sample_rules.json +++ /dev/null @@ -1 +0,0 @@ -{"product_id": "pay", "table_name": "local.bonuses", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. 
Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} diff --git a/python/requirements.txt b/python/requirements.txt index 9957d0f..2c77b23 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,11 +1,11 @@ spark-testing-base pandas pyarrow -pyspark +pyspark<3.5 pyspark-asyncactions pandera pandera[pyspark] -spark-expectations +spark-expectations>=1.0 venv-pack delta-spark requests diff --git a/run_pyspark_examples.sh b/run_pyspark_examples.sh index 5ce8912..a5cc7b8 100755 --- a/run_pyspark_examples.sh +++ b/run_pyspark_examples.sh @@ -27,6 +27,9 @@ export PYSPARK_DRIVER_PYTHON export PYTHON_PATH=./environment/bin/python #end::package_venv[] +# Some hack for our json magic +cat se*.json > spark_expectations_sample_rules.json + function run_example () { local ex="$1" # shellcheck disable=SC2046 diff --git a/se_complex.json b/se_complex.json new file mode 100644 index 0000000..f958d69 --- /dev/null +++ b/se_complex.json @@ -0,0 +1 @@ +{"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "query_dq", "rule": "history", "column_name": "MaleBonusPercent", "expectation": "(select count(*) from 3rd_fake_view) < (select input_count from local.dq_stats WHERE table_name='local.3rd_fake')", "action_if_failed": "fail", "tag": "", "description": "We should always have more records than before", "enable_for_source_dq_validation": false, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} diff --git a/se_simple.json b/se_simple.json new file mode 100644 index 0000000..72d9b86 --- /dev/null +++ b/se_simple.json @@ -0,0 +1 @@ +{"product_id": "pay", "table_name": "local.fake_table_name", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} From 61c0b31110a4e3e0276620c811db3635e5ad30d5 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 13 Oct 2023 18:29:53 -0700 Subject: [PATCH 29/53] Upgrade iceberg14 & spark 3.5 (#115) * Start upgrade to iceberg 1.4 * And Spark 3.5 * And update sbt build too. * Update spark testing base version to match. * Drop the word2vec simple MLlib example, Spark no longer depends on blas transitively. * Drop the BLAS import. * Fix cleanup * Bump to 1.4.5 to try and avoid the issue with metadata. 
* Go 1.4.7 --- build.sbt | 4 +- .../mllib/GoldilocksMLlib.scala | 37 ------------------- env_setup.sh | 6 ++- 3 files changed, 6 insertions(+), 41 deletions(-) diff --git a/build.sbt b/build.sbt index 671da8b..f950fa3 100644 --- a/build.sbt +++ b/build.sbt @@ -73,8 +73,8 @@ lazy val core = (project in file("core")) // regular scala code with @native met javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-Djna.nosys=true"), Test / javaOptions ++= specialOptions, // 2.4.5 is the highest version we have with the old spark-testing-base deps - sparkVersion := System.getProperty("sparkVersion", "3.3.0"), - sparkTestingVersion := "1.4.0", + sparkVersion := System.getProperty("sparkVersion", "3.5.0"), + sparkTestingVersion := "1.4.7", // additional libraries libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % sparkVersion.value, diff --git a/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala index cde64c7..f57b469 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala @@ -14,7 +14,6 @@ import org.apache.spark.mllib.linalg.{Vector => SparkVector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -import com.github.fommil.netlib.BLAS.{getInstance => blas} import com.highperformancespark.examples.dataframe._ //end::imports[] @@ -97,42 +96,6 @@ object GoldilocksMLlib { } //end::trainScaler[] - //tag::word2vecSimple[] - def word2vec(sc: SparkContext, rdd: RDD[String]): RDD[SparkVector] = { - // Tokenize our data - val tokenized = rdd.map(_.split(" ").toIterable) - // Construct our word2vec model - val wv = new Word2Vec() - val wvm = wv.fit(tokenized) - val wvmb = sc.broadcast(wvm) - // WVM can now transform single words - println(wvm.transform("panda")) - // Vector size is 100 - we use this to build a transformer on top of WVM that - // works on sentences. - val vectorSize = 100 - // The transform function works on a per-word basis, but we have - // sentences as input. - tokenized.map{words => - // If there is nothing in the sentence output a null vector - if (words.isEmpty) { - Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) - } else { - // If there are sentences construct a running sum of the - // vectors for each word - val sum = Array[Double](vectorSize) - words.foreach { word => - blas.daxpy( - vectorSize, 1.0, wvmb.value.transform(word).toArray, 1, sum, 1) - } - // Then scale it by the number of words - blas.dscal(sum.length, 1.0 / words.size, sum, 1) - // And wrap it in a Spark vector - Vectors.dense(sum) - } - } - } - //end::word2vecSimple[] - //tag::hashingTFPreserve[] def toVectorPerserving(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { val ht = new HashingTF() diff --git a/env_setup.sh b/env_setup.sh index a79213e..34fa427 100644 --- a/env_setup.sh +++ b/env_setup.sh @@ -3,12 +3,12 @@ # Download Spark and iceberg if not present SPARK_MAJOR=${SPARK_MAJOR:-"3.4"} -SPARK_VERSION=${SPARK_VERSION:-"3.4.1"} +SPARK_VERSION=${SPARK_VERSION:-"3.5.0"} SCALA_VERSION=${SCALA_VERSION:-"2.12"} HADOOP_VERSION="3" SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" -ICEBERG_VERSION=${ICEBERG_VERSION:-"1.3.1"} +ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"} if [ ! 
-f "${SPARK_FILE}" ]; then wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" & fi @@ -26,6 +26,8 @@ fi export SPARK_HOME="${SPARK_PATH}" if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then + # Delete the old JAR first. + rm "${SPARK_PATH}/jars/iceberg-spark-runtime*.jar" || echo "No old version to delete." cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" fi From 7f6f8b2ce3e56af97b3a0bd45e98e5c3bb1dbb7b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 14 Oct 2023 11:27:36 -0700 Subject: [PATCH 30/53] Start working on adding previous run data (#114) * Start working on adding previous run data * Fix up load previous run a bit * Drop large target size ex. * line breaks on log ops make sense to me. * in CI we don't have previous jobs * Use modern spark-testing-base for Python and use session not SQLCtx. --- python/examples/load_previous_run_data.py | 31 +++++++++++++++++++ .../examples/test_load_previous_run_data.py | 15 +++++++++ python/requirements.txt | 3 +- python/tox.ini | 12 +++---- 4 files changed, 53 insertions(+), 8 deletions(-) create mode 100644 python/examples/load_previous_run_data.py create mode 100644 python/examples/test_load_previous_run_data.py diff --git a/python/examples/load_previous_run_data.py b/python/examples/load_previous_run_data.py new file mode 100644 index 0000000..d927768 --- /dev/null +++ b/python/examples/load_previous_run_data.py @@ -0,0 +1,31 @@ +import os +import tempfile + + +class LoadPreviousRunData(object): + def __init__(self, session): + self.session = session + + def find_oldest_id(self, local_path): + """Find the oldest Spark job since it's probably not being updated.""" + directories = os.listdir(local_path) + return min(directories, key=lambda x: os.path.getmtime(f"{local_path}/{x}")) + + def do_magic(self): + local_path = "/tmp/spark-events" + event_log_path = f"file://{local_path}" + application_id = self.find_oldest_id(local_path) + return self.load_json_records(event_log_path, application_id) + + # tag::load[] + def load_json_records(self, event_log_path, application_id): + print(f"Loading {application_id}") + full_log_path = f"{event_log_path}/{application_id}" + df = self.session.read.json(full_log_path) + special_events = df.filter( + (df["Event"] == "SparkListenerExecutorAdded") + | (df["Event"] == "SparkListenerJobEnd") + ) + special_events.show() + + # end::load[] diff --git a/python/examples/test_load_previous_run_data.py b/python/examples/test_load_previous_run_data.py new file mode 100644 index 0000000..1f0ca31 --- /dev/null +++ b/python/examples/test_load_previous_run_data.py @@ -0,0 +1,15 @@ +from pyspark.sql.session import SparkSession +import os +import tempfile + +from sparktestingbase.sqltestcase import SQLTestCase +from .load_previous_run_data import LoadPreviousRunData + + +class TestLoadPreviousRunData(SQLTestCase): + def test_do_magic(self): + lprd = LoadPreviousRunData(self.session) + try: + lprd.do_magic() + except FileNotFoundError: + print("No previous jobs") diff --git a/python/requirements.txt b/python/requirements.txt index 2c77b23..75d55bb 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,11 +1,10 @@ spark-testing-base pandas pyarrow -pyspark<3.5 +pyspark==3.5.0 pyspark-asyncactions pandera pandera[pyspark] spark-expectations>=1.0 venv-pack -delta-spark requests diff --git a/python/tox.ini b/python/tox.ini index 4c90b8f..e661b21 100644 --- a/python/tox.ini +++ b/python/tox.ini @@ -4,7 +4,7 @@ isolated_build = True requires = tox-conda envlist = isort - 
py39 + py310 black mypy flake8 @@ -13,9 +13,9 @@ skip_missing_interpeters = true [gh-actions] python = - 3.9: py39 +# 3.9: py39 # We need a new version of PySpark w/3.10 support. -# 3.10: py310 + 3.10: py310 [testenv] setenv = @@ -29,9 +29,9 @@ extras = deps = pytest isort==4.3.21 - pyspark + pyspark==3.5.0 flake8 - spark-testing-base + spark-testing-base>=0.11.1 -rrequirements.txt commands = pytest examples \ @@ -56,7 +56,7 @@ deps = [testenv:flake8] extras = tests skipsdist = True -commands = flake8 --ignore=F403,E402,F401,F405 examples +commands = flake8 --ignore=F403,E402,F401,F405,W503 examples allowlist_externals = flake8 [testenv:mypy] From 512f712382db2fccba71521406315cd99c5c7f29 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 16 Oct 2023 20:19:12 -0700 Subject: [PATCH 31/53] Python Examples and Spark Expectations Examples fix (#118) * Add a row dq rule to se complex so that the agg dq can execute. * Get the Python examples running properly with the jar friend. * Use assembly magic * Style fix * Style fix * Mini fix * jspark not jSpark * Use core assembly * MOre fixing * Fix to seperate out tests from examples * Fix printing our the errors style --- .jvmopts | 4 +++ build.sbt | 32 +++++++++++++------ project/build.properties | 2 +- project/plugins.sbt | 2 ++ python/examples/SQLLineage.py | 6 ++-- python/examples/bad_pyspark.py.fail | 0 python/examples/simple_perf.py | 11 ++++--- python/examples/spark_expectations_example.py | 4 +-- .../spark_expectations_example.py.fail | 0 run_pyspark_examples.sh | 26 +++++++++++++-- se_complex.json | 3 +- 11 files changed, 66 insertions(+), 24 deletions(-) create mode 100644 .jvmopts create mode 100644 python/examples/bad_pyspark.py.fail create mode 100644 python/examples/spark_expectations_example.py.fail diff --git a/.jvmopts b/.jvmopts new file mode 100644 index 0000000..694a6c7 --- /dev/null +++ b/.jvmopts @@ -0,0 +1,4 @@ + -Xms4096M + -Xmx8096M + -Xss2M + -XX:MaxMetaspaceSize=4024M \ No newline at end of file diff --git a/build.sbt b/build.sbt index f950fa3..bd20c72 100644 --- a/build.sbt +++ b/build.sbt @@ -70,22 +70,22 @@ lazy val core = (project in file("core")) // regular scala code with @native met javacOptions ++= Seq("-source", "1.8", "-target", "1.8"), parallelExecution in Test := false, fork := true, - javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-Djna.nosys=true"), + javaOptions ++= Seq("-Xms4048M", "-Xmx4048M", "-Djna.nosys=true"), Test / javaOptions ++= specialOptions, // 2.4.5 is the highest version we have with the old spark-testing-base deps sparkVersion := System.getProperty("sparkVersion", "3.5.0"), sparkTestingVersion := "1.4.7", // additional libraries libraryDependencies ++= Seq( - "org.apache.spark" %% "spark-core" % sparkVersion.value, - "org.apache.spark" %% "spark-streaming" % sparkVersion.value, - "org.apache.spark" %% "spark-sql" % sparkVersion.value, - "org.apache.spark" %% "spark-hive" % sparkVersion.value, - "org.apache.spark" %% "spark-hive-thriftserver" % sparkVersion.value, - "org.apache.spark" %% "spark-catalyst" % sparkVersion.value, - "org.apache.spark" %% "spark-yarn" % sparkVersion.value, - "org.apache.spark" %% "spark-mllib" % sparkVersion.value, - "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_${sparkTestingVersion.value}", + "org.apache.spark" %% "spark-core" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-streaming" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-sql" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-hive" % 
sparkVersion.value % Provided, + "org.apache.spark" %% "spark-hive-thriftserver" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-yarn" % sparkVersion.value % Provided, + "org.apache.spark" %% "spark-mllib" % sparkVersion.value % Provided, + "com.holdenkarau" %% "spark-testing-base" % s"${sparkVersion.value}_${sparkTestingVersion.value}" % Test, //tag::scalaLogging[] "com.typesafe.scala-logging" %% "scala-logging" % "3.9.4", //end::scalaLogging[] @@ -105,3 +105,15 @@ ThisBuild / libraryDependencySchemes ++= Seq( "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always ) //end::xmlVersionConflict[] + +assemblyMergeStrategy in assembly := { + case x => MergeStrategy.first +} + +assemblyMergeStrategy in native := { + case x => MergeStrategy.first +} + +assemblyMergeStrategy in core := { + case x => MergeStrategy.first +} diff --git a/project/build.properties b/project/build.properties index 46e43a9..2743082 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.8.2 +sbt.version=1.9.6 diff --git a/project/plugins.sbt b/project/plugins.sbt index 8ea77fc..7c18949 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -23,3 +23,5 @@ ThisBuild / libraryDependencySchemes ++= Seq( "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always ) //end::xmlVersionConflict[] + +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3") diff --git a/python/examples/SQLLineage.py b/python/examples/SQLLineage.py index 71b7209..26bd0c4 100644 --- a/python/examples/SQLLineage.py +++ b/python/examples/SQLLineage.py @@ -32,10 +32,10 @@ def cutLineage(df): jRDD = df._jdf.toJavaRDD() jSchema = df._jdf.schema() jRDD.cache() - sqlCtx = df.sql_ctx - javaSparkSession = sqlCtx._jSparkSession + session = df.sparkSession + javaSparkSession = session._jsparkSession newJavaDF = javaSparkSession.createDataFrame(jRDD, jSchema) - newDF = DataFrame(newJavaDF, sqlCtx) + newDF = DataFrame(newJavaDF, session) return newDF diff --git a/python/examples/bad_pyspark.py.fail b/python/examples/bad_pyspark.py.fail new file mode 100644 index 0000000..e69de29 diff --git a/python/examples/simple_perf.py b/python/examples/simple_perf.py index 30ba2c7..1c72525 100644 --- a/python/examples/simple_perf.py +++ b/python/examples/simple_perf.py @@ -33,7 +33,7 @@ def generate_scale_data(sqlCtx, rows, numCols): """ # tag::javaInterop[] sc = sqlCtx._sc - javaSparkSession = sqlCtx._jSparkSession + javaSparkSession = sqlCtx._jsparkSession jsc = sc._jsc scalasc = jsc.sc() gateway = sc._gateway @@ -69,7 +69,7 @@ def runOnDF(df): def runOnRDD(rdd): result = ( - rdd.map(lambda x, y: (x, (y, 1))) + rdd.map(lambda xy: (xy[0], (xy[1], 1))) .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) .count() ) @@ -140,8 +140,11 @@ def parseArgs(args): Usage: simple_perf_test scalingFactor size """ - (scalingFactor, size) = parseArgs(sys.argv) - session = SparkSession.appName("SimplePythonPerf").builder.getOrCreate() + scalingFactor = 1 + size = 1 + if len(sys.argv) > 2: + (scalingFactor, size) = parseArgs(sys.argv) + session = SparkSession.builder.appName("SimplePythonPerf").getOrCreate() sc = session._sc run(sc, session, scalingFactor, size) diff --git a/python/examples/spark_expectations_example.py b/python/examples/spark_expectations_example.py index 7b03df4..d50f829 100644 --- a/python/examples/spark_expectations_example.py +++ b/python/examples/spark_expectations_example.py @@ -108,6 +108,4 @@ def load_data2(): data = 
load_data2() # end::run_validation_complex[] -spark.sql("SELECT table_name, error_percentage, * FROM local.dq_stats").show( - truncate=300 -) +spark.sql("SELECT * FROM local.3rd_fake_error").show(truncate=300) diff --git a/python/examples/spark_expectations_example.py.fail b/python/examples/spark_expectations_example.py.fail new file mode 100644 index 0000000..e69de29 diff --git a/run_pyspark_examples.sh b/run_pyspark_examples.sh index a5cc7b8..1010347 100755 --- a/run_pyspark_examples.sh +++ b/run_pyspark_examples.sh @@ -30,6 +30,22 @@ export PYTHON_PATH=./environment/bin/python # Some hack for our json magic cat se*.json > spark_expectations_sample_rules.json +function check_fail () { + local ex="$1" + local code="$2" + if [ -f "${ex}.fail" ]; then + echo "ok"; + else + exit "$code" + fi +} + +EXAMPLE_JAR="./core/target/scala-2.12/core-assembly-0.1.0-SNAPSHOT.jar" + +if [ ! -f "${EXAMPLE_JAR}" ]; then + sbt core/assembly +fi + function run_example () { local ex="$1" # shellcheck disable=SC2046 @@ -45,13 +61,19 @@ function run_example () { --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ $(cat "${ex}.conf" || echo "") \ --name "${ex}" \ - "${ex}" 2>&1 | tee -a "${ex}.out" || echo "ok" + --jars "${EXAMPLE_JAR}" \ + "${ex}" 2>&1 | tee -a "${ex}.out" || check_fail "$ex" $? } if [ $# -eq 1 ]; then run_example "python/examples/$1" else for ex in python/examples/*.py; do - run_example "$ex" + if [[ "$ex" =~ test.* ]]; then + echo "Skipping ex $ex as it is a test and covered by our tests." + else + echo "Running $ex" + run_example "$ex" + fi done fi diff --git a/se_complex.json b/se_complex.json index f958d69..f073e64 100644 --- a/se_complex.json +++ b/se_complex.json @@ -1 +1,2 @@ -{"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "query_dq", "rule": "history", "column_name": "MaleBonusPercent", "expectation": "(select count(*) from 3rd_fake_view) < (select input_count from local.dq_stats WHERE table_name='local.3rd_fake')", "action_if_failed": "fail", "tag": "", "description": "We should always have more records than before", "enable_for_source_dq_validation": false, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} +{"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. 
Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} +{"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "query_dq", "rule": "history", "column_name": "MaleBonusPercent", "expectation": "(select count(*) from 3rd_fake_view) > (select input_count from local.dq_stats WHERE table_name='local.3rd_fake' LIMIT 1)", "action_if_failed": "fail", "tag": "", "description": "We should always have more records than before", "enable_for_source_dq_validation": false, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} From 270804f67f7455c8d4159bc67292ce5fa0df1eac Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 16 Oct 2023 22:01:03 -0700 Subject: [PATCH 32/53] Fix iceberg setup and also fix the WAP SQL (+ file upstream bug) (#119) --- env_setup.sh | 4 ++-- sql/wap.sql | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/env_setup.sh b/env_setup.sh index 34fa427..2f4e834 100644 --- a/env_setup.sh +++ b/env_setup.sh @@ -2,8 +2,8 @@ # Download Spark and iceberg if not present -SPARK_MAJOR=${SPARK_MAJOR:-"3.4"} -SPARK_VERSION=${SPARK_VERSION:-"3.5.0"} +SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} +SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.0"} SCALA_VERSION=${SCALA_VERSION:-"2.12"} HADOOP_VERSION="3" SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" diff --git a/sql/wap.sql b/sql/wap.sql index ac513eb..6665c22 100644 --- a/sql/wap.sql +++ b/sql/wap.sql @@ -1,4 +1,5 @@ -CREATE TABLE IF NOT EXISTS local.projects ( +DROP TABLE IF EXISTS local.wap_projects; +CREATE TABLE local.wap_projects ( creator string, projectname string) USING iceberg @@ -6,9 +7,13 @@ PARTITIONED BY (creator); ALTER TABLE local.projects SET TBLPROPERTIES ( 'write.wap.enabled''true' ); -ALTER TABLE local.projects CREATE BRANCH IF NOT EXISTS `audit`; -SET spark.wap.branch = 'branch'; +-- We need a first commit, see https://github.com/apache/iceberg/issues/8849 +INSERT INTO local.wap_projects VALUES("holdenk", "spark"); +ALTER TABLE local.wap_projects DROP BRANCH IF EXISTS `audit-branch`; +ALTER TABLE local.wap_projects CREATE BRANCH `audit-branch`; +SET spark.wap.branch = 'audit-branch'; INSERT INTO local.projects VALUES("krisnova", "aurae"); -SELECT count(*) FROM local.projects VERSION AS OF 'audit' WHERE creator is NULL; --- This does not work until we upgrade to 3.5 + Iceberg 1.4. --- CALL local.system.fastForward("local.projects", "main", "audit-branch"); +SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator is NULL; +SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator == "krisnova"; +CALL local.system.remove_orphan_files(table => 'local.wap_projects'); +CALL local.system.fast_forward("local.wap_projects", "main", "audit-branch"); From caf753359470e674051634324af58b65ca8c93ad Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 26 Nov 2023 12:54:57 -0800 Subject: [PATCH 33/53] Load previous run data / metrics (#122) * Start working on adding previous run data * Fix up load previous run a bit * Drop large target size ex. * line breaks on log ops make sense to me. * in CI we don't have previous jobs * Use modern spark-testing-base for Python and use session not SQLCtx. 
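For context, a minimal driver for the LoadPreviousRunData helper introduced in the patches above could look like the sketch below. The import path and the presence of at least one application log under /tmp/spark-events are assumptions, not something the patches themselves guarantee.

    from pyspark.sql import SparkSession

    from examples.load_previous_run_data import LoadPreviousRunData  # assumed import path

    spark = SparkSession.builder.master("local[4]").getOrCreate()
    # Picks the oldest application id under /tmp/spark-events and shows its
    # SparkListenerExecutorAdded / SparkListenerJobEnd events from the event log.
    LoadPreviousRunData(spark).do_magic()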
From 834a8e4fb06590105cfac6f1574aaa8a8b29b6d9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 31 Mar 2024 22:09:07 -0700 Subject: [PATCH 34/53] Update the beyond the jvm chapter (#123) * Update git ignore. * Start adding some shims for running accelerators. * Try and setup velox Try and fix dependecny setup issue. Ok if we can't install libgoogle-glog lets see if we can still build. Try explicitilyy install libunwind-dev as suggested by https://github.com/kadalu-tech/pkgs/pull/2/files#r1001042597 Ok try and make velox optional. Get the build to include our GLuten UDF if and only if gluten is present Refactor gluten UDF build to produce a seperate object file so we can conditonally do things based on that. Start work to selectively integrate gluten into the examples. Add script to setup gluten Make some progress to integrating gluten into our examples. simplify build options. Start adding a Gluten 34 ex but it only works in old ubuntu anyways. Give up on Gluten on modern systems because I don't have time for that. But maybe we can get it to work with Clickhouse Style fixes. Work on trying to get something with gluten to run. bloop * Update accel stuff * Get Gluten + Spark3.4 to party (note: this fails because of Gluten segfault) re-enable gluten-sql-ex Add cache accel cache. Lets go for 3.5.1 Update shell for style Fix gluten jar dl Add full path for SPARK_PATH Only use pre-built for 20.04 Build deps with sudo Ignore incubator gluten More work getting gluten and comet * Fix comet resolution * Multiple extensions (Iceberg and Comet) * Turn on Comet shuffle * Style fixes * Use version 3.4.2 and also use setup rust action for speed * Seperate out setup comet so we can debug faster. * Setup jdk versions for happiness. * Change caching to make sense * Work around the classloader issue we found. * shellcheck fix. * Hmm why no version. * Fix version pass in for setup * Fix comet setup * Try and fix gluten build * Style fix and statically link * vcpkg * Try and fix vcpkg * meh vcpkg is kind of a pain, lets skip it. * Huzzah --driver-class-path does the trick. * Make setup_gluten_deps better formated for book inclusion * Tag the gluten setup * Disable gluten SQL * Tag comet example for inclusion. * Add Python UDF/UDAF examples. * Style fixes * Move SparkSession builder up * style fix * Fix typing import + pd.DF * Style fix * Use axel if present * Add mypy to tox.ini so we don't depend on it being in the system setup. * Fix axel command. 
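As a rough usage sketch for the pandas UDF examples added in python/examples/udf.py below (the sketch is not part of the patch itself; the cast to long, the null handling, and reusing the repo's pay-gap CSV are assumptions):

    from pyspark.sql import functions as F

    # classic_add1, pandas_add1 and pandas_sum are the UDFs defined in python/examples/udf.py;
    # spark is an already-built SparkSession like the one that file creates.
    uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True)
    ints = uk_df.select(F.col("MaleBonusPercent").cast("long").alias("mbp")).na.drop()
    ints.select(classic_add1("mbp"), pandas_add1("mbp")).show()  # row-at-a-time vs vectorized
    ints.agg(pandas_sum("mbp")).show()  # Series-to-scalar aggregate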
--- .github/workflows/ci.yml | 78 +++++++++++++++++- .gitignore | 10 +++ accelerators/comet_env_setup.sh | 20 +++++ accelerators/comet_ex.sh | 16 ++++ accelerators/gluten_config.properties | 5 ++ accelerators/gluten_env_setup.sh | 31 +++++++ accelerators/gluten_spark_34_ex.sh | 22 +++++ accelerators/install_rust_if_needed.sh | 9 ++ accelerators/run_gluten.sh | 3 + accelerators/setup_comet.sh | 27 ++++++ accelerators/setup_gluten_deps.sh | 14 ++++ accelerators/setup_gluten_from_src.sh | 23 ++++++ accelerators/setup_gluten_spark34.sh | 56 +++++++++++++ c | 2 + env_setup.sh | 18 +++- native/src/CMakeLists.txt | 23 +++++- native/src/c/gluten/GlutenUDF.cpp | 82 +++++++++++++++++++ python/examples/udf.py | 73 +++++++++++++++++ python/tox.ini | 2 + run_sql_examples.sh | 38 +++++++-- sql/gluten_only_nonpartitioned_table_join.sql | 12 +++ target-validator/runme.sh | 2 +- 22 files changed, 552 insertions(+), 14 deletions(-) create mode 100644 accelerators/comet_env_setup.sh create mode 100755 accelerators/comet_ex.sh create mode 100644 accelerators/gluten_config.properties create mode 100755 accelerators/gluten_env_setup.sh create mode 100755 accelerators/gluten_spark_34_ex.sh create mode 100644 accelerators/install_rust_if_needed.sh create mode 100755 accelerators/run_gluten.sh create mode 100755 accelerators/setup_comet.sh create mode 100755 accelerators/setup_gluten_deps.sh create mode 100755 accelerators/setup_gluten_from_src.sh create mode 100755 accelerators/setup_gluten_spark34.sh create mode 100644 c mode change 100644 => 100755 env_setup.sh create mode 100644 native/src/c/gluten/GlutenUDF.cpp create mode 100644 python/examples/udf.py create mode 100644 sql/gluten_only_nonpartitioned_table_join.sql diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9392efc..5746690 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,6 +64,76 @@ jobs: - name: Run sql examples run: ./run_sql_examples.sh + # run-gluten-sql-examples: + # runs-on: ubuntu-latest + # steps: + # - name: Checkout + # uses: actions/checkout@v2 + # - name: Cache Spark and friends + # uses: actions/cache@v3 + # with: + # path: | + # spark*.tgz + # iceberg*.jar + # key: spark-artifacts + # - name: Setup JDK + # uses: actions/setup-java@v3 + # with: + # distribution: temurin + # java-version: 17 + # - name: Cache Maven packages + # uses: actions/cache@v2 + # with: + # path: ~/.m2 + # key: ${{ runner.os }}-m2-gluten + # - name: Cache Data + # uses: actions/cache@v3 + # with: + # path: | + # data/fetched/* + # key: data-fetched + # - name: Run gluten + # run: + # cd accelerators; ./gluten_spark_34_ex.sh + run-comet-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Cache Spark and friends + uses: actions/cache@v3 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Cache Data + uses: actions/cache@v3 + with: + path: | + data/fetched/* + key: data-fetched + - name: Cache Maven packages + uses: actions/cache@v2 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-comet + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - name: Setup JDK + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: 17 + - name: Setup comet + run: + cd accelerators; SPARK_MAJOR=3.4 ./setup_comet.sh + - name: Run comet + run: + cd accelerators; ./comet_ex.sh run-target-examples: runs-on: ubuntu-latest steps: @@ -76,6 +146,12 @@ jobs: spark*.tgz iceberg*.jar key: spark-artifacts + 
- name: Cache Accel + uses: actions/cache@v3 + with: + path: | + accelerators/*.jar + key: accelerators-artifacts - name: Cache Data uses: actions/cache@v3 with: @@ -114,7 +190,7 @@ jobs: - name: Shellcheck run: | sudo apt-get install -y shellcheck - shellcheck $(find -name "*.sh") + shellcheck -e SC2317,SC1091,SC2034,SC2164 $(find -name "*.sh") - name: Setup JDK uses: actions/setup-java@v3 with: diff --git a/.gitignore b/.gitignore index 7b3e525..8d1365c 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,13 @@ spark_expectations_sample_rules.json # more python pyspark_venv.tar.gz pyspark_venv/ + +# accel stuff +accelerators/*.jar +accelerators/arrow-datafusion-comet +# ignore gluten +gluten +gluten*.jar +spark-3*hadoop*/ +spark-3*hadoop*.tgz +accelerators/incubator-gluten diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh new file mode 100644 index 0000000..3563f0e --- /dev/null +++ b/accelerators/comet_env_setup.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +SPARK_EXTRA=" +--jars ${COMET_JAR} \ +--driver-class-path ${COMET_JAR} \ +--conf spark.comet.enabled=true \ +--conf spark.comet.exec.enabled=true \ +--conf spark.comet.exec.all.enabled=true \ +--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ +--conf spark.comet.exec.shuffle.enabled=true \ +--conf spark.comet.columnar.shuffle.enabled=true" +# Instead of --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions we set +# EXTRA_EXTENSIONS so it can be appended to iceberg +if [ -z "$EXTRA_EXTENSIONS" ]; then + EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions" +else + EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions,$EXTRA_EXTENSIONS" +fi +export EXTRA_EXTENSIONS +export SPARK_EXTRA diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh new file mode 100755 index 0000000..cd08177 --- /dev/null +++ b/accelerators/comet_ex.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -ex + +# If you change this update the workflow version too. +SPARK_MAJOR=${SPARK_MAJOR:-3.4} +SPARK_VERSION=3.4.2 +export SPARK_MAJOR +export SPARK_VERSION + +source setup_comet.sh +pushd .. +source ./env_setup.sh +popd +source comet_env_setup.sh +pushd .. 
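# run_sql_examples.sh (updated later in this patch) appends the $SPARK_EXTRA flags and the
# $EXTRA_EXTENSIONS session extension exported by comet_env_setup.sh, so the SQL examples run
# with Comet layered on top of the usual Iceberg configuration.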
+USE_COMET="true" ./run_sql_examples.sh diff --git a/accelerators/gluten_config.properties b/accelerators/gluten_config.properties new file mode 100644 index 0000000..eab3946 --- /dev/null +++ b/accelerators/gluten_config.properties @@ -0,0 +1,5 @@ +spark.plugins=io.glutenproject.GlutenPlugin +spark.memory.offHeap.enabled=true +spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager +# This static allocation is one of the hardest part of using Gluten +spark.memory.offHeap.size=20g diff --git a/accelerators/gluten_env_setup.sh b/accelerators/gluten_env_setup.sh new file mode 100755 index 0000000..6bda6ec --- /dev/null +++ b/accelerators/gluten_env_setup.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Check if we gluten and gluten UDFs present +GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so +NATIVE_LIB_DIR=$(pwd)/../native/src/ +NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" +GLUTEN_HOME=incubator-gluten +source /etc/lsb-release +if [ -n "$GLUTEN_JAR_PATH" ]; then + GLUTEN_EXISTS="true" + GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ + --jars ${GLUTEN_JAR_PATH}" +fi +if [ -f "${NATIVE_LIB_PATH}" ]; then + if [ "$GLUTEN_EXISTS" == "true" ]; then + GLUTEN_UDF_EXISTS="true" + GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ + --conf spark.jars=${GLUTEN_JAR_PATH} \ + --conf spark.gluten.loadLibFromJar=true \ + --files ${NATIVE_LIB_PATH} \ + --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" + fi +fi +SPARK_EXTRA=GLUTEN_SPARK_EXTRA + +export SPARK_EXTRA +export GLUTEN_UDF_EXISTS +export GLUTEN_EXISTS diff --git a/accelerators/gluten_spark_34_ex.sh b/accelerators/gluten_spark_34_ex.sh new file mode 100755 index 0000000..0f98ab8 --- /dev/null +++ b/accelerators/gluten_spark_34_ex.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "${SCRIPT_DIR}" +source "${SCRIPT_DIR}/setup_gluten_spark34.sh" + +export SPARK_HOME +PATH="$(pwd)/${SPARK_DIR}/bin:$PATH" +export PATH +"${SPARK_HOME}/bin/spark-sql" --master local[5] \ + --conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --jars "${GLUTEN_JAR}" \ + --conf spark.eventLog.enabled=true \ + -e "SELECT 1" + +source gluten_env_setup.sh +cd .. +./run_sql_examples.sh || echo "Expected to fail" diff --git a/accelerators/install_rust_if_needed.sh b/accelerators/install_rust_if_needed.sh new file mode 100644 index 0000000..76826e8 --- /dev/null +++ b/accelerators/install_rust_if_needed.sh @@ -0,0 +1,9 @@ +#!/bin/bash +if [ -f "$HOME/.cargo/env" ]; then + source "$HOME/.cargo/env" +fi + +if ! 
command -v cargo; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source "$HOME/.cargo/env" +fi diff --git a/accelerators/run_gluten.sh b/accelerators/run_gluten.sh new file mode 100755 index 0000000..34ddb3b --- /dev/null +++ b/accelerators/run_gluten.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +"${SPARK_HOME}/bin/spark-shell" --master local --jars "${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar" --spark-properties=gluten_config.properties diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh new file mode 100755 index 0000000..a63f8eb --- /dev/null +++ b/accelerators/setup_comet.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -ex +source install_rust_if_needed.sh + +if [ -z "${SPARK_MAJOR}" ]; then + echo "Need a spark major version specified." + exit 1 +else + echo "Building comet for Spark ${SPARK_MAJOR}" +fi + +#tag::build[] +# If we don't have fusion checked out do it +if [ ! -d arrow-datafusion-comet ]; then + git clone https://github.com/apache/arrow-datafusion-comet.git +fi + +# Build JAR if not present +if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then + cd arrow-datafusion-comet + make clean release PROFILES="-Pspark-${SPARK_MAJOR}" + cd .. +fi +COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)" +export COMET_JAR +#end::build[] diff --git a/accelerators/setup_gluten_deps.sh b/accelerators/setup_gluten_deps.sh new file mode 100755 index 0000000..6472390 --- /dev/null +++ b/accelerators/setup_gluten_deps.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -ex + +sudo apt-get update +#tag::gluten_deps[] +sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential \ + llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev \ + libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev \ + libsodium-dev libsnappy-dev nasm +sudo apt install -y libunwind-dev +sudo apt-get install -y libgoogle-glog-dev +sudo apt-get -y install docker-compose +sudo apt-get install -y libre2-9 || sudo apt-get install -y libre2-10 +#end::gluten_deps[] diff --git a/accelerators/setup_gluten_from_src.sh b/accelerators/setup_gluten_from_src.sh new file mode 100755 index 0000000..4788e05 --- /dev/null +++ b/accelerators/setup_gluten_from_src.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -ex + +# Setup deps +source setup_gluten_deps.sh + +# Try gluten w/clickhouse +#if [ ! -d gluten ]; then +# git clone https://github.com/oap-project/gluten.git +# cd gluten +# bash ./ep/build-clickhouse/src/build_clickhouse.sh +#fi + +# Build gluten +if [ ! -d gluten ]; then + # We need Spark 3.5 w/scala212 + git clone git@github.com:holdenk/gluten.git || git clone https://github.com/holdenk/gluten.git + cd gluten + git checkout add-spark35-scala213-hack + ./dev/builddeps-veloxbe.sh + mvn clean package -Pbackends-velox -Pspark-3.5 -DskipTests + cd .. 
+fi diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh new file mode 100755 index 0000000..0cbfbc1 --- /dev/null +++ b/accelerators/setup_gluten_spark34.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +mkdir -p /tmp/spark-events +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ACCEL_JARS=${SCRIPT_DIR} +SPARK_MAJOR_VERSION=3.4 +SCALA_VERSION=${SCALA_VERSION:-"2.12"} + +set -ex + +# Note: this does not work on Ubuntu 23, only on 22 +# You might get something like: +# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 + + +SPARK_VERSION=3.4.2 +SPARK_MAJOR=3.4 +HADOOP_VERSION=3 +SPARK_DIR="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_FILE="${SPARK_DIR}.tgz" + +export SPARK_MAJOR +export SPARK_VERSION + +source setup_gluten_deps.sh + +cd .. +source /etc/lsb-release +# Pre-baked only +if [ "$DISTRIB_RELEASE" == "20.04" ]; then + source ./env_setup.sh + cd "${SCRIPT_DIR}" + + GLUTEN_JAR="gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" + GLUTEN_JAR_PATH="${SCRIPT_DIR}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" + + if [ ! -f "${GLUTEN_JAR_PATH}" ]; then + wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" || unset GLUTEN_JAR_PATH + fi + +fi +# Rather than if/else we fall through to build if wget fails because major version is not supported. +if [ -z "$GLUTEN_JAR_PATH" ]; then + #tag::build_gluten[] + if [ ! -d incubator-gluten ]; then + git clone https://github.com/apache/incubator-gluten.git + fi + cd incubator-gluten + sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON + mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests + GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar" + #end::build_gluten[] +fi + +export GLUTEN_JAR_PATH + diff --git a/c b/c new file mode 100644 index 0000000..cb4d93b --- /dev/null +++ b/c @@ -0,0 +1,2 @@ +bloop + diff --git a/env_setup.sh b/env_setup.sh old mode 100644 new mode 100755 index 2f4e834..50ff073 --- a/env_setup.sh +++ b/env_setup.sh @@ -1,16 +1,23 @@ #!/bin/bash +set -ex # Download Spark and iceberg if not present SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} -SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.0"} +SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.1"} SCALA_VERSION=${SCALA_VERSION:-"2.12"} HADOOP_VERSION="3" -SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"} if [ ! -f "${SPARK_FILE}" ]; then - wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" & + SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" + if command -v axel &> /dev/null + then + axel "$SPARK_DIST_URL" & + else + wget "$SPARK_DIST_URL" & + fi fi # Download Icberg if not present ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar" @@ -18,12 +25,14 @@ if [ ! -f "${ICEBERG_FILE}" ]; then wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & fi wait +sleep 1 # Setup the env if [ ! 
-d "${SPARK_PATH}" ]; then tar -xf "${SPARK_FILE}" fi -export SPARK_HOME="${SPARK_PATH}" +SPARK_HOME="${SPARK_PATH}" +export SPARK_HOME if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then # Delete the old JAR first. @@ -41,3 +50,4 @@ mkdir -p ./data/fetched/ if [ ! -f ./data/fetched/2021 ]; then wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021 fi + diff --git a/native/src/CMakeLists.txt b/native/src/CMakeLists.txt index 04acf78..e976645 100644 --- a/native/src/CMakeLists.txt +++ b/native/src/CMakeLists.txt @@ -18,6 +18,28 @@ set(PROJECT_VERSION_MAJOR 0) set(PROJECT_VERSION_MINOR 0) set(PROJECT_VERSION_PATCH 0) +set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) + +#tag::velox[] +set (GLUTEN_LIB_NAME ${PROJECT_NAME}-gluten-${PROJECT_VERSION_MAJOR}) +# For gluten+velox, you can leave out if not using gluten +set(GLUTEN_HOME ../../gluten) +set(CMAKE_FIND_DEBUG_MODE TRUE) +find_library(VELOX_LIBRARY NAMES velox HINTS + ${GLUTEN_HOME}/cpp/build/releases NO_DEFAULT_PATH) +# End gluten specific + +if(VELOX_LIBRARY) + file(GLOB GLUTEN_UDF_FILES + "./c/gluten/*.cpp") + add_library(${GLUTEN_LIB_NAME} SHARED ${GLUTEN_UDF_FILES}) + target_include_directories(${GLUTEN_LIB_NAME} PRIVATE ${GLUTEN_HOME}/cpp ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) + target_link_libraries(${GLUTEN_LIB_NAME} PRIVATE ${VELOX_LIBRARY}) +else() + message(WARNING "Velox library not found. Specific path not added.") +endif() +#end::velox[] + # Setup JNI find_package(JNI REQUIRED) if (JNI_FOUND) @@ -45,6 +67,5 @@ file(GLOB LIB_SRC # Setup installation targets # (required by sbt-jni) major version should always be appended to library name # -set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) add_library(${LIB_NAME} SHARED ${LIB_SRC}) install(TARGETS ${LIB_NAME} LIBRARY DESTINATION .) diff --git a/native/src/c/gluten/GlutenUDF.cpp b/native/src/c/gluten/GlutenUDF.cpp new file mode 100644 index 0000000..14019f4 --- /dev/null +++ b/native/src/c/gluten/GlutenUDF.cpp @@ -0,0 +1,82 @@ +// Filename MyUDF.cpp + +#include +#include +#include + + +namespace { +using namespace facebook::velox; + +template +class PlusConstantFunction : public exec::VectorFunction { + public: + explicit PlusConstantFunction(int32_t addition) : addition_(addition) {} + + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& /* outputType */, + exec::EvalCtx& context, + VectorPtr& result) const override { + using nativeType = typename TypeTraits::NativeType; + VELOX_CHECK_EQ(args.size(), 1); + + auto& arg = args[0]; + + // The argument may be flat or constant. 
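    // (In Velox a flat vector stores one value per row, while a constant vector holds a single
    // value reused for every row; both encodings are handled by the branches below.)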
+ VELOX_CHECK(arg->isFlatEncoding() || arg->isConstantEncoding()); + + BaseVector::ensureWritable(rows, createScalarType(), context.pool(), result); + + auto* flatResult = result->asFlatVector(); + auto* rawResult = flatResult->mutableRawValues(); + + flatResult->clearNulls(rows); + + if (arg->isConstantEncoding()) { + auto value = arg->as>()->valueAt(0); + rows.applyToSelected([&](auto row) { rawResult[row] = value + addition_; }); + } else { + auto* rawInput = arg->as>()->rawValues(); + + rows.applyToSelected([&](auto row) { rawResult[row] = rawInput[row] + addition_; }); + } + } + + private: + const int32_t addition_; +}; + +static std::vector> integerSignatures() { + // integer -> integer + return {exec::FunctionSignatureBuilder().returnType("integer").argumentType("integer").build()}; +} + +static std::vector> bigintSignatures() { + // bigint -> bigint + return {exec::FunctionSignatureBuilder().returnType("bigint").argumentType("bigint").build()}; +} + +} // namespace + +const int kNumMyUdf = 2; +gluten::UdfEntry myUdf[kNumMyUdf] = {{"myudf1", "integer"}, {"myudf2", "bigint"}}; + +DEFINE_GET_NUM_UDF { + return kNumMyUdf; +} + +DEFINE_GET_UDF_ENTRIES { + for (auto i = 0; i < kNumMyUdf; ++i) { + udfEntries[i] = myUdf[i]; + } +} + +DEFINE_REGISTER_UDF { + facebook::velox::exec::registerVectorFunction( + "myudf1", integerSignatures(), std::make_unique>(5)); + facebook::velox::exec::registerVectorFunction( + "myudf2", bigintSignatures(), std::make_unique>(5)); + std::cout << "registered myudf1, myudf2" << std::endl; +} diff --git a/python/examples/udf.py b/python/examples/udf.py new file mode 100644 index 0000000..f0d6a60 --- /dev/null +++ b/python/examples/udf.py @@ -0,0 +1,73 @@ +# This script triggers a number of different PySpark errors + +from pyspark.sql.session import SparkSession +from pyspark.sql.functions import pandas_udf, udf +from typing import Iterator +import sys +import pandas as pd + +global sc + + +# We need the session before we can use @udf +spark = SparkSession.builder.master("local[4]").getOrCreate() + + +# tag::simple_udf[] +@udf("long") +def classic_add1(e: int) -> int: + return e + 1 + + +# end::simple_udf[] + + +# tag::agg_new_udf[] +@pandas_udf("long") +def pandas_sum(s: pd.Series) -> int: + return s.sum() + + +# end::agg_new_udf[] + + +# tag::new_udf[] +@pandas_udf("long") +def pandas_add1(s: pd.Series) -> pd.Series: + # Vectorized operation on all of the elems in series at once + return s + 1 + + +# end::new_udf[] + + +# tag::complex_udf[] +@pandas_udf("long") +def pandas_nested_add1(d: pd.DataFrame) -> pd.Series: + # Takes a struct and returns the age elem + 1, if we wanted + # to update (e.g. return struct) we could update d and return it instead. 
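    # (The input column must be a struct with an "age" field; Spark hands each batch of
    # structs to the UDF as a pandas DataFrame with one column per field.)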
+ return d["age"] + 1 + + +# end::complex_udf[] + + +# tag::batches_of_batches_udf[] +@pandas_udf("long") +def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]: + my_db_connection = None # Expensive setup logic goes here + for s in t: + # Do something with your setup logic + if my_db_connection is None: + # Vectorized operation on all of the elems in series at once + yield s + 1 + + +# end::batches_of_batches_udf[] + + +if __name__ == "__main__": + # Make sure to make + # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" + # available as ./data/2021 + uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) diff --git a/python/tox.ini b/python/tox.ini index e661b21..330cd58 100644 --- a/python/tox.ini +++ b/python/tox.ini @@ -32,6 +32,7 @@ deps = pyspark==3.5.0 flake8 spark-testing-base>=0.11.1 + mypy -rrequirements.txt commands = pytest examples \ @@ -64,6 +65,7 @@ extras = tests passenv = * deps = pytest + mypy -rrequirements.txt setenv = {[testenv]setenv} diff --git a/run_sql_examples.sh b/run_sql_examples.sh index c054b31..9edd720 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -6,15 +6,21 @@ source env_setup.sh function run_example () { local sql_file="$1" - # shellcheck disable=SC2046 - spark-sql --master local[5] \ + local extra="$2" + EXTENSIONS=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + if [ -n "$EXTRA_EXTENSIONS" ]; then + EXTENSIONS="$EXTENSIONS,$EXTRA_EXTENSIONS" + fi + # shellcheck disable=SC2046,SC2086 + ${SPARK_HOME}/bin/spark-sql --master local[5] \ --conf spark.eventLog.enabled=true \ - --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.extensions=$EXTENSIONS \ --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ --conf spark.sql.catalog.spark_catalog.type=hive \ --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ --conf spark.sql.catalog.local.type=hadoop \ --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ + ${extra} ${SPARK_EXTRA} \ $(cat "${sql_file}.conf" || echo "") \ --name "${sql_file}" \ -f "${sql_file}" | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" @@ -25,12 +31,30 @@ function run_example () { # ${SPARK_PATH}/sbin/start-history-server.sh if [ $# -eq 1 ]; then - run_example "sql/$1" + if [[ "$1" != *"gluten_only"* ]]; then + run_example "sql/$1" + else + echo "Processing gluten ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + fi else # For each SQL for sql_file in sql/*.sql; do - echo "Processing ${sql_file}" - # shellcheck disable=SC2046 - run_example "$sql_file" + if [[ "$sql_file" != *"_only"* ]]; then + echo "Processing ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + elif [[ "$sql_file" != *"gluten_only"* && "$GLUTEN_EXISTS" == "true" ]]; then + echo "Processing gluten ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + elif [[ "$sql_file" != *"gluten_udf_only"* && "$GLUTEN_UDF_EXISTS" == "true" ]]; then + echo "Processing gluten UDF ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + else + echo "Skipping $sql_file since we did not find gluten and this is restricted example." 
+ fi done fi diff --git a/sql/gluten_only_nonpartitioned_table_join.sql b/sql/gluten_only_nonpartitioned_table_join.sql new file mode 100644 index 0000000..572437c --- /dev/null +++ b/sql/gluten_only_nonpartitioned_table_join.sql @@ -0,0 +1,12 @@ +CREATE TABLE IF NOT EXISTS local.udevelopers ( + username string, + firstname string, + lastname string) +USING iceberg; +CREATE TABLE IF NOT EXISTS local.uprojects ( + creator string, + uprojectname string) +USING iceberg; +INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); +INSERT INTO local.uprojects VALUES("krisnova", "aurae"); +SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; diff --git a/target-validator/runme.sh b/target-validator/runme.sh index 52ebe14..b6236dd 100755 --- a/target-validator/runme.sh +++ b/target-validator/runme.sh @@ -15,4 +15,4 @@ sbt -Dspark="${SPARK_VERSION}" clean assembly JAR_PATH="$(pwd)/target/scala-2.12/data-validator-assembly-${SPARK_VERSION}_0.15.0.jar" export JAR_PATH cd .. -spark-submit --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected." +"${SPARK_HOME}/bin/spark-submit" --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected." From 30bda21751004a5a5d571d3b56425b3aa98ddbb9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 6 Apr 2024 18:13:43 -0700 Subject: [PATCH 35/53] Update sql chapter examples to have $s etc. + add (optional) event log for them so we can get hist server (#124) * Add a bad regexp join and switch to mostly using $ in HappyPanda scala examples. * Use 1.5.1 * Ok lets run the history server for fun and profit --- README.md | 6 ++++ build.sbt | 4 +-- .../dataframe/HappyPandas.scala | 31 +++++++++++++------ .../dataframe/HappyPandasTest.scala | 11 +++++++ 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index ad8c8f6..85570e1 100644 --- a/README.md +++ b/README.md @@ -10,3 +10,9 @@ Most of the examples can be built with sbt, the C and Fortran components depend # Tests The full test suite depends on having the C and Fortran components built as well as a local R installation available. + +The most "accuate" way of seeing how we run the tests is to look at the .github workflows + +# History Server + +The history server can be a great way to figure out what's going on. You can set the SPARK_EVENTLOG=true before running the scala tests and you'll get the history server too! 
diff --git a/build.sbt b/build.sbt index bd20c72..4aa2905 100644 --- a/build.sbt +++ b/build.sbt @@ -73,8 +73,8 @@ lazy val core = (project in file("core")) // regular scala code with @native met javaOptions ++= Seq("-Xms4048M", "-Xmx4048M", "-Djna.nosys=true"), Test / javaOptions ++= specialOptions, // 2.4.5 is the highest version we have with the old spark-testing-base deps - sparkVersion := System.getProperty("sparkVersion", "3.5.0"), - sparkTestingVersion := "1.4.7", + sparkVersion := System.getProperty("sparkVersion", "3.5.1"), + sparkTestingVersion := "1.5.2", // additional libraries libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % sparkVersion.value % Provided, diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala index f9ce89e..f183bea 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -35,6 +35,9 @@ object HappyPandas { session } + val session = sparkSession() + import session.implicits._ + /** * Creates SQLContext with an existing SparkContext. */ @@ -110,8 +113,8 @@ object HappyPandas { */ def happyPandasPercentage(pandaInfo: DataFrame): DataFrame = { pandaInfo.select( - pandaInfo("place"), - (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy") + $"place", + ($"happyPandas" / $"totalPandas").as("percentHappy") ) } @@ -123,9 +126,9 @@ object HappyPandas { * @return Returns a DataFrame of pandaId and integer value for pandaType. */ def encodePandaType(pandaInfo: DataFrame): DataFrame = { - pandaInfo.select(pandaInfo("id"), - (when(pandaInfo("pt") === "giant", 0). - when(pandaInfo("pt") === "red", 1). + pandaInfo.select($"id", + (when($"pt" === "giant", 0). + when($"pt" === "red", 1). otherwise(2)).as("encodedType") ) } @@ -135,7 +138,7 @@ object HappyPandas { * Gets places with happy pandas more than minHappinessBound. */ def minHappyPandas(pandaInfo: DataFrame, minHappyPandas: Int): DataFrame = { - pandaInfo.filter(pandaInfo("happyPandas") >= minHappyPandas) + pandaInfo.filter($"happyPandas" >= minHappyPandas) } /** @@ -155,7 +158,7 @@ object HappyPandas { RawPanda(id, zip, pt, happy, attrs.toArray) }} pandaInfo.select( - (pandaInfo("attributes")(0) / pandaInfo("attributes")(1)) + ($"attributes"(0) / $"attributes"(1)) .as("squishyness")) //end::selectExplode[] } @@ -164,6 +167,7 @@ object HappyPandas { * Find pandas that are sad */ def sadPandas(pandaInfo: DataFrame): DataFrame = { + // This one is our intentional non $ example //tag::simpleFilter[] pandaInfo.filter(pandaInfo("happy") !== true) //end::simpleFilter[] @@ -175,7 +179,7 @@ object HappyPandas { def happyFuzzyPandas(pandaInfo: DataFrame): DataFrame = { //tag::complexFilter[] pandaInfo.filter( - pandaInfo("happy").and(pandaInfo("attributes")(0) > pandaInfo("attributes")(1)) + $"happy".and($"attributes"(0) > $"attributes"(1)) ) //end::complexFilter[] } @@ -184,7 +188,7 @@ object HappyPandas { * Gets places that contains happy pandas more than unhappy pandas. */ def happyPandasPlaces(pandaInfo: DataFrame): DataFrame = { - pandaInfo.filter(pandaInfo("happyPandas") >= pandaInfo("totalPandas") / 2) + pandaInfo.filter($"happyPandas" >= $"totalPandas" / 2) } @@ -311,11 +315,18 @@ object HappyPandas { //end::rightouterJoin[] //tag::leftsemiJoin[] - // Left semi join explicit + // Left semi join explicit. 
+ // Here we're explicit about which DF which col comes from given + // the shared name. df1.join(df2, df1("name") === df2("name"), "left_semi") //end::leftsemiJoin[] } + + def badComplexJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + df1.joinWith(df2, regexp(df1("name"), df2("name"))).alias("regexp join") + } + /** * Cut the lineage of a DataFrame which has too long a query plan. */ diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index 5621ee2..c9a9834 100644 --- a/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -19,6 +19,9 @@ import org.scalatest.funsuite.AnyFunSuite import org.scalatest.matchers.should.Matchers._ class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { + + override def appName: String = "happyPandasTest" + val toronto = "toronto" val sandiego = "san diego" val virginia = "virginia" @@ -48,6 +51,14 @@ class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { rez.foreach{x => assert(x(0) == x(1))} } + test("bad regexp join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.badComplexJoin(df1, df2).collect() + } + test("simple explode test") { val inputDF = sqlContext.createDataFrame(pandaPlaces) val pandaInfo = sqlContext.createDataFrame(rawPandaList) From 65eaf6945b1039bab6aee61c865dfbe8eccd91fb Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 21 Apr 2024 17:14:09 -0700 Subject: [PATCH 36/53] Update readme for history server details --- README.md | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 85570e1..3388374 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # high-performance-spark-examples Examples for High Performance Spark -We are in the progress of updata this for Spark 3.3+ and the 2ed edition of our book! +We are in the progress of updata this for Spark 3.5+ and the 2ed edition of our book! # Building @@ -15,4 +15,20 @@ The most "accuate" way of seeing how we run the tests is to look at the .github # History Server -The history server can be a great way to figure out what's going on. You can set the SPARK_EVENTLOG=true before running the scala tests and you'll get the history server too! +The history server can be a great way to figure out what's going on. + +By default the history server writes to `/tmp/spark-events` so you'll need to create that directory if not setup with + +`mkdir -p /tmp/spark-events` + +The scripts for running the examples generally run with the event log enabled. + +You can set the SPARK_EVENTLOG=true before running the scala tests and you'll get the history server too! + +e.g. 
+ +`SPARK_EVENTLOG=true sbt test` + +If you want to run just a specific test you can run [testOnly](https://www.scala-sbt.org/1.x/docs/Testing.html) + +Then to view the history server you'll want to launch it using the `${SPARK_HOME}/sbin/start-history-server.sh` then you [can go to your local history server](http://localhost:18080/) From f837e00d783eed5531c120b30ac031008abeb8fc Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 4 May 2024 22:42:14 -0700 Subject: [PATCH 37/53] When new Spark releases come out our old versions may go away and we should fall back to the "archive" in those cases. (#127) --- env_setup.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/env_setup.sh b/env_setup.sh index 50ff073..64ae94b 100755 --- a/env_setup.sh +++ b/env_setup.sh @@ -12,11 +12,12 @@ SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"} if [ ! -f "${SPARK_FILE}" ]; then SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" + SPARK_ARCHIVE_DIST_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" if command -v axel &> /dev/null then - axel "$SPARK_DIST_URL" & + (axel "$SPARK_DIST_URL" || axel "$SPARK_ARCHIVE_DIST_URL") & else - wget "$SPARK_DIST_URL" & + (wget "$SPARK_DIST_URL" || wget "$SPARK_ARCHIVE_DIST_URL") & fi fi # Download Icberg if not present From ae81ab783d16861c6339e2914c353c1d9697ed3f Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 3 May 2024 18:35:34 -0700 Subject: [PATCH 38/53] We don't need hive support for windows anymore yaaay --- .../high-performance-spark-examples/dataframe/HappyPandas.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala index f183bea..d2926e9 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -26,7 +26,7 @@ object HappyPandas { def sparkSession(): SparkSession = { //tag::createSparkSession[] val session = SparkSession.builder() - .enableHiveSupport() + //.enableHiveSupport() -- try disabling this .getOrCreate() // Import the implicits, unlike in core Spark the implicits are defined // on the context. From 7120b2bd1d9de972c47ecd13301a6da7db979928 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 25 Jun 2024 16:14:20 -0700 Subject: [PATCH 39/53] Sql extension plugin optimizer (#128) * WIP: Start adding a sample rule to introduce null filters for non-nullable magic. Note we have no tests but it compiles. 
* Redirect stedder as well * Fix the custom sample optimizer, switch to using transformUp instead of transformWithPruning * We don't care about bottom up or top down for this one * 86 the old way --- .../dataframe/LoadSave.scala | 4 +- .../NullabilityFilterOptimizer.scala | 28 +++++++++++ .../dataframe/SQLExtension.scala | 15 ++++++ .../dataframe/SQLExtensionTest.scala | 49 +++++++++++++++++++ run_sql_examples.sh | 4 +- 5 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala create mode 100644 core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala create mode 100644 core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala index 82be10f..b5f1ee3 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala @@ -89,10 +89,10 @@ case class LoadSave(sc: SparkContext, session: SparkSession) { //end::saveAppend[] def createJDBC() = { - //tag::createJDBC[] session.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", "table", new Properties) + //tag::createJDBC[] session.read.format("jdbc") .option("url", "jdbc:dialect:serverName") .option("dbtable", "table").load() @@ -100,10 +100,10 @@ case class LoadSave(sc: SparkContext, session: SparkSession) { } def writeJDBC(df: DataFrame) = { - //tag::writeJDBC[] df.write.jdbc("jdbc:dialect:serverName;user=user;password=pass", "table", new Properties) + //tag::writeJDBC[] df.write.format("jdbc") .option("url", "jdbc:dialect:serverName") .option("user", "user") diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala new file mode 100644 index 0000000..6b20271 --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala @@ -0,0 +1,28 @@ +/** + * Extension for the SparkSession to allow us to plug in a custom optimizer + */ + +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql.catalyst.optimizer._ +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.catalyst.trees.TreePattern._ +import org.apache.spark.sql.catalyst.expressions.{And, IsNotNull, NullIntolerant} + +object NullabilityFilterOptimizer extends Rule[LogicalPlan] { + + def apply(plan: LogicalPlan): LogicalPlan = { + plan.transform { + case p @ Project(projectList, projChild) => + val children = projectList.flatMap(_.children) + // If there are no null intolerant children don't worry about it + if (children.isEmpty) { + p + } else { + val filterCond = children.map(IsNotNull(_)).reduceLeft(And) + Project(projectList, Filter(filterCond, projChild)) + } + } + } +} diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala new file mode 100644 index 0000000..14e2072 --- /dev/null +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala @@ -0,0 +1,15 @@ +/** + * 
Extension for the SparkSession to allow us to plug in a custom optimizer + */ + +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql.{SparkSessionExtensions, SparkSessionExtensionsProvider} + +class SQLExtension extends SparkSessionExtensionsProvider { + override def apply(extensions: SparkSessionExtensions): Unit = { + // There are _many different_ types of rules you can inject, here we're focused on + // making things go fast so our sample is an optimizer rule (AQE rules could also make sense). + extensions.injectOptimizerRule(session => NullabilityFilterOptimizer) + } +} diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala new file mode 100644 index 0000000..91408fd --- /dev/null +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala @@ -0,0 +1,49 @@ +/** + * Happy Panda Example for DataFrames. + * Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.dataframe + +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import org.apache.spark.sql.execution.ExplainMode +import org.apache.spark.sql.types.IntegerType +import org.apache.spark.sql.functions.{lower, rand} +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +class SQLExtensionTest extends AnyFunSuite with ScalaDataFrameSuiteBase { + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.9))) + + override def conf: SparkConf = { + val initialConf = super.conf + initialConf.set( + "spark.sql.extensions", + "com.highperformancespark.examples.dataframe.SQLExtension") + } + + def explainToString(df: DataFrame): String = { + df.queryExecution.explainString(ExplainMode.fromString("extended")) + } + + test("Magic") { + import spark.implicits._ + val inputDF = spark.createDataFrame(rawPandaList) + spark.sql("DROP TABLE IF EXISTS farts") + inputDF.write.saveAsTable("farts") + val testDF = spark.read.table("farts") + val explained: String = explainToString(testDF.select($"zip".cast(IntegerType))) + explained should include ("isnotnull(zip#") + } +} diff --git a/run_sql_examples.sh b/run_sql_examples.sh index 9edd720..946abf4 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -4,6 +4,8 @@ set -o pipefail source env_setup.sh +# You might want to set SPARK_EXTRA to do things like log more info + function run_example () { local sql_file="$1" local extra="$2" @@ -23,7 +25,7 @@ function run_example () { ${extra} ${SPARK_EXTRA} \ $(cat "${sql_file}.conf" || echo "") \ --name "${sql_file}" \ - -f "${sql_file}" | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" + -f "${sql_file}" 2>&1 | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" } From b436ad7266f713c2db8ce661c806c2225b41d4f3 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 27 Jun 2024 17:52:17 -0700 Subject: [PATCH 40/53] Temporary pin back numpy before 2.0 (#129) --- python/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/requirements.txt b/python/requirements.txt 
index 75d55bb..6654dc9 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -8,3 +8,4 @@ pandera[pyspark] spark-expectations>=1.0 venv-pack requests +numpy<2.0 From 3fc8c62fc374c8b55cbfe74049d4e5f9fa78dbe6 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 1 Jul 2024 15:54:10 -0700 Subject: [PATCH 41/53] Verify filter pushdown through dataset typing (#130) * Start adding test for filter pushdown through datasets * Quick example that shows we still push the filter down to the datasourcev2 relation even with an as[] in the middle. Figured I'd add this since there was a blog post a thing about how Spark had problems with this awhile ago and some folks might still be avoiding typed datasets as a result. --- .../dataframe/PandaPlaceFilterPushdown.scala | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala new file mode 100644 index 0000000..17215ab --- /dev/null +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala @@ -0,0 +1,48 @@ +/** + * Happy Panda Example for DataFrames. + * Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.dataframe + +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Row +import org.apache.spark.sql.{SQLContext, SparkSession} +import org.apache.spark.sql.types._ + +import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo +import com.highperformancespark.examples.dataframe.HappyPandas.Pandas +import com.holdenkarau.spark.testing._ +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers._ + +case class ExtraMagic( + place: String, + pandaType: String, + happyPandas: Integer, + totalPandas: Integer, + extraInfo: Integer) + + +class PandaPlaceFilterPushdown extends AnyFunSuite with DataFrameSuiteBase { + + override def appName: String = "pandaPlaceFilterPushdown" + + val basicList = List( + ExtraMagic("a", "b", 1, 2, 3), + ExtraMagic("toronto", "b", 1, 2, 3), + ) + + test("simpleFilterTest") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(basicList) + val restrictedDF = inputDF.select($"place", $"pandaType", $"happyPandas", $"totalPandas") + val switched = inputDF.as[PandaInfo] + // Note if we write the filter with functional syntax it does not push down. 
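+    // e.g. switched.filter(p => p.place == "a") hands Catalyst an opaque lambda, so the
+    // predicate is evaluated after the scan instead of being pushed to the data source.
+    // The Column-based expression below stays analyzable and can still be pushed down.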
+ val filtered = switched.filter($"place" === "a") + assert(filtered.count() === 1) + } +} From 37c085c1d8418e87f419dca493e383a72bc1f728 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 8 Aug 2024 20:23:33 -0700 Subject: [PATCH 42/53] Mini tag updates for sql (#131) * Add some bad join examples so you can see how the different join conditions impact the join type * Tag badjoin okjoin magics * Fix up RDD broadcast join ex * Fix test ref --- .../dataframe/HappyPandas.scala | 17 +++++++++++++++++ .../goldilocks/RDDJoinExamples.scala | 10 ++++++---- .../dataframe/HappyPandasTest.scala | 16 ++++++++++++++++ .../goldilocks/JoinTest.scala | 2 +- 4 files changed, 40 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala index d2926e9..e9e708d 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -327,6 +327,23 @@ object HappyPandas { df1.joinWith(df2, regexp(df1("name"), df2("name"))).alias("regexp join") } + + //tag::badJoinMagic[] + def badJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + val session = df1.sparkSession + val sle = session.udf.register("strLenEq", (s: String, s2: String) => s.length() == s2.length()) + df1.joinWith(df2, sle(df1("name"), df2("name"))).alias("strlenEqJoin") + } + //end::badJoinMagic[] + + //tag::okJoin[] + def okJoin(df1: Dataset[Pandas], df2: Dataset[Pandas]): Dataset[(Pandas, Pandas)] = { + val session = df1.sparkSession + val sl = session.udf.register("strLen", (s: String) => s.length()) + df1.joinWith(df2, sl(df1("name")) === sl(df2("name"))).alias("strlenJoin") + } + //end::okJoin[] + /** * Cut the lineage of a DataFrame which has too long a query plan. */ diff --git a/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala index 1fef85c..d7024ae 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala @@ -95,7 +95,7 @@ object RDDJoinExamples { } /** - * Performs a broad cast hash join for two RDDs. + * Performs a broadcast hash join for two RDDs. 
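+   * The contents of smallRDD are collected to the driver with collectAsMap and then
+   * broadcast, so each partition of bigRDD is joined map-side without shuffling bigRDD.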
* @param bigRDD - the first rdd, should be the larger RDD * @param smallRDD - the small rdd, should be small enough to fit in memory * @tparam K - The type of the key @@ -103,8 +103,8 @@ object RDDJoinExamples { * @tparam V2 - The type of the values for the second array * @return */ - //tag::coreBroadCast[] - def manualBroadCastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, + //tag::coreBroadcast[] + def manualBroadcastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, V2 : ClassTag](bigRDD : RDD[(K, V1)], smallRDD : RDD[(K, V2)])= { val smallRDDLocal: Map[K, V2] = smallRDD.collectAsMap() @@ -113,11 +113,13 @@ object RDDJoinExamples { iter.flatMap{ case (k,v1 ) => smallRDDLocalBcast.value.get(k) match { + // Note: You could switch this to a left join by changing the empty seq + // to instead return Seq(k, Seq.empty[(V1, V2)]) case None => Seq.empty[(K, (V1, V2))] case Some(v2) => Seq((k, (v1, v2))) } } }, preservesPartitioning = true) } - //end:coreBroadCast[] + //end::coreBroadcast[] } diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index c9a9834..49b09b9 100644 --- a/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -59,6 +59,22 @@ class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { val result = HappyPandas.badComplexJoin(df1, df2).collect() } + test("bad udf join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.badJoin(df1, df2).collect() + } + + test("ok udf join") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val df1 = sqlCtx.createDataset(pandasList) + val df2 = sqlCtx.createDataset(pandasList) + val result = HappyPandas.okJoin(df1, df2).collect() + } + test("simple explode test") { val inputDF = sqlContext.createDataFrame(pandaPlaces) val pandaInfo = sqlContext.createDataFrame(rawPandaList) diff --git a/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala index d1729f8..ea0a16a 100644 --- a/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala @@ -14,7 +14,7 @@ class JoinTest extends AnyFunSuite with SharedSparkContext { sc.parallelize(keySet.flatMap{ letter => Range(1, 50).map(i => (letter, letter.hashCode() / i.toDouble))}) val result: RDD[(String, (Double, Int))] = - RDDJoinExamples.manualBroadCastHashJoin( + RDDJoinExamples.manualBroadcastHashJoin( largeRDD, smallRDD) val nativeJoin: RDD[(String, (Double, Int))] = largeRDD.join(smallRDD) From b3db591f87477188eec238e8f120cbff8c117852 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 12 Aug 2024 10:07:36 -0700 Subject: [PATCH 43/53] Iceberg class demo container (#132) * Start working on making docker container * Upgrade to latest Iceberg, ignore coursier file, install Scala Jupyter kernel * Install correct jupyter * We want the scala kernel to point to Spark & include the class path for the examples * More progress on the container * Ok we need to use 2.13.8 since using .14 gives us some cats issues, roll back kernel to match .8, cross-mount iceberg-workshop, forward 8877, do 
some tricks for faster container builds * Make directories for cross mount if not present, add jupyter-lab launch to bash history for folks that want to launch bash and then they can easily up arrow (and by folks I mean me) * Shellcheck fixes for build container script. * Use axel quietly so I can find things * Match scala version of Spark * More shellcheck fixes * Make the wgets quiet too. * oops missed one. * Fix scala version * Update for Spark 4 / Scala 2.13 * Bump sbt version (note see the spark-400 branch for the cherry picked parts) * Use 2.13 target * Match comet to regular build --- .github/workflows/ci.yml | 2 +- .gitignore | 9 +++ Dockerfile | 70 +++++++++++++++++++ accelerators/comet_ex.sh | 4 +- accelerators/setup_comet.sh | 2 +- build.sbt | 14 ++-- build_container.sh | 20 ++++++ .../dataframe/MixedDataset.scala | 3 +- .../dataframe/RawPandas.scala | 2 +- .../goldilocks/GoldilocksFirstTry.scala | 7 +- .../goldilocks/GoldilocksWithHashMap.scala | 4 +- .../ml/CustomPipeline.scala | 3 - .../ml/SimpleNaiveBayes.scala | 3 - .../ml/SimplePipeline.scala | 5 -- .../mllib/GoldilocksMLlib.scala | 3 - .../perf/SimplePerfTest.scala | 6 +- .../dataframe/HappyPandasTest.scala | 2 +- .../QuantileOnlyArtisanalTest.scala | 6 +- env_setup.sh | 14 ++-- misc/kernel.json | 19 +++++ project/build.properties | 2 +- run_container.sh | 7 ++ run_pyspark_examples.sh | 7 +- 23 files changed, 169 insertions(+), 45 deletions(-) create mode 100644 Dockerfile create mode 100755 build_container.sh create mode 100644 misc/kernel.json create mode 100755 run_container.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5746690..3527e31 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -130,7 +130,7 @@ jobs: java-version: 17 - name: Setup comet run: - cd accelerators; SPARK_MAJOR=3.4 ./setup_comet.sh + cd accelerators; SPARK_MAJOR=3.5 ./setup_comet.sh - name: Run comet run: cd accelerators; ./comet_ex.sh diff --git a/.gitignore b/.gitignore index 8d1365c..127fbe2 100644 --- a/.gitignore +++ b/.gitignore @@ -95,3 +95,12 @@ gluten*.jar spark-3*hadoop*/ spark-3*hadoop*.tgz accelerators/incubator-gluten +# ignore the temporary myapp from the dockerbuild +myapp.tar +# ignore glutten +incubator-glutten/* +# ignore nested build file. 
+project/build.sbt +coursier +# Magic file we use for build tracking +oldhash \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3616b4a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,70 @@ +# Open JDK11, Spark 3.X and the latest JDKs get a little spicy +FROM azul/zulu-openjdk:11-latest + +RUN apt-get -qq update && \ + apt-get -qq -y upgrade && \ + apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop && \ + locale-gen en_US.UTF-8 && \ + apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \ + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \ + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \ + curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \ + chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \ + apt-get update && \ + apt-get -qq -y install sbt && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -Lo coursier https://git.io/coursier-cli +RUN chmod +x coursier +# ensure the JAR of the CLI is in the coursier cache, in the image +RUN ./coursier --help +RUN pip install jupyter +RUN ./coursier bootstrap \ + -r jitpack \ + -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \ + sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \ + --default=true --sources \ + -o almond && \ + ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" +RUN chmod a+xr almond coursier +RUN ./coursier launch almond --scala 2.13.8 -- --install +# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3 +#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8 +RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb + +RUN adduser dev +RUN adduser dev sudo +RUN echo 'dev:dev' | chpasswd +RUN mkdir -p ~dev +RUN cp ./coursier ~dev/ +RUN echo "color_prompt=yes" >> ~dev/.bashrc +RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc +RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.1-bin-hadoop3" >> ~dev/.bashrc +RUN chown -R dev ~dev +USER dev +# Kernels are installed in user so we need to run as the user +RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" +RUN ./coursier launch almond --scala 2.13.8 -- --install +USER root + +RUN mkdir /high-performance-spark-examples +RUN chown -R dev /high-performance-spark-examples +WORKDIR /high-performance-spark-examples +# Increase the chance of caching by copying just the env setup file first. 
+COPY --chown=dev:dev env_setup.sh ./ +# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place +# Also downloads some test data +RUN SCALA_VERSION=2.13 ./env_setup.sh +RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back +# Note: We need to use /home in the COPY otherwise no happy pandas +COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new +RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json +RUN git clone https://github.com/holdenk/spark-upgrade.git +RUN chown -R dev /high-performance-spark-examples +ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/ +RUN chown -R dev /high-performance-spark-examples +USER dev +RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history +RUN sbt clean compile +CMD ["jupyter-lab", "--ip", "0.0.0.0", "--port", "8877"] + diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh index cd08177..268a4dc 100755 --- a/accelerators/comet_ex.sh +++ b/accelerators/comet_ex.sh @@ -2,8 +2,8 @@ set -ex # If you change this update the workflow version too. -SPARK_MAJOR=${SPARK_MAJOR:-3.4} -SPARK_VERSION=3.4.2 +SPARK_MAJOR=${SPARK_MAJOR:-3.5} +SPARK_VERSION=${SPARK_MAJOR}.1 export SPARK_MAJOR export SPARK_VERSION diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh index a63f8eb..2020cb6 100755 --- a/accelerators/setup_comet.sh +++ b/accelerators/setup_comet.sh @@ -19,7 +19,7 @@ fi # Build JAR if not present if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then cd arrow-datafusion-comet - make clean release PROFILES="-Pspark-${SPARK_MAJOR}" + make clean release PROFILES="-Pspark-${SPARK_MAJOR} -Pscala-2.13" cd .. fi COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)" diff --git a/build.sbt b/build.sbt index 4aa2905..81c971e 100644 --- a/build.sbt +++ b/build.sbt @@ -1,3 +1,5 @@ +scalaVersion := "2.13.8" + lazy val root = (project in file(".")) .aggregate(core, native) @@ -5,15 +7,15 @@ lazy val root = (project in file(".")) organization := "com.highperformancespark" //tag::addSparkScalaFix[] -ThisBuild / scalafixDependencies += - "com.holdenkarau" %% "spark-scalafix-rules-2.4.8" % "0.1.5" -ThisBuild / scalafixDependencies += - "com.github.liancheng" %% "organize-imports" % "0.6.0" +// Needs to be commented out post-upgrade because of Scala versions. +//ThisBuild / scalafixDependencies += +// "com.holdenkarau" %% "spark-scalafix-rules-2.4.8" % "0.1.5" +//ThisBuild / scalafixDependencies += +// "com.github.liancheng" %% "organize-imports" % "0.6.0" //end::addSparkScalaFix[] lazy val V = _root_.scalafix.sbt.BuildInfo -scalaVersion := V.scala212 addCompilerPlugin(scalafixSemanticdb) scalacOptions ++= List( "-Yrangepos", @@ -67,6 +69,7 @@ lazy val core = (project in file("core")) // regular scala code with @native met .settings(javah / target := (native / nativeCompile / sourceDirectory).value / "include") .settings(sbtJniCoreScope := Compile) .settings( + scalaVersion := "2.13.8", javacOptions ++= Seq("-source", "1.8", "-target", "1.8"), parallelExecution in Test := false, fork := true, @@ -97,6 +100,7 @@ lazy val core = (project in file("core")) // regular scala code with @native met // JNI Magic! 
lazy val native = (project in file("native")) // native code and build script .settings(nativeCompile / sourceDirectory := sourceDirectory.value) + .settings(scalaVersion := "2.13.8") .enablePlugins(JniNative) // JniNative needs to be explicitly enabled //tag::xmlVersionConflict[] diff --git a/build_container.sh b/build_container.sh new file mode 100755 index 0000000..1c550c7 --- /dev/null +++ b/build_container.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -ex + +cp .git/index /tmp/git_index +export GIT_INDEX_FILE=/tmp/git_index +git add -u +hash=$(git write-tree) +unset GIT_INDEX_FILE +oldhash=$(cat oldhash || true) +if [ "$hash" = "$oldhash" ] && [ -f myapp.tar ]; then + echo "Skipping making tar since we match." +else + echo "Making tar since no match" + git archive -o myapp.tar --format=tar HEAD + echo "$hash" > oldhash +fi +IMAGE=holdenk/hps:0.1 +docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push +#docker buildx build --platform=linux/amd64 -t "${IMAGE}" . --push diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala index 8943ba7..b74e1cb 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala @@ -67,9 +67,10 @@ class MixedDataset(sqlCtx: SQLContext) { //tag::maxPandaSizePerZipScala[] def maxPandaSizePerZipScala(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { - ds.groupByKey(rp => rp.zip).mapGroups{ case (g, iter) => + def groupMapFun(g: String, iter: Iterator[RawPanda]): (String, Double) = { (g, iter.map(_.attributes(2)).reduceLeft(Math.max(_, _))) } + ds.groupByKey(rp => rp.zip).mapGroups(groupMapFun) } //end::maxPandaSizePerZipScala[] diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala index b1d64dc..c7cf0ca 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala @@ -14,7 +14,7 @@ case class RawPanda(id: Long, zip: String, pt: String, happy: Boolean, attributes: Array[Double]) { override def equals(o: Any) = o match { case other: RawPanda => (id == other.id && pt == other.pt && - happy == other.happy && attributes.deep == other.attributes.deep) + happy == other.happy && attributes.sameElements(other.attributes)) case _ => false } override def hashCode(): Int = { diff --git a/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala index 341364e..afcdeb8 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala @@ -2,7 +2,6 @@ package com.highperformancespark.examples.goldilocks import scala.collection.Map import scala.collection.mutable -import scala.collection.mutable.MutableList import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame @@ -254,7 +253,7 @@ object GoldilocksFirstTry { // to sort the partitionsColumnsFreq array by the partition index (the // first value in the tuple). 
partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq) => - val relevantIndexList = new MutableList[(Int, Long)]() + val relevantIndexList = new mutable.ListBuffer[(Int, Long)]() columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => val runningTotalCol = runningTotal(colIndex) @@ -293,8 +292,8 @@ object GoldilocksFirstTry { (partitionIndex : Int, valueColumnPairs : Iterator[(Double, Int)]) => { val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 if (targetsInThisPart.nonEmpty) { - val columnsRelativeIndex: Map[Int, List[Long]] = - targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsRelativeIndex: collection.MapView[Int, List[Long]] = + targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) val columnsInThisPart = targetsInThisPart.map(_._1).distinct val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() diff --git a/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala index 9dd365b..2097d02 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala @@ -174,7 +174,7 @@ object GoldilocksWithHashMap { val runningTotal = Array.fill[Long](numOfColumns)(0) partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq)=> - val relevantIndexList = new mutable.MutableList[(Int, Long)]() + val relevantIndexList = new mutable.ListBuffer[(Int, Long)]() columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => val runningTotalCol = runningTotal(colIndex) @@ -303,7 +303,7 @@ object FindTargetsSubRoutine extends Serializable { def withArrayBuffer(valueColumnPairsIter : Iterator[((Double, Int), Long)], targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { - val columnsRelativeIndex: Predef.Map[Int, List[Long]] = + val columnsRelativeIndex: collection.MapView[Int, List[Long]] = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) // The column indices of the pairs that are desired rank statistics that live in diff --git a/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala index bd84cc0..9fdef43 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -1,9 +1,6 @@ package com.highperformancespark.examples.ml import scala.collection.Map -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.MutableList import org.apache.spark._ import org.apache.spark.ml._ diff --git a/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 6b1e55e..6a88a77 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -1,9 +1,6 @@ package com.highperformancespark.examples.ml import scala.collection.Map -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.MutableList import org.apache.spark._ import org.apache.spark.ml._ diff --git 
a/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala index d161563..b233693 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala @@ -1,10 +1,5 @@ package com.highperformancespark.examples.ml -import scala.collection.Map -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.MutableList - import org.apache.spark._ import org.apache.spark.ml._ import org.apache.spark.ml.classification._ diff --git a/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala index f57b469..3fab009 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala @@ -1,9 +1,6 @@ package com.highperformancespark.examples.mllib import scala.collection.Map -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable.MutableList import org.apache.spark._ import org.apache.spark.mllib.classification.LogisticRegressionModel diff --git a/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala index 197c6b7..5a06ff6 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala @@ -64,9 +64,9 @@ object SimplePerfTest { println(dataFrameTimeings.map(_._2).mkString(",")) } - def testOnRDD(rdd: RDD[(Int, Double)]) = { - rdd.map{case (x, y) => (x, (y, 1))} - .reduceByKey{case (x, y) => (x._1 + y._1, x._2 + y._2)}.count() + def testOnRDD(rdd: RDD[(Int, Double)]): Long = { + val kvc: RDD[(Int, (Double , Int))] = rdd.map{case (x, y) => (x, (y, 1))} + kvc.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)).count() } def groupOnRDD(rdd: RDD[(Int, Double)]) = { diff --git a/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala index 49b09b9..854fc4e 100644 --- a/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -261,7 +261,7 @@ class HappyPandasTest extends AnyFunSuite with DataFrameSuiteBase { .flatMap(zipPandas => { val pandas = zipPandas._2 val length = pandas.size - 1 - val result = new mutable.MutableList[Row] + val result = new mutable.ListBuffer[Row] for (i <- 0 to length) { var totalSum = 0 diff --git a/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala index ed5f9b2..9213016 100644 --- a/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala +++ b/core/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala @@ -42,8 +42,8 @@ class QuantileOnlyArtisanalTest extends AnyFunSuite with BeforeAndAfterAll { val inputAsKeyValuePairs = 
GoldilocksGroupByKey.mapToKeyValuePairs(input) val groupByKeySolution = GoldilocksGroupByKey.findRankStatistics( inputAsKeyValuePairs, List(2L,3L)).mapValues(_.toSet) - assert(whileLoopSolution == expectedResult) - assert(groupByKeySolution == expectedResult) + assert(whileLoopSolution.toMap == expectedResult) + assert(groupByKeySolution.toMap == expectedResult) } override def afterAll() { @@ -136,7 +136,7 @@ class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { val secondarySortSolution = GoldilocksWithHashMap.findRankStatistics( input, targetRanks = List(2L, 3L)).mapValues(_.toSet) - assert(secondarySortSolution == expectedResult) + assert(secondarySortSolution.toMap == expectedResult) } test("Secondary Sort"){ diff --git a/env_setup.sh b/env_setup.sh index 64ae94b..b027a9e 100755 --- a/env_setup.sh +++ b/env_setup.sh @@ -5,25 +5,29 @@ set -ex # Download Spark and iceberg if not present SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.1"} -SCALA_VERSION=${SCALA_VERSION:-"2.12"} +SCALA_VERSION=${SCALA_VERSION:-"2.13"} HADOOP_VERSION="3" SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" -ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"} +if [ "$SCALA_VERSION" = "2.13" ]; then + SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3-scala2.13.tgz" + SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala2.13" +fi +ICEBERG_VERSION=${ICEBERG_VERSION:-"1.6.0"} if [ ! -f "${SPARK_FILE}" ]; then SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" SPARK_ARCHIVE_DIST_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" if command -v axel &> /dev/null then - (axel "$SPARK_DIST_URL" || axel "$SPARK_ARCHIVE_DIST_URL") & + (axel --quiet "$SPARK_DIST_URL" || axel --quiet "$SPARK_ARCHIVE_DIST_URL") & else - (wget "$SPARK_DIST_URL" || wget "$SPARK_ARCHIVE_DIST_URL") & + (wget --quiet "$SPARK_DIST_URL" || wget --quiet "$SPARK_ARCHIVE_DIST_URL") & fi fi # Download Icberg if not present ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar" if [ ! 
-f "${ICEBERG_FILE}" ]; then - wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & + wget --quiet "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & fi wait sleep 1 diff --git a/misc/kernel.json b/misc/kernel.json new file mode 100644 index 0000000..d5575f8 --- /dev/null +++ b/misc/kernel.json @@ -0,0 +1,19 @@ +{ + "argv": [ + "java", + "-cp", + "/home/dev/.local/share/jupyter/kernels/scala2.13/launcher.jar:.:/high-performance-spark-examples/:/high-performance-spark-examples/target/scala-2.13/home/dev/.local/share/jupyter/kernels/scala/launcher.jar:/high-performance-spark-examples/spark-3.5.1-bin-hadoop3-scala2.13/jars/*", + "coursier.bootstrap.launcher.Launcher", + "--log", + "info", + "--metabrowse", + "--id", + "scala2.13", + "--display-name", + "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "--connection-file", + "{connection_file}" + ], + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala" +} diff --git a/project/build.properties b/project/build.properties index 2743082..04267b1 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.9.6 +sbt.version=1.9.9 diff --git a/run_container.sh b/run_container.sh new file mode 100755 index 0000000..37d47aa --- /dev/null +++ b/run_container.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -ex +./build_container.sh +docker image pull holdenk/hps:0.1 +mkdir -p warehouse +mkdir -p iceberg-workshop +docker container run --mount type=bind,source="$(pwd)"/warehouse,target=/warehouse --mount type=bind,source="$(pwd)/iceberg-workshop",target=/high-performance-spark-examples/iceberg-workshop -p 8877:8877 -it holdenk/hps:0.1 /bin/bash diff --git a/run_pyspark_examples.sh b/run_pyspark_examples.sh index 1010347..6ab0546 100755 --- a/run_pyspark_examples.sh +++ b/run_pyspark_examples.sh @@ -40,12 +40,17 @@ function check_fail () { fi } -EXAMPLE_JAR="./core/target/scala-2.12/core-assembly-0.1.0-SNAPSHOT.jar" +EXAMPLE_JAR="./core/target/scala-2.13/core-assembly-0.1.0-SNAPSHOT.jar" if [ ! -f "${EXAMPLE_JAR}" ]; then sbt core/assembly fi +if [ ! -f "${EXAMPLE_JAR}" ]; then + echo "Can't find sample jar?!?" + exit 1 +fi + function run_example () { local ex="$1" # shellcheck disable=SC2046 From e2fecda09c033971b35a41f1c2b21fb72470932c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 15 Aug 2024 00:15:48 -0700 Subject: [PATCH 44/53] Add solution (#133) * Start working on sol nb * Finish update to Spark 3.5.2, download 2022 & 2023 data, install Python-is-Python3 so we can run the example. * Add solution * Ignore incubator-gluten * Update workshop NB * Install pyarrow & pyiceberg for folks who want to poke around at the parquet files. * Update solutions * More update * Forward the Spark UI for ze query plans. * Update solution * Update solutions * Re-enable cross build, use launch script * Make exec * Lets make a slimmed down container for folks who need it. * Fix spark home slim down mini some more eh wait we don't need a root user install of scala. 
oops we do need it * Tag mini image seperately * Stack them * Avoid pip cache dir * Don't keep the spark tarball in the image * Seperate out build from run * shell check fixes --- .gitignore | 7 +- Dockerfile | 69 +-- Dockerfile-mini | 69 +++ build_container.sh | 7 +- env_setup.sh | 8 +- .../Workshop-Template.ipynb | 552 +++++++++++++++++ iceberg-workshop-solutions/Workshop.ipynb | 577 ++++++++++++++++++ misc/container_launch.sh | 5 + misc/kernel.json | 2 +- run_container.sh | 9 +- 10 files changed, 1231 insertions(+), 74 deletions(-) create mode 100644 Dockerfile-mini create mode 100644 iceberg-workshop-solutions/Workshop-Template.ipynb create mode 100644 iceberg-workshop-solutions/Workshop.ipynb create mode 100755 misc/container_launch.sh diff --git a/.gitignore b/.gitignore index 127fbe2..3068584 100644 --- a/.gitignore +++ b/.gitignore @@ -103,4 +103,9 @@ incubator-glutten/* project/build.sbt coursier # Magic file we use for build tracking -oldhash \ No newline at end of file +oldhash +# ignore ipynb checkpoints +.ipynb_checkpoints/ + +# ignore accel +incubator-gluten/ diff --git a/Dockerfile b/Dockerfile index 3616b4a..2d1d0fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,70 +1,7 @@ -# Open JDK11, Spark 3.X and the latest JDKs get a little spicy -FROM azul/zulu-openjdk:11-latest +ARG base +FROM $base -RUN apt-get -qq update && \ - apt-get -qq -y upgrade && \ - apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop && \ - locale-gen en_US.UTF-8 && \ - apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \ - echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \ - echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \ - curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \ - chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \ - apt-get update && \ - apt-get -qq -y install sbt && \ - rm -rf /var/lib/apt/lists/* - -RUN curl -Lo coursier https://git.io/coursier-cli -RUN chmod +x coursier -# ensure the JAR of the CLI is in the coursier cache, in the image -RUN ./coursier --help -RUN pip install jupyter -RUN ./coursier bootstrap \ - -r jitpack \ - -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \ - sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \ - --default=true --sources \ - -o almond && \ - ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" -RUN chmod a+xr almond coursier -RUN ./coursier launch almond --scala 2.13.8 -- --install -# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3 -#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8 -RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb - -RUN adduser dev -RUN adduser dev sudo -RUN echo 'dev:dev' | chpasswd -RUN mkdir -p ~dev -RUN cp ./coursier ~dev/ -RUN echo "color_prompt=yes" >> ~dev/.bashrc -RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc -RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.1-bin-hadoop3" >> ~dev/.bashrc -RUN chown -R dev ~dev 
-USER dev -# Kernels are installed in user so we need to run as the user -RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" -RUN ./coursier launch almond --scala 2.13.8 -- --install USER root - -RUN mkdir /high-performance-spark-examples -RUN chown -R dev /high-performance-spark-examples -WORKDIR /high-performance-spark-examples -# Increase the chance of caching by copying just the env setup file first. -COPY --chown=dev:dev env_setup.sh ./ -# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place -# Also downloads some test data -RUN SCALA_VERSION=2.13 ./env_setup.sh -RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back -# Note: We need to use /home in the COPY otherwise no happy pandas -COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new -RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json -RUN git clone https://github.com/holdenk/spark-upgrade.git -RUN chown -R dev /high-performance-spark-examples -ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/ -RUN chown -R dev /high-performance-spark-examples +RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro USER dev -RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history RUN sbt clean compile -CMD ["jupyter-lab", "--ip", "0.0.0.0", "--port", "8877"] - diff --git a/Dockerfile-mini b/Dockerfile-mini new file mode 100644 index 0000000..b9e7ddf --- /dev/null +++ b/Dockerfile-mini @@ -0,0 +1,69 @@ +# Open JDK11, Spark 3.X and the latest JDKs get a little spicy +FROM azul/zulu-openjdk:11-latest + +RUN apt-get -qq update && \ + apt-get -qq -y upgrade && \ + apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop python-is-python3 && \ + locale-gen en_US.UTF-8 && \ + apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \ + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \ + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \ + curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \ + chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \ + apt-get update && \ + apt-get -qq -y install sbt && \ + rm -rf /var/lib/apt/lists/* + +RUN curl -Lo coursier https://git.io/coursier-cli +RUN chmod +x coursier +# ensure the JAR of the CLI is in the coursier cache, in the image +RUN ./coursier --help +RUN pip install --no-cache-dir jupyter +# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3 +#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8 +RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb + +RUN ./coursier bootstrap \ + -r jitpack \ + -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \ + sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \ + --default=true --sources \ + -o almond && \ + ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" + + +RUN adduser dev 
+RUN adduser dev sudo +RUN echo 'dev:dev' | chpasswd +RUN mkdir -p ~dev +RUN cp ./coursier ~dev/ +RUN echo "color_prompt=yes" >> ~dev/.bashrc +RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc +RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.2-bin-hadoop3" >> ~dev/.bashrc +RUN chown -R dev ~dev +USER dev +# Kernels are installed in user so we need to run as the user +RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" +USER root + +RUN mkdir -p /high-performance-spark-examples +RUN mkdir -p /high-performance-spark-examples/warehouse +RUN chown -R dev /high-performance-spark-examples +WORKDIR /high-performance-spark-examples +# Increase the chance of caching by copying just the env setup file first. +COPY --chown=dev:dev env_setup.sh ./ +# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place +# Also downloads some test data +RUN SCALA_VERSION=2.13 ./env_setup.sh && rm *.tgz +RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back +# Note: We need to use /home in the COPY otherwise no happy pandas +COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new +RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json +RUN chown -R dev /high-performance-spark-examples +ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/ +RUN git clone https://github.com/holdenk/spark-upgrade.git +RUN chown -R dev /high-performance-spark-examples +USER dev +RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history +CMD ["/high-performance-spark-examples/misc/container_launch.sh"] + diff --git a/build_container.sh b/build_container.sh index 1c550c7..26c61c5 100755 --- a/build_container.sh +++ b/build_container.sh @@ -15,6 +15,9 @@ else git archive -o myapp.tar --format=tar HEAD echo "$hash" > oldhash fi -IMAGE=holdenk/hps:0.1 -docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push +VERSION=${VERSION:-0.4} +IMAGE=${IMAGE:-holdenk/hps:$VERSION} +MINI_IMAGE=${MINI_IMAGE:-holdenk/hps-mini:$VERSION} +docker buildx build --platform=linux/amd64,linux/arm64 -t "${MINI_IMAGE}" -f Dockerfile-mini . --push +docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push --build-arg base="${MINI_IMAGE}" #docker buildx build --platform=linux/amd64 -t "${IMAGE}" . --push diff --git a/env_setup.sh b/env_setup.sh index b027a9e..a9b9e0d 100755 --- a/env_setup.sh +++ b/env_setup.sh @@ -4,7 +4,7 @@ set -ex # Download Spark and iceberg if not present SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} -SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.1"} +SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.2"} SCALA_VERSION=${SCALA_VERSION:-"2.13"} HADOOP_VERSION="3" SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" @@ -55,4 +55,10 @@ mkdir -p ./data/fetched/ if [ ! -f ./data/fetched/2021 ]; then wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021 fi +if [ ! -f ./data/fetched/2022 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2022" -O ./data/fetched/2022 +fi +if [ ! 
-f ./data/fetched/2023 ]; then + wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2023" -O ./data/fetched/2023 +fi diff --git a/iceberg-workshop-solutions/Workshop-Template.ipynb b/iceberg-workshop-solutions/Workshop-Template.ipynb new file mode 100644 index 0000000..472a9c3 --- /dev/null +++ b/iceberg-workshop-solutions/Workshop-Template.ipynb @@ -0,0 +1,552 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "34577ad3-822f-4370-bcba-56b9fcec3196", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.spark.sql._\n", + "import scala.sys.process._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d3141ec-7779-467a-9f76-2e51030fd1c7", + "metadata": {}, + "outputs": [], + "source": [ + "// So now we need to configure Spark to use Iceberg\n", + "// See https://iceberg.apache.org/docs/1.6.0/spark-configuration/ & https://iceberg.apache.org/docs/1.6.0/spark-getting-started/\n", + "// We'll use the \"hadoop\" (aka file) catalog & /high-performance-spark-examples/warehouse for the location\n", + "val spark = (SparkSession.builder.master(\"local[*]\")\n", + " // Setup the extensions\n", + " // You'll want to configure Iceberg here as discussed above\n", + " // If you want to match the solution you'll want to configure the Iceberg catalog to be \"local.\"\n", + " .getOrCreate()\n", + " )\n", + "import spark._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c53080aa-a9d6-45f9-968b-8e052e7fa963", + "metadata": {}, + "outputs": [], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88d54d05-0c49-4268-9b65-8c72679cb0f7", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sparkContext.uiWebUrl.get" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "270730c9-9787-407c-ba22-f0cee1f67f53", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the current data\n", + "val df = spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(\"/high-performance-spark-examples/data/fetched/2021\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87ca6359-86bc-42a4-93dd-4fc64496b145", + "metadata": {}, + "outputs": [], + "source": [ + "// Drop existing table if present & create new table\n", + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bdeb3eb-b725-409b-ab3a-409d0e8309ae", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out\n", + "df.write.saveAsTable(\"local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554c6036-0c6b-4e3c-a9e1-7251c608b48f", + "metadata": {}, + "outputs": [], + "source": [ + "\"ls /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb541fbf-4a79-402d-a6b2-e999106e9a18", + "metadata": {}, + "outputs": [], + "source": [ + "\"cat /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/v1.metadata.json\".!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90149834-27a2-45a3-aa8a-dae2162da854", + "metadata": {}, + "outputs": [], + "source": [ + "// Iceberg Java SDK time imports\n", + "import java.util.HashMap\n", + "import java.util.Map\n", + "\n", + "import org.apache.iceberg.Table\n", + "import org.apache.iceberg.catalog.TableIdentifier\n", + "import org.apache.iceberg.hadoop.HadoopCatalog\n", + "\n", + "\n", + "// And to handle java types\n", + "import scala.jdk.CollectionConverters._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf56bc6-d420-474c-b3a8-ded03b23eff8", + "metadata": {}, + "outputs": [], + "source": [ + "// Create a local Iceberg Catalog client. Here we're using the \"hadoop catalog\"\n", + "// The spark hadoop conf can be got from: spark.sparkContext.hadoopConfiguration\n", + "// Here we make the Catalog, it's kind of funky. Spark also has methods which return tables but they're Spark tables so\n", + "// which aren't the type we want\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/hadoop/HadoopCatalog.html\n", + "val catalog = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c55dc276-035f-40d4-9a47-bd4698f2519d", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to load the table. To do that we need to make a TableIdentifier of the same table we wrote to. Note it'll just be\n", + "// the table name no need for the \"local\" prefix.\n", + "// See https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/catalog/TableIdentifier.html\n", + "val name = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ea4b1cc-bd1b-42b4-bdbe-27625b461db9", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the table\n", + "val table = catalog.loadTable(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd1c6add-d465-4b81-9c34-6c8f40197ab2", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to get the snapshots from the table. There are a few different ways we can do this:\n", + "// 1) Using the Iceberg Table API (see https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html)\n", + "// 2) Using the Iceberg + Spark SQL special query interface https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html\n", + "val snapshots = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a96986d-b3a5-49ad-aeac-a492bf3fc8e6", + "metadata": {}, + "outputs": [], + "source": [ + "val snapshot = snapshots(0).snapshotId()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c6cb85-ff64-405f-ae6a-7e3c917ac12a", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotQuery = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f93516ad-3ae9-4bb6-989f-7c127f82143c", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotId = ..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15e67d1b-1e9e-45a0-af94-1c9c79e03d54", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4829752b-dc30-49db-93ae-911f1c2743c1", + "metadata": {}, + "outputs": [], + "source": [ + "// And the files!\n", + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f262d890-0818-410a-aec8-2986a04ae16e", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets take a quick look and see\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7369bcc8-a738-48dc-a475-55885d4460cc", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DELETE FROM local.uk_gender_pay_data WHERE isnull(responsibleperson)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d279f3-f2a5-4ddf-a56f-d473b0c28b97", + "metadata": {}, + "outputs": [], + "source": [ + "// Make sure the data is gone\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b6902ef-b742-466d-b4c8-d6830ff67cf4", + "metadata": {}, + "outputs": [], + "source": [ + "// Yay! ok now lets travel back in time\n", + "// We can do this with SQL or with a read option\n", + "// SQL: https://iceberg.apache.org/docs/nightly/spark-queries/#sql" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e899a1-d2cd-4e25-b142-e69fb9ca6774", + "metadata": {}, + "outputs": [], + "source": [ + "// DF: https://iceberg.apache.org/docs/nightly/spark-queries/#dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8884a8a2-bbb7-47b1-85f6-744c60612dcb", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f53692b-f14a-4df7-8069-147eca8da0cd", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data_postcode\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8deb38c3-1e64-4eba-ac80-c75d5674258b", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out partitioned to do this we'll want to use the SQL interface so we can use the truncate function\n", + "// since the regular Scala API doesn't support partioning by things besides raw keys.\n", + "https://iceberg.apache.org/docs/1.5.1/spark-ddl/#partitioned-by" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e87e6b08-5c0e-4356-a0ee-7245b7d7790b", + "metadata": {}, + "outputs": [], + "source": [ + "// Inspect the files again. 
This should look familiar ish\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data_postcode.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71569b2e-7def-42a4-bf3e-69ee9667a41d", + "metadata": {}, + "outputs": [], + "source": [ + "// Add some more data, we've got 2022, 2023 , & 2024\n", + "// Make sure to use the append mode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5be0f8c7-2926-4bf6-bc9d-c02a15648e83", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb6c4b7d-f8d1-41f7-b014-c6434bbb6d48", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_metadata_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/metadata/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfb116e4-c66d-4027-80a3-e7de9ad62ee0", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586bdb3c-19f0-4a63-b87f-d181e8c44c06", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22351ea4-8cb7-43c2-b205-4554d0b15aca", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.spark.actions.SparkActions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928f9da9-d65b-4d53-b818-82a27f8171a2", + "metadata": {}, + "outputs": [], + "source": [ + "// So far the logging has been... verbose but interesting, but the next stages it's actually too much\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "807193d8-8ff5-4a9c-b6ae-510ee0bb2f84", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok let's try and compact things down a little bit.\n", + "// You should look at SparkActions & use the rewrite data files operation.\n", + "// Consider specifying rewrite-all to true to force rewrites\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/spark/actions/SparkActions.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e4a013d-1af5-4dd8-82c1-5115905f3feb", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c40db89-7ce1-40ed-a111-1395e5b75a0a", + "metadata": {}, + "outputs": [], + "source": [ + "// Interesting. Note it _added_ a new file but the old files are all still there. 
That's kind of expected/ok since if we look at the\n", + "// files actually currently used it's just the new one\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9198c74b-87d5-42b0-9987-587095848282", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the old snapshots but keep the latest one.\n", + "// This produces _so much logging_ by default that running in the NB would be slow (that's why we set the log level to error)\n", + "// Here your going to want to use the expireSnapshots action.\n", + "// Note: if you _just set_ retainLast it will keep all snapshots, retain last is like a safety mechanism that keeps the last K\n", + "// snapshots. To get rid of everything except the last expire everything older than right now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be51d1ca-a105-407f-ac3c-41c0f9258891", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_and_expired_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18825715-ced8-401f-b7b3-9ea682d38757", + "metadata": {}, + "outputs": [], + "source": [ + "// Table is in an inconsistent state here, this is not \"good\" but YOLO\n", + "// spark.sql(\"REFRESH local.uk_gender_pay_data\").show()\n", + "// spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa8644af-5604-4147-8546-f65e749b8253", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1983f2-2fe7-4e43-a78e-40fd1c7577fd", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the orphaned files\n", + "SparkActions.get().deleteOrphanFiles(table).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73b3e2ca-555b-467c-a253-d96aab32e27b", + "metadata": {}, + "outputs": [], + "source": [ + "val cleaned_and_compacted_file_list = \"ls ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "921d0c02-1fb7-43ec-ac0a-b5d1c3a40c3d", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8ff0a3-8c6e-4d67-8afb-d1541c7e6dbd", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets go take a look at a quick side-by-side test\n", + "//cd /high-performance-spark-examples/spark-upgrade/;./e2e_demo/scala/run_demo.sh" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d47a57-3bfa-484a-90ed-0231a17a7205", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok, let's try branching! 
Note: requires very recent Iceberg, so if you're doing this elsewhere might not be a party\n", + "// Relevant docs: https://iceberg.apache.org/docs/nightly/spark-ddl/#branching-and-tagging-ddl\n", + "// https://iceberg.apache.org/docs/nightly/spark-queries/#sql" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala", + "name": "scala2.13" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".sc", + "mimetype": "text/x-scala", + "name": "scala", + "nbconvert_exporter": "script", + "version": "2.13.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/iceberg-workshop-solutions/Workshop.ipynb b/iceberg-workshop-solutions/Workshop.ipynb new file mode 100644 index 0000000..ebd1263 --- /dev/null +++ b/iceberg-workshop-solutions/Workshop.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "34577ad3-822f-4370-bcba-56b9fcec3196", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.spark.sql._\n", + "import scala.sys.process._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d3141ec-7779-467a-9f76-2e51030fd1c7", + "metadata": {}, + "outputs": [], + "source": [ + "// So now we need to configure Spark to use Iceberg\n", + "// See https://iceberg.apache.org/docs/1.6.0/spark-configuration/ & https://iceberg.apache.org/docs/1.6.0/spark-getting-started/\n", + "// We'll use the \"hadoop\" (aka file) catalog & /high-performance-spark-examples/warehouse for the location\n", + "val spark = (SparkSession.builder.master(\"local[*]\")\n", + " // Setup the extensions\n", + " .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", + " .config(\"spark.sql.catalog.local\", \"org.apache.iceberg.spark.SparkCatalog\")\n", + " .config(\"spark.sql.catalog.local.type\", \"hadoop\")\n", + " .config(\"spark.sql.catalog.local.warehouse\", \"/high-performance-spark-examples/warehouse\")\n", + " .getOrCreate()\n", + " )\n", + "import spark._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ecbdf4a8-3f16-4242-9d89-0ce7835b49e7", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sparkContext.uiWebUrl.get" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "270730c9-9787-407c-ba22-f0cee1f67f53", + "metadata": {}, + "outputs": [], + "source": [ + "// Load the current data\n", + "val df = spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(\"/high-performance-spark-examples/data/fetched/2021\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87ca6359-86bc-42a4-93dd-4fc64496b145", + "metadata": {}, + "outputs": [], + "source": [ + "// Drop existing table if present & create new table\n", + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bdeb3eb-b725-409b-ab3a-409d0e8309ae", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out\n", + "df.write.saveAsTable(\"local.uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554c6036-0c6b-4e3c-a9e1-7251c608b48f", + "metadata": {}, + "outputs": [], + "source": [ + "\"ls /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/\".!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb541fbf-4a79-402d-a6b2-e999106e9a18", + "metadata": {}, + "outputs": [], + "source": [ + "\"cat /high-performance-spark-examples/warehouse/uk_gender_pay_data/metadata/v1.metadata.json\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90149834-27a2-45a3-aa8a-dae2162da854", + "metadata": {}, + "outputs": [], + "source": [ + "// Java SDK time imports\n", + "import java.util.HashMap\n", + "import java.util.Map\n", + "\n", + "import org.apache.iceberg.Table\n", + "import org.apache.iceberg.catalog.TableIdentifier\n", + "import org.apache.iceberg.hadoop.HadoopCatalog\n", + "\n", + "\n", + "// And to handle java types\n", + "import scala.jdk.CollectionConverters._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf56bc6-d420-474c-b3a8-ded03b23eff8", + "metadata": {}, + "outputs": [], + "source": [ + "// Create a local Iceberg Catalog client. Here we're using the \"hadoop catalog\"\n", + "// The spark hadoop conf can be got from: spark.sparkContext.hadoopConfiguration\n", + "// Here we make the Catalog, it's kind of funky. Spark also has methods which return tables but they're Spark tables so\n", + "// which aren't the type we want\n", + "val catalog = new HadoopCatalog(spark.sparkContext.hadoopConfiguration, \"/high-performance-spark-examples/warehouse\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c55dc276-035f-40d4-9a47-bd4698f2519d", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to load the table. To do that we need to make a TableIdentifier of the same table we wrote to. Note it'll just be\n", + "// the table name no need for the \"local\" prefix.\n", + "// See https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/catalog/TableIdentifier.html\n", + "val name = TableIdentifier.of(\"uk_gender_pay_data\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ea4b1cc-bd1b-42b4-bdbe-27625b461db9", + "metadata": {}, + "outputs": [], + "source": [ + "val table = catalog.loadTable(name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd1c6add-d465-4b81-9c34-6c8f40197ab2", + "metadata": {}, + "outputs": [], + "source": [ + "// Now we want to get the snapshots from the table. 
There are a few different ways we can do this:\n", + "// 1) Using the Iceberg Table API (see https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html)\n", + "// 2) Using the Iceberg + Spark SQL special query interface https://iceberg.apache.org/javadoc/1.6.0/org/apache/iceberg/Table.html\n", + "val snapshots = table.snapshots().asScala.toList\n", + "snapshots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a96986d-b3a5-49ad-aeac-a492bf3fc8e6", + "metadata": {}, + "outputs": [], + "source": [ + "val snapshot = snapshots(0).snapshotId()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15c6cb85-ff64-405f-ae6a-7e3c917ac12a", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotQuery = spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\")\n", + "altSnapshotQuery.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f93516ad-3ae9-4bb6-989f-7c127f82143c", + "metadata": {}, + "outputs": [], + "source": [ + "val altSnapshotId = spark.sql(\"SELECT snapshot_id FROM local.uk_gender_pay_data.snapshots\").collect()(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15e67d1b-1e9e-45a0-af94-1c9c79e03d54", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d94eb4db-bf03-49be-865a-e80c0613d526", + "metadata": {}, + "outputs": [], + "source": [ + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4829752b-dc30-49db-93ae-911f1c2743c1", + "metadata": {}, + "outputs": [], + "source": [ + "// And the files!\n", + "// We can also list snapshots with the select\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f262d890-0818-410a-aec8-2986a04ae16e", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets take a quick look and see\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7369bcc8-a738-48dc-a475-55885d4460cc", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DELETE FROM local.uk_gender_pay_data WHERE isnull(responsibleperson)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73d279f3-f2a5-4ddf-a56f-d473b0c28b97", + "metadata": {}, + "outputs": [], + "source": [ + "// Make sure the data is gone\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b6902ef-b742-466d-b4c8-d6830ff67cf4", + "metadata": {}, + "outputs": [], + "source": [ + "// Yay! 
ok now lets travel back in time\n", + "// We can do this with SQL or with a read option\n", + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data VERSION AS OF ${snapshot} WHERE isnull(responsibleperson) LIMIT 5\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e899a1-d2cd-4e25-b142-e69fb9ca6774", + "metadata": {}, + "outputs": [], + "source": [ + "// Or the same with option + DF syntax\n", + "spark.read.option(\"snapshot-id\", f\"${snapshot}\").table(\"local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8884a8a2-bbb7-47b1-85f6-744c60612dcb", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f53692b-f14a-4df7-8069-147eca8da0cd", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"DROP TABLE IF EXISTS local.uk_gender_pay_data_postcode\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8deb38c3-1e64-4eba-ac80-c75d5674258b", + "metadata": {}, + "outputs": [], + "source": [ + "// Write the data out partitioned\n", + "df.registerTempTable(\"temp_table\")\n", + "// We could use the table write semantics but we can't do truncate() on that\n", + "spark.sql(\"CREATE TABLE local.uk_gender_pay_data_postcode USING iceberg PARTITIONED BY (truncate(1, PostCode)) AS select * from temp_table\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e87e6b08-5c0e-4356-a0ee-7245b7d7790b", + "metadata": {}, + "outputs": [], + "source": [ + "// Inspect the files again. This should look familiar ish\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data_postcode.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71569b2e-7def-42a4-bf3e-69ee9667a41d", + "metadata": {}, + "outputs": [], + "source": [ + "val year_dfs = 2022.to(2023).map(r => spark.read.option(\"header\", \"true\").option(\"inferSchema\", \"true\").csv(s\"/high-performance-spark-examples/data/fetched/${r}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c4441cf-fd65-4a29-94fb-6d3aa927f6b1", + "metadata": {}, + "outputs": [], + "source": [ + "List(\"local.uk_gender_pay_data\", \"local.uk_gender_pay_data_postcode\").foreach(table => year_dfs.foreach(df => df.write.mode(\"append\").saveAsTable(table)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5be0f8c7-2926-4bf6-bc9d-c02a15648e83", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb6c4b7d-f8d1-41f7-b014-c6434bbb6d48", + "metadata": {}, + "outputs": [], + "source": [ + "val uncompacted_metadata_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/metadata/\".!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfb116e4-c66d-4027-80a3-e7de9ad62ee0", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586bdb3c-19f0-4a63-b87f-d181e8c44c06", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.snapshots\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22351ea4-8cb7-43c2-b205-4554d0b15aca", + "metadata": {}, + "outputs": [], + "source": [ + "import org.apache.iceberg.spark.actions.SparkActions\n", + "// Iceberg actions\n", + "import org.apache.iceberg.actions.Action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "928f9da9-d65b-4d53-b818-82a27f8171a2", + "metadata": {}, + "outputs": [], + "source": [ + "// So far the logging has been... verbose but interesting, but the next stages it's actually too much\n", + "spark.sparkContext.setLogLevel(\"ERROR\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "807193d8-8ff5-4a9c-b6ae-510ee0bb2f84", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok let's try and compact things down a little bit.\n", + "// You should look at SparkActions & use the rewrite data files operation.\n", + "// Consider specifying rewrite-all to true to force rewrites\n", + "// https://iceberg.apache.org/javadoc/latest/org/apache/iceberg/spark/actions/SparkActions.html\n", + "SparkActions.get().rewriteDataFiles(table).option(\"target-file-size-bytes\", (512L*1024L*1024L).toString).option(\"rewrite-all\", \"true\").execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e4a013d-1af5-4dd8-82c1-5115905f3feb", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c40db89-7ce1-40ed-a111-1395e5b75a0a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9198c74b-87d5-42b0-9987-587095848282", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the old snapshots but keep the latest one.\n", + "// This produces _so much logging_ by default that running in the NB would be slow (that's why we set the log level to error)\n", + "// Here your going to want to use the expireSnapshots action.\n", + "// Note: if you _just set_ retainLast it will keep all snapshots, retain last is like a safety mechanism that keeps the last K\n", + "// snapshots. To get rid of everything except the last expire everything older than right now.\n", + "SparkActions.get().expireSnapshots(table).expireOlderThan(System.currentTimeMillis()).retainLast(1).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be51d1ca-a105-407f-ac3c-41c0f9258891", + "metadata": {}, + "outputs": [], + "source": [ + "val compacted_and_expired_file_list = \"ls -alh ../warehouse/uk_gender_pay_data/data/\".!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18825715-ced8-401f-b7b3-9ea682d38757", + "metadata": {}, + "outputs": [], + "source": [ + "// Table is in an inconsistent state here, this is not \"good\"\n", + "spark.sql(\"REFRESH local.uk_gender_pay_data\").show()\n", + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa8644af-5604-4147-8546-f65e749b8253", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1983f2-2fe7-4e43-a78e-40fd1c7577fd", + "metadata": {}, + "outputs": [], + "source": [ + "// Remove the orphaned files\n", + "SparkActions.get().deleteOrphanFiles(table).execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73b3e2ca-555b-467c-a253-d96aab32e27b", + "metadata": {}, + "outputs": [], + "source": [ + "val cleaned_and_compacted_file_list = \"ls ../warehouse/uk_gender_pay_data/data/\".!!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "921d0c02-1fb7-43ec-ac0a-b5d1c3a40c3d", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SELECT * FROM local.uk_gender_pay_data.files\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8ff0a3-8c6e-4d67-8afb-d1541c7e6dbd", + "metadata": {}, + "outputs": [], + "source": [ + "// Lets go take a look at a quick side-by-side test\n", + "//cd /high-performance-spark-examples/spark-upgrade/;./e2e_demo/scala/run_demo.sh\n", + "//That'll be easier to run in a terminal than the .!! trick we've been doing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d47a57-3bfa-484a-90ed-0231a17a7205", + "metadata": {}, + "outputs": [], + "source": [ + "// Ok, let's try branching! Note: requires very recent Iceberg, so if you're doing this elsewhere might not be a party\n", + "// Relevant docs: https://iceberg.apache.org/docs/nightly/spark-ddl/#branching-and-tagging-ddl\n", + "// https://iceberg.apache.org/docs/nightly/spark-queries/#sql\n", + "spark.sql(\"ALTER TABLE local.uk_gender_pay_data CREATE BRANCH IF NOT EXISTS `new-software-branch`\")\n", + "spark.sql(\"DELETE FROM local.uk_gender_pay_data.`branch_new-software-branch` WHERE isnull(DueDate)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "128591e9-fc12-4791-8797-901ce2f1c6b7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", + "language": "scala", + "name": "scala2.13" + }, + "language_info": { + "codemirror_mode": "text/x-scala", + "file_extension": ".sc", + "mimetype": "text/x-scala", + "name": "scala", + "nbconvert_exporter": "script", + "version": "2.13.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/misc/container_launch.sh b/misc/container_launch.sh new file mode 100755 index 0000000..34cf7d9 --- /dev/null +++ b/misc/container_launch.sh @@ -0,0 +1,5 @@ +#!/bin/bash +if [ ! 
-f /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb ]; then + cp /high-performance-spark-examples/iceberg-workshop/Workshop-Template.ipynb /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb +fi +jupyter-lab --ip 0.0.0.0 --port 8877 diff --git a/misc/kernel.json b/misc/kernel.json index d5575f8..5812f16 100644 --- a/misc/kernel.json +++ b/misc/kernel.json @@ -2,7 +2,7 @@ "argv": [ "java", "-cp", - "/home/dev/.local/share/jupyter/kernels/scala2.13/launcher.jar:.:/high-performance-spark-examples/:/high-performance-spark-examples/target/scala-2.13/home/dev/.local/share/jupyter/kernels/scala/launcher.jar:/high-performance-spark-examples/spark-3.5.1-bin-hadoop3-scala2.13/jars/*", + "/home/dev/.local/share/jupyter/kernels/scala2.13/launcher.jar:.:/high-performance-spark-examples/:/high-performance-spark-examples/target/scala-2.13/home/dev/.local/share/jupyter/kernels/scala/launcher.jar:/high-performance-spark-examples/spark-3.5.2-bin-hadoop3-scala2.13/jars/*", "coursier.bootstrap.launcher.Launcher", "--log", "info", diff --git a/run_container.sh b/run_container.sh index 37d47aa..396e513 100755 --- a/run_container.sh +++ b/run_container.sh @@ -1,7 +1,10 @@ #!/bin/bash set -ex -./build_container.sh -docker image pull holdenk/hps:0.1 +VERSION=${VERSION:-0.4} +IMAGE=${IMAGE:-holdenk/hps:$VERSION} +export VERSION +export IMAGE +docker image pull "$IMAGE" mkdir -p warehouse mkdir -p iceberg-workshop -docker container run --mount type=bind,source="$(pwd)"/warehouse,target=/warehouse --mount type=bind,source="$(pwd)/iceberg-workshop",target=/high-performance-spark-examples/iceberg-workshop -p 8877:8877 -it holdenk/hps:0.1 /bin/bash +docker container run --mount type=bind,source="$(pwd)"/warehouse,target=/high-performance-spark-examples/warehouse --mount type=bind,source="$(pwd)/iceberg-workshop",target=/high-performance-spark-examples/iceberg-workshop -p 8877:8877 -p 4040:4040 -it "${IMAGE}" # /bin/bash From 6901d194e6d2a57f73daf94307a6319412e011be Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 27 Aug 2024 09:44:17 -0700 Subject: [PATCH 45/53] Update the dev container (#134) * bump container version * Fix container launch script * fastavro is super useful for cat --- Dockerfile | 2 +- build_container.sh | 2 +- misc/container_launch.sh | 2 +- run_container.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2d1d0fb..c4feed8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,6 @@ ARG base FROM $base USER root -RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro +RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro fastavro USER dev RUN sbt clean compile diff --git a/build_container.sh b/build_container.sh index 26c61c5..691ae67 100755 --- a/build_container.sh +++ b/build_container.sh @@ -15,7 +15,7 @@ else git archive -o myapp.tar --format=tar HEAD echo "$hash" > oldhash fi -VERSION=${VERSION:-0.4} +VERSION=${VERSION:-0.5} IMAGE=${IMAGE:-holdenk/hps:$VERSION} MINI_IMAGE=${MINI_IMAGE:-holdenk/hps-mini:$VERSION} docker buildx build --platform=linux/amd64,linux/arm64 -t "${MINI_IMAGE}" -f Dockerfile-mini . --push diff --git a/misc/container_launch.sh b/misc/container_launch.sh index 34cf7d9..31f0edb 100755 --- a/misc/container_launch.sh +++ b/misc/container_launch.sh @@ -1,5 +1,5 @@ #!/bin/bash if [ ! 
-f /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb ]; then - cp /high-performance-spark-examples/iceberg-workshop/Workshop-Template.ipynb /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb + cp /high-performance-spark-examples/iceberg-workshop-solutions/Workshop-Template.ipynb /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb fi jupyter-lab --ip 0.0.0.0 --port 8877 diff --git a/run_container.sh b/run_container.sh index 396e513..0efe1f6 100755 --- a/run_container.sh +++ b/run_container.sh @@ -1,6 +1,6 @@ #!/bin/bash set -ex -VERSION=${VERSION:-0.4} +VERSION=${VERSION:-0.5} IMAGE=${IMAGE:-holdenk/hps:$VERSION} export VERSION export IMAGE From c1adb459574d27731e8504509d4062fa9a782225 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 27 Aug 2024 10:33:38 -0700 Subject: [PATCH 46/53] Test data frames with pyspark (#135) * Add tags to test_dual_write * Add the Python 3.5 example. * Fix up style and imports * Remove excess newline * Fix new test --- python/examples/test_dual_write.py | 4 +++ python/examples/test_dual_write_new.py | 39 ++++++++++++++++++++++++++ python/tox.ini | 2 +- 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 python/examples/test_dual_write_new.py diff --git a/python/examples/test_dual_write.py b/python/examples/test_dual_write.py index d85e7f9..e68eb2b 100644 --- a/python/examples/test_dual_write.py +++ b/python/examples/test_dual_write.py @@ -1,6 +1,7 @@ import os import tempfile +# tag::test[] from sparktestingbase.sqltestcase import SQLTestCase from pyspark.sql.functions import current_timestamp from pyspark.sql.types import Row @@ -21,3 +22,6 @@ def test_actual_dual_write(self): df1 = self.sqlCtx.read.format("parquet").load(p1) df2 = self.sqlCtx.read.format("parquet").load(p2) self.assertDataFrameEqual(df2.select("times"), df1, 0.1) + + +# end::test[] diff --git a/python/examples/test_dual_write_new.py b/python/examples/test_dual_write_new.py new file mode 100644 index 0000000..e8b6df5 --- /dev/null +++ b/python/examples/test_dual_write_new.py @@ -0,0 +1,39 @@ +import os +import tempfile + +# tag::test[] +import unittest +from pyspark.sql import SparkSession +from pyspark.sql.functions import current_timestamp +from pyspark.sql.types import Row +from pyspark.testing.utils import assertDataFrameEqual +from .dual_write import DualWriteExample + + +class DualWriteTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.spark = SparkSession.builder.appName( + "Testing PySpark Example" + ).getOrCreate() + + @classmethod + def tearDownClass(cls): + cls.spark.stop() + + def test_always_passes(self): + self.assertTrue(True) + + def test_actual_dual_write(self): + tempdir = tempfile.mkdtemp() + p1 = os.path.join(tempdir, "data1") + p2 = os.path.join(tempdir, "data2") + df = self.spark.createDataFrame([Row("timbit"), Row("farted")], ["names"]) + combined = df.withColumn("times", current_timestamp()) + DualWriteExample().do_write(combined, p1, p2) + df1 = self.spark.read.format("parquet").load(p1) + df2 = self.spark.read.format("parquet").load(p2) + assertDataFrameEqual(df2.select("times"), df1, 0.1) + + +# end::test[] diff --git a/python/tox.ini b/python/tox.ini index 330cd58..2aa2d4d 100644 --- a/python/tox.ini +++ b/python/tox.ini @@ -57,7 +57,7 @@ deps = [testenv:flake8] extras = tests skipsdist = True -commands = flake8 --ignore=F403,E402,F401,F405,W503 examples +commands = flake8 --ignore=F403,E402,F401,F405,W503,E265 examples allowlist_externals = flake8 [testenv:mypy] From 
da5958d9f254db77fb6740b3eeb9e53857bf920b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 2 Nov 2024 20:15:53 -0500 Subject: [PATCH 47/53] Switch to Spark 4 preview 2 (#136) * Use scala 2.13.13 & Spark 4 snapshot & spark testing base snapshot. Add local maven resolver for snapshots Add an upsert example Update sbt version and plugins Update for Spark 4 / Scala 2.13 * Drop 11 from the build matrix * We need JDK17 --- .github/workflows/ci.yml | 7 ++++++- build.sbt | 16 +++++++++------- .../dataframe/LoadSave.scala | 10 ++++++++++ .../ml/SimpleNaiveBayes.scala | 2 +- project/plugins.sbt | 8 ++++---- 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3527e31..57a2455 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,7 +9,6 @@ jobs: matrix: include: - java: 17 - - java: 11 runs-on: ubuntu-latest steps: - name: Checkout @@ -179,6 +178,12 @@ jobs: path: | data/fetched/* key: data-fetched + - name: Setup JDK + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: 17 + cache: sbt - name: Run PySpark examples run: ./run_pyspark_examples.sh diff --git a/build.sbt b/build.sbt index 81c971e..6e43155 100644 --- a/build.sbt +++ b/build.sbt @@ -1,5 +1,3 @@ -scalaVersion := "2.13.8" - lazy val root = (project in file(".")) .aggregate(core, native) @@ -16,6 +14,7 @@ organization := "com.highperformancespark" lazy val V = _root_.scalafix.sbt.BuildInfo +scalaVersion := "2.13.13" addCompilerPlugin(scalafixSemanticdb) scalacOptions ++= List( "-Yrangepos", @@ -38,7 +37,8 @@ resolvers ++= Seq( "Typesafe repository" at "https://repo.typesafe.com/typesafe/releases/", "Second Typesafe repo" at "https://repo.typesafe.com/typesafe/maven-releases/", "Mesosphere Public Repository" at "https://downloads.mesosphere.io/maven", - Resolver.sonatypeRepo("public") + Resolver.sonatypeRepo("public"), + Resolver.mavenLocal ) licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) @@ -67,17 +67,18 @@ val sparkTestingVersion = settingKey[String]("Spark testing base version without lazy val core = (project in file("core")) // regular scala code with @native methods .dependsOn(native % Runtime) .settings(javah / target := (native / nativeCompile / sourceDirectory).value / "include") + .settings(scalaVersion := "2.13.13") .settings(sbtJniCoreScope := Compile) .settings( scalaVersion := "2.13.8", - javacOptions ++= Seq("-source", "1.8", "-target", "1.8"), + javacOptions ++= Seq("-source", "17", "-target", "17"), parallelExecution in Test := false, fork := true, javaOptions ++= Seq("-Xms4048M", "-Xmx4048M", "-Djna.nosys=true"), Test / javaOptions ++= specialOptions, // 2.4.5 is the highest version we have with the old spark-testing-base deps - sparkVersion := System.getProperty("sparkVersion", "3.5.1"), - sparkTestingVersion := "1.5.2", + sparkVersion := System.getProperty("sparkVersion", "4.0.0-preview2"), + sparkTestingVersion := "2.0.1", // additional libraries libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % sparkVersion.value % Provided, @@ -95,12 +96,13 @@ lazy val core = (project in file("core")) // regular scala code with @native met "net.java.dev.jna" % "jna" % "5.12.1"), scalacOptions ++= Seq("-deprecation", "-unchecked"), pomIncludeRepository := { x => false }, + resolvers += Resolver.mavenLocal ) // JNI Magic! 
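(Side note on the sparkVersion setting above: because it is read with System.getProperty, the Spark release can in principle be chosen per sbt invocation instead of editing build.sbt. A hedged sketch, assuming the stock sbt launcher, which forwards -D flags to the JVM; the version number is purely illustrative.)
# Build and test against a different Spark release without touching build.sbt.
sbt -DsparkVersion=3.5.1 clean package +test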
lazy val native = (project in file("native")) // native code and build script .settings(nativeCompile / sourceDirectory := sourceDirectory.value) - .settings(scalaVersion := "2.13.8") + .settings(scalaVersion := "2.13.13") .enablePlugins(JniNative) // JniNative needs to be explicitly enabled //tag::xmlVersionConflict[] diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala index b5f1ee3..54ca534 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala @@ -88,6 +88,16 @@ case class LoadSave(sc: SparkContext, session: SparkSession) { } //end::saveAppend[] + def upsertPandas(input: DataFrame): Unit = { + //tag::upsert[] + input.mergeInto("pandaInfo", $"source.id" === $"target.id") + .whenMatched() // Note you can override the general match condition above if desired + .updateAll() + .whenNotMatched() + .insertAll() + //end::upsert[] + } + def createJDBC() = { session.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", "table", new Properties) diff --git a/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala index 6a88a77..ee34ed7 100644 --- a/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala +++ b/core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -38,7 +38,7 @@ class SimpleNaiveBayes(val uid: String) // Note this estimator assumes they start at 0 and go to numClasses val numClasses = getNumClasses(ds) // Get the number of features by peaking at the first row - val numFeatures: Integer = ds.select(col($(featuresCol))).head + val numFeatures: Integer = ds.select(col($(featuresCol))).head() .get(0).asInstanceOf[Vector].size // Determine the number of records for each class val groupedByLabel = ds.select(col($(labelCol)).as[Double]).groupByKey(x => x) diff --git a/project/plugins.sbt b/project/plugins.sbt index 7c18949..8cfbf42 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -5,16 +5,16 @@ resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositori resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/" -addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.2") addDependencyTreePlugin //tag::scalaFix[] -addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.10.4") +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.12.1") //end::scalaFix[] //tag::sbtJNIPlugin[] -addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.5.4") +addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.7.0") //end::sbtJNIPlugin[] //tag::xmlVersionConflict[] @@ -24,4 +24,4 @@ ThisBuild / libraryDependencySchemes ++= Seq( ) //end::xmlVersionConflict[] -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0") From 0ed8c1131a9ce6d923ac103857b18e606d0367e1 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 4 Aug 2025 14:11:30 -0700 Subject: [PATCH 48/53] Fix some tag typos (#137) --- python/examples/pandera_ex.py | 2 +- python/examples/spark_expectations_example.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/examples/pandera_ex.py b/python/examples/pandera_ex.py index 78155ce..f3afa7c 100644 --- 
a/python/examples/pandera_ex.py +++ b/python/examples/pandera_ex.py @@ -1,6 +1,6 @@ from pyspark.sql.session import SparkSession -# tags::pandera_imports[] +# tag::pandera_imports[] import pandera.pyspark as pa import pyspark.sql.types as T diff --git a/python/examples/spark_expectations_example.py b/python/examples/spark_expectations_example.py index d50f829..003bb15 100644 --- a/python/examples/spark_expectations_example.py +++ b/python/examples/spark_expectations_example.py @@ -20,7 +20,7 @@ "se_notifications_on_error_drop_exceeds_threshold_breach": True, "se_notifications_on_error_drop_threshold": 15, } -# end::gloabl_setup[] +# end::global_setup[] # tag::setup_and_load[] From 7444fe9a5457fdff448ba7ced7de60d942fbc494 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 30 Aug 2025 16:12:30 -0700 Subject: [PATCH 49/53] No more need for preview (#138) --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 6e43155..bf5ea27 100644 --- a/build.sbt +++ b/build.sbt @@ -77,7 +77,7 @@ lazy val core = (project in file("core")) // regular scala code with @native met javaOptions ++= Seq("-Xms4048M", "-Xmx4048M", "-Djna.nosys=true"), Test / javaOptions ++= specialOptions, // 2.4.5 is the highest version we have with the old spark-testing-base deps - sparkVersion := System.getProperty("sparkVersion", "4.0.0-preview2"), + sparkVersion := System.getProperty("sparkVersion", "4.0.0"), sparkTestingVersion := "2.0.1", // additional libraries libraryDependencies ++= Seq( From c09658ea3d0a93a9cf6051d66881ca712c32fe01 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 30 Aug 2025 16:17:28 -0700 Subject: [PATCH 50/53] Fix some tag typos (#139) From beb499888121a35d470605bbf5753b744f89c55b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 1 Sep 2025 21:42:22 -0700 Subject: [PATCH 51/53] Spark 4: Update GHA & Get Tests Running Again (#140) * Update GHA * Add missing uses for setting up the JDK * Add sbt explicitly now * Bump spark testing version * Update more to Spark 4, except PySpark ex which uses Iceberg leave that at 3.5 * remove loadsave for Spark3 compilation with PySpark. * Install proto if needed. * Fix rm * Add distutils fix typo * Setuptools --- .github/workflows/ci.yml | 408 ++++++++++-------- accelerators/setup_comet.sh | 6 + build.sbt | 2 +- .../examples/dataframe/JavaHappyPandas.java | 35 +- .../dataframe/HappyPandas.scala | 9 +- .../NullabilityFilterOptimizer.scala | 2 +- .../ml/SimplePipeline.scala | 3 - env_setup.sh | 4 +- run_pyspark_examples.sh | 6 +- 9 files changed, 264 insertions(+), 211 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57a2455..b5e1d28 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,6 +2,7 @@ name: CI on: pull_request: push: + jobs: test: strategy: @@ -11,197 +12,258 @@ jobs: - java: 17 runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Sync the current branch with the latest - if: github.repository != 'high-performance-spark/high-performance-spark-examples' - id: sync-branch - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/} - git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD || echo "no merge needed." - git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" || echo "no merge needed." 
- - name: Setup JDK - uses: actions/setup-java@v3 - with: - distribution: temurin - java-version: ${{ matrix.java }} - cache: sbt - - name: Scala Build and Test - run: sbt clean package +test + - name: Checkout + uses: actions/checkout@v4 + + - name: Sync the current branch with the latest + if: github.repository != 'high-performance-spark/high-performance-spark-examples' + id: sync-branch + run: | + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/} + git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD || echo "no merge needed." + git -c user.name='Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" || echo "no merge needed." + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: ${{ matrix.java }} + cache: sbt + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Scala Build and Test + run: sbt clean package +test + python-test: runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Run tox - run: | - cd python; tox + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + + - name: Run tox + run: | + cd python; tox + run-sql-examples: runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Cache Spark and friends - uses: actions/cache@v3 - with: - path: | - spark*.tgz - iceberg*.jar - key: spark-artifacts - - name: Cache Data - uses: actions/cache@v3 - with: - path: | - data/fetched/* - key: data-fetched - - name: Run sql examples - run: - ./run_sql_examples.sh + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache Spark and friends + uses: actions/cache@v4 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + + - name: Setup JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 17 + + - name: Add sbt + uses: sbt/setup-sbt@v1 + + - name: Cache Data + uses: actions/cache@v4 + with: + path: | + data/fetched/* + key: data-fetched + + - name: Run sql examples + run: ./run_sql_examples.sh + # run-gluten-sql-examples: # runs-on: ubuntu-latest # steps: - # - name: Checkout - # uses: actions/checkout@v2 - # - name: Cache Spark and friends - # uses: actions/cache@v3 - # with: - # path: | - # spark*.tgz - # iceberg*.jar - # key: spark-artifacts - # - name: Setup JDK - # uses: actions/setup-java@v3 - # with: - # distribution: temurin - # java-version: 17 - # - name: Cache Maven packages - # uses: actions/cache@v2 - # with: - # path: ~/.m2 - # key: ${{ runner.os }}-m2-gluten - # - name: Cache Data - # uses: actions/cache@v3 - # with: - # path: | - # data/fetched/* - # key: data-fetched - # - name: Run gluten - # run: - # cd accelerators; ./gluten_spark_34_ex.sh + # - name: Checkout + # uses: actions/checkout@v4 + # - name: Cache Spark and friends + # uses: actions/cache@v4 + # with: + # path: | + # spark*.tgz + # iceberg*.jar + # key: spark-artifacts + # - name: Setup JDK + # uses: actions/setup-java@v4 + # with: + # distribution: temurin + # java-version: 17 + # - name: Add sbt + # uses: sbt/setup-sbt@v1 + # - 
 #        uses: actions/cache@v4
 #        with:
 #          path: ~/.m2
 #          key: ${{ runner.os }}-m2-gluten
 #      - name: Cache Data
 #        uses: actions/cache@v4
 #        with:
 #          path: |
 #            data/fetched/*
 #          key: data-fetched
 #      - name: Run gluten
 #        run: |
 #          cd accelerators; ./gluten_spark_34_ex.sh
 
   run-comet-sql-examples:
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-      - name: Cache Spark and friends
-        uses: actions/cache@v3
-        with:
-          path: |
-            spark*.tgz
-            iceberg*.jar
-          key: spark-artifacts
-      - name: Cache Data
-        uses: actions/cache@v3
-        with:
-          path: |
-            data/fetched/*
-          key: data-fetched
-      - name: Cache Maven packages
-        uses: actions/cache@v2
-        with:
-          path: ~/.m2
-          key: ${{ runner.os }}-m2-comet
-      - name: Setup Rust
-        uses: actions-rs/toolchain@v1
-        with:
-          toolchain: stable
-          override: true
-      - name: Setup JDK
-        uses: actions/setup-java@v3
-        with:
-          distribution: temurin
-          java-version: 17
-      - name: Setup comet
-        run:
-          cd accelerators; SPARK_MAJOR=3.5 ./setup_comet.sh
-      - name: Run comet
-        run:
-          cd accelerators; ./comet_ex.sh
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache Spark and friends
+        uses: actions/cache@v4
+        with:
+          path: |
+            spark*.tgz
+            iceberg*.jar
+          key: spark-artifacts
+
+      - name: Cache Data
+        uses: actions/cache@v4
+        with:
+          path: |
+            data/fetched/*
+          key: data-fetched
+
+      - name: Cache Maven packages
+        uses: actions/cache@v4
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-comet
+
+      - name: Setup Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          override: true
+
+      - name: Setup JDK
+        uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: 17
+
+      - name: Add sbt
+        uses: sbt/setup-sbt@v1
+
+      - name: Setup comet
+        run: |
+          cd accelerators; SPARK_MAJOR=3.5 ./setup_comet.sh
+
+      - name: Run comet
+        run: |
+          cd accelerators; ./comet_ex.sh
+
   run-target-examples:
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-      - name: Cache Spark and friends
-        uses: actions/cache@v3
-        with:
-          path: |
-            spark*.tgz
-            iceberg*.jar
-          key: spark-artifacts
-      - name: Cache Accel
-        uses: actions/cache@v3
-        with:
-          path: |
-            accelerators/*.jar
-          key: accelerators-artifacts
-      - name: Cache Data
-        uses: actions/cache@v3
-        with:
-          path: |
-            data/fetched/*
-          key: data-fetched
-      - name: Run the target validator example
-        run:
-          cd target-validator; ./runme.sh
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache Spark and friends
+        uses: actions/cache@v4
+        with:
+          path: |
+            spark*.tgz
+            iceberg*.jar
+          key: spark-artifacts
+      - name: Setup JDK
+        uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: 17
+
+      - name: Cache Accel
+        uses: actions/cache@v4
+        with:
+          path: |
+            accelerators/*.jar
+          key: accelerators-artifacts
+
+      - name: Cache Data
+        uses: actions/cache@v4
+        with:
+          path: |
+            data/fetched/*
+          key: data-fetched
+
+      - name: Run the target validator example
+        run: |
+          cd target-validator; ./runme.sh
+
   run-pyspark-examples:
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-      - name: Cache Spark and friends
-        uses: actions/cache@v3
-        with:
-          path: |
-            spark*.tgz
-            iceberg*.jar
-          key: spark-artifacts
-      - name: Cache Data
-        uses: actions/cache@v3
-        with:
-          path: |
-            data/fetched/*
-          key: data-fetched
-      - name: Setup JDK
-        uses: actions/setup-java@v3
-        with:
-          distribution: temurin
-          java-version: 17
-          cache: sbt
-      - name: Run PySpark examples
-        run:
-          ./run_pyspark_examples.sh
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache Spark and friends
+        uses: actions/cache@v4
+        with:
+          path: |
+            spark*.tgz
+            iceberg*.jar
+          key: spark-artifacts
+
+      - name: Cache Data
+        uses: actions/cache@v4
+        with:
+          path: |
+            data/fetched/*
+          key: data-fetched
+
+      - name: Setup JDK
+        uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: 17
+          cache: sbt
+
+      - name: Add sbt
+        uses: sbt/setup-sbt@v1
+
+      - name: Run PySpark examples
+        run: ./run_pyspark_examples.sh
+
   style:
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-      - name: Shellcheck
-        run: |
-          sudo apt-get install -y shellcheck
-          shellcheck -e SC2317,SC1091,SC2034,SC2164 $(find -name "*.sh")
-      - name: Setup JDK
-        uses: actions/setup-java@v3
-        with:
-          distribution: temurin
-          java-version: 17
-          cache: sbt
-      - name: scala
-        run:
-          sbt scalastyle
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Shellcheck
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y shellcheck
+          shellcheck -e SC2317,SC1091,SC2034,SC2164 $(find -name "*.sh")
+
+      - name: Setup JDK
+        uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: 17
+          cache: sbt
+      - name: Add sbt
+        uses: sbt/setup-sbt@v1
+      - name: scala style
+        run:
+          sbt scalastyle
diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh
index 2020cb6..ed89a0d 100755
--- a/accelerators/setup_comet.sh
+++ b/accelerators/setup_comet.sh
@@ -3,6 +3,12 @@ set -ex
 
 source install_rust_if_needed.sh
 
+if command -v protoc >/dev/null 2>&1; then
+  echo "protoc already installed"
+else
+  sudo apt-get install -y protobuf-compiler
+fi
+
 if [ -z "${SPARK_MAJOR}" ]; then
   echo "Need a spark major version specified."
   exit 1
diff --git a/build.sbt b/build.sbt
index bf5ea27..f5c0485 100644
--- a/build.sbt
+++ b/build.sbt
@@ -78,7 +78,7 @@ lazy val core = (project in file("core")) // regular scala code with @native met
   Test / javaOptions ++= specialOptions,
   // 2.4.5 is the highest version we have with the old spark-testing-base deps
   sparkVersion := System.getProperty("sparkVersion", "4.0.0"),
-  sparkTestingVersion := "2.0.1",
+  sparkTestingVersion := "2.1.2",
   // additional libraries
   libraryDependencies ++= Seq(
     "org.apache.spark" %% "spark-core" % sparkVersion.value % Provided,
diff --git a/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java
index 950f9e5..62b32e0 100644
--- a/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java
+++ b/core/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java
@@ -4,10 +4,9 @@
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.Column;
 import org.apache.spark.sql.*;
-import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.expressions.Window;
 import org.apache.spark.sql.expressions.WindowSpec;
-import org.apache.spark.sql.hive.HiveContext;
 
 import java.util.HashMap;
 import java.util.Map;
@@ -16,39 +15,23 @@ public class JavaHappyPandas {
 
-  /**
-   * Creates SQLContext with an existing SparkContext.
-   */
-  public static SQLContext sqlContext(JavaSparkContext jsc) {
-    SQLContext sqlContext = new SQLContext(jsc);
-    return sqlContext;
-  }
-
-  /**
-   * Creates HiveContext with an existing SparkContext.
-   */
-  public static HiveContext hiveContext(JavaSparkContext jsc) {
-    HiveContext hiveContext = new HiveContext(jsc);
-    return hiveContext;
-  }
-
   /**
    * Illustrate loading some JSON data.
    */
-  public static Dataset<Row> loadDataSimple(JavaSparkContext jsc, SQLContext sqlContext, String path) {
-    Dataset<Row> df1 = sqlContext.read().json(path);
+  public static Dataset<Row> loadDataSimple(JavaSparkContext jsc, SparkSession session, String path) {
+    Dataset<Row> df1 = session.read().json(path);
 
-    Dataset<Row> df2 = sqlContext.read().format("json").option("samplingRatio", "1.0").load(path);
+    Dataset<Row> df2 = session.read().format("json").option("samplingRatio", "1.0").load(path);
 
     JavaRDD<String> jsonRDD = jsc.textFile(path);
-    Dataset<Row> df3 = sqlContext.read().json(jsonRDD);
+    Dataset<Row> df3 = session.read().json(jsonRDD);
 
     return df1;
   }
 
-  public static Dataset<Row> jsonLoadFromRDD(SQLContext sqlContext, JavaRDD<String> input) {
+  public static Dataset<Row> jsonLoadFromRDD(SparkSession session, JavaRDD<String> input) {
     JavaRDD<String> rdd = input.filter(e -> e.contains("panda"));
-    Dataset<Row> df = sqlContext.read().json(rdd);
+    Dataset<Row> df = session.read().json(rdd);
     return df;
   }
 
@@ -147,10 +130,10 @@ public static Dataset<Row> minMeanSizePerZip(Dataset<Row> pandas) {
   }
 
   public static Dataset<Row> simpleSqlExample(Dataset<Row> pandas) {
-    SQLContext sqlContext = pandas.sqlContext();
+    SparkSession session = SparkSession.builder().getOrCreate();
 
     pandas.registerTempTable("pandas");
-    Dataset<Row> miniPandas = sqlContext.sql("SELECT * FROM pandas WHERE pandaSize < 12");
+    Dataset<Row> miniPandas = session.sql("SELECT * FROM pandas WHERE pandaSize < 12");
 
     return miniPandas;
   }
diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala
index e9e708d..def3e08 100644
--- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala
+++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala
@@ -348,18 +348,19 @@ object HappyPandas {
    * Cut the lineage of a DataFrame which has too long a query plan.
    */
   def cutLineage(df: DataFrame): DataFrame = {
-    val sqlCtx = df.sqlContext
+    val session = SparkSession.builder.getOrCreate()
+    import session.implicits._
     //tag::cutLineage[]
     val rdd = df.rdd
     rdd.cache()
-    sqlCtx.createDataFrame(rdd, df.schema)
+    session.createDataFrame(rdd, df.schema)
     //end::cutLineage[]
   }
 
   // Self join
   def selfJoin(df: DataFrame): DataFrame = {
-    val sqlCtx = df.sqlContext
-    import sqlCtx.implicits._
+    val session = SparkSession.builder.getOrCreate()
+    import session.implicits._
     //tag::selfJoin[]
     val joined = df.as("a").join(df.as("b")).where($"a.name" === $"b.name")
     //end::selfJoin[]
diff --git a/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala b/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala
index 6b20271..8e482bf 100644
--- a/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala
+++ b/core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala
@@ -8,7 +8,7 @@ import org.apache.spark.sql.catalyst.optimizer._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.trees.TreePattern._
-import org.apache.spark.sql.catalyst.expressions.{And, IsNotNull, NullIntolerant}
+import org.apache.spark.sql.catalyst.expressions.{And, IsNotNull}
 
 object NullabilityFilterOptimizer extends Rule[LogicalPlan] {
 
diff --git a/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala
index b233693..7f63ef8 100644
--- a/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala
+++ b/core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala
@@ -17,7 +17,6 @@ import com.highperformancespark.examples.dataframe._
 
 object SimplePipeline {
   def constructAndSetParams(df: DataFrame) = {
-    val sqlCtx = df.sqlContext
     //tag::constructSetParams[]
     val hashingTF = new HashingTF()
     hashingTF.setInputCol("input")
@@ -26,7 +25,6 @@
   }
 
   def constructSimpleTransformer(df: DataFrame) = {
-    val sqlCtx = df.sqlContext
     //tag::simpleTransformer[]
     val hashingTF = new HashingTF()
     // We don't set the output column here so the default output column of
@@ -62,7 +60,6 @@
   }
 
   def constructSimpleEstimator(df: DataFrame) = {
-    val sqlCtx = df.sqlContext
     //tag::simpleNaiveBayes[]
     val nb = new NaiveBayes()
     nb.setLabelCol("happy")
diff --git a/env_setup.sh b/env_setup.sh
index a9b9e0d..f31f427 100755
--- a/env_setup.sh
+++ b/env_setup.sh
@@ -4,7 +4,7 @@ set -ex
 
 # Download Spark and iceberg if not present
 SPARK_MAJOR=${SPARK_MAJOR:-"3.5"}
-SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.2"}
+SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.3"}
 SCALA_VERSION=${SCALA_VERSION:-"2.13"}
 HADOOP_VERSION="3"
 SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
@@ -13,7 +13,7 @@ if [ "$SCALA_VERSION" = "2.13" ]; then
   SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3-scala2.13.tgz"
   SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala2.13"
 fi
-ICEBERG_VERSION=${ICEBERG_VERSION:-"1.6.0"}
+ICEBERG_VERSION=${ICEBERG_VERSION:-"1.9.2"}
 if [ ! -f "${SPARK_FILE}" ]; then
   SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}"
   SPARK_ARCHIVE_DIST_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_FILE}"
diff --git a/run_pyspark_examples.sh b/run_pyspark_examples.sh
index 6ab0546..7e0818e 100755
--- a/run_pyspark_examples.sh
+++ b/run_pyspark_examples.sh
@@ -42,8 +42,12 @@ function check_fail () {
 
 EXAMPLE_JAR="./core/target/scala-2.13/core-assembly-0.1.0-SNAPSHOT.jar"
 
+pip install setuptools
+
+# Iceberg JAR not yet available for Spark 4.
 if [ ! -f "${EXAMPLE_JAR}" ]; then
-  sbt core/assembly
+  rm ./core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala # temp hack no merge in Spark 3.
+  sbt core/assembly -DsparkVersion="${SPARK_VERSION}"
 fi
 
 if [ ! -f "${EXAMPLE_JAR}" ]; then

From 96cdeb4759d05fb4faef4cf88e54ca6c26ab5f06 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Mon, 1 Sep 2025 22:11:56 -0700
Subject: [PATCH 52/53] Note re: updated version (#141)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3388374..b230d38 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # high-performance-spark-examples
 Examples for High Performance Spark
 
-We are in the progress of updata this for Spark 3.5+ and the 2ed edition of our book!
+We are in the process of updating this for Spark 4 (some parts depending on external libraries like Iceberg, Comet, etc. are still 3.X) and the 2nd edition of our book!
 
 # Building

From a54b9f2bec2fbadd80a5a4c5443d2b33c2e4eae9 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Fri, 28 Nov 2025 01:15:53 -0800
Subject: [PATCH 53/53] Resource profile ex (#143)

---
 .../tools/ResourceProfileEx.scala | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala

diff --git a/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala b/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala
new file mode 100644
index 0000000..21b7afa
--- /dev/null
+++ b/core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala
@@ -0,0 +1,41 @@
+package com.highperformancespark.examples.gpu
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.resource._
+import org.apache.spark.resource.ResourceProfileBuilder
+import org.apache.spark.TaskContext
+
+object GPUResourceProfileExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession.builder()
+      .appName("GPUResourceProfileExample")
+      .getOrCreate()
+    run(spark)
+  }
+
+  def run(spark: SparkSession) = {
+    val sc = spark.sparkContext
+    //tag::gpuResourceProfileExample[]
+    // Create a resource profile requesting 2 NVIDIA GPUs per executor and 1 per task
+    val gpuResourceProfile = new ResourceProfileBuilder()
+      .require(new ExecutorResourceRequests().resource(
+        "gpu", 2, vendor="nvidia",
+        discoveryScript="/opt/spark/bin/getGpusResources.sh" // See sample in Spark repo
+      ))
+      .require(new TaskResourceRequests().resource("gpu", 1))
+      .build()
+
+    // Use resource profile to run on a machine with GPUs.
+    val rdd = sc.parallelize(1 to 4, 4)
+      .withResources(gpuResourceProfile)
+      .map { i =>
+        // Do some special GPU stuff here my friend
+        i
+      }
+    //end::gpuResourceProfileExample[]
+
+    rdd.collect().foreach(println)
+
+    spark.stop()
+  }
+}
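A follow-up note on the resource profile example just above: ResourceProfileEx.scala imports TaskContext but never calls it. If you want each task to report which GPU addresses it was actually assigned, something along these lines should work inside run(). This is a minimal sketch only, assuming the same sc and gpuResourceProfile from the example and the standard TaskContext.resources() lookup; rddWithGpuInfo is an illustrative name, not part of the patch.

    // Sketch: same parallelize + withResources pattern as above, but each task
    // looks up the GPU addresses Spark granted it under the "gpu" resource name.
    // On a CPU-only run the lookup is simply empty and we report "none".
    val rddWithGpuInfo = sc.parallelize(1 to 4, 4)
      .withResources(gpuResourceProfile)
      .map { i =>
        val gpuAddrs = Option(TaskContext.get())
          .flatMap(_.resources().get("gpu"))
          .map(_.addresses.mkString(","))
          .getOrElse("none")
        (i, gpuAddrs)
      }
    rddWithGpuInfo.collect().foreach(println)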