diff --git a/.gitignore b/.gitignore
index c58d83b..4a8e38c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,19 @@ project/plugins/project/
 # Scala-IDE specific
 .scala_dependencies
 .worksheet
+.idea/
+
+# emacs stuff
+\#*\#
+\.\#*
+*~
+sbt/*launch*.jar
+
+# python
+*.pyc
+
+# native
+*.o
+*.so
+*.so.0.0.0
+*.so.0
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..58147d3
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,64 @@
+language: scala
+sudo: false
+cache:
+  directories:
+    - $HOME/.ivy2
+    - $HOME/spark
+    - $HOME/.cache/pip
+    - $HOME/.pip-cache
+    - $HOME/.sbt/launchers
+    - $HOME/perl5
+scala:
+  - 2.11.6
+jdk:
+  - oraclejdk8
+r:
+  - release
+addons:
+  apt:
+    sources:
+      - ubuntu-toolchain-r-test
+      - ppa:marutter/rdev
+    packages:
+      - gfortran
+      - gcc
+      - binutils
+      - python-pip
+      - python-pandas
+      - python-numpy
+      - gfortran
+      - cmake
+      - perl
+      - cpanminus
+      - r-base
+      - libcurl4-gnutls-dev
+      - libxml2-dev
+      - libssl-dev
+      - r-base-dev
+      - axel
+r_packages:
+  - Imap
+before_install:
+  - # Setup Python
+  - pip install --user codecov unittest2 nose pep8 pylint
+  - # Setup perl
+  - cpanm --force --local-lib $HOME/perl5 --quiet --notest Pithub || cat ~/.cpanm/build.log
+  - cd ./src/main/perl; cpanm --local-lib $HOME/perl5 --force --quiet --installdeps --notest .; cd ../../../
+  - PATH="$HOME/perl5/bin${PATH:+:${PATH}}"; export PATH;
+  - PERL5LIB=":$HOME/perl5/lib/perl5${PERL5LIB:+:${PERL5LIB}}"; export PERL5LIB;
+  - PERL_LOCAL_LIB_ROOT="$HOME/perl5${PERL_LOCAL_LIB_ROOT:+:${PERL_LOCAL_LIB_ROOT}}"; export PERL_LOCAL_LIB_ROOT;
+  - PERL_MB_OPT="--install_base \"$HOME/perl5\""; export PERL_MB_OPT;
+  - PERL_MM_OPT="INSTALL_BASE=$HOME/perl5"; export PERL_MM_OPT;
+script:
+  - "export SPARK_CONF_DIR=./log4j/"
+  - sbt clean coverage compile package assembly test || (rm -rf ~/.ivy2 ~/.m2 && sbt clean coverage compile package test)
+  - "[ -f spark ] || mkdir spark && cd spark && axel http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz && cd .."
+  - "tar -xf ./spark/spark-2.1.0-bin-hadoop2.7.tgz"
+  - "export SPARK_HOME=`pwd`/spark-2.1.0-bin-hadoop2.7"
+  - "export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH"
+  - "PYSPARK_SUBMIT_ARGS='--jars ./target/examples-assembly-0.0.1.jar pyspark-shell' nosetests --with-doctest --doctest-options=+ELLIPSIS --logging-level=INFO --detailed-errors --verbosity=2 --with-coverage --cover-html-dir=./htmlcov"
+  - # $SPARK_HOME/bin/spark-submit ./src/main/r/wc.R $SPARK_HOME/README.md
+  - # $SPARK_HOME/bin/spark-submit ./src/main/r/dapply.R
+after_success:
+  - sbt coverageReport || sbt update coverageReport
+  - codecov
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 8f71f43..80f405b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,6 @@
+Individual components under resources are available under their own licenses.
+ * MySQL connector is GPL
+The source code in this repo is available under the Apache License
 Apache License
 Version 2.0, January 2004
 http://www.apache.org/licenses/
diff --git a/README.md b/README.md
index a7f4184..551928f 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,10 @@
 # high-performance-spark-examples
 Examples for High Performance Spark
+
+# Building
+
+Most of the examples can be built with sbt; the C and Fortran components additionally depend on gcc, g77, and cmake.
+
+# Tests
+
+The full test suite depends on having the C and Fortran components built, as well as a local R installation available.
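The README and .travis.yml above describe the build and test flow only in prose; the following is a minimal local sketch of the same steps, assuming sbt, gcc/gfortran, cmake, wget, and Python with nose are already installed. The Spark version, download URL, and jar path simply mirror the CI configuration above and may need adjusting for other environments.

# build the Scala artifacts and run the JVM test suite
sbt clean compile package assembly test
# fetch a Spark distribution for the PySpark doctests (same URL as .travis.yml)
wget http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz
tar -xf spark-2.1.0-bin-hadoop2.7.tgz
export SPARK_HOME=`pwd`/spark-2.1.0-bin-hadoop2.7
export PYTHONPATH=$SPARK_HOME/python:`ls -1 $SPARK_HOME/python/lib/py4j-*-src.zip`:$PYTHONPATH
# run the Python doctests with the assembly jar on the classpath
PYSPARK_SUBMIT_ARGS='--jars ./target/examples-assembly-0.0.1.jar pyspark-shell' nosetests --with-doctest --doctest-options=+ELLIPSIS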
diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..d8be93f --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,27 @@ +version: '{build}' + +platform: + - x86 + - x64 + +environment: + matrix: + - JAVA_HOME: C:\Program Files\Java\jdk1.8.0 + + +install: +- ps: Start-FileDownload 'http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/0.13.9/sbt-launch.jar' +- xcopy sbt-launch.jar sbt\ +- del build.sbt +- copy build_windows.sbt build.sbt + +build_script: +- sbt\sbt clean compile + +test_script: +- sbt\sbt "testOnly com.highperformancespark.examples.tools.FilterInvalidPandasSuite" + +cache: +- C:\Users\appveyor\.ivy2 +- C:\Users\appveyor\.m2 +- C:\Users\appveyor\.sbt \ No newline at end of file diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000..35b1508 --- /dev/null +++ b/build.sbt @@ -0,0 +1,97 @@ +organization := "com.highperformancespark" + +name := "examples" + +publishMavenStyle := true + +version := "0.0.1" + +scalaVersion := "2.11.6" +scalaVersion in ThisBuild := "2.11.6" +ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) } + +crossScalaVersions := Seq("2.11.6") + +javacOptions ++= Seq("-source", "1.8", "-target", "1.8") + +//tag::sparkVersion[] +sparkVersion := "2.2.0" +//end::sparkVersion[] + +//tag::sparkComponents[] +sparkComponents ++= Seq("core") +//end::sparkComponents[] +//tag::sparkExtraComponents[] +sparkComponents ++= Seq("streaming", "mllib") +//end::sparkExtraComponents[] +//tag::addSQLHiveComponent[] +sparkComponents ++= Seq("sql", "hive", "hive-thriftserver", "hive-thriftserver") +//end::addSQLHiveComponent[] + +parallelExecution in Test := false + +fork := true + +javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") + +// additional libraries +libraryDependencies ++= Seq( + "org.scalatest" %% "scalatest" % "3.0.1", + "org.scalacheck" %% "scalacheck" % "1.13.4", + "junit" % "junit" % "4.12", + "junit" % "junit" % "4.11", + "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.7.2", + "com.novocode" % "junit-interface" % "0.11" % "test->default", + //tag::scalaLogging[] + "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0", + //end::scalaLogging[] + "org.codehaus.jackson" % "jackson-core-asl" % "1.8.8", + "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", + "org.codehaus.jackson" % "jackson-core-asl" % "1.9.13", + "org.codehaus.jackson" % "jackson-mapper-asl" % "1.9.13", + "net.java.dev.jna" % "jna" % "4.2.2") + + +scalacOptions ++= Seq("-deprecation", "-unchecked") + +pomIncludeRepository := { x => false } + +resolvers ++= Seq( + "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", + "Spray Repository" at "http://repo.spray.cc/", + "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", + "Akka Repository" at "http://repo.akka.io/releases/", + "Twitter4J Repository" at "http://twitter4j.org/maven2/", + "Apache HBase" at "https://repository.apache.org/content/repositories/releases", + "Twitter Maven Repo" at "http://maven.twttr.com/", + "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", + "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", + "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", + "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", + "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", + Resolver.sonatypeRepo("public"), + 
Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), + "jodersky" at "https://dl.bintray.com/jodersky/maven/" +) + +licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) + +mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => + { + case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard + case m if m.startsWith("META-INF") => MergeStrategy.discard + case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first + case PathList("org", "apache", xs @ _*) => MergeStrategy.first + case PathList("org", "jboss", xs @ _*) => MergeStrategy.first + case "log4j.properties" => MergeStrategy.discard + case "about.html" => MergeStrategy.rename + case "reference.conf" => MergeStrategy.concat + case _ => MergeStrategy.first + } +} + +// JNI + +enablePlugins(JniNative) + +sourceDirectory in nativeCompile := sourceDirectory.value diff --git a/build_windows.sbt b/build_windows.sbt new file mode 100644 index 0000000..b698ab9 --- /dev/null +++ b/build_windows.sbt @@ -0,0 +1,91 @@ +organization := "com.highperformancespark" + +name := "examples" + +publishMavenStyle := true + +version := "0.0.1" + +scalaVersion := "2.11.6" +scalaVersion in ThisBuild := "2.11.6" +ivyScala := ivyScala.value map { _.copy(overrideScalaVersion = true) } + +crossScalaVersions := Seq("2.11.6") + +javacOptions ++= Seq("-source", "1.8", "-target", "1.8") + +//tag::sparkVersion[] +sparkVersion := "2.2.0" +//end::sparkVersion[] + +//tag::sparkComponents[] +sparkComponents ++= Seq("core") +//end::sparkComponents[] +//tag::sparkExtraComponents[] +sparkComponents ++= Seq("streaming", "mllib") +//end::sparkExtraComponents[] +//tag::addSQLHiveComponent[] +sparkComponents ++= Seq("sql", "hive", "hive-thriftserver", "hive-thriftserver") +//end::addSQLHiveComponent[] + +parallelExecution in Test := false + +fork := true + +javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled", "-Djna.nosys=true") + +// additional libraries +libraryDependencies ++= Seq( + "org.scalatest" %% "scalatest" % "3.0.1", + "org.scalacheck" %% "scalacheck" % "1.13.4", + "junit" % "junit" % "4.12", + "junit" % "junit" % "4.11", + "com.holdenkarau" %% "spark-testing-base" % "2.2.0_0.7.2", + "com.novocode" % "junit-interface" % "0.11" % "test->default", + //tag::sacalLogging[] + "com.typesafe.scala-logging" %% "scala-logging" % "3.5.0", + //end::scalaLogging[] + "org.codehaus.jackson" % "jackson-core-asl" % "1.8.8", + "org.codehaus.jackson" % "jackson-mapper-asl" % "1.8.8", + "org.codehaus.jackson" % "jackson-core-asl" % "1.9.13", + "org.codehaus.jackson" % "jackson-mapper-asl" % "1.9.13", + "net.java.dev.jna" % "jna" % "4.2.2") + + +scalacOptions ++= Seq("-deprecation", "-unchecked") + +pomIncludeRepository := { x => false } + +resolvers ++= Seq( + "JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/", + "Spray Repository" at "http://repo.spray.cc/", + "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", + "Akka Repository" at "http://repo.akka.io/releases/", + "Twitter4J Repository" at "http://twitter4j.org/maven2/", + "Apache HBase" at "https://repository.apache.org/content/repositories/releases", + "Twitter Maven Repo" at "http://maven.twttr.com/", + "scala-tools" at "https://oss.sonatype.org/content/groups/scala-tools", + "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", + "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", + 
"Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", + "Mesosphere Public Repository" at "http://downloads.mesosphere.io/maven", + Resolver.sonatypeRepo("public"), + Resolver.bintrayRepo("jodersky", "sbt-jni-macros"), + "jodersky" at "https://dl.bintray.com/jodersky/maven/" +) + +licenses := Seq("Apache License 2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) + +mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => + { + case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard + case m if m.startsWith("META-INF") => MergeStrategy.discard + case PathList("javax", "servlet", xs @ _*) => MergeStrategy.first + case PathList("org", "apache", xs @ _*) => MergeStrategy.first + case PathList("org", "jboss", xs @ _*) => MergeStrategy.first + case "log4j.properties" => MergeStrategy.discard + case "about.html" => MergeStrategy.rename + case "reference.conf" => MergeStrategy.concat + case _ => MergeStrategy.first + } +} diff --git a/conf/log4j.properties b/conf/log4j.properties new file mode 100644 index 0000000..e90a817 --- /dev/null +++ b/conf/log4j.properties @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=ERROR, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. 
+log4j.logger.org.apache.spark.repl.Main=ERROR + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.spark-project.jetty=ERROR +log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/conf/sbtconfig.txt b/conf/sbtconfig.txt new file mode 100644 index 0000000..9f17943 --- /dev/null +++ b/conf/sbtconfig.txt @@ -0,0 +1,18 @@ + +# Set the java args to high + +-Xmx2048M + +-XX:MaxPermSize=2048m + +-XX:ReservedCodeCacheSize=128m + +-XX:+CMSClassUnloadingEnabled + +# Set the extra SBT options + +-Dsbt.log.format=true + +# JNA + +-Djna.nosys=true diff --git a/high_performance_pyspark/SQLLineage.py b/high_performance_pyspark/SQLLineage.py new file mode 100644 index 0000000..121f0b4 --- /dev/null +++ b/high_performance_pyspark/SQLLineage.py @@ -0,0 +1,74 @@ +""" +>>> df = rdd.toDF() +>>> df2 = cutLineage(df) +>>> df.head() == df2.head() +True +>>> df.schema == df2.schema +True +""" + +global df +global sc +global rdd +global spark + +from pyspark.context import SparkContext +from pyspark.sql import DataFrame, Row +from pyspark.sql.session import SparkSession + +# tag::cutLineage[] +def cutLineage(df): + """ + Cut the lineage of a DataFrame - used for iterative algorithms + + .. Note: This uses internal members and may break between versions + >>> df = rdd.toDF() + >>> cutDf = cutLineage(df) + >>> cutDf.count() + 3 + """ + jRDD = df._jdf.toJavaRDD() + jSchema = df._jdf.schema() + jRDD.cache() + sqlCtx = df.sql_ctx + try: + javaSqlCtx = sqlCtx._jsqlContext + except: + javaSqlCtx = sqlCtx._ssql_ctx + newJavaDF = javaSqlCtx.createDataFrame(jRDD, jSchema) + newDF = DataFrame(newJavaDF, sqlCtx) + return newDF +# end::cutLineage[] + +def _setupTest(): + globs = globals() + spark = SparkSession.builder \ + .master("local[4]") \ + .getOrCreate() + sc = spark._sc + sc.setLogLevel("ERROR") + globs['sc'] = sc + globs['spark'] = spark + globs['rdd'] = rdd = sc.parallelize( + [Row(field1=1, field2="row1"), + Row(field1=2, field2="row2"), + Row(field1=3, field2="row3")]) + return globs + +def _test(): + """ + Run the tests. + """ + import doctest + globs = _setupTest() + (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) + globs['sc'].stop() + if failure_count: + exit(-1) + +import sys +if __name__ == "__main__": + _test() +# Hack to support running in nose +elif sys.stdout != sys.__stdout__: + _setupTest() diff --git a/high_performance_pyspark/__init__.py b/high_performance_pyspark/__init__.py new file mode 100644 index 0000000..7741593 --- /dev/null +++ b/high_performance_pyspark/__init__.py @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+"""
+Python version of selected examples from High Performance Spark
+"""
+
+import os
+import sys
+
diff --git a/high_performance_pyspark/bad_pyspark.py b/high_performance_pyspark/bad_pyspark.py
new file mode 100644
index 0000000..46741dc
--- /dev/null
+++ b/high_performance_pyspark/bad_pyspark.py
@@ -0,0 +1,164 @@
+# This script triggers a number of different PySpark errors
+
+from pyspark import *
+from pyspark.sql.session import SparkSession
+
+global sc
+
+def nonExistentInput(sc):
+    """
+    Attempt to load non-existent input
+    >>> nonExistentInput(sc)
+    Traceback (most recent call last):
+    ...
+    Py4JJavaError:...
+    """
+    # tag::nonExistent[]
+    failedRdd = sc.textFile("file:///doesnotexist")
+    failedRdd.count()
+    # end::nonExistent[]
+
+def throwOuter(sc):
+    """
+    Trigger a divide-by-zero error in the second (outer) transformation
+    >>> throwOuter(sc)
+    Traceback (most recent call last):
+    ...
+    Py4JJavaError:...
+    """
+    # tag::throwOuter[]
+    data = sc.parallelize(range(10))
+    transform1 = data.map(lambda x: x + 1)
+    transform2 = transform1.map(lambda x: x / 0)
+    transform2.count()
+    # end::throwOuter[]
+
+def throwInner(sc):
+    """
+    Trigger a divide-by-zero error in the first (inner) transformation
+    >>> throwInner(sc)
+    Traceback (most recent call last):
+    ...
+    Py4JJavaError:...
+    """
+    # tag::throwInner[]
+    data = sc.parallelize(range(10))
+    transform1 = data.map(lambda x: x / 0)
+    transform2 = transform1.map(lambda x: x + 1)
+    transform2.count()
+    # end::throwInner[]
+
+# tag::rewrite[]
+def add1(x):
+    """
+    Add 1
+    >>> add1(2)
+    3
+    """
+    return x + 1
+
+def divZero(x):
+    """
+    Divide by zero (cause an error)
+    >>> divZero(2)
+    Traceback (most recent call last):
+    ...
+    ZeroDivisionError: integer division or modulo by zero
+    """
+    return x / 0
+
+def throwOuter2(sc):
+    """
+    Trigger a divide-by-zero error in the second (outer) transformation, using named functions
+    >>> throwOuter2(sc)
+    Traceback (most recent call last):
+    ...
+    Py4JJavaError:...
+    """
+    data = sc.parallelize(range(10))
+    transform1 = data.map(add1)
+    transform2 = transform1.map(divZero)
+    transform2.count()
+
+def throwInner2(sc):
+    """
+    Trigger a divide-by-zero error in the first (inner) transformation, using named functions
+    >>> throwInner2(sc)
+    Traceback (most recent call last):
+    ...
+    Py4JJavaError:...
+    """
+    data = sc.parallelize(range(10))
+    transform1 = data.map(divZero)
+    transform2 = transform1.map(add1)
+    transform2.count()
+# end::rewrite[]
+
+def throwInner3(sc):
+    """
+    Catch the divide-by-zero error inside the transformation and count rejected records with an accumulator
+    >>> throwInner3(sc)
+    Reject 10
+    """
+    data = sc.parallelize(range(10))
+    rejectedCount = sc.accumulator(0)
+    def loggedDivZero(x):
+        import logging
+        try:
+            return [x / 0]
+        except Exception as e:
+            rejectedCount.add(1)
+            logging.warning("Error found " + repr(e))
+            return []
+    transform1 = data.flatMap(loggedDivZero)
+    transform2 = transform1.map(add1)
+    transform2.count()
+    print("Reject " + str(rejectedCount.value))
+
+
+def runOutOfMemory(sc):
+    """
+    Run out of memory on the workers.
+    In standalone mode this results in a memory error, but on YARN it may trigger YARN container
+    overhead errors.
+    >>> runOutOfMemory(sc)
+    Traceback (most recent call last):
+    ...
+    Py4JJavaError:...
+    """
+    # tag::worker_oom[]
+    data = sc.parallelize(range(10))
+    def generate_too_much(itr):
+        return range(10000000000000)
+    itr = data.flatMap(generate_too_much)
+    itr.count()
+    # end::worker_oom[]
+
+def _setupTest():
+    globs = globals()
+    spark = SparkSession.builder \
+        .master("local[4]") \
+        .getOrCreate()
+    sc = spark._sc
+    globs['sc'] = sc
+    return globs
+
+def _test():
+    """
+    Run the tests.
+    Note this will print a lot of error messages to stderr since we don't capture the JVM subprocess
+    stdout/stderr for doctests.
+    """
+    import doctest
+    globs = _setupTest()
+    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
+    globs['sc'].stop()
+    if failure_count:
+        exit(-1)
+
+import sys
+if __name__ == "__main__":
+    _test()
+# Hack to support running in nose
+elif sys.stdout != sys.__stdout__:
+    _setupTest()
diff --git a/high_performance_pyspark/simple_perf.py b/high_performance_pyspark/simple_perf.py
new file mode 100644
index 0000000..773ad3e
--- /dev/null
+++ b/high_performance_pyspark/simple_perf.py
@@ -0,0 +1,138 @@
+# When running this example make sure to include the built Scala jar:
+# $SPARK_HOME/bin/pyspark --jars ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar
+# This example illustrates how to interface Scala and Python code, but caution
+# should be taken as it depends on many private members that may change in
+# future releases of Spark.
+
+from pyspark.sql.types import *
+from pyspark.sql import *
+import timeit
+import time
+
+def generate_scale_data(sqlCtx, rows, numCols):
+    """
+    Generate scale data for the performance test.
+
+    This also illustrates calling custom Scala code from the driver.
+
+    .. Note: This depends on many internal methods and may break between versions.
+
+    # This assumes our jars have been added with export PYSPARK_SUBMIT_ARGS
+    >>> session = SparkSession.builder.getOrCreate()
+    >>> scaleData = generate_scale_data(session, 100L, 1)
+    >>> scaleData[0].count()
+    100
+    >>> scaleData[1].count()
+    100
+    >>> session.stop()
+    """
+    # tag::javaInterop[]
+    sc = sqlCtx._sc
+    # Get the SQL Context, 2.1, 2.0 and pre-2.0 syntax - yay internals :p
+    try:
+        try:
+            javaSqlCtx = sqlCtx._jsqlContext
+        except:
+            javaSqlCtx = sqlCtx._ssql_ctx
+    except:
+        javaSqlCtx = sqlCtx._jwrapped
+    jsc = sc._jsc
+    scalasc = jsc.sc()
+    gateway = sc._gateway
+    # Call a java method that gives us back an RDD of JVM Rows (Int, Double)
+    # While Python RDDs are wrapped Java RDDs (even of Rows) the contents are
+    # different, so we can't directly wrap this.
+    # This returns a Java RDD of Rows - normally it would be better to
+    # return a DataFrame directly, but for illustration we will work
+    # with an RDD of Rows.
+    java_rdd = (gateway.jvm.com.highperformancespark.examples.
+        tools.GenerateScalingData.
+ generateMiniScaleRows(scalasc, rows, numCols)) + # Schemas are serialized to JSON and sent back and forth + # Construct a Python Schema and turn it into a Java Schema + schema = StructType([ + StructField("zip", IntegerType()), + StructField("fuzzyness", DoubleType())]) + # 2.1 / pre-2.1 + try: + jschema = javaSqlCtx.parseDataType(schema.json()) + except: + jschema = sqlCtx._jsparkSession.parseDataType(schema.json()) + # Convert the Java RDD to Java DataFrame + java_dataframe = javaSqlCtx.createDataFrame(java_rdd, jschema) + # Wrap the Java DataFrame into a Python DataFrame + python_dataframe = DataFrame(java_dataframe, sqlCtx) + # Convert the Python DataFrame into an RDD + pairRDD = python_dataframe.rdd.map(lambda row: (row[0], row[1])) + return (python_dataframe, pairRDD) + # end::javaInterop[] + +def runOnDF(df): + result = df.groupBy("zip").avg("fuzzyness").count() + return result + +def runOnRDD(rdd): + result = rdd.map(lambda (x, y): (x, (y, 1))). \ + reduceByKey(lambda x, y: (x[0] + y [0], x[1] + y[1])). \ + count() + return result + +def groupOnRDD(rdd): + return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count() + +def run(sc, sqlCtx, scalingFactor, size): + """ + Run the simple perf test printing the results to stdout. + + >>> session = SparkSession.builder.getOrCreate() + >>> sc = session._sc + >>> run(sc, session, 5L, 1) + RDD: + ... + group: + ... + df: + ... + yay + >>> session.stop() + """ + (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size) + input_rdd.cache().count() + rddTimeings = timeit.repeat(stmt=lambda: runOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + groupTimeings = timeit.repeat(stmt=lambda: groupOnRDD(input_rdd), repeat=10, number=1, timer=time.time, setup='gc.enable()') + input_df.cache().count() + dfTimeings = timeit.repeat(stmt=lambda: runOnDF(input_df), repeat=10, number=1, timer=time.time, setup='gc.enable()') + print "RDD:" + print rddTimeings + print "group:" + print groupTimeings + print "df:" + print dfTimeings + print "yay" + +def parseArgs(args): + """ + Parse the args, no error checking. 
+
+    >>> parseArgs(["foobaz", "1", "2"])
+    (1, 2)
+    """
+    scalingFactor = int(args[1])
+    size = int(args[2])
+    return (scalingFactor, size)
+
+
+if __name__ == "__main__":
+
+    """
+    Usage: simple_perf_test scalingFactor size
+    """
+    import sys
+    from pyspark import SparkContext
+    from pyspark.sql import SQLContext
+    (scalingFactor, size) = parseArgs(sys.argv)
+    session = SparkSession.builder.appName("SimplePythonPerf").getOrCreate()
+    sc = session._sc
+    run(sc, session, scalingFactor, size)
+
+    sc.stop()
diff --git a/project/plugins.sbt b/project/plugins.sbt
new file mode 100644
index 0000000..26c430e
--- /dev/null
+++ b/project/plugins.sbt
@@ -0,0 +1,22 @@
+addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0")
+
+resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
+
+resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/"
+
+
+//tag::addSparkPackagesPlugin[]
+resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"
+
+addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.5")
+//end::addSparkPackagesPlugin[]
+
+//addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0")
+
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.0")
+
+//tag::sbtJNIPlugin[]
+addSbtPlugin("ch.jodersky" %% "sbt-jni" % "1.0.0-RC3")
+//end::sbtJNIPlugin[]
+
+addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0")
diff --git a/resources/mysql-connector-java-5.1.38.jar b/resources/mysql-connector-java-5.1.38.jar
new file mode 100644
index 0000000..be09493
Binary files /dev/null and b/resources/mysql-connector-java-5.1.38.jar differ
diff --git a/resources/rawpanda.json b/resources/rawpanda.json
new file mode 100644
index 0000000..1d9940d
--- /dev/null
+++ b/resources/rawpanda.json
@@ -0,0 +1,2 @@
+{"name":"mission","pandas":[{"id":1,"zip":"94110","pt":"giant", "happy":true,
+ "attributes":[0.4,0.5]}]}
diff --git a/sbt/sbt b/sbt/sbt
new file mode 100755
index 0000000..aac1085
--- /dev/null
+++ b/sbt/sbt
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This script launches sbt for this project. If present it uses the system
+# version of sbt. If there is no system version of sbt it attempts to download
+# sbt locally.
+SBT_VERSION=0.13.9
+URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
+URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
+JAR=sbt/sbt-launch-${SBT_VERSION}.jar
+
+# Download sbt launch jar if it hasn't been downloaded yet
+if [ !
-f ${JAR} ]; then + # Download + printf "Attempting to fetch sbt\n" + set -x + JAR_DL=${JAR}.part + if hash wget 2>/dev/null; then + (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} + elif hash axel 2>/dev/null; then + (axel ${URL1} -o ${JAR_DL} || axel ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR} + else + printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" + exit -1 + fi +fi +if [ ! -f ${JAR} ]; then + # We failed to download + printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" + exit -1 +fi +printf "Launching sbt from ${JAR}\n" +java \ + -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ + -jar ${JAR} \ + "$@" diff --git a/sbt/sbt.bat b/sbt/sbt.bat new file mode 100644 index 0000000..0f7a3e9 --- /dev/null +++ b/sbt/sbt.bat @@ -0,0 +1,95 @@ +@REM SBT launcher script +@REM +@REM Environment: +@REM JAVA_HOME - location of a JDK home dir (mandatory) +@REM SBT_OPTS - JVM options (optional) +@REM Configuration: +@REM sbtconfig.txt found in the SBT_HOME. + +@REM ZOMG! We need delayed expansion to build up CFG_OPTS later +@setlocal enabledelayedexpansion + +@echo off +set SBT_HOME=%~dp0 + +rem FIRST we load the config file of extra options. +set FN=%SBT_HOME%\..\conf\sbtconfig.txt +set CFG_OPTS= +FOR /F "tokens=* eol=# usebackq delims=" %%i IN ("%FN%") DO ( + set DO_NOT_REUSE_ME=%%i + rem ZOMG (Part #2) WE use !! here to delay the expansion of + rem CFG_OPTS, otherwise it remains "" for this loop. + set CFG_OPTS=!CFG_OPTS! !DO_NOT_REUSE_ME! +) + +rem poor man's jenv (which is not available on Windows) +IF DEFINED JAVA_HOMES ( + IF EXIST .java-version FOR /F %%A IN (.java-version) DO ( + SET JAVA_HOME=%JAVA_HOMES%\%%A + SET JDK_HOME=%JAVA_HOMES%\%%A + ) +) +rem must set PATH or wrong javac is used for java projects +IF DEFINED JAVA_HOME SET "PATH=%JAVA_HOME%\bin;%PATH%" + +rem users can set JAVA_OPTS via .jvmopts (sbt-extras style) +IF EXIST .jvmopts FOR /F %%A IN (.jvmopts) DO ( + SET JAVA_OPTS=%%A !JAVA_OPTS! +) + +rem We use the value of the JAVACMD environment variable if defined +set _JAVACMD=%JAVACMD% + +if "%_JAVACMD%"=="" ( + if not "%JAVA_HOME%"=="" ( + if exist "%JAVA_HOME%\bin\java.exe" set "_JAVACMD=%JAVA_HOME%\bin\java.exe" + ) +) + +if "%_JAVACMD%"=="" set _JAVACMD=java + +rem We use the value of the JAVA_OPTS environment variable if defined, rather than the config. +set _JAVA_OPTS=%JAVA_OPTS% +if "%_JAVA_OPTS%"=="" set _JAVA_OPTS=%CFG_OPTS% + +:args_loop +if "%~1" == "" goto args_end + +if "%~1" == "-jvm-debug" ( + set JVM_DEBUG=true + set /a JVM_DEBUG_PORT=5005 2>nul >nul +) else if "!JVM_DEBUG!" == "true" ( + set /a JVM_DEBUG_PORT=%1 2>nul >nul + if not "%~1" == "!JVM_DEBUG_PORT!" ( + set SBT_ARGS=!SBT_ARGS! %1 + ) +) else ( + set SBT_ARGS=!SBT_ARGS! %1 +) + +shift +goto args_loop +:args_end + +if defined JVM_DEBUG_PORT ( + set _JAVA_OPTS=!_JAVA_OPTS! -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=!JVM_DEBUG_PORT! 
+) + +call :run %SBT_ARGS% + +if ERRORLEVEL 1 goto error +goto end + +:run + +"%_JAVACMD%" %_JAVA_OPTS% %SBT_OPTS% -cp "%SBT_HOME%sbt-launch.jar" xsbt.boot.Boot %* +goto :eof + +:error +@endlocal +exit /B 1 + + +:end +@endlocal +exit /B 0 diff --git a/scalastyle-config.xml b/scalastyle-config.xml new file mode 100644 index 0000000..17d780a --- /dev/null +++ b/scalastyle-config.xml @@ -0,0 +1,117 @@ + + Scalastyle standard configuration + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/shell-scripts/launch-with-mysql-jdbc b/shell-scripts/launch-with-mysql-jdbc new file mode 100644 index 0000000..90ac352 --- /dev/null +++ b/shell-scripts/launch-with-mysql-jdbc @@ -0,0 +1,5 @@ +ASSEMBLY_JAR=./target/scala-2.10/examples_2.10.jar +CLASS="com.highperformancespark.dataframe.mysqlload" +#tag:[submit] +spark-submit --jars ./resources/mysql-connector-java-5.1.38.jar $ASSEMBLY_JAR $CLASS +#end:[submit] \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..e88b326 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,74 @@ +################################################################ +# A minimal CMake file that is compatible with sbt-jni # +# # +# All settings required by sbt-jni have been marked so, please # +# add/modify/remove settings to build your specific library. # +################################################################ + +cmake_minimum_required(VERSION 2.6) + +# Define project and related variables +# +project (high-performance-spark) + +# Enable fortan +enable_language (Fortran) +include(FortranCInterface) + + +# FFLAGS depend on the compiler +get_filename_component (Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME) + + +# Set versions and library name +# (required by sbt-jni) please use semantic versioning +# +set (VERSION_MAJOR 0) +set (VERSION_MINOR 0) +set (VERSION_PATCH 0) +# (required by sbt-jni) major version will always be appended to library name +set (LIB_NAME ${CMAKE_PROJECT_NAME}${VERSION_MAJOR}) + +# Command-line options +# +# (set by sbt-jni) +set (LIB_INSTALL_DIR lib CACHE PATH "Path in which to install libraries (equivalent to Autoconf --libdir).") +# (set by sbt-jni) +set (LIB_ENABLE_MINOR_VERSIONS ON CACHE BOOLEAN "Build libraries with minor and patch versions appended.") + +# Setup JNI +find_package(JNI REQUIRED) +if (JNI_FOUND) + message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") +endif() + +# Include directories +include_directories(.) +include_directories(./main/c) +include_directories(./main/c/include) +include_directories(${JNI_INCLUDE_DIRS}) + +# Setup main shared library +file(GLOB LIB_SRC + "*.c" + "*.cpp" + "./main/c/*.c" + "./main/c/*.cpp" + "./main/fortran/*.f*" +) +add_library(${LIB_NAME} SHARED ${LIB_SRC}) + +# By default, in a regular build, minor and patch versions are added to the generated files. +# When built through sbt-jni however, LIB_ENABLE_MINOR_VERSIONS is deactivated and only a +# major-versioned library file is built. 
+if (LIB_ENABLE_MINOR_VERSIONS) + set_target_properties( + ${LIB_NAME} + PROPERTIES + VERSION 0.${VERSION_MINOR}.${VERSION_PATCH} # major version always 0, it is included in library name + SOVERSION 0 + ) +endif() + +# Installation targets +install(TARGETS ${LIB_NAME} LIBRARY DESTINATION ${LIB_INSTALL_DIR}) diff --git a/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h b/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h new file mode 100644 index 0000000..75be264 --- /dev/null +++ b/src/main/c/include/com_highperformancespark_examples_ffi_SumJNI.h @@ -0,0 +1,21 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class com_highperformancespark_examples_ffi_SumJNI */ + +#ifndef _Included_com_highperformancespark_examples_ffi_SumJNI +#define _Included_com_highperformancespark_examples_ffi_SumJNI +#ifdef __cplusplus +extern "C" { +#endif +/* + * Class: com_highperformancespark_examples_ffi_SumJNI + * Method: sum + * Signature: ([I)I + */ +JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum + (JNIEnv *, jobject, jintArray); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/main/c/sum.c b/src/main/c/sum.c new file mode 100644 index 0000000..f571aad --- /dev/null +++ b/src/main/c/sum.c @@ -0,0 +1,9 @@ +#include "sum.h" + +int sum(int input[], int num_elem) { + int c, ret = 0; + for (c = 0; c < num_elem; c++) { + ret += input[c]; + } + return ret; +} diff --git a/src/main/c/sum.h b/src/main/c/sum.h new file mode 100644 index 0000000..d04be96 --- /dev/null +++ b/src/main/c/sum.h @@ -0,0 +1,6 @@ +#ifndef _SUM_H +#define _SUM_H + +int sum(int input[], int num_elem); + +#endif /* _SUM_H */ diff --git a/src/main/c/sum_wrapper.c b/src/main/c/sum_wrapper.c new file mode 100644 index 0000000..a499d3e --- /dev/null +++ b/src/main/c/sum_wrapper.c @@ -0,0 +1,16 @@ +#include "sum.h" +#include "include/com_highperformancespark_examples_ffi_SumJNI.h" +#include +#include + +/* + * Class: com_highperformancespark_examples_ffi_SumJNI + * Method: sum + * Signature: ([I)I + */ +JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum +(JNIEnv *env, jobject obj, jintArray ja) { + jsize size = (*env)->GetArrayLength(env, ja); + jint *a = (*env)->GetIntArrayElements(env, ja, 0); + return sum(a, size); +} diff --git a/src/main/c/sumf_wrapper.c b/src/main/c/sumf_wrapper.c new file mode 100644 index 0000000..43c7da0 --- /dev/null +++ b/src/main/c/sumf_wrapper.c @@ -0,0 +1,7 @@ +// Fortran routine +extern int sumf(int *, int[]); + +// Call the fortran code which expects by reference size +int wrap_sum(int input[], int size) { + return sumf(&size, input); +} diff --git a/src/main/fortran/sumf.f95 b/src/main/fortran/sumf.f95 new file mode 100644 index 0000000..04680b7 --- /dev/null +++ b/src/main/fortran/sumf.f95 @@ -0,0 +1,4 @@ + INTEGER FUNCTION SUMF(N,A) BIND(C, NAME='sumf') + INTEGER A(N) + SUMF=SUM(A) + END diff --git a/src/main/java/com/highperformancespark/examples/JavaInterop.java b/src/main/java/com/highperformancespark/examples/JavaInterop.java new file mode 100644 index 0000000..3e3ed6b --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/JavaInterop.java @@ -0,0 +1,38 @@ +package com.highperformancespark.examples; + +import scala.reflect.*; +import scala.Tuple2; + +import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import java.util.HashMap; +import 
java.util.Map; + +import static org.apache.spark.sql.functions.*; + +public class JavaInterop { + + //tag::realClassTag[] + public static JavaPairRDD wrapPairRDD( + RDD> rdd) { + // Construct the class tags + ClassTag strCt = ClassTag$.MODULE$.apply(String.class); + ClassTag longCt = ClassTag$.MODULE$.apply(scala.Long.class); + return new JavaPairRDD(rdd, strCt, longCt); + } + //end::realClassTag[] + + //tag::fakeClassTag[] + public static JavaPairRDD wrapPairRDDFakeCt( + RDD> rdd) { + // Construct the class tags by casting AnyRef - this would be more commonly done + // with generic or templated code where we can't explicitly construct the correct + // class tag as using fake class tags may result in degraded performance. + ClassTag fake = ClassTag$.MODULE$.AnyRef(); + return new JavaPairRDD(rdd, fake, fake); + } + //end::fakeClassTag[] +} diff --git a/src/main/java/com/highperformancespark/examples/WordCount.java b/src/main/java/com/highperformancespark/examples/WordCount.java new file mode 100644 index 0000000..8dd8442 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/WordCount.java @@ -0,0 +1,25 @@ +package com.highperformancespark.examples; + +//tag::wordCount[] +import scala.Tuple2; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import java.util.regex.Pattern; +import java.util.Arrays; + +public final class WordCount { + private static final Pattern pattern = Pattern.compile(" "); + + public static void main(String[] args) throws Exception { + JavaSparkContext jsc = new JavaSparkContext(); + JavaRDD lines = jsc.textFile(args[0]); + JavaRDD words = lines.flatMap(e -> Arrays.asList( + pattern.split(e)).iterator()); + JavaPairRDD wordsIntial = words.mapToPair( + e -> new Tuple2(e, 1)); + } +} +//end::wordCount[] diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java new file mode 100644 index 0000000..950f9e5 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaHappyPandas.java @@ -0,0 +1,210 @@ +package com.highperformancespark.examples.dataframe; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Column; +import org.apache.spark.sql.*; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.expressions.Window; +import org.apache.spark.sql.expressions.WindowSpec; +import org.apache.spark.sql.hive.HiveContext; + +import java.util.HashMap; +import java.util.Map; + +import static org.apache.spark.sql.functions.*; + +public class JavaHappyPandas { + + /** + * Creates SQLContext with an existing SparkContext. + */ + public static SQLContext sqlContext(JavaSparkContext jsc) { + SQLContext sqlContext = new SQLContext(jsc); + return sqlContext; + } + + /** + * Creates HiveContext with an existing SparkContext. + */ + public static HiveContext hiveContext(JavaSparkContext jsc) { + HiveContext hiveContext = new HiveContext(jsc); + return hiveContext; + } + + /** + * Illustrate loading some JSON data. 
+ */ + public static Dataset loadDataSimple(JavaSparkContext jsc, SQLContext sqlContext, String path) { + Dataset df1 = sqlContext.read().json(path); + + Dataset df2 = sqlContext.read().format("json").option("samplingRatio", "1.0").load(path); + + JavaRDD jsonRDD = jsc.textFile(path); + Dataset df3 = sqlContext.read().json(jsonRDD); + + return df1; + } + + public static Dataset jsonLoadFromRDD(SQLContext sqlContext, JavaRDD input) { + JavaRDD rdd = input.filter(e -> e.contains("panda")); + Dataset df = sqlContext.read().json(rdd); + return df; + } + + // Here will be some examples on PandaInfo DataFrame + + /** + * Gets the percentage of happy pandas per place. + * + * @param pandaInfo the input DataFrame + * @return Returns DataFrame of (place, percentage of happy pandas) + */ + public static Dataset happyPandasPercentage(Dataset pandaInfo) { + Dataset happyPercentage = pandaInfo.select(pandaInfo.col("place"), + (pandaInfo.col("happyPandas").divide(pandaInfo.col("totalPandas"))).as("percentHappy")); + return happyPercentage; + } + + /** + * Encodes pandaType to Integer values instead of String values. + * + * @param pandaInfo the input DataFrame + * @return Returns a DataFrame of pandaId and integer value for pandaType. + */ + public static Dataset encodePandaType(Dataset pandaInfo) { + Dataset encodedDF = pandaInfo.select(pandaInfo.col("id"), + when(pandaInfo.col("pt").equalTo("giant"), 0). + when(pandaInfo.col("pt").equalTo("red"), 1). + otherwise(2).as("encodedType")); + + return encodedDF; + } + + /** + * Gets places with happy pandas more than minHappinessBound. + */ + public static Dataset minHappyPandas(Dataset pandaInfo, int minHappyPandas) { + return pandaInfo.filter(pandaInfo.col("happyPandas").geq(minHappyPandas)); + } + + /** + * Find pandas that are sad. + */ + public static Dataset sadPandas(Dataset pandaInfo) { + return pandaInfo.filter(pandaInfo.col("happy").notEqual(true)); + } + + /** + * Find pandas that are happy and fuzzier than squishy. + */ + public static Dataset happyFuzzyPandas(Dataset pandaInfo) { + Dataset df = pandaInfo.filter( + pandaInfo.col("happy").and(pandaInfo.col("attributes").apply(0)).gt(pandaInfo.col("attributes").apply(1)) + ); + + return df; + } + + /** + * Gets places that contains happy pandas more than unhappy pandas. + */ + public static Dataset happyPandasPlaces(Dataset pandaInfo) { + return pandaInfo.filter(pandaInfo.col("happyPandas").geq(pandaInfo.col("totalPandas").divide(2))); + } + + /** + * Remove duplicate pandas by id. 
+ */ + public static Dataset removeDuplicates(Dataset pandas) { + Dataset df = pandas.dropDuplicates(new String[]{"id"}); + return df; + } + + public static Dataset describePandas(Dataset pandas) { + return pandas.describe(); + } + + public static Dataset maxPandaSizePerZip(Dataset pandas) { + return pandas.groupBy(pandas.col("zip")).max("pandaSize"); + } + + public static Dataset minMaxPandaSizePerZip(Dataset pandas) { + return pandas.groupBy(pandas.col("zip")).agg(min("pandaSize"), max("pandaSize")); + } + + public static Dataset minPandaSizeMaxAgePerZip(Dataset pandas) { + Map map = new HashMap<>(); + map.put("pandaSize", "min"); + map.put("age", "max"); + + Dataset df = pandas.groupBy(pandas.col("zip")).agg(map); + return df; + } + + public static Dataset minMeanSizePerZip(Dataset pandas) { + return pandas.groupBy(pandas.col("zip")).agg(min(pandas.col("pandaSize")), mean(pandas.col("pandaSize"))); + } + + public static Dataset simpleSqlExample(Dataset pandas) { + SQLContext sqlContext = pandas.sqlContext(); + pandas.registerTempTable("pandas"); + + Dataset miniPandas = sqlContext.sql("SELECT * FROM pandas WHERE pandaSize < 12"); + return miniPandas; + } + + /** + * Orders pandas by size ascending and by age descending. + * Pandas will be sorted by "size" first and if two pandas + * have the same "size" will be sorted by "age". + */ + public static Dataset orderPandas(Dataset pandas) { + return pandas.orderBy(pandas.col("pandaSize").asc(), pandas.col("age").desc()); + } + + public static Dataset computeRelativePandaSizes(Dataset pandas) { + //tag::relativePandaSizesWindow[] + WindowSpec windowSpec = Window + .orderBy(pandas.col("age")) + .partitionBy(pandas.col("zip")) + .rowsBetween(-10, 10); // can use rangeBetween for range instead + //end::relativePandaSizesWindow[] + + //tag::relativePandaSizesQuery[] + Column pandaRelativeSizeCol = pandas.col("pandaSize").minus(avg(pandas.col("pandaSize")).over(windowSpec)); + + return pandas.select(pandas.col("name"), pandas.col("zip"), pandas.col("pandaSize"), + pandas.col("age"), pandaRelativeSizeCol.as("panda_relative_size")); + //end::relativePandaSizesQuery[] + } + + public static void joins(Dataset df1, Dataset df2) { + //tag::innerJoin[] + // Inner join implicit + df1.join(df2, df1.col("name").equalTo(df2.col("name"))); + // Inner join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "inner"); + //end::innerJoin[] + + //tag::leftouterJoin[] + // Left outer join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "left_outer"); + //end::leftouterJoin[] + + //tag::rightouterJoin[] + // Right outer join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "right_outer"); + //end::rightouterJoin[] + + //tag::leftsemiJoin[] + // Left semi join explicit + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "left_semi"); + //end::leftsemiJoin[] + } + + public static Dataset selfJoin(Dataset df) { + return (df.as("a")).join(df.as("b")).where("a.name = b.name"); + } + +} diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java new file mode 100644 index 0000000..5abf17b --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java @@ -0,0 +1,140 @@ +package com.highperformancespark.examples.dataframe; + +import com.highperformancespark.examples.objects.JavaPandaPlace; +import com.highperformancespark.examples.objects.JavaRawPanda; +import 
org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.*; + +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; + +public class JavaLoadSave { + private SQLContext sqlContext; + + public JavaLoadSave(SQLContext sqlContext) { + this.sqlContext = sqlContext; + } + + //tag::createFromRDD[] + public Dataset createFromJavaBean(JavaRDD input) { + // Create DataFrame using Java Bean + Dataset df1 = sqlContext.createDataFrame(input, JavaPandaPlace.class); + + // Create DataFrame using JavaRDD + JavaRDD rowRDD = input.map(pm -> RowFactory.create(pm.getName(), + pm.getPandas().stream() + .map(pi -> RowFactory.create(pi.getId(), pi.getZip(), pi.isHappy(), pi.getAttributes())) + .collect(Collectors.toList()))); + + ArrayType pandasType = DataTypes.createArrayType(new StructType( + new StructField[]{ + new StructField("id", DataTypes.LongType, true, Metadata.empty()), + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("happy", DataTypes.BooleanType, true, Metadata.empty()), + new StructField("attributes", DataTypes.createArrayType(DataTypes.FloatType), true, Metadata.empty()) + } + )); + + StructType schema = new StructType(new StructField[]{ + new StructField("name", DataTypes.StringType, true, Metadata.empty()), + new StructField("pandas", pandasType, true, Metadata.empty()) + }); + + Dataset df2 = sqlContext.createDataFrame(rowRDD, schema); + return df2; + } + //end::createFromRDD[] + + //tag::createFromLocal[] + public Dataset createFromLocal(List input) { + return sqlContext.createDataFrame(input, PandaPlace.class); + } + //end::createFromLocal[] + + //tag::collectResults[] + public List collectDF(Dataset df) { + return df.collectAsList(); + } + //end::collectResults[] + + //tag::toRDD[] + public JavaRDD toRDD(Dataset input) { + JavaRDD rdd = input.javaRDD().map(row -> new JavaRawPanda(row.getLong(0), row.getString(1), + row.getString(2), row.getBoolean(3), row.getList(4))); + return rdd; + } + //end::toRDD[] + + //tag::partitionedOutput[] + public void writeOutByZip(Dataset input) { + input.write().partitionBy("zipcode").format("json").save("output/"); + } + //end::partitionedOutput[] + + //tag::saveAppend[] + public void writeAppend(Dataset input) { + input.write().mode(SaveMode.Append).save("output/"); + } + //end::saveAppend[] + + public Dataset createJDBC() { + //tag::createJDBC[] + Dataset df1 = sqlContext.read().jdbc("jdbc:dialect:serverName;user=user;password=pass", + "table", new Properties()); + + Dataset df2 = sqlContext.read().format("jdbc") + .option("url", "jdbc:dialect:serverName") + .option("dbtable", "table").load(); + + return df2; + //end::createJDBC[] + } + + public void writeJDBC(Dataset df) { + //tag::writeJDBC[] + df.write().jdbc("jdbc:dialect:serverName;user=user;password=pass", + "table", new Properties()); + + df.write().format("jdbc") + .option("url", "jdbc:dialect:serverName") + .option("user", "user") + .option("password", "pass") + .option("dbtable", "table").save(); + //end::writeJDBC[] + } + + //tag::loadParquet[] + public Dataset loadParquet(String path) { + // Configure Spark to read binary data as string, note: must be configured on SQLContext + sqlContext.setConf("spark.sql.parquet.binaryAsString", "true"); + + // Load parquet data using merge schema (configured through option) + Dataset df = sqlContext.read() + .option("mergeSchema", "true") + .format("parquet") + .load(path); + + return df; + } + //end::loadParquet[] + + 
//tag::writeParquet[] + public void writeParquet(Dataset df, String path) { + df.write().format("parquet").save(path); + } + //end::writeParquet[] + + //tag::loadHiveTable[] + public Dataset loadHiveTable() { + return sqlContext.read().table("pandas"); + } + //end::loadHiveTable[] + + //tag::saveManagedTable[] + public void saveManagedTable(Dataset df) { + df.write().saveAsTable("pandas"); + } + //end::saveManagedTable[] +} diff --git a/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java b/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java new file mode 100644 index 0000000..95e3dea --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java @@ -0,0 +1,78 @@ +package com.highperformancespark.examples.dataframe; + +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.expressions.MutableAggregationBuffer; +import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; +import org.apache.spark.sql.types.*; + +public class JavaUDFs { + + public static void setupUDFs(SQLContext sqlContext) { + //tag::basicUDF[] + sqlContext.udf() + .register("strlen", + (String s) -> s.length(), DataTypes.StringType); + //end::basicUDF[] + } + + public static void setupUDAFs(SQLContext sqlContext) { + + class Avg extends UserDefinedAggregateFunction { + + @Override + public StructType inputSchema() { + StructType inputSchema = + new StructType(new StructField[]{new StructField("value", DataTypes.DoubleType, true, Metadata.empty())}); + return inputSchema; + } + + @Override + public StructType bufferSchema() { + StructType bufferSchema = + new StructType(new StructField[]{ + new StructField("count", DataTypes.LongType, true, Metadata.empty()), + new StructField("sum", DataTypes.DoubleType, true, Metadata.empty()) + }); + + return bufferSchema; + } + + @Override + public DataType dataType() { + return DataTypes.DoubleType; + } + + @Override + public boolean deterministic() { + return true; + } + + @Override + public void initialize(MutableAggregationBuffer buffer) { + buffer.update(0, 0L); + buffer.update(1, 0.0); + } + + @Override + public void update(MutableAggregationBuffer buffer, Row input) { + buffer.update(0, buffer.getLong(0) + 1); + buffer.update(1, buffer.getDouble(1) + input.getDouble(0)); + } + + @Override + public void merge(MutableAggregationBuffer buffer1, Row buffer2) { + buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0)); + buffer1.update(1, buffer1.getDouble(1) + buffer2.getDouble(1)); + } + + @Override + public Object evaluate(Row buffer) { + return buffer.getDouble(1) / buffer.getLong(0); + } + } + + Avg average = new Avg(); + sqlContext.udf().register("ourAvg", average); + } +} diff --git a/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java b/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java new file mode 100644 index 0000000..adb7fd4 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java @@ -0,0 +1,7 @@ +package com.highperformancespark.examples.ffi; + +// tag::sumJNIJava[] +class SumJNIJava { + public static native Integer sum(Integer[] array); +} +// end::sumJNIJava[] diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java b/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java new file mode 100644 index 0000000..e3f5325 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java @@ -0,0 +1,29 @@ 
+package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaCoffeeShop implements Serializable { + private String zip; + private String name; + + public JavaCoffeeShop(String zip, String name) { + this.zip = zip; + this.name = name; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } +} \ No newline at end of file diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java new file mode 100644 index 0000000..c2b7847 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java @@ -0,0 +1,56 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaPandaInfo implements Serializable { + private String place; + private String pandaType; + private int happyPandas; + private int totalPandas; + + /** + * @param place name of place + * @param pandaType type of pandas in this place + * @param happyPandas number of happy pandas in this place + * @param totalPandas total number of pandas in this place + */ + public JavaPandaInfo(String place, String pandaType, int happyPandas, int totalPandas) { + this.place = place; + this.pandaType = pandaType; + this.happyPandas = happyPandas; + this.totalPandas = totalPandas; + } + + public String getPlace() { + return place; + } + + public void setPlace(String place) { + this.place = place; + } + + public String getPandaType() { + return pandaType; + } + + public void setPandaType(String pandaType) { + this.pandaType = pandaType; + } + + public int getHappyPandas() { + return happyPandas; + } + + public void setHappyPandas(int happyPandas) { + this.happyPandas = happyPandas; + } + + public int getTotalPandas() { + return totalPandas; + } + + public void setTotalPandas(int totalPandas) { + this.totalPandas = totalPandas; + } + +} diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java new file mode 100644 index 0000000..dc33d9c --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java @@ -0,0 +1,34 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; +import java.util.List; + +public class JavaPandaPlace implements Serializable { + private String name; + private List pandas; + + /** + * @param name place name + * @param pandas pandas in that place + */ + public JavaPandaPlace(String name, List pandas) { + this.name = name; + this.pandas = pandas; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getPandas() { + return pandas; + } + + public void setPandas(List pandas) { + this.pandas = pandas; + } +} \ No newline at end of file diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java b/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java new file mode 100644 index 0000000..f73e93f --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java @@ -0,0 +1,56 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; + +public class JavaPandas implements Serializable { + private String name; + private String 
zip; + private int pandaSize; + private int age; + + /** + * @param name name of panda + * @param zip zip code + * @param pandaSize size of panda in KG + * @param age age of panda + */ + public JavaPandas(String name, String zip, int pandaSize, int age) { + this.name = name; + this.zip = zip; + this.pandaSize = pandaSize; + this.age = age; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public int getPandaSize() { + return pandaSize; + } + + public void setPandaSize(int pandaSize) { + this.pandaSize = pandaSize; + } + + public int getAge() { + return age; + } + + public void setAge(int age) { + this.age = age; + } + +} diff --git a/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java b/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java new file mode 100644 index 0000000..7d2be17 --- /dev/null +++ b/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java @@ -0,0 +1,67 @@ +package com.highperformancespark.examples.objects; + +import java.io.Serializable; +import java.util.List; + +public class JavaRawPanda implements Serializable { + private long id; + private String zip; + private String pt; + private boolean happy; + private List attributes; + + /** + * @param id panda id + * @param zip zip code of panda residence + * @param pt Type of panda as a string + * @param happy if panda is happy + * @param attributes array of panada attributes + */ + public JavaRawPanda(long id, String zip, String pt, boolean happy, List attributes) { + this.attributes = attributes; + this.id = id; + this.zip = zip; + this.pt = pt; + this.happy = happy; + } + + public long getId() { + return id; + } + + public void setId(long id) { + this.id = id; + } + + public String getZip() { + return zip; + } + + public void setZip(String zip) { + this.zip = zip; + } + + public String getPt() { + return pt; + } + + public void setPt(String pt) { + this.pt = pt; + } + + public boolean isHappy() { + return happy; + } + + public void setHappy(boolean happy) { + this.happy = happy; + } + + public List getAttributes() { + return attributes; + } + + public void setAttributes(List attributes) { + this.attributes = attributes; + } +} \ No newline at end of file diff --git a/src/main/julia/setup.jl b/src/main/julia/setup.jl new file mode 100644 index 0000000..2d3068f --- /dev/null +++ b/src/main/julia/setup.jl @@ -0,0 +1,4 @@ +Pkg.clone("https://github.com/dfdx/Spark.jl") +Pkg.build("Spark") +# we also need latest master of JavaCall.jl +Pkg.checkout("JavaCall") \ No newline at end of file diff --git a/src/main/julia/wc.jl b/src/main/julia/wc.jl new file mode 100644 index 0000000..6239671 --- /dev/null +++ b/src/main/julia/wc.jl @@ -0,0 +1,5 @@ +using Spark +sc = SparkContext(master="local") +path = string("file:///", ENV["SPARK_HOME"], "/README.md") +txt = text_file(sc, path) +# Normally we would use a flatmap, but currently only has map_partitions diff --git a/src/main/perl/Changes b/src/main/perl/Changes new file mode 100644 index 0000000..ba8a0dc --- /dev/null +++ b/src/main/perl/Changes @@ -0,0 +1,5 @@ +Revision history for HighPerformanceSpark-Examples + +0.01 Date/time + First version, released on an unsuspecting world. 
+ diff --git a/src/main/perl/MANIFEST b/src/main/perl/MANIFEST new file mode 100644 index 0000000..93f2c6e --- /dev/null +++ b/src/main/perl/MANIFEST @@ -0,0 +1,9 @@ +Changes +lib/HighPerformanceSpark/Examples.pm +Makefile.PL +MANIFEST This list of files +README +t/00-load.t +t/manifest.t +t/pod-coverage.t +t/pod.t diff --git a/src/main/perl/Makefile.PL b/src/main/perl/Makefile.PL new file mode 100644 index 0000000..3735deb --- /dev/null +++ b/src/main/perl/Makefile.PL @@ -0,0 +1,28 @@ +use 5.006; +use strict; +use warnings; +use ExtUtils::MakeMaker; + +WriteMakefile( + NAME => 'HighPerformanceSpark::Examples', + AUTHOR => q{Holden Karau And Rachel Warren }, + VERSION_FROM => 'lib/HighPerformanceSpark/Examples.pm', + ABSTRACT_FROM => 'lib/HighPerformanceSpark/Examples.pm', + LICENSE => 'apache_2_0', + PL_FILES => {}, + EXE_FILES => [ 'ghinfo.pl' ], + MIN_PERL_VERSION => 5.006, + CONFIGURE_REQUIRES => { + 'ExtUtils::MakeMaker' => 0, + }, + BUILD_REQUIRES => { + 'Test::More' => 0, + }, + PREREQ_PM => { + 'Pithub' => 0.01033, + #'ABC' => 1.6, + #'Foo::Bar::Module' => 5.0401, + }, + dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, + clean => { FILES => 'HighPerformanceSpark-Examples-*' }, +); diff --git a/src/main/perl/README b/src/main/perl/README new file mode 100644 index 0000000..5df3f27 --- /dev/null +++ b/src/main/perl/README @@ -0,0 +1,61 @@ +HighPerformanceSpark-Examples + +The README is used to introduce the module and provide instructions on +how to install the module, any machine dependencies it may have (for +example C compilers and installed libraries) and any other information +that should be provided before the module is installed. + +A README file is required for CPAN modules since CPAN extracts the README +file from a module distribution so that people browsing the archive +can use it to get an idea of the module's uses. It is usually a good idea +to provide version information here so that people can decide whether +fixes for the module are worth downloading. + + +INSTALLATION + +To install this module, run the following commands: + + perl Makefile.PL + make + make test + make install + +SUPPORT AND DOCUMENTATION + +After installing, you can find documentation for this module with the +perldoc command. + + perldoc HighPerformanceSpark::Examples + +You can also look for information at: + + RT, CPAN's request tracker (report bugs here) + http://rt.cpan.org/NoAuth/Bugs.html?Dist=HighPerformanceSpark-Examples + + AnnoCPAN, Annotated CPAN documentation + http://annocpan.org/dist/HighPerformanceSpark-Examples + + CPAN Ratings + http://cpanratings.perl.org/d/HighPerformanceSpark-Examples + + Search CPAN + http://search.cpan.org/dist/HighPerformanceSpark-Examples/ + + +LICENSE AND COPYRIGHT + +Copyright (C) 2016 Holden Karau And Rachel Warren + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + L + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ diff --git a/src/main/perl/ghinfo.pl b/src/main/perl/ghinfo.pl new file mode 100644 index 0000000..b67b571 --- /dev/null +++ b/src/main/perl/ghinfo.pl @@ -0,0 +1,20 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use Pithub; +use Data::Dumper; + +# Find all of the commentors on an issue +my $user = $ENV{'user'}; +my $repo = $ENV{'repo'}; +my $p = Pithub->new(user => $user, repo => $repo); +while (my $id = <>) { + chomp ($id); + my $issue_comments = $p->issues->comments->list(issue_id => $id); + print $id; + while (my $comment = $issue_comments->next) { + print " ".$comment->{"user"}->{"login"}; + } + print "\n"; +} diff --git a/src/main/perl/ignore.txt b/src/main/perl/ignore.txt new file mode 100644 index 0000000..c2b781b --- /dev/null +++ b/src/main/perl/ignore.txt @@ -0,0 +1,18 @@ +Makefile +Makefile.old +Build +Build.bat +META.* +MYMETA.* +.build/ +_build/ +cover_db/ +blib/ +inc/ +.lwpcookies +.last_cover_stats +nytprof.out +pod2htm*.tmp +pm_to_blib +HighPerformanceSpark-Examples-* +HighPerformanceSpark-Examples-*.tar.gz diff --git a/src/main/perl/lib/HighPerformanceSpark/Examples.pm b/src/main/perl/lib/HighPerformanceSpark/Examples.pm new file mode 100644 index 0000000..fb5be0e --- /dev/null +++ b/src/main/perl/lib/HighPerformanceSpark/Examples.pm @@ -0,0 +1,117 @@ +package HighPerformanceSpark::Examples; + +use 5.006; +use strict; +use warnings; + +=head1 NAME + +HighPerformanceSpark::Examples - The great new HighPerformanceSpark::Examples! + +=head1 VERSION + +Version 0.01 + +=cut + +our $VERSION = '0.01'; + + +=head1 SYNOPSIS + +Quick summary of what the module does. + +Perhaps a little code snippet. + + use HighPerformanceSpark::Examples; + + my $foo = HighPerformanceSpark::Examples->new(); + ... + +=head1 EXPORT + +A list of functions that can be exported. You can delete this section +if you don't export anything, such as for a purely object-oriented module. + +=head1 SUBROUTINES/METHODS + +=head2 function1 + +=cut + +sub function1 { +} + +=head2 function2 + +=cut + +sub function2 { +} + +=head1 AUTHOR + +Holden Karau And Rachel Warren, C<< >> + +=head1 BUGS + +Please report any bugs or feature requests to C, or through +the web interface at L. I will be notified, and then you'll +automatically be notified of progress on your bug as I make changes. + + + + +=head1 SUPPORT + +You can find documentation for this module with the perldoc command. + + perldoc HighPerformanceSpark::Examples + + +You can also look for information at: + +=over 4 + +=item * RT: CPAN's request tracker (report bugs here) + +L + +=item * AnnoCPAN: Annotated CPAN documentation + +L + +=item * CPAN Ratings + +L + +=item * Search CPAN + +L + +=back + + +=head1 ACKNOWLEDGEMENTS + + +=head1 LICENSE AND COPYRIGHT + +Copyright 2016 Holden Karau And Rachel Warren. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + L + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ + +=cut + +1; # End of HighPerformanceSpark::Examples diff --git a/src/main/perl/t/00-load.t b/src/main/perl/t/00-load.t new file mode 100644 index 0000000..bd94e18 --- /dev/null +++ b/src/main/perl/t/00-load.t @@ -0,0 +1,13 @@ +#!perl -T +use 5.006; +use strict; +use warnings; +use Test::More; + +plan tests => 1; + +BEGIN { + use_ok( 'HighPerformanceSpark::Examples' ) || print "Bail out!\n"; +} + +diag( "Testing HighPerformanceSpark::Examples $HighPerformanceSpark::Examples::VERSION, Perl $], $^X" ); diff --git a/src/main/perl/t/manifest.t b/src/main/perl/t/manifest.t new file mode 100644 index 0000000..e0b558e --- /dev/null +++ b/src/main/perl/t/manifest.t @@ -0,0 +1,15 @@ +#!perl -T +use 5.006; +use strict; +use warnings; +use Test::More; + +unless ( $ENV{RELEASE_TESTING} ) { + plan( skip_all => "Author tests not required for installation" ); +} + +my $min_tcm = 0.9; +eval "use Test::CheckManifest $min_tcm"; +plan skip_all => "Test::CheckManifest $min_tcm required" if $@; + +ok_manifest(); diff --git a/src/main/perl/t/pod-coverage.t b/src/main/perl/t/pod-coverage.t new file mode 100644 index 0000000..f5728a5 --- /dev/null +++ b/src/main/perl/t/pod-coverage.t @@ -0,0 +1,24 @@ +#!perl -T +use 5.006; +use strict; +use warnings; +use Test::More; + +unless ( $ENV{RELEASE_TESTING} ) { + plan( skip_all => "Author tests not required for installation" ); +} + +# Ensure a recent version of Test::Pod::Coverage +my $min_tpc = 1.08; +eval "use Test::Pod::Coverage $min_tpc"; +plan skip_all => "Test::Pod::Coverage $min_tpc required for testing POD coverage" + if $@; + +# Test::Pod::Coverage doesn't require a minimum Pod::Coverage version, +# but older versions don't recognize some common documentation styles +my $min_pc = 0.18; +eval "use Pod::Coverage $min_pc"; +plan skip_all => "Pod::Coverage $min_pc required for testing POD coverage" + if $@; + +all_pod_coverage_ok(); diff --git a/src/main/perl/t/pod.t b/src/main/perl/t/pod.t new file mode 100644 index 0000000..4d3a0ce --- /dev/null +++ b/src/main/perl/t/pod.t @@ -0,0 +1,16 @@ +#!perl -T +use 5.006; +use strict; +use warnings; +use Test::More; + +unless ( $ENV{RELEASE_TESTING} ) { + plan( skip_all => "Author tests not required for installation" ); +} + +# Ensure a recent version of Test::Pod +my $min_tp = 1.22; +eval "use Test::Pod $min_tp"; +plan skip_all => "Test::Pod $min_tp required for testing POD" if $@; + +all_pod_files_ok(); diff --git a/src/main/perl/xt/boilerplate.t b/src/main/perl/xt/boilerplate.t new file mode 100644 index 0000000..7e97e3f --- /dev/null +++ b/src/main/perl/xt/boilerplate.t @@ -0,0 +1,57 @@ +#!perl -T +use 5.006; +use strict; +use warnings; +use Test::More; + +plan tests => 3; + +sub not_in_file_ok { + my ($filename, %regex) = @_; + open( my $fh, '<', $filename ) + or die "couldn't open $filename for reading: $!"; + + my %violated; + + while (my $line = <$fh>) { + while (my ($desc, $regex) = each %regex) { + if ($line =~ $regex) { + push @{$violated{$desc}||=[]}, $.; + } + } + } + + if (%violated) { + fail("$filename contains boilerplate text"); + diag "$_ appears on lines @{$violated{$_}}" for keys %violated; + } else { + pass("$filename contains no boilerplate text"); + } +} + +sub module_boilerplate_ok { + my ($module) = @_; + not_in_file_ok($module => + 'the great new $MODULENAME' => qr/ - The great new /, + 'boilerplate description' => qr/Quick summary of what the module/, + 'stub function definition' => qr/function[12]/, + ); +} + +TODO: { + local $TODO = "Need to replace the boilerplate text"; + + 
not_in_file_ok(README => + "The README is used..." => qr/The README is used/, + "'version information here'" => qr/to provide version information/, + ); + + not_in_file_ok(Changes => + "placeholder date/time" => qr(Date/time) + ); + + module_boilerplate_ok('lib/HighPerformanceSpark/Examples.pm'); + + +} + diff --git a/src/main/r/dapply.R b/src/main/r/dapply.R new file mode 100644 index 0000000..b12f1fc --- /dev/null +++ b/src/main/r/dapply.R @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +#tag::example[] +library(SparkR) + +# Setup SparkContext & SQLContext +sc <- sparkR.init(appName="high-performance-spark-wordcount-example") + +# Initialize SQLContext +sqlContext <- sparkRSQL.init(sc) + + +# Count the number of characters - note this fails on the text DF due to a bug. +df <- createDataFrame (sqlContext, + list(list(1L, 1, "1"), + list(2L, 2, "22"), + list(3L, 3, "333")), + c("a", "b", "c")) +resultingSchema <- structType(structField("length", "integer")) +result <- dapply(df, function(row) { + y <- list() + y <- cbind(y, nchar(row[[3]])) +}, resultingSchema) +showDF(result) +#end::example[] diff --git a/src/main/r/wc.R b/src/main/r/wc.R new file mode 100644 index 0000000..eedcf72 --- /dev/null +++ b/src/main/r/wc.R @@ -0,0 +1,60 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +args <- commandArgs(trailing = TRUE) + +if (length(args) != 1) { + print("Usage: wc.R ") + q("no") +} + +fileName <- args(1) + +#tag::example[] + +library(SparkR) + +# Setup SparkContext & SQLContext +sc <- sparkR.init(appName="high-performance-spark-wordcount-example") + +# Initialize SQLContext +sqlContext <- sparkRSQL.init(sc) + +# Load some simple data + +df <- read.text(fileName) + +# Split the words +words <- selectExpr(df, "split(value, \" \") as words") + +# Compute the count +explodedWords <- select(words, alias(explode(words$words), "words")) +wc <- agg(groupBy(explodedWords, "words"), "words" = "count") + + +# Attempting to push an array back fails +# resultingSchema <- structType(structField("words", "array")) +# words <- dapply(df, function(line) { +# y <- list() +# y[[1]] <- strsplit(line[[1]], " ") +# }, resultingSchema) +# Also attempting even the identity transformation on a DF from read.text fails +# in Spark 2.0-preview (although works fine on other DFs). + +# Display the result +showDF(wc) +#end::example[] diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala new file mode 100644 index 0000000..8aeb8eb --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala @@ -0,0 +1,343 @@ +/** + * Happy Panda Example for DataFrames. This computes the % of happy pandas and + * is a very contrived example (sorry!). + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +//tag::sparkSQLImports[] +import org.apache.spark.sql.{Dataset, DataFrame, SparkSession, Row} +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +//end::sparkSQLImports[] + +//tag::legacySparkSQLImports[] +import org.apache.spark.sql.SQLContext +//end::legacySparkSQLImports[] +//tag::legacySparkHiveImports[] +import org.apache.spark.sql.hive.HiveContext +import org.apache.spark.sql.hive.thriftserver._ +//end::legacySparkHiveImports[] + +object HappyPandas { + + /** + * Creates a SparkSession with Hive enabled + */ + def sparkSession(): SparkSession = { + //tag::createSparkSession[] + val session = SparkSession.builder() + .enableHiveSupport() + .getOrCreate() + // Import the implicits, unlike in core Spark the implicits are defined + // on the context. + import session.implicits._ + //end::createSparkSession[] + session + } + + /** + * Creates SQLContext with an existing SparkContext. + */ + def sqlContext(sc: SparkContext): SQLContext = { + //tag::createSQLContext[] + val sqlContext = new SQLContext(sc) + // Import the implicits, unlike in core Spark the implicits are defined + // on the context. + import sqlContext.implicits._ + //end::createSQLContext[] + sqlContext + } + + /** + * Creates HiveContext Spark with an existing SparkContext using hive. + */ + def hiveContext(sc: SparkContext): HiveContext = { + //tag::createHiveContext[] + val hiveContext = new HiveContext(sc) + // Import the implicits, unlike in core Spark the implicits are defined + // on the context. + import hiveContext.implicits._ + //end::createHiveContext[] + hiveContext + } + + /** + * Illustrate loading some JSON data. 
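+ * (In the second, "complex" variant the samplingRatio option controls what
+ * fraction of the input records Spark samples when inferring the JSON schema;
+ * 1.0 means every record is examined, the safest but slowest setting.)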
+ */ + def loadDataSimple(sc: SparkContext, session: SparkSession, path: String): + DataFrame = { + //tag::loadPandaJSONSimple[] + val df1 = session.read.json(path) + //end::loadPandaJSONSimple[] + //tag::loadPandaJSONComplex[] + val df2 = session.read.format("json") + .option("samplingRatio", "1.0").load(path) + //end::loadPandaJSONComplex[] + val jsonRDD = sc.textFile(path) + //tag::loadPandaJsonRDD[] + val df3 = session.read.json(jsonRDD) + //end::loadPandaJSONRDD[] + df1 + } + + def jsonLoadFromRDD(session: SparkSession, input: RDD[String]): DataFrame = { + //tag::loadPandaJSONRDD[] + val rdd: RDD[String] = input.filter(_.contains("panda")) + val df = session.read.json(rdd) + //end::loadPandaJSONRDD[] + df + } + + // Here will be some examples on PandaInfo DataFrame + + /** + * @param place name of place + * @param pandaType type of pandas in this place + * @param happyPandas number of happy pandas in this place + * @param totalPandas total number of pandas in this place + */ + case class PandaInfo( + place: String, + pandaType: String, + happyPandas: Integer, + totalPandas: Integer) + + /** + * Gets the percentage of happy pandas per place. + * + * @param pandaInfo the input DataFrame + * @return Returns DataFrame of (place, percentage of happy pandas) + */ + def happyPandasPercentage(pandaInfo: DataFrame): DataFrame = { + pandaInfo.select( + pandaInfo("place"), + (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy") + ) + } + + //tag::encodePandaType[] + /** + * Encodes pandaType to Integer values instead of String values. + * + * @param pandaInfo the input DataFrame + * @return Returns a DataFrame of pandaId and integer value for pandaType. + */ + def encodePandaType(pandaInfo: DataFrame): DataFrame = { + pandaInfo.select(pandaInfo("id"), + (when(pandaInfo("pt") === "giant", 0). + when(pandaInfo("pt") === "red", 1). + otherwise(2)).as("encodedType") + ) + } + //end::encodePandaType[] + + /** + * Gets places with happy pandas more than minHappinessBound. + */ + def minHappyPandas(pandaInfo: DataFrame, minHappyPandas: Int): DataFrame = { + pandaInfo.filter(pandaInfo("happyPandas") >= minHappyPandas) + } + + /** + * Extra the panda info from panda places and compute the squisheness of the panda + */ + def squishPandaFromPace(pandaPlace: DataFrame): DataFrame = { + //tag::selectExplode[] + val pandaInfo = pandaPlace.explode(pandaPlace("pandas")){ + case Row(pandas: Seq[Row]) => + pandas.map{ + case (Row( + id: Long, + zip: String, + pt: String, + happy: Boolean, + attrs: Seq[Double])) => + RawPanda(id, zip, pt, happy, attrs.toArray) + }} + pandaInfo.select( + (pandaInfo("attributes")(0) / pandaInfo("attributes")(1)) + .as("squishyness")) + //end::selectExplode[] + } + + /** + * Find pandas that are sad + */ + def sadPandas(pandaInfo: DataFrame): DataFrame = { + //tag::simpleFilter[] + pandaInfo.filter(pandaInfo("happy") !== true) + //end::simpleFilter[] + } + + /** + * Find pandas that are happy and fuzzier than squishy. + */ + def happyFuzzyPandas(pandaInfo: DataFrame): DataFrame = { + //tag::complexFilter[] + pandaInfo.filter( + pandaInfo("happy").and(pandaInfo("attributes")(0) > pandaInfo("attributes")(1)) + ) + //end::complexFilter[] + } + + /** + * Gets places that contains happy pandas more than unhappy pandas. + */ + def happyPandasPlaces(pandaInfo: DataFrame): DataFrame = { + pandaInfo.filter(pandaInfo("happyPandas") >= pandaInfo("totalPandas") / 2) + } + + + /** + * Remove duplicate pandas by id. 
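+ * (When several rows share an id, dropDuplicates keeps an arbitrary one of
+ * them, so the surviving values in the other columns are not deterministic.)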
+ */ + def removeDuplicates(pandas: DataFrame): DataFrame = { + //tag::dropDuplicatePandaIds[] + pandas.dropDuplicates(List("id")) + //end::dropDuplicatePandaIds[] + } + + /** + * @param name name of panda + * @param zip zip code + * @param pandaSize size of panda in KG + * @param age age of panda + */ + case class Pandas(name: String, zip: String, pandaSize: Integer, age: Integer) + + def describePandas(pandas: DataFrame) = { + //tag::pandaSizeRangeVarDescribe[] + // Compute the count, mean, stddev, min, max summary stats for all + // of the numeric fields of the provided panda infos. non-numeric + // fields (such as string (name) or array types) are skipped. + val df = pandas.describe() + // Collect the summary back locally + println(df.collect()) + //end::pandaSizeRangeVarDescribe[] + } + + //tag::maxPandaSizePerZip[] + def maxPandaSizePerZip(pandas: DataFrame): DataFrame = { + pandas.groupBy(pandas("zip")).max("pandaSize") + } + //end::maxPandaSizePerZip[] + + //tag::minMaxPandasSizePerZip[] + def minMaxPandaSizePerZip(pandas: DataFrame): DataFrame = { + pandas.groupBy(pandas("zip")).agg(min("pandaSize"), max("pandaSize")) + } + //end::minMaxPandasSizePerZip[] + + def minPandaSizeMaxAgePerZip(pandas: DataFrame): DataFrame = { + // this query can be written in two methods + + // 1 + pandas.groupBy(pandas("zip")).agg(("pandaSize", "min"), ("age", "max")) + + // 2 + pandas.groupBy(pandas("zip")).agg(Map("pandaSize" -> "min", "age" -> "max")) + } + + //tag::complexAggPerZip[] + def minMeanSizePerZip(pandas: DataFrame): DataFrame = { + // Compute the min and mean + pandas.groupBy(pandas("zip")).agg( + min(pandas("pandaSize")), mean(pandas("pandaSize"))) + } + //end::complexAggPerZip[] + + def simpleSqlExample(pandas: DataFrame): DataFrame = { + val session = pandas.sparkSession + //tag::pandasSQLQuery[] + pandas.registerTempTable("pandas") + val miniPandas = session.sql("SELECT * FROM pandas WHERE pandaSize < 12") + //end::pandasSQLQuery[] + miniPandas + } + + def startJDBCServer(hiveContext: HiveContext): Unit = { + //tag::startJDBC[] + hiveContext.setConf("hive.server2.thrift.port", "9090") + HiveThriftServer2.startWithContext(hiveContext) + //end::startJDBC[] + } + + /** + * Orders pandas by size ascending and by age descending. + * Pandas will be sorted by "size" first and if two pandas have the same "size" + * will be sorted by "age". 
+ */ + def orderPandas(pandas: DataFrame): DataFrame = { + //tag::simpleSort[] + pandas.orderBy(pandas("pandaSize").asc, pandas("age").desc) + //end::simpleSort[] + } + + def computeRelativePandaSizes(pandas: DataFrame): DataFrame = { + //tag::relativePandaSizesWindow[] + val windowSpec = Window + .orderBy(pandas("age")) + .partitionBy(pandas("zip")) + .rowsBetween(start = -10, end = 10) // can use rangeBetween for range instead + //end::relativePandaSizesWindow[] + + //tag::relativePandaSizesQuery[] + val pandaRelativeSizeCol = pandas("pandaSize") - + avg(pandas("pandaSize")).over(windowSpec) + + pandas.select(pandas("name"), pandas("zip"), pandas("pandaSize"), pandas("age"), + pandaRelativeSizeCol.as("panda_relative_size")) + //end::relativePandaSizesQuery[] + } + + // Join DataFrames of Pandas and Sizes with + def joins(df1: DataFrame, df2: DataFrame): Unit = { + + //tag::innerJoin[] + // Inner join implicit + df1.join(df2, df1("name") === df2("name")) + // Inner join explicit + df1.join(df2, df1("name") === df2("name"), "inner") + //end::innerJoin[] + + //tag::leftouterJoin[] + // Left outer join explicit + df1.join(df2, df1("name") === df2("name"), "left_outer") + //end::leftouterJoin[] + + //tag::rightouterJoin[] + // Right outer join explicit + df1.join(df2, df1("name") === df2("name"), "right_outer") + //end::rightouterJoin[] + + //tag::leftsemiJoin[] + // Left semi join explicit + df1.join(df2, df1("name") === df2("name"), "left_semi") + //end::leftsemiJoin[] + } + + /** + * Cut the lineage of a DataFrame which has too long a query plan. + */ + def cutLineage(df: DataFrame): DataFrame = { + val sqlCtx = df.sqlContext + //tag::cutLineage[] + val rdd = df.rdd + rdd.cache() + sqlCtx.createDataFrame(rdd, df.schema) + //end::cutLineage[] + } + + // Self join + def selfJoin(df: DataFrame): DataFrame = { + val sqlCtx = df.sqlContext + import sqlCtx.implicits._ + //tag::selfJoin[] + val joined = df.as("a").join(df.as("b")).where($"a.name" === $"b.name") + //end::selfJoin[] + joined + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala new file mode 100644 index 0000000..82be10f --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala @@ -0,0 +1,146 @@ +/** + * Load and save data to/from DataFrames + */ +package com.highperformancespark.examples.dataframe + +import java.util.Properties + +import org.apache.spark.SparkContext +import org.apache.spark.rdd._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +case class LoadSave(sc: SparkContext, session: SparkSession) { + import session.implicits._ + //tag::createFromRDD[] + def createFromCaseClassRDD(input: RDD[PandaPlace]) = { + // Create DataFrame explicitly using session and schema inference + val df1 = session.createDataFrame(input) + + // Create DataFrame using session implicits and schema inference + val df2 = input.toDF() + + // Create a Row RDD from our RDD of case classes + val rowRDD = input.map(pm => Row(pm.name, + pm.pandas.map(pi => Row(pi.id, pi.zip, pi.happy, pi.attributes)))) + + val pandasType = ArrayType(StructType(List( + StructField("id", LongType, true), + StructField("zip", StringType, true), + StructField("happy", BooleanType, true), + StructField("attributes", ArrayType(FloatType), true)))) + + // Create DataFrame explicitly with specified schema + val schema = StructType(List(StructField("name", StringType, true), + StructField("pandas", 
pandasType))) + + val df3 = session.createDataFrame(rowRDD, schema) + } + //end::createFromRDD[] + + //tag::createFromRDDBasic[] + def createFromCaseClassRDD(input: Seq[PandaPlace]) = { + val rdd = sc.parallelize(input) + // Create DataFrame explicitly using session and schema inference + val df1 = session.createDataFrame(input) + } + //end::createFromRDDBasic[] + + //tag::createGetSchema[] + def createAndPrintSchema() = { + val damao = RawPanda(1, "M1B 5K7", "giant", true, Array(0.1, 0.1)) + val pandaPlace = PandaPlace("toronto", Array(damao)) + val df = session.createDataFrame(Seq(pandaPlace)) + df.printSchema() + } + //end::createGetSchema[] + + //tag::createFromLocal[] + def createFromLocal(input: Seq[PandaPlace]) = { + session.createDataFrame(input) + } + //end::createFromLocal[] + + //tag::collectResults[] + def collectDF(df: DataFrame) = { + val result: Array[Row] = df.collect() + result + } + //end::collectResults[] + + //tag::toRDD[] + def toRDD(input: DataFrame): RDD[RawPanda] = { + val rdd: RDD[Row] = input.rdd + rdd.map(row => RawPanda(row.getAs[Long](0), row.getAs[String](1), + row.getAs[String](2), row.getAs[Boolean](3), row.getAs[Array[Double]](4))) + } + //end::toRDD[] + + //tag::partitionedOutput[] + def writeOutByZip(input: DataFrame): Unit = { + input.write.partitionBy("zipcode").format("json").save("output/") + } + //end::partitionedOutput[] + + //tag::saveAppend[] + def writeAppend(input: DataFrame): Unit = { + input.write.mode(SaveMode.Append).save("output/") + } + //end::saveAppend[] + + def createJDBC() = { + //tag::createJDBC[] + session.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", + "table", new Properties) + + session.read.format("jdbc") + .option("url", "jdbc:dialect:serverName") + .option("dbtable", "table").load() + //end::createJDBC[] + } + + def writeJDBC(df: DataFrame) = { + //tag::writeJDBC[] + df.write.jdbc("jdbc:dialect:serverName;user=user;password=pass", + "table", new Properties) + + df.write.format("jdbc") + .option("url", "jdbc:dialect:serverName") + .option("user", "user") + .option("password", "pass") + .option("dbtable", "table").save() + //end::writeJDBC[] + } + + //tag::loadParquet[] + def loadParquet(path: String): DataFrame = { + // Configure Spark to read binary data as string, + // note: must be configured on session. + session.conf.set("spark.sql.parquet.binaryAsString", "true") + + // Load parquet data using merge schema (configured through option) + session.read + .option("mergeSchema", "true") + .format("parquet") + .load(path) + } + //end::loadParquet[] + + //tag::writeParquet[] + def writeParquet(df: DataFrame, path: String) = { + df.write.format("parquet").save(path) + } + //end::writeParquet[] + + //tag::loadHiveTable[] + def loadHiveTable(): DataFrame = { + session.read.table("pandas") + } + //end::loadHiveTable[] + + //tag::saveManagedTable[] + def saveManagedTable(df: DataFrame): Unit = { + df.write.saveAsTable("pandas") + } + //end::saveManagedTable[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala new file mode 100644 index 0000000..2ccdd10 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala @@ -0,0 +1,144 @@ +/** + * A sample mixing relational & functional transformations with Datasets. 
+ */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +// Additional imports for using HiveContext +import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.thriftserver._ + +case class MiniPandaInfo(zip: String, size: Double) + +class MixedDataset(sqlCtx: SQLContext) { + import sqlCtx.implicits._ + + /** + * A sample function on a Dataset of RawPandas. + * + * This is contrived, since our reduction could also be done with SQL aggregates, + * but we can see the flexibility of being able to specify arbitrary Scala code. + */ + def happyPandaSums(ds: Dataset[RawPanda]): Double = { + ds.toDF().filter($"happy" === true).as[RawPanda]. + select($"attributes"(0).as[Double]). + reduce((x, y) => x + y) + } + + /** + * A sample function on a Dataset of RawPandas. + * Use the first attribute to deterimine if a panda is squishy. + */ + //tag::basicSelect[] + def squishyPandas(ds: Dataset[RawPanda]): Dataset[(Long, Boolean)] = { + ds.select($"id".as[Long], ($"attributes"(0) > 0.5).as[Boolean]) + } + //end::basicSelect[] + + /** + * Union happy and sad pandas + */ + //tag::basicUnion[] + def unionPandas(happyPandas: Dataset[RawPanda], sadPandas: Dataset[RawPanda]) = { + happyPandas.union(sadPandas) + } + //end::basicUnion[] + + /** + * Functional map + Dataset, sums the positive attributes for the pandas + */ + //tag::functionalQuery[] + def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.map{rp => rp.attributes.filter(_ > 0).sum} + } + //end::functionalQuery[] + + //tag::maxPandaSizePerZip[] + def maxPandaSizePerZip(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { + ds.map(rp => MiniPandaInfo(rp.zip, rp.attributes(2))) + .groupByKey(mp => mp.zip).agg(max("size").as[Double]) + } + //end::maxPandaSizePerZip[] + + //tag::maxPandaSizePerZipScala[] + def maxPandaSizePerZipScala(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { + ds.groupByKey(rp => rp.zip).mapGroups{ case (g, iter) => + (g, iter.map(_.attributes(2)).reduceLeft(Math.max(_, _))) + } + } + //end::maxPandaSizePerZipScala[] + + /** + * Illustrate how we make typed queries, using some of the float properties + * to produce boolean values. 
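+ * (This version simply selects the first attribute as a typed Double column;
+ * squishyPandas above is the variant that actually produces a Boolean.)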
+ */ + def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.select($"attributes"(0).as[Double]) + } + + /** + * Illustrate Dataset joins + */ + def joinSample(pandas: Dataset[RawPanda], coffeeShops: Dataset[CoffeeShop]): + Dataset[(RawPanda, CoffeeShop)] = { + //tag::joinWith[] + val result: Dataset[(RawPanda, CoffeeShop)] = pandas.joinWith(coffeeShops, + $"zip" === $"zip") + //end::joinWith[] + result + } + + /** + * Illustrate a self join to compare pandas in the same zip code + */ + def selfJoin(pandas: Dataset[RawPanda]): + Dataset[(RawPanda, RawPanda)] = { + //tag::selfJoin[] + val result: Dataset[(RawPanda, RawPanda)] = pandas.joinWith(pandas, + $"zip" === $"zip") + //end::selfJoin[] + result + } + + //tag::fromRDD[] + /** + * Illustrate converting an RDD to DS + */ + def fromRDD(rdd: RDD[RawPanda]): Dataset[RawPanda] = { + rdd.toDS + } + + //end::fromRDD[] + + //tag::toRDDDF[] + /** + * Illustrate converting a Dataset to an RDD + */ + def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { + ds.rdd + } + + /** + * Illustrate converting a Dataset to a DataFrame + */ + def toDF(ds: Dataset[RawPanda]): DataFrame = { + ds.toDF() + } + //end::toRDDDF[] + + /** + * Illustrate DataFrame to Dataset. Its important to note that if the schema + * does not match what is expected by the Dataset this fails fast. + */ + //tag::DataFrameAsDataset[] + def fromDF(df: DataFrame): Dataset[RawPanda] = { + df.as[RawPanda] + } + //end::DataFrameAsDataset[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back new file mode 100644 index 0000000..cdae7c1 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back @@ -0,0 +1,67 @@ +/** + * A sample mixing relational & functional transformations with Datasets. + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +// Additional imports for using HiveContext +import org.apache.spark.sql.hive._ +import org.apache.spark.sql.hive.thriftserver._ + +class MixedDataset(sqlCtx: SQLContext) { + import sqlCtx.implicits._ + + /** + * A sample function on a Dataset of RawPandas. + * This is contrived, since our reduction could also be done with SQL aggregates, but + * we can see the flexibility of being able to specify arbitrary Scala code. + */ + def happyPandaSums(ds: Dataset[RawPanda]): Double = { + ds.toDF().filter($"happy" === true).as[RawPanda]. + select($"attributes"(0).as[Double]). + reduce((x, y) => x + y) + } + + /** + * Functional map + Dataset, sums the positive attributes for the pandas + */ + def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.map{rp => rp.attributes.filter(_ > 0).sum} + } + + /** + * Illustrate how we make typed queries, using some of the float properties to produce boolean + * values. 
+ */ + def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { + ds.select($"attributes"(0).as[Double]) + } + + /** + * Illustrate converting a Dataset to an RDD + */ + def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { + ds.rdd + } + + /** + * Illustrate converting a Dataset to a DataFrame + */ + def toDF(ds: Dataset[RawPanda]): DataFrame = { + ds.toDF() + } + + /** + * Illustrate DataFrame to Dataset. Its important to note that if the schema does not match what + * is expected by the Dataset this fails fast. + */ + def fromDF(df: DataFrame): Dataset[RawPanda] = { + df.as[RawPanda] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala new file mode 100644 index 0000000..b1d64dc --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala @@ -0,0 +1,32 @@ +package com.highperformancespark.examples.dataframe + +import java.util.Arrays +import java.util.Objects + +/** + * @param id panda id + * @param zip zip code of panda residence + * @param pt Type of panda as a string + * @param happy if panda is happy + * @param attributes array of panada attributes + */ +case class RawPanda(id: Long, zip: String, pt: String, + happy: Boolean, attributes: Array[Double]) { + override def equals(o: Any) = o match { + case other: RawPanda => (id == other.id && pt == other.pt && + happy == other.happy && attributes.deep == other.attributes.deep) + case _ => false + } + override def hashCode(): Int = { + 3 * Objects.hashCode(id) + 7 * Objects.hashCode(zip) + + 11 * Objects.hashCode(pt) + 13 * Arrays.hashCode(attributes) + } +} + +/** + * @param name place name + * @param pandas pandas in that place + */ +case class PandaPlace(name: String, pandas: Array[RawPanda]) + +case class CoffeeShop(zip: String, name: String) diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala new file mode 100644 index 0000000..a348c30 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala @@ -0,0 +1,29 @@ +/** + * Using plain-old-sql + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql._ + +case class RegularSQL(sqlContext: SQLContext) { + + //tag::queryTable[] + def querySQL(): DataFrame = { + sqlContext.sql("SELECT * FROM pandas WHERE size > 0") + } + //end::queryTable[] + + // TODO: Holden: include a parquet example file and point this to that. 
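+  // A minimal usage sketch (not part of the original example): querySQL above
+  // assumes a table named "pandas" has been registered first, e.g. via
+  // registerTable below, and that the registered DataFrame has a numeric
+  // `size` column.
+  def querySQLExample(df: DataFrame): DataFrame = {
+    registerTable(df) // registers df as the temp table "pandas" (and saves a permanent copy)
+    querySQL()        // now "SELECT * FROM pandas WHERE size > 0" can resolve
+  }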
+ //tag::queryRawFile[] + def queryRawFile(): DataFrame = { + sqlContext.sql("SELECT * FROM parquet.`path_to_parquet_file`") + } + //end::queryRawFile[] + + //tag::registerTable[] + def registerTable(df: DataFrame): Unit = { + df.registerTempTable("pandas") + df.write.saveAsTable("perm_pandas") + } + //end::registerTable[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala b/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala new file mode 100644 index 0000000..56d4beb --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala @@ -0,0 +1,58 @@ +/** + * Example UDFs + */ +package com.highperformancespark.examples.dataframe + +import org.apache.spark.sql._ +import org.apache.spark.sql.expressions._ +import org.apache.spark.sql.types._ + +object UDFs { + //tag::setupUDFs[] + def setupUDFs(sqlCtx: SQLContext) = { + sqlCtx.udf.register("strLen", (s: String) => s.length()) + } + //end::setupUDFs[] + + //tag::setupUDAFs[] + def setupUDAFs(sqlCtx: SQLContext) = { + class Avg extends UserDefinedAggregateFunction { + // Input type + def inputSchema: org.apache.spark.sql.types.StructType = + StructType(StructField("value", DoubleType) :: Nil) + + def bufferSchema: StructType = StructType( + StructField("count", LongType) :: + StructField("sum", DoubleType) :: Nil + ) + + // Return type + def dataType: DataType = DoubleType + + def deterministic: Boolean = true + + def initialize(buffer: MutableAggregationBuffer): Unit = { + buffer(0) = 0L + buffer(1) = 0.0 + } + + def update(buffer: MutableAggregationBuffer,input: Row): Unit = { + buffer(0) = buffer.getAs[Long](0) + 1 + buffer(1) = buffer.getAs[Double](1) + input.getAs[Double](0) + } + + def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { + buffer1(0) = buffer1.getAs[Long](0) + buffer2.getAs[Long](0) + buffer1(1) = buffer1.getAs[Double](1) + buffer2.getAs[Double](1) + } + + def evaluate(buffer: Row): Any = { + buffer.getDouble(1) / buffer.getLong(0) + } + } + // Optionally register + val avg = new Avg + sqlCtx.udf.register("ourAvg", avg) + } + //end::setupUDAFs[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/errors/throws.scala b/src/main/scala/com/high-performance-spark-examples/errors/throws.scala new file mode 100644 index 0000000..cf695b1 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/errors/throws.scala @@ -0,0 +1,63 @@ +package com.highperformancespark.examples.errors + +import org.apache.spark._ +import org.apache.spark.rdd.RDD + +object Throws { + def throwInner(sc: SparkContext) = { + //tag::throwInner1[] + val data = sc.parallelize(List(1, 2, 3)) + // Will throw an exception when forced to evaluate + val transform1 = data.map(x => x/0) + val transform2 = transform1.map(x => x + 1) + transform2.collect() // Forces evaluation + //end::throwInner1[] + } + + def throwOuter(sc: SparkContext) = { + //tag::throwOuter1[] + val data = sc.parallelize(List(1, 2, 3)) + val transform1 = data.map(x => x + 1) + // Will throw an exception when forced to evaluate + val transform2 = transform1.map(x => x/0) + transform2.collect() // Forces evaluation + //end::throwOuter1[] + } + + //tag::badFunctions[] + def add1(x: Int): Int = { + x + 1 + } + + def divZero(x: Int): Int = { + x / 0 + } + //end::badFunctions[] + + //tag::badEx3[] + def throwInner2(sc: SparkContext) = { + val data = sc.parallelize(List(1, 2, 3)) + // Will throw an exception when forced to evaluate + val transform1 = data.map(divZero) + val 
transform2 = transform1.map(add1) + transform2.collect() // Forces evaluation + } + + def throwOuter2(sc: SparkContext) = { + val data = sc.parallelize(List(1, 2, 3)) + val transform1 = data.map(add1) + // Will throw an exception when forced to evaluate + val transform2 = transform1.map(divZero) + transform2.collect() // Forces evaluation + } + //end::badEx3 + + def nonExistentInput(sc: SparkContext) = { + //tag::nonExistentInput[] + val input = sc.textFile("file:///doesnotexist.txt") + val data = input.map(x => x.toInt) + val transform = data.map(x => x + 1) + transform.collect() // Forces evaluation + //end::nonExistentInput[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala new file mode 100644 index 0000000..9f8ec9d --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala @@ -0,0 +1,322 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.storage.StorageLevel + +import scala.collection.mutable.MutableList +import scala.collection.{Map, mutable} + +object GoldilocksGroupByKey { + //tag::groupByKey[] + def findRankStatistics( + dataFrame: DataFrame, + ranks: List[Long]): Map[Int, Iterable[Double]] = { + require(ranks.forall(_ > 0)) + //Map to column index, value pairs + val pairRDD: RDD[(Int, Double)] = mapToKeyValuePairs(dataFrame) + + val groupColumns: RDD[(Int, Iterable[Double])] = pairRDD.groupByKey() + groupColumns.mapValues( + iter => { + //convert to an array and sort + val sortedIter = iter.toArray.sorted + + sortedIter.toIterable.zipWithIndex.flatMap({ + case (colValue, index) => + if (ranks.contains(index + 1)) { + Iterator(colValue) + } else { + Iterator.empty + } + }) + }).collectAsMap() + } + + def findRankStatistics( + pairRDD: RDD[(Int, Double)], + ranks: List[Long]): Map[Int, Iterable[Double]] = { + assert(ranks.forall(_ > 0)) + pairRDD.groupByKey().mapValues(iter => { + val sortedIter = iter.toArray.sorted + sortedIter.zipWithIndex.flatMap( + { + case (colValue, index) => + if (ranks.contains(index + 1)) { + //this is one of the desired rank statistics + Iterator(colValue) + } else { + Iterator.empty + } + } + ).toIterable //convert to more generic iterable type to match out spec + }).collectAsMap() + } + //end::groupByKey[] + + + //tag::toKeyValPairs[] + def mapToKeyValuePairs(dataFrame: DataFrame): RDD[(Int, Double)] = { + val rowLength = dataFrame.schema.length + dataFrame.rdd.flatMap( + row => Range(0, rowLength).map(i => (i, row.getDouble(i))) + ) + } + //end::toKeyValPairs[] +} + + +object GoldilocksWhileLoop{ + + //tag::rankstatsLoop[] + def findRankStatistics( + dataFrame: DataFrame, + ranks: List[Long]): Map[Int, Iterable[Double]] = { + require(ranks.forall(_ > 0)) + val numberOfColumns = dataFrame.schema.length + var i = 0 + var result = Map[Int, Iterable[Double]]() + + while(i < numberOfColumns){ + val col = dataFrame.rdd.map(row => row.getDouble(i)) + val sortedCol : RDD[(Double, Long)] = col.sortBy(v => v).zipWithIndex() + val ranksOnly = sortedCol.filter{ + //rank statistics are indexed from one. e.g. 
first element is 0 + case (colValue, index) => ranks.contains(index + 1) + }.keys + val list = ranksOnly.collect() + result += (i -> list) + i+=1 + } + result + } + //end::rankstatsLoop[] +} + + +object GoldilocksFirstTry { + + /** + * Find nth target rank for every column. + * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * @param dataFrame dataframe of doubles + * @param targetRanks the required ranks for every column + * + * @return map of (column index, list of target ranks) + */ + //tag::firstTry[] + def findRankStatistics(dataFrame: DataFrame, targetRanks: List[Long]): + Map[Int, Iterable[Double]] = { + + val valueColumnPairs: RDD[(Double, Int)] = getValueColumnPairs(dataFrame) + val sortedValueColumnPairs = valueColumnPairs.sortByKey() + sortedValueColumnPairs.persist(StorageLevel.MEMORY_AND_DISK) + + val numOfColumns = dataFrame.schema.length + val partitionColumnsFreq = + getColumnsFreqPerPartition(sortedValueColumnPairs, numOfColumns) + val ranksLocations = getRanksLocationsWithinEachPart( + targetRanks, partitionColumnsFreq, numOfColumns) + + val targetRanksValues = findTargetRanksIteratively( + sortedValueColumnPairs, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } + //end::firstTry[] + + /** + * Step 1. Map the rows to pairs of (value, column Index). + * + * For example: + * + * dataFrame: + * 1.5, 1.25, 2.0 + * 5.25, 2.5, 1.5 + * + * The output RDD will be: + * (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) (2.5, 1) (1.5, 2) + * + * @param dataFrame dateframe of doubles + * + * @return RDD of pairs (value, column Index) + */ + //tag::firstTry_Step1[] + private def getValueColumnPairs(dataFrame : DataFrame): RDD[(Double, Int)] = { + dataFrame.rdd.flatMap{ + row: Row => row.toSeq.zipWithIndex + .map{ + case (v, index) => (v.toString.toDouble, index)} + } + } + //end::firstTry_Step1[] + + /** + * Step 2. Find the number of elements for each column in each partition. 
+ * + * For Example: + * + * sortedValueColumnPairs: + * Partition 1: (1.5, 0) (1.25, 1) (2.0, 2) (5.25, 0) + * Partition 2: (7.5, 1) (9.5, 2) + * + * numOfColumns: 3 + * + * The output will be: + * [(0, [2, 1, 1]), (1, [0, 1, 1])] + * + * @param sortedValueColumnPairs - sorted RDD of (value, column Index) pairs + * @param numOfColumns the number of columns + * + * @return Array that contains + * (partition index, + * number of elements from every column on this partition) + */ + //tag::firstTry_Step2[] + private def getColumnsFreqPerPartition(sortedValueColumnPairs: RDD[(Double, Int)], + numOfColumns : Int): + Array[(Int, Array[Long])] = { + + val zero = Array.fill[Long](numOfColumns)(0) + + def aggregateColumnFrequencies (partitionIndex : Int, + valueColumnPairs : Iterator[(Double, Int)]) = { + val columnsFreq : Array[Long] = valueColumnPairs.aggregate(zero)( + (a : Array[Long], v : (Double, Int)) => { + val (value, colIndex) = v + //increment the cell in the zero array corresponding to this column index + a(colIndex) = a(colIndex) + 1L + a + }, + (a : Array[Long], b : Array[Long]) => { + a.zip(b).map{ case(aVal, bVal) => aVal + bVal} + }) + + Iterator((partitionIndex, columnsFreq)) + } + + sortedValueColumnPairs.mapPartitionsWithIndex( + aggregateColumnFrequencies).collect() + } + //end::firstTry_Step2[] + + /** + * Step 3: For each Partition determine the index of the elements that are + * desired rank statistics. + * + * This is done locally by the driver. + * + * For Example: + * + * targetRanks: 5 + * partitionColumnsFreq: [(0, [2, 3]), (1, [4, 1]), (2, [5, 2])] + * numOfColumns: 2 + * + * The output will be: + * + * [(0, []), (1, [(0, 3)]), (2, [(1, 1)])] + * + * @param partitionColumnsFreq Array of + * (partition index, + * columns frequencies per this partition) + * + * @return Array that contains + * (partition index, relevantIndexList) + * where relevantIndexList(i) = the index + * of an element on this partition that matches one of the target ranks. + */ + //tag::firstTry_Step3[] + private def getRanksLocationsWithinEachPart(targetRanks : List[Long], + partitionColumnsFreq : Array[(Int, Array[Long])], + numOfColumns : Int) : Array[(Int, List[(Int, Long)])] = { + + val runningTotal = Array.fill[Long](numOfColumns)(0) + // The partition indices are not necessarily in sorted order, so we need + // to sort the partitionsColumnsFreq array by the partition index (the + // first value in the tuple). + partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq) => + val relevantIndexList = new MutableList[(Int, Long)]() + + columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => + val runningTotalCol = runningTotal(colIndex) + val ranksHere: List[Long] = targetRanks.filter(rank => + runningTotalCol < rank && runningTotalCol + colCount >= rank) + + // For each of the rank statistics present add this column index and the + // index it will be at on this partition (the rank - the running total). + relevantIndexList ++= ranksHere.map( + rank => (colIndex, rank - runningTotalCol)) + + runningTotal(colIndex) += colCount + } + + (partitionIndex, relevantIndexList.toList) + } + } + //end::firstTry_Step3[] + + /** + * Step 4: Finds rank statistics elements using ranksLocations. 
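+ *
+ * For example (reusing the outputs sketched in the earlier steps), given
+ * ranksLocations = [(0, []), (1, [(0, 3)]), (2, [(1, 1)])], partition 1 emits
+ * the 3rd value it sees for column 0, partition 2 emits the 1st value it sees
+ * for column 1, and partition 0 emits nothing.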
+ * + * @param sortedValueColumnPairs - sorted RDD of (value, colIndex) pairs + * @param ranksLocations Array of (partition Index, list of (column index, + * rank index of this column at this partition)) + * + * @return returns RDD of the target ranks (column index, value) + */ + //tag::firstTry_Step4[] + private def findTargetRanksIteratively( + sortedValueColumnPairs : RDD[(Double, Int)], + ranksLocations : Array[(Int, List[(Int, Long)])]): + RDD[(Int, Double)] = { + + sortedValueColumnPairs.mapPartitionsWithIndex( + (partitionIndex : Int, valueColumnPairs : Iterator[(Double, Int)]) => { + val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 + if (targetsInThisPart.nonEmpty) { + val columnsRelativeIndex: Map[Int, List[Long]] = + targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsInThisPart = targetsInThisPart.map(_._1).distinct + + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map( + columnIndex => (columnIndex, 0L)).toMap + + //filter this iterator, so that it contains only those (value, columnIndex) + //that are the ranks statistics on this partition + //Keep track of the number of elements we have seen for each columnIndex using the + //running total hashMap. + valueColumnPairs.filter{ + case(value, colIndex) => + lazy val thisPairIsTheRankStatistic: Boolean = { + val total = runningTotals(colIndex) + 1L + runningTotals.update(colIndex, total) + columnsRelativeIndex(colIndex).contains(total) + } + (runningTotals contains colIndex) && thisPairIsTheRankStatistic + }.map(_.swap) + } + else { + Iterator.empty + } + }) + } + //end::firstTry_Step4[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala new file mode 100644 index 0000000..92cb44f --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksSecondarySort.scala @@ -0,0 +1,178 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark.Partitioner +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ + +import scala.collection.Map +import scala.collection.mutable.ArrayBuffer + +//tag::colIndex_partition[] +class ColumnIndexPartition(override val numPartitions: Int) + extends Partitioner { + require(numPartitions >= 0, s"Number of partitions " + + s"($numPartitions) cannot be negative.") + + override def getPartition(key: Any): Int = { + val k = key.asInstanceOf[(Int, Double)] + Math.abs(k._1) % numPartitions //hashcode of column index + } +} +//end::colIndex_partition[] + +object GoldilocksSecondarySort { + /** + * Find nth target rank for every column. + * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * This process is executed as follows + * + * 0. Map to ((columnIndex, cellValue), 1) triples. + * 1. Define a custom partitioner which partitions according to the + * first half of the key. + * + * (column Index) + * 1. uses repartitionAndSortWithinPartitions with the custom partitioner. + * This will partition according to column index and then sort by column + * index and value. + * 2. mapPartitions on each partition which is sorted. 
Filter for correct rank + * stats in one pass. + * 3. Locally: group result so that each key has an iterator of elements. + * + * @param dataFrame - dataFrame of values + * @param targetRanks the rank statistics to find for every column. + * @return map of (column index, list of target ranks) + */ + //tag::goldilocksSecondarySort[] + def findRankStatistics(dataFrame: DataFrame, + targetRanks: List[Long], partitions: Int) = { + + val pairRDD: RDD[((Int, Double), Int)] = + GoldilocksGroupByKey.mapToKeyValuePairs(dataFrame).map((_, 1)) + + val partitioner = new ColumnIndexPartition(partitions) + //sort by the existing implicit ordering on tuples first key, second key + val sorted = pairRDD.repartitionAndSortWithinPartitions(partitioner) + + //filter for target ranks + val filterForTargetIndex: RDD[(Int, Double)] = + sorted.mapPartitions(iter => { + var currentColumnIndex = -1 + var runningTotal = 0 + iter.filter({ + case (((colIndex, value), _)) => + if (colIndex != currentColumnIndex) { + currentColumnIndex = colIndex //reset to the new column index + runningTotal = 1 + } else { + runningTotal += 1 + } + //if the running total corresponds to one of the rank statistics. + //keep this ((colIndex, value)) pair. + targetRanks.contains(runningTotal) + }) + }.map(_._1), preservesPartitioning = true) + groupSorted(filterForTargetIndex.collect()) + } + //end::goldilocksSecondarySort[] + + /** + * Given an array of (columnIndex, value) pairs that are already sorted. + * Groups the pairs with the same column index, creating an iterator of values. + */ + //tag::groupSortedGoldilocks[] + private def groupSorted( + it: Array[(Int, Double)]): Map[Int, Iterable[Double]] = { + val res = List[(Int, ArrayBuffer[Double])]() + it.foldLeft(res)((list, next) => list match { + case Nil => + val (firstKey, value) = next + List((firstKey, ArrayBuffer(value))) + case head :: rest => + val (curKey, valueBuf) = head + val (firstKey, value) = next + if (!firstKey.equals(curKey)) { + (firstKey, ArrayBuffer(value)) :: list + } else { + valueBuf.append(value) + list + } + }).map { case (key, buf) => (key, buf.toIterable) }.toMap + } + //end::groupSortedGoldilocks[] +} + +object GoldilocksSecondarySortV2{ + + def findRankStatistics(dataFrame: DataFrame, + ranks: List[Long], partitions : Int = 2) : Map[Int, Iterable[Double]] = { + val pairRDD = GoldilocksGroupByKey.mapToKeyValuePairs(dataFrame) + val partitioner = new ColumnIndexPartition(partitions) + val sorted = pairRDD.map((_, 1)).repartitionAndSortWithinPartitions(partitioner) + val filterForTargetIndex= sorted.keys.mapPartitions(iter => { + filterAndGroupRanks(iter, ranks) + }, true) + filterForTargetIndex.collectAsMap() + } + + /** + * Precondintion: Iterator must be sorted by (columnIndex, value). Groups by + * column index and filters the values so that only those that correspond to + * the desired rank statistics are included. 
+ */ + def filterAndGroupRanks(it: Iterator[(Int, Double)], targetRanks : List[Long]): + Iterator[(Int, Iterable[Double])] = { + val res = List[(Int, Long, ArrayBuffer[Double])]() + it.foldLeft(res)((list, next) => list match { + case Nil => + val (firstKey, value) = next + val runningTotal = 1L + val ranksSoFar: ArrayBuffer[Double] = + if(targetRanks.contains(runningTotal)) { + ArrayBuffer(value) + } else { + ArrayBuffer[Double]() + } + List((firstKey, runningTotal, ranksSoFar)) + + case head :: rest => + val (curKey, runningTotal, valueBuf) = head + val (firstKey, value) = next + + if (!firstKey.equals(curKey) ) { + val resetRunningTotal = 1L + val nextBuf = if(targetRanks.contains(resetRunningTotal)) { + ArrayBuffer[Double](value) + } else { + ArrayBuffer[Double]() + } + (firstKey, resetRunningTotal, nextBuf) :: list + } else { + val newRunningTotal = runningTotal + 1 + if(targetRanks.contains(newRunningTotal)){ + valueBuf.append(value) + } + (curKey, newRunningTotal, valueBuf) :: rest + } + + }).map { case (key, total, buf) => (key, buf.toIterable) }.iterator + } + +} diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala new file mode 100644 index 0000000..2b3adc1 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksWithHashMap.scala @@ -0,0 +1,384 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame +import org.apache.spark.storage.StorageLevel + +import scala.collection.mutable.ArrayBuffer +import scala.collection.{Map, mutable} + + +object GoldilocksWithHashMap { + + /** + * Find nth target rank for every column. + * + * For example: + * + * dataframe: + * (0.0, 4.5, 7.7, 5.0) + * (1.0, 5.5, 6.7, 6.0) + * (2.0, 5.5, 1.5, 7.0) + * (3.0, 5.5, 0.5, 7.0) + * (4.0, 5.5, 0.5, 8.0) + * + * targetRanks: + * 1, 3 + * + * The output will be: + * 0 -> (0.0, 2.0) + * 1 -> (4.5, 5.5) + * 2 -> (7.7, 1.5) + * 3 -> (5.0, 7.0) + * + * @param dataFrame dataframe of doubles + * @param targetRanks the required ranks for every column + * + * @return map of (column index, list of target ranks) + */ + //tag::hashMap[] + def findRankStatistics(dataFrame: DataFrame, targetRanks: List[Long]): + Map[Int, Iterable[Double]] = { + + val aggregatedValueColumnPairs: RDD[((Double, Int), Long)] = + getAggregatedValueColumnPairs(dataFrame) + val sortedAggregatedValueColumnPairs = aggregatedValueColumnPairs.sortByKey() + sortedAggregatedValueColumnPairs.persist(StorageLevel.MEMORY_AND_DISK) + + val numOfColumns = dataFrame.schema.length + val partitionColumnsFreq = + getColumnsFreqPerPartition(sortedAggregatedValueColumnPairs, numOfColumns) + val ranksLocations = + getRanksLocationsWithinEachPart(targetRanks, + partitionColumnsFreq, numOfColumns) + + val targetRanksValues = + findTargetRanksIteratively(sortedAggregatedValueColumnPairs, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } + //end::hashMap[] + + /** + * Step 1. Map the rows to pairs of ((value, colIndex), count) where count is the + * number of times that value and that pair appear on this partition. 
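+   * Aggregating duplicate values up front means each distinct (value, column)
+   * pair is shuffled and sorted only once, which helps when columns contain
+   * many repeated values.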
+ * + * For example: + * + * dataFrame: + * 1.5, 1.25, 2.0 + * 1.5, 2.5, 2.0 + * + * The output RDD will be: + * ((1.5, 0), 2) ((1.25, 1), 1) ((2.5, 1), 1) ((2.0, 2), 2) + * + * @param dataFrame of double columns to compute the rank statistics for + * + * @return returns RDD of ((value, column index), count) + */ + //tag::hashMap_step1[] + def getAggregatedValueColumnPairs(dataFrame: DataFrame): + RDD[((Double, Int), Long)] = { + + val aggregatedValueColumnRDD = dataFrame.rdd.mapPartitions(rows => { + val valueColumnMap = new mutable.HashMap[(Double, Int), Long]() + rows.foreach(row => { + row.toSeq.zipWithIndex.foreach{ case (value, columnIndex) => + val key = (value.toString.toDouble, columnIndex) + val count = valueColumnMap.getOrElseUpdate(key, 0) + valueColumnMap.update(key, count + 1) + } + }) + + valueColumnMap.toIterator + }) + + aggregatedValueColumnRDD + } + //end::hashMap_step1[] + + /** + * Step 2. Find the number of elements for each column in each partition. + * + * For Example: + * + * sortedValueColumnPairs: + * Partition 1: ((1.5, 0), 2) ((2.0, 0), 1) + * Partition 2: ((4.0, 0), 3) ((3.0, 1), 1) + * + * numOfColumns: 3 + * + * The output will be: + * [(0, [3, 0]), (1, [3, 1])] + * + * @param sortedAggregatedValueColumnPairs sortedAggregatedValueColumnPairs RDD of + * ((value, column index), count) + * @param numOfColumns the number of columns + * + * @return Array that contains + * (partition index, + * number of elements from every column on this partition) + */ + //tag::hashMap_step2[] + private def getColumnsFreqPerPartition( + sortedAggregatedValueColumnPairs: RDD[((Double, Int), Long)], + numOfColumns : Int): Array[(Int, Array[Long])] = { + + val zero = Array.fill[Long](numOfColumns)(0) + + def aggregateColumnFrequencies( + partitionIndex : Int, pairs : Iterator[((Double, Int), Long)]) = { + val columnsFreq : Array[Long] = pairs.aggregate(zero)( + (a : Array[Long], v : ((Double,Int), Long)) => { + val ((value, colIndex), count) = v + a(colIndex) = a(colIndex) + count + a}, + (a : Array[Long], b : Array[Long]) => { + a.zip(b).map{ case(aVal, bVal) => aVal + bVal} + }) + + Iterator((partitionIndex, columnsFreq)) + } + + sortedAggregatedValueColumnPairs.mapPartitionsWithIndex( + aggregateColumnFrequencies).collect() + } + //end::hashMap_step2[] + + /** + * Step 3: For each Partition determine the index of the elements + * that are desired rank statistics + * + * For Example: + * targetRanks: 5 + * partitionColumnsFreq: [(0, [2, 3]), (1, [4, 1]), (2, [5, 2])] + * numOfColumns: 2 + * + * The output will be: + * [(0, []), (1, [(0, 3)]), (2, [(1, 1)])] + * + * @param partitionColumnsFreq Array of + * (partition index, + * columns frequencies per this partition) + * + * @return Array that contains + * (partition index, relevantIndexList) + * Where relevantIndexList(i) = the index + * of an element on this partition that matches one of the target ranks) + */ + //tag::hashMap_step3[] + private def getRanksLocationsWithinEachPart(targetRanks : List[Long], + partitionColumnsFreq : Array[(Int, Array[Long])], + numOfColumns : Int) : Array[(Int, List[(Int, Long)])] = { + + val runningTotal = Array.fill[Long](numOfColumns)(0) + + partitionColumnsFreq.sortBy(_._1).map { case (partitionIndex, columnsFreq)=> + val relevantIndexList = new mutable.MutableList[(Int, Long)]() + + columnsFreq.zipWithIndex.foreach{ case (colCount, colIndex) => + val runningTotalCol = runningTotal(colIndex) + + val ranksHere: List[Long] = targetRanks.filter(rank => + runningTotalCol < rank && runningTotalCol + 
colCount >= rank) + relevantIndexList ++= ranksHere.map( + rank => (colIndex, rank - runningTotalCol)) + + runningTotal(colIndex) += colCount + } + + (partitionIndex, relevantIndexList.toList) + } + } + //end::hashMap_step3[] + + /** + * Finds rank statistics elements using ranksLocations. + * + * @param sortedAggregatedValueColumnPairs - sorted RDD of (value, colIndex) pairs + * @param ranksLocations Array of (partition Index, list of + * (column index, + * rank index of this column at this partition)) + * + * @return returns RDD of the target ranks (column index, value) + */ + //tag::mapPartitionsExample[] + private def findTargetRanksIteratively( + sortedAggregatedValueColumnPairs : RDD[((Double, Int), Long)], + ranksLocations : Array[(Int, List[(Int, Long)])]): RDD[(Int, Double)] = { + + sortedAggregatedValueColumnPairs.mapPartitionsWithIndex((partitionIndex : Int, + aggregatedValueColumnPairs : Iterator[((Double, Int), Long)]) => { + + val targetsInThisPart: List[(Int, Long)] = ranksLocations(partitionIndex)._2 + if (targetsInThisPart.nonEmpty) { + FindTargetsSubRoutine.asIteratorToIteratorTransformation( + aggregatedValueColumnPairs, + targetsInThisPart) + } else { + Iterator.empty + } + }) + } + //end::mapPartitionsExample[] + /** + * + * Find nth target rank for every column. + * Given an RDD of + * (value, columnindex) countPairs) + * @param valPairs - pairs with ((cell value, columnIndex), frequency). + * I.e. if in the 2nd column there are four instance of the + * value 0.5. One of these pairs would be ((0.5, 3), 4) + * + * @param colIndexList a list of the indices of the parameters to find rank + * statistics for + * @param targetRanks the desired rank statistics + * If we used List(25, 50, 75) we would be finding the 25th, + * 50th and 75th element in each column specified by colIndexList + * @param storageLevel The storage level to persist between sort and map partitions + * @param checkPoint true if we should checkpoint, false otherwise. + * @param directory- the directory to checkpoint in (must be a location on Hdfs) + * @return (ColumnIndex, Iterator of ordered rank statistics)) + */ + //tag::checkpointExample[] + def findQuantilesWithCustomStorage(valPairs: RDD[((Double, Int), Long)], + colIndexList: List[Int], + targetRanks: List[Long], + storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, + checkPoint : Boolean, directory : String = ""): Map[Int, Iterable[Double]] = { + + val n = colIndexList.last + 1 + val sorted = valPairs.sortByKey() + if (storageLevel != StorageLevel.NONE) { + sorted.persist(storageLevel) + } + + if (checkPoint) { + sorted.sparkContext.setCheckpointDir(directory) + sorted.checkpoint() + } + + val partitionColumnsFreq = getColumnsFreqPerPartition(sorted, n) + val ranksLocations = getRanksLocationsWithinEachPart( + targetRanks, partitionColumnsFreq, n) + val targetRanksValues = findTargetRanksIteratively(sorted, ranksLocations) + targetRanksValues.groupByKey().collectAsMap() + } + //end::checkpointExample[] +} + + + +object FindTargetsSubRoutine extends Serializable { + + + /** + * This sub routine returns an Iterator of (columnIndex, value) that correspond + * to one of the desired rank statistics on this partition. + * + * Because in the original iterator, the pairs are distinct + * and include the count, one row of the original iterator could map to multiple + * elements in the output. + * + * i.e. if we were looking for the 2nd and 3rd element in column index 4 on + * this partition. 
And the head of this partition is + * ((3249.0, 4), 23) + * (i.e. the element 3249.0 in the 4 th column appears 23 times), + * then we would output (4, 3249.0) twice in the final iterator. + * Once because 3249.0 is the 2nd element and once because it is the third + * element on that partition for that column index and we are looking for both the + * second and third element. + * + * @param valueColumnPairsIter passed in from the mapPartitions function. + * An iterator of the sorted: + * ((value, columnIndex), count) tupples. + * @param targetsInThisPart - (columnIndex, index-on-partition pairs). In the above + * example this would include (4, 2) and (4,3) since we + * desire the 2nd element for column index 4 on this + * partition and the 3rd element. + * @return All of the rank statistics that live in this partition as an iterator + * of (columnIndex, value pairs) + */ + //tag::notIter[] + def withArrayBuffer(valueColumnPairsIter : Iterator[((Double, Int), Long)], + targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { + + val columnsRelativeIndex: Predef.Map[Int, List[Long]] = + targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + + // The column indices of the pairs that are desired rank statistics that live in + // this partition. + val columnsInThisPart: List[Int] = targetsInThisPart.map(_._1).distinct + + // A HashMap with the running totals of each column index. As we loop through + // the iterator. We will update the hashmap as we see elements of each + // column index. + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map(columnIndex => (columnIndex, 0L)).toMap + + //we use an array buffer to build the resulting iterator + val result: ArrayBuffer[(Int, Double)] = + new scala.collection.mutable.ArrayBuffer() + + valueColumnPairsIter.foreach { + case ((value, colIndex), count) => + + if (columnsInThisPart contains colIndex) { + + val total = runningTotals(colIndex) + //the ranks that are contains by this element of the input iterator. + //get by filtering the + val ranksPresent = columnsRelativeIndex(colIndex) + .filter(index => (index <= count + total) && (index > total)) + ranksPresent.foreach(r => result += ((colIndex, value))) + //update the running totals. + runningTotals.update(colIndex, total + count) + } + } + //convert + result.toIterator + } + //end::notIter[] + + + /** + * Same function as above but rather than building the result from an array buffer + * we use a flatMap on the iterator to get the resulting iterator. + */ + //tag::iterToIter[] + def asIteratorToIteratorTransformation( + valueColumnPairsIter : Iterator[((Double, Int), Long)], + targetsInThisPart: List[(Int, Long)] ): Iterator[(Int, Double)] = { + + val columnsRelativeIndex = targetsInThisPart.groupBy(_._1).mapValues(_.map(_._2)) + val columnsInThisPart = targetsInThisPart.map(_._1).distinct + + val runningTotals : mutable.HashMap[Int, Long]= new mutable.HashMap() + runningTotals ++= columnsInThisPart.map(columnIndex => (columnIndex, 0L)).toMap + + //filter out the pairs that don't have a column index that is in this part + val pairsWithRanksInThisPart = valueColumnPairsIter.filter{ + case (((value, colIndex), count)) => + columnsInThisPart contains colIndex + } + + // map the valueColumn pairs to a list of (colIndex, value) pairs that correspond + // to one of the desired rank statistics on this partition. 
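+    // Because each input pair carries a count, a single pair may expand to
+    // several output elements, e.g. ((3249.0, 4), 23) emits (4, 3249.0) once
+    // for every target rank that falls within those 23 occurrences.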
+ pairsWithRanksInThisPart.flatMap{ + + case (((value, colIndex), count)) => + + val total = runningTotals(colIndex) + val ranksPresent: List[Long] = columnsRelativeIndex(colIndex) + .filter(index => (index <= count + total) + && (index > total)) + + val nextElems: Iterator[(Int, Double)] = + ranksPresent.map(r => (colIndex, value)).toIterator + + //update the running totals + runningTotals.update(colIndex, total + count) + nextElems + } + } + //end::iterToIter[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala new file mode 100644 index 0000000..a60a39f --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala @@ -0,0 +1,123 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark.HashPartitioner +import org.apache.spark.rdd.RDD + +import scala.collection.Map +import scala.reflect.ClassTag + +object RDDJoinExamples { + + /* For Example, suppose we have one RDD with some data in the form (Panda id, score) + and another RDD with (Panda id, address), and we want to send each Panda some mail + with her best score. We could join the RDDs on ID and then compute the best score + for each address. Like this: + + 'ToDo: Insert Example' + + However, this is slower than first reducing the score data, so that the + //first dataset contains only one row for each Panda with her best score and then + //joining that data with the address data. + + 'ToDO: Insert an example of this' */ + //tag::joinScoresWithAddress[] + def joinScoresWithAddress1( scoreRDD : RDD[(Long, Double)], + addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { + val joinedRDD = scoreRDD.join(addressRDD) + joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) + } + //end::joinScoresWithAddress[] + + //tag::leftOuterJoinScoresWithAddress[] + def outerJoinScoresWithAddress(scoreRDD : RDD[(Long, Double)], + addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, Option[String]))]= { + val joinedRDD = scoreRDD.leftOuterJoin(addressRDD) + joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) + } + //end::leftOuterJoinScoresWithAddress[] + + //tag::joinScoresWithAddressFast[] + def joinScoresWithAddress2(scoreRDD : RDD[(Long, Double)], + addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, String))]= { + val bestScoreData = scoreRDD.reduceByKey((x, y) => if(x > y) x else y) + bestScoreData.join(addressRDD) + } + //end::joinScoresWithAddressFast[] +/* + We could make the example in the previous section even faster, + by using the partitioner for the address data as an argument for + the reduce by key step. + 'ToDO: Insert the code to show this here' */ + //tag::joinScoresWithAddress3[] + def joinScoresWithAddress3(scoreRDD: RDD[(Long, Double)], + addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, String))]= { + // If addressRDD has a known partitioner we should use that, + // otherwise it has a default hash parttioner, which we can reconstruct by + // getting the number of partitions. 
+ val addressDataPartitioner = addressRDD.partitioner match { + case (Some(p)) => p + case (None) => new HashPartitioner(addressRDD.partitions.length) + } + val bestScoreData = scoreRDD.reduceByKey(addressDataPartitioner, + (x, y) => if(x > y) x else y) + bestScoreData.join(addressRDD) + } + //end::joinScoresWithAddress3[] + + def debugString(scoreRDD: RDD[(Long, Double)], + addressRDD: RDD[(Long, String)]) = { + //tag::debugString[] + scoreRDD.join(addressRDD).toDebugString + //end::debugString[] + } + + /* + * Suppose we had two datasets of information about each panda, + * one with the scores, and one with there favorite foods. + * We could use cogroup to associate each Pandas id with an iterator + * of their scores and another iterator of their favorite foods. + */ + def coGroupExample(scoreRDD: RDD[(Long, Double)], foodRDD: RDD[(Long, String)], + addressRDD: RDD[(Long, String)]) = { + //tag::coGroupExample1[] + val cogroupedRDD: RDD[(Long, (Iterable[Double], Iterable[String]))] = + scoreRDD.cogroup(foodRDD) + //end::coGroupExample1[] + + /* + * For example, if we needed to join the panda score data with both address + * and favorite foods, it would be better to use co group than two + * join operations. + */ + //tag::coGroupExample2[] + val addressScoreFood = addressRDD.cogroup(scoreRDD, foodRDD) + //end::coGroupExample2[] + } + + /** + * Performs a broad cast hash join for two RDDs. + * @param bigRDD - the first rdd, should be the larger RDD + * @param smallRDD - the small rdd, should be small enough to fit in memory + * @tparam K - The type of the key + * @tparam V1 - The type of the values for the large array + * @tparam V2 - The type of the values for the second array + * @return + */ + //tag::coreBroadCast[] + def manualBroadCastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, + V2 : ClassTag](bigRDD : RDD[(K, V1)], + smallRDD : RDD[(K, V2)])= { + val smallRDDLocal: Map[K, V2] = smallRDD.collectAsMap() + val smallRDDLocalBcast = bigRDD.sparkContext.broadcast(smallRDDLocal) + bigRDD.mapPartitions(iter => { + iter.flatMap{ + case (k,v1 ) => + smallRDDLocalBcast.value.get(k) match { + case None => Seq.empty[(K, (V1, V2))] + case Some(v2) => Seq((k, (v1, v2))) + } + } + }, preservesPartitioning = true) + } + //end:coreBroadCast[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala b/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala new file mode 100644 index 0000000..2b73ba4 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/goldilocks/SecondarySort.scala @@ -0,0 +1,185 @@ +package com.highperformancespark.examples.goldilocks + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +import org.apache.spark.{HashPartitioner, Partitioner} +import org.apache.spark.rdd.RDD + +object PandaSecondarySort { + + /** + * Sort first by panda Id (a tuple of four things) Name, address, zip, happiness, + * Then by city, zip, and name. 
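+   * That is, each record is keyed by a PandaKey(city, zip, houseNumber, name)
+   * and the implicit ordering below sorts those keys by (city, zip, name).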
+ * + * @param rdd + * @return + */ + def secondarySort(rdd : RDD[(String, StreetAddress, Int, Double)]) = { + val keyedRDD: RDD[(PandaKey, (String, StreetAddress, Int, Double))] = rdd.map { + case (fullName, address, zip, happiness) => + (PandaKey(address.city, zip, address.houseNumber, fullName), + (fullName, address, zip, happiness)) + } + + //tag::implicitOrdering[] + implicit def orderByLocationAndName[A <: PandaKey]: Ordering[A] = { + Ordering.by(pandaKey => (pandaKey.city, pandaKey.zip, pandaKey.name)) + } + //end::implicitOrdering[] + + keyedRDD.sortByKey().values + } + + def groupByCityAndSortWithinGroups( + rdd : RDD[(String, StreetAddress, Int, Double)]) = { + val keyedRDD: RDD[(PandaKey, (String, StreetAddress, Int, Double))] = rdd.map { + case (fullName, address, zip, happiness) => + (PandaKey(address.city, zip, address.houseNumber, fullName), + (fullName, address, zip, happiness)) + } + + val pandaPartitioner = new PandaKeyPartitioner(rdd.partitions.length) + + implicit def orderByLocationAndName[A <: PandaKey]: Ordering[A] = { + Ordering.by(pandaKey => (pandaKey.city, pandaKey.zip, pandaKey.name)) + } + keyedRDD.repartitionAndSortWithinPartitions(pandaPartitioner) + val sortedOnPartitions: RDD[(PandaKey, (String, StreetAddress, Int, Double))] = + keyedRDD.repartitionAndSortWithinPartitions(pandaPartitioner) + sortedOnPartitions.mapPartitions( + iter => { + val typedIter = iter.map(x => (x, 1)) + SecondarySort.groupSorted(typedIter) + }) + } +} + +case class PandaKey(city : String, zip : Int, addressNumber : Long, name : String ) +case class StreetAddress(city : String, streetName : String, houseNumber : Long ) + +class PandaKeyPartitioner(override val numPartitions: Int) extends Partitioner { + require(numPartitions >= 0, + s"Number of partitions ($numPartitions) cannot be negative.") + + override def getPartition(key: Any): Int = { + val k = key.asInstanceOf[PandaKey] + Math.abs(k.city.hashCode) % numPartitions //hashcode of city + } +} + +/** + * A general implemention of Secondary Sort + */ +object SecondarySort { + + //tag::sortByTwoKeys[] + def sortByTwoKeys[K : Ordering : ClassTag, + S : Ordering : ClassTag, + V : ClassTag]( + pairRDD : RDD[((K, S), V)], partitions : Int ) = { + val colValuePartitioner = new PrimaryKeyPartitioner[K, S](partitions) + + //tag::implicitOrdering[] + + implicit val ordering: Ordering[(K, S)] = Ordering.Tuple2 + //end::implicitOrdering[] + val sortedWithinParts = pairRDD.repartitionAndSortWithinPartitions( + colValuePartitioner) + sortedWithinParts + } + //end::sortByTwoKeys[] + + //tag::sortAndGroup[] + def groupByKeyAndSortBySecondaryKey[K : Ordering : ClassTag, + S : Ordering : ClassTag, + V : ClassTag] + (pairRDD : RDD[((K, S), V)], partitions : Int): + RDD[(K, List[(S, V)])] = { + //Create an instance of our custom partitioner + val colValuePartitioner = new PrimaryKeyPartitioner[Double, Int](partitions) + + //define an implicit ordering, to order by the second key the ordering will + //be used even though not explicitly called + implicit val ordering: Ordering[(K, S)] = Ordering.Tuple2 + + //use repartitionAndSortWithinPartitions + val sortedWithinParts = + pairRDD.repartitionAndSortWithinPartitions(colValuePartitioner) + + sortedWithinParts.mapPartitions( iter => groupSorted[K, S, V](iter) ) + } + + def groupSorted[K,S,V]( + it: Iterator[((K, S), V)]): Iterator[(K, List[(S, V)])] = { + val res = List[(K, ArrayBuffer[(S, V)])]() + it.foldLeft(res)((list, next) => list match { + case Nil => + val ((firstKey, secondKey), value) = next + 
List((firstKey, ArrayBuffer((secondKey, value)))) + + case head :: rest => + val (curKey, valueBuf) = head + val ((firstKey, secondKey), value) = next + if (!firstKey.equals(curKey) ) { + (firstKey, ArrayBuffer((secondKey, value))) :: list + } else { + valueBuf.append((secondKey, value)) + list + } + + }).map { case (key, buf) => (key, buf.toList) }.iterator + } + //end::sortAndGroup[] + +} + +//tag::primaryKeyPartitioner[] +class PrimaryKeyPartitioner[K, S](partitions: Int) extends Partitioner { + /** + * We create a hash partitioner and use it with the first set of keys. + */ + val delegatePartitioner = new HashPartitioner(partitions) + + override def numPartitions = delegatePartitioner.numPartitions + + /** + * Partition according to the hash value of the first key + */ + override def getPartition(key: Any): Int = { + val k = key.asInstanceOf[(K, S)] + delegatePartitioner.getPartition(k._1) + } +} +//end::primaryKeyPartitioner[] + +object CoPartitioningLessons { + + def coLocated(a : RDD[(Int, String)], b : RDD[(Int, String)], + partitionerX : Partitioner, partitionerY :Partitioner): Unit = { + + //tag::coLocated[] + val rddA = a.partitionBy(partitionerX) + rddA.cache() + val rddB = b.partitionBy(partitionerY) + rddB.cache() + val rddC = a.cogroup(b) + rddC.count() + //end::coLocated[] + } + + def notCoLocated(a : RDD[(Int, String)], b : RDD[(Int, String )], + partitionerX : Partitioner, partitionerY :Partitioner): Unit = { + + //tag::notCoLocated[] + val rddA = a.partitionBy(partitionerX) + rddA.cache() + val rddB = b.partitionBy(partitionerY) + rddB.cache() + val rddC = a.cogroup(b) + rddA.count() + rddB.count() + rddC.count() + //end::notCoLocated[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala b/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala new file mode 100644 index 0000000..2b87a7e --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -0,0 +1,160 @@ +package com.highperformancespark.examples.ml + +import com.highperformancespark.examples.dataframe._ + +import scala.collection.{Map, mutable} +import scala.collection.mutable.{ArrayBuffer, MutableList} + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.ml._ +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg._ +//tag::extraImports[] +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable +//end::extraImports[] + +//tag::basicPipelineSetup[] +class HardCodedWordCountStage(override val uid: String) extends Transformer { + def this() = this(Identifiable.randomUID("hardcodedwordcount")) + + def copy(extra: ParamMap): HardCodedWordCountStage = { + defaultCopy(extra) + } +//end::basicPipelineSetup[] + + //tag::basicTransformSchema[] + override def transformSchema(schema: StructType): StructType = { + // Check that the input type is a string + val idx = schema.fieldIndex("happy_pandas") + val field = schema.fields(idx) + if (field.dataType != StringType) { + throw new Exception( + s"Input type ${field.dataType} did not match input type StringType") + } + // Add the return field + schema.add(StructField("happy_panda_counts", IntegerType, false)) + } + //end::basicTransformSchema[] + + //tag::transformFunction[] + def transform(df: Dataset[_]): DataFrame = { + val wordcount = udf { in: String => 
in.split(" ").size } + df.select(col("*"), + wordcount(df.col("happy_pandas")).as("happy_panda_counts")) + } + //end::transformFunction[] +} + + +//tag::paramTransformer[] +class ConfigurableWordCount(override val uid: String) extends Transformer { + final val inputCol= new Param[String](this, "inputCol", "The input column") + final val outputCol = new Param[String](this, "outputCol", "The output column") + + def setInputCol(value: String): this.type = set(inputCol, value) + + def setOutputCol(value: String): this.type = set(outputCol, value) + + def this() = this(Identifiable.randomUID("configurablewordcount")) + + def copy(extra: ParamMap): HardCodedWordCountStage = { + defaultCopy(extra) + } + + override def transformSchema(schema: StructType): StructType = { + // Check that the input type is a string + val idx = schema.fieldIndex($(inputCol)) + val field = schema.fields(idx) + if (field.dataType != StringType) { + throw new Exception( + s"Input type ${field.dataType} did not match input type StringType") + } + // Add the return field + schema.add(StructField($(outputCol), IntegerType, false)) + } + + def transform(df: Dataset[_]): DataFrame = { + val wordcount = udf { in: String => in.split(" ").size } + df.select(col("*"), wordcount(df.col($(inputCol))).as($(outputCol))) + } +} +//end::paramTransformer[] + + +//tag::simpleIndexer[] +trait SimpleIndexerParams extends Params { + final val inputCol= new Param[String](this, "inputCol", "The input column") + final val outputCol = new Param[String](this, "outputCol", "The output column") +} + +class SimpleIndexer(override val uid: String) + extends Estimator[SimpleIndexerModel] with SimpleIndexerParams { + + def setInputCol(value: String) = set(inputCol, value) + + def setOutputCol(value: String) = set(outputCol, value) + + def this() = this(Identifiable.randomUID("simpleindexer")) + + override def copy(extra: ParamMap): SimpleIndexer = { + defaultCopy(extra) + } + + override def transformSchema(schema: StructType): StructType = { + // Check that the input type is a string + val idx = schema.fieldIndex($(inputCol)) + val field = schema.fields(idx) + if (field.dataType != StringType) { + throw new Exception( + s"Input type ${field.dataType} did not match input type StringType") + } + // Add the return field + schema.add(StructField($(outputCol), IntegerType, false)) + } + + override def fit(dataset: Dataset[_]): SimpleIndexerModel = { + import dataset.sparkSession.implicits._ + val words = dataset.select(dataset($(inputCol)).as[String]).distinct + .collect() + // Construct the model + val model = new SimpleIndexerModel(uid, words) + // Copy the parameters to the model + copyValues(model) + } +} + +class SimpleIndexerModel(override val uid: String, words: Array[String]) + extends Model[SimpleIndexerModel] with SimpleIndexerParams { + + override def copy(extra: ParamMap): SimpleIndexerModel = { + defaultCopy(extra) + } + + private val labelToIndex: Map[String, Double] = words.zipWithIndex. 
+ map{case (x, y) => (x, y.toDouble)}.toMap + + override def transformSchema(schema: StructType): StructType = { + // Check that the input type is a string + val idx = schema.fieldIndex($(inputCol)) + val field = schema.fields(idx) + if (field.dataType != StringType) { + throw new Exception( + s"Input type ${field.dataType} did not match input type StringType") + } + // Add the return field + schema.add(StructField($(outputCol), IntegerType, false)) + } + + override def transform(dataset: Dataset[_]): DataFrame = { + val indexer = udf { label: String => labelToIndex(label) } + dataset.select(col("*"), + indexer(dataset($(inputCol)).cast(StringType)).as($(outputCol))) + } +} +//end::SimpleIndexer[] diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala b/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala new file mode 100644 index 0000000..9b16e6b --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala @@ -0,0 +1,11 @@ +package com.highperformancespark.examples.ml + +import org.apache.spark.ml.classification._ + +object SimpleExport { + //tag::exportLR[] + def exportLRToCSV(model: LogisticRegressionModel) = { + (model.coefficients.toArray :+ model.intercept).mkString(",") + } + //end::exportLR[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala new file mode 100644 index 0000000..13e937f --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -0,0 +1,135 @@ +package com.highperformancespark.examples.ml + +import com.highperformancespark.examples.dataframe._ + +import scala.collection.{Map, mutable} +import scala.collection.mutable.{ArrayBuffer, MutableList} + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.ml._ +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg._ +//tag::extraImports[] +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable +//end::extraImports[] + +case class LabeledToken(label: Double, index: Integer) +//tag::SimpleNaiveBayes[] +// Simple Bernouli Naive Bayes classifier - no sanity checks for brevity +// Example only - not for production use. +class SimpleNaiveBayes(val uid: String) + extends Classifier[Vector, SimpleNaiveBayes, SimpleNaiveBayesModel] { + + def this() = this(Identifiable.randomUID("simple-naive-bayes")) + + override def train(ds: Dataset[_]): SimpleNaiveBayesModel = { + import ds.sparkSession.implicits._ + ds.cache() + // Note: you can use getNumClasses & extractLabeledPoints to get an RDD instead + // Using the RDD approach is common when integrating with legacy machine + // learning code or iterative algorithms which can create large query plans. + // Compute the number of documents + val numDocs = ds.count + // Get the number of classes. 
+ // Note this estimator assumes they start at 0 and go to numClasses + val numClasses = getNumClasses(ds) + // Get the number of features by peaking at the first row + val numFeatures: Integer = ds.select(col($(featuresCol))).head + .get(0).asInstanceOf[Vector].size + // Determine the number of records for each class + val groupedByLabel = ds.select(col($(labelCol)).as[Double]).groupByKey(x => x) + val classCounts = groupedByLabel.agg(count("*").as[Long]) + .sort(col("value")).collect().toMap + // Select the labels and features so we can more easily map over them. + // Note: we do this as a DataFrame using the untyped API because the Vector + // UDT is no longer public. + val df = ds.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))) + // Figure out the non-zero frequency of each feature for each label and + // output label index pairs using a case clas to make it easier to work with. + val labelCounts: Dataset[LabeledToken] = df.flatMap { + case Row(label: Double, features: Vector) => + features.toArray.zip(Stream from 1) + .filter{vIdx => vIdx._2 == 1.0} + .map{case (v, idx) => LabeledToken(label, idx)} + } + // Use the typed Dataset aggregation API to count the number of non-zero + // features for each label-feature index. + val aggregatedCounts: Array[((Double, Integer), Long)] = labelCounts + .groupByKey(x => (x.label, x.index)) + .agg(count("*").as[Long]).collect() + + val theta = Array.fill(numClasses)(new Array[Double](numFeatures)) + + // Compute the denominator for the general prioirs + val piLogDenom = math.log(numDocs + numClasses) + // Compute the priors for each class + val pi = classCounts.map{case(_, cc) => + math.log(cc.toDouble) - piLogDenom }.toArray + + // For each label/feature update the probabilities + aggregatedCounts.foreach{case ((label, featureIndex), count) => + // log of number of documents for this label + 2.0 (smoothing) + val thetaLogDenom = math.log( + classCounts.get(label).map(_.toDouble).getOrElse(0.0) + 2.0) + theta(label.toInt)(featureIndex) = math.log(count + 1.0) - thetaLogDenom + } + // Unpersist now that we are done computing everything + ds.unpersist() + // Construct a model + val model = new SimpleNaiveBayesModel( + uid, numClasses, numFeatures, Vectors.dense(pi), + new DenseMatrix(numClasses, theta(0).length, theta.flatten, true)) + // Copy the params values to the model + copyValues(model) + } + + override def copy(extra: ParamMap): SimpleNaiveBayes = { + defaultCopy(extra) + } +} + +// Simplified Naive Bayes Model +case class SimpleNaiveBayesModel( + override val uid: String, + override val numClasses: Int, + override val numFeatures: Int, + val pi: Vector, + val theta: DenseMatrix) extends + ClassificationModel[Vector, SimpleNaiveBayesModel] { + + override def copy(extra: ParamMap): SimpleNaiveBayesModel = { + val copied = new SimpleNaiveBayesModel(uid, numClasses, numFeatures, pi, theta) + copyValues(copied, extra).setParent(parent) + } + + // We have to do some tricks here because we are using Spark's + // Vector/DenseMatrix calculations - but for your own model don't feel + // limited to Spark's native ones. 
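+  // theta stores log P(feature|class), so negTheta below is
+  // log(1 - P(feature|class)) and thetaMinusNegTheta their difference;
+  // predictRaw then reduces to a matrix-vector multiply plus the log priors,
+  // as in standard Bernoulli Naive Bayes.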
+ val negThetaArray = theta.values.map(v => math.log(1.0 - math.exp(v))) + val negTheta = new DenseMatrix(numClasses, numFeatures, negThetaArray, true) + val thetaMinusNegThetaArray = theta.values.zip(negThetaArray) + .map{case (v, nv) => v - nv} + val thetaMinusNegTheta = new DenseMatrix( + numClasses, numFeatures, thetaMinusNegThetaArray, true) + val onesVec = Vectors.dense(Array.fill(theta.numCols)(1.0)) + val negThetaSum: Array[Double] = negTheta.multiply(onesVec).toArray + + // Here is the prediciton functionality you need to implement - for + // ClassificationModels transform automatically wraps this. + // If you might benefit from broadcasting your model or other optimizations you + // can override transform and place your desired logic there. + def predictRaw(features: Vector): Vector = { + // Toy implementation - use BLAS or similar instead + // the summing of the three vectors but the functionality isn't exposed. + Vectors.dense(thetaMinusNegTheta.multiply(features).toArray.zip(pi.toArray) + .map{case (x, y) => x + y}.zip(negThetaSum).map{case (x, y) => x + y} + ) + } +} +//end::SimpleNaiveBayes[] diff --git a/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala b/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala new file mode 100644 index 0000000..9117c74 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala @@ -0,0 +1,195 @@ +package com.highperformancespark.examples.ml + +import com.highperformancespark.examples.dataframe._ + +import scala.collection.{Map, mutable} +import scala.collection.mutable.{ArrayBuffer, MutableList} + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +//tag::basicImport[] +import org.apache.spark.ml._ +import org.apache.spark.ml.feature._ +import org.apache.spark.ml.classification._ +//end::basicImport[] +//tag::renameImport[] +import org.apache.spark.ml.linalg.{Vector => SparkVector} +//end::renameImport[] +import org.apache.spark.ml.param._ +import org.apache.spark.ml.tuning._ + +object SimplePipeline { + def constructAndSetParams(df: DataFrame) = { + val sqlCtx = df.sqlContext + //tag::constructSetParams[] + val hashingTF = new HashingTF() + hashingTF.setInputCol("input") + hashingTF.setOutputCol("hashed_terms") + //end::constructSetParams[] + } + + def constructSimpleTransformer(df: DataFrame) = { + val sqlCtx = df.sqlContext + //tag::simpleTransformer[] + val hashingTF = new HashingTF() + // We don't set the output column here so the default output column of + // uid + "__output" is used. 
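+    // (something along the lines of "hashingTF_<random id>__output", which is
+    // why we read it back with getOutputCol below instead of hard coding it)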
+ hashingTF.setInputCol("input") + // Transformer the input + val transformed = hashingTF.transform(df) + // Since we don't know what the uid is we can use the getOutputCol function + val outputCol = hashingTF.getOutputCol + //end::simpleTransformer[] + (outputCol, transformed) + } + + def constructVectorAssembler() = { + //tag::vectorAssembler[] + val assembler = new VectorAssembler() + assembler.setInputCols(Array("size", "zipcode")) + //end::vectorAssembler[] + } + + // Here is a simple tokenizer to hashingtf transformer manually chained + def simpleTokenizerToHashing(df: DataFrame) = { + //tag::simpleTokenizerToHashing[] + val tokenizer = new Tokenizer() + tokenizer.setInputCol("name") + tokenizer.setOutputCol("tokenized_name") + val tokenizedData = tokenizer.transform(df) + val hashingTF = new HashingTF() + hashingTF.setInputCol("tokenized_name") + hashingTF.setOutputCol("name_tf") + hashingTF.transform(tokenizedData) + //end::simpleTokenizerToHashing[] + } + + def constructSimpleEstimator(df: DataFrame) = { + val sqlCtx = df.sqlContext + //tag::simpleNaiveBayes[] + val nb = new NaiveBayes() + nb.setLabelCol("happy") + nb.setFeaturesCol("features") + nb.setPredictionCol("prediction") + val nbModel = nb.fit(df) + //end::simpleNaiveBayes[] + } + + def stringIndexer(df: DataFrame) = { + //tag::stringIndexer[] + // Construct a simple string indexer + val sb = new StringIndexer() + sb.setInputCol("name") + sb.setOutputCol("indexed_name") + // Construct the model based on the input + val sbModel = sb.fit(df) + //end::stringIndexer[] + } + + def reverseStringIndexer(sbModel: StringIndexerModel) = { + //tag::indexToString[] + // Construct the inverse of the model to go from index-to-string + // after prediction. + val sbInverse = new IndexToString() + sbInverse.setInputCol("prediction") + sbInverse.setLabels(sbModel.labels) + //end::indexToString[] + // Or if meta data is present + //tag::indexToStringMD[] + // Construct the inverse of the model to go from + // index-to-string after prediction. + val sbInverseMD = new IndexToString() + sbInverseMD.setInputCol("prediction") + //end::indexToStringMD[] + } + + def normalizer() = { + //tag::normalizer[] + val normalizer = new Normalizer() + normalizer.setInputCol("features") + normalizer.setOutputCol("normalized_features") + //end::normalizer[] + } + + def paramSearch(df: DataFrame) = { + val tokenizer = new Tokenizer() + tokenizer.setInputCol("name") + tokenizer.setOutputCol("tokenized_name") + val hashingTF = new HashingTF() + hashingTF.setInputCol("tokenized_name") + hashingTF.setOutputCol("name_tf") + val assembler = new VectorAssembler() + assembler.setInputCols(Array("size", "zipcode", "name_tf", + "attributes")) + val normalizer = new Normalizer() + normalizer.setInputCol("features") + normalizer.setOutputCol("normalized_features") + val nb = new NaiveBayes() + nb.setLabelCol("happy") + nb.setFeaturesCol("normalized_features") + nb.setPredictionCol("prediction") + val pipeline = new Pipeline() + pipeline.setStages(Array(tokenizer, hashingTF, assembler, normalizer, nb)) + //tag::createSimpleParamGrid[] + // ParamGridBuilder constructs an Array of parameter combinations. 
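+    // Here a single grid over four smoothing values yields four ParamMaps;
+    // the CrossValidator below evaluates the whole pipeline for each of them.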
+ val paramGrid: Array[ParamMap] = new ParamGridBuilder() + .addGrid(nb.smoothing, Array(0.1, 0.5, 1.0, 2.0)) + .build() + //end::createSimpleParamGrid[] + //tag::runSimpleCVSearch[] + val cv = new CrossValidator() + .setEstimator(pipeline) + .setEstimatorParamMaps(paramGrid) + val cvModel = cv.fit(df) + val bestModel = cvModel.bestModel + //end::runSimpleCVSearch[] + //tag::complexParamSearch[] + val complexParamGrid: Array[ParamMap] = new ParamGridBuilder() + .addGrid(nb.smoothing, Array(0.1, 0.5, 1.0, 2.0)) + .addGrid(hashingTF.numFeatures, Array(1 << 18, 1 << 20)) + .addGrid(hashingTF.binary, Array(true, false)) + .addGrid(normalizer.p, Array(1.0, 1.5, 2.0)) + .build() + //end::complexParamSearch[] + bestModel + } + + def buildSimplePipeline(df: DataFrame) = { + //tag::simplePipeline[] + val tokenizer = new Tokenizer() + tokenizer.setInputCol("name") + tokenizer.setOutputCol("tokenized_name") + val hashingTF = new HashingTF() + hashingTF.setInputCol("tokenized_name") + hashingTF.setOutputCol("name_tf") + val assembler = new VectorAssembler() + assembler.setInputCols(Array("size", "zipcode", "name_tf", + "attributes")) + val nb = new NaiveBayes() + nb.setLabelCol("happy") + nb.setFeaturesCol("features") + nb.setPredictionCol("prediction") + val pipeline = new Pipeline() + pipeline.setStages(Array(tokenizer, hashingTF, assembler, nb)) + //end::simplePipeline[] + //tag::trainPipeline[] + val pipelineModel = pipeline.fit(df) + //end::trainPipeline[] + //tag::accessStages[] + val tokenizer2 = pipelineModel.stages(0).asInstanceOf[Tokenizer] + val nbFit = pipelineModel.stages.last.asInstanceOf[NaiveBayesModel] + //end::accessStages[] + //tag::newPipeline[] + val normalizer = new Normalizer() + normalizer.setInputCol("features") + normalizer.setOutputCol("normalized_features") + nb.setFeaturesCol("normalized_features") + pipeline.setStages(Array(tokenizer, hashingTF, assembler, normalizer, nb)) + val normalizedPipelineModel = pipelineModel.transform(df) + //end::newPipeline[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala b/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala new file mode 100644 index 0000000..ddbc9d6 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala @@ -0,0 +1,211 @@ +package com.highperformancespark.examples.mllib + +import com.highperformancespark.examples.dataframe._ + +import scala.collection.{Map, mutable} +import scala.collection.mutable.{ArrayBuffer, MutableList} + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +//tag::imports[] +import com.github.fommil.netlib.BLAS.{getInstance => blas} +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, + LogisticRegressionModel} +// Rename Vector to SparkVector to avoid conflicts with Scala's Vector class +import org.apache.spark.mllib.linalg.{Vector => SparkVector} +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.feature._ +//end::imports[] + +object GoldilocksMLlib { + + def booleanToDouble(boolean: Boolean): Double = { + if (boolean) 1.0 else 0.0 + } + + def toLabeledPointDense(rdd: RDD[RawPanda]): RDD[LabeledPoint] = { + //tag::toLabeledPointDense[] + rdd.map(rp => + LabeledPoint(booleanToDouble(rp.happy), + Vectors.dense(rp.attributes))) + //end::toLabeledPointDense[] + } + + //tag::toSparkVectorDense[] + def toSparkVectorDense(input: Array[Double]) = { + Vectors.dense(input) + } + 
//end::toSparkVectorDense[] + + //tag::selectTopTen[] + def selectTopTenFeatures(rdd: RDD[LabeledPoint]): + (ChiSqSelectorModel, Array[Int], RDD[SparkVector]) = { + val selector = new ChiSqSelector(10) + val model = selector.fit(rdd) + val topFeatures = model.selectedFeatures + val vecs = rdd.map(_.features) + (model, topFeatures, model.transform(vecs)) + } + //end::selectTopTen[] + + //tag::keepLabeled[] + def selectAndKeepLabeled(rdd: RDD[LabeledPoint]): RDD[LabeledPoint] = { + val selector = new ChiSqSelector(10) + val model = selector.fit(rdd) + rdd.map{ + case LabeledPoint(label, features) => + LabeledPoint(label, model.transform(features)) + } + } + //end::keepLabeled[] + + //tag::createLabelLookup[] + def createLabelLookup[T](rdd: RDD[T]): Map[T, Double] = { + val distinctLabels: Array[T] = rdd.distinct().collect() + distinctLabels.zipWithIndex + .map{case (label, x) => (label, x.toDouble)}.toMap + } + //end::createLabelLookup[] + + + //tag::hashingTFSimple[] + def hashingTf(rdd: RDD[String]): RDD[SparkVector] = { + val ht = new HashingTF() + val tokenized = rdd.map(_.split(" ").toIterable) + ht.transform(tokenized) + } + //end::hashingTFSimple[] + + //tag::word2vecTrain[] + def word2vecTrain(rdd: RDD[String]): Word2VecModel = { + // Tokenize our data + val tokenized = rdd.map(_.split(" ").toIterable) + // Construct our word2vec model + val wv = new Word2Vec() + wv.fit(tokenized) + } + //end::word2vecTrain[] + + + //tag::trainScaler[] + // Trains a feature scaler and returns the scaler and scaled features + def trainScaler(rdd: RDD[SparkVector]): (StandardScalerModel, RDD[SparkVector]) = { + val scaler = new StandardScaler() + val scalerModel = scaler.fit(rdd) + (scalerModel, scalerModel.transform(rdd)) + } + //end::trainScaler[] + + //tag::word2vecSimple[] + def word2vec(sc: SparkContext, rdd: RDD[String]): RDD[SparkVector] = { + // Tokenize our data + val tokenized = rdd.map(_.split(" ").toIterable) + // Construct our word2vec model + val wv = new Word2Vec() + val wvm = wv.fit(tokenized) + val wvmb = sc.broadcast(wvm) + // WVM can now transform single words + println(wvm.transform("panda")) + // Vector size is 100 - we use this to build a transformer on top of WVM that + // works on sentences. + val vectorSize = 100 + // The transform function works on a per-word basis, but we have + // sentences as input. 
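+    // So we build one fixed-size vector per sentence by summing the word
+    // vectors (BLAS daxpy) and then scaling by 1 / number of words.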
+ tokenized.map{words => + // If there is nothing in the sentence output a null vector + if (words.isEmpty) { + Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) + } else { + // If there are sentences construct a running sum of the + // vectors for each word + val sum = Array[Double](vectorSize) + words.foreach { word => + blas.daxpy( + vectorSize, 1.0, wvmb.value.transform(word).toArray, 1, sum, 1) + } + // Then scale it by the number of words + blas.dscal(sum.length, 1.0 / words.size, sum, 1) + // And wrap it in a Spark vector + Vectors.dense(sum) + } + } + } + //end::word2vecSimple[] + + //tag::hashingTFPreserve[] + def toVectorPerserving(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { + val ht = new HashingTF() + rdd.map{panda => + val textField = panda.pt + val tokenizedTextField = textField.split(" ").toIterable + (panda, ht.transform(tokenizedTextField)) + } + } + //end::hashingTFPreserve[] + + //tag::hashingTFPreserveZip[] + def hashingTFPreserveZip(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { + val ht = new HashingTF() + val tokenized = rdd.map{panda => panda.pt.split(" ").toIterable} + val vecs = ht.transform(tokenized) + rdd.zip(vecs) + } + //end::hashingTFPreserveZip[] + + //tag::toLabeledPointWithHashing[] + def toLabeledPointWithHashing(rdd: RDD[RawPanda]): RDD[LabeledPoint] = { + val ht = new HashingTF() + rdd.map{rp => + val hashingVec = ht.transform(rp.pt) + val combined = hashingVec.toArray ++ rp.attributes + LabeledPoint(booleanToDouble(rp.happy), + Vectors.dense(combined)) + } + } + //end::toLabeledPointWithHashing[] + + //tag::train[] + def trainModel(rdd: RDD[LabeledPoint]): LogisticRegressionModel = { + val lr = new LogisticRegressionWithLBFGS() + val lrModel = lr.run(rdd) + lrModel + } + //end::train[] + + //tag::trainWithIntercept[] + def trainModelWithInterept(rdd: RDD[LabeledPoint]): LogisticRegressionModel = { + val lr = new LogisticRegressionWithLBFGS() + lr.setIntercept(true) + val lrModel = lr.run(rdd) + lrModel + } + //end::trainWithIntercept[] + + //tag::predict[] + def predict(model: LogisticRegressionModel, rdd: RDD[SparkVector]): RDD[Double] = { + model.predict(rdd) + } + //end::predict[] + + //tag::save[] + def save(sc: SparkContext, path: String, model: LogisticRegressionModel) = { + //tag::savePMML[] + // Save to PMML - remote path + model.toPMML(sc, path + "/pmml") + // Save to PMML local path + model.toPMML(path + "/pmml") + //end::savePMML[] + //tag::saveInternal[] + // Save to internal - remote path + model.save(sc, path + "/internal") + //end::saveInternal[] + } + //end::save[] + + //tag::load[] + def load(sc: SparkContext, path: String): LogisticRegressionModel = { + LogisticRegressionModel.load(sc, path + "/internal") + } + //end::load[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala b/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala new file mode 100644 index 0000000..198518d --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -0,0 +1,9 @@ +package com.highperformancespark.examples.ffi + +import org.apache.spark.rdd.RDD + +object NativeExample { + def jniSum(input: RDD[(String, Array[Int])]): RDD[(String, Int)] = { + input.mapValues(values => new SumJNI().sum(values)) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala b/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala new file mode 100644 index 0000000..40eb61f --- 
/dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.highperformancespark.examples.ffi + +import org.apache.spark.rdd._ +import org.apache.spark.{SparkContext, SparkFiles} + +object PipeExample { + //tag::pipeExample[] + def lookupUserPRS(sc: SparkContext, input: RDD[Int]): RDD[(Int, List[String])] = { + // Copy our script to the worker nodes with sc.addFile + // Add file requires absolute paths + val distScriptName = "ghinfo.pl" + val userDir = System.getProperty("user.dir") + val localScript = s"${userDir}/src/main/perl/${distScriptName}" + val addedFile = sc.addFile(localScript) + + // Pass enviroment variables to our worker + val enviromentVars = Map("user" -> "apache", "repo" -> "spark") + val result = input.map(x => x.toString) + .pipe(SparkFiles.get(distScriptName), enviromentVars) + // Parse the results + result.map{record => + val elems: Array[String] = record.split(" ") + (elems(0).toInt, elems.slice(1, elems.size).sorted.distinct.toList) + } + } + //end::pipeExample[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala b/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala new file mode 100644 index 0000000..485c73d --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala @@ -0,0 +1,10 @@ +package com.highperformancespark.examples.ffi + +object StandAlone { + def main(args: Array[String]) { + //tag::systemLoadLibrary[] + System.loadLibrary("highPerformanceSpark0") + //end::systemLoadLibrary[] + println(new SumJNI().sum(Array(1,2,3))) + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala b/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala new file mode 100644 index 0000000..d6abf6a --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala @@ -0,0 +1,14 @@ +package com.highperformancespark.examples.ffi + +// tag::sumFJNA[] +import com.sun.jna._ +import com.sun.jna.ptr._ +object SumFJNA { + Native.register("high-performance-spark0") + @native def sumf(n: IntByReference, a: Array[Int]): Int + def easySum(size: Int, a: Array[Int]): Int = { + val ns = new IntByReference(size) + sumf(ns, a) + } +} +// end::sumFJNA[] diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala b/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala new file mode 100644 index 0000000..fe9d8f2 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala @@ -0,0 +1,9 @@ +package com.highperformancespark.examples.ffi + +// tag::sumJNA[] +import com.sun.jna._ +object SumJNA { + 
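+  // Load the shared library once; JNA's direct mapping binds the @native
+  // declaration below to the matching native symbol, with no hand-written
+  // JNI glue code.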
Native.register("high-performance-spark0") + @native def sum(n: Array[Int], size: Int): Int +} +// end::sumJNA[] diff --git a/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala b/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala new file mode 100644 index 0000000..ed0caaf --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala @@ -0,0 +1,12 @@ +package com.highperformancespark.examples.ffi + +import ch.jodersky.jni.nativeLoader + +//tag::sumJNIDecorator[] +@nativeLoader("high-performance-spark0") +//end::sumJNIDecorator[] +// tag::sumJNI[] +class SumJNI { + @native def sum(n: Array[Int]): Int +} +// end::sumJNI[] diff --git a/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala new file mode 100644 index 0000000..b6e59ae --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package com.highperformancespark.examples.perf
+
+import com.highperformancespark.examples.dataframe.RawPanda
+import com.highperformancespark.examples.tools._
+
+import org.apache.spark.rdd._
+import org.apache.spark.{SparkContext, SparkConf}
+import org.apache.spark.sql.{SparkSession, DataFrame, Dataset, Row}
+import org.apache.spark.sql.types._
+
+/**
+ * A simple performance test comparing the same aggregation done with
+ * RDDs (reduceByKey and groupByKey) and with the DataFrame API.
+ */
+object SimplePerfTest {
+  def main(args: Array[String]) = {
+    val sparkConf = new SparkConf().setAppName("simple-perf-test")
+    val sparkSession = SparkSession.builder()
+      .config(sparkConf).enableHiveSupport().getOrCreate()
+    val sc = sparkSession.sparkContext
+    val scalingFactor = if (args.length > 0) args(0).toLong else 100L
+    val size = if (args.length > 1) args(1).toInt else 50
+    run(sc, sparkSession, scalingFactor, size)
+  }
+
+  def run(sc: SparkContext, session: SparkSession,
+    scalingFactor: Long, size: Int) = {
+    import session.implicits._
+    val inputRDD = GenerateScalingData.generateFullGoldilocks(
+      sc, scalingFactor, size)
+    val pairRDD = inputRDD.map(p => (p.zip.toInt, p.attributes(0)))
+    pairRDD.cache()
+    pairRDD.count()
+    val rddTimings = 1.to(10).map(x => time(testOnRDD(pairRDD)))
+    val groupTimings = 1.to(10).map(x => time(groupOnRDD(pairRDD)))
+    val df = inputRDD.toDF()
+    val inputDataFrame = df.select(
+      df("zip").cast(IntegerType),
+      df("attributes")(0).as("fuzzyness").cast(DoubleType))
+    inputDataFrame.cache()
+    inputDataFrame.count()
+    val dataFrameTimings = 1.to(10).map(x => time(testOnDataFrame(inputDataFrame)))
+    println(rddTimings.map(_._2).mkString(","))
+    println(groupTimings.map(_._2).mkString(","))
+    println(dataFrameTimings.map(_._2).mkString(","))
+  }
+
+  def testOnRDD(rdd: RDD[(Int, Double)]) = {
+    rdd.map{case (x, y) => (x, (y, 1))}
+      .reduceByKey{case (x, y) => (x._1 + y._1, x._2 + y._2)}.count()
+  }
+
+  def groupOnRDD(rdd: RDD[(Int, Double)]) = {
+    rdd.groupByKey().mapValues{v =>
+      v.aggregate((0.0, 0))({case (x, y) => (x._1 + y, x._2 + 1)},
+        {case (x, y) => (x._1 + y._1, x._2 + y._2)})}.count()
+  }
+
+  def testOnDataFrame(df: DataFrame) = {
+    df.groupBy("zip").avg("fuzzyness").count()
+  }
+
+  def time[R](block: => R): (R, Long) = {
+    val t0 = System.nanoTime()
+    val result = block // call-by-name
+    val t1 = System.nanoTime()
+    println(s"Time ${t1 - t0}ns")
+    (result, t1 - t0)
+  }
+}
diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala b/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala
new file mode 100644
index 0000000..2fa173c
--- /dev/null
+++ b/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala
@@ -0,0 +1,85 @@
+/**
+ * Streaming Pandas Example with the old DStream APIs.
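+ * DStreams are Spark's original streaming API; a Structured Streaming take on
+ * the same ideas lives in streaming/Structured.scala.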
+ */ +package com.highperformancespark.examples.streaming + +import scala.reflect.ClassTag + +import org.apache.hadoop.io.{LongWritable, Text} +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat + +import org.apache.spark._ +import org.apache.spark.rdd.RDD + +//tag::DStreamImports[] +import org.apache.spark.streaming._ +import org.apache.spark.streaming.dstream._ +//end::DStreamImports[] + +object DStreamExamples { + def makeStreamingContext(sc: SparkContext) = { + //tag::ssc[] + val batchInterval = Seconds(1) + new StreamingContext(sc, batchInterval) + //end::ssc[] + } + + def makeRecoverableStreamingContext(sc: SparkContext, checkpointDir: String) = { + //tag::sscRecover[] + def createStreamingContext(): StreamingContext = { + val batchInterval = Seconds(1) + val ssc = new StreamingContext(sc, batchInterval) + ssc.checkpoint(checkpointDir) + // Then create whatever stream is required + // And whatever mappings need to go on those streams + ssc + } + val ssc = StreamingContext.getOrCreate(checkpointDir, + createStreamingContext _) + // Do whatever work needs to be done regardless of state + // Start context and run + ssc.start() + //end::sscRecover[] + } + + def fileAPIExample(ssc: StreamingContext, path: String): + DStream[(Long, String)] = { + //tag::file[] + // You don't need to write the types of the InputDStream but it for illustration + val inputDStream: InputDStream[(LongWritable, Text)] = + ssc.fileStream[LongWritable, Text, TextInputFormat](path) + // Convert the hadoop types to native JVM types for simplicity + def convert(input: (LongWritable, Text)) = { + (input._1.get(), input._2.toString()) + } + val input: DStream[(Long, String)] = inputDStream.map(convert) + //end::file[] + input + } + + def repartition(dstream: DStream[_]) = { + //tag::repartition[] + dstream.repartition(20) + //end::repartition[] + } + + //tag::repartitionWithTransform[] + def dStreamRepartition[A: ClassTag](dstream: DStream[A]): DStream[A] = { + dstream.transform{rdd => rdd.repartition(20)} + } + //end::repartitionWithTransform[] + + def simpleTextOut(target: String, dstream: DStream[_]) = { + //tag::simpleOut[] + dstream.saveAsTextFiles(target) + //end::simpleOut[] + } + + def foreachSaveSequence(target: String, dstream: DStream[(Long, String)]) = { + //tag::foreachSave[] + dstream.foreachRDD{(rdd, window) => + rdd.saveAsSequenceFile(target + window) + } + //end::foreachSave[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala b/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala new file mode 100644 index 0000000..f773a2e --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala @@ -0,0 +1,28 @@ +package com.highperformancespark.examples.structuredstreaming + +import scala.concurrent.duration._ + +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.streaming._ + + +object Structured { + def load(inputPath: String, session: SparkSession): Dataset[_] = { + //tag::loadSimple[] + session.readStream.parquet(inputPath) + //end::loadSimple[] + } + def write(counts: Dataset[_]) = { + //tag::writeComplete[] + val query = counts.writeStream. + // Specify the output mode as Complete to support aggregations + outputMode(OutputMode.Complete()). + // Write out the result as parquet + format("parquet"). + // Specify the interval at which new data will be picked up + trigger(ProcessingTime(1.second)). 
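+      // Name the query so it can be identified via the StreamingQueryManager
+      // and in query progress logs, then start it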
+ queryName("pandas").start() + //end::writeComplete[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala b/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala new file mode 100644 index 0000000..b0806c8 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala @@ -0,0 +1,21 @@ +package com.highperformancespark.examples.tokenize + +import org.apache.spark.rdd.RDD + +object SampleTokenize { + //tag::DIFFICULT[] + def difficultTokenizeRDD(input: RDD[String]) = { + input.flatMap(_.split(" ")) + } + //end::DIFFICULT[] + + //tag::EASY[] + def tokenizeRDD(input: RDD[String]) = { + input.flatMap(tokenize) + } + + protected[tokenize] def tokenize(input: String) = { + input.split(" ") + } + //end::EASY[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala new file mode 100644 index 0000000..02287ae --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala @@ -0,0 +1,41 @@ +package com.highperformancespark.examples.tools + +import scala.collection.immutable.HashSet + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd.RDD + +//tag::loggerImport[] +import com.typesafe.scalalogging.LazyLogging +//end::loggerImport[] + +object FilterInvalidPandas extends LazyLogging { + + def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], + input: RDD[RawPanda]) = { + //tag::broadcast[] + val invalid = HashSet() ++ invalidPandas + val invalidBroadcast = sc.broadcast(invalid) + input.filter{panda => !invalidBroadcast.value.contains(panda.id)} + //end::broadcast[] + } + + def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], + input: RDD[RawPanda]) = { + //tag::broadcastAndLog[] + val invalid = HashSet() ++ invalidPandas + val invalidBroadcast = sc.broadcast(invalid) + def keepPanda(pandaId: Long) = { + if (invalidBroadcast.value.contains(pandaId)) { + logger.debug(s"Invalid panda ${pandaId} discovered") + false + } else { + true + } + } + input.filter{panda => keepPanda(panda.id)} + //end::broadcastAndLog[] + } +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala b/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala new file mode 100644 index 0000000..da4fd38 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala @@ -0,0 +1,86 @@ +package com.highperformancespark.examples.tools + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.mllib.random.RandomRDDs +import org.apache.spark.mllib.linalg.Vector + +object GenerateScalingData { + /** + * Generate a Goldilocks data set. We expect the zip code to follow an exponential + * distribution and the data its self to be normal + * + * Note: May generate less than number of requested rows due to different + * distribution between + * + * partitions and zip being computed per partition. 
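+   * @param sc Spark context used to generate the random zip, key, and value RDDs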
+ * @param rows number of rows in the RDD (approximate) + * @param size number of value elements + */ + def generateFullGoldilocks(sc: SparkContext, rows: Long, numCols: Int): + RDD[RawPanda] = { + val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows) + .map(_.toInt.toString) + val valuesRDD = RandomRDDs.normalVectorRDD( + sc, numRows = rows, numCols = numCols) + .repartition(zipRDD.partitions.size) + val keyRDD = sc.parallelize(1L.to(rows), zipRDD.getNumPartitions) + keyRDD.zipPartitions(zipRDD, valuesRDD){ + (i1, i2, i3) => + new Iterator[(Long, String, Vector)] { + def hasNext: Boolean = (i1.hasNext, i2.hasNext, i3.hasNext) match { + case (true, true, true) => true + case (false, false, false) => false + // Note: this is "unsafe" (we throw away data when one of + // the partitions has run out). + case _ => false + } + def next(): (Long, String, Vector) = (i1.next(), i2.next(), i3.next()) + } + }.map{case (k, z, v) => + RawPanda(k, z, "giant", v(0) > 0.5, v.toArray)} + } + + /** + * Transform it down to just the data used for the benchmark + */ + def generateMiniScale(sc: SparkContext, rows: Long, numCols: Int): + RDD[(Int, Double)] = { + generateFullGoldilocks(sc, rows, numCols) + .map(p => (p.zip.toInt, p.attributes(0))) + } + + /** + * Transform it down to just the data used for the benchmark + */ + def generateMiniScaleRows(sc: SparkContext, rows: Long, numCols: Int): + RDD[Row] = { + generateMiniScale(sc, rows, numCols).map{case (zip, fuzzy) => Row(zip, fuzzy)} + } + + // tag::MAGIC_PANDA[] + /** + * Generate a Goldilocks data set all with the same id. + * We expect the zip code to follow an exponential + * distribution and the data its self to be normal. + * Simplified to avoid a 3-way zip. + * + * Note: May generate less than number of requested rows due to + * different distribution between partitions and zip being computed + * per partition. + */ + def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int): + RDD[RawPanda] = { + val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows) + .map(_.toInt.toString) + val valuesRDD = RandomRDDs.normalVectorRDD( + sc, numRows = rows, numCols = numCols) + zipRDD.zip(valuesRDD).map{case (z, v) => + RawPanda(1, z, "giant", v(0) > 0.5, v.toArray) + } + } + // end::MAGIC_PANDA[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala b/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala new file mode 100644 index 0000000..298a7c3 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala @@ -0,0 +1,68 @@ +import scala.util.Random +import scala.reflect.{ClassTag} + +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD + +/** + * Sample our production data to be able to use it for tests + */ +object SampleData { + /** + * Sample the input down to k % for usage in tests + */ + def sampleInput[T](rdd: RDD[T]): RDD[T] = { + // tag::randomSampleInput[] + rdd.sample(withReplacement=false, fraction=0.1) + // end::randomSampleInput[] + } + + /** + * Construct a stratified sample + */ + def stratifiedSample(rdd: RDD[(String, Array[Double])]): + RDD[(String, Array[Double])] = { + // tag::stratifiedSample[] + // 5% of the red pandas, and 50% of the giant pandas + val stratas = Map("red" -> 0.05, "giant" -> 0.50) + rdd.sampleByKey(withReplacement=false, fractions = stratas) + // end::stratifiedSample[] + } + + /** + * Custom random sample with RNG. This is intended as an example of how + * to save setup overhead. 
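+   * (A new Random is created for every element here, which is exactly the
+   * per-record setup cost that the mapPartitions version below avoids.)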
+ */ + def slowSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { + rdd.flatMap{x => val r = new Random() + if (r.nextInt(10) == 0) { + Some(x) + } else { + None + }} + } + + /** + * Custom random sample with RNG. This is intended as an example of how to + * save setup overhead. + */ + def customSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { + // tag::mapPartitions[] + rdd.mapPartitions{itr => + // Only create once RNG per partitions + val r = new Random() + itr.filter(x => r.nextInt(10) == 0) + } + // end::mapPartitions[] + } + + // tag::broadcast[] + class LazyPrng { + @transient lazy val r = new Random() + } + def customSampleBroadcast[T: ClassTag](sc: SparkContext, rdd: RDD[T]): RDD[T]= { + val bcastprng = sc.broadcast(new LazyPrng()) + rdd.filter(x => bcastprng.value.r.nextInt(10) == 0) + } + // end::broadcast[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala new file mode 100644 index 0000000..bddc84b --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -0,0 +1,79 @@ +/** + * Illustrates how to use Spark accumulators. Note that most of these examples + * are "dangerous" in that they may not return consistent results. + */ +package com.highperformancespark.examples.transformations + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +import org.apache.spark.rdd._ + +import scala.collection.mutable.HashSet +object Accumulators { + /** + * Compute the total fuzzyness with an accumulator while generating + * an id and zip pair for sorting. + */ + //tag::sumFuzzyAcc[] + def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): + (RDD[(String, Long)], Double) = { + // Create an accumulator with the initial value of 0.0 + val acc = sc.accumulator(0.0) + val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} + // accumulator still has zero value + // Note: This example is dangerous since the transformation may be + // evaluated multiple times. + transformed.count() // force evaluation + (transformed, acc.value) + } + //end::sumFuzzyAcc[] + + /** + * Compute the max fuzzyness with an accumulator while generating an + * id and zip pair for sorting. + */ + //tag::maxFuzzyAcc[] + def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): + (RDD[(String, Long)], Double) = { + object MaxDoubleParam extends AccumulatorParam[Double] { + override def zero(initValue: Double) = initValue + override def addInPlace(r1: Double, r2: Double): Double = { + Math.max(r1, r2) + } + } + // Create an accumulator with the initial value of Double.MinValue + val acc = sc.accumulator(Double.MinValue)(MaxDoubleParam) + val transformed = rdd.map{x => acc += x.attributes(0); (x.zip, x.id)} + // accumulator still has Double.MinValue + // Note: This example is dangerous since the transformation may be + // evaluated multiple times. 
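+    // (AccumulatorParam is the pre-2.0 accumulator API and is deprecated in
+    // Spark 2.x; NewAccumulators.scala shows the AccumulatorV2 equivalent.)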
+ transformed.count() // force evaluation + (transformed, acc.value) + } + //end::maxFuzzyAcc[] + + //tag::uniquePandaAcc[] + def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { + object UniqParam extends AccumulableParam[HashSet[Long], Long] { + override def zero(initValue: HashSet[Long]) = initValue + // For adding new values + override def addAccumulator(r: HashSet[Long], t: Long): HashSet[Long] = { + r += t + r + } + // For merging accumulators + override def addInPlace(r1: HashSet[Long], r2: HashSet[Long]): + HashSet[Long] = { + r1 ++ r2 + } + } + // Create an accumulator with the initial value of Double.MinValue + val acc = sc.accumulable(new HashSet[Long]())(UniqParam) + val transformed = rdd.map{x => acc += x.id; (x.zip, x.id)} + // accumulator still has Double.MinValue + transformed.count() // force evaluation + acc.value + } + //end::uniquePandaAcc[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala b/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala new file mode 100644 index 0000000..4670bb6 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala @@ -0,0 +1,38 @@ +package com.highperformancespark.examples.transformations + +import org.apache.spark.rdd.RDD + + +object NarrowAndWide { + + //toDO: Probably should write some sort of test for this. + //this is used in chapter 4 for the stage diagram + def sillySparkProgram(rdd1 : RDD[Int]) = { + + //tag::narrowWide[] + + //Narrow dependency. Map the rdd to tuples of (x, 1) + val rdd2 = rdd1.map(x => (x, 1)) + //wide dependency groupByKey + val rdd3 = rdd2.groupByKey() + //end::narrowWide[] + + rdd3 + } + //this is used in chapter two for the stage diagram. + + //tag::stageDiagram[] + def simpleSparkProgram(rdd : RDD[Double]): Long ={ + //stage1 + rdd.filter(_< 1000.0) + .map(x => (x, x) ) + //stage2 + .groupByKey() + .map{ case(value, groups) => (groups.sum, value)} + //stage 3 + .sortByKey() + .count() + } + //end::stageDiagram[] + +} diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala b/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala new file mode 100644 index 0000000..948df49 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/transformations/NewAccumulators.scala @@ -0,0 +1,156 @@ +/** + * Illustrates how to use Spark accumulators with the "new" V2 APIs. + * + * Note that most of these examples are "dangerous" in that they may + * not return consistent results. + */ +package com.highperformancespark.examples.transformations + +import com.highperformancespark.examples.dataframe.RawPanda + +import org.apache.spark._ +//tag::import[] +import org.apache.spark.util.AccumulatorV2 +//end::import[] +import org.apache.spark.rdd._ + +import scala.collection.mutable.HashSet +object NewAccumulators { + /** + * Compute the total fuzzyness with an accumulator while generating + * an id and zip pair for sorting. + */ + //tag::sumFuzzyAcc[] + def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): + (RDD[(String, Long)], Double) = { + // Create an named accumulator for doubles + val acc = sc.doubleAccumulator("fuzzyNess") + val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} + // accumulator still has zero value + // Note: This example is dangerous since the transformation may be + // evaluated multiple times. 
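+    // A sketch of a safer pattern when only the total is needed (assuming the
+    // (zip, id) pairs are not required downstream): do the accumulator update
+    // inside an action, since updates made in actions are only applied once
+    // per successful task, e.g.
+    //   rdd.foreach(x => acc.add(x.attributes(0)))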
+ transformed.count() // force evaluation + (transformed, acc.value) + } + //end::sumFuzzyAcc[] + + /** + * Compute the max fuzzyness with an accumulator while generating + * an id and zip pair for sorting. + */ + //tag::maxFuzzyAcc[] + def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): + (RDD[(String, Long)], Option[Double]) = { + class MaxDoubleAccumulator extends AccumulatorV2[Double, Option[Double]] { + // Here is the var we will accumulate our value in to. + var currentVal: Option[Double] = None + override def isZero = currentVal.isEmpty + + // Reset the current accumulator to zero - used when sending over the wire + // to the workers. + override def reset() = { + currentVal = None + } + + // Copy the current accumulator - this is only realy used in context of + // copy and reset - but since its part of the public API lets be safe. + def copy() = { + val newCopy = new MaxDoubleAccumulator() + newCopy.currentVal = currentVal + newCopy + } + + // We override copy and reset for "speed" - no need to copy the value if + // we care going to zero it right away. This doesn't make much difference + // for Option[Double] but for something like Array[X] could be huge. + + override def copyAndReset() = { + new MaxDoubleAccumulator() + } + + // Add a new value (called on the worker side) + override def add(value: Double) = { + currentVal = Some( + // If the value is present compare it to the new value - otherwise + // just store the new value as the current max. + currentVal.map(acc => Math.max(acc, value)).getOrElse(value)) + } + + override def merge(other: AccumulatorV2[Double, Option[Double]]) = { + other match { + case otherFuzzy: MaxDoubleAccumulator => + // If the other accumulator has the option set merge it in with + // the standard add procedure. If the other accumulator isn't set + // do nothing. + otherFuzzy.currentVal.foreach(value => add(value)) + case _ => + // This should never happen, Spark will only call merge with + // the correct type - but that won't stop someone else from calling + // merge so throw an exception just in case. + throw new Exception("Unexpected merge with unsupported type" + other) + } + } + // Return the accumulated value. + override def value = currentVal + } + // Create a new custom accumulator + val acc = new MaxDoubleAccumulator() + sc.register(acc) + val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} + // accumulator still has None value. + // Note: This example is dangerous since the transformation may be + // evaluated multiple times. + transformed.count() // force evaluation + (transformed, acc.value) + } + //end::maxFuzzyAcc[] + + //tag::uniquePandaAcc[] + def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { + class UniqParam extends AccumulatorV2[Long, HashSet[Long]] { + var accValue: HashSet[Long] = new HashSet[Long]() + + def value = accValue + + override def copy() = { + val newCopy = new UniqParam() + newCopy.accValue = accValue.clone + newCopy + } + override def reset() = { + this.accValue = new HashSet[Long]() + } + override def isZero() = { + accValue.isEmpty + } + + // We override copy and reset for speed - no need to copy the value if + // we care going to zero it right away. 
override def copyAndReset() = {
+        new UniqParam()
+      }
+      // For adding new values
+      override def add(value: Long) = {
+        accValue += value
+      }
+      // For merging accumulators
+      override def merge(other: AccumulatorV2[Long, HashSet[Long]]) = {
+        other match {
+          case otherUniq: UniqParam =>
+            accValue = accValue ++ otherUniq.accValue
+          case _ =>
+            throw new Exception("only support merging with same type")
+        }
+      }
+    }
+    // Create an accumulator for keeping track of unique values
+    val acc = new UniqParam()
+    // Register with a name
+    sc.register(acc, "Unique values")
+    val transformed = rdd.map{x => acc.add(x.id); (x.zip, x.id)}
+    // accumulator still holds an empty set at this point
+    transformed.count() // force evaluation
+    acc.value
+  }
+  //end::uniquePandaAcc[]
+}
diff --git a/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala b/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala
new file mode 100644
index 0000000..2fc3aec
--- /dev/null
+++ b/src/main/scala/com/high-performance-spark-examples/transformations/SmartAggregations.scala
@@ -0,0 +1,224 @@
+package com.highperformancespark.examples.transformations
+
+import org.apache.spark.rdd.RDD
+class SmartAggregations {
+
+  //tag::naiveAggregation[]
+  /**
+   * Given an RDD of (PandaInstructor, ReportCardText), aggregate by instructor
+   * to an RDD of distinct keys of (PandaInstructor, ReportCardMetrics),
+   * where ReportCardMetrics is a case class with
+   *
+   * longestWord -> The longest word in all of the reports written by this instructor
+   * happyMentions -> The number of times this instructor mentioned the word happy
+   * averageWords -> The average number of words per report card for this instructor
+   */
+  def calculateReportCardStatistics(rdd : RDD[(String, String)]
+  ): RDD[(String, ReportCardMetrics)] ={
+
+    rdd.aggregateByKey(new MetricsCalculator(totalWords = 0,
+      longestWord = 0, happyMentions = 0, numberReportCards = 0))(
+      seqOp = ((reportCardMetrics, reportCardText) =>
+        reportCardMetrics.sequenceOp(reportCardText)),
+      combOp = (x, y) => x.compOp(y))
+      .mapValues(_.toReportCardMetrics)
+  }
+  //end::naiveAggregation[]
+
+
+  /**
+   * Same as above, but rather than using the 'MetricsCalculator' class for
+   * computing the aggregation functions, we use a modified implementation
+   * called 'MetricsCalculatorReuseObjects' which modifies the original
+   * accumulator and returns it for both the sequence op and the combine op.
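+   * The aggregation logic itself is unchanged; only the per-record object
+   * allocation pattern differs.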
+ * + * @param rdd + * @return + */ + def calculateReportCardStatisticsReuseObjects(rdd : RDD[(String, String)] + ): RDD[(String, ReportCardMetrics)] ={ + + rdd.aggregateByKey(new MetricsCalculatorReuseObjects(totalWords = 0, + longestWord = 0, happyMentions = 0, numberReportCards = 0))( + seqOp = (reportCardMetrics, reportCardText) => + reportCardMetrics.sequenceOp(reportCardText), + combOp = (x, y) => x.compOp(y)) + .mapValues(_.toReportCardMetrics) + } + + //tag::goodAggregation[] + def calculateReportCardStatisticsWithArrays(rdd : RDD[(String, String)] + ): RDD[(String, ReportCardMetrics)] = { + + rdd.aggregateByKey( + //the zero value is a four element array of zeros + Array.fill[Int](4)(0) + )( + //seqOp adds the relevant values to the array + seqOp = (reportCardMetrics, reportCardText) => + MetricsCalculator_Arrays.sequenceOp(reportCardMetrics, reportCardText), + //combo defines how the arrays should be combined + combOp = (x, y) => MetricsCalculator_Arrays.compOp(x, y)) + .mapValues(MetricsCalculator_Arrays.toReportCardMetrics) + } + //end::goodAggregation[] + +} +//tag::caseClass[] +case class ReportCardMetrics( + longestWord : Int, + happyMentions : Int, + averageWords : Double) +//end::caseClass[] + + +//tag::firstCalculator[] + class MetricsCalculator( + val totalWords : Int, + val longestWord: Int, + val happyMentions : Int, + val numberReportCards: Int) extends Serializable { + + def sequenceOp(reportCardContent : String) : MetricsCalculator = { + val words = reportCardContent.split(" ") + val tW = words.length + val lW = words.map( w => w.length).max + val hM = words.count(w => w.toLowerCase.equals("happy")) + + new MetricsCalculator( + tW + totalWords, + Math.max(longestWord, lW), + hM + happyMentions, + numberReportCards + 1) + } + + def compOp(other : MetricsCalculator) : MetricsCalculator = { + new MetricsCalculator( + this.totalWords + other.totalWords, + Math.max(this.longestWord, other.longestWord), + this.happyMentions + other.happyMentions, + this.numberReportCards + other.numberReportCards) + } + + def toReportCardMetrics = + ReportCardMetrics( + longestWord, + happyMentions, + totalWords.toDouble/numberReportCards) +} +//end::firstCalculator[] + +//tag::calculator_reuse[] +class MetricsCalculatorReuseObjects( + var totalWords : Int, + var longestWord: Int, + var happyMentions : Int, + var numberReportCards: Int) extends Serializable { + + def sequenceOp(reportCardContent : String) : this.type = { + val words = reportCardContent.split(" ") + totalWords += words.length + longestWord = Math.max(longestWord, words.map( w => w.length).max) + happyMentions += words.count(w => w.toLowerCase.equals("happy")) + numberReportCards +=1 + this + } + + def compOp(other : MetricsCalculatorReuseObjects) : this.type = { + totalWords += other.totalWords + longestWord = Math.max(this.longestWord, other.longestWord) + happyMentions += other.happyMentions + numberReportCards += other.numberReportCards + this + } + + def toReportCardMetrics = + ReportCardMetrics( + longestWord, + happyMentions, + totalWords.toDouble/numberReportCards) +} +//end::calculator_reuse[] + + +//tag::calculator_array[] +object MetricsCalculator_Arrays extends Serializable { + val totalWordIndex = 0 + val longestWordIndex = 1 + val happyMentionsIndex = 2 + val numberReportCardsIndex = 3 + + def sequenceOp(reportCardMetrics : Array[Int], + reportCardContent : String) : Array[Int] = { + + val words = reportCardContent.split(" ") + //modify each of the elements in the array + reportCardMetrics(totalWordIndex) += 
words.length + reportCardMetrics(longestWordIndex) = Math.max( + reportCardMetrics(longestWordIndex), + words.map(w => w.length).max) + reportCardMetrics(happyMentionsIndex) += words.count( + w => w.toLowerCase.equals("happy")) + reportCardMetrics(numberReportCardsIndex) +=1 + reportCardMetrics + } + + def compOp(x : Array[Int], y : Array[Int]) : Array[Int] = { + //combine the first and second arrays by modifying the elements + // in the first array + x(totalWordIndex) += y(totalWordIndex) + x(longestWordIndex) = Math.max(x(longestWordIndex), y(longestWordIndex)) + x(happyMentionsIndex) += y(happyMentionsIndex) + x(numberReportCardsIndex) += y(numberReportCardsIndex) + x + } + + def toReportCardMetrics(ar : Array[Int]) : ReportCardMetrics = + ReportCardMetrics( + ar(longestWordIndex), + ar(happyMentionsIndex), + ar(totalWordIndex)/ar(numberReportCardsIndex) + ) +} +//end::calculator_array[] + + +object CollectionRoutines{ + + //tag::implicitExample[] + def findWordMetrics[T <:Seq[String]](collection : T ): (Int, Int)={ + val iterator = collection.toIterator + var mentionsOfHappy = 0 + var longestWordSoFar = 0 + while(iterator.hasNext){ + val n = iterator.next() + if(n.toLowerCase == "happy"){ + mentionsOfHappy +=1 + } + val length = n.length + if(length> longestWordSoFar) { + longestWordSoFar = length + } + + } + (longestWordSoFar, mentionsOfHappy) + } + //end::implicitExample[] + + + //tag::fasterSeqOp[] + val totalWordIndex = 0 + val longestWordIndex = 1 + val happyMentionsIndex = 2 + val numberReportCardsIndex = 3 + def fasterSeqOp(reportCardMetrics : Array[Int], content : String): Array[Int] = { + val words: Seq[String] = content.split(" ") + val (longestWord, happyMentions) = CollectionRoutines.findWordMetrics(words) + reportCardMetrics(totalWordIndex) += words.length + reportCardMetrics(longestWordIndex) = longestWord + reportCardMetrics(happyMentionsIndex) += happyMentions + reportCardMetrics(numberReportCardsIndex) +=1 + reportCardMetrics + } + //end::fasterSeqOp[] +} diff --git a/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala b/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala new file mode 100644 index 0000000..3e6ee36 --- /dev/null +++ b/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala @@ -0,0 +1,44 @@ +package com.highperformancespark.examples.wordcount + +/** + * What sort of big data book would this be if we didn't mention wordcount? + */ +import org.apache.spark.rdd._ + +object WordCount { + // bad idea: uses group by key + def badIdea(rdd: RDD[String]): RDD[(String, Int)] = { + val words = rdd.flatMap(_.split(" ")) + val wordPairs = words.map((_, 1)) + val grouped = wordPairs.groupByKey() + val wordCounts = grouped.mapValues(_.sum) + wordCounts + } + + // good idea: doesn't use group by key + //tag::simpleWordCount[] + def simpleWordCount(rdd: RDD[String]): RDD[(String, Int)] = { + val words = rdd.flatMap(_.split(" ")) + val wordPairs = words.map((_, 1)) + val wordCounts = wordPairs.reduceByKey(_ + _) + wordCounts + } + //end::simpleWordCount[] + + /** + * Come up with word counts but filter out the illegal tokens and stop words + */ + //tag::wordCountStopwords[] + def withStopWordsFiltered(rdd : RDD[String], illegalTokens : Array[Char], + stopWords : Set[String]): RDD[(String, Int)] = { + val separators = illegalTokens ++ Array[Char](' ') + val tokens: RDD[String] = rdd.flatMap(_.split(separators). 
+ map(_.trim.toLowerCase)) + val words = tokens.filter(token => + !stopWords.contains(token) && (token.length > 0) ) + val wordPairs = words.map((_, 1)) + val wordCounts = wordPairs.reduceByKey(_ + _) + wordCounts + } + //end::wordCountStopwords[] +} diff --git a/src/test/java/com/highperformancespark/examples/JavaInteropTest.java b/src/test/java/com/highperformancespark/examples/JavaInteropTest.java new file mode 100644 index 0000000..66318f7 --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/JavaInteropTest.java @@ -0,0 +1,43 @@ +package com.highperformancespark.examples; + +import com.holdenkarau.spark.testing.SharedJavaSparkContext; + +import scala.Tuple2; + +import org.apache.spark.rdd.RDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class JavaInteropTest extends SharedJavaSparkContext { + + @Test + public void wrapPairRDDTest() { + JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); + JavaInterop ji = new JavaInterop(); + RDD> rdd = helper.generateMiniPairRDD(); + JavaPairRDD prdd = ji.wrapPairRDD(rdd); + List> expected = Arrays.asList(new Tuple2("panda", 12L)); + assertEquals(expected, prdd.collect()); + } + + @Test + public void wrapPairRDDFakeCtTest() { + JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); + JavaInterop ji = new JavaInterop(); + RDD> rdd = helper.generateMiniPairRDD(); + JavaPairRDD prdd = ji.wrapPairRDDFakeCt(rdd); + List> expected = Arrays.asList(new Tuple2("panda", 12L)); + assertEquals(expected, prdd.collect()); + } +} diff --git a/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java new file mode 100644 index 0000000..d6bec37 --- /dev/null +++ b/src/test/java/com/highperformancespark/examples/dataframe/JavaHappyPandasTest.java @@ -0,0 +1,151 @@ +package com.highperformancespark.examples.dataframe; + +import com.highperformancespark.examples.objects.JavaPandaInfo; +import com.highperformancespark.examples.objects.JavaPandas; +import com.highperformancespark.examples.objects.JavaRawPanda; +import com.holdenkarau.spark.testing.JavaDataFrameSuiteBase; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.*; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +public class JavaHappyPandasTest extends JavaDataFrameSuiteBase { + String toronto = "toronto"; + String sandiego = "san diego"; + String virginia = "virginia"; + + List pandaInfoList = Arrays.asList( + new JavaPandaInfo(toronto, "giant", 1, 2), + new JavaPandaInfo(sandiego, "red", 2, 3), + new JavaPandaInfo(virginia, "black", 1, 10) + ); + + List rawPandaList = Arrays.asList( + new JavaRawPanda(10L, "94110", "giant", true, Arrays.asList(1.0, 0.9)), + new JavaRawPanda(11L, "94110", "red", true, Arrays.asList(1.0, 0.9))); + + List pandasList = Arrays.asList( + new JavaPandas("bata", "10010", 10, 2), + new JavaPandas("wiza", "10010", 20, 4), + new JavaPandas("dabdob", "11000", 8, 2), + new JavaPandas("hanafy", "11000", 15, 7), + new 
JavaPandas("hamdi", "11111", 20, 10) + ); + + @Test + public void simpleSelfJoinTest() { + Dataset inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + Dataset result = JavaHappyPandas.selfJoin(inputDF).select("a.name", "b.name"); + List resultList = result.collectAsList(); + + resultList.stream().forEach(row -> assertEquals(row.getString(0), row.getString(1))); + } + + @Test + public void verifyhappyPandasPercentage() { + List expectedList = Arrays.asList(RowFactory.create(toronto, 0.5), + RowFactory.create(sandiego, 2 / 3.0), RowFactory.create(virginia, 1/10.0)); + Dataset expectedDF = sqlContext().createDataFrame( + expectedList, new StructType( + new StructField[]{ + new StructField("place", DataTypes.StringType, true, Metadata.empty()), + new StructField("percentHappy", DataTypes.DoubleType, true, Metadata.empty()) + })); + + Dataset inputDF = sqlContext().createDataFrame(pandaInfoList, JavaPandaInfo.class); + Dataset resultDF = JavaHappyPandas.happyPandasPercentage(inputDF); + + assertDataFrameApproximateEquals(expectedDF, resultDF, 1E-5); + } + + @Test + public void encodePandaType() { + Dataset inputDF = sqlContext().createDataFrame(rawPandaList, JavaRawPanda.class); + Dataset resultDF = JavaHappyPandas.encodePandaType(inputDF); + + List expectedRows = Arrays.asList(RowFactory.create(10L, 0), RowFactory.create(11L, 1)); + Dataset expectedDF = sqlContext().createDataFrame(expectedRows, new StructType(new StructField[]{ + new StructField("id", DataTypes.LongType, false, Metadata.empty()), + new StructField("encodedType", DataTypes.IntegerType, false, Metadata.empty()) + })); + + assertDataFrameEquals(expectedDF, resultDF); + } + + @Test + public void happyPandasPlaces() { + Dataset inputDF = sqlContext().createDataFrame(pandaInfoList, JavaPandaInfo.class); + Dataset resultDF = JavaHappyPandas.happyPandasPlaces(inputDF); + + List expectedRows = Arrays.asList( + new JavaPandaInfo(toronto, "giant", 1, 2), + new JavaPandaInfo(sandiego, "red", 2, 3)); + Dataset expectedDF = sqlContext().createDataFrame(expectedRows, JavaPandaInfo.class); + + assertDataFrameEquals(expectedDF, resultDF); + } + + @Test + public void maxPandaSizePerZip() { + Dataset inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + Dataset resultDF = JavaHappyPandas.maxPandaSizePerZip(inputDF); + + List expectedRows = Arrays.asList( + RowFactory.create(pandasList.get(1).getZip(), pandasList.get(1).getPandaSize()), + RowFactory.create(pandasList.get(3).getZip(), pandasList.get(3).getPandaSize()), + RowFactory.create(pandasList.get(4).getZip(), pandasList.get(4).getPandaSize()) + ); + Dataset expectedDF = sqlContext().createDataFrame(expectedRows, + new StructType( + new StructField[]{ + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("max(pandaSize)", DataTypes.IntegerType, true, Metadata.empty()) + } + )); + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")); + } + + @Test + public void complexAggPerZip() { + Dataset inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + Dataset resultDF = JavaHappyPandas.minMeanSizePerZip(inputDF); + + List expectedRows = Arrays.asList( + RowFactory.create(pandasList.get(1).getZip(), pandasList.get(0).getPandaSize(), 15.0), + RowFactory.create(pandasList.get(3).getZip(), pandasList.get(2).getPandaSize(), 11.5), + RowFactory.create(pandasList.get(4).getZip(), pandasList.get(4).getPandaSize(), 20.0)); + + Dataset expectedDF = sqlContext().createDataFrame(expectedRows, + 
new StructType( + new StructField[]{ + new StructField("zip", DataTypes.StringType, true, Metadata.empty()), + new StructField("min(pandaSize)", DataTypes.IntegerType, true, Metadata.empty()), + new StructField("avg(pandaSize)", DataTypes.DoubleType, true, Metadata.empty()) + } + )); + + assertDataFrameApproximateEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip"), 1E-5); + } + + @Test + public void simpleSQLExample() { + Dataset inputDF = sqlContext().createDataFrame(pandasList, JavaPandas.class); + Dataset resultDF = JavaHappyPandas.simpleSqlExample(inputDF); + + List expectedList = Arrays.asList( + pandasList.get(0), pandasList.get(2) + ); + Dataset expectedDF = sqlContext().createDataFrame(expectedList, JavaPandas.class); + + assertDataFrameEquals(expectedDF, resultDF); + } + +} diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala new file mode 100644 index 0000000..3fb10a5 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala @@ -0,0 +1,306 @@ +/** + * Happy Panda Example for DataFrames. + * Computes the % of happy pandas. Very contrived. + */ +package com.highperformancespark.examples.dataframe + +import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} +import com.holdenkarau.spark.testing._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.scalatest.Matchers._ +import org.scalatest.FunSuite + +import scala.collection.mutable +import scala.util.Random + +class HappyPandasTest extends FunSuite with DataFrameSuiteBase { + val toronto = "toronto" + val sandiego = "san diego" + val virginia = "virginia" + val pandaInfoList = List( + PandaInfo(toronto, "giant", 1, 2), + PandaInfo(sandiego, "red", 2, 3), + PandaInfo(virginia, "black", 1, 10)) + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.9))) + + val pandasList = List(Pandas("bata", "10010", 10, 2), + Pandas("wiza", "10010", 20, 4), + Pandas("dabdob", "11000", 8, 2), + Pandas("hanafy", "11000", 15, 7), + Pandas("hamdi", "11111", 20, 10)) + + val pandaPlaces = List(PandaPlace("toronto", rawPandaList.toArray)) + + test("simple self join test") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(pandasList) + val result = HappyPandas.selfJoin(inputDF).select($"a.name", $"b.name") + val rez = result.collect() + rez.foreach{x => assert(x(0) == x(1))} + } + + test("simple explode test") { + val inputDF = sqlContext.createDataFrame(pandaPlaces) + val pandaInfo = sqlContext.createDataFrame(rawPandaList) + val expectedDf = pandaInfo.select( + (pandaInfo("attributes")(0) / pandaInfo("attributes")(1)) + .as("squishyness")) + val result = HappyPandas.squishPandaFromPace(inputDF) + + assertDataFrameApproximateEquals(expectedDf, result, 1E-5) + } + + //tag::approxEqualDataFrames[] + + test("verify simple happy pandas Percentage") { + val expectedList = List(Row(toronto, 0.5), + Row(sandiego, 2/3.0), + Row(virginia, 1/10.0)) + val expectedDf = createDF(expectedList, ("place", StringType), + ("percentHappy", DoubleType)) + + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPercentage(inputDF) + + assertDataFrameApproximateEquals(expectedDf, resultDF, 1E-5) + } + //end::approxEqualDataFrames[] + + test("verify 
approx by hand") { + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPercentage(inputDF) + val resultRows = resultDF.collect() + + val expectedRows = List(Row(toronto, 0.5), + Row(sandiego, 2/3.0), + Row(virginia, 1/10.0)) + + //tag::approxEqualRow[] + assert(expectedRows.length === resultRows.length) + expectedRows.zip(resultRows).foreach{case (r1, r2) => + assert(r1(0) === r2(0)) + assert(r1.getDouble(1) === (r2.getDouble(1) +- 0.001)) + } + //end::approxEqualRow[] + } + + test("test encode Panda type") { + val inputDF = sqlContext.createDataFrame(rawPandaList) + val resultDF = HappyPandas.encodePandaType(inputDF) + + val expectedRows = List(Row(10L, 0), Row(11L, 1)) + val expectedDF = createDF3(expectedRows, ("id", LongType, false), + ("encodedType", IntegerType, false)) + + assertDataFrameEquals(expectedDF, resultDF) + } + + //tag::exactEqualDataFrames[] + test("verify exact equality") { + // test minHappyPandas + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val result = HappyPandas.minHappyPandas(inputDF, 2) + val resultRows = result.collect() + + val expectedRows = List(Row(sandiego, "red", 2, 3)) + assert(expectedRows === resultRows) + } + //end::exactEqualDataFrames[] + + test("test happyPandasPlaces") { + val inputDF = sqlContext.createDataFrame(pandaInfoList) + val resultDF = HappyPandas.happyPandasPlaces(inputDF) + + val expectedRows = List(PandaInfo(toronto, "giant", 1, 2), + PandaInfo(sandiego, "red", 2, 3)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + test("test maxPandaSizePerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.maxPandaSizePerZip(inputDF) + + val expectedRows = List(Row(pandasList(1).zip, pandasList(1).pandaSize), + Row(pandasList(3).zip, pandasList(3).pandaSize), + Row(pandasList(4).zip, pandasList(4).pandaSize)) + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("max(pandaSize)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test minMaxPandaSizePerZip"){ + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minMaxPandaSizePerZip(inputDF) + + val expectedRows = List( + Row(pandasList(1).zip, pandasList(0).pandaSize, pandasList(1).pandaSize), + Row(pandasList(3).zip, pandasList(2).pandaSize, pandasList(3).pandaSize), + Row(pandasList(4).zip, pandasList(4).pandaSize, pandasList(4).pandaSize)) + + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("max(pandaSize)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test minPandaSizeMaxAgePerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minPandaSizeMaxAgePerZip(inputDF) + + val expectedRows = List( + Row(pandasList(1).zip, pandasList(0).pandaSize, pandasList(1).age), + Row(pandasList(3).zip, pandasList(2).pandaSize, pandasList(3).age), + Row(pandasList(4).zip, pandasList(4).pandaSize, pandasList(4).age)) + + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("max(age)", IntegerType)) + + assertDataFrameEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip")) + } + + test("test complexAggPerZip") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.minMeanSizePerZip(inputDF) + + val expectedRows = List( + 
Row(pandasList(1).zip, pandasList(0).pandaSize, 15.0), + Row(pandasList(3).zip, pandasList(2).pandaSize, 11.5), + Row(pandasList(4).zip, pandasList(4).pandaSize, 20.0)) + + val expectedDF = createDF(expectedRows, ("zip", StringType), + ("min(pandaSize)", IntegerType), + ("avg(pandaSize)", DoubleType)) + + assertDataFrameApproximateEquals(expectedDF.orderBy("zip"), + resultDF.orderBy("zip"), 1e-5) + } + + + test("test Simple SQL example") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.simpleSqlExample(inputDF) + + val expectedRows = List(pandasList(0), pandasList(2)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + test("test Order Pandas") { + val inputDF = sqlContext.createDataFrame(pandasList) + val resultDF = HappyPandas.orderPandas(inputDF) + + val expectedRows = List(pandasList(2), pandasList(0), pandasList(3), + pandasList(4), pandasList(1)) + val expectedDF = sqlContext.createDataFrame(expectedRows) + + assertDataFrameEquals(expectedDF, resultDF) + } + + + test("test computeRelativePandaSizes") { + val inputPandaList = loadPandaStuffies() + val inputDF = sqlContext.createDataFrame(inputPandaList) + + val resultDF = HappyPandas.computeRelativePandaSizes(inputDF) + + val expectedDF = getExpectedPandasRelativeSize(inputPandaList, -10, 10) + + assertDataFrameApproximateEquals(expectedDF.orderBy("name"), + resultDF.orderBy("name"), 1e-5) + } + + private def getExpectedPandasRelativeSize(pandaList: List[Pandas], + start: Int, end: Int):DataFrame = { + + val expectedRows = + pandaList + .groupBy(_.zip) + .map(zipPandas => (zipPandas._1, zipPandas._2.sortBy(_.age))) + .flatMap(zipPandas => { + val pandas = zipPandas._2 + val length = pandas.size - 1 + val result = new mutable.MutableList[Row] + + for (i <- 0 to length) { + var totalSum = 0 + val startOffset = math.max(0, i + start) + val endOffset = math.min(length, i + end) + + for (j <- startOffset to endOffset) + totalSum += pandas(j).pandaSize + + val count = endOffset - startOffset + 1 + val average = totalSum.toDouble / count + + val panda = pandas(i) + result += Row(panda.name, + panda.zip, + panda.pandaSize, + panda.age, + panda.pandaSize - average) + } + + result + }).toList + + val expectedDF = createDF(expectedRows, ("name", StringType), + ("zip", StringType), + ("pandaSize", IntegerType), + ("age", IntegerType), + ("panda_relative_size", DoubleType)) + + expectedDF + } + + private def loadPandaStuffies(): List[Pandas] = { + val zipCount = 3 + val maxPandasPerZip = 15 + val maxPandaAge = 50 + val maxPandaSize = 500 + val random = new Random() + + val pandas = + (1 to zipCount) + .flatMap(zipId => { + val pandasCount = 1 + random.nextInt(maxPandasPerZip) + val zipName = s"zip($zipId)" + + (1 to pandasCount).map(pandaId => { + val name = s"panda($pandaId)($zipId)" + val size = 1 + random.nextInt(maxPandaSize) + val age = 1 + random.nextInt(maxPandaAge) + + Pandas(name, zipName, size, age) + } + ) + + }) + + pandas.toList + } + + + private def createDF(list: List[Row], fields: (String, DataType)*) = + sqlContext.createDataFrame(sc.parallelize(list), structType2(fields)) + + private def structType2(fields: Seq[(String, DataType)]) = + StructType(fields.map(f => StructField(f._1, f._2)).toList) + + + private def createDF3(list: List[Row], fields: (String, DataType, Boolean)*) = + sqlContext.createDataFrame(sc.parallelize(list), structType3(fields)) + + private def structType3(fields: Seq[(String, DataType, Boolean)]) = + 
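+    // Build a schema where nullability is specified per column (the third
+    // element of each tuple)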
StructType(fields.map(f => StructField(f._1, f._2, f._3)).toList) +} diff --git a/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala new file mode 100644 index 0000000..6571cee --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala @@ -0,0 +1,63 @@ +/** + * Checks basic Dataset magics + */ +package com.highperformancespark.examples.dataframe + +import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} +import com.holdenkarau.spark.testing._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.scalatest.Matchers._ +import org.scalatest.FunSuite + +import scala.collection.mutable +import scala.util.Random + +class MixedDatasetSuite extends FunSuite with DataFrameSuiteBase { + + val rawPandaList = List( + RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9, 20.0)), + RawPanda(11L, "94110", "red", true, Array(1.0, 0.7, 30.0))) + + test("happy panda sums") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val mixedDS = new MixedDataset(sqlCtx) + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val result = mixedDS.happyPandaSums(inputDS) + assert(result === (2.0 +- 0.001)) + } + + test("basic select") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val squishy = mixedDS.squishyPandas(inputDS).collect() + assert(squishy(0)._2 === true) + } + + test("funquery") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val summedAttrs = mixedDS.funMap(inputDS).collect() + assert(summedAttrs(0) === 21.9 +- 0.001) + assert(summedAttrs(1) === 31.7 +- 0.001) + } + + test("max pandas size per zip") { + val sqlCtx = sqlContext + import sqlCtx.implicits._ + val inputDF = sqlCtx.createDataFrame(rawPandaList) + val inputDS = inputDF.as[RawPanda] + val mixedDS = new MixedDataset(sqlCtx) + val bigPandas = mixedDS.maxPandaSizePerZip(inputDS).collect() + assert(bigPandas.size === 1) + assert(bigPandas(0)._2 === 30.0 +- 0.00001) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala b/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala new file mode 100644 index 0000000..2b54ce7 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala @@ -0,0 +1,28 @@ +package com.highperformancespark.examples.errors + +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + +class ThrowsSuite extends FunSuite with SharedSparkContext { + test("inner throw & outer throw should both throw SparkExceptions exceptions") { + intercept[org.apache.spark.SparkException] { + Throws.throwInner(sc) + } + intercept[org.apache.spark.SparkException] { + Throws.throwOuter(sc) + } + intercept[org.apache.spark.SparkException] { + Throws.throwInner2(sc) + } + intercept[org.apache.spark.SparkException] { + Throws.throwOuter2(sc) + } + } + + test("loading missing data should throw") { + intercept[org.apache.hadoop.mapred.InvalidInputException] { + Throws.nonExistentInput(sc) + } + } +} diff --git 
a/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala new file mode 100644 index 0000000..9708284 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala @@ -0,0 +1,99 @@ +package com.highperformancespark.examples.goldilocks + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +class EvaluationTests extends FunSuite with SharedSparkContext { + val doubleList = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0) + val keyValuePairs = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0).zipWithIndex + val path = "target/testResults" + + test("MapValues preserves Partitioning "){ + val data: RDD[(Double, Int )] = sc.parallelize(keyValuePairs) + // tag::MapValues[] + val sortedData = data.sortByKey() + val mapValues: RDD[(Double, String)] = sortedData.mapValues(_.toString) + assert(mapValues.partitioner.isDefined, + "Using Map Values preserves partitioning") + + val map = sortedData.map( pair => (pair._1, pair._2.toString)) + assert(map.partitioner.isEmpty, "Using map does not preserve partitioning") + // end::MapValues[] + } + + test( "Subtract Behavior "){ + // tag::Subtract[] + val a = Array(1, 2, 3, 4, 4, 4, 4) + val b = Array(3, 4) + val rddA = sc.parallelize(a) + val rddB = sc.parallelize(b) + val rddC = rddA.subtract(rddB) + assert(rddC.count() < rddA.count() - rddB.count()) + // end::Subtract[] + } + + test( "Intersection Behavior "){ + // tag::Intersect[] + val a = Array(1, 2, 3, 4, 4, 4, 4) + val b = Array(3, 4) + val rddA = sc.parallelize(a) + val rddB = sc.parallelize(b) + val intersection = rddA.intersection(rddB) + val subtraction = rddA.subtract(rddB) + val union = intersection.union(subtraction) + assert(!rddA.collect().sorted.sameElements(union.collect().sorted)) + // end::Intersect[] + } + + test("Itereative Computations "){ + def rmse(rdd : RDD[(Int, Int )]) = { + val n = rdd.count() + math.sqrt(rdd.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n) + } + + val validationSet = sc.parallelize(keyValuePairs) + + // tag::iterativeComp[] + val testSet: Array[RDD[(Double, Int)]] = + Array( + validationSet.mapValues(_ + 1), + validationSet.mapValues(_ + 2), + validationSet) + validationSet.persist() //persist since we are using this RDD several times + val errors = testSet.map( rdd => { + rmse(rdd.join(validationSet).values) + }) + // end::iterativeComp[] + + // the one where we didn't change anything should have the + // lowest root mean squared error + assert(errors.min == errors(2)) + + } + + test( "Two actions without caching ") { + val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) + + // tag::TwoActions[] + val sorted = rddA.sortByKey() + val count = sorted.count() // sorted Action 1 + val sample: Long = count / 10 + val sampled = sorted.take(sample.toInt) // sorted Action 2 + // end::TwoActions[] + } + + test( "Two actions with caching "){ + val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) + // tag::TwoActionsCache[] + val sorted = rddA.sortByKey() + sorted.persist() + val count = sorted.count() // sorted Action 1 + val sample: Long = count / 10 + val sampled = sorted.take(sample.toInt) // sorted Action 2 + // end::TwoActionsCache[] + } + + + +} diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala 
b/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala new file mode 100644 index 0000000..5388477 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala @@ -0,0 +1,131 @@ +package com.highperformancespark.examples.goldilocks + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{StructType, DoubleType, StructField} +import org.apache.spark.sql.{Row, SQLContext, DataFrame} +import org.scalatest.FunSuite + +import scala.collection.immutable.IndexedSeq + +class GoldilocksLargeTests extends FunSuite with SharedSparkContext{ + + + def testGoldilocksImplementations( + data: DataFrame, targetRanks: List[Long], + expectedResult: Map[Int, Iterable[Long]]) = { + + val iterative = + GoldilocksWhileLoop.findRankStatistics(data, targetRanks) + val groupByKey = + GoldilocksGroupByKey.findRankStatistics(data, targetRanks) + val firstTry = + GoldilocksFirstTry.findRankStatistics(data, targetRanks) + val hashMap = + GoldilocksWithHashMap.findRankStatistics(data, targetRanks) + val secondarySort = + GoldilocksSecondarySort.findRankStatistics(data, targetRanks, + data.rdd.partitions.length) + val secondarySortV2 = + GoldilocksSecondarySortV2.findRankStatistics(data, targetRanks) + + expectedResult.foreach { + case((i, ranks)) => + assert(iterative(i).equals(ranks), + "The Iterative solution to goldilocks was incorrect for column " + i) + assert(groupByKey(i).equals(ranks), + "Group by key solution was incorrect") + assert(firstTry(i).equals(ranks), + "GoldilocksFirstTry incorrect for column " + i ) + assert(hashMap(i).equals(ranks), + "GoldilocksWithhashMap incorrect for column " + i) + assert(secondarySort(i).equals(ranks)) + assert(secondarySortV2(i).equals(ranks)) + + } + } + + test("Goldilocks on local data solution "){ + val sqlContext = new SQLContext(sc) + val testRanks = List(3L, 8L) + val (smallTestData, result) = + DataCreationUtils.createLocalTestData(5, 10, testRanks) + val schema = StructType( + result.keys.toSeq.map( + n => StructField("Column" + n.toString, DoubleType) + )) + val smallTestDF: DataFrame = + sqlContext.createDataFrame(sc.makeRDD(smallTestData), schema) + testGoldilocksImplementations(smallTestDF, testRanks, result) + } +} + +object DataCreationUtils { + def createLocalTestData(numberCols: Int, numberOfRows: Int, + targetRanks: List[Long]) = { + + val cols = Range(0,numberCols).toArray + val scalers = cols.map(x => 1.0) + val rowRange = Range(0, numberOfRows) + val columnArray: Array[IndexedSeq[Double]] = cols.map( + columnIndex => { + val columnValues = rowRange.map( + x => (Math.random(), x)).sortBy(_._1).map(_._2 * scalers(columnIndex)) + columnValues + }) + val rows = rowRange.map( + rowIndex => { + Row.fromSeq(cols.map( colI => columnArray(colI)(rowIndex)).toSeq) + }) + + + val result: Map[Int, Iterable[Long]] = cols.map(i => { + (i, targetRanks.map(r => Math.round((r-1)/scalers(i)))) + }).toMap + + (rows, result) + } + + + def createDistributedData(sc: SparkContext, partitions: Int, + elementsPerPartition: Int, numberOfColumns: Int ) = { + val partitionsStart: RDD[Int] = sc.parallelize( + Array.fill(partitions)(1)) + partitionsStart.repartition(partitions) + + var data: RDD[(Long, List[Int])] = partitionsStart.mapPartitionsWithIndex { + case (partIndex, elements) => + val rows = Range(0, elementsPerPartition) + .map(x => (Math.random(), x)) + .map { + case ((randomNumber, 
rowValue)) => + (randomNumber, + //index of element + (partIndex * elementsPerPartition.toLong + rowValue, + List(rowValue + partIndex * elementsPerPartition))) + } + rows.toIterator + }.sortByKey().values + + + Range(0, numberOfColumns).foreach(x => { + val nextColumn: RDD[(Long, Int)] = partitionsStart.mapPartitionsWithIndex { + case (partIndex, elements) => + val rows = Range(0, elementsPerPartition) + .map(x => (Math.random(), x)) + .map { + case ((randomNumber, rowValue)) => + (randomNumber, + //index of element + (partIndex * elementsPerPartition.toLong + rowValue, + rowValue + partIndex * elementsPerPartition)) + } + rows.toIterator + }.sortByKey().values + + data = nextColumn.join(data).mapValues(x => x._1 :: x._2) + }) + data + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala new file mode 100644 index 0000000..69dcc5e --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala @@ -0,0 +1,22 @@ +package com.highperformancespark.examples.goldilocks + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + + +class JoinTest extends FunSuite with SharedSparkContext { + test("Hash join"){ + val keySet = "a, b, c, d, e, f, g".split(",") + val smallRDD = sc.parallelize(keySet.map(letter => (letter, letter.hashCode))) + val largeRDD: RDD[(String, Double)] = + sc.parallelize(keySet.flatMap{ letter => + Range(1, 50).map(i => (letter, letter.hashCode() / i.toDouble))}) + val result: RDD[(String, (Double, Int))] = + RDDJoinExamples.manualBroadCastHashJoin( + largeRDD, smallRDD) + val nativeJoin: RDD[(String, (Double, Int))] = largeRDD.join(smallRDD) + + assert(result.subtract(nativeJoin).count == 0) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala new file mode 100644 index 0000000..131f311 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/QuantileOnlyArtisanalTest.scala @@ -0,0 +1,153 @@ +package com.highperformancespark.examples.goldilocks + +import org.apache.spark._ +import org.apache.spark.sql.{Row, SQLContext} +import org.scalatest.{BeforeAndAfterAll, FunSuite} + + +// tag::MAGIC_PANDA[] +class QuantileOnlyArtisanalTest extends FunSuite with BeforeAndAfterAll { + @transient private var _sc: SparkContext = _ + def sc: SparkContext = _sc + + val conf = new SparkConf().setMaster("local[4]").setAppName("test") + + override def beforeAll() { + _sc = new SparkContext(conf) + super.beforeAll() + } + + val inputList = List(GoldiLocksRow(0.0, 4.5, 7.7, 5.0), + GoldiLocksRow(4.0, 5.5, 0.5, 8.0), + GoldiLocksRow(1.0, 5.5, 6.7, 6.0), + GoldiLocksRow(3.0, 5.5, 0.5, 7.0), + GoldiLocksRow(2.0, 5.5, 1.5, 7.0) + ) + + val expectedResult = Map[Int, Set[Double]]( + 0 -> Set(1.0, 2.0), + 1 -> Set(5.5, 5.5), + 2 -> Set(0.5, 1.5), + 3 -> Set(6.0, 7.0)) + + test("Goldilocks naive Solution"){ + val sqlContext = new SQLContext(sc) + val input = sqlContext.createDataFrame(inputList) + val whileLoopSolution = GoldilocksWhileLoop.findRankStatistics( + input, List(2L, 3L)).mapValues(_.toSet) + val inputAsKeyValuePairs = GoldilocksGroupByKey.mapToKeyValuePairs(input) + val groupByKeySolution = GoldilocksGroupByKey.findRankStatistics( + inputAsKeyValuePairs, List(2L,3L)).mapValues(_.toSet) + 
assert(whileLoopSolution == expectedResult) + assert(groupByKeySolution == expectedResult) + } + + override def afterAll() { + // We clear the driver port so that we don't try and bind to the same port on + // restart. + sc.stop() + System.clearProperty("spark.driver.port") + _sc = null + super.afterAll() + } +} +// end::MAGIC_PANDA[] + +// We don't need the rest of the tests included. +class QuantileOnlyArtisanalTestContinued extends QuantileOnlyArtisanalTest { + test("Goldilocks first try ") { + val sqlContext = new SQLContext(sc) + val input = sqlContext.createDataFrame(inputList) + val secondAndThird = GoldilocksFirstTry.findRankStatistics( + input, targetRanks = List(2L, 3L)) + + secondAndThird.foreach(x => println( x._1 +"," + x._2.mkString(" "))) + assert(expectedResult.forall{case ((index, expectedRanks)) => + secondAndThird.get(index).get.toSet.equals(expectedRanks)}) + } + + //tests the edge case in which one partition does not contain + // any of the elements in one column. + test("Goldilocks first try multiplePartitions") { + import org.scalatest.PrivateMethodTester._ + val testData = sc.parallelize(List(1.0, 2.0, 3.0, 4.0).map(x => (x, x)), 3) + val mapPartitions = testData.mapPartitionsWithIndex { + case (index, iter) => + val key = if (index == 1) 1 else 0 + iter.map(x => (x._1, key)) + } + + val getColumnsFreqPerPartition = + PrivateMethod[Array[(Int, Array[Long])]]( + 'getColumnsFreqPerPartition) + val totals = (GoldilocksFirstTry invokePrivate + getColumnsFreqPerPartition(mapPartitions, 2)) + + totals.foreach(x => println(x._1 + " : " + x._2.mkString(" "))) + val getRanksLocationsWithinEachPart = + PrivateMethod[Array[(Int, List[(Int, Long)])]]( + 'getRanksLocationsWithinEachPart) + + val locations = (GoldilocksFirstTry invokePrivate + getRanksLocationsWithinEachPart(List(1L), totals, 2)) + locations.foreach(x => println(x._1 + " : " + x._2.mkString(" "))) + + //assert that there is nothing in the column with index 1 on the second partition + assert(totals(1)._2(0) == 0 ) + + val firstPartition = locations(0)._2 + //assertFirstPartitionOnlyContains a target rank for the for + // columnIndex 0, at index 1 + assert(firstPartition.toSet.equals(Set((0,1))) ) + + //assertSecondPartition only contains rank for columnIndex 1, at index 1 + val secondPartition = locations(1)._2 + assert(secondPartition.toSet.equals(Set((1,1))) ) + + //assert ThirdPartition contains no locations + val thirdPartition = locations(2)._2 + assert(thirdPartition.toSet.equals(Set())) + assert(locations.length == 3) + } + + + test("GoldiLocks With Hashmap ") { + val sqlContext = new SQLContext(sc) + val input = sqlContext.createDataFrame(inputList) + val secondAndThird = GoldilocksWithHashMap.findRankStatistics( + input, targetRanks = List(2L, 3L)) + val expectedResult = Map[Int, Set[Double]]( + 0 -> Set(1.0, 2.0), + 1 -> Set(5.5, 5.5), + 2 -> Set(0.5, 1.5), + 3 -> Set(6.0, 7.0)) + secondAndThird.foreach(x => println( x._1 +"," + x._2.mkString(" "))) + assert(expectedResult.forall{case ((index, expectedRanks)) => + secondAndThird.get(index).get.toSet.equals(expectedRanks)}) + } + + test("Goldilocks Secondary Sort"){ + val sqlContext = new SQLContext(sc) + val input = sqlContext.createDataFrame(inputList) + val secondarySortSolution = + GoldilocksWithHashMap.findRankStatistics( + input, targetRanks = List(2L, 3L)).mapValues(_.toSet) + assert(secondarySortSolution == expectedResult) + } + + test("Secondary Sort"){ + val data = sc.parallelize(Range.apply(0, 10)).flatMap(i => + List(20.0, 30.0, 40.0).map(x => 
((x, i), 1L ))) + val r = SecondarySort.groupByKeyAndSortBySecondaryKey(data, 3) + r.collect().foreach( v => println( v)) + val rSorted = r.collect().sortWith( + lt = (a, b) => a._1.toDouble > b._1.toDouble ) + assert(r.collect().zipWithIndex.forall{ + case (((key, list), index )) => rSorted(index)._1.equals(key) + }) + } +} + +case class GoldiLocksRow(pandaId : Double, softness : Double, + fuzzyness : Double, size : Double) +case class LongPandaRow( args : Double*) diff --git a/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala b/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala new file mode 100644 index 0000000..4ac03e7 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala @@ -0,0 +1,70 @@ + +package com.highperformancespark.examples.goldilocks + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.apache.spark.rdd.RDD +import org.scalatest.FunSuite + +import scala.reflect.ClassTag + + +class SortingTests extends FunSuite with SharedSparkContext { + + test("Test Sort by two keys"){ + + val sortedData: Array[((Int, Char), Double)] = Range(0, 15).flatMap( x => + Range(50, 100).map(i => (( x, i.toChar), Math.random())) + ).toArray + + val unsorted = scramble(sc.parallelize(sortedData),2) + val sortedSimple: Array[((Int, Char), Double)] = unsorted.sortByKey().collect() + + assert(sortedSimple sameElements sortedData) + } + + test("Panda Secondary Sort"){ + val pandaData: Array[(String, StreetAddress, Int, Double)] = Array( + ("Morris", StreetAddress("Accra","Grove", 52 ), 84440, 0.0), + ("Joe", StreetAddress("Accra","Grove", 52 ), 94440, 0.0), + ("Kobe", StreetAddress("Accra","Grove", 52 ), 94440, 0.0), + + ("Morris", StreetAddress("Albany","Grove", 52 ), 84440, 0.0), + ("Joe", StreetAddress("Albany","Grove", 52 ), 94440, 0.0), + ("Kobe", StreetAddress("Albany","Grove", 52 ), 94440, 0.5), + ("Morris", StreetAddress("Denver","Grove", 52 ), 84440, 0.5), + ("Joe", StreetAddress("LA","Grove", 52 ), 94440, 0.5), + ("Kobe", StreetAddress("LA","Grove", 52 ), 94440, 0.5), + ("Joe", StreetAddress("SanFransisco","Grove", 52 ), 94440, 0.5), + ("Kobe", StreetAddress("SanFransisco","Grove", 52 ), 94440, 0.5), + ("Joe", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), + ("Kobe", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), + ("Lacy", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), + ("Morris", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), + ("Joe", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), + ("Kobe", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), + ("Lacy", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), + ("Morris", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), + ("Joe", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), + ("Kobe", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), + ("Lacy", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), + ("Morris", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5) + ) + + val unsorted = scramble(sc.parallelize(pandaData)) + val pandaSort = PandaSecondarySort.secondarySort(unsorted) + pandaSort.zipWithIndex().collect.foreach{ + case (x, i) => assert(x == pandaData(i.toInt), "Element " + x + " is wrong") + } + + + + } + + + def scramble[T : ClassTag]( rdd : RDD[T], partitions : Int= 3) = { + val wRandom = rdd.map((Math.random(), _)) + val unsorted = wRandom.sortByKey(true, partitions) + unsorted.values + } + +} diff --git a/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala 
b/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala new file mode 100644 index 0000000..3b9159c --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala @@ -0,0 +1,40 @@ +/** + * Simple tests for our CustomPipeline demo pipeline stage + */ +package com.highperformancespark.examples.ml + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.sql.Dataset +import org.scalatest.FunSuite + +case class TestRow(id: Int, inputColumn: String) + +class CustomPipelineSuite extends FunSuite with DataFrameSuiteBase { + val d = List( + TestRow(0, "a"), + TestRow(1, "b"), + TestRow(2, "c"), + TestRow(3, "a"), + TestRow(4, "a"), + TestRow(5, "c") + ) + + test("test spark context") { + val session = spark + val rdd = session.sparkContext.parallelize(1 to 10) + assert(rdd.sum === 55) + } + + test("simple indexer test") { + val session = spark + import session.implicits._ + val ds: Dataset[TestRow] = session.createDataset(d) + val indexer = new SimpleIndexer() + indexer.setInputCol("inputColumn") + indexer.setOutputCol("categoryIndex") + val model = indexer.fit(ds) + val predicted = model.transform(ds) + assert(predicted.columns.contains("categoryIndex")) + predicted.show() + } +} \ No newline at end of file diff --git a/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala b/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala new file mode 100644 index 0000000..1fa296a --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala @@ -0,0 +1,47 @@ +/** + * Simple tests for our SimpleNaiveBayes demo pipeline stage + */ +package com.highperformancespark.examples.ml + +import com.highperformancespark.examples.dataframe.HappyPandas.{PandaInfo, Pandas} + +import com.holdenkarau.spark.testing._ + +import org.apache.spark.ml._ +import org.apache.spark.ml.feature._ +import org.apache.spark.ml.param._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext} +import org.scalatest.Matchers._ +import org.scalatest.FunSuite + +case class MiniPanda(happy: Double, fuzzy: Double, old: Double) + +class SimpleNaiveBayesSuite extends FunSuite with DataFrameSuiteBase { + val miniPandasList = List( + MiniPanda(1.0, 1.0, 1.0), + MiniPanda(1.0, 1.0, 0.0), + MiniPanda(1.0, 1.0, 0.0), + MiniPanda(0.0, 0.0, 1.0), + MiniPanda(0.0, 0.0, 0.0)) + + test("simple sanity test") { + val session = spark + import session.implicits._ + val ds: Dataset[MiniPanda] = session.createDataset(miniPandasList) + val assembler = new VectorAssembler() + assembler.setInputCols(Array("fuzzy", "old")) + assembler.setOutputCol("magical_features") + val snb = new SimpleNaiveBayes() + snb.setLabelCol("happy") + snb.setFeaturesCol("magical_features") + val pipeline = new Pipeline().setStages(Array(assembler, snb)) + val model = pipeline.fit(ds) + val test = ds.select("fuzzy", "old") + val predicted = model.transform(test) + assert(predicted.count() === miniPandasList.size) + val nbModel = model.stages(1).asInstanceOf[SimpleNaiveBayesModel] + assert(nbModel.getFeaturesCol === "magical_features") + assert(nbModel.copy(ParamMap.empty).getFeaturesCol === "magical_features") + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala b/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala new file mode 100644 index 0000000..fa551a5 --- /dev/null +++ 
b/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala @@ -0,0 +1,39 @@ +/** + * Basic tests for our MLlib examples + */ +package com.highperformancespark.examples.mllib + +import com.highperformancespark.examples.dataframe.RawPanda + +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + + +import org.apache.spark.mllib.linalg.{Vector => SparkVector} + +class GoldilocksMLlibSuite extends FunSuite with SharedSparkContext { + val rps = List( + RawPanda(1L, "94110", "giant", true, Array(0.0, 0.0)), + RawPanda(2L, "94110", "giant", false, Array(0.0, 3.0)), + RawPanda(3L, "94110", "giant", true, Array(0.0, 2.0))) + + test("boolean to double") { + assert(1.0 === GoldilocksMLlib.booleanToDouble(true)) + assert(0.0 === GoldilocksMLlib.booleanToDouble(false)) + } + + test("encoding") { + val input = sc.parallelize(rps) + val points = GoldilocksMLlib.toLabeledPointDense(input) + assert(points.count() == 3) + assert(points.filter(_.label != 0.0).count() == 2) + } + + test("lookup table") { + val input = sc.parallelize(List("hi", "bye", "coffee", "hi")) + val table = GoldilocksMLlib.createLabelLookup(input) + assert(table.size == 3) + } + +} diff --git a/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala new file mode 100644 index 0000000..724ddaa --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala @@ -0,0 +1,52 @@ +/** + * Test our simple JNI + */ +package com.highperformancespark.examples.ffi + +import com.holdenkarau.spark.testing._ +import org.scalacheck.{Arbitrary, Gen} +import org.scalacheck.Prop.forAll +import org.scalatest.FunSuite +import org.scalatest.prop.Checkers +import org.scalatest.Matchers._ + +class NativeExampleSuite extends FunSuite + with SharedSparkContext with Checkers with RDDComparisons { + + test("local sum") { + val input = Array(1, 2, 3) + val sumMagic = new SumJNI() + val result = sumMagic.sum(input) + val expected = 6 + assert(result === expected) + } + + test("super simple test") { + val input = sc.parallelize(List(("hi", Array(1, 2, 3)))) + val result = NativeExample.jniSum(input).collect() + val expected = List(("hi", 6)) + assert(result === expected) + } + + test("native call should find sum correctly") { + val property = forAll( + RDDGenerator.genRDD[(String, Array[Int])](sc)( + Arbitrary.arbitrary[(String, Array[Int])])) { + rdd => + val expected = rdd.mapValues(_.sum) + val result = NativeExample.jniSum(rdd) + compareRDDWithOrder(expected, result).isEmpty + } + check(property) + } + + test("JNA support") { + val input = Array(1, 2, 3) + assert(6 === SumJNA.sum(input, input.size)) + } + + test("JNA Fortran support") { + val input = Array(1, 2, 3) + assert(6 === SumFJNA.easySum(input.size, input)) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala b/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala new file mode 100644 index 0000000..4b1f032 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala @@ -0,0 +1,19 @@ +/** + * Test our simple JNI + */ +package com.highperformancespark.examples.ffi + +import com.holdenkarau.spark.testing._ +import org.scalatest.FunSuite +import org.scalatest.prop.Checkers +import org.scalatest.Matchers._ + + +class PipeExampleSuite extends FunSuite with SharedSparkContext with Checkers { + ignore("commentors on a 
pr") { + val rdd = sc.parallelize(List(12883)) + val expected = (12883, List("SparkQA", "srowen")) + val result = PipeExample.lookupUserPRS(sc, rdd) + assert(expected === result.collect()(0)) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala b/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala new file mode 100644 index 0000000..b5a3d44 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala @@ -0,0 +1,28 @@ +/** + * Simple tests for DStreamSuite - + * normally we would use streaming tests but since we want to test + * context creation as well we don't. + */ +package com.highperformancespark.examples.streaming + +import org.apache.spark.streaming._ + +import java.lang.Thread +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + +class DStreamExamplesSuite extends FunSuite with SharedSparkContext { + test("simple set up") { + val ssc = DStreamExamples.makeStreamingContext(sc) + val inputStream = DStreamExamples.fileAPIExample(ssc, "./") + val repartitioned = DStreamExamples.repartition(inputStream) + repartitioned.foreachRDD(rdd => + assert(rdd.partitioner.get.numPartitions == 20) + ) + ssc.start() + // This is bad don't do this - but we don't have the full test tools here + Thread.sleep(100) + ssc.stop() + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala b/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala new file mode 100644 index 0000000..ca364d1 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala @@ -0,0 +1,32 @@ +/** + * Simple tests for tokenization + */ +package com.highperformancespark.examples.tokenize + +import org.apache.spark.streaming._ + +import java.lang.Thread +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + +class SampleTokenizeSuite extends FunSuite with SharedSparkContext { + val input = List("hi holden", "I like coffee") + val expected = List("hi", "holden", "I", "like", "coffee") + + test("test the difficult to test one") { + val inputRDD = sc.parallelize(input) + val result = SampleTokenize.difficultTokenizeRDD(inputRDD).collect() + assert(result.toList == expected) + } + + test("test the easy to test one like the difficult one") { + val inputRDD = sc.parallelize(input) + val result = SampleTokenize.tokenizeRDD(inputRDD).collect() + assert(result.toList == expected) + } + + test("test the easy inner function - note no SC needed") { + assert(SampleTokenize.tokenize("hi holden").toList == List("hi", "holden")) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala b/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala new file mode 100644 index 0000000..897a8d3 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala @@ -0,0 +1,26 @@ +/** + * Tests that we filter out bad pandas. 
+ */ +package com.highperformancespark.examples.tools + +import com.highperformancespark.examples.dataframe.RawPanda + +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + +class FilterInvalidPandasSuite extends FunSuite with SharedSparkContext { + test("simple filter") { + val invalidPandas = List(1L, 2L) + val inputPandas = List( + RawPanda(1L, "94110", "giant", true, Array(0.0)), + RawPanda(3L, "94110", "giant", true, Array(0.0))) + val input = sc.parallelize(inputPandas) + val result1 = + FilterInvalidPandas.filterInvalidPandas(sc, invalidPandas, input) + val result2 = + FilterInvalidPandas.filterInvalidPandasWithLogs(sc, invalidPandas, input) + assert(result1.collect() === result2.collect()) + assert(result1.count() === 1) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala b/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala new file mode 100644 index 0000000..15f60d1 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala @@ -0,0 +1,42 @@ +/** + * Verify that the scaling data generators return results + */ +package com.highperformancespark.examples.tools + +import com.highperformancespark.examples.dataframe.RawPanda + +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + +class GenerateScalingDataSuite extends FunSuite with SharedSparkContext { + // The number of entries depends somewhat on the partition split because we + // zip multiple separate RDDs, so it's more of a "request" + test("expected num entries") { + val result = GenerateScalingData.generateFullGoldilocks(sc, 10L, 20) + assert(result.count() <= 10) + assert(result.count() > 5) + assert(result.map(_.id).distinct().count() > 1) + } + + test("expected num entries same id") { + val result = GenerateScalingData.generateGoldilocks(sc, 5L, 20) + assert(result.count() <= 5) + assert(result.count() >= 2) + assert(result.map(_.id).distinct().count() == 1) + } + + test("mini scale data") { + val result = GenerateScalingData.generateMiniScale(sc, 20L, 1) + assert(result.count() <= 20) + assert(result.count() > 5) + assert(result.map(_._1).distinct().count() > 1) + } + + test("mini scale rows") { + val result = GenerateScalingData.generateMiniScaleRows(sc, 20L, 1) + assert(result.count() <= 20) + assert(result.count() > 5) + assert(result.map(_(0)).distinct().count() > 1) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala new file mode 100644 index 0000000..5eb995f --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala @@ -0,0 +1,26 @@ +/** + * Test that the accumulator examples compute the expected max and sum.
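+ * The input is 1 to 100, so the expected max is 100.0 and the expected sum is 5050.0.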
+ */ +package com.highperformancespark.examples.transformations + +import com.highperformancespark.examples.dataframe.RawPanda + +import com.holdenkarau.spark.testing._ + +import org.scalatest.FunSuite + +class AccumulatorsTest extends FunSuite with SharedSparkContext { + test("accumulator max should function") { + val input = sc.parallelize(1.to(100)).map(x => + RawPanda(1L, "1", "red", true, Array(x.toDouble))) + val (_, max) = Accumulators.computeMaxFuzzyNess(sc, input) + assert(max === 100.0) + } + + test("accumulator sum should function") { + val input = sc.parallelize(1.to(100)).map(x => + RawPanda(1L, "1", "red", true, Array(x.toDouble))) + val (_, sum) = Accumulators.computeTotalFuzzyNess(sc, input) + assert(sum === 5050.0) + } +} diff --git a/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala new file mode 100644 index 0000000..4fd8ad5 --- /dev/null +++ b/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala @@ -0,0 +1,25 @@ +package com.highperformancespark.examples.wordcount + + +import com.holdenkarau.spark.testing.SharedSparkContext +import org.scalatest.FunSuite + +class WordCountTest extends FunSuite with SharedSparkContext { + test("word count with Stop Words Removed"){ + val wordRDD = sc.parallelize(Seq( + "How happy was the panda? You ask.", + "Panda is the most happy panda in all the #$!?ing land!")) + + val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") + val illegalTokens: Array[Char] = "#$%?!.".toCharArray + + val wordCounts = WordCount.withStopWordsFiltered( + wordRDD, illegalTokens, stopWords) + val wordCountsAsMap = wordCounts.collectAsMap() + assert(!wordCountsAsMap.contains("the")) + assert(!wordCountsAsMap.contains("?")) + assert(!wordCountsAsMap.contains("#$!?ing")) + assert(wordCountsAsMap.contains("ing")) + assert(wordCountsAsMap.get("panda").get.equals(3)) + } +} diff --git a/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala b/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala new file mode 100644 index 0000000..4d983a6 --- /dev/null +++ b/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala @@ -0,0 +1,11 @@ +package com.highperformancespark.examples + + +import org.apache.spark.SparkContext +import org.apache.spark.rdd.RDD + +class JavaInteropTestHelper(sc: SparkContext) { + def generateMiniPairRDD(): RDD[(String, Long)] = { + sc.parallelize(List(("panda", 12L))) + } +}
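+// A minimal usage sketch (an illustrative assumption, not part of the original change): +// a Java-focused interop test would typically wrap this helper's RDD before asserting on it, +// for example: +// val pairs = org.apache.spark.api.java.JavaPairRDD.fromRDD( +// new JavaInteropTestHelper(sc).generateMiniPairRDD()) +// assert(pairs.count() == 1L)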