@@ -13,46 +13,65 @@ class GoldilocksLargeTests extends FunSuite with SharedSparkContext{
 
 
   def testGoldilocksImplementations(
-    data: DataFrame, targetRanks: List[Long],
-    expectedResult: Map[Int, Iterable[Long]])= {
-    val iterative = GoldilocksWhileLoop.findRankStatistics(data, targetRanks)
-    val groupByKey = GoldilocksGroupByKey.findRankStatistics(data, targetRanks)
-    val firstTry = GoldilocksFirstTry.findRankStatistics(data, targetRanks)
-    val hashMap = GoldilocksWithHashMap.findRankStatistics(data, targetRanks)
-    val secondarySort = GoldilocksSecondarySort.findRankStatistics(data, targetRanks, data.rdd.partitions.length)
-    val secondarySortV2 = GoldilocksSecondarySortV2.findRankStatistics(data, targetRanks)
+    data: DataFrame, targetRanks: List[Long],
+    expectedResult: Map[Int, Iterable[Long]]) = {
+
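+    // Run all six Goldilocks implementations on the same input and target ranks.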
+    val iterative =
+      GoldilocksWhileLoop.findRankStatistics(data, targetRanks)
+    val groupByKey =
+      GoldilocksGroupByKey.findRankStatistics(data, targetRanks)
+    val firstTry =
+      GoldilocksFirstTry.findRankStatistics(data, targetRanks)
+    val hashMap =
+      GoldilocksWithHashMap.findRankStatistics(data, targetRanks)
+    val secondarySort =
+      GoldilocksSecondarySort.findRankStatistics(data, targetRanks,
+        data.rdd.partitions.length)
+    val secondarySortV2 =
+      GoldilocksSecondarySortV2.findRankStatistics(data, targetRanks)
 
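+    // Every implementation should return identical rank statistics per column.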
     expectedResult.foreach {
       case ((i, ranks)) =>
         assert(iterative(i).equals(ranks),
28- " The Iterative solution to goldilocks was incorrect for column " + i)
29- assert(groupByKey(i).equals(ranks), " Group by key solution was incorrect" )
30- assert(firstTry(i).equals(ranks), " GoldilocksFirstTry incorrect for column " + i )
31- assert(hashMap(i).equals(ranks), " GoldilocksWithhashMap incorrect for column " + i)
36+ " The Iterative solution to goldilocks was incorrect for column " + i)
37+ assert(groupByKey(i).equals(ranks),
38+ " Group by key solution was incorrect" )
39+ assert(firstTry(i).equals(ranks),
40+ " GoldilocksFirstTry incorrect for column " + i )
41+ assert(hashMap(i).equals(ranks),
42+ " GoldilocksWithhashMap incorrect for column " + i)
         assert(secondarySort(i).equals(ranks))
         assert(secondarySortV2(i).equals(ranks))
 
     }
   }
 
   test("Goldilocks on local data solution"){
-    val sqlContext = new SQLContext(sc)
-    val testRanks = List(3L, 8L)
-    val (smallTestData, result) = DataCreationUtils.createLocalTestData(5, 10, testRanks)
-    val schema = StructType(result.keys.toSeq.map(n => StructField("Column" + n.toString, DoubleType)))
-    val smallTestDF: DataFrame = sqlContext.createDataFrame(sc.makeRDD(smallTestData), schema)
-    testGoldilocksImplementations(smallTestDF, testRanks, result)
-  }
+    val sqlContext = new SQLContext(sc)
+    val testRanks = List(3L, 8L)
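+    // Generate a 5-column, 10-row dataset along with the expected values at ranks 3 and 8.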
+    val (smallTestData, result) =
+      DataCreationUtils.createLocalTestData(5, 10, testRanks)
+    val schema = StructType(
+      result.keys.toSeq.map(
+        n => StructField("Column" + n.toString, DoubleType)
+      ))
+    val smallTestDF: DataFrame =
+      sqlContext.createDataFrame(sc.makeRDD(smallTestData), schema)
+    testGoldilocksImplementations(smallTestDF, testRanks, result)
+  }
 }
 
 object DataCreationUtils {
-  def createLocalTestData(numberCols: Int, numberOfRows: Int, targetRanks: List[Long]) = {
+  def createLocalTestData(numberCols: Int, numberOfRows: Int,
+    targetRanks: List[Long]) = {
+
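+    // Each column is a random permutation of the row indices, so the value at each target rank is known up front.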
     val cols = Range(0, numberCols).toArray
     val scalers = cols.map(x => 1.0)
     val rowRange = Range(0, numberOfRows)
     val columnArray: Array[IndexedSeq[Double]] = cols.map(
       columnIndex => {
-        val columnValues = rowRange.map(x => (Math.random(), x)).sortBy(_._1).map(_._2 * scalers(columnIndex))
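+        // Shuffle the column by sorting on a random key, then scale each value.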
+        val columnValues = rowRange.map(
+          x => (Math.random(), x)).sortBy(_._1).map(_._2 * scalers(columnIndex))
         columnValues
       })
     val rows = rowRange.map(
@@ -69,22 +88,23 @@ object DataCreationUtils {
   }
 
 
-  def createDistributedData(sc: SparkContext, partitions: Int, elementsPerPartition: Int, numberOfColumns: Int) = {
+  def createDistributedData(sc: SparkContext, partitions: Int,
+    elementsPerPartition: Int, numberOfColumns: Int) = {
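+    // Each partition generates elementsPerPartition rows; the later sortByKey on random keys shuffles them globally.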
     val partitionsStart: RDD[Int] = sc.parallelize(
       Array.fill(partitions)(1))
     partitionsStart.repartition(partitions)
 
     var data: RDD[(Long, List[Int])] = partitionsStart.mapPartitionsWithIndex {
       case (partIndex, elements) =>
         val rows = Range(0, elementsPerPartition)
-          .map(x => (Math.random(), x))
-          .map {
-            case ((randomNumber, rowValue)) =>
-              (
-                randomNumber,
-                (partIndex * elementsPerPartition.toLong + rowValue, // index of element
-                 List(rowValue + partIndex * elementsPerPartition)))
-          }
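+          // Key each row by a random number so the sortByKey below produces a random row order.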
+          .map(x => (Math.random(), x))
+          .map {
+            case ((randomNumber, rowValue)) =>
+              (randomNumber,
+                // index of element
+                (partIndex * elementsPerPartition.toLong + rowValue,
+                  List(rowValue + partIndex * elementsPerPartition)))
+          }
         rows.toIterator
     }.sortByKey().values
 
@@ -93,14 +113,14 @@ object DataCreationUtils {
     val nextColumn: RDD[(Long, Int)] = partitionsStart.mapPartitionsWithIndex {
       case (partIndex, elements) =>
         val rows = Range(0, elementsPerPartition)
-          .map(x => (Math.random(), x))
-          .map {
-            case ((randomNumber, rowValue)) =>
-              (
-                randomNumber,
-                (partIndex * elementsPerPartition.toLong + rowValue, // index of element
-                 rowValue + partIndex * elementsPerPartition))
-          }
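+          // Same shuffle pattern for the extra column: random key now, global sort below.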
+          .map(x => (Math.random(), x))
+          .map {
+            case ((randomNumber, rowValue)) =>
+              (randomNumber,
+                // index of element
+                (partIndex * elementsPerPartition.toLong + rowValue,
+                  rowValue + partIndex * elementsPerPartition))
+          }
         rows.toIterator
     }.sortByKey().values
 