
Commit 6cc6475

Merge pull request high-performance-spark#85 from holdenk/fix-more-style
Update the tests to fit the scalastyle guide as much as possible as well
2 parents: 4a4ccc0 + a42882b

14 files changed: +151 -86 lines


src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala

Lines changed: 4 additions & 3 deletions
@@ -10,15 +10,16 @@ import java.util.Objects
  * @param happy if panda is happy
  * @param attributes array of panada attributes
  */
-case class RawPanda(id: Long, zip: String, pt: String, happy: Boolean, attributes: Array[Double]) {
+case class RawPanda(id: Long, zip: String, pt: String,
+    happy: Boolean, attributes: Array[Double]) {
   override def equals(o: Any) = o match {
     case other: RawPanda => (id == other.id && pt == other.pt &&
       happy == other.happy && attributes.deep == other.attributes.deep)
     case _ => false
   }
   override def hashCode(): Int = {
-    3 * Objects.hashCode(id) + 7 * Objects.hashCode(zip) + 11 * Objects.hashCode(pt) +
-      13 * Arrays.hashCode(attributes)
+    3 * Objects.hashCode(id) + 7 * Objects.hashCode(zip) +
+      11 * Objects.hashCode(pt) + 13 * Arrays.hashCode(attributes)
   }
 }
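Worth noting, since this is the class reformatted above: RawPanda overrides equals and hashCode by hand because Scala's Array is a plain Java array, so the compiler-generated case-class equality would compare the attributes field by reference. The .deep comparison and Arrays.hashCode give element-wise semantics instead. A minimal sketch with hypothetical values:

    val a = RawPanda(1L, "94110", "giant", happy = true, Array(0.1, 0.2))
    val b = RawPanda(1L, "94110", "giant", happy = true, Array(0.1, 0.2))
    // Equal under the overridden equals, which compares attributes.deep
    // element-wise; the default case-class equality would return false here,
    // because Array(0.1, 0.2) == Array(0.1, 0.2) is a reference comparison.
    assert(a == b)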

src/test/scala/com/high-performance-spark-examples/dataframe/HappyPandasTest.scala

Lines changed: 22 additions & 8 deletions
@@ -1,5 +1,6 @@
 /**
- * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived.
+ * Happy Panda Example for DataFrames.
+ * Computes the % of happy pandas. Very contrived.
  */
 package com.highperformancespark.examples.dataframe
 
@@ -46,7 +47,9 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase {
   test("simple explode test") {
     val inputDF = sqlContext.createDataFrame(pandaPlaces)
     val pandaInfo = sqlContext.createDataFrame(rawPandaList)
-    val expectedDf = pandaInfo.select((pandaInfo("attributes")(0) / pandaInfo("attributes")(1)).as("squishyness"))
+    val expectedDf = pandaInfo.select(
+      (pandaInfo("attributes")(0) / pandaInfo("attributes")(1))
+        .as("squishyness"))
     val result = HappyPandas.squishPandaFromPace(inputDF)
 
     assertDataFrameApproximateEquals(expectedDf, result, 1E-5)

@@ -55,7 +58,9 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase {
   //tag::approxEqualDataFrames[]
 
   test("verify simple happy pandas Percentage") {
-    val expectedList = List(Row(toronto, 0.5), Row(sandiego, 2/3.0), Row(virginia, 1/10.0))
+    val expectedList = List(Row(toronto, 0.5),
+      Row(sandiego, 2/3.0),
+      Row(virginia, 1/10.0))
     val expectedDf = createDF(expectedList, ("place", StringType),
       ("percentHappy", DoubleType))
 
@@ -71,7 +76,9 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase {
     val resultDF = HappyPandas.happyPandasPercentage(inputDF)
     val resultRows = resultDF.collect()
 
-    val expectedRows = List(Row(toronto, 0.5), Row(sandiego, 2/3.0), Row(virginia, 1/10.0))
+    val expectedRows = List(Row(toronto, 0.5),
+      Row(sandiego, 2/3.0),
+      Row(virginia, 1/10.0))
 
     //tag::approxEqualRow[]
     assert(expectedRows.length === resultRows.length)

@@ -174,7 +181,8 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase {
       ("min(pandaSize)", IntegerType),
       ("avg(pandaSize)", DoubleType))
 
-    assertDataFrameApproximateEquals(expectedDF.orderBy("zip"), resultDF.orderBy("zip"), 1e-5)
+    assertDataFrameApproximateEquals(expectedDF.orderBy("zip"),
+      resultDF.orderBy("zip"), 1e-5)
   }
 
 
@@ -208,10 +216,12 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase {
 
     val expectedDF = getExpectedPandasRelativeSize(inputPandaList, -10, 10)
 
-    assertDataFrameApproximateEquals(expectedDF.orderBy("name"), resultDF.orderBy("name"), 1e-5)
+    assertDataFrameApproximateEquals(expectedDF.orderBy("name"),
+      resultDF.orderBy("name"), 1e-5)
   }
 
-  private def getExpectedPandasRelativeSize(pandaList: List[Pandas], start: Int, end: Int):DataFrame = {
+  private def getExpectedPandasRelativeSize(pandaList: List[Pandas],
+      start: Int, end: Int):DataFrame = {
 
     val expectedRows =
       pandaList

@@ -234,7 +244,11 @@ class HappyPandasTest extends FunSuite with DataFrameSuiteBase {
       val average = totalSum.toDouble / count
 
       val panda = pandas(i)
-      result += Row(panda.name, panda.zip, panda.pandaSize, panda.age, panda.pandaSize - average)
+      result += Row(panda.name,
+        panda.zip,
+        panda.pandaSize,
+        panda.age,
+        panda.pandaSize - average)
     }
 
     result
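The assertDataFrameApproximateEquals calls above come from spark-testing-base's DataFrameSuiteBase, which this suite mixes in; the third argument is a numeric tolerance applied cell by cell, which is why computed percentages such as 2/3.0 can be compared safely against floating-point results. A minimal sketch of the idea, with hypothetical data, inside any suite extending DataFrameSuiteBase:

    import sqlContext.implicits._

    test("approximate equality sketch") {
      val expected = sc.parallelize(Seq(("toronto", 0.5))).toDF("place", "percentHappy")
      val computed = sc.parallelize(Seq(("toronto", 0.5000001))).toDF("place", "percentHappy")
      // Passes: each numeric cell differs by less than the 1e-5 tolerance.
      assertDataFrameApproximateEquals(expected, computed, 1e-5)
    }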

src/test/scala/com/high-performance-spark-examples/dataframe/MixedDatasetSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ class MixedDatasetSuite extends FunSuite with DataFrameSuiteBase {
     val inputDS = inputDF.as[RawPanda]
     val mixedDS = new MixedDataset(sqlCtx)
     val squishy = mixedDS.squishyPandas(inputDS).collect()
-    assert(squishy(0)._2 == true)
+    assert(squishy(0)._2 === true)
   }
 
   test("funquery") {
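The one-character change above swaps == for ScalaTest's ===, which reports both operands when an assertion fails instead of a bare "assertion failed". A tiny illustration with hypothetical values (both assertions are written to fail):

    val got = false
    assert(got == true)   // fails with just "assertion failed"
    assert(got === true)  // fails with "false did not equal true"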

src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala

Lines changed: 11 additions & 6 deletions
@@ -14,7 +14,8 @@ class EvaluationTests extends FunSuite with SharedSparkContext {
     // tag::MapValues[]
     val sortedData = data.sortByKey()
     val mapValues: RDD[(Double, String)] = sortedData.mapValues(_.toString)
-    assert(mapValues.partitioner.isDefined, "Using Map Values preserves partitioning")
+    assert(mapValues.partitioner.isDefined,
+      "Using Map Values preserves partitioning")
 
     val map = sortedData.map( pair => (pair._1, pair._2.toString))
     assert(map.partitioner.isEmpty, "Using map does not preserve partitioning")

@@ -46,22 +47,27 @@ class EvaluationTests extends FunSuite with SharedSparkContext {
   }
 
   test("Itereative Computations "){
-    def RMSE(rdd : RDD[(Int, Int )]) = {
+    def rmse(rdd : RDD[(Int, Int )]) = {
       val n = rdd.count()
       math.sqrt(rdd.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n)
     }
 
     val validationSet = sc.parallelize(keyValuePairs)
 
     // tag::iterativeComp[]
-    val testSet: Array[RDD[(Double, Int)]] = Array(validationSet.mapValues(_ + 1), validationSet.mapValues(_ + 2), validationSet)
+    val testSet: Array[RDD[(Double, Int)]] =
+      Array(
+        validationSet.mapValues(_ + 1),
+        validationSet.mapValues(_ + 2),
+        validationSet)
     validationSet.persist() //persist since we are using this RDD several times
     val errors = testSet.map( rdd => {
-      RMSE(rdd.join(validationSet).values)
+      rmse(rdd.join(validationSet).values)
     })
     // end::iterativeComp[]
 
-    //the one where we didn't change anything should have the lowest root mean squared error
+    // the one where we didn't change anything should have the
+    // lowest root mean squared error
     assert(errors.min == errors(2))
 
   }

@@ -91,4 +97,3 @@ class EvaluationTests extends FunSuite with SharedSparkContext {
 
 
 }
-
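As a reference point for the rmse helper above: it computes sqrt(sum((a - b)^2) / n) over an RDD of (prediction, truth) pairs, and the division happens in integer arithmetic before the sqrt. A quick sanity check on toy values chosen to divide evenly (hypothetical data, assuming the helper is in scope):

    // Errors are 0 and -2, so RMSE = sqrt((0 + 4) / 2) = sqrt(2) ≈ 1.414.
    val toy = sc.parallelize(Seq((1, 1), (2, 4)))
    assert(math.abs(rmse(toy) - math.sqrt(2.0)) < 1e-9)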

src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala

Lines changed: 58 additions & 38 deletions
@@ -13,46 +13,65 @@ class GoldilocksLargeTests extends FunSuite with SharedSparkContext{
 
 
   def testGoldilocksImplementations(
-      data : DataFrame, targetRanks : List[Long],
-      expectedResult : Map[Int, Iterable[Long]])= {
-    val iterative = GoldilocksWhileLoop.findRankStatistics(data, targetRanks)
-    val groupByKey = GoldilocksGroupByKey.findRankStatistics(data, targetRanks)
-    val firstTry = GoldilocksFirstTry.findRankStatistics(data, targetRanks)
-    val hashMap = GoldilocksWithHashMap.findRankStatistics(data, targetRanks)
-    val secondarySort = GoldilocksSecondarySort.findRankStatistics(data, targetRanks, data.rdd.partitions.length)
-    val secondarySortV2 = GoldilocksSecondarySortV2.findRankStatistics(data, targetRanks)
+    data: DataFrame, targetRanks: List[Long],
+    expectedResult: Map[Int, Iterable[Long]]) = {
+
+    val iterative =
+      GoldilocksWhileLoop.findRankStatistics(data, targetRanks)
+    val groupByKey =
+      GoldilocksGroupByKey.findRankStatistics(data, targetRanks)
+    val firstTry =
+      GoldilocksFirstTry.findRankStatistics(data, targetRanks)
+    val hashMap =
+      GoldilocksWithHashMap.findRankStatistics(data, targetRanks)
+    val secondarySort =
+      GoldilocksSecondarySort.findRankStatistics(data, targetRanks,
+        data.rdd.partitions.length)
+    val secondarySortV2 =
+      GoldilocksSecondarySortV2.findRankStatistics(data, targetRanks)
 
     expectedResult.foreach {
       case((i, ranks)) =>
         assert(iterative(i).equals(ranks),
-          "The Iterative solution to goldilocks was incorrect for column " + i)
-        assert(groupByKey(i).equals(ranks), "Group by key solution was incorrect")
-        assert(firstTry(i).equals(ranks), "GoldilocksFirstTry incorrect for column " + i )
-        assert(hashMap(i).equals(ranks), "GoldilocksWithhashMap incorrect for column " + i)
+          "The Iterative solution to goldilocks was incorrect for column " + i)
+        assert(groupByKey(i).equals(ranks),
+          "Group by key solution was incorrect")
+        assert(firstTry(i).equals(ranks),
+          "GoldilocksFirstTry incorrect for column " + i )
+        assert(hashMap(i).equals(ranks),
+          "GoldilocksWithhashMap incorrect for column " + i)
         assert(secondarySort(i).equals(ranks))
         assert(secondarySortV2(i).equals(ranks))
 
     }
   }
 
   test("Goldilocks on local data solution "){
-    val sqlContext = new SQLContext(sc)
-    val testRanks = List(3L, 8L)
-    val (smallTestData, result) = DataCreationUtils.createLocalTestData(5, 10, testRanks)
-    val schema = StructType(result.keys.toSeq.map(n => StructField("Column" + n.toString, DoubleType)))
-    val smallTestDF : DataFrame = sqlContext.createDataFrame(sc.makeRDD(smallTestData), schema)
-    testGoldilocksImplementations(smallTestDF, testRanks, result)
-  }
+    val sqlContext = new SQLContext(sc)
+    val testRanks = List(3L, 8L)
+    val (smallTestData, result) =
+      DataCreationUtils.createLocalTestData(5, 10, testRanks)
+    val schema = StructType(
+      result.keys.toSeq.map(
+        n => StructField("Column" + n.toString, DoubleType)
+      ))
+    val smallTestDF: DataFrame =
+      sqlContext.createDataFrame(sc.makeRDD(smallTestData), schema)
+    testGoldilocksImplementations(smallTestDF, testRanks, result)
+  }
 }
 
 object DataCreationUtils {
-  def createLocalTestData(numberCols : Int, numberOfRows : Int, targetRanks : List[Long]) = {
+  def createLocalTestData(numberCols: Int, numberOfRows: Int,
+    targetRanks: List[Long]) = {
+
     val cols = Range(0,numberCols).toArray
     val scalers = cols.map(x => 1.0)
     val rowRange = Range(0, numberOfRows)
     val columnArray: Array[IndexedSeq[Double]] = cols.map(
       columnIndex => {
-        val columnValues = rowRange.map(x => (Math.random(), x)).sortBy(_._1).map(_._2 * scalers(columnIndex))
+        val columnValues = rowRange.map(
+          x => (Math.random(), x)).sortBy(_._1).map(_._2 * scalers(columnIndex))
         columnValues
       })
     val rows = rowRange.map(

@@ -69,22 +88,23 @@ object DataCreationUtils {
   }
 
 
-  def createDistributedData(sc : SparkContext ,partitions: Int, elementsPerPartition : Int, numberOfColumns : Int ) = {
+  def createDistributedData(sc: SparkContext, partitions: Int,
+    elementsPerPartition: Int, numberOfColumns: Int ) = {
     val partitionsStart: RDD[Int] = sc.parallelize(
       Array.fill(partitions)(1))
     partitionsStart.repartition(partitions)
 
     var data: RDD[(Long, List[Int])] = partitionsStart.mapPartitionsWithIndex {
       case (partIndex, elements) =>
        val rows = Range(0, elementsPerPartition)
-           .map(x => (Math.random(), x))
-           .map {
-             case ((randomNumber, rowValue)) =>
-               (
-                 randomNumber,
-                 (partIndex * elementsPerPartition.toLong + rowValue, //index of element
-                 List(rowValue + partIndex * elementsPerPartition)))
-           }
+          .map(x => (Math.random(), x))
+          .map {
+            case ((randomNumber, rowValue)) =>
+              (randomNumber,
+                //index of element
+                (partIndex * elementsPerPartition.toLong + rowValue,
+                List(rowValue + partIndex * elementsPerPartition)))
+          }
         rows.toIterator
     }.sortByKey().values

@@ -93,14 +113,14 @@ object DataCreationUtils {
     val nextColumn: RDD[(Long, Int)] = partitionsStart.mapPartitionsWithIndex {
       case (partIndex, elements) =>
        val rows = Range(0, elementsPerPartition)
-           .map(x => (Math.random(), x))
-           .map {
-             case ((randomNumber, rowValue)) =>
-               (
-                 randomNumber,
-                 (partIndex * elementsPerPartition.toLong + rowValue, //index of element
-                 rowValue + partIndex * elementsPerPartition))
-           }
+          .map(x => (Math.random(), x))
+          .map {
+            case ((randomNumber, rowValue)) =>
+              (randomNumber,
+                //index of element
+                (partIndex * elementsPerPartition.toLong + rowValue,
+                rowValue + partIndex * elementsPerPartition))
+          }
         rows.toIterator
     }.sortByKey().values
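Both data generators above shuffle values with the same trick: pair each element with Math.random(), sort by that random key, then discard it. In miniature:

    // A uniformly shuffled permutation of 0..9: sort by the random key, drop it.
    val shuffled = Range(0, 10).map(x => (Math.random(), x)).sortBy(_._1).map(_._2)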

src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala

Lines changed: 3 additions & 2 deletions
@@ -12,8 +12,9 @@ class JoinTest extends FunSuite with SharedSparkContext {
     val largeRDD: RDD[(String, Double)] =
       sc.parallelize(keySet.flatMap{ letter =>
         Range(1, 50).map(i => (letter, letter.hashCode() / i.toDouble))})
-    val result: RDD[(String, (Double, Int))] = RDDJoinExamples.manualBroadCastHashJoin(
-      largeRDD, smallRDD)
+    val result: RDD[(String, (Double, Int))] =
+      RDDJoinExamples.manualBroadCastHashJoin(
+        largeRDD, smallRDD)
     val nativeJoin: RDD[(String, (Double, Int))] = largeRDD.join(smallRDD)
 
     assert(result.subtract(nativeJoin).count == 0)
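For context, the test compares manualBroadCastHashJoin against Spark's native shuffle join. Below is a sketch of the general pattern such a helper implements, collecting the small side to the driver, broadcasting it, and joining map-side with no shuffle; it illustrates the technique and is not necessarily the exact body of RDDJoinExamples.manualBroadCastHashJoin:

    import scala.reflect.ClassTag
    import org.apache.spark.rdd.RDD

    def broadcastHashJoinSketch[K: ClassTag, V1: ClassTag, V2: ClassTag](
        large: RDD[(K, V1)], small: RDD[(K, V2)]): RDD[(K, (V1, V2))] = {
      // Ship the small RDD to every executor once as a lookup map.
      val smallMap = large.sparkContext.broadcast(small.collectAsMap())
      // Each partition of the large RDD joins locally; inner-join semantics.
      large.flatMap { case (k, v1) =>
        smallMap.value.get(k).map(v2 => (k, (v1, v2)))
      }
    }

This only pays off when the small side fits comfortably in memory on the driver and each executor, which is what lets result match largeRDD.join(smallRDD) here without shuffling the large side.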
