Commit 3386d5d

Merge pull request high-performance-spark#86 from holdenk/fix-more-style

Fix more style

2 parents 6cc6475 + cd69b2e

6 files changed: +26 -19 lines changed

high_performance_pyspark/simple_perf.py

Lines changed: 10 additions & 7 deletions

@@ -40,16 +40,19 @@ def generate_scale_data(sqlCtx, rows, numCols):
     scalasc = jsc.sc()
     gateway = sc._gateway
     # Call a java method that gives us back an RDD of JVM Rows (Int, Double)
-    # While Python RDDs are wrapped Java RDDs (even of Rows) the contents are different, so we
-    # can't directly wrap this.
+    # While Python RDDs are wrapped Java RDDs (even of Rows) the contents are
+    # different, so we can't directly wrap this.
     # This returns a Java RDD of Rows - normally it would better to
-    # return a DataFrame directly, but for illustration we will work with an RDD
-    # of Rows.
-    java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData. \
-      generateMiniScaleRows(scalasc, rows, numCols)
+    # return a DataFrame directly, but for illustration we will work
+    # with an RDD of Rows.
+    java_rdd = (gateway.jvm.com.highperformancespark.examples.
+                tools.GenerateScalingData.
+                generateMiniScaleRows(scalasc, rows, numCols))
     # Schemas are serialized to JSON and sent back and forth
     # Construct a Python Schema and turn it into a Java Schema
-    schema = StructType([StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())])
+    schema = StructType([
+        StructField("zip", IntegerType()),
+        StructField("fuzzyness", DoubleType())])
     # 2.1 / pre-2.1
     try:
         jschema = javaSqlCtx.parseDataType(schema.json())
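The hunk above reflows a Py4J interop pattern: Python reaches JVM classes through the gateway hanging off the SparkContext, and schemas cross the Python/JVM boundary as JSON strings rather than as wrapped objects. A minimal sketch of that pattern, assuming an existing SparkContext `sc` and a pre-2.1-era SQLContext `sqlCtx` as in the try block above (`_gateway` and `_ssql_ctx` are private PySpark attributes):

from pyspark.sql.types import (
    DoubleType, IntegerType, StructField, StructType)

def java_schema_from_python(sqlCtx, sc):
    # Reach into the JVM through the Py4J gateway on the SparkContext.
    gateway = sc._gateway
    # Build the schema on the Python side...
    schema = StructType([
        StructField("zip", IntegerType()),
        StructField("fuzzyness", DoubleType())])
    # ...then serialize it to JSON and have the Java SQLContext parse it,
    # since schemas are sent back and forth as JSON.
    javaSqlCtx = sqlCtx._ssql_ctx
    return javaSqlCtx.parseDataType(schema.json())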

src/main/java/com/highperformancespark/examples/JavaInterop.java

Lines changed: 7 additions & 5 deletions

@@ -16,7 +16,8 @@
 public class JavaInterop {

   //tag::realClassTag[]
-  public static JavaPairRDD wrapPairRDD(RDD<Tuple2<String, Object>> rdd) {
+  public static JavaPairRDD wrapPairRDD(
+      RDD<Tuple2<String, Object>> rdd) {
     // Construct the class tags
     ClassTag<String> strCt = ClassTag$.MODULE$.apply(String.class);
     ClassTag<Long> longCt = ClassTag$.MODULE$.apply(scala.Long.class);
@@ -25,10 +26,11 @@ public static JavaPairRDD wrapPairRDD(RDD<Tuple2<String, Object>> rdd) {
   //end::realClassTag[]

   //tag::fakeClassTag[]
-  public static JavaPairRDD wrapPairRDDFakeCt(RDD<Tuple2<String, Object>> rdd) {
-    // Construct the class tags by casting AnyRef - this would be more commonly done with
-    // generic or templated code where we can't explicitly construct the correct class tag
-    // as using fake class tags may result in degraded performance.
+  public static JavaPairRDD wrapPairRDDFakeCt(
+      RDD<Tuple2<String, Object>> rdd) {
+    // Construct the class tags by casting AnyRef - this would be more commonly done
+    // with generic or templated code where we can't explicitly construct the correct
+    // class tag as using fake class tags may result in degraded performance.
     ClassTag<Object> fake = ClassTag$.MODULE$.AnyRef();
     return new JavaPairRDD(rdd, fake, fake);
   }

src/main/java/com/highperformancespark/examples/WordCount.java

Lines changed: 3 additions & 2 deletions

@@ -16,9 +16,10 @@ public final class WordCount {
   public static void main(String[] args) throws Exception {
     JavaSparkContext jsc = new JavaSparkContext();
     JavaRDD<String> lines = jsc.textFile(args[0]);
-    JavaRDD<String> words = lines.flatMap(e -> Arrays.asList(pattern.split(e)).iterator());
+    JavaRDD<String> words = lines.flatMap(e -> Arrays.asList(
+        pattern.split(e)).iterator());
     JavaPairRDD<String, Integer> wordsIntial = words.mapToPair(
-      e -> new Tuple2<String, Integer>(e, 1));
+        e -> new Tuple2<String, Integer>(e, 1));
   }
 }
 //end::wordCount[]
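For reference, the same word-count steps in PySpark - a sketch assuming a SparkContext `sc` and a hypothetical input path; the Java snippet above stops at the initial (word, 1) pairs, and a reduceByKey would complete the count:

import re

pattern = re.compile(" ")
lines = sc.textFile("input.txt")  # hypothetical input path
# Split each line into words (the flatMap step above).
words = lines.flatMap(lambda line: pattern.split(line))
# Pair each word with an initial count of 1 (the mapToPair step above).
words_initial = words.map(lambda word: (word, 1))
# Sum the counts per word - this step is not in the Java snippet.
counts = words_initial.reduceByKey(lambda a, b: a + b)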

src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java

Lines changed: 3 additions & 1 deletion

@@ -10,7 +10,9 @@ public class JavaUDFs {

   public static void setupUDFs(SQLContext sqlContext) {
     //tag::basicUDF[]
-    sqlContext.udf().register("strlen", (String s) -> s.length(), DataTypes.StringType);
+    sqlContext.udf()
+      .register("strlen",
+                (String s) -> s.length(), DataTypes.StringType);
     //end::basicUDF[]
   }
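The same UDF registered from Python - a sketch assuming a Spark 1.x-era SQLContext named `sqlContext`; note that the declared return type should match what the function actually returns (an integer for a length):

from pyspark.sql.types import IntegerType

# registerFunction is the Spark 1.x API for registering Python UDFs.
sqlContext.registerFunction("strlen", lambda s: len(s), IntegerType())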

src/main/scala/com/high-performance-spark-examples/goldilocks/GoldilocksFirstTry.scala

Lines changed: 2 additions & 2 deletions

@@ -10,7 +10,7 @@ import scala.collection.{Map, mutable}
 object GoldilocksGroupByKey {
   //tag::groupByKey[]
   def findRankStatistics(
-    dataFrame: DataFrame ,
+    dataFrame: DataFrame,
     ranks: List[Long]): Map[Int, Iterable[Double]] = {
     require(ranks.forall(_ > 0))
     //Map to column index, value pairs
@@ -199,7 +199,7 @@ object GoldilocksFirstTry {
   def aggregateColumnFrequencies (partitionIndex : Int,
     valueColumnPairs : Iterator[(Double, Int)]) = {
     val columnsFreq : Array[Long] = valueColumnPairs.aggregate(zero)(
-      (a : Array[Long], v : (Double ,Int)) => {
+      (a : Array[Long], v : (Double, Int)) => {
         val (value, colIndex) = v
         //increment the cell in the zero array corresponding to this column index
         a(colIndex) = a(colIndex) + 1L
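The seqOp touched up here fills a per-column count array, and PySpark's RDD.aggregate follows the same zero/seqOp/combOp shape. A sketch assuming an RDD of (value, columnIndex) pairs and a hypothetical num_cols:

def count_column_frequencies(value_column_pairs, num_cols):
    # Zero value: one counter per column.
    zero = [0] * num_cols

    def seq_op(acc, pair):
        value, col_index = pair
        # Increment the cell corresponding to this pair's column index.
        acc[col_index] += 1
        return acc

    def comb_op(left, right):
        # Merge per-partition count arrays elementwise.
        return [a + b for a, b in zip(left, right)]

    return value_column_pairs.aggregate(zero, seq_op, comb_op)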

src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala

Lines changed: 1 addition & 2 deletions

@@ -1,4 +1,3 @@
-
 package com.highperformancespark.examples.transformations

 import org.apache.spark.rdd.RDD
@@ -26,7 +25,7 @@ object NarrowAndWide {
   def simpleSparkProgram(rdd : RDD[Double]): Long ={
     //stage1
     rdd.filter(_< 1000.0)
-      .map(x => (x , x) )
+      .map(x => (x, x) )
     //stage2
       .groupByKey()
       .map{ case(value, groups) => (groups.sum, value)}
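The method above mixes narrow and wide transformations: filter and map stay within stage 1, while groupByKey forces a shuffle that begins stage 2. The same pipeline in PySpark - a sketch assuming an RDD of doubles named rdd, with a count() added at the end since the Scala method returns a Long from code not shown in this hunk:

result = (rdd
    .filter(lambda x: x < 1000.0)          # stage 1: narrow
    .map(lambda x: (x, x))                 # stage 1: narrow
    .groupByKey()                          # shuffle - stage boundary
    .map(lambda kv: (sum(kv[1]), kv[0]))   # stage 2
    .count())                              # assumed final action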
