
Commit 677dc8d

Start fixing a lot of long lines
1 parent 5d1895e commit 677dc8d

File tree

15 files changed: +191 -99 lines changed


src/main/scala/com/high-performance-spark-examples/dataframe/HappyPandas.scala

Lines changed: 19 additions & 7 deletions
@@ -1,5 +1,6 @@
 /**
- * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived.
+ * Happy Panda Example for DataFrames. This computes the % of happy pandas and
+ * is a very contrived example (sorry!).
  */
 package com.highperformancespark.examples.dataframe
 
@@ -30,7 +31,8 @@ object HappyPandas {
     val session = SparkSession.builder()
       .enableHiveSupport()
       .getOrCreate()
-    // Import the implicits, unlike in core Spark the implicits are defined on the context
+    // Import the implicits, unlike in core Spark the implicits are defined
+    // on the context.
     import session.implicits._
     //end::createSparkSession[]
     session
@@ -42,7 +44,8 @@ object HappyPandas {
   def sqlContext(sc: SparkContext): SQLContext = {
     //tag::createSQLContext[]
     val sqlContext = new SQLContext(sc)
-    // Import the implicits, unlike in core Spark the implicits are defined on the context
+    // Import the implicits, unlike in core Spark the implicits are defined
+    // on the context.
     import sqlContext.implicits._
     //end::createSQLContext[]
     sqlContext
@@ -54,7 +57,8 @@ object HappyPandas {
   def hiveContext(sc: SparkContext): HiveContext = {
     //tag::createHiveContext[]
     val hiveContext = new HiveContext(sc)
-    // Import the implicits, unlike in core Spark the implicits are defined on the context
+    // Import the implicits, unlike in core Spark the implicits are defined
+    // on the context.
     import hiveContext.implicits._
     //end::createHiveContext[]
     hiveContext
@@ -63,7 +67,8 @@ object HappyPandas {
   /**
    * Illustrate loading some JSON data.
    */
-  def loadDataSimple(sc: SparkContext, session: SparkSession, path: String): DataFrame = {
+  def loadDataSimple(sc: SparkContext, session: SparkSession, path: String):
+      DataFrame = {
     //tag::loadPandaJSONSimple[]
     val df1 = session.read.json(path)
     //end::loadPandaJSONSimple[]
@@ -94,7 +99,11 @@ object HappyPandas {
    * @param happyPandas number of happy pandas in this place
    * @param totalPandas total number of pandas in this place
    */
-  case class PandaInfo(place: String, pandaType: String, happyPandas: Integer, totalPandas: Integer)
+  case class PandaInfo(
+    place: String,
+    pandaType: String,
+    happyPandas: Integer,
+    totalPandas: Integer)
 
   /**
    * Gets the percentage of happy pandas per place.
@@ -103,7 +112,10 @@ object HappyPandas {
    * @return Returns DataFrame of (place, percentage of happy pandas)
    */
   def happyPandasPercentage(pandaInfo: DataFrame): DataFrame = {
-    pandaInfo.select(pandaInfo("place"), (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy"))
+    pandaInfo.select(
+      pandaInfo("place"),
+      (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy")
+    )
   }
 
   //tag::encodePandaType[]
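
For readers skimming the diff, the wrapped happyPandasPercentage select is just a ratio over two integer columns. A minimal usage sketch (not part of the commit; assumes a SparkSession named session with import session.implicits._ in scope):

// Illustrative sketch only - the data and names below are assumptions, not repo code.
val pandaInfo = Seq(
  ("toronto", "giant", 1, 2),
  ("san diego", "red", 2, 3)
).toDF("place", "pandaType", "happyPandas", "totalPandas")

// Same shape as the wrapped select in the diff: place plus the happy-panda ratio.
val percentHappy = pandaInfo.select(
  pandaInfo("place"),
  (pandaInfo("happyPandas") / pandaInfo("totalPandas")).as("percentHappy"))
percentHappy.show()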

src/main/scala/com/high-performance-spark-examples/errors/throws.scala

Lines changed: 8 additions & 4 deletions
@@ -7,7 +7,8 @@ object Throws {
   def throwInner(sc: SparkContext) = {
     //tag::throwInner1[]
     val data = sc.parallelize(List(1, 2, 3))
-    val transform1 = data.map(x => x/0) // Will throw an exception when forced to evaluate
+    // Will throw an exception when forced to evaluate
+    val transform1 = data.map(x => x/0)
     val transform2 = transform1.map(x => x + 1)
     transform2.collect() // Forces evaluation
     //end::throwInner1[]
@@ -17,7 +18,8 @@ object Throws {
     //tag::throwOuter1[]
     val data = sc.parallelize(List(1, 2, 3))
     val transform1 = data.map(x => x + 1)
-    val transform2 = transform1.map(x => x/0) // Will throw an exception when forced to evaluate
+    // Will throw an exception when forced to evaluate
+    val transform2 = transform1.map(x => x/0)
     transform2.collect() // Forces evaluation
     //end::throwOuter1[]
   }
@@ -35,15 +37,17 @@ object Throws {
   //tag::badEx3[]
   def throwInner2(sc: SparkContext) = {
     val data = sc.parallelize(List(1, 2, 3))
-    val transform1 = data.map(divZero) // Will throw an exception when forced to evaluate
+    // Will throw an exception when forced to evaluate
+    val transform1 = data.map(divZero)
     val transform2 = transform1.map(add1)
     transform2.collect() // Forces evaluation
   }
 
   def throwOuter2(sc: SparkContext) = {
     val data = sc.parallelize(List(1, 2, 3))
     val transform1 = data.map(add1)
-    val transform2 = transform1.map(divZero) // Will throw an exception when forced to evaluate
+    // Will throw an exception when forced to evaluate
+    val transform2 = transform1.map(divZero)
     transform2.collect() // Forces evaluation
   }
   //end::badEx3
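
The point of these throwInner/throwOuter examples is unchanged by the re-wrap: the failing map raises nothing when it is defined, only when an action forces evaluation. A small sketch of observing that, assuming a live SparkContext named sc:

import scala.util.Try

val data = sc.parallelize(List(1, 2, 3))
val willFail = data.map(x => x / 0)      // no exception yet - transformations are lazy
val stillLazy = willFail.map(x => x + 1) // still no exception
// The SparkException (wrapping the ArithmeticException) only appears here:
val result = Try(stillLazy.collect())
println(result.isFailure) // true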

src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala

Lines changed: 12 additions & 7 deletions
@@ -34,7 +34,8 @@ class HardCodedWordCountStage(override val uid: String) extends Transformer {
     val idx = schema.fieldIndex("happy_pandas")
     val field = schema.fields(idx)
     if (field.dataType != StringType) {
-      throw new Exception(s"Input type ${field.dataType} did not match input type StringType")
+      throw new Exception(
+        s"Input type ${field.dataType} did not match input type StringType")
     }
     // Add the return field
     schema.add(StructField("happy_panda_counts", IntegerType, false))
@@ -71,7 +72,8 @@ class ConfigurableWordCount(override val uid: String) extends Transformer {
     val idx = schema.fieldIndex($(inputCol))
     val field = schema.fields(idx)
     if (field.dataType != StringType) {
-      throw new Exception(s"Input type ${field.dataType} did not match input type StringType")
+      throw new Exception(
+        s"Input type ${field.dataType} did not match input type StringType")
     }
     // Add the return field
     schema.add(StructField($(outputCol), IntegerType, false))
@@ -91,7 +93,8 @@ trait SimpleIndexerParams extends Params {
   final val outputCol = new Param[String](this, "outputCol", "The output column")
 }
 
-class SimpleIndexer(override val uid: String) extends Estimator[SimpleIndexerModel] with SimpleIndexerParams {
+class SimpleIndexer(override val uid: String)
+    extends Estimator[SimpleIndexerModel] with SimpleIndexerParams {
 
   def setInputCol(value: String) = set(inputCol, value)
 
@@ -108,7 +111,8 @@ class SimpleIndexer(override val uid: String) extends Estimator[SimpleIndexerMod
     val idx = schema.fieldIndex($(inputCol))
     val field = schema.fields(idx)
     if (field.dataType != StringType) {
-      throw new Exception(s"Input type ${field.dataType} did not match input type StringType")
+      throw new Exception(
+        s"Input type ${field.dataType} did not match input type StringType")
     }
     // Add the return field
     schema.add(StructField($(outputCol), IntegerType, false))
@@ -122,8 +126,8 @@ class SimpleIndexer(override val uid: String) extends Estimator[SimpleIndexerMod
     }
   }
 
-class SimpleIndexerModel(
-  override val uid: String, words: Array[String]) extends Model[SimpleIndexerModel] with SimpleIndexerParams {
+class SimpleIndexerModel(override val uid: String, words: Array[String])
+    extends Model[SimpleIndexerModel] with SimpleIndexerParams {
 
   override def copy(extra: ParamMap): SimpleIndexerModel = {
     defaultCopy(extra)
@@ -137,7 +141,8 @@ class SimpleIndexerModel(
     val idx = schema.fieldIndex($(inputCol))
     val field = schema.fields(idx)
     if (field.dataType != StringType) {
-      throw new Exception(s"Input type ${field.dataType} did not match input type StringType")
+      throw new Exception(
+        s"Input type ${field.dataType} did not match input type StringType")
     }
     // Add the return field
     schema.add(StructField($(outputCol), IntegerType, false))
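
Every hunk in this file re-wraps the same transformSchema validation pattern: look up the input column, reject anything that is not a StringType, then append the output field. A standalone sketch of that pattern (validateAndAddCount is a hypothetical helper, not part of the commit):

import org.apache.spark.sql.types._

def validateAndAddCount(
    schema: StructType, inputCol: String, outputCol: String): StructType = {
  val field = schema.fields(schema.fieldIndex(inputCol))
  if (field.dataType != StringType) {
    throw new Exception(
      s"Input type ${field.dataType} did not match input type StringType")
  }
  // Append the integer output column to the schema.
  schema.add(StructField(outputCol, IntegerType, false))
}

// e.g. validateAndAddCount(df.schema, "happy_pandas", "happy_panda_counts")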

src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala

Lines changed: 6 additions & 5 deletions
@@ -32,8 +32,8 @@ class SimpleNaiveBayes(val uid: String)
     import ds.sparkSession.implicits._
     ds.cache()
     // Note: you can use getNumClasses & extractLabeledPoints to get an RDD instead
-    // Using the RDD approach is common when integrating with legacy machine learning code
-    // or iterative algorithms which can create large query plans.
+    // Using the RDD approach is common when integrating with legacy machine
+    // learning code or iterative algorithms which can create large query plans.
     // Compute the number of documents
     val numDocs = ds.count
     // Get the number of classes.
@@ -116,9 +116,10 @@ case class SimpleNaiveBayesModel(
   val onesVec = Vectors.dense(Array.fill(theta.numCols)(1.0))
   val negThetaSum: Array[Double] = negTheta.multiply(onesVec).toArray
 
-  // Here is the prediciton functionality you need to implement - for ClassificationModels
-  // transform automatically wraps this - but if you might benefit from broadcasting your model or
-  // other optimizations you can also override transform.
+  // Here is the prediciton functionality you need to implement - for
+  // ClassificationModels transform automatically wraps this.
+  // If you might benefit from broadcasting your model or other optimizations you
+  // can override transform and place your desired logic there.
   def predictRaw(features: Vector): Vector = {
     // Toy implementation - use BLAS or similar instead
     // the summing of the three vectors but the functionality isn't exposed.

src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala

Lines changed: 4 additions & 2 deletions
@@ -92,14 +92,16 @@ object SimplePipeline {
 
   def reverseStringIndexer(sbModel: StringIndexerModel) = {
     //tag::indexToString[]
-    // Construct the inverse of the model to go from index-to-string after prediction.
+    // Construct the inverse of the model to go from index-to-string
+    // after prediction.
     val sbInverse = new IndexToString()
     sbInverse.setInputCol("prediction")
     sbInverse.setLabels(sbModel.labels)
     //end::indexToString[]
     // Or if meta data is present
     //tag::indexToStringMD[]
-    // Construct the inverse of the model to go from index-to-string after prediction.
+    // Construct the inverse of the model to go from
+    // index-to-string after prediction.
     val sbInverseMD = new IndexToString()
     sbInverseMD.setInputCol("prediction")
     //end::indexToStringMD[]
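
The two comments being wrapped describe the same round trip: a StringIndexerModel maps labels to indices, and an IndexToString seeded with that model's labels maps predictions back. A sketch under those assumptions (df and predictions are hypothetical DataFrames: df with a string "label" column, predictions with a numeric "prediction" column):

import org.apache.spark.ml.feature.{IndexToString, StringIndexer}

val indexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("labelIndex")
val indexerModel = indexer.fit(df)

// Invert using the labels captured by the fitted StringIndexerModel.
val inverse = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(indexerModel.labels)
val withLabels = inverse.transform(predictions)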

src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala

Lines changed: 6 additions & 3 deletions
@@ -10,7 +10,8 @@ import org.apache.spark.rdd.RDD
 //tag::imports[]
 import com.github.fommil.netlib.BLAS.{getInstance => blas}
 import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, LogisticRegressionModel}
+import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS,
+  LogisticRegressionModel}
 // Rename Vector to SparkVector to avoid conflicts with Scala's Vector class
 import org.apache.spark.mllib.linalg.{Vector => SparkVector}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -109,13 +110,15 @@ object GoldilocksMLlib {
     // Vector size is 100 - we use this to build a transformer on top of WVM that
     // works on sentences.
     val vectorSize = 100
-    // The transform function works on a per-word basis, but we have sentences as input.
+    // The transform function works on a per-word basis, but we have
+    // sentences as input.
     tokenized.map{words =>
       // If there is nothing in the sentence output a null vector
      if (words.isEmpty) {
        Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double])
      } else {
-        // If there are sentences construct a running sum of the vectors for each word
+        // If there are sentences construct a running sum of the
+        // vectors for each word
        val sum = Array[Double](vectorSize)
        words.foreach { word =>
          blas.daxpy(
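
The truncated hunk above is building a sentence vector by summing per-word Word2Vec vectors with BLAS. A self-contained sketch of that accumulation, assuming a fitted mllib Word2VecModel named model, 100-dimensional vectors, and that every word is in the model's vocabulary:

import com.github.fommil.netlib.BLAS.{getInstance => blas}
import org.apache.spark.mllib.linalg.{Vector => SparkVector, Vectors}

val vectorSize = 100
def sentenceVector(words: Seq[String]): SparkVector = {
  if (words.isEmpty) {
    Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double])
  } else {
    val sum = new Array[Double](vectorSize)
    // sum += 1.0 * wordVector, for every word in the sentence
    words.foreach { word =>
      blas.daxpy(vectorSize, 1.0, model.transform(word).toArray, 1, sum, 1)
    }
    // Scale by 1/numWords to turn the running sum into an average.
    blas.dscal(vectorSize, 1.0 / words.size, sum, 1)
    Vectors.dense(sum)
  }
}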

src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala

Lines changed: 4 additions & 2 deletions
@@ -25,12 +25,14 @@ object PipeExample {
     // Copy our script to the worker nodes with sc.addFile
     // Add file requires absolute paths
     val distScriptName = "ghinfo.pl"
-    val localScript = System.getProperty("user.dir") + "/src/main/perl/" + distScriptName
+    val userDir = System.getProperty("user.dir")
+    val localScript = s"${userDir}/src/main/perl/${distScriptName}"
     val addedFile = sc.addFile(localScript)
 
     // Pass enviroment variables to our worker
     val enviromentVars = Map("user" -> "apache", "repo" -> "spark")
-    val result = input.map(x => x.toString).pipe(SparkFiles.get(distScriptName), enviromentVars)
+    val result = input.map(x => x.toString)
+      .pipe(SparkFiles.get(distScriptName), enviromentVars)
     // Parse the results
     result.map{record =>
       val elems: Array[String] = record.split(" ")
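
For the pipe call being re-wrapped: each RDD element is written to the external process's stdin, one per line, and each line the process writes to stdout becomes an element of the resulting RDD[String], with the supplied map exported as environment variables. A minimal sketch of the same mechanics with a plain shell command instead of the Perl script (assumes a live SparkContext sc):

val sample = sc.parallelize(Seq("spark", "pandas", "pipe"))
// awk reads each element on stdin and prints its length on stdout.
val lineLengths = sample.pipe(Seq("awk", "{print length($0)}"))
lineLengths.collect().foreach(println)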

src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala

Lines changed: 9 additions & 4 deletions
@@ -37,16 +37,20 @@ object SimplePerfTest {
     run(sc, sparkSession, scalingFactor, size)
   }
 
-  def run(sc: SparkContext, session: SparkSession, scalingFactor: Long, size: Int) = {
+  def run(sc: SparkContext, session: SparkSession,
+      scalingFactor: Long, size: Int) = {
     import session.implicits._
-    val inputRDD = GenerateScalingData.generateFullGoldilocks(sc, scalingFactor, size)
+    val inputRDD = GenerateScalingData.generateFullGoldilocks(
+      sc, scalingFactor, size)
     val pairRDD = inputRDD.map(p => (p.zip.toInt, p.attributes(0)))
     pairRDD.cache()
     pairRDD.count()
     val rddTimeings = 1.to(10).map(x => time(testOnRDD(pairRDD)))
     val groupTimeings = 1.to(10).map(x => time(groupOnRDD(pairRDD)))
     val df = inputRDD.toDF()
-    val inputDataFrame = df.select(df("zip").cast(IntegerType), df("attributes")(0).as("fuzzyness").cast(DoubleType))
+    val inputDataFrame = df.select(
+      df("zip").cast(IntegerType),
+      df("attributes")(0).as("fuzzyness").cast(DoubleType))
     inputDataFrame.cache()
     inputDataFrame.count()
     val dataFrameTimeings = 1.to(10).map(x => time(testOnDataFrame(inputDataFrame)))
@@ -56,7 +60,8 @@ object SimplePerfTest {
   }
 
   def testOnRDD(rdd: RDD[(Int, Double)]) = {
-    rdd.map{case (x, y) => (x, (y, 1))}.reduceByKey{case (x, y) => (x._1 + y._1, x._2 + y._2)}.count()
+    rdd.map{case (x, y) => (x, (y, 1))}
+      .reduceByKey{case (x, y) => (x._1 + y._1, x._2 + y._2)}.count()
   }
 
   def groupOnRDD(rdd: RDD[(Int, Double)]) = {
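
The re-wrapped testOnRDD line uses the usual (sum, count) accumulator pattern with reduceByKey. A sketch of extending that pattern to a per-key mean, assuming an RDD[(Int, Double)] named pairRDD (illustrative only, not part of the commit):

val means = pairRDD
  .map { case (k, v) => (k, (v, 1)) }
  .reduceByKey { case ((s1, c1), (s2, c2)) => (s1 + s2, c1 + c2) }
  .mapValues { case (sum, count) => sum / count }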

src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala

Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,5 @@
 /**
- * Happy Panda Example for DataFrames. Computes the % of happy pandas. Very contrived.
+ * Streaming Pandas Example with the old DStream APIs.
  */
 package com.highperformancespark.examples.streaming
 
@@ -42,7 +42,8 @@ object DStreamExamples {
     //end::sscRecover[]
   }
 
-  def fileAPIExample(ssc: StreamingContext, path: String): DStream[(Long, String)] = {
+  def fileAPIExample(ssc: StreamingContext, path: String):
+      DStream[(Long, String)] = {
     //tag::file[]
     // You don't need to write the types of the InputDStream but it for illustration
     val inputDStream: InputDStream[(LongWritable, Text)] =
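
The re-wrapped fileAPIExample signature returns a DStream[(Long, String)] built from a Hadoop file stream of (LongWritable, Text). A sketch of that conversion, assuming a StreamingContext named ssc and a directory path to watch (not repo code):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.{DStream, InputDStream}

val inputDStream: InputDStream[(LongWritable, Text)] =
  ssc.fileStream[LongWritable, Text, TextInputFormat](path)
// Convert the Hadoop writables into plain (Long, String) pairs.
val converted: DStream[(Long, String)] =
  inputDStream.map { case (offset, line) => (offset.get(), line.toString) }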

src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala

Lines changed: 4 additions & 2 deletions
@@ -13,15 +13,17 @@ import com.typesafe.scalalogging.LazyLogging
 
 object FilterInvalidPandas extends LazyLogging {
 
-  def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = {
+  def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long],
+      input: RDD[RawPanda]) = {
     //tag::broadcast[]
     val invalid = HashSet() ++ invalidPandas
     val invalidBroadcast = sc.broadcast(invalid)
     input.filter{panda => !invalidBroadcast.value.contains(panda.id)}
     //end::broadcast[]
   }
 
-  def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], input: RDD[RawPanda]) = {
+  def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long],
+      input: RDD[RawPanda]) = {
     //tag::broadcastAndLog[]
     val invalid = HashSet() ++ invalidPandas
     val invalidBroadcast = sc.broadcast(invalid)
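
Both wrapped methods broadcast the small invalid-id set and filter the RDD against it, so only the set is shipped to executors. A hypothetical call site, assuming a SparkContext sc and an RDD[RawPanda] named pandas:

val invalidIds = List(1L, 7L, 42L)
val validPandas = FilterInvalidPandas.filterInvalidPandas(sc, invalidIds, pandas)
// Only the broadcast HashSet travels with the tasks; the RDD is never collected.
validPandas.count()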
