Skip to content

Commit 4cb85ce

Browse files
committed
Merge pull request rathboma#2 from helenahm/master
spark
2 parents e4b5be4 + 51e06c3 commit 4cb85ce

File tree

18 files changed

+608
-0
lines changed

18 files changed

+608
-0
lines changed

spark-scala/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
2+
## Compiling
3+
``` bash
4+
mvn compile
5+
```
6+
7+
## Testing
8+
``` bash
9+
mvn test
10+
```
11+
12+
## Running
13+
``` bash
14+
mvn assembly:single
```

``` bash
/usr/bin/spark-submit --class main.scala.com.matthewrathbone.spark.Main --master local ./target/scala-spark-1.0-SNAPSHOT-jar-with-dependencies.jar /path/to/transactions_test.txt /path/to/users_test.txt /path/to/output_folder
```

spark-scala/pom.xml

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3+
4+
<build>
5+
<sourceDirectory>src/main/scala/com/matthewrathbone/spark</sourceDirectory>
6+
<testSourceDirectory>src/test/scala/com/matthewrathbone/spark</testSourceDirectory>
7+
<pluginManagement>
8+
<plugins>
9+
<plugin>
10+
<groupId>org.scala-tools</groupId>
11+
<artifactId>maven-scala-plugin</artifactId>
12+
<version>2.9.1</version>
13+
</plugin>
14+
<plugin>
15+
<groupId>org.apache.maven.plugins</groupId>
16+
<artifactId>maven-compiler-plugin</artifactId>
17+
<version>2.0.2</version>
18+
</plugin>
19+
</plugins>
20+
</pluginManagement>
21+
<plugins>
22+
<plugin>
23+
<groupId>org.scala-tools</groupId>
24+
<artifactId>maven-scala-plugin</artifactId>
25+
<executions>
26+
<execution>
27+
<goals>
28+
<goal>compile</goal>
29+
<goal>testCompile</goal>
30+
</goals>
31+
</execution>
32+
</executions>
33+
<configuration>
34+
<scalaVersion>2.10.3</scalaVersion>
35+
</configuration>
36+
</plugin>
37+
<plugin>
38+
<groupId>org.scalatest</groupId>
39+
<artifactId>scalatest-maven-plugin</artifactId>
40+
<version>1.0</version>
41+
<configuration>
42+
<reportsDirectory>target/surefire-reports</reportsDirectory>
43+
<stdout>W</stdout> <!-- Skip coloring output -->
44+
</configuration>
45+
<executions>
46+
<execution>
47+
<id>scala-test</id>
48+
<goals>
49+
<goal>test</goal>
50+
</goals>
51+
</execution>
52+
</executions>
53+
</plugin>
54+
<plugin>
55+
<groupId>org.apache.maven.plugins</groupId>
56+
<artifactId>maven-surefire-plugin</artifactId>
57+
<version>2.8</version>
58+
</plugin>
59+
<plugin>
60+
<artifactId>maven-assembly-plugin</artifactId>
61+
<configuration>
62+
<descriptorRefs>
63+
<descriptorRef>jar-with-dependencies</descriptorRef>
64+
</descriptorRefs>
65+
<archive>
66+
<manifest>
67+
<mainClass>main.scala.com.matthewrathbone.spark.Main</mainClass>
68+
</manifest>
69+
</archive>
70+
</configuration>
71+
</plugin>
72+
</plugins>
73+
</build>
74+
75+
<modelVersion>4.0.0</modelVersion>
76+
<groupId>base</groupId>
77+
<artifactId>scala-spark</artifactId>
78+
<packaging>jar</packaging>
79+
<version>1.0-SNAPSHOT</version>
80+
<properties>
81+
<maven.compiler.source>1.7</maven.compiler.source>
82+
<maven.compiler.target>1.7</maven.compiler.target>
83+
</properties>
84+
<name>scala-spark</name>
85+
<url>http://maven.apache.org</url>
86+
<dependencies>
87+
<dependency>
88+
<groupId>org.scala-lang</groupId>
89+
<artifactId>scala-library</artifactId>
90+
<version>2.10.3</version>
91+
</dependency>
92+
<dependency>
93+
<groupId>org.scalatest</groupId>
94+
<artifactId>scalatest_2.10</artifactId>
95+
<version>3.0.0-M9</version>
96+
</dependency>
97+
<dependency>
98+
<groupId>org.apache.spark</groupId>
99+
<artifactId>spark-core_2.10</artifactId>
100+
<version>1.2.0</version>
101+
</dependency>
102+
<dependency>
103+
<groupId>junit</groupId>
104+
<artifactId>junit</artifactId>
105+
<version>4.8.2</version>
106+
<scope>test</scope>
107+
</dependency>
108+
</dependencies>
109+
<repositories>
110+
<repository>
111+
<id>cloudera</id>
112+
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
113+
</repository>
114+
<repository>
115+
<id>conjars.org</id>
116+
<url>http://conjars.org/repo</url>
117+
</repository>
118+
</repositories>
119+
</project>
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package main.scala.com.matthewrathbone.spark
2+
3+
import org.apache.spark.SparkContext
4+
import org.apache.spark.SparkContext._
5+
import org.apache.spark.SparkConf
6+
import org.apache.spark.rdd.RDD
7+
import scala.collection.Map
8+
9+
class ExampleJob(sc: SparkContext) {

  /** Reads the two tab-separated input files, joins them, and returns the
   *  aggregated counts as string pairs. This is the method the tests exercise.
   *
   *  @param t path to the transactions file; column 2 is used as the join key
   *           and column 1 as the value (both parsed as Int)
   *  @param u path to the users file; column 0 is the join key (Int) and
   *           column 3 is the joined attribute
   *  @return (key, count) pairs from [[processData]], both formatted as strings
   */
  def run(t: String, u: String): RDD[(String, String)] = {
    // Key transactions by the id in column 2 so they can join against users.
    // (Renamed the lambda parameter: the original `t` shadowed the method's
    // `t` path parameter.)
    val transactionPairs = sc.textFile(t).map { line =>
      val fields = line.split("\t")
      (fields(2).toInt, fields(1).toInt)
    }

    // Key users by the id in column 0, keeping the attribute in column 3.
    val userPairs = sc.textFile(u).map { line =>
      val fields = line.split("\t")
      (fields(0).toInt, fields(3))
    }

    // countByKey returns a local Map, so parallelize it back into an RDD.
    // No `return`: the last expression is the result in idiomatic Scala.
    val counts = processData(transactionPairs, userPairs)
    sc.parallelize(counts.toSeq).map { case (key, count) => (key.toString, count.toString) }
  }

  /** Left-joins transactions with users on their key and counts distinct
   *  (transaction value, user attribute) combinations per transaction value.
   *
   *  The left outer join keeps transactions with no matching user (the
   *  attribute is then None), so unmatched rows still contribute to counts.
   *
   *  @param t (joinKey, value) transaction pairs
   *  @param u (joinKey, attribute) user pairs
   *  @return map from transaction value to its distinct-combination count
   */
  def processData(t: RDD[(Int, Int)], u: RDD[(Int, String)]): Map[Int, Long] = {
    // val instead of var: the joined RDD is never reassigned.
    val joined = t.leftOuterJoin(u).values.distinct
    joined.countByKey
  }
}
33+
34+
object ExampleJob {

  /** Command-line entry point.
   *
   *  Expected arguments, matching the README invocation:
   *  args(0) = transactions file, args(1) = users file, args(2) = output directory.
   */
  def main(args: Array[String]) {
    require(args.length >= 3,
      "usage: <transactions file> <users file> <output directory>")
    // BUG FIX: the original read transactions from args(1) and users from
    // args(0), which contradicts the documented invocation order
    // (<transactions> <users> <output>) in the README.
    val transactionsIn = args(0)
    val usersIn = args(1)
    val output = args(2)
    val conf = new SparkConf().setAppName("SparkJoins").setMaster("local")
    val context = new SparkContext(conf)
    try {
      val job = new ExampleJob(context)
      val results = job.run(transactionsIn, usersIn)
      results.saveAsTextFile(output)
    } finally {
      // Always release the SparkContext, even when the job fails.
      context.stop()
    }
  }
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package test.scala.com.matthewrathbone.spark
2+
3+
import org.scalatest.junit.AssertionsForJUnit
4+
import org.apache.spark.SparkContext
5+
import org.junit.Test
6+
import org.junit.Before
7+
import org.apache.spark.SparkConf
8+
import main.scala.com.matthewrathbone.spark.ExampleJob
9+
import org.junit.After
10+
11+
class SparkJoinsScalaTest extends AssertionsForJUnit {

  // Fresh local SparkContext per test; assigned in initialize().
  var sc: SparkContext = _

  /** Creates a local-mode SparkContext before each test. */
  @Before
  def initialize() {
    val conf = new SparkConf().setAppName("SparkJoins").setMaster("local")
    sc = new SparkContext(conf)
  }

  /** Stops the SparkContext so the next test can create a new one. */
  @After
  def tearDown() {
    sc.stop()
  }

  /** Runs the job against the sample data and checks the first two results.
   *  (Renamed from `testExamleJobCode` — typo fix; JUnit discovers tests via
   *  the @Test annotation, so the rename is behavior-compatible.)
   */
  @Test
  def testExampleJobCode() {
    val job = new ExampleJob(sc)
    val result = job.run("./transactions.txt", "./users.txt")
    // Collect once: each collect() call triggers a full Spark job, so the
    // original's four calls recomputed the whole pipeline four times.
    val rows = result.collect()
    // NOTE(review): asserting on positions 0 and 1 assumes a stable ordering
    // of the collected results — confirm this holds across Spark versions.
    assert(rows(0)._1 === "1")
    assert(rows(0)._2 === "3")
    assert(rows(1)._1 === "2")
    assert(rows(1)._2 === "1")
  }
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
1 1 1 300 a jumper
2+
2 1 2 300 a jumper
3+
3 1 2 300 a jumper
4+
4 2 3 100 a rubber chicken
5+
5 1 3 300 a jumper
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
3+

spark-scala/transactions_test.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
1 1 1 300 a jumper
2+
2 1 2 300 a jumper
3+
3 1 2 300 a jumper
4+
4 2 3 100 a rubber chicken
5+
5 1 3 300 a jumper

spark-scala/users_test.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
2+
3+

spark/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
2+
## Compiling
3+
``` bash
4+
mvn compile
5+
```
6+
7+
## Testing
8+
``` bash
9+
mvn test
10+
```
11+
12+
## Running
13+
``` bash
14+
mvn assembly:single
```

``` bash
/usr/bin/spark-submit --class main.java.com.matthewrathbone.sparktest.SparkJoins --master local ./target/spark-example-1.0-SNAPSHOT-jar-with-dependencies.jar /path/to/transactions_test.txt /path/to/users_test.txt /path/to/output_folder
```

spark/pom.xml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
3+
<project xmlns="http://maven.apache.org/POM/4.0.0"
4+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
5+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
6+
<modelVersion>4.0.0</modelVersion>
7+
8+
<groupId>com.matthewrathbone.sparktest</groupId>
9+
<artifactId>spark-example</artifactId>
10+
<version>1.0-SNAPSHOT</version>
11+
<properties>
12+
<maven.compiler.source>1.7</maven.compiler.source>
13+
<maven.compiler.target>1.7</maven.compiler.target>
14+
</properties>
15+
16+
<dependencies>
17+
<dependency>
18+
<groupId>org.apache.spark</groupId>
19+
<artifactId>spark-core_2.10</artifactId>
20+
<version>1.2.0</version>
21+
</dependency>
22+
<dependency>
23+
<groupId>junit</groupId>
24+
<artifactId>junit</artifactId>
25+
<version>4.8.2</version>
26+
<scope>test</scope>
27+
</dependency>
28+
</dependencies>
29+
<build>
30+
<pluginManagement>
31+
<plugins>
32+
<plugin>
33+
<groupId>org.apache.maven.plugins</groupId>
34+
<artifactId>maven-surefire-plugin</artifactId>
35+
<version>2.8</version>
36+
</plugin>
37+
<plugin>
38+
<artifactId>maven-assembly-plugin</artifactId>
39+
<configuration>
40+
<archive>
41+
<manifest>
42+
<mainClass>main.java.com.matthewrathbone.sparktest.SparkJoins</mainClass>
43+
</manifest>
44+
</archive>
45+
<descriptorRefs>
46+
<descriptorRef>jar-with-dependencies</descriptorRef>
47+
</descriptorRefs>
48+
</configuration>
49+
</plugin>
50+
</plugins>
51+
</pluginManagement>
52+
</build>
53+
</project>

0 commit comments

Comments
 (0)