Skip to content

Commit 5620b43

Browse files
committed
Initial generated WordCount commit
1 parent 0849809 commit 5620b43

File tree

2 files changed

+56
-0
lines changed

2 files changed

+56
-0
lines changed
Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
package com.sparklingpandas.sparklingml
2+
3+
/**
4+
* Everyone's favourite wordcount example.
5+
*/
6+
7+
import org.apache.spark.rdd._
8+
9+
object WordCount {
  /**
   * Counts word occurrences in an RDD of text lines, ignoring stop words.
   *
   * Each line is split on any of the `separators`; every resulting token is
   * trimmed and lower-cased. Empty tokens and tokens that match a stop word
   * (stop words get the same trim/lower-case normalization before the
   * comparison) are discarded, and the remaining tokens are counted.
   *
   * Lower-casing is done with `Locale.ROOT` so the result does not vary with
   * the JVM's default locale (for example, under a Turkish default locale
   * "I" would otherwise lower-case to a dotless "ı").
   *
   * @param rdd        the lines of text to count words in
   * @param separators characters to split each line on; defaults to a single
   *                   space
   * @param stopWords  words to leave out of the result; defaults to
   *                   `Set("the")`
   * @return an RDD of (normalized word, number of occurrences) pairs
   */
  def withStopWordsFiltered(rdd : RDD[String],
    separators : Array[Char] = " ".toCharArray,
    stopWords : Set[String] = Set("the")): RDD[(String, Int)] = {

    // Normalize the stop words once, up front, exactly like the tokens below,
    // so the membership test compares like with like.
    val lcStopWords = stopWords.map(_.trim.toLowerCase(java.util.Locale.ROOT))

    val tokens: RDD[String] = rdd.flatMap { line =>
      line.split(separators).map(_.trim.toLowerCase(java.util.Locale.ROOT))
    }

    // Consecutive separators produce empty tokens; drop those along with the
    // stop words. The emptiness check is the cheaper one, so it goes first.
    val words = tokens.filter(token =>
      token.nonEmpty && !lcStopWords.contains(token))

    words.map((_, 1)).reduceByKey(_ + _)
  }
}
Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
package com.sparklingpandas.sparklingml
2+
3+
/**
4+
* A simple test for everyone's favourite wordcount example.
5+
*/
6+
7+
import com.holdenkarau.spark.testing.SharedSparkContext
8+
import org.scalatest.FunSuite
9+
10+
/**
 * Checks WordCount.withStopWordsFiltered on a two-line sample: stop words and
 * separator characters must not appear as keys, tokens glued together by
 * separators must be split apart, and counting must be case-insensitive.
 */
class WordCountTest extends FunSuite with SharedSparkContext {
  test("word count with Stop Words Removed"){
    val linesRDD = sc.parallelize(Seq(
      "How happy was the panda? You ask.",
      "Panda is the most happy panda in all the#!?ing land!"))

    val stopWords: Set[String] =
      Set("a", "the", "in", "was", "there", "she", "he")
    val splitTokens: Array[Char] = "#%?!. ".toCharArray

    val wordCounts = WordCount.withStopWordsFiltered(
      linesRDD, splitTokens, stopWords)
    val wordCountsAsMap = wordCounts.collectAsMap()

    // Stop words are removed.
    assert(!wordCountsAsMap.contains("the"))
    // Separator characters never survive as tokens or inside tokens.
    assert(!wordCountsAsMap.contains("?"))
    assert(!wordCountsAsMap.contains("#!?ing"))
    assert(wordCountsAsMap.contains("ing"))
    // "panda" once in the first line, "Panda" and "panda" in the second.
    // Comparing the Option itself (rather than calling .get on it) makes a
    // missing key show up as an assertion failure with the actual value,
    // not as a NoSuchElementException.
    assert(wordCountsAsMap.get("panda") === Some(3))
  }
}

0 commit comments

Comments
 (0)