
Commit 4a805af

Merge pull request apache#367 from ankurdave/graphx
GraphX: Unifying Graphs and Tables

GraphX extends Spark's distributed fault-tolerant collections API and interactive console with a new graph API which leverages recent advances in graph systems (e.g., [GraphLab](http://graphlab.org)) to enable users to easily and interactively build, transform, and reason about graph structured data at scale. See http://amplab.github.io/graphx/.

Thanks to @jegonzal, @rxin, @ankurdave, @dcrankshaw, @jianpingjwang, @amatsukawa, @kellrott, and @adamnovak.

Tasks left:

- [x] Graph-level uncache
- [x] Uncache previous iterations in Pregel
- [x] ~~Uncache previous iterations in GraphLab~~ (postponed to post-release)
  - [x] Describe GC issue with GraphLab
- [ ] Write `docs/graphx-programming-guide.md`
  - [x] Mention future Bagel support in docs
  - [ ] Section on caching/uncaching in docs: As with Spark, cache something that is used more than once. In an iterative algorithm, try to cache and force (i.e., materialize) something every iteration, then uncache the cached things that depended on the newly materialized RDD but that won't be referenced again. (A minimal sketch of this pattern follows the commit metadata below.)
- [x] Undo modifications to core collections and instead copy them to org.apache.spark.graphx
- [x] Make Graph serializable to work around capture in Spark shell
- [x] Rename graph -> graphx in package name and subproject
- [x] Remove standalone PageRank
- [x] ~~Fix amplab/graphx#52 by checking `iter.hasNext`~~
2 parents 945fe7a + 80e73ed commit 4a805af
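
The caching/uncaching guidance in the task list above amounts to the following pattern. This is a minimal sketch, not code from this commit; `update` stands in for an arbitrary per-iteration transformation.

```scala
import org.apache.spark.rdd.RDD

// Iteratively apply `update`, caching and forcing each iteration's result before
// dropping the previous one so only live data stays in the cache.
def iterate[T](initial: RDD[T], update: RDD[T] => RDD[T], numIters: Int): RDD[T] = {
  var current = initial.cache()
  current.count()                    // force materialization of the first iteration
  for (_ <- 1 to numIters) {
    val next = update(current).cache()  // cache this iteration's result...
    next.count()                        // ...and force it
    current.unpersist()                 // the previous iteration is no longer referenced
    current = next
  }
  current
}
```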

File tree

76 files changed: +7132 −21 lines


bin/compute-classpath.sh

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,7 @@ if [ -f "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-dep
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/classes"
 
   DEPS_ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar`
@@ -59,6 +60,7 @@ if [[ $SPARK_TESTING == 1 ]]; then
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/test-classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/test-classes"
   CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/test-classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/test-classes"
   CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/test-classes"
 fi
 

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 1 addition & 1 deletion
@@ -288,7 +288,7 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
     if (getKeyClass().isArray && partitioner.isInstanceOf[HashPartitioner]) {
       throw new SparkException("Default partitioner cannot partition array keys.")
     }
-    new ShuffledRDD[K, V, (K, V)](self, partitioner)
+    if (self.partitioner == partitioner) self else new ShuffledRDD[K, V, (K, V)](self, partitioner)
   }
 
   /**
core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 5 additions & 0 deletions
@@ -548,6 +548,11 @@ abstract class RDD[T: ClassTag](
    * *same number of partitions*, but does *not* require them to have the same number
    * of elements in each partition.
    */
+  def zipPartitions[B: ClassTag, V: ClassTag]
+      (rdd2: RDD[B], preservesPartitioning: Boolean)
+      (f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V] =
+    new ZippedPartitionsRDD2(sc, sc.clean(f), this, rdd2, preservesPartitioning)
+
   def zipPartitions[B: ClassTag, V: ClassTag]
       (rdd2: RDD[B])
       (f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V] =
core/src/main/scala/org/apache/spark/util/collection/BitSet.scala

Lines changed: 84 additions & 3 deletions
@@ -22,10 +22,72 @@ package org.apache.spark.util.collection
  * A simple, fixed-size bit set implementation. This implementation is fast because it avoids
  * safety/bound checking.
  */
-class BitSet(numBits: Int) {
+class BitSet(numBits: Int) extends Serializable {
 
-  private[this] val words = new Array[Long](bit2words(numBits))
-  private[this] val numWords = words.length
+  private val words = new Array[Long](bit2words(numBits))
+  private val numWords = words.length
+
+  /**
+   * Compute the capacity (number of bits) that can be represented
+   * by this bitset.
+   */
+  def capacity: Int = numWords * 64
+
+  /**
+   * Set all the bits up to a given index
+   */
+  def setUntil(bitIndex: Int) {
+    val wordIndex = bitIndex >> 6 // divide by 64
+    var i = 0
+    while(i < wordIndex) { words(i) = -1; i += 1 }
+    if(wordIndex < words.size) {
+      // Set the remaining bits (note that the mask could still be zero)
+      val mask = ~(-1L << (bitIndex & 0x3f))
+      words(wordIndex) |= mask
+    }
+  }
+
+  /**
+   * Compute the bit-wise AND of the two sets returning the
+   * result.
+   */
+  def &(other: BitSet): BitSet = {
+    val newBS = new BitSet(math.max(capacity, other.capacity))
+    val smaller = math.min(numWords, other.numWords)
+    assert(newBS.numWords >= numWords)
+    assert(newBS.numWords >= other.numWords)
+    var ind = 0
+    while( ind < smaller ) {
+      newBS.words(ind) = words(ind) & other.words(ind)
+      ind += 1
+    }
+    newBS
+  }
+
+  /**
+   * Compute the bit-wise OR of the two sets returning the
+   * result.
+   */
+  def |(other: BitSet): BitSet = {
+    val newBS = new BitSet(math.max(capacity, other.capacity))
+    assert(newBS.numWords >= numWords)
+    assert(newBS.numWords >= other.numWords)
+    val smaller = math.min(numWords, other.numWords)
+    var ind = 0
+    while( ind < smaller ) {
+      newBS.words(ind) = words(ind) | other.words(ind)
+      ind += 1
+    }
+    while( ind < numWords ) {
+      newBS.words(ind) = words(ind)
+      ind += 1
+    }
+    while( ind < other.numWords ) {
+      newBS.words(ind) = other.words(ind)
+      ind += 1
+    }
+    newBS
+  }
 
   /**
    * Sets the bit at the specified index to true.
@@ -36,6 +98,11 @@ class BitSet(numBits: Int) {
     words(index >> 6) |= bitmask        // div by 64 and mask
   }
 
+  def unset(index: Int) {
+    val bitmask = 1L << (index & 0x3f)  // mod 64 and shift
+    words(index >> 6) &= ~bitmask       // div by 64 and mask
+  }
+
   /**
    * Return the value of the bit with the specified index. The value is true if the bit with
    * the index is currently set in this BitSet; otherwise, the result is false.
@@ -48,6 +115,20 @@ class BitSet(numBits: Int) {
     (words(index >> 6) & bitmask) != 0  // div by 64 and mask
   }
 
+  /**
+   * Get an iterator over the set bits.
+   */
+  def iterator = new Iterator[Int] {
+    var ind = nextSetBit(0)
+    override def hasNext: Boolean = ind >= 0
+    override def next() = {
+      val tmp = ind
+      ind = nextSetBit(ind+1)
+      tmp
+    }
+  }
+
+
   /** Return the number of bits set to true in this BitSet. */
   def cardinality(): Int = {
     var sum = 0
core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala

Lines changed: 21 additions & 2 deletions
@@ -84,6 +84,8 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
 
   protected var _bitset = new BitSet(_capacity)
 
+  def getBitSet = _bitset
+
   // Init of the array in constructor (instead of in declaration) to work around a Scala compiler
   // specialization bug that would generate two arrays (one for Object and one for specialized T).
   protected var _data: Array[T] = _
@@ -161,7 +163,8 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
   def getPos(k: T): Int = {
     var pos = hashcode(hasher.hash(k)) & _mask
     var i = 1
-    while (true) {
+    val maxProbe = _data.size
+    while (i < maxProbe) {
       if (!_bitset.get(pos)) {
         return INVALID_POS
       } else if (k == _data(pos)) {
@@ -179,6 +182,22 @@ class OpenHashSet[@specialized(Long, Int) T: ClassTag](
   /** Return the value at the specified position. */
   def getValue(pos: Int): T = _data(pos)
 
+  def iterator = new Iterator[T] {
+    var pos = nextPos(0)
+    override def hasNext: Boolean = pos != INVALID_POS
+    override def next(): T = {
+      val tmp = getValue(pos)
+      pos = nextPos(pos+1)
+      tmp
+    }
+  }
+
+  /** Return the value at the specified position. */
+  def getValueSafe(pos: Int): T = {
+    assert(_bitset.get(pos))
+    _data(pos)
+  }
+
   /**
    * Return the next position with an element stored, starting from the given position inclusively.
    */
@@ -259,7 +278,7 @@
  * A set of specialized hash function implementation to avoid boxing hash code computation
  * in the specialized implementation of OpenHashSet.
  */
-sealed class Hasher[@specialized(Long, Int) T] {
+sealed class Hasher[@specialized(Long, Int) T] extends Serializable {
   def hash(o: T): Int = o.hashCode()
 }
 
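
A hedged sketch of the `OpenHashSet` additions above (`getBitSet`, bounded probing in `getPos`, `iterator`, and `getValueSafe`); illustrative only, as this collection also lives in a Spark-internal package:

```scala
import org.apache.spark.util.collection.OpenHashSet

val set = new OpenHashSet[Long](64)
set.add(1L)
set.add(42L)
set.add(1L)   // duplicate, ignored

set.iterator.foreach(println)   // 1 and 42, via the new iterator over occupied slots

val pos = set.getPos(42L)       // slot index, or OpenHashSet.INVALID_POS if absent
if (pos != OpenHashSet.INVALID_POS) {
  println(set.getValueSafe(pos))        // asserts the slot is occupied before reading it
}
println(set.getBitSet.cardinality())    // number of occupied slots (2 here)
```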

docs/_layouts/global.html

Lines changed: 5 additions & 3 deletions
@@ -21,7 +21,7 @@
         <link rel="stylesheet" href="css/main.css">
 
         <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
-        
+
         <link rel="stylesheet" href="css/pygments-default.css">
 
         <!-- Google analytics script -->
@@ -68,9 +68,10 @@
                 <li><a href="streaming-programming-guide.html">Spark Streaming</a></li>
                 <li><a href="mllib-guide.html">MLlib (Machine Learning)</a></li>
                 <li><a href="bagel-programming-guide.html">Bagel (Pregel on Spark)</a></li>
+                <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
             </ul>
         </li>
-        
+
         <li class="dropdown">
             <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
             <ul class="dropdown-menu">
@@ -80,6 +81,7 @@
                 <li><a href="api/streaming/index.html#org.apache.spark.streaming.package">Spark Streaming</a></li>
                 <li><a href="api/mllib/index.html#org.apache.spark.mllib.package">MLlib (Machine Learning)</a></li>
                 <li><a href="api/bagel/index.html#org.apache.spark.bagel.package">Bagel (Pregel on Spark)</a></li>
+                <li><a href="api/graphx/index.html#org.apache.spark.graphx.package">GraphX (Graph Processing)</a></li>
             </ul>
         </li>
 
@@ -161,7 +163,7 @@ <h2>Heading</h2>
         <script src="js/vendor/jquery-1.8.0.min.js"></script>
         <script src="js/vendor/bootstrap.min.js"></script>
         <script src="js/main.js"></script>
-        
+
         <!-- A script to fix internal hash links because we have an overlapping top bar.
              Based on https://github.com/twitter/bootstrap/issues/193#issuecomment-2281510 -->
         <script>

docs/_plugins/copy_api_dirs.rb

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@
 
 if not (ENV['SKIP_API'] == '1' or ENV['SKIP_SCALADOC'] == '1')
   # Build Scaladoc for Java/Scala
-  projects = ["core", "examples", "repl", "bagel", "streaming", "mllib"]
+  projects = ["core", "examples", "repl", "bagel", "graphx", "streaming", "mllib"]
 
   puts "Moving to project root and building scaladoc."
   curr_dir = pwd
docs/api.md

Lines changed: 1 addition & 0 deletions
@@ -9,4 +9,5 @@ Here you can find links to the Scaladoc generated for the Spark sbt subprojects.
 - [Spark Examples](api/examples/index.html)
 - [Spark Streaming](api/streaming/index.html)
 - [Bagel](api/bagel/index.html)
+- [GraphX](api/graphx/index.html)
 - [PySpark](api/pyspark/index.html)

docs/bagel-programming-guide.md

Lines changed: 6 additions & 4 deletions
@@ -3,6 +3,8 @@ layout: global
 title: Bagel Programming Guide
 ---
 
+**Bagel will soon be superseded by [GraphX](graphx-programming-guide.html); we recommend that new users try GraphX instead.**
+
 Bagel is a Spark implementation of Google's [Pregel](http://portal.acm.org/citation.cfm?id=1807184) graph processing framework. Bagel currently supports basic graph computation, combiners, and aggregators.
 
 In the Pregel programming model, jobs run as a sequence of iterations called _supersteps_. In each superstep, each vertex in the graph runs a user-specified function that can update state associated with the vertex and send messages to other vertices for use in the *next* iteration.
@@ -21,7 +23,7 @@ To use Bagel in your program, add the following SBT or Maven dependency:
 
 Bagel operates on a graph represented as a [distributed dataset](scala-programming-guide.html) of (K, V) pairs, where keys are vertex IDs and values are vertices plus their associated state. In each superstep, Bagel runs a user-specified compute function on each vertex that takes as input the current vertex state and a list of messages sent to that vertex during the previous superstep, and returns the new vertex state and a list of outgoing messages.
 
-For example, we can use Bagel to implement PageRank. Here, vertices represent pages, edges represent links between pages, and messages represent shares of PageRank sent to the pages that a particular page links to.
+For example, we can use Bagel to implement PageRank. Here, vertices represent pages, edges represent links between pages, and messages represent shares of PageRank sent to the pages that a particular page links to.
 
 We first extend the default `Vertex` class to store a `Double`
 representing the current PageRank of the vertex, and similarly extend
@@ -38,7 +40,7 @@ import org.apache.spark.bagel.Bagel._
   val active: Boolean) extends Vertex
 
 @serializable class PRMessage(
-  val targetId: String, val rankShare: Double) extends Message
+  val targetId: String, val rankShare: Double) extends Message
 {% endhighlight %}
 
 Next, we load a sample graph from a text file as a distributed dataset and package it into `PRVertex` objects. We also cache the distributed dataset because Bagel will use it multiple times and we'd like to avoid recomputing it.
@@ -114,7 +116,7 @@ Here are the actions and types in the Bagel API. See [Bagel.scala](https://githu
 /*** Full form ***/
 
 Bagel.run(sc, vertices, messages, combiner, aggregator, partitioner, numSplits)(compute)
-// where compute takes (vertex: V, combinedMessages: Option[C], aggregated: Option[A], superstep: Int)
+// where compute takes (vertex: V, combinedMessages: Option[C], aggregated: Option[A], superstep: Int)
 // and returns (newVertex: V, outMessages: Array[M])
 
 /*** Abbreviated forms ***/
@@ -124,7 +126,7 @@ Bagel.run(sc, vertices, messages, combiner, partitioner, numSplits)(compute)
 // and returns (newVertex: V, outMessages: Array[M])
 
 Bagel.run(sc, vertices, messages, combiner, numSplits)(compute)
-// where compute takes (vertex: V, combinedMessages: Option[C], superstep: Int)
+// where compute takes (vertex: V, combinedMessages: Option[C], superstep: Int)
 // and returns (newVertex: V, outMessages: Array[M])
 
 Bagel.run(sc, vertices, messages, numSplits)(compute)
0 commit comments
