[SPARK-1065] [PySpark] improve supporting for large broadcast #1912
Changes from all commits: 6226189, e93cf4b, 9a7161f, c7baa8c, 631a827, db3f232, e06df4a
python/pyspark/broadcast.py

```diff
@@ -21,18 +21,16 @@
 >>> b = sc.broadcast([1, 2, 3, 4, 5])
 >>> b.value
 [1, 2, 3, 4, 5]
-
->>> from pyspark.broadcast import _broadcastRegistry
->>> _broadcastRegistry[b.bid] = b
->>> from cPickle import dumps, loads
->>> loads(dumps(b)).value
-[1, 2, 3, 4, 5]
-
 >>> sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
 [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
+>>> b.unpersist()
 
 >>> large_broadcast = sc.broadcast(list(range(10000)))
 """
+import os
 
+from pyspark.serializers import CompressedSerializer, PickleSerializer
 
 # Holds broadcasted data received from Java, keyed by its id.
 _broadcastRegistry = {}
```
```diff
@@ -52,17 +50,38 @@ class Broadcast(object):
     Access its value through C{.value}.
     """
 
-    def __init__(self, bid, value, java_broadcast=None, pickle_registry=None):
+    def __init__(self, bid, value, java_broadcast=None,
+                 pickle_registry=None, path=None):
         """
         Should not be called directly by users -- use
         L{SparkContext.broadcast()<pyspark.context.SparkContext.broadcast>}
         instead.
         """
-        self.value = value
         self.bid = bid
+        if path is None:
+            self.value = value
         self._jbroadcast = java_broadcast
         self._pickle_registry = pickle_registry
+        self.path = path
+
+    def unpersist(self, blocking=False):
```
Contributor comment on unpersist:

Can you add a docstring? It's fine to just copy it over from the Scala equivalent. In this case:

```scala
/**
 * Delete cached copies of this broadcast on the executors. If the broadcast is used after
 * this is called, it will need to be re-sent to each executor.
 * @param blocking Whether to block until unpersisting has completed
 */
```
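For reference, a direct Python rendering of that Scala comment might look like the sketch below. This is only an illustration of the reviewer's suggestion, not a change that appears in this diff; the stub class is hypothetical and elides the real method body.

```python
# Hypothetical stub: shows only where the suggested docstring would go.
class Broadcast(object):
    def unpersist(self, blocking=False):
        """
        Delete cached copies of this broadcast on the executors. If the
        broadcast is used after this is called, it will need to be
        re-sent to each executor.

        :param blocking: Whether to block until unpersisting has completed
        """
```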
```diff
+        self._jbroadcast.unpersist(blocking)
+        os.unlink(self.path)
 
     def __reduce__(self):
         self._pickle_registry.add(self)
         return (_from_id, (self.bid, ))
 
+    def __getattr__(self, item):
+        if item == 'value' and self.path is not None:
+            ser = CompressedSerializer(PickleSerializer())
+            value = ser.load_stream(open(self.path)).next()
+            self.value = value
+            return value
+
+        raise AttributeError(item)
+
 
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
```
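The `__getattr__` hook above is what makes the large-broadcast path lazy: Python invokes `__getattr__` only when normal attribute lookup fails, and `__init__` deliberately skips assigning `self.value` when a `path` is given, so the first `.value` access deserializes the file and then caches the result as an ordinary attribute. Below is a minimal, self-contained sketch of the same pattern; `LazyValue` and its plain `pickle` loader are hypothetical stand-ins, not PySpark code.

```python
import pickle


class LazyValue(object):
    """Sketch of the lazy-load pattern used by Broadcast above."""

    def __init__(self, value=None, path=None):
        if path is None:
            self.value = value   # small value: keep it in memory
        self.path = path         # large value: leave it on disk for now

    def __getattr__(self, item):
        # Python calls __getattr__ only when normal lookup fails,
        # i.e. before self.value has been set.
        if item == 'value' and self.path is not None:
            with open(self.path, 'rb') as f:
                value = pickle.load(f)
            self.value = value   # cache it; __getattr__ won't fire again
            return value
        raise AttributeError(item)


if __name__ == '__main__':
    with open('/tmp/lazy.pkl', 'wb') as f:
        pickle.dump(list(range(5)), f)
    lv = LazyValue(path='/tmp/lazy.pkl')
    print(lv.value)   # first access loads from disk: [0, 1, 2, 3, 4]
    print(lv.value)   # second access uses the cached attribute
```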
python/pyspark/rdd.py

```diff
@@ -35,7 +35,7 @@
 from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \
     BatchedSerializer, CloudPickleSerializer, PairDeserializer, \
-    PickleSerializer, pack_long
+    PickleSerializer, pack_long, CompressedSerializer
 from pyspark.join import python_join, python_left_outer_join, \
     python_right_outer_join, python_cogroup
 from pyspark.statcounter import StatCounter
```
```diff
@@ -1809,7 +1809,8 @@ def _jrdd(self):
             self._jrdd_deserializer = NoOpSerializer()
         command = (self.func, self._prev_jrdd_deserializer,
                    self._jrdd_deserializer)
-        pickled_command = CloudPickleSerializer().dumps(command)
+        ser = CompressedSerializer(CloudPickleSerializer())
```
Contributor comment:

This is a good idea. It wouldn't surprise me if the pickle data was highly compressible due to frequently-occurring groups of pickle opcodes, etc.
```diff
+        pickled_command = ser.dumps(command)
         broadcast_vars = ListConverter().convert(
             [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
             self.ctx._gateway._gateway_client)
```
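The reviewer's hunch is easy to test: wrapping an existing serializer with zlib takes only a few lines, and pickled streams of repetitive data do shrink substantially. The sketch below illustrates the idea under that assumption; the `*Sketch` classes are hypothetical stand-ins, not PySpark's actual `CompressedSerializer` or `PickleSerializer`, whose implementations are not shown in this diff.

```python
import pickle
import zlib


class PickleSerializerSketch(object):
    def dumps(self, obj):
        return pickle.dumps(obj, protocol=2)

    def loads(self, data):
        return pickle.loads(data)


class CompressedSerializerSketch(object):
    """Wraps another serializer and zlib-compresses its byte output."""

    def __init__(self, serializer):
        self.serializer = serializer

    def dumps(self, obj):
        # Pickle streams repeat opcodes and names, so they compress well.
        return zlib.compress(self.serializer.dumps(obj), 1)

    def loads(self, data):
        return self.serializer.loads(zlib.decompress(data))


if __name__ == '__main__':
    ser = CompressedSerializerSketch(PickleSerializerSketch())
    data = [('key%d' % i, i) for i in range(1000)]
    raw = pickle.dumps(data, protocol=2)
    packed = ser.dumps(data)
    print(len(raw), len(packed))  # compressed payload is typically far smaller
    assert ser.loads(packed) == data
```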
Contributor comment on the removed _broadcastRegistry doctest lines:

Good call here; it was a bad idea to expose these internals in user-facing module doctests.