-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-32916][SHUFFLE] Implementation of shuffle service that leverages push-based shuffle in YARN deployment mode #30062
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
df48e01
6472267
1c78e1d
71f3246
221178f
55b4a5f
f9d0e86
548e2c0
50efba9
8a6e01b
ae5ffac
e51042b
83aca99
04e0efe
71dfd48
0c411c1
d029463
8f3839f
1cd2d03
3356c19
d879beb
48ae819
9b031f7
807cc7b
5b169bc
9ece587
d13c7ad
63843bb
d35aa4b
ba92311
9be25b3
ba51796
1f4fcfe
28edaae
cb1881c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
… reduceId in a string
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -273,16 +273,9 @@ void deleteExecutorDirs(Path[] dirs) { | |
| @Override | ||
| public StreamCallbackWithID receiveBlockDataAsStream(PushBlockStream msg) { | ||
| // Retrieve merged shuffle file metadata | ||
| String[] blockIdParts = msg.blockId.split("_"); | ||
| if (blockIdParts.length != 4 || !blockIdParts[0].equals("shuffle")) { | ||
| throw new IllegalArgumentException("Unexpected shuffle block id format: " + msg.blockId); | ||
| } | ||
| AppShuffleId appShuffleId = new AppShuffleId(msg.appId, Integer.parseInt(blockIdParts[1])); | ||
| int mapIndex = Integer.parseInt(blockIdParts[2]); | ||
| int reduceId = Integer.parseInt(blockIdParts[3]); | ||
| AppShuffleId appShuffleId = new AppShuffleId(msg.appId, msg.shuffleId); | ||
| AppShufflePartitionInfo partitionInfoBeforeCheck = | ||
| getOrCreateAppShufflePartitionInfo(appShuffleId, reduceId); | ||
|
|
||
| getOrCreateAppShufflePartitionInfo(appShuffleId, msg.reduceId); | ||
| // Here partitionInfo will be null in 2 cases: | ||
| // 1) The request is received for a block that has already been merged, this is possible due | ||
| // to the retry logic. | ||
|
|
@@ -321,17 +314,18 @@ public StreamCallbackWithID receiveBlockDataAsStream(PushBlockStream msg) { | |
| final boolean isTooLate = partitionInfoBeforeCheck == null; | ||
| // Check if the given block is already merged by checking the bitmap against the given map index | ||
| final AppShufflePartitionInfo partitionInfo = partitionInfoBeforeCheck != null | ||
| && partitionInfoBeforeCheck.mapTracker.contains(mapIndex) ? null : partitionInfoBeforeCheck; | ||
| && partitionInfoBeforeCheck.mapTracker.contains(msg.mapIndex) ? null | ||
| : partitionInfoBeforeCheck; | ||
| if (partitionInfo != null) { | ||
| return new PushBlockStreamCallback( | ||
| this, msg, appShuffleId, reduceId, mapIndex, partitionInfo); | ||
| this, msg, appShuffleId, msg.reduceId, msg.mapIndex, partitionInfo); | ||
| } else { | ||
| // For a duplicate block or a block which is late, respond back with a callback that handles | ||
| // them differently. | ||
| return new StreamCallbackWithID() { | ||
| @Override | ||
| public String getID() { | ||
| return msg.blockId; | ||
| return msg.streamId; | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -345,7 +339,7 @@ public void onComplete(String streamId) { | |
| if (isTooLate) { | ||
| // Throw an exception here so the block data is drained from channel and server | ||
| // responds RpcFailure to the client. | ||
| throw new RuntimeException(String.format("Block %s %s", msg.blockId, | ||
| throw new RuntimeException(String.format("Block %s %s", msg.streamId, | ||
| ErrorHandler.BlockPushErrorHandler.TOO_LATE_MESSAGE_SUFFIX)); | ||
| } | ||
| // For duplicate block that is received before the shuffle merge finalizes, the | ||
|
|
@@ -463,7 +457,7 @@ private PushBlockStreamCallback( | |
|
|
||
| @Override | ||
| public String getID() { | ||
| return msg.blockId; | ||
| return msg.streamId; | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -635,7 +629,7 @@ public void onComplete(String streamId) throws IOException { | |
| // however is already finalized. We should thus respond RpcFailure to the client. | ||
| if (shufflePartitions == null || !shufflePartitions.containsKey(reduceId)) { | ||
| deferredBufs = null; | ||
| throw new RuntimeException(String.format("Block %s %s", msg.blockId, | ||
| throw new RuntimeException(String.format("Block %s %s", msg.streamId, | ||
| ErrorHandler.BlockPushErrorHandler.TOO_LATE_MESSAGE_SUFFIX)); | ||
| } | ||
| // Check if we can commit this block | ||
|
|
@@ -668,7 +662,7 @@ public void onComplete(String streamId) throws IOException { | |
| deferredBufs = null; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we log a warning too, in addition to the exception?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These are currently logged during the
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see. I know this is the expected behavior, but I really care about this case and want to have a way to easily monitor it.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's a good point. In fact, we have also started working on adding server-side metrics that will account for colliding blocks. Metrics will also make it clearer whether any improvements in this area are effective or not. Adding these metrics is a pending task listed in the SPIP, but I haven't created a Spark JIRA for it yet.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd prefer to get this information through metrics instead of logs. |
||
| throw new RuntimeException(String.format("%s %s to merged shuffle", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The case for collisions is not tested. Had you considered storing the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is a
I don't understand the suggestion completely. The deferredBufs are part of the stream, and if the stream doesn't get the opportunity to write, then it lands here. We are not storing
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is a previous conversation related to this here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have also added another simple UT for collisions.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My idea is sharing those
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok. That will impact the memory usage though. Currently these are kept per stream. There is another discussion about limiting the number of bufs in memory here which is relevant to this topic. I do think that if we decide to try this, it should be a follow-up, because this change requires us to evaluate how much memory the shuffle service will use.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, a follow-up PR is fine for me. My question was really about whether this idea was considered and evaluated. For sure the memory usage will increase (at least for the time these deferred buffers are kept in memory, and of course as other streams are working this will lead to an increased amount too).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have updated this JIRA
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have left a comment on the JIRA.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Want to add that one of the things we are considering is the necessity of a bounded in-memory buffer on the server side to help with writing the blocks to merged files. |
||
| ErrorHandler.BlockPushErrorHandler.BLOCK_APPEND_COLLISION_DETECTED_MSG_PREFIX, | ||
| msg.blockId)); | ||
| msg.streamId)); | ||
| } | ||
| } | ||
| isWriting = false; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.