[SPARK-29434][Core] Improve the MapStatuses Serialization Performance #26085
Changes from 1 commit, in `MapOutputTracker.scala`:

```diff
@@ -17,19 +17,21 @@
 package org.apache.spark
 
-import java.io._
+import java.io.{ByteArrayInputStream, ObjectInputStream, ObjectOutputStream}
 import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit}
 import java.util.concurrent.locks.ReentrantReadWriteLock
 
-import com.github.luben.zstd.ZstdInputStream
-import com.github.luben.zstd.ZstdOutputStream
 import scala.collection.JavaConverters._
 import scala.collection.mutable.{HashMap, ListBuffer, Map}
 import scala.concurrent.{ExecutionContext, Future}
 import scala.concurrent.duration.Duration
 import scala.reflect.ClassTag
 import scala.util.control.NonFatal
 
+import com.github.luben.zstd.ZstdInputStream
+import com.github.luben.zstd.ZstdOutputStream
+import org.apache.commons.io.output.{ByteArrayOutputStream => ApacheByteArrayOutputStream}
 
 import org.apache.spark.broadcast.{Broadcast, BroadcastManager}
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
```
```diff
@@ -812,13 +814,14 @@ private[spark] object MapOutputTracker extends Logging {
   // generally be pretty compressible because many map outputs will be on the same hostname.
   def serializeMapStatuses(statuses: Array[MapStatus], broadcastManager: BroadcastManager,
       isLocal: Boolean, minBroadcastSize: Int): (Array[Byte], Broadcast[Array[Byte]]) = {
-    import scala.language.reflectiveCalls
-    val out = new ByteArrayOutputStream(4096) {
-      // exposing `buf` directly to avoid copy
-      def getBuf: Array[Byte] = buf
-    }
-    val objOut = new ObjectOutputStream(out)
+    // Using `org.apache.commons.io.output.ByteArrayOutputStream` instead of the standard one.
+    // This implementation doesn't reallocate the whole memory block but allocates
+    // additional buffers. This way no buffers need to be garbage collected and
+    // the contents don't have to be copied to the new buffer.
+    val out = new ApacheByteArrayOutputStream()
+    val compressedOut = new ApacheByteArrayOutputStream()
+
+    val objOut = new ObjectOutputStream(out)
     Utils.tryWithSafeFinally {
       // Since statuses can be modified in parallel, sync on it
       statuses.synchronized {
```
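For context on the new comment above, here is a minimal standalone sketch, not Spark code, of the difference this hunk relies on: `java.io.ByteArrayOutputStream` grows by allocating a bigger array and copying everything written so far, while the commons-io variant chains additional internal buffers and never moves bytes already written. The object name and the 1 MiB payload size are illustrative choices.

```scala
import java.io.{ByteArrayOutputStream => JdkByteArrayOutputStream}
import org.apache.commons.io.output.{ByteArrayOutputStream => ApacheByteArrayOutputStream}

object BufferGrowthSketch {
  def main(args: Array[String]): Unit = {
    val payload = new Array[Byte](1 << 20) // 1 MiB of zeros; the size is arbitrary

    // JDK stream: each time capacity is exceeded, it allocates a larger byte[]
    // and copies everything written so far into it, leaving the old array for GC.
    val jdkOut = new JdkByteArrayOutputStream(4096)
    jdkOut.write(payload)

    // commons-io stream: appends an additional internal buffer instead, so
    // bytes already written are never copied while writing.
    val apacheOut = new ApacheByteArrayOutputStream()
    apacheOut.write(payload)

    // Both expose the same logical content; only the growth strategy differs.
    assert(jdkOut.size() == apacheOut.size())
  }
}
```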
```diff
@@ -829,16 +832,15 @@ private[spark] object MapOutputTracker extends Logging {
     }
 
     val arr: Array[Byte] = {
-      val compressedOut = new ByteArrayOutputStream(4096)
       val zos = new ZstdOutputStream(compressedOut)
       Utils.tryWithSafeFinally {
         compressedOut.write(DIRECT)
-        zos.write(out.getBuf, 0, out.size())
+        // `out.writeTo(zos)` will write the uncompressed data from `out` to `zos`
+        // without copying to avoid unnecessary allocation and copy of byte[].
+        out.writeTo(zos)
       } {
         zos.close()
       }
+      // We don't want to use the internal `buf` of `compressedOut` as it can be larger than
+      // the actual used size since it's a buffer kept growing.
       compressedOut.toByteArray
     }
     if (arr.length >= minBroadcastSize) {
```
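To make the framing concrete, here is a hedged round-trip sketch: one marker byte is written to the raw stream, then the payload goes through zstd. `frame`, `unframe`, and the `DIRECT` value are illustrative stand-ins rather than the tracker's private API; only the stream classes match the PR's imports.

```scala
import java.io.ByteArrayInputStream

import com.github.luben.zstd.{ZstdInputStream, ZstdOutputStream}
import org.apache.commons.io.output.{ByteArrayOutputStream => ApacheByteArrayOutputStream}

object FramingSketch {
  val DIRECT: Byte = 0 // illustrative marker value, not Spark's private constant

  // Frame layout: [1 uncompressed marker byte][zstd-compressed payload].
  def frame(payload: Array[Byte]): Array[Byte] = {
    val compressedOut = new ApacheByteArrayOutputStream()
    // The marker goes onto the raw stream so a reader can dispatch on it
    // before opening the decompressor.
    compressedOut.write(DIRECT)
    val zos = new ZstdOutputStream(compressedOut)
    try zos.write(payload) finally zos.close()
    // toByteArray copies exactly size() bytes; the internal buffers may hold more.
    compressedOut.toByteArray
  }

  def unframe(framed: Array[Byte]): Array[Byte] = {
    val in = new ByteArrayInputStream(framed)
    require(in.read() == DIRECT, "unexpected marker byte")
    val zis = new ZstdInputStream(in)
    try {
      val out = new ApacheByteArrayOutputStream()
      val buf = new Array[Byte](8192)
      var n = zis.read(buf)
      while (n != -1) {
        out.write(buf, 0, n)
        n = zis.read(buf)
      }
      out.toByteArray
    } finally {
      zis.close()
    }
  }

  def main(args: Array[String]): Unit = {
    val data = "host-1,host-1,host-2".getBytes("UTF-8")
    assert(unframe(frame(data)).sameElements(data))
  }
}
```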
```diff
@@ -854,11 +856,11 @@ private[spark] object MapOutputTracker extends Logging {
         oos.close()
       }
       val outArr = {
-        val compressedOut = new ByteArrayOutputStream(4096)
+        compressedOut.reset()
         val zos = new ZstdOutputStream(compressedOut)
         Utils.tryWithSafeFinally {
           compressedOut.write(BROADCAST)
-          zos.write(out.getBuf, 0, out.size())
+          out.writeTo(zos)
         } {
           zos.close()
         }
```

Review thread on `val zos = new ZstdOutputStream(compressedOut)`:

Contributor: Hi, @dbtsai, I am back-porting this into our internal repo. Looks like this compression is unnecessary, since the data being serialized here is already compressed.

Member (author): The actual value of the data (which is already compressed) will not be in the serialized form of the broadcast variable.

Contributor: Thanks for your clarification. It's indeed not including the compressed data.
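Finally, a small sketch of the `reset()` reuse introduced in this hunk. The marker values and the `writeFrame` helper are hypothetical, and the broadcast handle is simulated with a plain string, since the point is only buffer reuse: commons-io's `reset()` rewinds the count to zero while keeping the already allocated buffers, so the BROADCAST frame can reuse the memory of the DIRECT frame.

```scala
import com.github.luben.zstd.ZstdOutputStream
import org.apache.commons.io.output.{ByteArrayOutputStream => ApacheByteArrayOutputStream}

object ResetReuseSketch {
  val DIRECT: Byte = 0    // illustrative marker bytes, not Spark's constants
  val BROADCAST: Byte = 1

  // Hypothetical helper: writes [marker][zstd(payload)] into `sink`,
  // mirroring the framing used by the PR.
  def writeFrame(sink: ApacheByteArrayOutputStream, marker: Byte,
      payload: Array[Byte]): Array[Byte] = {
    sink.write(marker)
    val zos = new ZstdOutputStream(sink)
    try zos.write(payload) finally zos.close()
    sink.toByteArray
  }

  def main(args: Array[String]): Unit = {
    val compressedOut = new ApacheByteArrayOutputStream()

    val statuses = "serialized map statuses".getBytes("UTF-8")
    val directFrame = writeFrame(compressedOut, DIRECT, statuses)

    // reset() rewinds the logical size to zero but keeps the internal buffers
    // allocated for the DIRECT frame, so the BROADCAST frame reuses them
    // instead of growing a brand-new stream from scratch.
    compressedOut.reset()
    val handle = "serialized broadcast handle, not the data itself".getBytes("UTF-8")
    val broadcastFrame = writeFrame(compressedOut, BROADCAST, handle)

    assert(directFrame.head == DIRECT && broadcastFrame.head == BROADCAST)
  }
}
```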