[SPARK-14267] [SQL] [PYSPARK] execute multiple Python UDFs within single batch #12057
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
```diff
@@ -86,7 +86,7 @@ private[spark] case class ChainedPythonFunctions(funcs: Seq[PythonFunction])
 private[spark] object PythonRunner {
   def apply(func: PythonFunction, bufferSize: Int, reuse_worker: Boolean): PythonRunner = {
     new PythonRunner(
-      Seq(ChainedPythonFunctions(Seq(func))), bufferSize, reuse_worker, false, Seq(Seq(0)))
+      Seq(ChainedPythonFunctions(Seq(func))), bufferSize, reuse_worker, false, Array(Array(0)))
   }
 }

@@ -101,7 +101,7 @@ private[spark] class PythonRunner(
     bufferSize: Int,
     reuse_worker: Boolean,
     isUDF: Boolean,
-    argOffsets: Seq[Seq[Int]])
+    argOffsets: Array[Array[Int]])
   extends Logging {

   require(funcs.length == argOffsets.length, "numArgs should have the same length as funcs")
```
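The new `argOffsets` parameter tells `PythonRunner` which input columns belong to each chained Python function, so several UDFs can be evaluated over the same batch of rows in one pass. Below is a hypothetical plain-Python sketch of that idea; the names `udfs`, `arg_offsets`, and `evaluate_row` are invented for illustration and are not part of this patch:

```python
# Hypothetical sketch: evaluate several UDFs against one input row, giving each
# UDF only the columns listed in its offset array (cf. Array(Array(0)) above).
udfs = [
    lambda x: x + 1,       # first UDF, reads one column
    lambda a, b: a * b,    # second UDF, reads two columns
]
arg_offsets = [[0], [1, 2]]  # per-UDF column offsets into the combined input row

def evaluate_row(row):
    # Call every UDF in a single pass over the row and collect the results.
    return tuple(f(*[row[i] for i in offsets])
                 for f, offsets in zip(udfs, arg_offsets))

print(evaluate_row((10, 3, 4)))  # -> (11, 12)
```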
```diff
@@ -29,7 +29,7 @@
 from pyspark.broadcast import Broadcast, _broadcastRegistry
 from pyspark.files import SparkFiles
 from pyspark.serializers import write_with_length, write_int, read_long, \
-    write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, AutoBatchedSerializer
+    write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, BatchedSerializer
 from pyspark import shuffle

 pickleSer = PickleSerializer()

@@ -101,7 +101,7 @@ def read_udfs(pickleSer, infile):
     mapper = eval(mapper_str, udfs)

     func = lambda _, it: map(mapper, it)
-    ser = AutoBatchedSerializer(PickleSerializer())
+    ser = BatchedSerializer(PickleSerializer(), 100)
```
Contributor: What serializer did we use before? 100 seems arbitrary here.
Contributor (Author): Before this patch, we used AutoBatchedSerializer, which could hold thousands of rows in a batch (holding more rows in the JVM may cause OOM).
```diff
     # profiling is not supported for UDF
     return func, None, ser, ser
```
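As the comments above note, `AutoBatchedSerializer` keeps growing a batch until the pickled output approaches a target byte size, so a single batch can buffer thousands of rows, whereas `BatchedSerializer(PickleSerializer(), 100)` caps every batch at 100 rows. The following is a minimal sketch of that difference, assuming a PySpark build that still exposes these serializer classes; the sample rows are made up:

```python
import io

from pyspark.serializers import (AutoBatchedSerializer, BatchedSerializer,
                                 PickleSerializer)

rows = [(i, "value-%d" % i) for i in range(1000)]

# Fixed-size batching (this patch): pickle the stream 100 rows at a time,
# so at most 100 rows are buffered per batch.
fixed = BatchedSerializer(PickleSerializer(), 100)
fixed_buf = io.BytesIO()
fixed.dump_stream(iter(rows), fixed_buf)

# Auto-batching (previous behaviour): the batch keeps growing until the pickled
# size approaches a target, which can mean thousands of rows in one batch.
auto = AutoBatchedSerializer(PickleSerializer())
auto_buf = io.BytesIO()
auto.dump_stream(iter(rows), auto_buf)

# Both serializers round-trip the same rows; they differ only in how many rows
# land in each pickled batch (and therefore in peak memory per batch).
fixed_buf.seek(0)
auto_buf.seek(0)
assert list(fixed.load_stream(fixed_buf)) == rows
assert list(auto.load_stream(auto_buf)) == rows
```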
Similarly, do you mind adding Scaladoc for these two new parameters?