[SPARK-25004][CORE] Add spark.executor.pyspark.memory limit. #21977
Changes from 1 commit
```diff
@@ -114,6 +114,10 @@ package object config
     .checkValue(_ >= 0, "The off-heap memory size must not be negative")
     .createWithDefault(0)
 
+  private[spark] val PYSPARK_EXECUTOR_MEMORY = ConfigBuilder("spark.executor.pyspark.memory")
+    .bytesConf(ByteUnit.MiB)
+    .createOptional
```

**Contributor:** Argh, should have noticed this before. Should this be added to

**Contributor (Author):** Yes, it should. I'll fix it.

```diff
   private[spark] val IS_PYTHON_APP = ConfigBuilder("spark.yarn.isPython").internal()
     .booleanConf.createWithDefault(false)
```
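For context, one way a user could opt in to the new limit, assuming the session has not been created yet. This is a minimal sketch, not part of the patch; the app name and the 2g value are illustrative. Because the config is declared with `bytesConf(ByteUnit.MiB)`, a bare number is read as MiB and suffixed values such as "2g" are also accepted:

```python
from pyspark.sql import SparkSession

# Illustrative only: cap each executor's Python worker memory at 2 GiB.
spark = (
    SparkSession.builder
    .appName("pyspark-memory-demo")                  # hypothetical app name
    .config("spark.executor.pyspark.memory", "2g")   # new config from this PR
    .getOrCreate()
)
```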
```diff
@@ -22,6 +22,7 @@
 import os
 import sys
 import time
+import resource
 import socket
 import traceback
```

```diff
@@ -263,6 +264,27 @@ def main(infile, outfile):
     isBarrier = read_bool(infile)
     boundPort = read_int(infile)
     secret = UTF8Deserializer().loads(infile)
+
+    # set up memory limits
+    memory_limit_mb = int(os.environ.get('PYSPARK_EXECUTOR_MEMORY_MB', "-1"))
+    total_memory = resource.RLIMIT_AS
+    try:
+        (total_memory_limit, max_total_memory) = resource.getrlimit(total_memory)
+        msg = "Current mem: {0} of max {1}\n".format(total_memory_limit, max_total_memory)
+        sys.stderr.write(msg)
+
+        if memory_limit_mb > 0 and total_memory_limit < 0:
+            # convert to bytes
+            total_memory_limit = memory_limit_mb * 1024 * 1024
+
+            msg = "Setting mem to {0} of max {1}\n".format(total_memory_limit, max_total_memory)
+            sys.stderr.write(msg)
+            resource.setrlimit(total_memory, (total_memory_limit, total_memory_limit))
+
+    except (resource.error, OSError) as e:
+        # not all systems support resource limits, so warn instead of failing
+        sys.stderr.write("WARN: Failed to set memory limit: {0}\n".format(e))
+
     # initialize global state
     taskContext = None
     if isBarrier:
```
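To see what capping `RLIMIT_AS` does in practice, here is a standalone sketch that is not part of the patch; the 1 GiB cap and 2 GiB allocation are arbitrary. On Linux, once the address-space limit is in place, an oversized allocation fails with a Python `MemoryError` inside the Python process rather than growing until YARN kills the whole container:

```python
import resource

ONE_GIB = 1024 * 1024 * 1024

# Mirror the guard above: only lower the limit if it is currently unlimited.
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
if soft == resource.RLIM_INFINITY:
    resource.setrlimit(resource.RLIMIT_AS, (ONE_GIB, ONE_GIB))

try:
    buf = bytearray(2 * ONE_GIB)  # needs more address space than the cap allows
except MemoryError:
    print("allocation rejected once RLIMIT_AS was exceeded")
```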
```diff
@@ -91,6 +91,13 @@ private[spark] class Client(
   private val executorMemoryOverhead = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse(
     math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toLong, MEMORY_OVERHEAD_MIN)).toInt
 
+  private val isPython = sparkConf.get(IS_PYTHON_APP)
+
+  private val pysparkWorkerMemory: Int = if (isPython) {
+    sparkConf.get(PYSPARK_EXECUTOR_MEMORY).map(_.toInt).getOrElse(0)
+  } else {
+    0
+  }
+
   private val distCacheMgr = new ClientDistributedCacheManager()
 
   private val principal = sparkConf.get(PRINCIPAL).orNull
```

```diff
@@ -333,7 +340,7 @@ private[spark] class Client(
     val maxMem = newAppResponse.getMaximumResourceCapability().getMemory()
     logInfo("Verifying our application has not requested more than the maximum " +
       s"memory capability of the cluster ($maxMem MB per container)")
-    val executorMem = executorMemory + executorMemoryOverhead
+    val executorMem = executorMemory + executorMemoryOverhead + pysparkWorkerMemory
     if (executorMem > maxMem) {
       throw new IllegalArgumentException(s"Required executor memory ($executorMemory" +
         s"+$executorMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! " +
```
```diff
@@ -133,10 +133,17 @@ private[yarn] class YarnAllocator(
   // Additional memory overhead.
   protected val memoryOverhead: Int = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD).getOrElse(
     math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toInt, MEMORY_OVERHEAD_MIN)).toInt
+  protected val pysparkWorkerMemory: Int = if (sparkConf.get(IS_PYTHON_APP)) {
+    sparkConf.get(PYSPARK_EXECUTOR_MEMORY).map(_.toInt).getOrElse(0)
+  } else {
+    0
+  }
   // Number of cores per executor.
   protected val executorCores = sparkConf.get(EXECUTOR_CORES)
   // Resource capability requested for each executors
-  private[yarn] val resource = Resource.newInstance(executorMemory + memoryOverhead, executorCores)
+  private[yarn] val resource = Resource.newInstance(
+    executorMemory + memoryOverhead + pysparkWorkerMemory,
+    executorCores)
 
   private val launcherPool = ThreadUtils.newDaemonCachedThreadPool(
     "ContainerLauncher", sparkConf.get(CONTAINER_LAUNCH_MAX_THREADS))
```
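As a rough illustration of how the new term changes container sizing (the figures below are made up, not from the patch), the same sum appears in both the client's sanity check above and the allocator's resource request: with `spark.executor.memory=4g`, the default overhead, and `spark.executor.pyspark.memory=2g`, the per-container request grows from 4505 MB to 6553 MB.

```python
# Illustrative numbers only, mirroring the sizing arithmetic above.
executor_memory_mb = 4096                               # spark.executor.memory=4g
overhead_mb = max(int(0.10 * executor_memory_mb), 384)  # default overhead: 409 MB
pyspark_memory_mb = 2048                                # spark.executor.pyspark.memory=2g

executor_mem = executor_memory_mb + overhead_mb + pyspark_memory_mb
print(executor_mem)  # 4096 + 409 + 2048 = 6553 MB, checked against the cluster's maxMem
```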
```diff
@@ -23,6 +23,7 @@ import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType}
+import org.apache.spark.internal.config.PYSPARK_EXECUTOR_MEMORY
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
```

```diff
@@ -81,6 +82,17 @@ case class AggregateInPandasExec(
     val bufferSize = inputRDD.conf.getInt("spark.buffer.size", 65536)
     val reuseWorker = inputRDD.conf.getBoolean("spark.python.worker.reuse", defaultValue = true)
+    val memoryMb = {
+      val allocation = inputRDD.conf.get(PYSPARK_EXECUTOR_MEMORY)
+      if (reuseWorker) {
+        // the shared python worker gets the entire allocation
+        allocation
+      } else {
+        // each python worker gets an equal part of the allocation
+        allocation.map(_ / inputRDD.conf.getInt("spark.executor.cores", 1))
+      }
+    }
+
     val sessionLocalTimeZone = conf.sessionLocalTimeZone
     val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf)
```

```diff
@@ -139,6 +151,7 @@ case class AggregateInPandasExec(
       pyFuncs,
       bufferSize,
       reuseWorker,
+      memoryMb,
       PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
       argOffsets,
       aggInputSchema,
```
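A quick sketch of the split this block computes, with illustrative values rather than anything from the patch: a 4096 MiB allocation shared by one reused worker stays at 4096 MiB, while four per-core workers get 1024 MiB each.

```python
# Illustrative only: mirrors the memoryMb computation above.
allocation_mb = 4096      # spark.executor.pyspark.memory, in MiB
executor_cores = 4        # spark.executor.cores
reuse_worker = False      # spark.python.worker.reuse

if reuse_worker:
    # the shared python worker gets the entire allocation
    per_worker_mb = allocation_mb
else:
    # each python worker gets an equal part of the allocation
    per_worker_mb = allocation_mb // executor_cores

print(per_worker_mb)  # 1024
```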
```diff
@@ -23,6 +23,7 @@ import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.{SparkEnv, TaskContext}
 import org.apache.spark.api.python.ChainedPythonFunctions
+import org.apache.spark.internal.config.PYSPARK_EXECUTOR_MEMORY
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
```

```diff
@@ -80,6 +81,7 @@ abstract class EvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chil
       funcs: Seq[ChainedPythonFunctions],
       bufferSize: Int,
       reuseWorker: Boolean,
+      pyMemoryMb: Option[Long],
       argOffsets: Array[Array[Int]],
       iter: Iterator[InternalRow],
       schema: StructType,
```

```diff
@@ -89,6 +91,16 @@ abstract class EvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chil
     val inputRDD = child.execute().map(_.copy())
     val bufferSize = inputRDD.conf.getInt("spark.buffer.size", 65536)
     val reuseWorker = inputRDD.conf.getBoolean("spark.python.worker.reuse", defaultValue = true)
+    val memoryMb = {
+      val allocation = inputRDD.conf.get(PYSPARK_EXECUTOR_MEMORY)
+      if (reuseWorker) {
+        // the shared python worker gets the entire allocation
+        allocation
+      } else {
+        // each python worker gets an equal part of the allocation
+        allocation.map(_ / inputRDD.conf.getInt("spark.executor.cores", 1))
+      }
+    }
 
     inputRDD.mapPartitions { iter =>
       val context = TaskContext.get()
```

```diff
@@ -129,7 +141,7 @@ abstract class EvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chil
       }
 
       val outputRowIterator = evaluate(
-        pyFuncs, bufferSize, reuseWorker, argOffsets, projectedRowIter, schema, context)
+        pyFuncs, bufferSize, reuseWorker, memoryMb, argOffsets, projectedRowIter, schema, context)
 
       val joined = new JoinedRow
       val resultProj = UnsafeProjection.create(output, output)
```
**Contributor:** It's been a while since I spent a lot of time thinking about how we launch our python worker processes. Maybe it would make sense to add a comment here explaining the logic a bit more? Based on the documentation in `PythonWorkerFactory`, it appears we make the fork/not-fork decision not based on whether `reuseWorker` is set, but on whether we're on Windows. Is that the logic this block was attempting to handle?

**Contributor (Author):** I thought the comments below were clear: if a single worker is reused, it gets the entire allocation. If each core starts its own worker, each one gets an equal share. If `reuseWorker` is actually ignored, then this needs to be updated.

**Contributor:** I think there might be a misunderstanding about what `reuseWorker` means, perhaps. The workers will be reused, but the decision on whether we fork in Python or not is based on whether we are on Windows. How about we both go and read the code path there and see if we reach the same understanding? I could be off too.