Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3f8321a
Integration of ProcessTreeMetrics with PR 21221
Jul 26, 2018
cd16a75
Changing the position of ptree and also make the computation configur…
Aug 7, 2018
94c2b04
Separate metrics for jvm, python and others and update the tests
Aug 8, 2018
062f5d7
Update JsonProtocolSuite
Sep 25, 2018
245221d
[SPARK-24958] Add executors' process tree total memory information to…
Oct 2, 2018
c72be03
Addressing most of Imran's comments
Oct 3, 2018
8f3c938
Fixing the scala style and some minor comments
Oct 3, 2018
f2dca27
Removing types from the definitions where ever possible
Oct 4, 2018
a9f924c
Using Utils methods when possible or use ProcessBuilder
Oct 5, 2018
a11e3a2
make use of Utils.trywithresources
Oct 5, 2018
34ad625
Changing ExecutorMetricType and ExecutorMetrics to use a map instead o…
Oct 9, 2018
415f976
Changing ExecutorMetric to use array instead of a map
Oct 10, 2018
067b81d
A small cosmetic change
Oct 10, 2018
18ee4ad
Merge branch 'master' of https://github.com/apache/spark into ptreeme…
Oct 17, 2018
7f7ed2b
Applying latest review comments. Using Arrays instead of Map for ret…
Oct 23, 2018
f3867ff
Merge branch 'master' of https://github.com/apache/spark into ptreeme…
Nov 5, 2018
0f8f3e2
Fix an issue with JsonProtocolSuite
Nov 5, 2018
ea08c61
Fix scalastyle issue
Nov 5, 2018
8f20857
Applying latest review comments
Nov 14, 2018
6e65360
Using the companion object and other stuff
Nov 27, 2018
4659f4a
Update the use of process builder and applying other review comments
Nov 28, 2018
ef4be38
Small style fixes based on reviews
Nov 30, 2018
805741c
Applying review comments, mostly style related
Nov 30, 2018
4c1f073
Remove the unnecessary trywithresources
Nov 30, 2018
0a7402e
Applying the comment about error handling and some more style fixes
Dec 4, 2018
3d65b35
Removing a return
Dec 6, 2018
6eab315
Reordering of info in a test resource file to avoid confusion
Dec 6, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Applying latest review comments. Using Arrays instead of Map for ret…
…urning set of metrics.
  • Loading branch information
Reza Safi committed Oct 23, 2018
commit 7f7ed2bdf5740bd2c4ae8cf2090ba7f016ffb023
11 changes: 6 additions & 5 deletions core/src/main/scala/org/apache/spark/Heartbeater.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package org.apache.spark

import java.util.concurrent.TimeUnit

import org.apache.spark.deploy.history.LogInfo
import org.apache.spark.executor.ExecutorMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.memory.MemoryManager
Expand Down Expand Up @@ -60,12 +59,14 @@ private[spark] class Heartbeater(
heartbeater.awaitTermination(10, TimeUnit.SECONDS)
}

/** Get the current executor level metrics. These are returned as a Map */
/** Get the current executor level metrics. These are returned as an Array */
def getCurrentMetrics(): ExecutorMetrics = {
// figure out how to append all the metrics
var metrics = Map.empty[String, Long]
val metrics = new Array[Long](ExecutorMetricType.numberOfMetrics)
var offset = 0
ExecutorMetricType.metricGetters.foreach { metric =>
metrics ++= metric.getMetricSet(memoryManager)
val newSetOfMetrics = metric.getMetricSet(memoryManager)
Array.copy(newSetOfMetrics, 0, metrics, offset, newSetOfMetrics.size)
offset += newSetOfMetrics.length
}
new ExecutorMetrics(metrics)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,14 @@ import org.apache.spark.metrics.ExecutorMetricType
@DeveloperApi
class ExecutorMetrics private[spark] extends Serializable {

private val metrics = new Array[Long](ExecutorMetricType.definedMetrics.length)
private val metrics = new Array[Long](ExecutorMetricType.numberOfMetrics)
// the first element is initialized to -1, indicating that the values for the array
// haven't been set yet.
metrics(0) = -1

/** Returns the value for the specified metric. */
def getMetricValue(metricName: String): Long = {
metrics(ExecutorMetricType.metricIdxMap(metricName))
metrics(ExecutorMetricType.definedMetricsAndOffset.get(metricName).get)
}

/** Returns true if the values for the metrics have been set, false otherwise. */
Expand All @@ -55,8 +55,8 @@ class ExecutorMetrics private[spark] extends Serializable {
*/
private[spark] def this(executorMetrics: Map[String, Long]) {
this()
(0 until ExecutorMetricType.definedMetrics.length).foreach { idx =>
metrics(idx) = executorMetrics.getOrElse(ExecutorMetricType.definedMetrics(idx), 0L)
ExecutorMetricType.definedMetricsAndOffset.map { m =>
metrics(m._2) = executorMetrics.getOrElse(m._1, 0L)
}
}

Expand All @@ -69,10 +69,10 @@ class ExecutorMetrics private[spark] extends Serializable {
*/
private[spark] def compareAndUpdatePeakValues(executorMetrics: ExecutorMetrics): Boolean = {
var updated = false
(0 until ExecutorMetricType.definedMetrics.length).foreach { idx =>
if (executorMetrics.metrics(idx) > metrics(idx)) {
ExecutorMetricType.definedMetricsAndOffset.map {m =>
if (executorMetrics.metrics(m._2) > metrics(m._2)) {
updated = true
metrics(idx) = executorMetrics.metrics(idx)
metrics(m._2) = executorMetrics.metrics(m._2)
}
}
updated
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend
var pageSize = computePageSize()
var isAvailable: Boolean = isProcfsAvailable
private val pid = computePid()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pageSize is only a var for testing -- instead just optionally pass it in to the constructor

also I think all of these can be private.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I can't call computePageSize() in the constructor signature to compute the default value. Another solution is to check for testing inside computePageSize and if we are testing assign a value to it that is provided in the constructor (default to 4096).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can't put it as a default value, but if you make it a static method, then you can provide an overloaded method which uses it, see squito@cf00835

But I think your other proposal is even better: if it's testing, just give it a fixed value (no need to even make it an argument to the constructor at all).

private val ptree = mutable.Map[ Int, Set[Int]]()
private var ptree = mutable.Map[ Int, Set[Int]]()

var allMetrics: ProcfsBasedSystemsMetrics = ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0)

Expand Down Expand Up @@ -84,7 +84,7 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend
return pid;
}
catch {
case e: SparkException => logDebug("IO Exception when trying to compute process tree." +
case e: SparkException => logWarning("Exception when trying to compute process tree." +
" As a result reporting of ProcessTree metrics is stopped", e)
isAvailable = false
return -1
Expand All @@ -95,15 +95,23 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend
if (testing) {
return 0;
}
val cmd = Array("getconf", "PAGESIZE")
val out2 = Utils.executeAndGetOutput(cmd)
return Integer.parseInt(out2.split("\n")(0))
try {
val cmd = Array("getconf", "PAGESIZE")
val out2 = Utils.executeAndGetOutput(cmd)
return Integer.parseInt(out2.split("\n")(0))
} catch {
case e: Exception => logWarning("Exception when trying to compute pagesize, as a" +
" result reporting of ProcessTree metrics is stopped")
isAvailable = false
return 0
}
}

private def computeProcessTree(): Unit = {
if (!isAvailable || testing) {
return
}
ptree = mutable.Map[ Int, Set[Int]]()
val queue = mutable.Queue.empty[Int]
queue += pid
while( !queue.isEmpty ) {
Expand All @@ -121,34 +129,34 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend

private def getChildPids(pid: Int): ArrayBuffer[Int] = {
try {
val cmd = Array("pgrep", "-P", pid.toString)
// val cmd = Array("pgrep", "-P", pid.toString)
val builder = new ProcessBuilder("pgrep", "-P", pid.toString)
val process = builder.start()
val output = new StringBuilder()
// val output = new StringBuilder()
val threadName = "read stdout for " + "pgrep"
def appendToOutput(s: String): Unit = output.append(s).append("\n")
val childPidsInInt = mutable.ArrayBuffer.empty[Int]
def appendChildPid(s: String): Unit = {
if (s != "") {
logDebug("Found a child pid:" + s)
childPidsInInt += Integer.parseInt(s)
}
}
val stdoutThread = Utils.processStreamByLine(threadName,
process.getInputStream, appendToOutput)
process.getInputStream, appendChildPid)
val exitCode = process.waitFor()
stdoutThread.join()
// pgrep exits with code 1 when no process matched (i.e. there are no child processes)
// and with code 2 on a command-line syntax error; only codes above 2 are treated as failures
if (exitCode != 0 && exitCode > 2) {
logError(s"Process $cmd exited with code $exitCode: $output")
val cmd = builder.command().toArray.mkString(" ")
logWarning(s"Process $cmd" +
s" exited with code $exitCode, with stderr:" + s"${process.getErrorStream} ")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think process.getErrorStream will have a useful toString. I think you need to read all the data. You probably also have to do that before process.waitFor(); otherwise I think it's possible that the process blocks forever waiting for something to read stderr.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed this in the new patch, but I'm not sure if I addressed your concern. Please let me know.

throw new SparkException(s"Process $cmd exited with code $exitCode")
}
val childPids = output.toString.split("\n")
val childPidsInInt = mutable.ArrayBuffer.empty[Int]
for (p <- childPids) {
if (p != "") {
logDebug("Found a child pid: " + p)
childPidsInInt += Integer.parseInt(p)
}
}
childPidsInInt
} catch {
case e: IOException => logDebug("IO Exception when trying to compute process tree." +
" As a result reporting of ProcessTree metrics is stopped", e)
case e: Exception => logWarning("Exception when trying to compute process tree." +
" As a result reporting of ProcessTree metrics is stopped.", e)
isAvailable = false
return mutable.ArrayBuffer.empty[Int]
}
Expand All @@ -173,54 +181,42 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend
val vmem = procInfoSplit(22).toLong
val rssPages = procInfoSplit(23).toLong
if (procInfoSplit(1).toLowerCase(Locale.US).contains("java")) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this just be vmem and rssPages, rather than splitting into JVM, Python, and other? Can you explain more about how the separate values would be used?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is separated because knowing the usage of the main actors, such as the JVM, separately can be valuable to the user. We only consider the JVM (the pure Scala case) and Python (the PySpark case). Other categories can be added in the future if there is interest, but for now everything else falls under the "Other" category.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@edwinalu It would be nice to have a breakdown of the total memory being consumed. It's easier to tune the parameters knowing what is consuming all the memory. For example, if your container died from an OOM, it helps to know whether it was because of Python or the JVM. Also, R fits in the "Other" category, so it makes sense to have all 3 of them for now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't have much pyspark ourselves, but yes, it seems useful to have the breakdown, and it's easy to sum the values for the total.

allMetrics = ProcfsBasedSystemsMetrics(
allMetrics.jvmVmemTotal + vmem,
allMetrics.jvmRSSTotal + (rssPages*pageSize),
allMetrics.pythonVmemTotal,
allMetrics.pythonRSSTotal,
allMetrics.otherVmemTotal,
allMetrics.otherRSSTotal
allMetrics = allMetrics.copy(
jvmVmemTotal = allMetrics.jvmVmemTotal + vmem,
jvmRSSTotal = allMetrics.jvmRSSTotal + (rssPages*pageSize)
)
}
else if (procInfoSplit(1).toLowerCase(Locale.US).contains("python")) {
allMetrics = ProcfsBasedSystemsMetrics(
allMetrics.jvmVmemTotal,
allMetrics.jvmRSSTotal,
allMetrics.pythonVmemTotal + vmem,
allMetrics.pythonRSSTotal + (rssPages*pageSize),
allMetrics.otherVmemTotal,
allMetrics.otherRSSTotal
allMetrics = allMetrics.copy(
pythonVmemTotal = allMetrics.pythonVmemTotal + vmem,
pythonRSSTotal = allMetrics.pythonRSSTotal + (rssPages*pageSize)
)
}
else {
allMetrics = ProcfsBasedSystemsMetrics(
allMetrics.jvmVmemTotal,
allMetrics.jvmRSSTotal,
allMetrics.pythonVmemTotal,
allMetrics.pythonRSSTotal,
allMetrics.otherVmemTotal + vmem,
allMetrics.otherRSSTotal + (rssPages*pageSize)
allMetrics = allMetrics.copy(
otherVmemTotal = allMetrics.otherVmemTotal + vmem,
otherRSSTotal = allMetrics.otherRSSTotal + (rssPages*pageSize)
)
}
}
}
}
} catch {
case f: FileNotFoundException => logDebug("There was a problem with reading" +
" the stat file of the process", f)
case f: FileNotFoundException => logWarning("There was a problem with reading" +
" the stat file of the process. ", f)
}
}

private[spark] def computeAllMetrics(): Unit = {
private[spark] def computeAllMetrics(): ProcfsBasedSystemsMetrics = {
if (!isAvailable) {
allMetrics = ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0)
return
return ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0)
}
computeProcessTree
val pids = ptree.keySet
allMetrics = ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0)
for (p <- pids) {
computeProcessInfo(p)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the state used here is a little trickier than it needs to be.

computeProcessTree is updating a member variable, even though it's only used locally -- it would be easier to follow if instead it just returned the process tree, and then you passed it around. Also, I don't think you actually care about the tree, just the set of pids?

Similarly for allMetrics: it doesn't really need to be a member variable. Since its use is entirely contained within this function, you could just pass it around.

val pids = discoverPids()
var allMetrics = ...
for (p <- pids) {
  allMetrics = updateMetricsForProcess(allMetrics, p)
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tree was there in case we wanted to do some other stuff with it, but I guess we can add a tree structure when we actually need it. Right now, as you mentioned, we don't need it, so I will change it.
The allMetrics member was there for testing, but I can change the test anyway.

}
return allMetrics
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,19 @@ import org.apache.spark.memory.MemoryManager
*/
sealed trait ExecutorMetricType {
private[spark] def getMetricValue(memoryManager: MemoryManager): Long = 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function is unused

private[spark] def getMetricSet(memoryManager: MemoryManager): Map[String, Long] =
Map.empty[ String, Long]
private[spark] val name = getClass().getName().stripSuffix("$").split("""\.""").last
private[spark] def getMetricSet(memoryManager: MemoryManager): Array[Long] = {
new Array[Long](0)
}
private[spark] def names = Seq(getClass().getName().stripSuffix("$").split("""\.""").last)
}

private[spark] abstract class MemoryManagerExecutorMetricType(
f: MemoryManager => Long) extends ExecutorMetricType {
override private[spark] def getMetricSet(memoryManager: MemoryManager): Map[String, Long] = {
var metricAsSet = Map.empty[String, Long]
metricAsSet += (name -> f(memoryManager))
override private[spark] def getMetricSet(memoryManager: MemoryManager): Array[Long] = {
val metricAsSet = new Array[Long](names.length)
(0 until names.length ).foreach { idx =>
metricAsSet(idx) = (f(memoryManager))
}
metricAsSet
}
override private[spark] def getMetricValue(memoryManager: MemoryManager): Long = {
Expand All @@ -50,15 +53,22 @@ private[spark] abstract class MBeanExecutorMetricType(mBeanName: String)
ManagementFactory.getPlatformMBeanServer,
new ObjectName(mBeanName).toString, classOf[BufferPoolMXBean])

override private[spark] def getMetricSet(memoryManager: MemoryManager): Array[Long] = {
val metricAsSet = new Array[Long](1)
metricAsSet(0) = bean.getMemoryUsed
metricAsSet
}

override private[spark] def getMetricValue(memoryManager: MemoryManager): Long = {
bean.getMemoryUsed
}
}

case object JVMHeapMemory extends ExecutorMetricType {
override private[spark] def getMetricSet(memoryManager: MemoryManager): Map[String, Long] = {
var metricAsSet = Map.empty[String, Long]
metricAsSet += (name -> ManagementFactory.getMemoryMXBean.getHeapMemoryUsage().getUsed())

override private[spark] def getMetricSet(memoryManager: MemoryManager): Array[Long] = {
val metricAsSet = new Array[Long](1)
metricAsSet(0) = ( ManagementFactory.getMemoryMXBean.getHeapMemoryUsage().getUsed())
metricAsSet
}
override private[spark] def getMetricValue(memoryManager: MemoryManager): Long = {
Expand All @@ -67,9 +77,9 @@ case object JVMHeapMemory extends ExecutorMetricType {
}

case object JVMOffHeapMemory extends ExecutorMetricType {
override private[spark] def getMetricSet(memoryManager: MemoryManager): Map[String, Long] = {
var metricAsSet = Map.empty[String, Long]
metricAsSet += (name -> ManagementFactory.getMemoryMXBean.getNonHeapMemoryUsage().getUsed())
override private[spark] def getMetricSet(memoryManager: MemoryManager): Array[Long] = {
val metricAsSet = new Array[ Long](1)
metricAsSet(0) = ( ManagementFactory.getMemoryMXBean.getNonHeapMemoryUsage().getUsed())
metricAsSet
}
override private[spark] def getMetricValue(memoryManager: MemoryManager): Long = {
Expand All @@ -78,21 +88,22 @@ case object JVMOffHeapMemory extends ExecutorMetricType {
}

case object ProcessTreeMetrics extends ExecutorMetricType {
override private[spark] def getMetricSet(memoryManager: MemoryManager): Map[String, Long] = {
ExecutorMetricType.pTreeInfo.computeAllMetrics()
var processTreeMetrics = Map.empty[String, Long]
processTreeMetrics += ("ProcessTreeJVMVMemory" ->
ExecutorMetricType.pTreeInfo.allMetrics.jvmVmemTotal )
processTreeMetrics += ("ProcessTreeJVMRSSMemory" ->
ExecutorMetricType.pTreeInfo.allMetrics.jvmRSSTotal )
processTreeMetrics += ("ProcessTreePythonVMemory" ->
ExecutorMetricType.pTreeInfo.allMetrics.pythonVmemTotal )
processTreeMetrics += ("ProcessTreePythonRSSMemory" ->
ExecutorMetricType.pTreeInfo.allMetrics.pythonRSSTotal )
processTreeMetrics += ("ProcessTreeOtherVMemory" ->
ExecutorMetricType.pTreeInfo.allMetrics.otherVmemTotal )
processTreeMetrics += ("ProcessTreeOtherRSSMemory" ->
ExecutorMetricType.pTreeInfo.allMetrics.otherRSSTotal )
override val names = Seq(
"ProcessTreeJVMVMemory",
"ProcessTreeJVMRSSMemory",
"ProcessTreePythonVMemory",
"ProcessTreePythonRSSMemory",
"ProcessTreeOtherVMemory",
"ProcessTreeOtherRSSMemory")
override private[spark] def getMetricSet(memoryManager: MemoryManager): Array[Long] = {
val allMetrics = ExecutorMetricType.pTreeInfo.computeAllMetrics()
val processTreeMetrics = new Array[Long](names.length)
processTreeMetrics(0) = allMetrics.jvmVmemTotal
processTreeMetrics(1) = allMetrics.jvmRSSTotal
processTreeMetrics(2) = allMetrics.pythonVmemTotal
processTreeMetrics(3) = allMetrics.pythonRSSTotal
processTreeMetrics(4) = allMetrics.otherVmemTotal
processTreeMetrics(5) = allMetrics.otherRSSTotal
processTreeMetrics
}
}
Expand Down Expand Up @@ -138,26 +149,15 @@ private[spark] object ExecutorMetricType {
MappedPoolMemory,
ProcessTreeMetrics
)
// List of defined metrics
val definedMetrics = IndexedSeq(
"JVMHeapMemory",
"JVMOffHeapMemory",
"OnHeapExecutionMemory",
"OffHeapExecutionMemory",
"OnHeapStorageMemory",
"OffHeapStorageMemory",
"OnHeapUnifiedMemory",
"OffHeapUnifiedMemory",
"DirectPoolMemory",
"MappedPoolMemory",
"ProcessTreeJVMVMemory",
"ProcessTreeJVMRSSMemory",
"ProcessTreePythonVMemory",
"ProcessTreePythonRSSMemory",
"ProcessTreeOtherVMemory",
"ProcessTreeOtherRSSMemory"
)

val metricIdxMap =
Map[String, Int](ExecutorMetricType.definedMetrics.zipWithIndex: _*)
var definedMetricsAndOffset = Map.empty[String, Int]
var numberOfMetrics = 0
metricGetters.foreach { m =>
var metricInSet = 0
while (metricInSet < m.names.length) {
definedMetricsAndOffset += (m.names(metricInSet) -> (metricInSet + numberOfMetrics) )
metricInSet += 1
}
numberOfMetrics += m.names.length
}
}
6 changes: 3 additions & 3 deletions core/src/main/scala/org/apache/spark/status/api/v1/api.scala
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,9 @@ private[spark] class ExecutorMetricsJsonSerializer
jsonGenerator: JsonGenerator,
serializerProvider: SerializerProvider): Unit = {
metrics.foreach { m: ExecutorMetrics =>
val metricsMap = ExecutorMetricType.definedMetrics.map { metricType =>
metricType -> m.getMetricValue(metricType)
}.toMap
val metricsMap = ExecutorMetricType.definedMetricsAndOffset.map { case (metric, _) =>
metric -> m.getMetricValue(metric)
}
jsonGenerator.writeObject(metricsMap)
}
}
Expand Down
Loading