-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-24958][CORE] Add memory from procfs to executor metrics. #22612
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
3f8321a
cd16a75
94c2b04
062f5d7
245221d
c72be03
8f3c938
f2dca27
a9f924c
a11e3a2
34ad625
415f976
067b81d
18ee4ad
7f7ed2b
f3867ff
0f8f3e2
ea08c61
8f20857
6e65360
4659f4a
ef4be38
805741c
4c1f073
0a7402e
3d65b35
6eab315
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,29 +45,33 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend | |
| var pageSize = computePageSize() | ||
| var isAvailable: Boolean = isProcfsAvailable | ||
| private val pid = computePid() | ||
|
||
| private var ptree = mutable.Map[ Int, Set[Int]]() | ||
|
|
||
| var allMetrics: ProcfsBasedSystemsMetrics = ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0) | ||
| // var allMetrics: ProcfsBasedSystemsMetrics = ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0) | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| computeProcessTree() | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| private def isProcfsAvailable: Boolean = { | ||
| private lazy val isProcfsAvailable: Boolean = { | ||
| if (testing) { | ||
| return true | ||
| true | ||
| } | ||
| try { | ||
| if (!Files.exists(Paths.get(procfsDir))) { | ||
| return false | ||
| else { | ||
| var procDirExists = true | ||
| try { | ||
| if (!Files.exists(Paths.get(procfsDir))) { | ||
| procDirExists = false | ||
| } | ||
| } | ||
| catch { | ||
| case f: IOException => | ||
| logWarning("It seems that procfs isn't supported", f) | ||
| procDirExists = false | ||
| } | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| val shouldLogStageExecutorMetrics = | ||
| SparkEnv.get.conf.get(config.EVENT_LOG_STAGE_EXECUTOR_METRICS) | ||
| val shouldLogStageExecutorProcessTreeMetrics = | ||
| SparkEnv.get.conf.get(config.EVENT_LOG_PROCESS_TREE_METRICS) | ||
| procDirExists && shouldLogStageExecutorProcessTreeMetrics && shouldLogStageExecutorMetrics | ||
| } | ||
| catch { | ||
| case f: FileNotFoundException => return false | ||
| } | ||
| val shouldLogStageExecutorMetrics = | ||
| SparkEnv.get.conf.get(config.EVENT_LOG_STAGE_EXECUTOR_METRICS) | ||
| val shouldLogStageExecutorProcessTreeMetrics = | ||
| SparkEnv.get.conf.get(config.EVENT_LOG_PROCESS_TREE_METRICS) | ||
| shouldLogStageExecutorProcessTreeMetrics && shouldLogStageExecutorMetrics | ||
| } | ||
|
|
||
| private def computePid(): Int = { | ||
|
|
@@ -78,13 +82,13 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend | |
| // This can be simplified in java9: | ||
| // https://docs.oracle.com/javase/9/docs/api/java/lang/ProcessHandle.html | ||
| val cmd = Array("bash", "-c", "echo $PPID") | ||
| val length = 10 | ||
| val out2 = Utils.executeAndGetOutput(cmd) | ||
|
||
| val pid = Integer.parseInt(out2.split("\n")(0)) | ||
| return pid; | ||
| } | ||
| catch { | ||
| case e: SparkException => logWarning("Exception when trying to compute process tree." + | ||
| case e: SparkException => | ||
| logWarning("Exception when trying to compute process tree." + | ||
| " As a result reporting of ProcessTree metrics is stopped", e) | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| isAvailable = false | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| return -1 | ||
|
|
@@ -97,8 +101,8 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend | |
| } | ||
| try { | ||
| val cmd = Array("getconf", "PAGESIZE") | ||
| val out2 = Utils.executeAndGetOutput(cmd) | ||
| return Integer.parseInt(out2.split("\n")(0)) | ||
| val out = Utils.executeAndGetOutput(cmd) | ||
| return Integer.parseInt(out.split("\n")(0)) | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } catch { | ||
| case e: Exception => logWarning("Exception when trying to compute pagesize, as a" + | ||
| " result reporting of ProcessTree metrics is stopped") | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
@@ -107,24 +111,23 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend | |
| } | ||
| } | ||
|
|
||
| private def computeProcessTree(): Unit = { | ||
| private def computeProcessTree(): Set[Int] = { | ||
| if (!isAvailable || testing) { | ||
| return | ||
| return Set() | ||
| } | ||
| ptree = mutable.Map[ Int, Set[Int]]() | ||
| var ptree: Set[Int] = Set() | ||
| ptree += pid | ||
| val queue = mutable.Queue.empty[Int] | ||
| queue += pid | ||
| while( !queue.isEmpty ) { | ||
| val p = queue.dequeue() | ||
| val c = getChildPids(p) | ||
| if(!c.isEmpty) { | ||
| queue ++= c | ||
| ptree += (p -> c.toSet) | ||
| } | ||
| else { | ||
| ptree += (p -> Set[Int]()) | ||
| ptree ++= c.toSet | ||
| } | ||
| } | ||
| ptree | ||
| } | ||
|
|
||
| private def getChildPids(pid: Int): ArrayBuffer[Int] = { | ||
|
|
@@ -162,15 +165,17 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend | |
| } | ||
| } | ||
|
|
||
| def computeProcessInfo(pid: Int): Unit = { | ||
| /* | ||
| def computeProcessInfo(allMetrics: ProcfsBasedSystemsMetrics, pid: Int): | ||
| ProcfsBasedSystemsMetrics = { | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| /* | ||
| * Hadoop ProcfsBasedProcessTree class used regex and pattern matching to retrieve the memory | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| * info. I tried that but found it to be incorrect during tests, so I used plain string parsing | ||
| * instead. The computation of RSS and Vmem are based on proc(5): | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| * http://man7.org/linux/man-pages/man5/proc.5.html | ||
| */ | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| try { | ||
| val pidDir = new File(procfsDir, pid.toString) | ||
| var allMetricsUpdated = ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0) | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| Utils.tryWithResource( new InputStreamReader( | ||
| new FileInputStream( | ||
| new File(pidDir, procfsStatFile)), Charset.forName("UTF-8"))) { fReader => | ||
|
|
@@ -181,41 +186,42 @@ private[spark] class ProcfsBasedSystems(val procfsDir: String = "/proc/") extend | |
| val vmem = procInfoSplit(22).toLong | ||
| val rssPages = procInfoSplit(23).toLong | ||
| if (procInfoSplit(1).toLowerCase(Locale.US).contains("java")) { | ||
|
||
| allMetrics = allMetrics.copy( | ||
| allMetricsUpdated = allMetrics.copy( | ||
| jvmVmemTotal = allMetrics.jvmVmemTotal + vmem, | ||
| jvmRSSTotal = allMetrics.jvmRSSTotal + (rssPages*pageSize) | ||
| ) | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| else if (procInfoSplit(1).toLowerCase(Locale.US).contains("python")) { | ||
| allMetrics = allMetrics.copy( | ||
| allMetricsUpdated = allMetrics.copy( | ||
| pythonVmemTotal = allMetrics.pythonVmemTotal + vmem, | ||
| pythonRSSTotal = allMetrics.pythonRSSTotal + (rssPages*pageSize) | ||
| ) | ||
| } | ||
| else { | ||
| allMetrics = allMetrics.copy( | ||
| allMetricsUpdated = allMetrics.copy( | ||
| otherVmemTotal = allMetrics.otherVmemTotal + vmem, | ||
| otherRSSTotal = allMetrics.otherRSSTotal + (rssPages*pageSize) | ||
| ) | ||
| } | ||
| } | ||
| } | ||
| } | ||
| allMetricsUpdated | ||
| } catch { | ||
| case f: FileNotFoundException => logWarning("There was a problem with reading" + | ||
| " the stat file of the process. ", f) | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0) | ||
| } | ||
| } | ||
|
|
||
| private[spark] def computeAllMetrics(): ProcfsBasedSystemsMetrics = { | ||
| if (!isAvailable) { | ||
| return ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0) | ||
| } | ||
| computeProcessTree | ||
| val pids = ptree.keySet | ||
| allMetrics = ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0) | ||
| val pids = computeProcessTree | ||
| var allMetrics = ProcfsBasedSystemsMetrics(0, 0, 0, 0, 0, 0) | ||
| for (p <- pids) { | ||
| computeProcessInfo(p) | ||
| allMetrics = computeProcessInfo(allMetrics, p) | ||
rezasafi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| return allMetrics | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.