-
Notifications
You must be signed in to change notification settings - Fork 29k
SPARK-6735:[YARN] Adding properties to disable maximum number of executor failure's check or to make it relative to duration #5449
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ package org.apache.spark.deploy.yarn | |
| import java.util.Collections | ||
| import java.util.concurrent._ | ||
| import java.util.regex.Pattern | ||
| import java.util.Stack | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Put this up with the |
||
|
|
||
| import scala.collection.JavaConversions._ | ||
| import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} | ||
|
|
@@ -83,6 +84,9 @@ private[yarn] class YarnAllocator( | |
| private var executorIdCounter = 0 | ||
| @volatile private var numExecutorsFailed = 0 | ||
|
|
||
| @volatile private var executorFailureTimeStamps = new Stack[Long]() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we not want to use a queue here, because the first executor failures to leave the list should be the first ones that were added? |
||
| @volatile private var oldestRelativeExecutorFailure = -1L | ||
|
|
||
| @volatile private var targetNumExecutors = args.numExecutors | ||
|
|
||
| // Keep track of which container is running which executor to remove the executors later | ||
|
|
@@ -94,6 +98,13 @@ private[yarn] class YarnAllocator( | |
| // Additional memory overhead. | ||
| protected val memoryOverhead: Int = sparkConf.getInt("spark.yarn.executor.memoryOverhead", | ||
| math.max((MEMORY_OVERHEAD_FACTOR * executorMemory).toInt, MEMORY_OVERHEAD_MIN)) | ||
|
|
||
| // Maximum number of executor failures per minute | ||
| private val relativeMaxExecutorFailurePerMinute = | ||
| sparkConf.getInt("spark.yarn.max.executor.failuresPerMinute", -1) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These should be indented two spaces. This goes for a couple other places as well. |
||
| private val relativeMaxExecutorFailureEnabled = | ||
| if (relativeMaxExecutorFailurePerMinute == -1) true else false | ||
|
|
||
| // Number of cores per executor. | ||
| protected val executorCores = args.executorCores | ||
| // Resource capability requested for each executors | ||
|
|
@@ -119,7 +130,27 @@ private[yarn] class YarnAllocator( | |
|
|
||
| def getNumExecutorsRunning: Int = numExecutorsRunning | ||
|
|
||
| def getNumExecutorsFailed: Int = numExecutorsFailed | ||
| def getNumExecutorsFailed: Int = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This name is a little bit unclear. Maybe merge this method with |
||
| if (relativeMaxExecutorFailureEnabled) { | ||
| getRelevantNumExecutorsFailed | ||
| } else { | ||
| numExecutorsFailed.intValue | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns the relative number of executor failures within the specified window duration. | ||
| */ | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Extra newline |
||
| def getRelevantNumExecutorsFailed : Int = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we add a test for this logic? |
||
| val currentTime = System.currentTimeMillis / 1000 | ||
| val relevantWindowStartTime = currentTime - 60 | ||
| while(relevantWindowStartTime > oldestRelativeExecutorFailure && | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Need spaces before and after these parentheses. |
||
| executorFailureTimeStamps.size > 0){ | ||
| oldestRelativeExecutorFailure = executorFailureTimeStamps.pop | ||
| } | ||
| executorFailureTimeStamps.size + 1 | ||
| } | ||
|
|
||
| /** | ||
| * Number of container requests that have not yet been fulfilled. | ||
|
|
@@ -386,6 +417,9 @@ private[yarn] class YarnAllocator( | |
| ". Exit status: " + completedContainer.getExitStatus + | ||
| ". Diagnostics: " + completedContainer.getDiagnostics) | ||
| numExecutorsFailed += 1 | ||
| if (relativeMaxExecutorFailureEnabled) { | ||
| executorFailureTimeStamps.push(System.currentTimeMillis / 1000) | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's an extra space here. Also, the right side of this equation can just be
maxNumExecutorFailures == -1.