Commit d1c511c

Merge remote-tracking branch 'origin/master' into avro-ignore-extension
2 parents: 565e599 + 8b7d4f8

File tree: 50 files changed, +1349 −494 lines


bin/docker-image-tool.sh

Lines changed: 8 additions & 2 deletions
@@ -49,6 +49,7 @@ function build {
     # Set image build arguments accordingly if this is a source repo and not a distribution archive.
     IMG_PATH=resource-managers/kubernetes/docker/src/main/dockerfiles
     BUILD_ARGS=(
+      ${BUILD_PARAMS}
       --build-arg
       img_path=$IMG_PATH
       --build-arg
@@ -57,13 +58,14 @@ function build {
   else
     # Not passed as an argument to docker, but used to validate the Spark directory.
     IMG_PATH="kubernetes/dockerfiles"
-    BUILD_ARGS=()
+    BUILD_ARGS=(${BUILD_PARAMS})
   fi

   if [ ! -d "$IMG_PATH" ]; then
     error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark."
   fi
   local BINDING_BUILD_ARGS=(
+    ${BUILD_PARAMS}
     --build-arg
     base_img=$(image_ref spark)
   )
@@ -101,6 +103,8 @@ Options:
   -t tag  Tag to apply to the built image, or to identify the image to be pushed.
   -m      Use minikube's Docker daemon.
   -n      Build docker image with --no-cache
+  -b arg  Build arg to build or push the image. For multiple build args, this option needs to
+          be used separately for each build arg.

 Using minikube when building images will do so directly into minikube's Docker daemon.
 There is no need to push the images into minikube in that case, they'll be automatically
@@ -130,7 +134,8 @@ TAG=
 BASEDOCKERFILE=
 PYDOCKERFILE=
 NOCACHEARG=
-while getopts f:mr:t:n option
+BUILD_PARAMS=
+while getopts f:mr:t:n:b: option
 do
   case "${option}"
   in
@@ -139,6 +144,7 @@ do
     r) REPO=${OPTARG};;
     t) TAG=${OPTARG};;
     n) NOCACHEARG="--no-cache";;
+    b) BUILD_PARAMS=${BUILD_PARAMS}" --build-arg "${OPTARG};;
     m)
       if ! which minikube 1>/dev/null; then
         error "Cannot find minikube."

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 5 additions & 2 deletions
@@ -697,9 +697,12 @@ private[spark] class TaskSchedulerImpl(
    * do not also submit those same tasks. That also means that a task completion from an earlier
    * attempt can lead to the entire stage getting marked as successful.
    */
-  private[scheduler] def markPartitionCompletedInAllTaskSets(stageId: Int, partitionId: Int) = {
+  private[scheduler] def markPartitionCompletedInAllTaskSets(
+      stageId: Int,
+      partitionId: Int,
+      taskInfo: TaskInfo) = {
     taskSetsByStageIdAndAttempt.getOrElse(stageId, Map()).values.foreach { tsm =>
-      tsm.markPartitionCompleted(partitionId)
+      tsm.markPartitionCompleted(partitionId, taskInfo)
     }
   }

core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala

Lines changed: 5 additions & 2 deletions
@@ -758,7 +758,7 @@ private[spark] class TaskSetManager(
     }
     // There may be multiple tasksets for this stage -- we let all of them know that the partition
     // was completed. This may result in some of the tasksets getting completed.
-    sched.markPartitionCompletedInAllTaskSets(stageId, tasks(index).partitionId)
+    sched.markPartitionCompletedInAllTaskSets(stageId, tasks(index).partitionId, info)
     // This method is called by "TaskSchedulerImpl.handleSuccessfulTask" which holds the
     // "TaskSchedulerImpl" lock until exiting. To avoid the SPARK-7655 issue, we should not
     // "deserialize" the value when holding a lock to avoid blocking other threads. So we call
@@ -769,9 +769,12 @@ private[spark] class TaskSetManager(
     maybeFinishTaskSet()
   }

-  private[scheduler] def markPartitionCompleted(partitionId: Int): Unit = {
+  private[scheduler] def markPartitionCompleted(partitionId: Int, taskInfo: TaskInfo): Unit = {
     partitionToIndex.get(partitionId).foreach { index =>
       if (!successful(index)) {
+        if (speculationEnabled && !isZombie) {
+          successfulTaskDurations.insert(taskInfo.duration)
+        }
         tasksSuccessful += 1
         successful(index) = true
         if (tasksSuccessful == numTasks) {
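
The two hunks above, together with the TaskSchedulerImpl change before them, keep the count of successful tasks and the recorded task durations in sync when a partition is completed by another attempt of the same stage. A minimal, self-contained sketch of that invariant follows; DurationStats and its method names are illustrative stand-ins, not Spark's MedianHeap or TaskSetManager API.

object SpeculationDurationSketch {
  // Toy stand-in for the TaskSetManager bookkeeping: a count of successful
  // tasks plus the durations recorded for them.
  final case class DurationStats(durations: Vector[Long] = Vector.empty, succeeded: Int = 0) {
    // Record a completed task: bump the count AND remember its duration,
    // which is what the patch now does for partitions completed in other attempts.
    def recordSuccess(duration: Long): DurationStats =
      copy(durations = durations :+ duration, succeeded = succeeded + 1)

    // Median of the recorded durations; fails on an empty collection. In Spark
    // this is the SPARK-24677 scenario: the count says "enough tasks finished"
    // for a speculation check, but no durations were ever inserted.
    def medianDuration: Long = {
      val sorted = durations.sorted
      sorted(sorted.length / 2)
    }
  }

  def main(args: Array[String]): Unit = {
    val inSync = DurationStats().recordSuccess(80L).recordSuccess(120L).recordSuccess(95L)
    println(inSync.medianDuration)               // 95

    val outOfSync = DurationStats(succeeded = 3) // count bumped, no durations recorded
    println(outOfSync.durations.isEmpty)         // true; taking the median here would throw
  }
}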

core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala

Lines changed: 23 additions & 1 deletion
@@ -443,7 +443,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
       map{x => List(x)}.toList, "Tried coalescing 9 partitions to 20 but didn't get 9 back")
   }

-  test("coalesced RDDs with partial locality") {
+  test("coalesced RDDs with partial locality") {
     // Make an RDD that has some locality preferences and some without. This can happen
     // with UnionRDD
     val data = sc.makeRDD((1 to 9).map(i => {
@@ -846,6 +846,28 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
     assert(partitions(1) === Seq((1, 3), (3, 8), (3, 8)))
   }

+  test("cartesian on empty RDD") {
+    val a = sc.emptyRDD[Int]
+    val b = sc.parallelize(1 to 3)
+    val cartesian_result = Array.empty[(Int, Int)]
+    assert(a.cartesian(a).collect().toList === cartesian_result)
+    assert(a.cartesian(b).collect().toList === cartesian_result)
+    assert(b.cartesian(a).collect().toList === cartesian_result)
+  }
+
+  test("cartesian on non-empty RDDs") {
+    val a = sc.parallelize(1 to 3)
+    val b = sc.parallelize(2 to 4)
+    val c = sc.parallelize(1 to 1)
+    val a_cartesian_b =
+      Array((1, 2), (1, 3), (1, 4), (2, 2), (2, 3), (2, 4), (3, 2), (3, 3), (3, 4))
+    val a_cartesian_c = Array((1, 1), (2, 1), (3, 1))
+    val c_cartesian_a = Array((1, 1), (1, 2), (1, 3))
+    assert(a.cartesian[Int](b).collect().toList.sorted === a_cartesian_b)
+    assert(a.cartesian[Int](c).collect().toList.sorted === a_cartesian_c)
+    assert(c.cartesian[Int](a).collect().toList.sorted === c_cartesian_a)
+  }
+
   test("intersection") {
     val all = sc.parallelize(1 to 10)
     val evens = sc.parallelize(2 to 10 by 2)
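
For comparison with the expected values in the new tests, here is a plain-collections analogue of RDD.cartesian; the helper below is hypothetical and only mirrors the semantics the tests assert, it is not the RDD API.

object CartesianSketch {
  // Every pair (x, y) with x drawn from the left input and y from the right;
  // an empty input on either side yields an empty result, matching the
  // "cartesian on empty RDD" test above.
  def cartesian[A, B](left: Seq[A], right: Seq[B]): Seq[(A, B)] =
    for (x <- left; y <- right) yield (x, y)

  def main(args: Array[String]): Unit = {
    println(cartesian(1 to 3, 2 to 4))
    // Vector((1,2), (1,3), (1,4), (2,2), (2,3), (2,4), (3,2), (3,3), (3,4))
    println(cartesian(Seq.empty[Int], 1 to 3)) // List()
  }
}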

core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala

Lines changed: 49 additions & 0 deletions
@@ -1365,6 +1365,55 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg
     assert(taskOption4.get.addedJars === addedJarsMidTaskSet)
   }

+  test("[SPARK-24677] Avoid NoSuchElementException from MedianHeap") {
+    val conf = new SparkConf().set("spark.speculation", "true")
+    sc = new SparkContext("local", "test", conf)
+    // Set the speculation multiplier to be 0 so speculative tasks are launched immediately
+    sc.conf.set("spark.speculation.multiplier", "0.0")
+    sc.conf.set("spark.speculation.quantile", "0.1")
+    sc.conf.set("spark.speculation", "true")
+
+    sched = new FakeTaskScheduler(sc)
+    sched.initialize(new FakeSchedulerBackend())
+
+    val dagScheduler = new FakeDAGScheduler(sc, sched)
+    sched.setDAGScheduler(dagScheduler)
+
+    val taskSet1 = FakeTask.createTaskSet(10)
+    val accumUpdatesByTask: Array[Seq[AccumulatorV2[_, _]]] = taskSet1.tasks.map { task =>
+      task.metrics.internalAccums
+    }
+
+    sched.submitTasks(taskSet1)
+    sched.resourceOffers(
+      (0 until 10).map { idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) })
+
+    val taskSetManager1 = sched.taskSetManagerForAttempt(0, 0).get
+
+    // fail fetch
+    taskSetManager1.handleFailedTask(
+      taskSetManager1.taskAttempts.head.head.taskId, TaskState.FAILED,
+      FetchFailed(null, 0, 0, 0, "fetch failed"))
+
+    assert(taskSetManager1.isZombie)
+    assert(taskSetManager1.runningTasks === 9)
+
+    val taskSet2 = FakeTask.createTaskSet(10, stageAttemptId = 1)
+    sched.submitTasks(taskSet2)
+    sched.resourceOffers(
+      (11 until 20).map { idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) })
+
+    // Complete two tasks and leave 8 tasks running
+    for (id <- Set(0, 1)) {
+      taskSetManager1.handleSuccessfulTask(id, createTaskResult(id, accumUpdatesByTask(id)))
+      assert(sched.endedTasks(id) === Success)
+    }
+
+    val taskSetManager2 = sched.taskSetManagerForAttempt(0, 1).get
+    assert(!taskSetManager2.successfulTaskDurations.isEmpty())
+    taskSetManager2.checkSpeculatableTasks(0)
+  }
+
   private def createTaskResult(
       id: Int,
       accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty): DirectTaskResult[Int] = {

dev/deps/spark-deps-hadoop-2.6

Lines changed: 4 additions & 3 deletions
@@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar
 RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.7.jar
@@ -157,8 +157,9 @@ objenesis-2.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar

dev/deps/spark-deps-hadoop-2.7

Lines changed: 4 additions & 3 deletions
@@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar
 RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.7.jar
@@ -158,8 +158,9 @@ objenesis-2.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar

dev/deps/spark-deps-hadoop-3.1

Lines changed: 4 additions & 3 deletions
@@ -4,7 +4,7 @@ RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 accessors-smart-1.2.jar
 activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.7.jar
@@ -176,8 +176,9 @@ okhttp-2.7.5.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar

docs/mllib-data-types.md

Lines changed: 2 additions & 2 deletions
@@ -317,7 +317,7 @@ Refer to the [`Matrix` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.
 from pyspark.mllib.linalg import Matrix, Matrices

 # Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
-dm2 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
+dm2 = Matrices.dense(3, 2, [1, 3, 5, 2, 4, 6])

 # Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
 sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])
@@ -624,7 +624,7 @@ from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

 # Create an RDD of coordinate entries.
 #   - This can be done explicitly with the MatrixEntry class:
-entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)])
+entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(2, 1, 3.7)])
 #   - or using (long, long, float) tuples:
 entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)])
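
The first doc fix above is about value ordering: Matrices.dense reads its values column by column, so the matrix ((1,2),(3,4),(5,6)) must be passed as [1, 3, 5, 2, 4, 6]. A small, self-contained Scala sketch of that column-major mapping (a hypothetical helper, not the MLlib API):

object ColumnMajorSketch {
  // Rebuild a numRows x numCols matrix from a flat array stored column-major:
  // entry (i, j) lives at index j * numRows + i.
  def denseColumnMajor(numRows: Int, numCols: Int, values: Array[Double]): Array[Array[Double]] = {
    require(values.length == numRows * numCols, "values must contain numRows * numCols elements")
    Array.tabulate(numRows, numCols)((i, j) => values(j * numRows + i))
  }

  def main(args: Array[String]): Unit = {
    // Same values as the corrected doc example: column-major [1, 3, 5, 2, 4, 6].
    val m = denseColumnMajor(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0))
    m.foreach(row => println(row.mkString(" ")))
    // 1.0 2.0
    // 3.0 4.0
    // 5.0 6.0
  }
}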

external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala

Lines changed: 6 additions & 7 deletions
@@ -58,6 +58,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
       options: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
     val conf = spark.sparkContext.hadoopConfiguration
+    val parsedOptions = new AvroOptions(options)

     // Schema evolution is not supported yet. Here we only pick a single random sample file to
     // figure out the schema of the whole dataset.
@@ -74,7 +75,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
     }

     // User can specify an optional avro json schema.
-    val avroSchema = options.get(AvroFileFormat.AvroSchema)
+    val avroSchema = parsedOptions.schema
       .map(new Schema.Parser().parse)
       .getOrElse {
         val in = new FsInput(sampleFile.getPath, conf)
@@ -112,10 +113,9 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
       job: Job,
       options: Map[String, String],
       dataSchema: StructType): OutputWriterFactory = {
-    val recordName = options.getOrElse("recordName", "topLevelRecord")
-    val recordNamespace = options.getOrElse("recordNamespace", "")
+    val parsedOptions = new AvroOptions(options)
     val outputAvroSchema = SchemaConverters.toAvroType(
-      dataSchema, nullable = false, recordName, recordNamespace)
+      dataSchema, nullable = false, parsedOptions.recordName, parsedOptions.recordNamespace)

     AvroJob.setOutputKeySchema(job, outputAvroSchema)
     val AVRO_COMPRESSION_CODEC = "spark.sql.avro.compression.codec"
@@ -158,11 +158,12 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {

     val broadcastedConf =
       spark.sparkContext.broadcast(new AvroFileFormat.SerializableConfiguration(hadoopConf))
+    val parsedOptions = new AvroOptions(options)

     (file: PartitionedFile) => {
       val log = LoggerFactory.getLogger(classOf[AvroFileFormat])
       val conf = broadcastedConf.value.value
-      val userProvidedSchema = options.get(AvroFileFormat.AvroSchema).map(new Schema.Parser().parse)
+      val userProvidedSchema = parsedOptions.schema.map(new Schema.Parser().parse)

       // TODO Removes this check once `FileFormat` gets a general file filtering interface method.
       // Doing input file filtering is improper because we may generate empty tasks that process no
@@ -233,8 +234,6 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
 private[avro] object AvroFileFormat {
   val IgnoreFilesWithoutExtensionProperty = "avro.mapred.ignore.inputs.without.extension"

-  val AvroSchema = "avroSchema"
-
   class SerializableConfiguration(@transient var value: Configuration)
       extends Serializable with KryoSerializable {
     @transient private[avro] lazy val log = LoggerFactory.getLogger(getClass)
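
The recurring change in this file is replacing ad-hoc lookups on the raw options map with a single parsed-options object. Below is a minimal sketch of that wrapper pattern; the key names and defaults ("avroSchema", "topLevelRecord", the empty namespace) are the ones visible in the removed lines, but the class body is illustrative, not Spark's actual AvroOptions.

class AvroOptionsSketch(parameters: Map[String, String]) {
  // Optional user-provided Avro JSON schema, previously read via the
  // AvroFileFormat.AvroSchema key at each call site.
  val schema: Option[String] = parameters.get("avroSchema")

  // Top-level record name/namespace used when converting a Catalyst schema to Avro.
  val recordName: String = parameters.getOrElse("recordName", "topLevelRecord")
  val recordNamespace: String = parameters.getOrElse("recordNamespace", "")
}

object AvroOptionsSketch {
  def main(args: Array[String]): Unit = {
    val parsed = new AvroOptionsSketch(Map("recordName" -> "event"))
    println(parsed.schema)          // None
    println(parsed.recordName)      // event
    println(parsed.recordNamespace) // (empty string)
  }
}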
