@@ -188,6 +188,9 @@ class OrcFileFormat
if (enableVectorizedReader) {
val batchReader = new OrcColumnarBatchReader(
enableOffHeapColumnVector && taskContext.isDefined, copyToSpark, capacity)
val iter = new RecordReaderIterator(batchReader)
Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close()))
Member:
Could you please add a comment explaining why we put this registration here, referencing SPARK-23399? Since we will forget this investigation in the future :), the comment will help us and remind us to run the test case manually.

Member Author:
Sure!
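
A possible shape for that inline comment (illustrative wording only, not a quote of the final commit):

```scala
// SPARK-23399: Register the task completion listener first, i.e. before
// initialize()/initBatch(). If the task is killed or initBatch() throws,
// the listener still closes `iter`, so the opened ORC file does not leak.
// See the ignored test in OrcSourceSuite, which must be run manually.
```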


batchReader.initialize(fileSplit, taskAttemptContext)
Member Author @dongjoon-hyun (Feb 12, 2018):
According to the reported case, the ORC file is opened here.
But it seems that the task is killed (TaskKilled (Stage cancelled)) during initBatch, before its listener is registered. For a case that throws an exception in initBatch, this PR prevents the open-file leak.
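
As a minimal sketch of the general pattern (simplified names; `safeOpen` is a hypothetical helper, not part of this PR):

```scala
import org.apache.spark.TaskContext

// Register the completion listener BETWEEN acquiring the resource and the
// failure-prone initialization: if init is interrupted or throws, the
// listener still closes the resource when the task completes.
def safeOpen[T <: java.io.Closeable](acquire: () => T)(init: T => Unit): T = {
  val resource = acquire()                       // file handle is now open
  Option(TaskContext.get())
    .foreach(_.addTaskCompletionListener(_ => resource.close()))
  init(resource)                                 // may throw or be killed here
  resource
}
```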

Member Author:
@cloud-fan and @gatorsmile, could you take a look at this?
For the ORC library, it looks okay as long as we call close correctly.
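
A minimal sketch of that ORC-side contract, using the ORC core API directly (the path is a placeholder):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.orc.OrcFile

// As long as close() is eventually called on the row reader, the underlying
// input stream is released and no filesystem connection leaks; the listener
// added in this PR exists to guarantee that call on every completion path.
val reader = OrcFile.createReader(
  new Path("/tmp/example.orc"), OrcFile.readerOptions(new Configuration()))
val rows = reader.rows()         // opens the data stream
try {
  // ... consume rows ...
} finally {
  rows.close()                   // releases the stream
}
```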

Member @viirya (Feb 13, 2018):

@dongjoon-hyun Thanks for this fix! My question is: how do we know that close was not called before this change and is called now? Have you verified it?

Member:
I tried to verify it manually on my local machine, and it seems close is called even before this change. Maybe I am missing something, or it is environment-dependent.

batchReader.initBatch(
reader.getSchema,
@@ -196,8 +199,6 @@ class OrcFileFormat
partitionSchema,
file.partitionValues)

val iter = new RecordReaderIterator(batchReader)
Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close()))
iter.asInstanceOf[Iterator[InternalRow]]
} else {
val orcRecordReader = new OrcInputFormat[OrcStruct]
@@ -20,9 +20,11 @@ package org.apache.spark.sql.execution.datasources.orc
import java.io.File
import java.util.Locale

import org.apache.hadoop.fs.Path
import org.apache.orc.OrcConf.COMPRESS
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.SparkException
import org.apache.spark.sql.Row
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSQLContext
@@ -160,6 +162,25 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
}
}
}

// This should be tested manually because it intentionally raises an OOM
// in order to cause a `Leaked filesystem connection` warning. The test suite dies, too.
Contributor:

Ah, nice trick to fail the reader midway!

But it's a little weird to have it as a unit test; shall we just put it in the PR description and say it's manually tested? This test needs to be run manually anyway...

Member Author:

Sure!

ignore("SPARK-23399 Register a task completion listner first for OrcColumnarBatchReader") {
withSQLConf(SQLConf.ORC_VECTORIZED_READER_BATCH_SIZE.key -> s"${Int.MaxValue}") {
withTempDir { dir =>
val basePath = dir.getCanonicalPath
Seq(0).toDF("a").write.format("orc").save(new Path(basePath, "first").toString)
Seq(1).toDF("a").write.format("orc").save(new Path(basePath, "second").toString)
val df = spark.read.orc(
new Path(basePath, "first").toString,
new Path(basePath, "second").toString)
val e = intercept[SparkException] {
df.collect()
}
assert(e.getCause.isInstanceOf[OutOfMemoryError])
}
}
}
Member Author @dongjoon-hyun (Feb 13, 2018):

Hi, all.
The above test case reproduces the same leakage reported in the JIRA, and this PR fixes it. Please try this test case in IntelliJ against the master branch.
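
For reference, a rough spark-shell equivalent of the ignored test above (paths are placeholders; the `ORC_IMPLEMENTATION` setting is an extra assumption, in case the native reader is not the default in your build):

```scala
// Without this PR, the OutOfMemoryError raised while allocating the
// Int.MaxValue-sized batch in initBatch leaves the already-opened ORC
// file leaked; with it, the completion listener closes the file.
import org.apache.spark.sql.internal.SQLConf
spark.conf.set(SQLConf.ORC_IMPLEMENTATION.key, "native")
spark.conf.set(SQLConf.ORC_VECTORIZED_READER_BATCH_SIZE.key, Int.MaxValue.toString)
Seq(0).toDF("a").write.format("orc").save("/tmp/spark-23399/first")
Seq(1).toDF("a").write.format("orc").save("/tmp/spark-23399/second")
spark.read.orc("/tmp/spark-23399/first", "/tmp/spark-23399/second").collect() // expect OOM
```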

}

class OrcSourceSuite extends OrcSuite with SharedSQLContext {