Changes from 1 commit
Update some comments
JoshRosen committed Jul 28, 2015
commit 16c44e253998c9a0ced7fcde6a16ed7fea17392f
@@ -62,22 +62,20 @@ case class ScriptTransformation(
def processIterator(inputIterator: Iterator[InternalRow]): Iterator[InternalRow] = {
val cmd = List("/bin/bash", "-c", script)
val builder = new ProcessBuilder(cmd)
// We need to start threads connected to the process pipeline:
// 1) The error msg generated by the script process would be hidden.
// 2) If the error msg is too big to chock up the buffer, the input logic would be hung

val proc = builder.start()
val inputStream = proc.getInputStream
val outputStream = proc.getOutputStream
val errorStream = proc.getErrorStream
val reader = new BufferedReader(new InputStreamReader(inputStream))

// TODO make the 2048 configurable?
// In order to avoid deadlocks, we need to consume the error output of the child process.
// To avoid issues caused by large error output, we use a circular buffer to limit the amount
// of error output that we retain. See SPARK-7862 for more discussion of the deadlock / hang
// that motivates this.
val stderrBuffer = new CircularBuffer(2048)

// Consume the error stream from the pipeline, otherwise it will be blocked if
// the pipeline is full.
new RedirectThread(errorStream, // input stream from the pipeline
stderrBuffer, // output to a circular buffer
new RedirectThread(
errorStream,
stderrBuffer,
"Thread-ScriptTransformation-STDERR-Consumer").start()

val outputProjection = new InterpretedProjection(input, child.output)
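
The pattern in the updated comment above -- drain the child's stderr on its own thread into a bounded buffer so the child can never block on a full stderr pipe -- can be sketched outside Spark. The sketch below is illustrative only: BoundedSink and StderrDrain are hypothetical stand-ins for the CircularBuffer and RedirectThread utilities that the patch actually uses.

import java.io.InputStream

// Minimal stand-in for Spark's CircularBuffer: keeps only the last `capacity` characters,
// so a script that produces unbounded stderr cannot exhaust memory.
class BoundedSink(capacity: Int) {
  private val buf = new StringBuilder
  def append(s: String): Unit = synchronized {
    buf.append(s)
    if (buf.length > capacity) buf.delete(0, buf.length - capacity)
  }
  override def toString: String = synchronized(buf.toString)
}

object StderrDrain {
  // Copy the child's stderr into the bounded sink on a daemon thread, so the child never
  // blocks on a full stderr pipe -- the deadlock/hang described in SPARK-7862.
  def start(stderr: InputStream, sink: BoundedSink): Thread = {
    val t = new Thread("stderr-drain") {
      override def run(): Unit = {
        val chunk = new Array[Byte](1024)
        var n = stderr.read(chunk)
        while (n != -1) {
          sink.append(new String(chunk, 0, n))
          n = stderr.read(chunk)
        }
      }
    }
    t.setDaemon(true)
    t.start()
    t
  }
}

Because the sink keeps only the most recent output, even a script that spews arbitrary amounts of stderr can neither hang the pipeline nor exhaust memory here.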
@@ -86,10 +84,8 @@ case class ScriptTransformation(
// inside of a loop
@Nullable val (inputSerde, inputSoi) = ioschema.initInputSerDe(input).getOrElse((null, null))

// Put the write(output to the pipeline) into a single thread
// and keep the collector as remain in the main thread.
// otherwise it will causes deadlock if the data size greater than
// the pipeline / buffer capacity.
// This new thread will consume the ScriptTransformation's input rows and write them to the
// external process. That process's output will be read by this current thread.
val writerThread = new ScriptTransformationWriterThread(
inputIterator,
outputProjection,
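
The updated comment describes the division of labor that avoids the pipe-capacity deadlock: one thread writes the transformation's input rows to the child's stdin while the current thread reads the child's stdout. Below is a rough, self-contained sketch of that shape, using /bin/cat as a stand-in for the user script; it is not the actual ScriptTransformationWriterThread.

import java.io.{BufferedReader, InputStreamReader, PrintWriter}

object WriterThreadSketch {
  def main(args: Array[String]): Unit = {
    // /bin/cat stands in for the user's transformation script.
    val proc = new ProcessBuilder("/bin/cat").start()

    // Writer thread: feeds every input row into the child's stdin, then closes it.
    val writer = new Thread("script-stdin-writer") {
      override def run(): Unit = {
        val out = new PrintWriter(proc.getOutputStream)
        try (1 to 100000).foreach(i => out.println(s"row $i"))
        finally out.close()
      }
    }
    writer.start()

    // Current thread: consumes the child's stdout concurrently with the writer.
    val reader = new BufferedReader(new InputStreamReader(proc.getInputStream))
    val rowsRead = Iterator.continually(reader.readLine()).takeWhile(_ != null).size
    writer.join()
    println(s"read $rowsRead rows back from the child process")
  }
}

If both the writing and the reading happened on one thread, a child that blocks on writing its output until its input is drained (or vice versa) would deadlock as soon as the data outgrows the OS pipe buffers.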
@@ -108,6 +104,7 @@
ioschema.initOutputSerDe(output).getOrElse((null, null))
}

val reader = new BufferedReader(new InputStreamReader(inputStream))
val outputIterator: Iterator[InternalRow] = new Iterator[InternalRow] with HiveInspectors {
Contributor commented:

What will happen if not all the items in outputIterator are consumed? Will the process be killed?

Contributor Author (JoshRosen) replied:

I'm not sure. It's possible that the process might be leaked. I don't think that this behavior is affected by this patch, though, so let's follow up on it in a later patch (we can test this out with a SparkPlanTest test case which places a Limit around a ScriptTransformation).
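
For context on the question above, none of the following is part of the patch: one hypothetical way to avoid leaking the child process when the output iterator is fully consumed would be a wrapper like the sketch below; a consumer that stops early (e.g. a Limit that never drains the iterator) would still need a separate cleanup hook, such as a task-completion callback.

// Hypothetical guard, not part of this patch: destroy the child process once the output
// iterator is exhausted. Early termination never reaches this cleanup, so it would need
// an additional external hook.
def withProcessCleanup[T](underlying: Iterator[T], proc: Process): Iterator[T] = new Iterator[T] {
  private var cleaned = false
  private def cleanup(): Unit = if (!cleaned) { cleaned = true; proc.destroy() }
  override def hasNext: Boolean = {
    val more = underlying.hasNext
    if (!more) cleanup()
    more
  }
  override def next(): T = underlying.next()
}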

var cacheRow: InternalRow = null
var curLine: String = null