[SPARK-11500][SQL] Sort file statuses, partitioned tables for the test and rename variable.
apache · HyukjinKwon · Nov 6, 2015 · Nov 6, 2015 · Nov 6, 2015 · Nov 9, 2015
commit 4f4706352c84469503ae3c3388098458b570f62f
diff --git a/...e/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/...e/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala
@@ -398,7 +398,7 @@ private[sql] class ParquetRelation(
         val leaves = currentLeafStatuses.filter { f =>
           isSummaryFile(f.getPath) ||
             !(f.getPath.getName.startsWith("_") || f.getPath.getName.startsWith("."))
-        }.toArray
+        }.toArray.sortWith(_.getPath.toString < _.getPath.toString)
 
         dataStatuses = leaves.filterNot(f => isSummaryFile(f.getPath))
         metadataStatuses =
@@ -465,17 +465,18 @@ private[sql] class ParquetRelation(
           // the ordering of the output columns. There are several things to mention here.
           //
           //  1. If mergeRespectSummaries config is false, then it merges schemas by reducing from
-          //     the first part-file so that the columns of the first file show first.
+          //     the first part-file so that the columns of the lexicographically first file show
+          //     first.
           //
           //  2. If mergeRespectSummaries config is true, then there should be, at least,
-          //     "_metadata"s for all given files. So, we can ensure the columns of the first file
-          //     show first.
+          //     "_metadata"s for all given files, so that we can ensure the columns of
+          //     the lexicographically first file show first.
           //
           //  3. If shouldMergeSchemas is false, but when multiple files are given, there is
           //     no guarantee of the output order, since there might not be a summary file for the
-          //     first file, which ends up putting ahead the columns of the other files. However,
-          //     this should be okay since not enabling shouldMergeSchemas means (assumes) all the
-          //     files have the same schemas.
+          //     lexicographically first file, which ends up putting the columns of the other
+          //     files ahead. However, this should be okay since not enabling
+          //     shouldMergeSchemas means (assumes) all the files have the same schemas.
 
           val needMerged: Seq[FileStatus] =
             if (mergeRespectSummaries) {

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala
@@ -787,10 +787,10 @@ private[sql] object HadoopFsRelation extends Logging {
         status.getAccessTime)
     }.collect()
 
-    val fakeStatusesSeq = fakeStatuses.map { f =>
+    val hadoopFakeStatuses = fakeStatuses.map { f =>
       new FileStatus(
         f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, new Path(f.path))
     }
-    mutable.LinkedHashSet(fakeStatusesSeq: _*)
+    mutable.LinkedHashSet(hadoopFakeStatuses: _*)
   }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala
@@ -160,16 +160,17 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest {
     import testImplicits._
     withSQLConf(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true") {
       withTempPath { dir =>
-        val pathOne = s"${dir.getCanonicalPath}/table1"
+        val pathOne = s"${dir.getCanonicalPath}/part=1"
         Seq(1, 1).zipWithIndex.toDF("a", "b").write.parquet(pathOne)
-        val pathTwo = s"${dir.getCanonicalPath}/table2"
+        val pathTwo = s"${dir.getCanonicalPath}/part=2"
         Seq(1, 1).zipWithIndex.toDF("c", "b").write.parquet(pathTwo)
-        val pathThree = s"${dir.getCanonicalPath}/table3"
+        val pathThree = s"${dir.getCanonicalPath}/part=3"
         Seq(1, 1).zipWithIndex.toDF("d", "b").write.parquet(pathThree)
 
-        // Here the columns shows according to the order of given files.
-        assert(sqlContext.read.parquet(pathOne, pathTwo, pathThree).schema.map(_.name)
-          === Seq("a", "b", "c", "d"))
+        // The merged schema lists the columns of the lexicographically first part-file first,
+        // and the partition column last.
+        assert(sqlContext.read.parquet(dir.getCanonicalPath).schema.map(_.name)
+          === Seq("a", "b", "c", "d", "part"))
       }
     }
   }