expanded test for more cases
BryanCutler committed Oct 30, 2018
commit 0d77b0051546eb3a178a06a95f35ccb20ddeeff6
24 changes: 20 additions & 4 deletions python/pyspark/sql/tests.py
@@ -4435,10 +4435,26 @@ def test_timestamp_dst(self):
         self.assertPandasEqual(pdf, df_from_pandas.toPandas())
 
     def test_toPandas_batch_order(self):
-        df = self.spark.range(64, numPartitions=8).toDF("a")
-        with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 4}):
-            pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
-            self.assertPandasEqual(pdf, pdf_arrow)
+
+        # Collects Arrow RecordBatches out of order in driver JVM then re-orders in Python
+        def run_test(num_records, num_parts, max_records):
+            df = self.spark.range(num_records, numPartitions=num_parts).toDF("a")
+            with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": max_records}):
+                pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
+                self.assertPandasEqual(pdf, pdf_arrow)
+
+        cases = [
+            (1024, 512, 2),  # Try large num partitions for good chance of not collecting in order
+            (512, 64, 2),    # Try medium num partitions to test out of order collection
+            (64, 8, 2),      # Try small number of partitions to test out of order collection
+            (64, 64, 1),     # Test single batch per partition
+            (64, 1, 64),     # Test single partition, single batch
+            (64, 1, 8),      # Test single partition, multiple batches
+            (30, 7, 2),      # Test different sized partitions
+        ]
BryanCutler (Member, Author) commented:
@holdenk and @felixcheung , I didn't do a loop but chose some different levels of partition numbers to be a bit more sure that partitions won't end up in order. I also added some other cases of different partition/batch ratios. Let me know if you think we need more to be sure here.

Contributor commented:
I don't see how we're guaranteeing out-of-order from the JVM. Could we delay on one of the early partitions to guarantee out of order?

BryanCutler (Member, Author) replied on Nov 6, 2018:
Yeah it's not a guarantee, but with a large num of partitions, it's a pretty slim chance they will all be in order. I can also add a case with some delay. My only concern is how big to make the delay to be sure it's enough without adding wasted time to the tests.

How about we keep the case with a large number of partitions and add a case with 100ms delay on the first partition?
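
(A minimal sketch of what such a delayed-first-partition case could look like, assuming it lives inside the same test method; `delay_first_part` is a hypothetical helper named here for illustration, and the actual follow-up commit may differ.)

```python
import time

def delay_first_part(partition_index, iterator):
    # Sleep briefly only in partition 0 so later partitions are likely
    # to reach the driver first, forcing out-of-order batch collection.
    if partition_index == 0:
        time.sleep(0.1)
    return iterator

df = self.spark.range(64, numPartitions=8).toDF("a")
delayed_df = df.rdd.mapPartitionsWithIndex(delay_first_part).toDF()
with self.sql_conf({"spark.sql.execution.arrow.maxRecordsPerBatch": 2}):
    pdf, pdf_arrow = self._toPandas_arrow_toggle(delayed_df)
    self.assertPandasEqual(pdf, pdf_arrow)
```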

BryanCutler (Member, Author) commented:
@holdenk , I updated the tests, please take another look when you get a chance. Thanks!

Contributor replied:
I like the new tests; I think a 0.1s delay on one of the partitions is enough.


+
+        for case in cases:
+            run_test(num_records=case[0], num_parts=case[1], max_records=case[2])


     @unittest.skipIf(
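
For readers of the diff: `_toPandas_arrow_toggle` and `assertPandasEqual` are helpers defined elsewhere in tests.py, outside this hunk. A rough sketch of what the toggle helper presumably does, based on its name and how it is called above (an assumption, not the actual body): collect the same DataFrame once with Arrow disabled and once with Arrow enabled, so the test can compare the two results.

```python
def _toPandas_arrow_toggle(self, df):
    # Sketch only: assumes Arrow execution is enabled by the test class setup.
    with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
        pdf = df.toPandas()       # plain (non-Arrow) collection
    pdf_arrow = df.toPandas()     # Arrow-based collection
    return pdf, pdf_arrow
```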