Update the Python DataFrame show() doctests and add a unit test for showing an empty DataFrame.

apache · kaka1992 · May 1, 2015 · May 2, 2015 · May 2, 2015 · May 2, 2015
commit 84aec3ec56f9327a2fe08a053568f56fbeffa15a
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
@@ -275,9 +275,12 @@ def show(self, n=20):
         >>> df
         DataFrame[age: int, name: string]
         >>> df.show()
-        age name
-        2   Alice
-        5   Bob
+        +---+-----+
+        |age| name|
+        +---+-----+
+        |  2|Alice|
+        |  5|  Bob|
+        +---+-----+
         """
         print(self._jdf.showString(n))
 
@@ -591,12 +594,15 @@ def describe(self, *cols):
         given, this function computes statistics for all numerical columns.
 
         >>> df.describe().show()
-        summary age
-        count   2
-        mean    3.5
-        stddev  1.5
-        min     2
-        max     5
+        +-------+---+
+        |summary|age|
+        +-------+---+
+        |  count|  2|
+        |   mean|3.5|
+        | stddev|1.5|
+        |    min|  2|
+        |    max|  5|
+        +-------+---+
         """
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)
@@ -801,12 +807,18 @@ def dropna(self, how='any', thresh=None, subset=None):
         :param subset: optional list of column names to consider.
 
         >>> df4.dropna().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
 
         >>> df4.na.drop().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -837,25 +849,34 @@ def fillna(self, value, subset=None):
             then the non-string column is simply ignored.
 
         >>> df4.fillna(50).show()
-        age height name
-        10  80     Alice
-        5   50     Bob
-        50  50     Tom
-        50  50     null
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        |  5|    50|  Bob|
+        | 50|    50|  Tom|
+        | 50|    50| null|
+        +---+------+-----+
 
         >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
 
         >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1220,11 +1241,17 @@ def getItem(self, key):
 
         >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
         >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         >>> df.select(df.l[0], df.d["key"]).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         """
         return self[key]
 
@@ -1234,11 +1261,17 @@ def getField(self, name):
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
         >>> df.select(df.r.getField("b")).show()
-        r.b
-        b
+        +---+
+        |r.b|
+        +---+
+        |  b|
+        +---+
         >>> df.select(df.r.a).show()
-        r.a
-        1
+        +---+
+        |r.a|
+        +---+
+        |  1|
+        +---+
         """
         return Column(self._jc.getField(name))
 

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -599,8 +599,20 @@ class DataFrameSuite extends QueryTest {
   }
 
   test("SPARK-7319 showString") {
-    assert(testData.select($"*").showString(1).split("\n") === Seq("+---+-----+",
-      "|key|value|", "+---+-----+", "|  1|    1|", "+---+-----+"))
+    assert(testData.select($"*").showString(1) === """+---+-----+
+                                                     ||key|value|
+                                                     |+---+-----+
+                                                     ||  1|    1|
+                                                     |+---+-----+
+                                                     |""".stripMargin)
+  }
+
+  test("SPARK-7327 show with empty dataFrame") {
+    assert(testData.select($"*").filter($"key" < 0).showString(1) === """+---+-----+
+                                                                       ||key|value|
+                                                                       |+---+-----+
+                                                                       |+---+-----+
+                                                                       |""".stripMargin)
   }
 
   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {