105 changes: 69 additions & 36 deletions python/pyspark/sql/dataframe.py
@@ -275,9 +275,12 @@ def show(self, n=20):
         >>> df
         DataFrame[age: int, name: string]
         >>> df.show()
-        age name
-        2   Alice
-        5   Bob
+        +---+-----+
+        |age| name|
+        +---+-----+
+        |  2|Alice|
+        |  5|  Bob|
+        +---+-----+
         """
         print(self._jdf.showString(n))
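
For reference, here is the same new output seen from the Scala API. This is a minimal sketch, assuming a Spark 1.x shell where sqlContext and its implicits are in scope; it is not part of the patch.

    import sqlContext.implicits._

    val df = Seq((2, "Alice"), (5, "Bob")).toDF("age", "name")
    df.show()
    // +---+-----+
    // |age| name|
    // +---+-----+
    // |  2|Alice|
    // |  5|  Bob|
    // +---+-----+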

@@ -591,12 +594,15 @@ def describe(self, *cols):
         given, this function computes statistics for all numerical columns.

         >>> df.describe().show()
-        summary age
-        count   2
-        mean    3.5
-        stddev  1.5
-        min     2
-        max     5
+        +-------+---+
+        |summary|age|
+        +-------+---+
+        |  count|  2|
+        |   mean|3.5|
+        | stddev|1.5|
+        |    min|  2|
+        |    max|  5|
+        +-------+---+
         """
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)
@@ -801,12 +807,18 @@ def dropna(self, how='any', thresh=None, subset=None):
         :param subset: optional list of column names to consider.

         >>> df4.dropna().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+

         >>> df4.na.drop().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -837,25 +849,34 @@ def fillna(self, value, subset=None):
         then the non-string column is simply ignored.

         >>> df4.fillna(50).show()
-        age height name
-        10  80     Alice
-        5   50     Bob
-        50  50     Tom
-        50  50     null
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        |  5|    50|  Bob|
+        | 50|    50|  Tom|
+        | 50|    50| null|
+        +---+------+-----+

         >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+

         >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1220,11 +1241,17 @@ def getItem(self, key):

         >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
         >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         >>> df.select(df.l[0], df.d["key"]).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         """
         return self[key]

@@ -1234,11 +1261,17 @@ def getField(self, name):
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
         >>> df.select(df.r.getField("b")).show()
-        r.b
-        b
+        +---+
+        |r.b|
+        +---+
+        |  b|
+        +---+
         >>> df.select(df.r.a).show()
-        r.a
-        1
+        +---+
+        |r.a|
+        +---+
+        |  1|
+        +---+
         """
         return Column(self._jc.getField(name))

28 changes: 22 additions & 6 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql
 import java.io.CharArrayWriter
 import java.sql.DriverManager

+
 import scala.collection.JavaConversions._
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
@@ -28,6 +29,7 @@ import scala.util.control.NonFatal

 import com.fasterxml.jackson.core.JsonFactory

+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.SerDeUtil
@@ -175,6 +177,7 @@ class DataFrame private[sql](
    * @param numRows Number of rows to show
    */
   private[sql] def showString(numRows: Int): String = {
+    val sb = new StringBuilder
[Review comment from a Contributor, on the line above]: it would be great to create unit test for showString in DataFrameSuite.

     val data = take(numRows)
     val numCols = schema.fieldNames.length

@@ -194,12 +197,25 @@
       }
     }

-    // Pad the cells
-    rows.map { row =>
-      row.zipWithIndex.map { case (cell, i) =>
-        String.format(s"%-${colWidths(i)}s", cell)
-      }.mkString(" ")
-    }.mkString("\n")
+    // Create SeparateLine
+    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+    // column names
+    rows.head.zipWithIndex.map { case (cell, i) =>
+      StringUtils.leftPad(cell.toString, colWidths(i))
+    }.addString(sb, "|", "|", "|\n")
+
+    sb.append(sep)
+
+    // data
+    rows.tail.map {
+      _.zipWithIndex.map { case (cell, i) =>
+        StringUtils.leftPad(cell.toString, colWidths(i))
+      }.addString(sb, "|", "|", "|\n")
+    }
+
+    sb.append(sep)
+    sb.toString()
   }

   override def toString: String = {
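To make the new formatting logic easier to follow outside the diff, here is a self-contained sketch of the same approach: each column is as wide as its widest cell, a "+---+"-style line separates header from body, and cells are right-aligned with StringUtils.leftPad, as showString now does. The names ShowStringSketch and renderTable are illustrative only, not part of the patch, and mkString is used where the patch threads an explicit StringBuilder through addString; the output is the same.

    import org.apache.commons.lang3.StringUtils

    object ShowStringSketch {
      // Render rows (first row = header) in the grid format the patch produces.
      def renderTable(rows: Seq[Seq[String]]): String = {
        val numCols = rows.head.length
        // Each column is as wide as its widest cell.
        val colWidths = (0 until numCols).map(i => rows.map(_(i).length).max)
        // Separator line, e.g. "+---+-----+".
        val sep = colWidths.map("-" * _).mkString("+", "+", "+\n")
        val sb = new StringBuilder(sep)
        // Header row, right-aligned like the data cells.
        sb.append(rows.head.zipWithIndex.map { case (cell, i) =>
          StringUtils.leftPad(cell, colWidths(i))
        }.mkString("|", "|", "|\n"))
        sb.append(sep)
        // Data rows.
        rows.tail.foreach { row =>
          sb.append(row.zipWithIndex.map { case (cell, i) =>
            StringUtils.leftPad(cell, colWidths(i))
          }.mkString("|", "|", "|\n"))
        }
        sb.append(sep)
        sb.toString()
      }

      def main(args: Array[String]): Unit = {
        // Prints the same table as the df.show() doctest above.
        print(renderTable(Seq(Seq("age", "name"), Seq("2", "Alice"), Seq("5", "Bob"))))
      }
    }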
17 changes: 17 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -598,6 +598,23 @@ class DataFrameSuite extends QueryTest {
     testData.select($"*").show(1000)
   }

+  test("SPARK-7319 showString") {
+    assert(testData.select($"*").showString(1) === """+---+-----+
+                                                     ||key|value|
+                                                     |+---+-----+
+                                                     ||  1|    1|
+                                                     |+---+-----+
+                                                     |""".stripMargin)
+  }
+
+  test("SPARK-7327 show with empty dataFrame") {
+    assert(testData.select($"*").filter($"key" < 0).showString(1) === """+---+-----+
+                                                                        ||key|value|
+                                                                        |+---+-----+
+                                                                        |+---+-----+
+                                                                        |""".stripMargin)
+  }
+
   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
     val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
     val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))

[Review comment from a Contributor, on the SPARK-7319 expected string]: it's pretty awkward to have this at the end. I'd just create a variable for the expected string so we don't have so much whitespace.
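One way to apply that suggestion, sketched here under the same test fixtures (this exact refactor is not part of the diff above):

    test("SPARK-7319 showString") {
      val expectedAnswer = """+---+-----+
                             ||key|value|
                             |+---+-----+
                             ||  1|    1|
                             |+---+-----+
                             |""".stripMargin
      assert(testData.select($"*").showString(1) === expectedAnswer)
    }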