2 changes: 1 addition & 1 deletion R/pkg/R/DataFrame.R
@@ -167,7 +167,7 @@ setMethod("isLocal",
setMethod("showDF",
signature(x = "DataFrame"),
function(x, numRows = 20) {
- cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
+ callJMethod(x@sdf, "showString", numToInt(numRows))
})

#' show
2 changes: 1 addition & 1 deletion R/pkg/inst/tests/test_sparkSQL.R
@@ -641,7 +641,7 @@ test_that("toJSON() returns an RDD of the correct values", {

test_that("showDF()", {
df <- jsonFile(sqlCtx, jsonPath)
expect_output(showDF(df), "age name \nnull Michael\n30 Andy \n19 Justin ")
expect_output(showDF(df), "+----+-------+\n| age| name|\n+----+-------+\n|null|Michael|\n| 30| Andy|\n| 19| Justin|\n+----+-------+\n")
})

test_that("isLocal()", {
105 changes: 69 additions & 36 deletions python/pyspark/sql/dataframe.py
@@ -275,9 +275,12 @@ def show(self, n=20):
>>> df
DataFrame[age: int, name: string]
>>> df.show()
- age name
- 2  Alice
- 5  Bob
+ +---+-----+
+ |age| name|
+ +---+-----+
+ |  2|Alice|
+ |  5|  Bob|
+ +---+-----+
"""
print(self._jdf.showString(n))

@@ -591,12 +594,15 @@ def describe(self, *cols):
given, this function computes statistics for all numerical columns.

>>> df.describe().show()
- summary age
- count   2
- mean    3.5
- stddev  1.5
- min     2
- max     5
+ +-------+---+
+ |summary|age|
+ +-------+---+
+ |  count|  2|
+ |   mean|3.5|
+ | stddev|1.5|
+ |    min|  2|
+ |    max|  5|
+ +-------+---+
"""
jdf = self._jdf.describe(self._jseq(cols))
return DataFrame(jdf, self.sql_ctx)
@@ -801,12 +807,18 @@ def dropna(self, how='any', thresh=None, subset=None):
:param subset: optional list of column names to consider.

>>> df4.dropna().show()
- age height name
- 10  80     Alice
+ +---+------+-----+
+ |age|height| name|
+ +---+------+-----+
+ | 10|    80|Alice|
+ +---+------+-----+

>>> df4.na.drop().show()
- age height name
- 10  80     Alice
+ +---+------+-----+
+ |age|height| name|
+ +---+------+-----+
+ | 10|    80|Alice|
+ +---+------+-----+
"""
if how is not None and how not in ['any', 'all']:
raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -837,25 +849,34 @@ def fillna(self, value, subset=None):
then the non-string column is simply ignored.

>>> df4.fillna(50).show()
- age height name
- 10  80     Alice
- 5   50     Bob
- 50  50     Tom
- 50  50     null
+ +---+------+-----+
+ |age|height| name|
+ +---+------+-----+
+ | 10|    80|Alice|
+ |  5|    50|  Bob|
+ | 50|    50|  Tom|
+ | 50|    50| null|
+ +---+------+-----+

>>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
- age height name
- 10  80     Alice
- 5   null   Bob
- 50  null   Tom
- 50  null   unknown
+ +---+------+-------+
+ |age|height|   name|
+ +---+------+-------+
+ | 10|    80|  Alice|
+ |  5|  null|    Bob|
+ | 50|  null|    Tom|
+ | 50|  null|unknown|
+ +---+------+-------+

>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
- age height name
- 10  80     Alice
- 5   null   Bob
- 50  null   Tom
- 50  null   unknown
+ +---+------+-------+
+ |age|height|   name|
+ +---+------+-------+
+ | 10|    80|  Alice|
+ |  5|  null|    Bob|
+ | 50|  null|    Tom|
+ | 50|  null|unknown|
+ +---+------+-------+
"""
if not isinstance(value, (float, int, long, basestring, dict)):
raise ValueError("value should be a float, int, long, string, or dict")
@@ -1220,11 +1241,17 @@ def getItem(self, key):

>>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
>>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
- l[0] d[key]
- 1    value
+ +----+------+
+ |l[0]|d[key]|
+ +----+------+
+ |   1| value|
+ +----+------+
>>> df.select(df.l[0], df.d["key"]).show()
- l[0] d[key]
- 1    value
+ +----+------+
+ |l[0]|d[key]|
+ +----+------+
+ |   1| value|
+ +----+------+
"""
return self[key]

@@ -1234,11 +1261,17 @@ def getField(self, name):
>>> from pyspark.sql import Row
>>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
>>> df.select(df.r.getField("b")).show()
- r.b
- b
+ +---+
+ |r.b|
+ +---+
+ |  b|
+ +---+
>>> df.select(df.r.a).show()
- r.a
- 1
+ +---+
+ |r.a|
+ +---+
+ |  1|
+ +---+
"""
return Column(self._jc.getField(name))

28 changes: 22 additions & 6 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql
import java.io.CharArrayWriter
import java.sql.DriverManager


import scala.collection.JavaConversions._
import scala.language.implicitConversions
import scala.reflect.ClassTag
@@ -28,6 +29,7 @@ import scala.util.control.NonFatal

import com.fasterxml.jackson.core.JsonFactory

+ import org.apache.commons.lang3.StringUtils
import org.apache.spark.annotation.{DeveloperApi, Experimental}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.python.SerDeUtil
@@ -175,6 +177,7 @@ class DataFrame private[sql](
* @param numRows Number of rows to show
*/
private[sql] def showString(numRows: Int): String = {
+ val sb = new StringBuilder
[Review comment from a contributor: it would be great to create a unit test for showString in DataFrameSuite.]

val data = take(numRows)
val numCols = schema.fieldNames.length

Expand All @@ -194,12 +197,25 @@ class DataFrame private[sql](
}
}

- // Pad the cells
- rows.map { row =>
-   row.zipWithIndex.map { case (cell, i) =>
-     String.format(s"%-${colWidths(i)}s", cell)
-   }.mkString(" ")
- }.mkString("\n")
+ // Separator line, e.g. "+---+-----+"
+ val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+ // Column names
+ rows.head.zipWithIndex.map { case (cell, i) =>
+   StringUtils.leftPad(cell.toString, colWidths(i))
+ }.addString(sb, "|", "|", "|\n")
+
+ sb.append(sep)
+
+ // Data rows
+ rows.tail.map {
+   _.zipWithIndex.map { case (cell, i) =>
+     StringUtils.leftPad(cell.toString, colWidths(i))
+   }.addString(sb, "|", "|", "|\n")
+ }
+
+ sb.append(sep)
+ sb.toString()
}

override def toString: String = {
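A minimal, self-contained sketch of the rendering approach the new showString takes, for readers skimming the diff: compute each column's width from its widest cell, build a "+---+"-style separator from those widths, and right-align every cell. The names below (TableRenderSketch, renderTable, leftPad) are illustrative only, not Spark API, and a hand-rolled leftPad stands in for org.apache.commons.lang3.StringUtils.leftPad so the sketch needs no external dependency.

object TableRenderSketch {

  // Stand-in for org.apache.commons.lang3.StringUtils.leftPad.
  private def leftPad(s: String, width: Int): String =
    " " * (width - s.length) + s

  // Renders a header row plus data rows (all pre-stringified) as a framed table.
  def renderTable(rows: Seq[Seq[String]]): String = {
    val sb = new StringBuilder
    val numCols = rows.head.length

    // Each column is as wide as its widest cell, header included.
    val colWidths = (0 until numCols).map(i => rows.map(_(i).length).max)

    // Separator line, e.g. "+---+-----+".
    val sep = colWidths.map("-" * _).mkString("+", "+", "+\n")

    sb.append(sep)
    // Header row, right-aligned like the data cells.
    rows.head.zipWithIndex
      .map { case (cell, i) => leftPad(cell, colWidths(i)) }
      .addString(sb, "|", "|", "|\n")
    sb.append(sep)

    // Data rows; foreach makes the side effect on sb explicit, where the
    // patch above uses map and discards the result.
    rows.tail.foreach { row =>
      row.zipWithIndex
        .map { case (cell, i) => leftPad(cell, colWidths(i)) }
        .addString(sb, "|", "|", "|\n")
    }
    sb.append(sep)
    sb.toString()
  }

  def main(args: Array[String]): Unit =
    print(renderTable(Seq(Seq("age", "name"), Seq("2", "Alice"), Seq("5", "Bob"))))
}

On the sample rows in main, this prints the same framed table shown in the updated show() doctest above.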
19 changes: 19 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -598,6 +598,25 @@ class DataFrameSuite extends QueryTest {
testData.select($"*").show(1000)
}

test("SPARK-7319 showString") {
val expectedAnswer = """+---+-----+
||key|value|
|+---+-----+
|| 1| 1|
|+---+-----+
|""".stripMargin
assert(testData.select($"*").showString(1) === expectedAnswer)
}

test("SPARK-7327 show with empty dataFrame") {
val expectedAnswer = """+---+-----+
||key|value|
|+---+-----+
|+---+-----+
|""".stripMargin
assert(testData.select($"*").filter($"key" < 0).showString(1) === expectedAnswer)
}

test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))
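As a cross-check of the SPARK-7327 expectation above, the illustrative TableRenderSketch.renderTable from the earlier sketch renders a header-only input (no data rows) the same way: the header framed by separators, with the two bottom separators back to back.

val expected =
  """+---+-----+
    ||key|value|
    |+---+-----+
    |+---+-----+
    |""".stripMargin
assert(TableRenderSketch.renderTable(Seq(Seq("key", "value"))) == expected)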