Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
446ae98
[WIP] filter out empty/whitespace JSON lines when skipping parsing
Jan 21, 2019
1544771
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 21, 2019
236227f
remove println/dumpStack
Jan 21, 2019
e4d9052
add test for non-parsed JSON count
Jan 21, 2019
105e5bb
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 22, 2019
e8e3189
Merge branch 'json_emptyline_count_test' of github.com:sumitsu/spark …
Jan 22, 2019
13942b8
fix scala import style errors
Jan 22, 2019
5f173d9
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 22, 2019
7a51764
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 22, 2019
91305ee
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 23, 2019
051d84a
push down non-parsed json record filter into FailureSafeParser
Jan 23, 2019
57d2c05
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 24, 2019
4fffe7f
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 25, 2019
2252045
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 26, 2019
532a83d
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 27, 2019
3cae4da
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 27, 2019
cd2f30c
Merge branch 'master' of github.com:apache/spark into json_emptyline_…
Jan 27, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add test for non-parsed JSON count
  • Loading branch information
Branden Smith committed Jan 21, 2019
commit e4d9052a4530b0a57b32ad141e606a834f518694
7 changes: 7 additions & 0 deletions sql/core/src/test/resources/test-data/with-empty-line.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{ "a" : 1 , "b" : 2 , "c" : 3 }

{ "a" : 4 , "b" : 5 , "c" : 6 }

{ "a" : 7 , "b" : 8 , "c" : 9 }


Original file line number Diff line number Diff line change
Expand Up @@ -2426,6 +2426,23 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
countForMalformedJSON(0, Seq(""))
}

test("count() for non-multiline input with empty lines") {
val withEmptyLineData = Array(Map("a" -> 1, "b" -> 2, "c" -> 3),
Map("a" -> 4, "b" -> 5, "c" -> 6),
Map("a" -> 7, "b" -> 8, "c" -> 9))
val df = spark.read.json("src/test/resources/test-data/with-empty-line.json")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use `testFile` to resolve the resource here instead of the hard-coded relative path.

// important to do this .count() first, prior to caching/persisting/computing/collecting, to
// test the non-parsed-count pathway
assert(df.count() === withEmptyLineData.length,
"JSON DataFrame unparsed-count should exclude whitespace-only lines")
// cache and collect to check that count stays stable under those operations
df.cache()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need this `cache()` call.

assert(df.count() === withEmptyLineData.length,
"JSON DataFrame parsed-count should exclude whitespace-only lines")
val collected = df.collect().map(_.getValuesMap(Seq("a", "b", "c")))
assert(collected === withEmptyLineData)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please take a look at `checkAnswer` and use it for this comparison.

}

test("SPARK-25040: empty strings should be disallowed") {
def failedOnEmptyString(dataType: DataType): Unit = {
val df = spark.read.schema(s"a ${dataType.catalogString}")
Expand Down