-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-23094][SPARK-23723][SPARK-23724][SQL] Support custom encoding for json files #20937
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
b2e92b4
cb2f27b
0d45fd3
1fb9b32
c3b04ee
93d3879
15798a1
cc05ce9
74f2026
4856b8e
084f41f
31cd793
6eacd18
3b4a509
cd1124e
ebf5390
c5b6a35
ef5e6c6
f9b6ad1
3b7714c
edb9167
5ba2881
1509e10
e3184b3
87d259c
76c1d08
88395b5
f2f8ae7
b451a03
c13c159
1cb3ac0
108e8e7
0d20cc6
54baf9f
1d50d94
bb53798
961b482
a794988
dccdaa2
d0abab7
6741796
e4faae1
01f4ef5
24cedb9
d40dda2
ad6496c
358863d
7e5be5e
d138d2d
c26ef5d
5f0b069
ef8248f
2efac08
b2020fa
f99c1e1
6d13d00
77112ef
d632706
bbff402
3af996b
8253811
ab8210c
7c6f115
f553b07
d6a07a1
cb12ea3
eb2965b
7a4865c
dbeb0c1
ac67020
d96b720
75f7bb6
d93dcdc
65b4b73
6b52419
6116bac
5383400
1aeae3c
7e20891
0d3ed3c
5d5c295
e7be77d
6bd841a
6a62679
3b30ce0
fcd0a21
af71324
76dbbed
3207e59
b817184
15df9af
36253f4
aa69559
c35d5d1
58fc5c6
63b5894
1ace082
6c0df03
b4c0d38
f2a259f
482b799
a0ab98b
a7be182
e0cebf4
d3d28aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2283,9 +2283,8 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { | |
| } | ||
| } | ||
|
|
||
| def checkReadJson(lineSep: String, encodingOption: String, encoding: String, | ||
| inferSchema: Boolean, runId: Int): Unit = { | ||
| test(s"SPARK-23724: checks reading json in ${encoding} #${runId}") { | ||
| def checkReadJson(lineSep: String, encoding: String, inferSchema: Boolean, id: Int): Unit = { | ||
| test(s"SPARK-23724: checks reading json in ${encoding} #${id}") { | ||
| val lineSepInBytes = { | ||
| if (lineSep.startsWith("x")) { | ||
|
||
| lineSep.replaceAll("[^0-9A-Fa-f]", "") | ||
|
|
@@ -2309,7 +2308,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { | |
| spark.read.schema(schema) | ||
| } | ||
| val readBack = reader | ||
| .option(encodingOption, encoding) | ||
| .option("encoding", encoding) | ||
| .option("lineSep", lineSep) | ||
| .json(path.getCanonicalPath) | ||
| checkAnswer(readBack, records.map(rec => Row(rec._1, rec._2))) | ||
|
|
@@ -2319,21 +2318,21 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { | |
|
|
||
| // scalastyle:off nonascii | ||
| List( | ||
| ("|", "encoding", "UTF-8", false), | ||
| ("^", "charset", "UTF-16BE", true), | ||
| ("::", "encoding", "ISO-8859-1", true), | ||
| ("!!!@3", "encoding", "UTF-32LE", false), | ||
| (0x1E.toChar.toString, "charset", "UTF-8", true), | ||
| ("아", "encoding", "UTF-32BE", false), | ||
| ("куку", "encoding", "CP1251", true), | ||
| ("sep", "encoding", "utf-8", false), | ||
| ("\r\n", "encoding", "UTF-16LE", false), | ||
| ("\r\n", "encoding", "utf-16be", true), | ||
| ("\u000d\u000a", "encoding", "UTF-32BE", false), | ||
| ("\u000a\u000d", "encoding", "UTF-8", true), | ||
| ("===", "encoding", "US-ASCII", false), | ||
| ("$^+", "encoding", "utf-32le", true) | ||
| ).zipWithIndex.foreach{case ((d, o, c, s), i) => checkReadJson(d, o, c, s, i)} | ||
| (0, "|", "UTF-8", false), | ||
| (1, "^", "UTF-16BE", true), | ||
| (2, "::", "ISO-8859-1", true), | ||
| (3, "!!!@3", "UTF-32LE", false), | ||
| (4, 0x1E.toChar.toString, "UTF-8", true), | ||
| (5, "아", "UTF-32BE", false), | ||
| (6, "куку", "CP1251", true), | ||
| (7, "sep", "utf-8", false), | ||
| (8, "\r\n", "UTF-16LE", false), | ||
| (9, "\r\n", "utf-16be", true), | ||
| (10, "\u000d\u000a", "UTF-32BE", false), | ||
| (11, "\u000a\u000d", "UTF-8", true), | ||
| (12, "===", "US-ASCII", false), | ||
| (13, "$^+", "utf-32le", true) | ||
| ).foreach{case (i, d, c, s) => checkReadJson(d, c, s, i)} | ||
|
||
| // scalastyle:on nonascii | ||
|
|
||
| test("SPARK-23724: lineSep should be set if encoding if different from UTF-8") { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.