Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
b2e92b4
Test for reading json in UTF-16 with BOM
MaxGekk Feb 11, 2018
cb2f27b
Use user's charset or autodetect it if the charset is not specified
MaxGekk Feb 11, 2018
0d45fd3
Added a type and a comment for charset
MaxGekk Feb 13, 2018
1fb9b32
Replacing the monadic chaining by matching because it is more readable
MaxGekk Feb 13, 2018
c3b04ee
Keeping the old method for backward compatibility
MaxGekk Feb 13, 2018
93d3879
testFile is moved into the test to make more local because it is used…
MaxGekk Feb 13, 2018
15798a1
Adding the charset as third parameter to the text method
MaxGekk Feb 13, 2018
cc05ce9
Removing whitespaces at the end of the line
MaxGekk Feb 13, 2018
74f2026
Fix the comment in javadoc style
MaxGekk Feb 13, 2018
4856b8e
Simplifying of the UTF-16 test
MaxGekk Feb 13, 2018
084f41f
A hint to the exception how to set the charset explicitly
MaxGekk Feb 15, 2018
31cd793
Fix for scala style checks
MaxGekk Feb 15, 2018
6eacd18
Run tests again
MaxGekk Feb 15, 2018
3b4a509
Improving of the exception message
MaxGekk Feb 15, 2018
cd1124e
Appended the original message to the exception
MaxGekk Feb 15, 2018
ebf5390
Multi-line reading of json file in utf-32
MaxGekk Feb 17, 2018
c5b6a35
Autodetect charset of jsons in the multiline mode
MaxGekk Feb 17, 2018
ef5e6c6
Test for reading a json in UTF-16LE in the multiline mode by using us…
MaxGekk Feb 17, 2018
f9b6ad1
Fix test: rename the test file - utf32be -> utf32BE
MaxGekk Feb 18, 2018
3b7714c
Fix code style
MaxGekk Feb 18, 2018
edb9167
Appending the create verb to the method for readability
MaxGekk Feb 18, 2018
5ba2881
Making the createParser as a separate private method
MaxGekk Feb 18, 2018
1509e10
Fix code style
MaxGekk Feb 18, 2018
e3184b3
Checks the charset option is supported
MaxGekk Feb 19, 2018
87d259c
Support charset as a parameter of the json method
MaxGekk Feb 19, 2018
76c1d08
Test for charset different from utf-8
MaxGekk Feb 19, 2018
88395b5
Description of the charset option of the json method
MaxGekk Feb 20, 2018
f2f8ae7
Minor changes in comments: added . at the end of a sentence
MaxGekk Feb 21, 2018
b451a03
Added a test for wrong charset name
MaxGekk Feb 21, 2018
c13c159
Testing that charset in any case is acceptable
MaxGekk Feb 21, 2018
1cb3ac0
Test: user specified wrong (but supported) charset
MaxGekk Feb 21, 2018
108e8e7
Set charset as an option
MaxGekk Feb 25, 2018
0d20cc6
Test: saving to json in UTF-32BE
MaxGekk Feb 23, 2018
54baf9f
Taking user's charset for saved json
MaxGekk Feb 23, 2018
1d50d94
Test: output charset is UTF-8 by default
MaxGekk Feb 23, 2018
bb53798
Changing the readJsonFiles method for readability
MaxGekk Mar 4, 2018
961b482
The test checks that json written by Spark can be read back
MaxGekk Mar 4, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use user's charset or autodetect it if the charset is not specified
  • Loading branch information
MaxGekk committed Mar 17, 2018
commit cb2f27ba73cb5838e2910c31ca204100bb4eebca
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,13 @@ private[sql] object CreateJacksonParser extends Serializable {
jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
}

def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
jsonFactory.createParser(record.getBytes, 0, record.getLength)
/**
 * Creates a Jackson parser over the content of a `Text` record.
 *
 * @param charset name of the charset used to decode the record; when it is `None`
 *                the record's bytes are passed to the factory without a reader
 * @param jsonFactory factory the parser is created from
 * @param record record whose first `getLength` bytes are parsed
 * @return a parser positioned at the start of the record
 */
def text(charset: Option[String])(jsonFactory: JsonFactory, record: Text): JsonParser = {
  charset match {
    case Some(charsetName) =>
      // Decode explicitly with the requested charset through a character reader.
      val byteStream = new ByteArrayInputStream(record.getBytes, 0, record.getLength)
      val reader = new InputStreamReader(byteStream, charsetName)
      jsonFactory.createParser(reader)
    case None =>
      // No charset given: hand the raw byte range to the factory.
      jsonFactory.createParser(record.getBytes, 0, record.getLength)
  }
}

def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ private[sql] class JSONOptions(

val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)

/**
 * Standard charset name, for example UTF-8, UTF-16 and UTF-32, looked up under the
 * "charset" key of `parameters`. The value is optional: nothing is substituted here
 * when the key is missing.
 */
val charset = parameters.get("charset")

/** Sets config options on a Jackson [[JsonFactory]]. */
def setJacksonOptions(factory: JsonFactory): Unit = {
factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,10 @@ object TextInputJsonDataSource extends JsonDataSource {
schema: StructType): Iterator[InternalRow] = {
val linesReader = new HadoopFileLinesReader(file, conf)
Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => linesReader.close()))
val charset = parser.options.charset

val safeParser = new FailureSafeParser[Text](
input => parser.parse(input, CreateJacksonParser.text, textToUTF8String),
input => parser.parse(input, CreateJacksonParser.text(charset), textToUTF8String),
parser.options.parseMode,
schema,
parser.options.columnNameOfCorruptRecord)
Expand Down