Skip to content
Closed
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@
<jline.version>${scala.version}</jline.version>
<jline.groupid>org.scala-lang</jline.groupid>
<codehaus.jackson.version>1.9.13</codehaus.jackson.version>
<fasterxml.jackson.version>2.5.3</fasterxml.jackson.version>
<fasterxml.jackson.version>2.7.3</fasterxml.jackson.version>
<snappy.version>1.1.2.4</snappy.version>
<netlib.java.version>1.1.2</netlib.java.version>
<calcite.version>1.2.0-incubating</calcite.version>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,16 @@ private[sql] object InferSchema {
// record fields' types have been combined.
NullType

case VALUE_STRING => StringType
case VALUE_STRING =>
// If there is only one row, the following non-numeric numbers will be incorrectly
// recognized as StringType.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

E.g., the two tests in JsonParsingOptionsSuite.

val value = parser.getText
if (value.equals("NaN") ||
value.equals("Infinity") ||
value.equals("-Infinity")) {
return DoubleType
}
return StringType
case START_OBJECT =>
val builder = Array.newBuilder[StructField]
while (nextUntil(parser, END_OBJECT)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,9 @@ object JacksonParser extends Logging {
case (VALUE_STRING, FloatType) =>
// Special case handling for NaN and Infinity.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we removing the special handling for float types here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea, should revert it back. BTW, do we actually test "inf" and "-inf" before? Because "inf".toFloat is not legal.

val value = parser.getText
val lowerCaseValue = value.toLowerCase()
if (lowerCaseValue.equals("nan") ||
lowerCaseValue.equals("infinity") ||
lowerCaseValue.equals("-infinity") ||
lowerCaseValue.equals("inf") ||
lowerCaseValue.equals("-inf")) {
if (value.equals("NaN") ||
value.equals("Infinity") ||
value.equals("-Infinity")) {
value.toFloat
} else {
throw new SparkSQLJsonProcessingException(s"Cannot parse $value as FloatType.")
Expand All @@ -146,12 +143,9 @@ object JacksonParser extends Logging {
case (VALUE_STRING, DoubleType) =>
// Special case handling for NaN and Infinity.
val value = parser.getText
val lowerCaseValue = value.toLowerCase()
if (lowerCaseValue.equals("nan") ||
lowerCaseValue.equals("infinity") ||
lowerCaseValue.equals("-infinity") ||
lowerCaseValue.equals("inf") ||
lowerCaseValue.equals("-inf")) {
if (value.equals("NaN") ||
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"infinity".toDouble, "inf".toDouble are not legal. These non-numeric numbers are case-sensitive, both for Jackson and Scala.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should also allow INF, -INF here, to be consistent with the legal inputs of allowNonNumericNumbers.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok. added.

value.equals("Infinity") ||
value.equals("-Infinity")) {
value.toDouble
} else {
throw new SparkSQLJsonProcessingException(s"Cannot parse $value as DoubleType.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,23 +93,45 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
assert(df.first().getLong(0) == 18)
}

// The following two tests are not really working - need to look into Jackson's
// JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS.
ignore("allowNonNumericNumbers off") {
val str = """{"age": NaN}"""
val rdd = spark.sparkContext.parallelize(Seq(str))
val df = spark.read.json(rdd)

assert(df.schema.head.name == "_corrupt_record")
test("allowNonNumericNumbers off") {
// non-quoted non-numeric numbers don't work if allowNonNumericNumbers is off.
var testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
"""{"age": -Infinity}""", """{"age": +INF}""", """{"age": -INF}""")
testCases.foreach { str =>
val rdd = spark.sparkContext.parallelize(Seq(str))
val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd)

assert(df.schema.head.name == "_corrupt_record")
}

// quoted non-numeric numbers should still work even allowNonNumericNumbers is off.
testCases = Seq("""{"age": "NaN"}""", """{"age": "Infinity"}""", """{"age": "-Infinity"}""")
val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity)

testCases.zipWithIndex.foreach { case (str, idx) =>
val rdd = spark.sparkContext.parallelize(Seq(str))
val df = spark.read.option("allowNonNumericNumbers", "false").json(rdd)

assert(df.schema.head.name == "age")
assert(tests(idx)(df.first().getDouble(0)))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why it's double type? Shouldn't it be string if allowNonNumericNumbers is off?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from @rxin comment that we want to support quoted non-numeric numbers when allowNonNumericNumbers is off.

Copy link
Contributor

@cloud-fan cloud-fan May 18, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't make sense to me. What if users really want to use "NaN" as string?
cc @rxin

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then I shouldn't change InferSchem. Tests here are also need to add few doubles. I will update it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In other words, it should be number when the field is inferred as double/float type.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cloud-fan I updated it.

Non-quoted non-numeric numbers are parsed as double when the corresponding field is double/float type. This behavior is as same as before this patch.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did I say that anywhere?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea I don't think that's what I meant there.

}
}

ignore("allowNonNumericNumbers on") {
val str = """{"age": NaN}"""
val rdd = spark.sparkContext.parallelize(Seq(str))
val df = spark.read.option("allowNonNumericNumbers", "true").json(rdd)

assert(df.schema.head.name == "age")
assert(df.first().getDouble(0).isNaN)
test("allowNonNumericNumbers on") {
val testCases: Seq[String] = Seq("""{"age": NaN}""", """{"age": Infinity}""",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we still read them if they are quoted?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, so if we don't set JsonGenerator.Feature.QUOTE_NON_NUMERIC_NUMBERS to false, we can't read them normally.

"""{"age": -Infinity}""", """{"age": +INF}""", """{"age": -INF}""", """{"age": "NaN"}""",
"""{"age": "Infinity"}""", """{"age": "-Infinity"}""")
val tests: Seq[Double => Boolean] = Seq(_.isNaN, _.isPosInfinity, _.isNegInfinity,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides, I found that "Inf", "-Inf" seems not working even JsonGenerator.Feature.QUOTE_NON_NUMERIC_NUMBERS is enabled.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to upgrade jackson library version in order to support "INF" and "-INF" (case-sensitive).

_.isPosInfinity, _.isNegInfinity, _.isNaN, _.isPosInfinity, _.isNegInfinity,
_.isPosInfinity, _.isNegInfinity)

testCases.zipWithIndex.foreach { case (str, idx) =>
val rdd = spark.sparkContext.parallelize(Seq(str))
val df = spark.read.option("allowNonNumericNumbers", "true").json(rdd)

assert(df.schema.head.name == "age")
assert(tests(idx)(df.first().getDouble(0)))
}
}

test("allowBackslashEscapingAnyCharacter off") {
Expand Down