Skip to content

Commit 2be5f8d

Browse files
hvanhovell authored and JoshRosen committed
[SPARK-17263][SQL] Add hexadecimal literal parsing
## What changes were proposed in this pull request? This PR adds the ability to parse SQL (hexadecimal) binary literals (AKA bit strings). It follows the following syntax `X'[Hexadecimal Characters]+'`, for example: `X'01AB'` would create the following binary array: `0x01AB`. If an uneven number of hexadecimal characters is passed, then the upper 4 bits of the initial byte are kept empty, and the lower 4 bits are filled using the first character. For example `X'1C7'` would create the following binary array: `0x01C7`. Binary data (Array[Byte]) does not have proper `hashCode` and `equals` functions. This meant that comparing `Literal`s containing binary data was a pain. I have updated Literal.hashCode and Literal.equals to deal properly with binary data. ## How was this patch tested? Added tests to the `ExpressionParserSuite`, `SQLQueryTestSuite` and `ExpressionSQLBuilderSuite`. Author: Herman van Hovell <[email protected]> Closes apache#14832 from hvanhovell/SPARK-17263.
1 parent a0aac4b commit 2be5f8d

File tree

6 files changed

+93
-25
lines changed

6 files changed

+93
-25
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ package org.apache.spark.sql.catalyst.expressions
1919

2020
import java.nio.charset.StandardCharsets
2121
import java.sql.{Date, Timestamp}
22+
import java.util
2223
import java.util.Objects
24+
import javax.xml.bind.DatatypeConverter
2325

2426
import org.json4s.JsonAST._
2527

@@ -168,14 +170,29 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression with
168170
override def foldable: Boolean = true
169171
override def nullable: Boolean = value == null
170172

171-
override def toString: String = if (value != null) value.toString else "null"
173+
override def toString: String = value match {
174+
case null => "null"
175+
case binary: Array[Byte] => s"0x" + DatatypeConverter.printHexBinary(binary)
176+
case other => other.toString
177+
}
172178

173-
override def hashCode(): Int = 31 * (31 * Objects.hashCode(dataType)) + Objects.hashCode(value)
179+
override def hashCode(): Int = {
180+
val valueHashCode = value match {
181+
case null => 0
182+
case binary: Array[Byte] => util.Arrays.hashCode(binary)
183+
case other => other.hashCode()
184+
}
185+
31 * Objects.hashCode(dataType) + valueHashCode
186+
}
174187

175188
override def equals(other: Any): Boolean = other match {
189+
case o: Literal if !dataType.equals(o.dataType) => false
176190
case o: Literal =>
177-
dataType.equals(o.dataType) &&
178-
(value == null && null == o.value || value != null && value.equals(o.value))
191+
(value, o.value) match {
192+
case (null, null) => true
193+
case (a: Array[Byte], b: Array[Byte]) => util.Arrays.equals(a, b)
194+
case (a, b) => a != null && a.equals(b)
195+
}
179196
case _ => false
180197
}
181198

@@ -269,6 +286,7 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression with
269286
case (v: Decimal, t: DecimalType) => v + "BD"
270287
case (v: Int, DateType) => s"DATE '${DateTimeUtils.toJavaDate(v)}'"
271288
case (v: Long, TimestampType) => s"TIMESTAMP('${DateTimeUtils.toJavaTimestamp(v)}')"
289+
case (v: Array[Byte], BinaryType) => s"X'${DatatypeConverter.printHexBinary(v)}'"
272290
case _ => value.toString
273291
}
274292
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.spark.sql.catalyst.parser
1919

2020
import java.sql.{Date, Timestamp}
21+
import javax.xml.bind.DatatypeConverter
2122

2223
import scala.collection.JavaConverters._
2324
import scala.collection.mutable.ArrayBuffer
@@ -1215,19 +1216,27 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
12151216
* {{{
12161217
* [TYPE] '[VALUE]'
12171218
* }}}
1218-
* Currently Date and Timestamp typed literals are supported.
1219-
*
1220-
* TODO what the added value of this over casting?
1219+
* Currently Date, Timestamp and Binary typed literals are supported.
12211220
*/
12221221
override def visitTypeConstructor(ctx: TypeConstructorContext): Literal = withOrigin(ctx) {
12231222
val value = string(ctx.STRING)
1224-
ctx.identifier.getText.toUpperCase match {
1225-
case "DATE" =>
1226-
Literal(Date.valueOf(value))
1227-
case "TIMESTAMP" =>
1228-
Literal(Timestamp.valueOf(value))
1229-
case other =>
1230-
throw new ParseException(s"Literals of type '$other' are currently not supported.", ctx)
1223+
val valueType = ctx.identifier.getText.toUpperCase
1224+
try {
1225+
valueType match {
1226+
case "DATE" =>
1227+
Literal(Date.valueOf(value))
1228+
case "TIMESTAMP" =>
1229+
Literal(Timestamp.valueOf(value))
1230+
case "X" =>
1231+
val padding = if (value.length % 2 == 1) "0" else ""
1232+
Literal(DatatypeConverter.parseHexBinary(padding + value))
1233+
case other =>
1234+
throw new ParseException(s"Literals of type '$other' are currently not supported.", ctx)
1235+
}
1236+
} catch {
1237+
case e: IllegalArgumentException =>
1238+
val message = Option(e.getMessage).getOrElse(s"Exception parsing $valueType")
1239+
throw new ParseException(message, ctx)
12311240
}
12321241
}
12331242

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ExpressionParserSuite.scala

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -331,16 +331,17 @@ class ExpressionParserSuite extends PlanTest {
331331
test("type constructors") {
332332
// Dates.
333333
assertEqual("dAte '2016-03-11'", Literal(Date.valueOf("2016-03-11")))
334-
intercept[IllegalArgumentException] {
335-
parseExpression("DAtE 'mar 11 2016'")
336-
}
334+
intercept("DAtE 'mar 11 2016'")
337335

338336
// Timestamps.
339337
assertEqual("tImEstAmp '2016-03-11 20:54:00.000'",
340338
Literal(Timestamp.valueOf("2016-03-11 20:54:00.000")))
341-
intercept[IllegalArgumentException] {
342-
parseExpression("timestamP '2016-33-11 20:54:00.000'")
343-
}
339+
intercept("timestamP '2016-33-11 20:54:00.000'")
340+
341+
// Binary.
342+
assertEqual("X'A'", Literal(Array(0x0a).map(_.toByte)))
343+
assertEqual("x'A10C'", Literal(Array(0xa1, 0x0c).map(_.toByte)))
344+
intercept("x'A1OC'")
344345

345346
// Unsupported datatype.
346347
intercept("GEO '(10,-6)'", "Literals of type 'GEO' are currently not supported.")

sql/core/src/test/resources/sql-tests/inputs/literals.sql

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,10 @@ select 90912830918230182310293801923652346786BD, 123.0E-28BD, 123.08BD;
9696

9797
-- out of range big decimal
9898
select 1.20E-38BD;
99+
100+
-- hexadecimal binary literal
101+
select x'2379ACFe';
102+
103+
-- invalid hexadecimal binary literal
104+
select X'XuZ';
105+

sql/core/src/test/resources/sql-tests/results/literals.sql.out

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
-- Automatically generated by SQLQueryTestSuite
2-
-- Number of queries: 40
2+
-- Number of queries: 42
33

44

55
-- !query 0
@@ -289,8 +289,13 @@ select date 'mar 11 2016'
289289
-- !query 31 schema
290290
struct<>
291291
-- !query 31 output
292-
java.lang.IllegalArgumentException
293-
null
292+
org.apache.spark.sql.catalyst.parser.ParseException
293+
294+
Exception parsing DATE(line 1, pos 7)
295+
296+
== SQL ==
297+
select date 'mar 11 2016'
298+
-------^^^
294299

295300

296301
-- !query 32
@@ -306,8 +311,13 @@ select timestamp '2016-33-11 20:54:00.000'
306311
-- !query 33 schema
307312
struct<>
308313
-- !query 33 output
309-
java.lang.IllegalArgumentException
310-
Timestamp format must be yyyy-mm-dd hh:mm:ss[.fffffffff]
314+
org.apache.spark.sql.catalyst.parser.ParseException
315+
316+
Timestamp format must be yyyy-mm-dd hh:mm:ss[.fffffffff](line 1, pos 7)
317+
318+
== SQL ==
319+
select timestamp '2016-33-11 20:54:00.000'
320+
-------^^^
311321

312322

313323
-- !query 34
@@ -376,3 +386,25 @@ DecimalType can only support precision up to 38(line 1, pos 7)
376386
== SQL ==
377387
select 1.20E-38BD
378388
-------^^^
389+
390+
391+
-- !query 40
392+
select x'2379ACFe'
393+
-- !query 40 schema
394+
struct<X'2379ACFE':binary>
395+
-- !query 40 output
396+
#y��
397+
398+
399+
-- !query 41
400+
select X'XuZ'
401+
-- !query 41 schema
402+
struct<>
403+
-- !query 41 output
404+
org.apache.spark.sql.catalyst.parser.ParseException
405+
406+
contains illegal character for hexBinary: 0XuZ(line 1, pos 7)
407+
408+
== SQL ==
409+
select X'XuZ'
410+
-------^^^

sql/hive/src/test/scala/org/apache/spark/sql/catalyst/ExpressionSQLBuilderSuite.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class ExpressionSQLBuilderSuite extends SQLBuilderTest {
4040
checkSQL(Literal(Double.NegativeInfinity), "CAST('-Infinity' AS DOUBLE)")
4141
checkSQL(Literal(Double.NaN), "CAST('NaN' AS DOUBLE)")
4242
checkSQL(Literal(BigDecimal("10.0000000").underlying), "10.0000000BD")
43+
checkSQL(Literal(Array(0x01, 0xA3).map(_.toByte)), "X'01A3'")
4344
checkSQL(
4445
Literal(Timestamp.valueOf("2016-01-01 00:00:00")), "TIMESTAMP('2016-01-01 00:00:00.0')")
4546
// TODO tests for decimals

0 commit comments

Comments
 (0)