Skip to content

Commit b8f27ae

Browse files
MaxGekkcloud-fan
authored andcommitted
[SPARK-24543][SQL] Support any type as DDL string for from_json's schema
## What changes were proposed in this pull request? In the PR, I propose to support any DataType represented as DDL string for the from_json function. After the changes, it will be possible to specify `MapType` in SQL like: ```sql select from_json('{"a":1, "b":2}', 'map<string, int>') ``` and in Scala (similar in other languages) ```scala val in = Seq("""{"a": {"b": 1}}""").toDS() val schema = "map<string, map<string, int>>" val out = in.select(from_json($"value", schema, Map.empty[String, String])) ``` ## How was this patch tested? Added a couple sql tests and modified existing tests for Python and Scala. The former tests were modified because it is not imported for them in which format schema for `from_json` is provided. Author: Maxim Gekk <maxim.gekk@databricks.com> Closes #21550 from MaxGekk/from_json-ddl-schema.
1 parent d3eed8f commit b8f27ae

File tree

7 files changed

+38
-9
lines changed

7 files changed

+38
-9
lines changed

python/pyspark/sql/functions.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2168,8 +2168,7 @@ def from_json(col, schema, options={}):
21682168
[Row(json=Row(a=1))]
21692169
>>> df.select(from_json(df.value, "a INT").alias("json")).collect()
21702170
[Row(json=Row(a=1))]
2171-
>>> schema = MapType(StringType(), IntegerType())
2172-
>>> df.select(from_json(df.value, schema).alias("json")).collect()
2171+
>>> df.select(from_json(df.value, "MAP<STRING,INT>").alias("json")).collect()
21732172
[Row(json={u'a': 1})]
21742173
>>> data = [(1, '''[{"a": 1}]''')]
21752174
>>> schema = ArrayType(StructType([StructField("a", IntegerType())]))

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.InternalRow
2828
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
2929
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
3030
import org.apache.spark.sql.catalyst.json._
31-
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
3231
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, BadRecordException, FailFastMode, GenericArrayData, MapData}
3332
import org.apache.spark.sql.internal.SQLConf
3433
import org.apache.spark.sql.types._
@@ -747,8 +746,8 @@ case class StructsToJson(
747746

748747
object JsonExprUtils {
749748

750-
def validateSchemaLiteral(exp: Expression): StructType = exp match {
751-
case Literal(s, StringType) => CatalystSqlParser.parseTableSchema(s.toString)
749+
def validateSchemaLiteral(exp: Expression): DataType = exp match {
750+
case Literal(s, StringType) => DataType.fromDDL(s.toString)
752751
case e => throw new AnalysisException(s"Expected a string literal instead of $e")
753752
}
754753

sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,16 @@ package org.apache.spark.sql.types
1919

2020
import java.util.Locale
2121

22+
import scala.util.control.NonFatal
23+
2224
import org.json4s._
2325
import org.json4s.JsonAST.JValue
2426
import org.json4s.JsonDSL._
2527
import org.json4s.jackson.JsonMethods._
2628

2729
import org.apache.spark.annotation.InterfaceStability
2830
import org.apache.spark.sql.catalyst.expressions.Expression
31+
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
2932
import org.apache.spark.sql.internal.SQLConf
3033
import org.apache.spark.util.Utils
3134

@@ -110,6 +113,14 @@ abstract class DataType extends AbstractDataType {
110113
@InterfaceStability.Stable
111114
object DataType {
112115

116+
def fromDDL(ddl: String): DataType = {
117+
try {
118+
CatalystSqlParser.parseDataType(ddl)
119+
} catch {
120+
case NonFatal(_) => CatalystSqlParser.parseTableSchema(ddl)
121+
}
122+
}
123+
113124
def fromJson(json: String): DataType = parseDataType(parse(json))
114125

115126
private val nonDecimalNameToType = {

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3369,7 +3369,7 @@ object functions {
33693369
val dataType = try {
33703370
DataType.fromJson(schema)
33713371
} catch {
3372-
case NonFatal(_) => StructType.fromDDL(schema)
3372+
case NonFatal(_) => DataType.fromDDL(schema)
33733373
}
33743374
from_json(e, dataType, options)
33753375
}

sql/core/src/test/resources/sql-tests/inputs/json-functions.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,7 @@ CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1,
3131
SELECT json_tuple(jsonField, 'b', CAST(NULL AS STRING), a) FROM jsonTable;
3232
-- Clean up
3333
DROP VIEW IF EXISTS jsonTable;
34+
35+
-- from_json - complex types
36+
select from_json('{"a":1, "b":2}', 'map<string, int>');
37+
select from_json('{"a":1, "b":"2"}', 'struct<a:int,b:string>');

sql/core/src/test/resources/sql-tests/results/json-functions.sql.out

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
-- Automatically generated by SQLQueryTestSuite
2-
-- Number of queries: 26
2+
-- Number of queries: 28
33

44

55
-- !query 0
@@ -258,3 +258,19 @@ DROP VIEW IF EXISTS jsonTable
258258
struct<>
259259
-- !query 25 output
260260

261+
262+
263+
-- !query 26
264+
select from_json('{"a":1, "b":2}', 'map<string, int>')
265+
-- !query 26 schema
266+
struct<entries:map<string,int>>
267+
-- !query 26 output
268+
{"a":1,"b":2}
269+
270+
271+
-- !query 27
272+
select from_json('{"a":1, "b":"2"}', 'struct<a:int,b:string>')
273+
-- !query 27 schema
274+
struct<jsontostructs({"a":1, "b":"2"}):struct<a:int,b:string>>
275+
-- !query 27 output
276+
{"a":1,"b":"2"}

sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,8 +354,8 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {
354354

355355
test("SPARK-24027: from_json - map<string, map<string, int>>") {
356356
val in = Seq("""{"a": {"b": 1}}""").toDS()
357-
val schema = MapType(StringType, MapType(StringType, IntegerType))
358-
val out = in.select(from_json($"value", schema))
357+
val schema = "map<string, map<string, int>>"
358+
val out = in.select(from_json($"value", schema, Map.empty[String, String]))
359359

360360
checkAnswer(out, Row(Map("a" -> Map("b" -> 1))))
361361
}

0 commit comments

Comments
 (0)