21 commits
3bf7691
[SPARK-24531][TESTS] Replace 2.3.0 version with 2.3.1
mgaido91 Jun 13, 2018
97097f5
A test for converting Char to String
MaxGekk Jun 13, 2018
87640c7
Support Char in StringConverter
MaxGekk Jun 13, 2018
0fb4669
Evaluate Char literal as String literal
MaxGekk Jun 13, 2018
99dfbfe
Added a test for filtering rows by using Char literal
MaxGekk Jun 14, 2018
56fd592
Cover the case of java.lang.Character
MaxGekk Jun 14, 2018
657f7be
Improving of the test
MaxGekk Jun 14, 2018
534065e
[MINOR][CORE][TEST] Remove unnecessary sort in UnsafeInMemorySorterSuite
jiangxb1987 Jun 14, 2018
fdadc4b
[SPARK-24495][SQL] EnsureRequirement returns wrong plan when reorderi…
mgaido91 Jun 14, 2018
d3eed8f
[SPARK-24563][PYTHON] Catch TypeError when testing existence of HiveC…
icexelloss Jun 14, 2018
b8f27ae
[SPARK-24543][SQL] Support any type as DDL string for from_json's schema
MaxGekk Jun 14, 2018
18cb0c0
[SPARK-24319][SPARK SUBMIT] Fix spark-submit execution where no main …
gaborgsomogyi Jun 14, 2018
270a9a3
[SPARK-24248][K8S] Use level triggering and state reconciliation in s…
mccheah Jun 14, 2018
22daeba
[SPARK-24478][SQL] Move projection and filter push down to physical c…
rdblue Jun 15, 2018
6567fc4
[PYTHON] Fix typo in serializer exception
rberenguel Jun 15, 2018
495d8cf
[SPARK-24490][WEBUI] Use WebUI.addStaticHandler in web UIs
jaceklaskowski Jun 15, 2018
b5ccf0d
[SPARK-24396][SS][PYSPARK] Add Structured Streaming ForeachWriter for…
tdas Jun 15, 2018
90da7dc
[SPARK-24452][SQL][CORE] Avoid possible overflow in int add or multiple
kiszk Jun 15, 2018
e4fee39
[SPARK-24525][SS] Provide an option to limit number of rows in a Memo…
Jun 15, 2018
0f09ab2
Merge remote-tracking branch 'origin/master' into char-to-string
MaxGekk Jun 16, 2018
4210146
Adding ticket number to test's titles
MaxGekk Jun 16, 2018
[SPARK-24543][SQL] Support any type as DDL string for from_json's schema
## What changes were proposed in this pull request?

In this PR, I propose to support any `DataType` represented as a DDL string for the `from_json` function. After the changes, it will be possible to specify a `MapType` in SQL like:
```sql
select from_json('{"a":1, "b":2}', 'map<string, int>')
```
and in Scala (similarly in other languages):
```scala
val in = Seq("""{"a": {"b": 1}}""").toDS()
val schema = "map<string, map<string, int>>"
val out = in.select(from_json($"value", schema, Map.empty[String, String]))
```
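
The table-schema form of a DDL string keeps working as well, because the new `DataType.fromDDL` falls back to parsing a table schema when the input is not a single data type. A minimal sketch, assuming a Spark shell with `spark.implicits._` and `org.apache.spark.sql.functions._` in scope (the data and column names are illustrative):
```scala
val df = Seq("""{"a": 1, "b": "x"}""").toDS()

// Single-type DDL, enabled by this change:
df.select(from_json($"value", "struct<a:int,b:string>", Map.empty[String, String]))

// Table-schema DDL, accepted before the change and still handled via the fallback:
df.select(from_json($"value", "a INT, b STRING", Map.empty[String, String]))
```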

## How was this patch tested?

Added a couple of SQL tests and modified existing tests for Python and Scala. The latter were modified because it is not important for them in which format the schema for `from_json` is provided.

Author: Maxim Gekk <[email protected]>

Closes apache#21550 from MaxGekk/from_json-ddl-schema.
MaxGekk authored and cloud-fan committed Jun 14, 2018
commit b8f27ae3b34134a01998b77db4b7935e7f82a4fe
python/pyspark/sql/functions.py (3 changes: 1 addition & 2 deletions)
```diff
@@ -2168,8 +2168,7 @@ def from_json(col, schema, options={}):
 [Row(json=Row(a=1))]
 >>> df.select(from_json(df.value, "a INT").alias("json")).collect()
 [Row(json=Row(a=1))]
->>> schema = MapType(StringType(), IntegerType())
->>> df.select(from_json(df.value, schema).alias("json")).collect()
+>>> df.select(from_json(df.value, "MAP<STRING,INT>").alias("json")).collect()
 [Row(json={u'a': 1})]
 >>> data = [(1, '''[{"a": 1}]''')]
 >>> schema = ArrayType(StructType([StructField("a", IntegerType())]))
```
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala
```diff
@@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
 import org.apache.spark.sql.catalyst.json._
-import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, BadRecordException, FailFastMode, GenericArrayData, MapData}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
@@ -747,8 +746,8 @@ case class StructsToJson(

 object JsonExprUtils {

-  def validateSchemaLiteral(exp: Expression): StructType = exp match {
-    case Literal(s, StringType) => CatalystSqlParser.parseTableSchema(s.toString)
+  def validateSchemaLiteral(exp: Expression): DataType = exp match {
+    case Literal(s, StringType) => DataType.fromDDL(s.toString)
     case e => throw new AnalysisException(s"Expected a string literal instead of $e")
   }

```
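
Since `validateSchemaLiteral` now returns a `DataType` instead of a `StructType`, the SQL path gains the same flexibility as the Scala API. A rough sketch of the expected behavior of the updated match (illustrative, not part of the diff; results shown as comments):
```scala
// A foldable string literal is now parsed as any DataType via DataType.fromDDL:
JsonExprUtils.validateSchemaLiteral(Literal("map<string, int>"))
// => MapType(StringType, IntegerType, valueContainsNull = true)

// Any non-string-literal expression still fails analysis, as before:
JsonExprUtils.validateSchemaLiteral(Literal(42))
// => AnalysisException: Expected a string literal instead of 42
```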
sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
```diff
@@ -19,13 +19,16 @@ package org.apache.spark.sql.types

 import java.util.Locale

+import scala.util.control.NonFatal
+
 import org.json4s._
 import org.json4s.JsonAST.JValue
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._

 import org.apache.spark.annotation.InterfaceStability
 import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.util.Utils

@@ -110,6 +113,14 @@ abstract class DataType extends AbstractDataType {
 @InterfaceStability.Stable
 object DataType {

+  def fromDDL(ddl: String): DataType = {
+    try {
+      CatalystSqlParser.parseDataType(ddl)
+    } catch {
+      case NonFatal(_) => CatalystSqlParser.parseTableSchema(ddl)
+    }
+  }
+
 def fromJson(json: String): DataType = parseDataType(parse(json))

 private val nonDecimalNameToType = {
```
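
The order of the two parse attempts matters: `parseDataType` handles a single type, and only if it fails does `fromDDL` retry the input as a table schema, which comes back wrapped in a `StructType`. A small illustration of the expected results (not part of the diff; outputs shown as comments):
```scala
import org.apache.spark.sql.types._

// Parsed on the first attempt by CatalystSqlParser.parseDataType:
DataType.fromDDL("map<string, int>")
// => MapType(StringType, IntegerType, valueContainsNull = true)

// parseDataType rejects this, so the catch block falls back to
// parseTableSchema and returns a StructType:
DataType.fromDDL("a INT, b STRING")
// => StructType(StructField("a", IntegerType) :: StructField("b", StringType) :: Nil)
```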
sql/core/src/main/scala/org/apache/spark/sql/functions.scala
```diff
@@ -3369,7 +3369,7 @@ object functions {
     val dataType = try {
       DataType.fromJson(schema)
     } catch {
-      case NonFatal(_) => StructType.fromDDL(schema)
+      case NonFatal(_) => DataType.fromDDL(schema)
     }
     from_json(e, dataType, options)
   }
```
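
With this one-line change, the `from_json` overload that takes a schema string accepts three encodings: a JSON-serialized type (tried first), a single-type DDL string, and a table-schema DDL string (the latter two covered by the `DataType.fromDDL` fallback). A hedged sketch of equivalent calls, assuming a Dataset `in` of JSON strings as in the PR description:
```scala
// JSON encoding of the map type, as produced by DataType.json:
val jsonSchema =
  """{"type":"map","keyType":"string","valueType":"integer","valueContainsNull":true}"""
in.select(from_json($"value", jsonSchema, Map.empty[String, String]))

// DDL encoding of the same type, handled by the new fallback:
in.select(from_json($"value", "map<string, int>", Map.empty[String, String]))
```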
sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
```diff
@@ -31,3 +31,7 @@ CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1,
 SELECT json_tuple(jsonField, 'b', CAST(NULL AS STRING), a) FROM jsonTable;
 -- Clean up
 DROP VIEW IF EXISTS jsonTable;
+
+-- from_json - complex types
+select from_json('{"a":1, "b":2}', 'map<string, int>');
+select from_json('{"a":1, "b":"2"}', 'struct<a:int,b:string>');
```
sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
```diff
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 26
+-- Number of queries: 28


 -- !query 0
@@ -258,3 +258,19 @@ DROP VIEW IF EXISTS jsonTable
 struct<>
 -- !query 25 output

+
+
+-- !query 26
+select from_json('{"a":1, "b":2}', 'map<string, int>')
+-- !query 26 schema
+struct<entries:map<string,int>>
+-- !query 26 output
+{"a":1,"b":2}
+
+
+-- !query 27
+select from_json('{"a":1, "b":"2"}', 'struct<a:int,b:string>')
+-- !query 27 schema
+struct<jsontostructs({"a":1, "b":"2"}):struct<a:int,b:string>>
+-- !query 27 output
+{"a":1,"b":"2"}
```
sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala
```diff
@@ -354,8 +354,8 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {

   test("SPARK-24027: from_json - map<string, map<string, int>>") {
     val in = Seq("""{"a": {"b": 1}}""").toDS()
-    val schema = MapType(StringType, MapType(StringType, IntegerType))
-    val out = in.select(from_json($"value", schema))
+    val schema = "map<string, map<string, int>>"
+    val out = in.select(from_json($"value", schema, Map.empty[String, String]))

     checkAnswer(out, Row(Map("a" -> Map("b" -> 1))))
   }
```