Clean up nullability in ScriptTransformation
JoshRosen committed Jul 27, 2015
commit b43e4ec31b3bd4030c9e9985a64b2f186df17825
10 changes: 5 additions & 5 deletions sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -874,15 +874,15 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
}

def matchSerDe(clause: Seq[ASTNode])
: (Seq[(String, String)], String, Seq[(String, String)]) = clause match {
: (Seq[(String, String)], Option[String], Seq[(String, String)]) = clause match {
case Token("TOK_SERDEPROPS", propsClause) :: Nil =>
val rowFormat = propsClause.map {
case Token(name, Token(value, Nil) :: Nil) => (name, value)
}
(rowFormat, "", Nil)
(rowFormat, None, Nil)
Contributor Author:
I thought it was confusing to use an empty string to represent a missing value, hence this change.

Contributor:
Could we use null here without changing the type?

Contributor Author:
We could but I feel that's a bit less clear and more error-prone.

Contributor:
+1 to this change.
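
To make the trade-off in this thread concrete, here is a minimal, hypothetical sketch (simplified clause type and illustrative names, not code from the PR) of why returning Option[String] is harder to misuse than an empty-string or null sentinel:

// Before: an empty string stands in for "no SerDe specified"; every caller has
// to remember the "" convention, and forgetting it still compiles.
def serdeClassBefore(clause: Seq[String]): String =
  if (clause.isEmpty) "" else clause.head

// After: absence is encoded in the type, so callers must handle both cases.
def serdeClassAfter(clause: Seq[String]): Option[String] =
  clause.headOption

serdeClassAfter(Nil) match {
  case Some(cls) => println(s"initialize SerDe $cls")
  case None      => println("fall back to the default row format")
}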


case Token("TOK_SERDENAME", Token(serdeClass, Nil) :: Nil) :: Nil =>
(Nil, serdeClass, Nil)
(Nil, Some(serdeClass), Nil)

case Token("TOK_SERDENAME", Token(serdeClass, Nil) ::
Token("TOK_TABLEPROPERTIES",
@@ -891,9 +891,9 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C
case Token("TOK_TABLEPROPERTY", Token(name, Nil) :: Token(value, Nil) :: Nil) =>
(name, value)
}
(Nil, serdeClass, serdeProps)
(Nil, Some(serdeClass), serdeProps)

case Nil => (Nil, "", Nil)
case Nil => (Nil, None, Nil)
}

val (inRowFormat, inSerdeClass, inSerdeProps) = matchSerDe(inputSerdeClause)
sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.execution

import java.io.{BufferedReader, DataInputStream, DataOutputStream, EOFException, InputStreamReader}
import java.util.Properties
import javax.annotation.Nullable

import scala.collection.JavaConversions._

@@ -68,7 +69,11 @@ case class ScriptTransformation(
val errorStream = proc.getErrorStream
val reader = new BufferedReader(new InputStreamReader(inputStream))

val (outputSerde, outputSoi) = ioschema.initOutputSerDe(output)
// This nullability is a performance optimization in order to avoid an Option.foreach() call
// inside of a loop
@Nullable val (outputSerde, outputSoi) = {
ioschema.initOutputSerDe(output).getOrElse((null, null))
}

val iterator: Iterator[InternalRow] = new Iterator[InternalRow] with HiveInspectors {
var cacheRow: InternalRow = null
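
The @Nullable locals introduced in this change follow a common hot-loop pattern: unwrap the Option once, outside the loop, and branch on a plain null check per row. A self-contained sketch of that pattern (illustrative names rather than the PR's actual SerDe types; assumes the JSR-305 javax.annotation.Nullable annotation is on the classpath, as it is in Spark):

import javax.annotation.Nullable

object NullableLoopSketch extends App {
  // Hypothetical stand-in for the real (AbstractSerDe, ObjectInspector) pair.
  case class FakeSerde(name: String)

  def initSerde(clazz: Option[String]): Option[FakeSerde] = clazz.map(c => FakeSerde(c))

  // Unwrap once: the per-row code below pays for a reference comparison
  // instead of an Option.foreach call on every iteration.
  @Nullable val serde: FakeSerde = initSerde(Some("org.example.MySerde")).orNull

  Seq("a", "b", "c").foreach { row =>
    if (serde != null) {
      println(s"${serde.name} serializes $row")  // SerDe-based path
    } else {
      println(s"plain text: $row")               // default row-format path
    }
  }
}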
@@ -146,7 +151,9 @@ case class ScriptTransformation(
}
}

val (inputSerde, inputSoi) = ioschema.initInputSerDe(input)
// This nullability is a performance optimization in order to avoid an Option.foreach() call
// inside of a loop
@Nullable val (inputSerde, inputSoi) = ioschema.initInputSerDe(input).getOrElse((null, null))
val dataOutputStream = new DataOutputStream(outputStream)
val outputProjection = new InterpretedProjection(input, child.output)

@@ -200,33 +207,43 @@ private[hive]
case class HiveScriptIOSchema (
Contributor Author:
The changes in this class are purely code-cleanup, mostly related to making more variables private and performing some cleanup related to nullability.

inputRowFormat: Seq[(String, String)],
outputRowFormat: Seq[(String, String)],
inputSerdeClass: String,
outputSerdeClass: String,
inputSerdeClass: Option[String],
outputSerdeClass: Option[String],
inputSerdeProps: Seq[(String, String)],
outputSerdeProps: Seq[(String, String)],
schemaLess: Boolean) extends ScriptInputOutputSchema with HiveInspectors {

val defaultFormat = Map(("TOK_TABLEROWFORMATFIELD", "\t"),
("TOK_TABLEROWFORMATLINES", "\n"))
private val defaultFormat = Map(
("TOK_TABLEROWFORMATFIELD", "\t"),
("TOK_TABLEROWFORMATLINES", "\n")
)

val inputRowFormatMap = inputRowFormat.toMap.withDefault((k) => defaultFormat(k))
val outputRowFormatMap = outputRowFormat.toMap.withDefault((k) => defaultFormat(k))


def initInputSerDe(input: Seq[Expression]): (AbstractSerDe, ObjectInspector) = {
val (columns, columnTypes) = parseAttrs(input)
val serde = initSerDe(inputSerdeClass, columns, columnTypes, inputSerdeProps)
(serde, initInputSoi(serde, columns, columnTypes))
def initInputSerDe(input: Seq[Expression]): Option[(AbstractSerDe, ObjectInspector)] = {
inputSerdeClass.map { serdeClass =>
val (columns, columnTypes) = parseAttrs(input)
val serde = initSerDe(serdeClass, columns, columnTypes, inputSerdeProps)
val fieldObjectInspectors = columnTypes.map(toInspector)
val objectInspector = ObjectInspectorFactory
.getStandardStructObjectInspector(columns, fieldObjectInspectors)
.asInstanceOf[ObjectInspector]
(serde, objectInspector)
}
}

def initOutputSerDe(output: Seq[Attribute]): (AbstractSerDe, StructObjectInspector) = {
val (columns, columnTypes) = parseAttrs(output)
val serde = initSerDe(outputSerdeClass, columns, columnTypes, outputSerdeProps)
(serde, initOutputputSoi(serde))
def initOutputSerDe(output: Seq[Attribute]): Option[(AbstractSerDe, StructObjectInspector)] = {
outputSerdeClass.map { serdeClass =>
val (columns, columnTypes) = parseAttrs(output)
val serde = initSerDe(serdeClass, columns, columnTypes, outputSerdeProps)
val structObjectInspector = serde.getObjectInspector().asInstanceOf[StructObjectInspector]
(serde, structObjectInspector)
}
}

def parseAttrs(attrs: Seq[Expression]): (Seq[String], Seq[DataType]) = {

private def parseAttrs(attrs: Seq[Expression]): (Seq[String], Seq[DataType]) = {
val columns = attrs.map {
case aref: AttributeReference => aref.name
case e: NamedExpression => e.name
@@ -242,52 +259,29 @@ case class HiveScriptIOSchema (
(columns, columnTypes)
}

def initSerDe(serdeClassName: String, columns: Seq[String],
columnTypes: Seq[DataType], serdeProps: Seq[(String, String)]): AbstractSerDe = {
private def initSerDe(
serdeClassName: String,
columns: Seq[String],
columnTypes: Seq[DataType],
serdeProps: Seq[(String, String)]): AbstractSerDe = {

val serde: AbstractSerDe = if (serdeClassName != "") {
val serde: AbstractSerDe = {
val trimed_class = serdeClassName.split("'")(1)
Contributor Author:
I'm going to fix this in a commit in a few minutes. This is really messy: it would be better to perform this work in the parser rather than doing it here.

Contributor Author:
Not only is this messy, I think it's also wrong:

  • This will only support single quotes.
  • It creates the potential for parser errors to become runtime errors instead of analysis errors.
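
As a purely illustrative aside (not code from this PR, and with a hypothetical helper name), more robust quote handling along the lines the reviewer suggests might strip either quote style and fail eagerly, so malformed input surfaces as a clear error rather than an ArrayIndexOutOfBoundsException from split("'")(1):

// Hypothetical helper: strip one layer of matching single or double quotes.
def unquote(raw: String): String = {
  val s = raw.trim
  val singleQuoted = s.length >= 2 && s.startsWith("'") && s.endsWith("'")
  val doubleQuoted = s.length >= 2 && s.startsWith("\"") && s.endsWith("\"")
  if (singleQuoted || doubleQuoted) s.substring(1, s.length - 1)
  else sys.error(s"Expected a quoted SerDe class name but got: $raw")
}

// unquote("'org.example.MySerde'")   => org.example.MySerde
// unquote("\"org.example.MySerde\"") => org.example.MySerde
// unquote("org.example.MySerde")     => clear error instead of an AIOOBE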

Utils.classForName(trimed_class)
.newInstance.asInstanceOf[AbstractSerDe]
} else {
null
}

if (serde != null) {
val columnTypesNames = columnTypes.map(_.toTypeInfo.getTypeName()).mkString(",")
val columnTypesNames = columnTypes.map(_.toTypeInfo.getTypeName()).mkString(",")

var propsMap = serdeProps.map(kv => {
(kv._1.split("'")(1), kv._2.split("'")(1))
}).toMap + (serdeConstants.LIST_COLUMNS -> columns.mkString(","))
propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames)
var propsMap = serdeProps.map(kv => {
(kv._1.split("'")(1), kv._2.split("'")(1))
}).toMap + (serdeConstants.LIST_COLUMNS -> columns.mkString(","))
propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames)

val properties = new Properties()
properties.putAll(propsMap)
serde.initialize(null, properties)
}
val properties = new Properties()
properties.putAll(propsMap)
serde.initialize(null, properties)

serde
}

def initInputSoi(inputSerde: AbstractSerDe, columns: Seq[String], columnTypes: Seq[DataType])
: ObjectInspector = {

if (inputSerde != null) {
val fieldObjectInspectors = columnTypes.map(toInspector(_))
ObjectInspectorFactory
.getStandardStructObjectInspector(columns, fieldObjectInspectors)
.asInstanceOf[ObjectInspector]
} else {
null
}
}

def initOutputputSoi(outputSerde: AbstractSerDe): StructObjectInspector = {
if (outputSerde != null) {
outputSerde.getObjectInspector().asInstanceOf[StructObjectInspector]
} else {
null
}
}
}

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala
@@ -30,8 +30,8 @@ class ScriptTransformationSuite extends SparkPlanTest {
private val ioschema = HiveScriptIOSchema(
inputRowFormat = Seq.empty,
outputRowFormat = Seq.empty,
inputSerdeClass = "",
outputSerdeClass = "",
inputSerdeClass = None,
outputSerdeClass = None,
inputSerdeProps = Seq.empty,
outputSerdeProps = Seq.empty,
schemaLess = false