-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-6638] [SQL] Improve performance of StringType in SQL #5350
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 35 commits
685fd07
21f67c6
4699c3a
d32abd1
a85fb27
6b499ac
5f9e120
38c303e
c7dd4d2
bb52e44
8b45864
23a766c
9dc32d1
73e4363
956b0a4
9f4c194
537631c
28d6f32
28f3d81
e5fa5b8
8d17f21
fd11364
ac18ae6
2089d24
13d9d42
867bf50
1314a37
5116b43
08d897b
b04a19c
744788f
341ec2c
59025c8
6d776a9
2772f0d
3b7bfa8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -77,6 +77,9 @@ object CatalystTypeConverters { | |
| } | ||
| new GenericRowWithSchema(ar, structType) | ||
|
|
||
| case (d: String, _) => | ||
| UTF8String(d) | ||
|
|
||
| case (d: BigDecimal, _) => | ||
| Decimal(d) | ||
|
|
||
|
|
@@ -175,6 +178,11 @@ object CatalystTypeConverters { | |
| case other => other | ||
| } | ||
|
|
||
| case dataType: StringType => (item: Any) => extractOption(item) match { | ||
| case s: String => UTF8String(s) | ||
| case other => other | ||
| } | ||
|
|
||
| case _ => | ||
| (item: Any) => extractOption(item) match { | ||
| case d: BigDecimal => Decimal(d) | ||
|
|
@@ -184,6 +192,26 @@ object CatalystTypeConverters { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Converts Scala objects to catalyst rows / types. | ||
| * | ||
| * Note: This should be called before do evaluation on Row | ||
| * (It does not support UDT) | ||
| * This is used to create an RDD or test results with correct types for Catalyst. | ||
| */ | ||
| def convertToCatalyst(a: Any): Any = a match { | ||
| case s: String => UTF8String(s) | ||
| case d: java.sql.Date => DateUtils.fromJavaDate(d) | ||
| case d: BigDecimal => Decimal(d) | ||
| case d: java.math.BigDecimal => Decimal(d) | ||
| case seq: Seq[Any] => seq.map(convertToCatalyst) | ||
| case r: Row => Row(r.toSeq.map(convertToCatalyst): _*) | ||
| case arr: Array[Any] => arr.toSeq.map(convertToCatalyst).toArray | ||
| case m: Map[Any, Any] => | ||
| m.map { case (k, v) => (convertToCatalyst(k), convertToCatalyst(v)) }.toMap | ||
| case other => other | ||
| } | ||
|
|
||
| /** | ||
| * Converts Catalyst types used internally in rows to standard Scala types | ||
| * This method is slow, and for batch conversion you should be using converter | ||
|
|
@@ -211,6 +239,9 @@ object CatalystTypeConverters { | |
| case (i: Int, DateType) => | ||
| DateUtils.toJavaDate(i) | ||
|
|
||
| case (s: UTF8String, StringType) => | ||
| s.toString() | ||
|
|
||
| case (other, _) => | ||
| other | ||
| } | ||
|
|
@@ -262,6 +293,12 @@ object CatalystTypeConverters { | |
| case other => other | ||
| } | ||
|
|
||
| case StringType => | ||
| (item: Any) => item match { | ||
| case s: UTF8String => s.toString() | ||
| case other => other | ||
| } | ||
|
|
||
| case other => | ||
| (item: Any) => item | ||
| } | ||
|
|
@@ -284,9 +321,9 @@ object CatalystTypeConverters { | |
| row: Row, | ||
| schema: StructType, | ||
| converters: Array[Any => Any]): Row = { | ||
| val ar = new Array[Any](row.size) | ||
| val ar = new Array[Any](converters.size) | ||
| var idx = 0 | ||
| while (idx < row.size) { | ||
| while (idx < converters.size && idx < row.size) { | ||
|
||
| ar(idx) = converters(idx)(row(idx)) | ||
| idx += 1 | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,6 @@ import java.sql.{Date, Timestamp} | |
| import java.text.{DateFormat, SimpleDateFormat} | ||
|
|
||
| import org.apache.spark.Logging | ||
| import org.apache.spark.sql.catalyst.errors.TreeNodeException | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
| /** Cast the child expression to the target data type. */ | ||
|
|
@@ -112,21 +111,21 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
|
|
||
| // UDFToString | ||
| private[this] def castToString(from: DataType): Any => Any = from match { | ||
| case BinaryType => buildCast[Array[Byte]](_, new String(_, "UTF-8")) | ||
| case DateType => buildCast[Int](_, d => DateUtils.toString(d)) | ||
| case TimestampType => buildCast[Timestamp](_, timestampToString) | ||
| case _ => buildCast[Any](_, _.toString) | ||
| case BinaryType => buildCast[Array[Byte]](_, UTF8String(_)) | ||
| case DateType => buildCast[Int](_, d => UTF8String(DateUtils.toString(d))) | ||
| case TimestampType => buildCast[Timestamp](_, t => UTF8String(timestampToString(t))) | ||
| case _ => buildCast[Any](_, o => UTF8String(o.toString)) | ||
| } | ||
|
|
||
| // BinaryConverter | ||
| private[this] def castToBinary(from: DataType): Any => Any = from match { | ||
| case StringType => buildCast[String](_, _.getBytes("UTF-8")) | ||
| case StringType => buildCast[UTF8String](_, _.getBytes) | ||
| } | ||
|
|
||
| // UDFToBoolean | ||
| private[this] def castToBoolean(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, _.length() != 0) | ||
| buildCast[UTF8String](_, _.length() != 0) | ||
| case TimestampType => | ||
| buildCast[Timestamp](_, t => t.getTime() != 0 || t.getNanos() != 0) | ||
| case DateType => | ||
|
|
@@ -151,8 +150,9 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // TimestampConverter | ||
| private[this] def castToTimestamp(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => { | ||
| buildCast[UTF8String](_, utfs => { | ||
| // Throw away extra if more than 9 decimal places | ||
| val s = utfs.toString | ||
| val periodIdx = s.indexOf(".") | ||
| var n = s | ||
| if (periodIdx != -1 && n.length() - periodIdx > 9) { | ||
|
|
@@ -227,8 +227,8 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // DateConverter | ||
| private[this] def castToDate(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => | ||
| try DateUtils.fromJavaDate(Date.valueOf(s)) | ||
| buildCast[UTF8String](_, s => | ||
| try DateUtils.fromJavaDate(Date.valueOf(s.toString)) | ||
| catch { case _: java.lang.IllegalArgumentException => null } | ||
| ) | ||
| case TimestampType => | ||
|
|
@@ -245,7 +245,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // LongConverter | ||
| private[this] def castToLong(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toLong catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toLong catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -261,7 +261,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // IntConverter | ||
| private[this] def castToInt(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toInt catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toInt catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -277,7 +277,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // ShortConverter | ||
| private[this] def castToShort(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toShort catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toShort catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -293,7 +293,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // ByteConverter | ||
| private[this] def castToByte(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toByte catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toByte catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -323,7 +323,9 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
|
|
||
| private[this] def castToDecimal(from: DataType, target: DecimalType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try changePrecision(Decimal(s.toDouble), target) catch { | ||
| buildCast[UTF8String](_, s => try { | ||
| changePrecision(Decimal(s.toString.toDouble), target) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not quite related to your change. But, why we convert the string to double first?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess Double is the wide format and range for numbers, or we need to have a special parser for it. |
||
| } catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -348,7 +350,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // DoubleConverter | ||
| private[this] def castToDouble(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toDouble catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toDouble catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
@@ -364,7 +366,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
| // FloatConverter | ||
| private[this] def castToFloat(from: DataType): Any => Any = from match { | ||
| case StringType => | ||
| buildCast[String](_, s => try s.toFloat catch { | ||
| buildCast[UTF8String](_, s => try s.toString.toFloat catch { | ||
| case _: NumberFormatException => null | ||
| }) | ||
| case BooleanType => | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -230,13 +230,17 @@ final class SpecificMutableRow(val values: Array[MutableValue]) extends MutableR | |
| new GenericRow(newValues) | ||
| } | ||
|
|
||
| override def update(ordinal: Int, value: Any): Unit = { | ||
| if (value == null) setNullAt(ordinal) else values(ordinal).update(value) | ||
| override def update(ordinal: Int, value: Any) { | ||
| if (value == null) { | ||
| setNullAt(ordinal) | ||
| } else { | ||
| values(ordinal).update(value) | ||
| } | ||
| } | ||
|
|
||
| override def setString(ordinal: Int, value: String): Unit = update(ordinal, value) | ||
| override def setString(ordinal: Int, value: String): Unit = update(ordinal, UTF8String(value)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why we are still expecting a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an API, so we should keep it. |
||
|
|
||
| override def getString(ordinal: Int): String = apply(ordinal).asInstanceOf[String] | ||
| override def getString(ordinal: Int): String = apply(ordinal).toString | ||
|
|
||
| override def setInt(ordinal: Int, value: Int): Unit = { | ||
| val currentValue = values(ordinal).asInstanceOf[MutableInt] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are we excepting a
Dateobject or itsIntrepresentation? For internal use, I guess we expect an int. But, since users can also call it, for them, aDateobject is expected, right?Right now, if I call
getDate, I will get aClassCastException?Can you file a jira for it for 1.4 and mark it as a blocker?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://issues.apache.org/jira/browse/SPARK-6784