-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-8245][SQL] FormatNumber/Length Support for Expression #7034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,11 +17,10 @@ | |
|
|
||
| package org.apache.spark.sql.catalyst.expressions | ||
|
|
||
| import java.text.DecimalFormat | ||
| import java.util.Locale | ||
| import java.util.regex.Pattern | ||
|
|
||
| import org.apache.commons.lang3.StringUtils | ||
|
|
||
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.catalyst.analysis.UnresolvedException | ||
| import org.apache.spark.sql.catalyst.expressions.codegen._ | ||
|
|
@@ -553,17 +552,23 @@ case class Substring(str: Expression, pos: Expression, len: Expression) | |
| } | ||
|
|
||
| /** | ||
| * A function that return the length of the given string expression. | ||
| * A function that return the length of the given string or binary expression. | ||
| */ | ||
| case class StringLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { | ||
| case class Length(child: Expression) extends UnaryExpression with ExpectsInputTypes { | ||
| override def dataType: DataType = IntegerType | ||
| override def inputTypes: Seq[DataType] = Seq(StringType) | ||
| override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) | ||
|
|
||
| protected override def nullSafeEval(string: Any): Any = | ||
| string.asInstanceOf[UTF8String].numChars | ||
| protected override def nullSafeEval(value: Any): Any = child.dataType match { | ||
| case StringType => value.asInstanceOf[UTF8String].numChars | ||
| case BinaryType => value.asInstanceOf[Array[Byte]].length | ||
| } | ||
|
|
||
| override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
| defineCodeGen(ctx, ev, c => s"($c).numChars()") | ||
| child.dataType match { | ||
| case StringType => defineCodeGen(ctx, ev, c => s"($c).numChars()") | ||
| case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length") | ||
| case NullType => defineCodeGen(ctx, ev, c => s"-1") | ||
| } | ||
| } | ||
|
|
||
| override def prettyName: String = "length" | ||
|
|
@@ -668,3 +673,74 @@ case class Encode(value: Expression, charset: Expression) | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Formats the number X to a format like '#,###,###.##', rounded to D decimal places, | ||
| * and returns the result as a string. If D is 0, the result has no decimal point or | ||
| * fractional part. | ||
| */ | ||
| case class FormatNumber(x: Expression, d: Expression) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. override prettyName
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. note: this is done
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry, yes, it's done, but in the end of this class code. |
||
| extends BinaryExpression with ExpectsInputTypes { | ||
|
|
||
| override def left: Expression = x | ||
| override def right: Expression = d | ||
| override def dataType: DataType = StringType | ||
| override def inputTypes: Seq[AbstractDataType] = Seq(NumericType, IntegerType) | ||
| override def foldable: Boolean = x.foldable && d.foldable | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i don't think u need to define foldable and nullable |
||
| override def nullable: Boolean = x.nullable || d.nullable | ||
|
|
||
| @transient | ||
| private var lastDValue: Int = -100 | ||
|
|
||
| @transient | ||
| private val pattern: StringBuffer = new StringBuffer() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be a lazy transient val so scala will initialize pattern on the executors after serialization?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry, @rxin, I am not so sure your mean, do you mean
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what i'm worried about is whether we would get a null value on the executors, because "new StringBuffer()" is not called. |
||
|
|
||
| @transient | ||
| private val numberFormat: DecimalFormat = new DecimalFormat("") | ||
|
|
||
| override def eval(input: InternalRow): Any = { | ||
| val xObject = x.eval(input) | ||
| if (xObject == null) { | ||
| return null | ||
| } | ||
|
|
||
| val dObject = d.eval(input) | ||
|
|
||
| if (dObject == null || dObject.asInstanceOf[Int] < 0) { | ||
| throw new IllegalArgumentException( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does hive also throw an exception? we could also just return null ...
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic is copied from Hive. Let's keep the same behavior?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just find it really weird to throw a runtime illegal argument exception... imagine you have some large dataset, and the very last record has d < 0...
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, will update that. |
||
| s"Argument 2 of function FORMAT_NUMBER must be >= 0, but $dObject was found") | ||
| } | ||
| val dValue = dObject.asInstanceOf[Int] | ||
|
|
||
| if (dValue != lastDValue) { | ||
| // construct a new DecimalFormat only if a new dValue | ||
| pattern.delete(0, pattern.length()) | ||
| pattern.append("#,###,###,###,###,###,##0") | ||
|
|
||
| // decimal place | ||
| if (dValue > 0) { | ||
| pattern.append(".") | ||
|
|
||
| var i = 0 | ||
| while (i < dValue) { | ||
| i += 1 | ||
| pattern.append("0") | ||
| } | ||
| } | ||
| val dFormat = new DecimalFormat(pattern.toString()) | ||
| lastDValue = dValue; | ||
| numberFormat.applyPattern(dFormat.toPattern()) | ||
| } | ||
|
|
||
| x.dataType match { | ||
| case ByteType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Byte])) | ||
| case ShortType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Short])) | ||
| case FloatType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Float])) | ||
| case IntegerType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Int])) | ||
| case LongType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Long])) | ||
| case DoubleType => UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Double])) | ||
| case _: DecimalType => | ||
| UTF8String.fromString(numberFormat.format(xObject.asInstanceOf[Decimal].toJavaBigDecimal)) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1685,20 +1685,42 @@ object functions { | |
| ////////////////////////////////////////////////////////////////////////////////////////////// | ||
|
|
||
| /** | ||
| * Computes the length of a given string value. | ||
| * Computes the length of a given string / binary value | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def strlen(e: Column): Column = StringLength(e.expr) | ||
| def length(e: Column): Column = Length(e.expr) | ||
|
|
||
| /** | ||
| * Computes the length of a given string column. | ||
| * Computes the length of a given string / binary column | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def strlen(columnName: String): Column = strlen(Column(columnName)) | ||
| def length(columnName: String): Column = length(Column(columnName)) | ||
|
|
||
| /** | ||
| * Formats the number X to a format like '#,###,###.##', rounded to D decimal places, | ||
| * and returns the result as a string. If D is 0, the result has no decimal point or | ||
| * fractional part. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def formatNumber(x: Column, d: Column): Column = FormatNumber(x.expr, d.expr) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's use format_number to be consistent with sql. |
||
|
|
||
| /** | ||
| * Formats the number X to a format like '#,###,###.##', rounded to D decimal places, | ||
| * and returns the result as a string. If D is 0, the result has no decimal point or | ||
| * fractional part. | ||
| * | ||
| * @group string_funcs | ||
| * @since 1.5.0 | ||
| */ | ||
| def formatNumber(columnXName: String, columnDName: String): Column = { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The second argument possible be constant integer in most of the real world cases, leave it for the further discussion, as we have lots of similar cases, @rxin maybe you can change them all in a single PR, after all of the expressions jira issus resolved.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd just support a constant number, without column name here. basically two functions format_number(e: Column, d: Int) format_number(columnName: String, d: Int) |
||
| formatNumber(Column(columnXName), Column(columnDName)) | ||
| } | ||
|
|
||
| /** | ||
| * Computes the Levenshtein distance of the two given strings. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
don't need to support NullType here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It will causes exception in
StringFunctionSuite, as we will not runAnalyzerat all there.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can just remove that test case, can't you?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh, yes, we can do that now since you've handled the NullType in a single place.