Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions common/utils/src/main/resources/error/error-classes.json
Original file line number Diff line number Diff line change
Expand Up @@ -1788,6 +1788,11 @@
"expects a binary value with 16, 24 or 32 bytes, but got <actualLength> bytes."
]
},
"BINARY_FORMAT" : {
"message" : [
"expects one of binary formats 'base64', 'hex', 'utf-8', but got <invalidFormat>."
]
},
"DATETIME_UNIT" : {
"message" : [
"expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal <invalidValue>."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ supports 16-byte CBC IVs and 12-byte GCM IVs, but got `<actualLength>` bytes for

expects a binary value with 16, 24 or 32 bytes, but got `<actualLength>` bytes.

## BINARY_FORMAT

expects one of binary formats 'base64', 'hex', 'utf-8', but got `<invalidFormat>`.

## DATETIME_UNIT

expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal `<invalidValue>`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe
import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
import org.apache.spark.sql.catalyst.util.ToNumberParser
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.types.{AbstractDataType, DataType, DatetimeType, Decimal, DecimalType, StringType}
import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, DatetimeType, Decimal, DecimalType, StringType}
import org.apache.spark.unsafe.types.UTF8String

abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Boolean)
Expand Down Expand Up @@ -209,6 +209,10 @@ case class TryToNumber(left: Expression, right: Expression)
wrapped by angle brackets if the input value is negative.
('<1>').
If `expr` is a datetime, `format` shall be a valid datetime pattern, see <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Datetime Patterns</a>.
If `expr` is a binary, it is converted to a string in one of the formats:
'base64': a base 64 string.
'hex': a string in the hexadecimal format.
'utf-8': the input binary is decoded to UTF-8 string.
""",
examples = """
Examples:
Expand All @@ -224,6 +228,12 @@ case class TryToNumber(left: Expression, right: Expression)
12,454.8-
> SELECT _FUNC_(date'2016-04-08', 'y');
2016
> SELECT _FUNC_(x'537061726b2053514c', 'base64');
U3BhcmsgU1FM
> SELECT _FUNC_(x'537061726b2053514c', 'hex');
537061726B2053514C
> SELECT _FUNC_(encode('abc', 'utf-8'), 'utf-8');
abc
""",
since = "3.4.0",
group = "string_funcs")
Expand All @@ -232,10 +242,20 @@ object ToCharacterBuilder extends ExpressionBuilder {
override def build(funcName: String, expressions: Seq[Expression]): Expression = {
val numArgs = expressions.length
if (expressions.length == 2) {
val inputExpr = expressions.head
val (inputExpr, format) = (expressions(0), expressions(1))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a bit weird, we can just write 2 lines

val inputExpr = expressions(0)
val format = expressions(1)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is the right approach to split what is semantically one operation — "assign names" — across two lines.

inputExpr.dataType match {
case _: DatetimeType => DateFormatClass(inputExpr, expressions(1))
case _ => ToCharacter(inputExpr, expressions(1))
case _: DatetimeType => DateFormatClass(inputExpr, format)
case _: BinaryType =>
if (!(format.dataType == StringType && format.foldable)) {
throw QueryCompilationErrors.requireLiteralParameter(funcName, "format", "string")
}
format.eval().asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT).trim match {
case "base64" => Base64(inputExpr)
case "hex" => Hex(inputExpr)
case "utf-8" => new Decode(Seq(inputExpr, format))
case invalid => throw QueryCompilationErrors.binaryFormatError(funcName, invalid)
}
case _ => ToCharacter(inputExpr, format)
}
} else {
throw QueryCompilationErrors.wrongNumArgsError(funcName, Seq(2), numArgs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat
"functionName" -> toSQLId("format_string")))
}

def binaryFormatError(funcName: String, invalidFormat: String): Throwable = {
new AnalysisException(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My feeling is that, if the error is only thrown in one place, we don't need to add a method here; we can just throw new AnalysisException... on the caller side.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of the purposes of gathering exceptions into Query*Errors is so that callers don't depend on how exceptions are formed: error classes, quoting, context, and so on. The caller only has to provide the meaningful information and doesn't need to worry about any of the technical details.

I don't think this PR is the right place to start changing that convention.

errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT",
messageParameters = Map(
"parameter" -> toSQLId("format"),
"functionName" -> toSQLId(funcName),
"invalidFormat" -> toSQLValue(invalidFormat, StringType)))
}

def unorderablePivotColError(pivotCol: Expression): Throwable = {
new AnalysisException(
errorClass = "INCOMPARABLE_PIVOT_COLUMN",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -854,28 +854,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
)
}

test("to_char") {
val df = Seq(78.12).toDF("a")
checkAnswer(
df.selectExpr("to_char(a, '$99.99')"),
Seq(Row("$78.12"))
)
checkAnswer(
df.select(to_char(col("a"), lit("$99.99"))),
Seq(Row("$78.12"))
)
}

test("to_varchar") {
val df = Seq(78.12).toDF("a")
checkAnswer(
df.selectExpr("to_varchar(a, '$99.99')"),
Seq(Row("$78.12"))
)
checkAnswer(
df.select(to_varchar(col("a"), lit("$99.99"))),
Seq(Row("$78.12"))
)
test("to_char/to_varchar") {
Seq(
"to_char" -> ((e: Column, fmt: Column) => to_char(e, fmt)),
"to_varchar" -> ((e: Column, fmt: Column) => to_varchar(e, fmt))
).foreach { case (funcName, func) =>
val df = Seq(78.12).toDF("a")
checkAnswer(df.selectExpr(s"$funcName(a, '$$99.99')"), Seq(Row("$78.12")))
checkAnswer(df.select(func(col("a"), lit("$99.99"))), Seq(Row("$78.12")))

val df2 = Seq((Array(100.toByte), "base64")).toDF("input", "format")
checkAnswer(df2.selectExpr(s"$funcName(input, 'hex')"), Seq(Row("64")))
checkAnswer(df2.select(func(col("input"), lit("hex"))), Seq(Row("64")))
checkAnswer(df2.selectExpr(s"$funcName(input, 'base64')"), Seq(Row("ZA==")))
checkAnswer(df2.select(func(col("input"), lit("base64"))), Seq(Row("ZA==")))
checkAnswer(df2.selectExpr(s"$funcName(input, 'utf-8')"), Seq(Row("d")))
checkAnswer(df2.select(func(col("input"), lit("utf-8"))), Seq(Row("d")))

checkError(
exception = intercept[AnalysisException] {
df2.select(func(col("input"), col("format"))).collect()
},
errorClass = "_LEGACY_ERROR_TEMP_1100",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: Open a JIRA to assign a proper name to the error class.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is the PR #42737

parameters = Map(
"argName" -> "format",
"funcName" -> "to_char",
"requiredType" -> "string"))
checkError(
exception = intercept[AnalysisException] {
df2.select(func(col("input"), lit("invalid_format"))).collect()
},
errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT",
parameters = Map(
"parameter" -> "`format`",
"functionName" -> "`to_char`",
"invalidFormat" -> "'invalid_format'"))
checkError(
exception = intercept[AnalysisException] {
sql(s"select $funcName('a', 'b', 'c')")
},
errorClass = "WRONG_NUM_ARGS.WITHOUT_SUGGESTION",
parameters = Map(
"functionName" -> s"`$funcName`",
"expectedNum" -> "2",
"actualNum" -> "3",
"docroot" -> SPARK_DOC_ROOT),
context = ExpectedContext("", "", 7, 21 + funcName.length, s"$funcName('a', 'b', 'c')"))
}
}

test("to_number") {
Expand Down