Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions common/utils/src/main/resources/error/error-classes.json
Original file line number Diff line number Diff line change
Expand Up @@ -1788,6 +1788,11 @@
"expects a binary value with 16, 24 or 32 bytes, but got <actualLength> bytes."
]
},
"BINARY_FORMAT" : {
"message" : [
"expects one of binary formats 'base64', 'hex', 'utf-8', but got <invalidFormat>."
]
},
"DATETIME_UNIT" : {
"message" : [
"expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal <invalidValue>."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ supports 16-byte CBC IVs and 12-byte GCM IVs, but got `<actualLength>` bytes for

expects a binary value with 16, 24 or 32 bytes, but got `<actualLength>` bytes.

## BINARY_FORMAT

expects one of binary formats 'base64', 'hex', 'utf-8', but got `<invalidFormat>`.

## DATETIME_UNIT

expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal `<invalidValue>`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe
import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
import org.apache.spark.sql.catalyst.util.ToNumberParser
import org.apache.spark.sql.errors.QueryCompilationErrors
import org.apache.spark.sql.types.{AbstractDataType, DataType, DatetimeType, Decimal, DecimalType, StringType}
import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, DatetimeType, Decimal, DecimalType, StringType}
import org.apache.spark.unsafe.types.UTF8String

abstract class ToNumberBase(left: Expression, right: Expression, errorOnFail: Boolean)
Expand Down Expand Up @@ -209,6 +209,10 @@ case class TryToNumber(left: Expression, right: Expression)
wrapped by angle brackets if the input value is negative.
('<1>').
If `expr` is a datetime, `format` shall be a valid datetime pattern, see <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Datetime Patterns</a>.
If `expr` is a binary, it is converted to a string in one of the formats:
'base64': a base 64 string.
'hex': a string in the hexadecimal format.
'utf-8': the input binary is decoded to UTF-8 string.
""",
examples = """
Examples:
Expand All @@ -224,6 +228,12 @@ case class TryToNumber(left: Expression, right: Expression)
12,454.8-
> SELECT _FUNC_(date'2016-04-08', 'y');
2016
> SELECT _FUNC_(x'537061726b2053514c', 'base64');
U3BhcmsgU1FM
> SELECT _FUNC_(x'537061726b2053514c', 'hex');
537061726B2053514C
> SELECT _FUNC_(encode('abc', 'utf-8'), 'utf-8');
abc
""",
since = "3.4.0",
group = "string_funcs")
Expand All @@ -232,10 +242,20 @@ object ToCharacterBuilder extends ExpressionBuilder {
override def build(funcName: String, expressions: Seq[Expression]): Expression = {
val numArgs = expressions.length
if (expressions.length == 2) {
val inputExpr = expressions.head
val (inputExpr, format) = (expressions(0), expressions(1))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a bit weird, we can just write 2 lines

val inputExpr = expressions(0)
val format = expressions(1)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is the right approach to split what is semantically one operation — "assign names" — across two lines.

inputExpr.dataType match {
case _: DatetimeType => DateFormatClass(inputExpr, expressions(1))
case _ => ToCharacter(inputExpr, expressions(1))
case _: DatetimeType => DateFormatClass(inputExpr, format)
case _: BinaryType =>
if (!(format.dataType == StringType && format.foldable)) {
throw QueryCompilationErrors.requireLiteralParameter(funcName, "format", "string")
}
format.eval().asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT).trim match {
case "base64" => Base64(inputExpr)
case "hex" => Hex(inputExpr)
case "utf-8" => new Decode(Seq(inputExpr, format))
case invalid => throw QueryCompilationErrors.binaryFormatError(funcName, invalid)
}
case _ => ToCharacter(inputExpr, format)
}
} else {
throw QueryCompilationErrors.wrongNumArgsError(funcName, Seq(2), numArgs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat
"functionName" -> toSQLId("format_string")))
}

def binaryFormatError(funcName: String, invalidFormat: String): Throwable = {
new AnalysisException(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My feeling is that, if the error is only thrown in one place, we don't need to add a method here; we can just throw new AnalysisException... on the caller side.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of the purposes of gathering exceptions into Query*Errors is so that callers don't depend on how exceptions are formed: error classes, quoting, context, and so on. The caller only has to provide the meaningful information and doesn't need to worry about any of the technical details.

I don't think this PR is the right place to start changing that convention.

errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT",
messageParameters = Map(
"parameter" -> toSQLId("format"),
"functionName" -> toSQLId(funcName),
"invalidFormat" -> toSQLValue(invalidFormat, StringType)))
}

def unorderablePivotColError(pivotCol: Expression): Throwable = {
new AnalysisException(
errorClass = "INCOMPARABLE_PIVOT_COLUMN",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -854,28 +854,53 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
)
}

test("to_char") {
val df = Seq(78.12).toDF("a")
checkAnswer(
df.selectExpr("to_char(a, '$99.99')"),
Seq(Row("$78.12"))
)
checkAnswer(
df.select(to_char(col("a"), lit("$99.99"))),
Seq(Row("$78.12"))
)
}

test("to_varchar") {
val df = Seq(78.12).toDF("a")
checkAnswer(
df.selectExpr("to_varchar(a, '$99.99')"),
Seq(Row("$78.12"))
)
checkAnswer(
df.select(to_varchar(col("a"), lit("$99.99"))),
Seq(Row("$78.12"))
)
test("to_char/to_varchar") {
Seq(
"to_char" -> ((e: Column, fmt: Column) => to_char(e, fmt)),
"to_varchar" -> ((e: Column, fmt: Column) => to_varchar(e, fmt))
).foreach { case (funcName, func) =>
val df = Seq(78.12).toDF("a")
checkAnswer(df.selectExpr(s"$funcName(a, '$$99.99')"), Seq(Row("$78.12")))
checkAnswer(df.select(func(col("a"), lit("$99.99"))), Seq(Row("$78.12")))

val df2 = Seq((Array(100.toByte), "base64")).toDF("input", "format")
checkAnswer(df2.selectExpr(s"$funcName(input, 'hex')"), Seq(Row("64")))
checkAnswer(df2.select(func(col("input"), lit("hex"))), Seq(Row("64")))
checkAnswer(df2.selectExpr(s"$funcName(input, 'base64')"), Seq(Row("ZA==")))
checkAnswer(df2.select(func(col("input"), lit("base64"))), Seq(Row("ZA==")))
checkAnswer(df2.selectExpr(s"$funcName(input, 'utf-8')"), Seq(Row("d")))
checkAnswer(df2.select(func(col("input"), lit("utf-8"))), Seq(Row("d")))

checkError(
exception = intercept[AnalysisException] {
df2.select(func(col("input"), col("format"))).collect()
},
errorClass = "_LEGACY_ERROR_TEMP_1100",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: Open a JIRA to assign a proper name to the error class.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is the PR #42737

parameters = Map(
"argName" -> "format",
"funcName" -> "to_char",
"requiredType" -> "string"))
checkError(
exception = intercept[AnalysisException] {
df2.select(func(col("input"), lit("invalid_format"))).collect()
},
errorClass = "INVALID_PARAMETER_VALUE.BINARY_FORMAT",
parameters = Map(
"parameter" -> "`format`",
"functionName" -> "`to_char`",
"invalidFormat" -> "'invalid_format'"))
checkError(
exception = intercept[AnalysisException] {
sql(s"select $funcName('a', 'b', 'c')")
},
errorClass = "WRONG_NUM_ARGS.WITHOUT_SUGGESTION",
parameters = Map(
"functionName" -> s"`$funcName`",
"expectedNum" -> "2",
"actualNum" -> "3",
"docroot" -> SPARK_DOC_ROOT),
context = ExpectedContext("", "", 7, 21 + funcName.length, s"$funcName('a', 'b', 'c')"))
}
}

test("to_number") {
Expand Down