[SPARK-26216][SQL] Do not use case class as public API (UserDefinedFunction) #23178
Review thread, attached to the `@Stable` annotation and the `@since 1.3.0` tag in the diff below:

Member: I'm +1 for this PR, but I'm just wondering if this …

Member: Is it better to change it to …

Member: Yeah, actually I was wondering about the same thing.

Member: I'd go ahead and leave the Since version. The API is essentially unchanged, though there are some marginal breaking compile-time changes. But the same is true of many things we are changing in 3.0. I've tagged the JIRA with …

Contributor (Author): It's not a new API anyway; it would be weird to change the since tag to 3.0.

Member: Got it. Thank you, @HyukjinKwon, @srowen, @cloud-fan.
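To make the "marginal breaking compile-time changes" concrete, here is a hedged sketch (the `strlen` UDF is illustrative, not from the PR): user code that only goes through the documented methods keeps compiling, while code that leaned on the auto-generated case-class members of `UserDefinedFunction` stops compiling once it becomes a sealed trait.

```scala
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf

object CompatSketch {
  // Unchanged surface: constructing via functions.udf and chaining the
  // documented methods compiles before and after this PR.
  val strlen: UserDefinedFunction =
    udf((s: String) => s.length).withName("strlen")

  // Leaked case-class surface: this compiled against Spark 2.4, but fails
  // to compile against the sealed trait, because `copy` and the extractor
  // were auto-generated members of the old case class and are now gone.
  // val tweaked = strlen.copy(dataType = org.apache.spark.sql.types.LongType)
  // val UserDefinedFunction(f, dataType, inputTypes) = strlen
}
```

Sealing the trait also stops third-party code from extending `UserDefinedFunction`, which leaves Spark free to add methods to it later.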
Changes from all commits:

```diff
@@ -38,114 +38,106 @@ import org.apache.spark.sql.types.DataType
  * @since 1.3.0
  */
 @Stable
-case class UserDefinedFunction protected[sql] (
-    f: AnyRef,
-    dataType: DataType,
-    inputTypes: Option[Seq[DataType]]) {
-
-  private var _nameOption: Option[String] = None
-  private var _nullable: Boolean = true
-  private var _deterministic: Boolean = true
-
-  // This is a `var` instead of in the constructor for backward compatibility of this case class.
-  // TODO: revisit this case class in Spark 3.0, and narrow down the public surface.
-  private[sql] var nullableTypes: Option[Seq[Boolean]] = None
+sealed trait UserDefinedFunction {

   /**
    * Returns true when the UDF can return a nullable value.
    *
    * @since 2.3.0
    */
-  def nullable: Boolean = _nullable
+  def nullable: Boolean

   /**
    * Returns true iff the UDF is deterministic, i.e. the UDF produces the same output given the same
    * input.
    *
    * @since 2.3.0
    */
-  def deterministic: Boolean = _deterministic
+  def deterministic: Boolean

   /**
    * Returns an expression that invokes the UDF, using the given arguments.
    *
    * @since 1.3.0
    */
   @scala.annotation.varargs
-  def apply(exprs: Column*): Column = {
-    // TODO: make sure this class is only instantiated through `SparkUserDefinedFunction.create()`
-    // and `nullableTypes` is always set.
-    if (nullableTypes.isEmpty) {
-      nullableTypes = Some(ScalaReflection.getParameterTypeNullability(f))
-    }
-    if (inputTypes.isDefined) {
-      assert(inputTypes.get.length == nullableTypes.get.length)
-    }
+  def apply(exprs: Column*): Column
+
+  /**
+   * Updates UserDefinedFunction with a given name.
+   *
+   * @since 2.3.0
+   */
+  def withName(name: String): UserDefinedFunction
+
+  /**
+   * Updates UserDefinedFunction to non-nullable.
+   *
+   * @since 2.3.0
+   */
+  def asNonNullable(): UserDefinedFunction
+
+  /**
+   * Updates UserDefinedFunction to nondeterministic.
+   *
+   * @since 2.3.0
+   */
+  def asNondeterministic(): UserDefinedFunction
+}
+
+private[sql] case class SparkUserDefinedFunction(
+    f: AnyRef,
+    dataType: DataType,
+    inputTypes: Option[Seq[DataType]],
+    nullableTypes: Option[Seq[Boolean]],
+    name: Option[String] = None,
+    nullable: Boolean = true,
+    deterministic: Boolean = true) extends UserDefinedFunction {
+
+  @scala.annotation.varargs
+  override def apply(exprs: Column*): Column = {
+    val inputsNullSafe = nullableTypes.getOrElse {
+      ScalaReflection.getParameterTypeNullability(f)
+    }

     Column(ScalaUDF(
       f,
       dataType,
       exprs.map(_.expr),
-      nullableTypes.get,
+      inputsNullSafe,
       inputTypes.getOrElse(Nil),
-      udfName = _nameOption,
-      nullable = _nullable,
-      udfDeterministic = _deterministic))
-  }
-
-  private def copyAll(): UserDefinedFunction = {
-    val udf = copy()
-    udf._nameOption = _nameOption
-    udf._nullable = _nullable
-    udf._deterministic = _deterministic
-    udf.nullableTypes = nullableTypes
-    udf
+      udfName = name,
+      nullable = nullable,
+      udfDeterministic = deterministic))
   }

-  /**
-   * Updates UserDefinedFunction with a given name.
-   *
-   * @since 2.3.0
-   */
-  def withName(name: String): UserDefinedFunction = {
-    val udf = copyAll()
-    udf._nameOption = Option(name)
-    udf
+  override def withName(name: String): UserDefinedFunction = {
+    copy(name = Option(name))
   }

-  /**
-   * Updates UserDefinedFunction to non-nullable.
-   *
-   * @since 2.3.0
-   */
-  def asNonNullable(): UserDefinedFunction = {
+  override def asNonNullable(): UserDefinedFunction = {
     if (!nullable) {
       this
     } else {
-      val udf = copyAll()
-      udf._nullable = false
-      udf
+      copy(nullable = false)
     }
   }

-  /**
-   * Updates UserDefinedFunction to nondeterministic.
-   *
-   * @since 2.3.0
-   */
-  def asNondeterministic(): UserDefinedFunction = {
-    if (!_deterministic) {
+  override def asNondeterministic(): UserDefinedFunction = {
+    if (!deterministic) {
       this
     } else {
-      val udf = copyAll()
-      udf._deterministic = false
-      udf
+      copy(deterministic = false)
     }
   }
 }

 // We have to use a name different than `UserDefinedFunction` here, to avoid breaking the binary
 // compatibility of the auto-generate UserDefinedFunction object.
 private[sql] object SparkUserDefinedFunction {

   def create(

@@ -157,8 +149,7 @@ private[sql] object SparkUserDefinedFunction {
     } else {
       Some(inputSchemas.map(_.get.dataType))
     }
-    val udf = new UserDefinedFunction(f, dataType, inputTypes)
-    udf.nullableTypes = Some(inputSchemas.map(_.map(_.nullable).getOrElse(true)))
-    udf
+    val nullableTypes = Some(inputSchemas.map(_.map(_.nullable).getOrElse(true)))
+    SparkUserDefinedFunction(f, dataType, inputTypes, nullableTypes)
   }
 }
```
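For readers unfamiliar with `nullableTypes` in the hunk above: it records, per UDF parameter, whether the Scala type can hold null at all, so that the analyzer only needs to guard the inputs whose parameter types cannot (primitives). A small, hypothetical sketch of the inference idea follows; this is not Spark's `ScalaReflection.getParameterTypeNullability` implementation, just an illustration of what it computes.

```scala
object NullabilitySketch extends App {
  // Hypothetical helper mirroring the idea behind
  // ScalaReflection.getParameterTypeNullability: a primitive parameter
  // (Int, Long, Boolean, ...) can never hold null, so only
  // reference-typed parameters are treated as nullable.
  def parameterNullability(paramClasses: Seq[Class[_]]): Seq[Boolean] =
    paramClasses.map(c => !c.isPrimitive)

  // For a UDF like (i: Int, s: String) => s * i, the inferred
  // per-parameter nullability is List(false, true).
  println(parameterNullability(Seq(classOf[Int], classOf[String])))
}
```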
Review comment on this part of the change: Can we get rid of this in #23351?
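Stepping back, the refactoring is an instance of a general Scala idiom: publish a sealed trait as the public type and hide the case class behind a factory, so the auto-generated members (`apply`, `unapply`, `copy`, accessors for every constructor field) never become public API. A minimal, self-contained sketch under illustrative names (nothing here is Spark code):

```scala
object HandleApi {
  // Public surface: only the operations the library commits to.
  sealed trait Handle {
    def name: Option[String]
    def withName(name: String): Handle
  }

  // Private implementation. Its auto-generated case-class members stay
  // hidden, so a new field with a default can be added here later without
  // breaking source or binary compatibility for users of `Handle`.
  private case class HandleImpl(
      name: Option[String] = None,
      verbose: Boolean = false) extends Handle {
    override def withName(n: String): Handle = copy(name = Option(n))
  }

  // The only sanctioned constructor.
  def create(): Handle = HandleImpl()
}

object HandleDemo extends App {
  val h = HandleApi.create().withName("example")
  println(h.name) // Some(example)
}
```

This is essentially the shape the PR gives to `UserDefinedFunction`, `SparkUserDefinedFunction`, and `SparkUserDefinedFunction.create`.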