-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-23377][ML] Fixes Bucketizer with multiple columns persistence bug #20594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,10 @@ package org.apache.spark.ml.feature | |
|
|
||
| import java.{util => ju} | ||
|
|
||
| import org.json4s.JsonDSL._ | ||
| import org.json4s.JValue | ||
| import org.json4s.jackson.JsonMethods._ | ||
|
|
||
| import org.apache.spark.SparkException | ||
| import org.apache.spark.annotation.Since | ||
| import org.apache.spark.ml.Model | ||
|
|
@@ -213,6 +217,9 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String | |
| override def copy(extra: ParamMap): Bucketizer = { | ||
| defaultCopy[Bucketizer](extra).setParent(parent) | ||
| } | ||
|
|
||
| @Since("2.3.0") | ||
| override def write: MLWriter = new Bucketizer.BucketizerWriter(this) | ||
| } | ||
|
|
||
| @Since("1.6.0") | ||
|
|
@@ -290,6 +297,28 @@ object Bucketizer extends DefaultParamsReadable[Bucketizer] { | |
| } | ||
| } | ||
|
|
||
|
|
||
| private[Bucketizer] class BucketizerWriter(instance: Bucketizer) extends MLWriter { | ||
|
|
||
| override protected def saveImpl(path: String): Unit = { | ||
| // SPARK-23377: The default params will be saved and loaded as user-supplied params. | ||
| // Once `inputCols` is set, the default value of `outputCol` param causes the error | ||
| // when checking exclusive params. As a temporary to fix it, we skip the default value | ||
| // of `outputCol` if `inputCols` is set when saving the metadata. | ||
| // TODO: If we modify the persistence mechanism later to better handle default params, | ||
| // we can get rid of this. | ||
| var paramWithoutOutputCol: Option[JValue] = None | ||
| if (instance.isSet(instance.inputCols)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this can create a lot of issues with the Python API. Please see #20410 for reference. Thus I am against this fix, unless we first fix the problem I linked
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? I think they are orthogonal and this shouldn't cause the issue in Python side. Besides, as the PySpark multi-column support is not added yet (it's reverted), I think we don't hit the Python API issue. This is a quick fix to deal with the persistence bug. I'm not sure we should be blocked.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes I think #20410 is not related to this PR for now. But I am afraid in the future, when we add more functionality, potential bugs will possible to be triggered. |
||
| val params = instance.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]] | ||
|
||
| val jsonParams = params.filter(_.param != instance.outputCol).map { case ParamPair(p, v) => | ||
| p.name -> parse(p.jsonEncode(v)) | ||
| }.toList | ||
| paramWithoutOutputCol = Some(render(jsonParams)) | ||
| } | ||
| DefaultParamsWriter.saveMetadata(instance, path, sc, paramMap = paramWithoutOutputCol) | ||
| } | ||
| } | ||
|
|
||
| @Since("1.6.0") | ||
| override def load(path: String): Bucketizer = super.load(path) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need for this since annotation; the signature isn't changed in 2.3.0