-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-5893][ML] Add bucketizer #5980
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from 1 commit
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
5fe190e
add bucketizer
yinxusen 4024cf1
add test suite
yinxusen 998bc87
check buckets
yinxusen 11fb00a
change it into an Estimator
yinxusen 2466322
refactor Bucketizer
yinxusen fb30d79
fix and test binary search
yinxusen ac77859
fix style error
yinxusen 3a16cc2
refine comments and names
yinxusen c3cc770
add more unit test for binary search
yinxusen eacfcfa
change ML attribute from splits into buckets
yinxusen 34f124a
Removed lowerInclusive, upperInclusive params from Bucketizer, and us…
jkbradley 1ca973a
one more bucketizer test
jkbradley dc8c843
Merge pull request #4 from jkbradley/yinxusen-SPARK-5893
yinxusen File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
refine comments and names
- Loading branch information
commit 3a16cc2f1836dbd8fe22f6603c641ba3aedf5bea
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,19 +32,21 @@ import org.apache.spark.sql.types.{DoubleType, StructField, StructType} | |
| * `Bucketizer` maps a column of continuous features to a column of feature buckets. | ||
| */ | ||
| @AlphaComponent | ||
| private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer]) | ||
| final class Bucketizer private[ml] (override val parent: Estimator[Bucketizer]) | ||
| extends Model[Bucketizer] with HasInputCol with HasOutputCol { | ||
|
|
||
| def this() = this(null) | ||
|
|
||
| /** | ||
| * Parameter for mapping continuous features into buckets. With n splits, there are n+1 buckets. | ||
| * A bucket defined by splits x,y holds values in the range [x,y). | ||
| * A bucket defined by splits x,y holds values in the range [x,y). Note that the splits should be | ||
| * strictly increasing. | ||
| * @group param | ||
| */ | ||
| val splits: Param[Array[Double]] = new Param[Array[Double]](this, "splits", | ||
| "Split points for mapping continuous features into buckets. With n splits, there are n+1" + | ||
| " buckets. A bucket defined by splits x,y holds values in the range [x,y).", | ||
| "Split points for mapping continuous features into buckets. With n splits, there are n+1 " + | ||
| "buckets. A bucket defined by splits x,y holds values in the range [x,y). The splits " + | ||
| "should be strictly increasing.", | ||
| Bucketizer.checkSplits) | ||
|
|
||
| /** @group getParam */ | ||
|
|
@@ -53,9 +55,15 @@ private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer]) | |
| /** @group setParam */ | ||
| def setSplits(value: Array[Double]): this.type = set(splits, value) | ||
|
|
||
| /** @group Param */ | ||
| /** | ||
| * An indicator of the inclusiveness of negative infinite. If true, then use implicit bin | ||
| * (-inf, getSplits.head). If false, then throw exception if values < getSplits.head are | ||
| * encountered. | ||
| * @group Param */ | ||
| val lowerInclusive: BooleanParam = new BooleanParam(this, "lowerInclusive", | ||
| "An indicator of the inclusiveness of negative infinite.") | ||
| "An indicator of the inclusiveness of negative infinite. If true, then use implicit bin " + | ||
| "(-inf, getSplits.head). If false, then throw exception if values < getSplits.head are " + | ||
| "encountered.") | ||
| setDefault(lowerInclusive -> true) | ||
|
|
||
| /** @group getParam */ | ||
|
|
@@ -64,9 +72,15 @@ private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer]) | |
| /** @group setParam */ | ||
| def setLowerInclusive(value: Boolean): this.type = set(lowerInclusive, value) | ||
|
|
||
| /** @group Param */ | ||
| /** | ||
| * An indicator of the inclusiveness of positive infinite. If true, then use implicit bin | ||
| * [getSplits.last, inf). If false, then throw exception if values > getSplits.last are | ||
| * encountered. | ||
| * @group Param */ | ||
| val upperInclusive: BooleanParam = new BooleanParam(this, "upperInclusive", | ||
| "An indicator of the inclusiveness of positive infinite.") | ||
| "An indicator of the inclusiveness of positive infinite. If true, then use implicit bin " + | ||
| "[getSplits.last, inf). If false, then throw exception if values > getSplits.last are " + | ||
| "encountered.") | ||
| setDefault(upperInclusive -> true) | ||
|
|
||
| /** @group getParam */ | ||
|
|
@@ -93,9 +107,7 @@ private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer]) | |
| } | ||
|
|
||
| private def prepOutputField(schema: StructType): StructField = { | ||
| val attr = new NominalAttribute( | ||
| name = Some($(outputCol)), | ||
| isOrdinal = Some(true), | ||
| val attr = new NominalAttribute(name = Some($(outputCol)), isOrdinal = Some(true), | ||
| values = Some($(splits).map(_.toString))) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Each value should correspond to 1 bin, not 1 split, right? This will have to take the inclusive parameters into account too. |
||
|
|
||
| attr.toStructField() | ||
|
|
@@ -109,7 +121,7 @@ private[ml] final class Bucketizer(override val parent: Estimator[Bucketizer]) | |
| } | ||
| } | ||
|
|
||
| object Bucketizer { | ||
| private[feature] object Bucketizer { | ||
| /** | ||
| * The given splits should match 1) its size is larger than zero; 2) it is ordered in a strictly | ||
| * increasing way. | ||
|
|
@@ -137,8 +149,8 @@ object Bucketizer { | |
| lowerInclusive: Boolean, | ||
| upperInclusive: Boolean): Double = { | ||
| if ((feature < splits.head && !lowerInclusive) || (feature > splits.last && !upperInclusive)) { | ||
| throw new Exception(s"Feature $feature out of bound, check your features or loose the" + | ||
| s" lower/upper bound constraint.") | ||
| throw new RuntimeException(s"Feature $feature out of bound, check your features or loosen " + | ||
| s"the lower/upper bound constraint.") | ||
| } | ||
| var left = 0 | ||
| var right = splits.length - 2 | ||
|
|
@@ -153,6 +165,6 @@ object Bucketizer { | |
| left = mid + 1 | ||
| } | ||
| } | ||
| throw new Exception(s"Failed to find a bucket for feature $feature.") | ||
| throw new RuntimeException(s"Unexpected error: failed to find a bucket for feature $feature.") | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know it's implied, but can this please state that the splits should be strictly increasing?