-
Notifications
You must be signed in to change notification settings - Fork 29k
[Spark-8530] [ML] add python API for MinMaxScaler #7150
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
77f57ef
3333ec9
583bacf
86e8482
7b97e6a
99042c5
9785b56
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,7 @@ | |
| __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder', | ||
| 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', | ||
| 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', | ||
| - 'Word2Vec', 'Word2VecModel'] | ||
| + 'Word2Vec', 'Word2VecModel', 'MinMaxScaler', 'MinMaxScalerModel'] | ||
|
|
||
|
|
||
| @inherit_doc | ||
|
|
@@ -1030,6 +1030,87 @@ class Word2VecModel(JavaModel): | |
| """ | ||
|
|
||
|
|
||
| @inherit_doc | ||
| class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): | ||
| """ | ||
| Rescale each feature individually to a common range [min, max] linearly using column summary | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. mark with |
||
| statistics, which is also known as min-max normalization or Rescaling. The rescaled value for | ||
| feature E is calculated as, | ||
|
|
||
| Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min | ||
|
|
||
| For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please copy the full Scala doc: "Note that since zero values will probably be transformed to non-zero values, output of the transformer will be DenseVector even for sparse input." |
||
|
|
||
| >>> from pyspark.mllib.linalg import Vectors | ||
| >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"]) | ||
| >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled") | ||
| >>> model = mmScaler.fit(df) | ||
| >>> model.transform(df).collect()[1].scaled | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you please change this to `model.transform(df).show()`? That looks nice (and is OK for 2 rows). |
||
| DenseVector([1.0]) | ||
| """ | ||
|
|
||
| # a placeholder to make it appear in the generated doc | ||
| min = Param(Params._dummy(), "min", "Lower bound of the output feature range") | ||
| max = Param(Params._dummy(), "max", "Upper bound of the output feature range") | ||
|
|
||
| @keyword_only | ||
| def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): | ||
| """ | ||
| __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None) | ||
| """ | ||
| super(MinMaxScaler, self).__init__() | ||
| self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) | ||
| self.min = Param(self, "min", "Lower bound of the output feature range") | ||
| self.max = Param(self, "max", "Upper bound of the output feature range") | ||
| self._setDefault(min=0.0, max=1.0) | ||
| kwargs = self.__init__._input_kwargs | ||
| self.setParams(**kwargs) | ||
|
|
||
| @keyword_only | ||
| def setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None): | ||
| """ | ||
| setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None) | ||
| Sets params for this MinMaxScaler. | ||
| """ | ||
| kwargs = self.setParams._input_kwargs | ||
| return self._set(**kwargs) | ||
|
|
||
| def setMin(self, value): | ||
| """ | ||
| Sets the value of :py:attr:`min`. | ||
| """ | ||
| self._paramMap[self.min] = value | ||
| return self | ||
|
|
||
| def getMin(self): | ||
| """ | ||
| Gets the value of min or its default value. | ||
| """ | ||
| return self.getOrDefault(self.min) | ||
|
|
||
| def setMax(self, value): | ||
| """ | ||
| Sets the value of :py:attr:`max`. | ||
| """ | ||
| self._paramMap[self.max] = value | ||
| return self | ||
|
|
||
| def getMax(self): | ||
| """ | ||
| Gets the value of max or its default value. | ||
| """ | ||
| return self.getOrDefault(self.max) | ||
|
|
||
| def _create_model(self, java_model): | ||
| return MinMaxScalerModel(java_model) | ||
|
|
||
|
|
||
| class MinMaxScalerModel(JavaModel): | ||
| """ | ||
| Model fitted by MinMaxScaler. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nicer to write: |
||
| """ | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| import doctest | ||
| from pyspark.context import SparkContext | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please keep sorted