From 77f57ef41d12c2b5fc061305b71932f39207b93f Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Wed, 1 Jul 2015 14:52:02 +0800
Subject: [PATCH 1/3] add python API for MinMaxScaler

---
 python/pyspark/ml/feature.py | 83 +++++++++++++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 8804dace849b..ec8a6fc9dae4 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -24,7 +24,7 @@
 __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder',
            'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel',
            'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer',
-           'Word2Vec', 'Word2VecModel']
+           'Word2Vec', 'Word2VecModel', 'MinMaxScaler', 'MinMaxScalerModel']
 
 
 @inherit_doc
@@ -1030,6 +1030,87 @@ class Word2VecModel(JavaModel):
     """
 
 
+@inherit_doc
+class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    Rescale each feature individually to a common range [min, max] linearly using column summary
+    statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
+    feature E is calculated as,
+
+    Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
+
+    For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min)
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
+    >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
+    >>> model = mmScaler.fit(df)
+    >>> model.transform(df).collect()[1].scaled
+    DenseVector([1.0])
+    """
+
+    # a placeholder to make it appear in the generated doc
+    min = Param(Params._dummy(), "min", "Lower bound of the output feature range")
+    max = Param(Params._dummy(), "max", "Upper bound of the output feature range")
+
+    @keyword_only
+    def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
+        """
+        __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
+        """
+        super(MinMaxScaler, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid)
+        self.min = Param(self, "min", "Lower bound of the output feature range")
+        self.max = Param(self, "max", "Upper bound of the output feature range")
+        self._setDefault(min=0.0, max=1.0)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
+        """
+        setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
+        Sets params for this MinMaxScaler.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setMin(self, value):
+        """
+        Sets the value of :py:attr:`min`.
+        """
+        self._paramMap[self.min] = value
+        return self
+
+    def getMin(self):
+        """
+        Gets the value of min or its default value.
+        """
+        return self.getOrDefault(self.min)
+
+    def setMax(self, value):
+        """
+        Sets the value of :py:attr:`max`.
+        """
+        self._paramMap[self.max] = value
+        return self
+
+    def getMax(self):
+        """
+        Gets the value of max or its default value.
+        """
+        return self.getOrDefault(self.max)
+
+    def _create_model(self, java_model):
+        return MinMaxScalerModel(java_model)
+
+
+class MinMaxScalerModel(JavaModel):
+    """
+    Model fitted by MinMaxScaler.
+    """
+
+
 if __name__ == "__main__":
     import doctest
     from pyspark.context import SparkContext

From 7b97e6acc70a02013f4f43b3a6f2ae9e6d6b4c6a Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Wed, 19 Aug 2015 11:59:11 +0800
Subject: [PATCH 2/3] change ut and comment

---
 python/pyspark/ml/feature.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 5f3041af7b72..1921db508086 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1298,16 +1298,25 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
     statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
     feature E is calculated as,
 
-    Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
+    Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min
 
-    For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min)
+    For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)
+
+    Note that since zero values will probably be transformed to non-zero values, output of the
+    transformer will be DenseVector even for sparse input.
 
     >>> from pyspark.mllib.linalg import Vectors
     >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
     >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
     >>> model = mmScaler.fit(df)
-    >>> model.transform(df).collect()[1].scaled
-    DenseVector([1.0])
+    >>> model.transform(df).show()
+    +-----+------+
+    |    a|scaled|
+    +-----+------+
+    |[0.0]| [0.0]|
+    |[2.0]| [1.0]|
+    +-----+------+
+    ...
     """
 
     # a placeholder to make it appear in the generated doc

From 9785b5693955244f87585eb45a7ee5f6102620c3 Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Thu, 10 Sep 2015 11:24:55 +0800
Subject: [PATCH 3/3] add some comments

---
 python/pyspark/ml/feature.py | 194 ++++++++++++++++++-----------------
 1 file changed, 99 insertions(+), 95 deletions(-)

diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 05eca4824230..4310e5e96b75 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -27,11 +27,11 @@
 from pyspark.mllib.linalg import _convert_to_vector
 
 __all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
-           'IndexToString', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel',
-           'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
-           'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
-           'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
-           'Word2Vec', 'Word2VecModel']
+           'IndexToString', 'MinMaxScaler', 'MinMaxScalerModel', 'NGram', 'Normalizer',
+           'OneHotEncoder', 'PCA', 'PCAModel', 'PolynomialExpansion', 'RegexTokenizer',
+           'RFormula', 'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel',
+           'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
+           'VectorAssembler', 'VectorIndexer', 'VectorSlicer', 'Word2Vec', 'Word2VecModel']
 
 
 @inherit_doc
@@ -406,6 +406,100 @@ class IDFModel(JavaModel):
     """
 
+@inherit_doc
+class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
+    """
+    .. note:: Experimental
+
+    Rescale each feature individually to a common range [min, max] linearly using column summary
+    statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
+    feature E is calculated as,
+
+    Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min
+
+    For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)
+
+    Note that since zero values will probably be transformed to non-zero values, output of the
+    transformer will be DenseVector even for sparse input.
+
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
+    >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
+    >>> model = mmScaler.fit(df)
+    >>> model.transform(df).show()
+    +-----+------+
+    |    a|scaled|
+    +-----+------+
+    |[0.0]| [0.0]|
+    |[2.0]| [1.0]|
+    +-----+------+
+    ...
+    """
+
+    # a placeholder to make it appear in the generated doc
+    min = Param(Params._dummy(), "min", "Lower bound of the output feature range")
+    max = Param(Params._dummy(), "max", "Upper bound of the output feature range")
+
+    @keyword_only
+    def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
+        """
+        __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
+        """
+        super(MinMaxScaler, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid)
+        self.min = Param(self, "min", "Lower bound of the output feature range")
+        self.max = Param(self, "max", "Upper bound of the output feature range")
+        self._setDefault(min=0.0, max=1.0)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
+        """
+        setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
+        Sets params for this MinMaxScaler.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setMin(self, value):
+        """
+        Sets the value of :py:attr:`min`.
+        """
+        self._paramMap[self.min] = value
+        return self
+
+    def getMin(self):
+        """
+        Gets the value of min or its default value.
+        """
+        return self.getOrDefault(self.min)
+
+    def setMax(self, value):
+        """
+        Sets the value of :py:attr:`max`.
+        """
+        self._paramMap[self.max] = value
+        return self
+
+    def getMax(self):
+        """
+        Gets the value of max or its default value.
+        """
+        return self.getOrDefault(self.max)
+
+    def _create_model(self, java_model):
+        return MinMaxScalerModel(java_model)
+
+
+class MinMaxScalerModel(JavaModel):
+    """
+    .. note:: Experimental
+
+    Model fitted by :py:class:`MinMaxScaler`.
+    """
+
+
 @inherit_doc
 @ignore_unicode_prefix
 class NGram(JavaTransformer, HasInputCol, HasOutputCol):
     """
@@ -1677,96 +1771,6 @@ class RFormulaModel(JavaModel):
     """
 
-
-@inherit_doc
-class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
-    """
-    Rescale each feature individually to a common range [min, max] linearly using column summary
-    statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
-    feature E is calculated as,
-
-    Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min
-
-    For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)
-
-    Note that since zero values will probably be transformed to non-zero values, output of the
-    transformer will be DenseVector even for sparse input.
-
-    >>> from pyspark.mllib.linalg import Vectors
-    >>> df = sqlContext.createDataFrame([(Vectors.dense([0.0]),), (Vectors.dense([2.0]),)], ["a"])
-    >>> mmScaler = MinMaxScaler(inputCol="a", outputCol="scaled")
-    >>> model = mmScaler.fit(df)
-    >>> model.transform(df).show()
-    +-----+------+
-    |    a|scaled|
-    +-----+------+
-    |[0.0]| [0.0]|
-    |[2.0]| [1.0]|
-    +-----+------+
-    ...
-    """
-
-    # a placeholder to make it appear in the generated doc
-    min = Param(Params._dummy(), "min", "Lower bound of the output feature range")
-    max = Param(Params._dummy(), "max", "Upper bound of the output feature range")
-
-    @keyword_only
-    def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
-        """
-        __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
-        """
-        super(MinMaxScaler, self).__init__()
-        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid)
-        self.min = Param(self, "min", "Lower bound of the output feature range")
-        self.max = Param(self, "max", "Upper bound of the output feature range")
-        self._setDefault(min=0.0, max=1.0)
-        kwargs = self.__init__._input_kwargs
-        self.setParams(**kwargs)
-
-    @keyword_only
-    def setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
-        """
-        setParams(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
-        Sets params for this MinMaxScaler.
-        """
-        kwargs = self.setParams._input_kwargs
-        return self._set(**kwargs)
-
-    def setMin(self, value):
-        """
-        Sets the value of :py:attr:`min`.
-        """
-        self._paramMap[self.min] = value
-        return self
-
-    def getMin(self):
-        """
-        Gets the value of min or its default value.
-        """
-        return self.getOrDefault(self.min)
-
-    def setMax(self, value):
-        """
-        Sets the value of :py:attr:`max`.
-        """
-        self._paramMap[self.max] = value
-        return self
-
-    def getMax(self):
-        """
-        Gets the value of max or its default value.
-        """
-        return self.getOrDefault(self.max)
-
-    def _create_model(self, java_model):
-        return MinMaxScalerModel(java_model)
-
-
-class MinMaxScalerModel(JavaModel):
-    """
-    Model fitted by MinMaxScaler.
-    """
-
-
 if __name__ == "__main__":
     import doctest
     from pyspark.context import SparkContext
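For reference, the rescaling these patches wire up can be reproduced outside Spark in a few lines of plain Python. The sketch below is not part of the patch (the helper name `rescale` is hypothetical); it applies the docstring formula Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min to one feature column, including the E_max == E_min case that maps every value to 0.5 * (max + min):

def rescale(column, new_min=0.0, new_max=1.0):
    # Hypothetical helper, not part of the patch: per-feature min-max rescaling
    # following the MinMaxScaler docstring formula.
    e_min, e_max = min(column), max(column)
    if e_max == e_min:
        # Degenerate case from the docstring: every value maps to the midpoint of the range.
        return [0.5 * (new_max + new_min)] * len(column)
    return [(e - e_min) / (e_max - e_min) * (new_max - new_min) + new_min
            for e in column]

print(rescale([0.0, 2.0]))  # [0.0, 1.0], matching the doctest output above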