From 1dc45790c143fb87f2ce2db9720680f2e8cc9236 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 5 Aug 2015 16:11:50 -0700 Subject: [PATCH 01/21] SPARK-9654 Add string indexer inverse in PySpark --- python/pyspark/ml/feature.py | 21 +++++++++++++++++++++ python/pyspark/ml/wrapper.py | 8 ++++++++ 2 files changed, 29 insertions(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 535d55326646..ded2005cf777 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -731,6 +731,10 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), ... key=lambda x: x[0]) [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] + >>> itd = model.invert("labelIndex", "label2").transform(td) + >>> sorted(set([(i[0], i[1]) for i in itd.select(itd.id, itd.label2).collect()]), + ... key=lambda x: x[0]) + [] """ @keyword_only @@ -761,6 +765,23 @@ class StringIndexerModel(JavaModel): Model fitted by StringIndexer. """ + def invert(self, inputCol, outputCol): + """ + Return a model to perform the inverse transformation. + Note: By default we keep the original columns during this transformation, so the inverse + should only be used on new columns such as predicted labels. + """ + return StringIndexerInverse(self._java_obj.invert(inputCol, outputCol)) + + +class StringIndexerInverse(JavaTransformer): + """ + Transform a provided column back to the original input types using either + the metadata on the input column. + Note: By default we keep the original columns during this transformation, + so the inverse should only be used on new columns such as predicted labels. 
+ """ + @inherit_doc @ignore_unicode_prefix diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 253705bde913..8af0c12265f0 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -141,6 +141,14 @@ class JavaTransformer(Transformer, JavaWrapper): __metaclass__ = ABCMeta + def __init__(self, java_obj): + """ + Initialize this instance with a Java model object. + Subclasses should esnure they have the transformer Java object + available as _java_obj. + """ + self._java_obj = java_obj + def _transform(self, dataset): self._transfer_params_to_java() return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) From 0445fcc95e3767bcf387ef6bcff127cfcf44926d Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 5 Aug 2015 16:14:00 -0700 Subject: [PATCH 02/21] doc fix --- python/pyspark/ml/feature.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index ded2005cf777..863f6ec74ec6 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -776,9 +776,9 @@ def invert(self, inputCol, outputCol): class StringIndexerInverse(JavaTransformer): """ - Transform a provided column back to the original input types using either - the metadata on the input column. - Note: By default we keep the original columns during this transformation, + Transform a provided column back to the original input types using the metadata on + the input column. + Note: By default we keep the original columns during StringIndexerModel's transformation, so the inverse should only be used on new columns such as predicted labels. """ From af2f869bbc7387708aae59e02fa3ba1a507d9da6 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 5 Aug 2015 17:13:35 -0700 Subject: [PATCH 03/21] Don't change the base class init, fill out the doctest for the invert.
--- python/pyspark/ml/feature.py | 11 +++++++++-- python/pyspark/ml/wrapper.py | 11 ++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 863f6ec74ec6..9d8ca760856c 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -731,10 +731,10 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), ... key=lambda x: x[0]) [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] - >>> itd = model.invert("labelIndex", "label2").transform(td) + >>> itd = model.invert("indexed", "label2").transform(td) >>> sorted(set([(i[0], i[1]) for i in itd.select(itd.id, itd.label2).collect()]), ... key=lambda x: x[0]) - [] + [(0, u'a'), (1, u'b'), (2, u'c'), (3, u'a'), (4, u'a'), (5, u'c')] """ @keyword_only @@ -782,6 +782,13 @@ class StringIndexerInverse(JavaTransformer): so the inverse should only be used on new columns such as predicted labels. """ + def __init__(self, java_obj): + """ + Initialize this instace of the StringIndexerInverse using the provided java_obj. + """ + self._java_obj = java_obj + + @inherit_doc @ignore_unicode_prefix diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 8af0c12265f0..8218c7c5f801 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -136,19 +136,12 @@ def _fit(self, dataset): class JavaTransformer(Transformer, JavaWrapper): """ Base class for :py:class:`Transformer`s that wrap Java/Scala - implementations. + implementations. Subclasses should ensure they have the transformer Java object + available as _java_obj. """ __metaclass__ = ABCMeta - def __init__(self, java_obj): - """ - Initialize this instance with a Java model object. - Subclasses should esnure they have the transformer Java object - available as _java_obj. 
- """ - self._java_obj = java_obj - def _transform(self, dataset): self._transfer_params_to_java() return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx) From 510bce5ab969c4a105a14a4ba903f885819109ff Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 5 Aug 2015 17:38:18 -0700 Subject: [PATCH 04/21] remove extra blank line --- python/pyspark/ml/feature.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 9d8ca760856c..e0c77a08029c 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -789,7 +789,6 @@ def __init__(self, java_obj): self._java_obj = java_obj - @inherit_doc @ignore_unicode_prefix class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): From c6da160a0dc2328b380d602a61f8f8f280112e34 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 5 Aug 2015 18:32:07 -0700 Subject: [PATCH 05/21] get rid of unicode specifiers in doctest --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index e0c77a08029c..af6b86fcf57d 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -734,7 +734,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): >>> itd = model.invert("indexed", "label2").transform(td) >>> sorted(set([(i[0], i[1]) for i in itd.select(itd.id, itd.label2).collect()]), ...
key=lambda x: x[0]) - [(0, u'a'), (1, u'b'), (2, u'c'), (3, u'a'), (4, u'a'), (5, u'c')] + [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')] """ @keyword_only From 9f5af3a4d3873f27ad53dd9f90b7bc40e29ec5cb Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 5 Aug 2015 19:10:15 -0700 Subject: [PATCH 06/21] Deal with the difference between 2.X and 3.X with the output by just converting to regular string --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index af6b86fcf57d..ee1faffdd2c9 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -732,7 +732,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): ... key=lambda x: x[0]) [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] >>> itd = model.invert("indexed", "label2").transform(td) - >>> sorted(set([(i[0], i[1]) for i in itd.select(itd.id, itd.label2).collect()]), + >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]), ... 
key=lambda x: x[0]) [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')] """ From 7b3b5ca2c5c4acfa61d444bf2d6c3867e8dfef95 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 11 Aug 2015 18:22:51 -0700 Subject: [PATCH 07/21] Use the standard constructor method for the StringIndexInverse --- .../spark/ml/feature/StringIndexer.scala | 5 ++ python/pyspark/ml/feature.py | 49 +++++++++++++++++-- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 24250e4c4cf9..020b1a5d27c9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -122,6 +122,11 @@ class StringIndexerModel ( map } + /** + * The labels used for applying this transformation + */ + private[spark] def getLabels() = labels + /** @group setParam */ def setHandleInvalid(value: String): this.type = set(handleInvalid, value) setDefault(handleInvalid, "error") diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index ee1faffdd2c9..45490604fd92 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -731,7 +731,8 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), ... key=lambda x: x[0]) [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] - >>> itd = model.invert("indexed", "label2").transform(td) + >>> inverter = model.invert("indexed", "label2") + >>> itd = inverter.transform(td) >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]), ... 
key=lambda x: x[0]) [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')] @@ -771,22 +772,60 @@ def invert(self, inputCol, outputCol): Note: By default we keep the original columns during this transformation, so the inverse should only be used on new columns such as predicted labels. """ - return StringIndexerInverse(self._java_obj.invert(inputCol, outputCol)) + labels = self._java_obj.getLabels() + return StringIndexerInverse(inputCol=inputCol, outputCol=outputCol, + labels=labels) -class StringIndexerInverse(JavaTransformer): +class StringIndexerInverse(JavaTransformer, HasInputCol, HasOutputCol): """ Transform a provided column back to the original input types using the metadata on the input column. Note: By default we keep the original columns during StringIndexerModel's transformation, so the inverse should only be used on new columns such as predicted labels. """ + # a placeholder to make the labels show up in generated doc + labels = Param(Params._dummy(), "lables", + "Optional labels to be provided by the user, if not supplied column " + + "metadata is read for labels. The default value is an empty array, " + + "but the empty array is ignored and column metadata used instead.") - def __init__(self, java_obj): + @keyword_only + def __init__(self, inputCol=None, outputCol=None, labels=[]): """ Initialize this instace of the StringIndexerInverse using the provided java_obj. """ - self._java_obj = java_obj + super(StringIndexerInverse, self).__init__() + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexerInverse", + self.uid) + self.labels = Param(self, "labels", + "Optional labels to be provided by the user, if not supplied column " + + "metadata is read for labels. 
The default value is an empty array, " + + "but the empty array is ignored and column metadata used instead.") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, inputCol=None, outputCol=None, labels=[]): + """ + setParams(self, inputCol="input", outputCol="output", labels=[]) + Sets params for this StringIndexerInverse + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def setLabels(self, value): + """ + Specify the labels to be used. + """ + self._paramMap[self.labels] = value + return self + + def getLabels(self): + """ + Get the labels. + """ + return self.getOrDefault(self.labels) @inherit_doc From 244e0833a2c62f0b5d7deecb723d7eaa8b90f79b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 14 Aug 2015 14:18:08 -0700 Subject: [PATCH 08/21] Update for index to string changeover --- python/pyspark/ml/feature.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 45490604fd92..5cd8ba49ecac 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -26,13 +26,12 @@ from pyspark.mllib.common import inherit_doc from pyspark.mllib.linalg import _convert_to_vector -__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', - 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', +__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'NGram', + 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] - @inherit_doc class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """ @@ -731,7 +730,7 @@ class StringIndexer(JavaEstimator, HasInputCol, 
HasOutputCol): >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), ... key=lambda x: x[0]) [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] - >>> inverter = model.invert("indexed", "label2") + >>> inverter = IndexToString("indexed", "label2", model.labels()) >>> itd = inverter.transform(td) >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]), ... key=lambda x: x[0]) @@ -757,6 +756,10 @@ def setParams(self, inputCol=None, outputCol=None): kwargs = self.setParams._input_kwargs return self._set(**kwargs) + @property + def labels(self): + return self._java_obj.labels + def _create_model(self, java_model): return StringIndexerModel(java_model) @@ -766,21 +769,10 @@ class StringIndexerModel(JavaModel): Model fitted by StringIndexer. """ - def invert(self, inputCol, outputCol): - """ - Return a model to perform the inverse transformation. - Note: By default we keep the original columns during this transformation, so the inverse - should only be used on new columns such as predicted labels. - """ - labels = self._java_obj.getLabels() - return StringIndexerInverse(inputCol=inputCol, outputCol=outputCol, - labels=labels) - - -class StringIndexerInverse(JavaTransformer, HasInputCol, HasOutputCol): +class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): """ - Transform a provided column back to the original input types using the metadata on - the input column. + Convert provided indexes back to strings using either the metadata on the input column + or user provided labels. Note: By default we keep the original columns during StringIndexerModel's transformation, so the inverse should only be used on new columns such as predicted labels. """ @@ -796,7 +788,7 @@ def __init__(self, inputCol=None, outputCol=None, labels=[]): Initialize this instace of the StringIndexerInverse using the provided java_obj. 
""" super(StringIndexerInverse, self).__init__() - self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexerInverse", + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) self.labels = Param(self, "labels", "Optional labels to be provided by the user, if not supplied column " + From e95b61b5d7bd90f6547cc30853c92f6bf1561589 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 14 Aug 2015 14:41:55 -0700 Subject: [PATCH 09/21] Move the property on to the model, remove references to old class name --- python/pyspark/ml/feature.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 5cd8ba49ecac..d10f363c8342 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -32,6 +32,7 @@ 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] + @inherit_doc class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """ @@ -730,7 +731,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), ... key=lambda x: x[0]) [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] - >>> inverter = IndexToString("indexed", "label2", model.labels()) + >>> inverter = IndexToString(inputCol="indexed", outputCol="label2", labels=model.labels()) >>> itd = inverter.transform(td) >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]), ... key=lambda x: x[0]) @@ -756,10 +757,6 @@ def setParams(self, inputCol=None, outputCol=None): kwargs = self.setParams._input_kwargs return self._set(**kwargs) - @property - def labels(self): - return self._java_obj.labels - def _create_model(self, java_model): return StringIndexerModel(java_model) @@ -768,6 +765,10 @@ class StringIndexerModel(JavaModel): """ Model fitted by StringIndexer. 
""" + @property + def labels(self): + return self._java_obj.labels + class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): """ @@ -785,9 +786,9 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): @keyword_only def __init__(self, inputCol=None, outputCol=None, labels=[]): """ - Initialize this instace of the StringIndexerInverse using the provided java_obj. + Initialize this instace of the IndexToString using the provided java_obj. """ - super(StringIndexerInverse, self).__init__() + super(IndexToString, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) self.labels = Param(self, "labels", @@ -801,7 +802,7 @@ def __init__(self, inputCol=None, outputCol=None, labels=[]): def setParams(self, inputCol=None, outputCol=None, labels=[]): """ setParams(self, inputCol="input", outputCol="output", labels=[]) - Sets params for this StringIndexerInverse + Sets params for this IndexToString """ kwargs = self.setParams._input_kwargs return self._set(**kwargs) From b1795aaad4bf8c38a13d83a37759317910ea5211 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 18 Aug 2015 15:55:29 -0700 Subject: [PATCH 10/21] CR feedback --- python/pyspark/ml/feature.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index d10f363c8342..accea307c01d 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -770,15 +770,19 @@ def labels(self): return self._java_obj.labels +@inherit_doc class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): """ - Convert provided indexes back to strings using either the metadata on the input column - or user provided labels. - Note: By default we keep the original columns during StringIndexerModel's transformation, - so the inverse should only be used on new columns such as predicted labels. + .. 
note:: Experimental + A [[Transformer]] that maps a column of string indices back to a new column of corresponding + string values using either the ML attributes of the input column, or if provided using the + labels supplied by the user. + All original columns are kept during transformation. + @see [[StringIndexer]] for converting strings into indices """ + # a placeholder to make the labels show up in generated doc - labels = Param(Params._dummy(), "lables", + labels = Param(Params._dummy(), "labels", "Optional labels to be provided by the user, if not supplied column " + "metadata is read for labels. The default value is an empty array, " + "but the empty array is ignored and column metadata used instead.") @@ -786,7 +790,7 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): @keyword_only def __init__(self, inputCol=None, outputCol=None, labels=[]): """ - Initialize this instace of the IndexToString using the provided java_obj. + __init__(self, inputCol=None, outputCol=None, labels=[]) """ super(IndexToString, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", @@ -801,7 +805,7 @@ def __init__(self, inputCol=None, outputCol=None, labels=[]): @keyword_only def setParams(self, inputCol=None, outputCol=None, labels=[]): """ - setParams(self, inputCol="input", outputCol="output", labels=[]) + setParams(self, inputCol=None, outputCol=None, labels=[]) Sets params for this IndexToString """ kwargs = self.setParams._input_kwargs From ab90dcd948c8871b3b2ff68c60e8f5651c34b656 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 18 Aug 2015 15:56:24 -0700 Subject: [PATCH 11/21] switch link to pydoc style --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index accea307c01d..841b6ef59f2f 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -778,7 +778,7 @@ class 
IndexToString(JavaTransformer, HasInputCol, HasOutputCol): string values using either the ML attributes of the input column, or if provided using the labels supplied by the user. All original columns are kept during transformation. - @see [[StringIndexer]] for converting strings into indices + See L{StringIndexer} for converting strings into indices """ # a placeholder to make the labels show up in generated doc From c400e169e6b4b75372cef71aadd75cbb464e0b7f Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 18 Aug 2015 16:04:37 -0700 Subject: [PATCH 12/21] remove getLabels function (CR feedback) now that labels is public. --- .../scala/org/apache/spark/ml/feature/StringIndexer.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 020b1a5d27c9..24250e4c4cf9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -122,11 +122,6 @@ class StringIndexerModel ( map } - /** - * The labels used for applying this transformation - */ - private[spark] def getLabels() = labels - /** @group setParam */ def setHandleInvalid(value: String): this.type = set(handleInvalid, value) setDefault(handleInvalid, "error") From 64de5c9313e89ac21ac6f8ecad1916ec23f65dd5 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 28 Aug 2015 14:55:27 -0700 Subject: [PATCH 13/21] Some CR feedback --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index c7cebcac3efc..fd9ad635102e 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -870,14 +870,14 @@ def setParams(self, inputCol=None, outputCol=None, labels=[]): def setLabels(self, value): """ - Specify the labels to be used. 
+ Sets the value of :py:attr:`labels`. """ self._paramMap[self.labels] = value return self def getLabels(self): """ - Get the labels. + Gets the value of :py:attr:`labels` or its default value. """ return self.getOrDefault(self.labels) From 2316a90328a541cebf920e1403e6a6122ae6fbe8 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 28 Aug 2015 14:55:57 -0700 Subject: [PATCH 14/21] Use None instead of empty array --- python/pyspark/ml/feature.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index fd9ad635102e..42a6583c58bf 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -845,9 +845,9 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): "but the empty array is ignored and column metadata used instead.") @keyword_only - def __init__(self, inputCol=None, outputCol=None, labels=[]): + def __init__(self, inputCol=None, outputCol=None, labels=None): """ - __init__(self, inputCol=None, outputCol=None, labels=[]) + __init__(self, inputCol=None, outputCol=None, labels=None) """ super(IndexToString, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", @@ -860,9 +860,9 @@ def __init__(self, inputCol=None, outputCol=None, labels=[]): self.setParams(**kwargs) @keyword_only - def setParams(self, inputCol=None, outputCol=None, labels=[]): + def setParams(self, inputCol=None, outputCol=None, labels=None): """ - setParams(self, inputCol=None, outputCol=None, labels=[]) + setParams(self, inputCol=None, outputCol=None, labels=None) Sets params for this IndexToString """ kwargs = self.setParams._input_kwargs return self._set(**kwargs) From 28afcfd68f2aad6f23b6a1e14940c805489c35ff Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 31 Aug 2015 21:31:29 -0700 Subject: [PATCH 15/21] Some CR feedback (note: still sorting out one of the params) --- python/pyspark/ml/feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 6a42268151c2..bec9eabe37f6 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -945,6 +945,7 @@ def labels(self): class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): """ .. note:: Experimental + A [[Transformer]] that maps a column of string indices back to a new column of corresponding string values using either the ML attributes of the input column, or if provided using the labels supplied by the user. @@ -977,7 +978,7 @@ def __init__(self, inputCol=None, outputCol=None, labels=None): def setParams(self, inputCol=None, outputCol=None, labels=None): """ setParams(self, inputCol=None, outputCol=None, labels=None) - Sets params for this IndexToString + Sets params for this IndexToString. """ kwargs = self.setParams._input_kwargs return self._set(**kwargs) From f19445d6d59344bbcb71c2e3b48754000c4fe82c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 31 Aug 2015 23:48:25 -0700 Subject: [PATCH 16/21] Change description text --- python/pyspark/ml/feature.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index bec9eabe37f6..e37f70aa4f82 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -955,9 +955,8 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): # a placeholder to make the labels show up in generated doc labels = Param(Params._dummy(), "labels", - "Optional labels to be provided by the user, if not supplied column " + - "metadata is read for labels. 
The default value is an empty array, " + - "but the empty array is ignored and column metadata used instead.") + "Optional array of labels to be provided by the user, if not supplied or " + + "empty, column metadata is read for labels") @keyword_only def __init__(self, inputCol=None, outputCol=None, labels=None): @@ -968,9 +967,8 @@ def __init__(self, inputCol=None, outputCol=None, labels=None): self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) self.labels = Param(self, "labels", - "Optional labels to be provided by the user, if not supplied column " + - "metadata is read for labels. The default value is an empty array, " + - "but the empty array is ignored and column metadata used instead.") + "Optional array of labels to be provided by the user, if not supplied or " + + "empty, column metadata is read for labels") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) From 8fca8b3387e6a92ccafc479c3c556309fee77604 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 1 Sep 2015 11:59:54 -0700 Subject: [PATCH 17/21] punctuation --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 3a96d338fc71..b051368d3e9f 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -951,7 +951,7 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): string values using either the ML attributes of the input column, or if provided using the labels supplied by the user. All original columns are kept during transformation. - See L{StringIndexer} for converting strings into indices + See L{StringIndexer} for converting strings into indices. 
""" # a placeholder to make the labels show up in generated doc From 3ef852f7e0ca3248e03ca70c74fae707502b8a28 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 1 Sep 2015 12:43:46 -0700 Subject: [PATCH 18/21] remove unrelated change --- sql/hive/pom.xml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index c09ae1726283..be1607476e25 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -36,12 +36,6 @@ - - - org.scala-lang - scala-library - 2.10.3 - com.twitter From 41d0d27597e2fb4042ce19a2d9a793414c5958b2 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 1 Sep 2015 14:07:52 -0700 Subject: [PATCH 19/21] long line fix --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index b051368d3e9f..6420cf7efdaa 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -968,8 +968,8 @@ def __init__(self, inputCol=None, outputCol=None, labels=None): self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) self.labels = Param(self, "labels", - "Optional array of labels to be provided by the user, if not supplied or " + - "empty, column metadata is read for labels") + "Optional array of labels to be provided by the user, if not " + + "supplied or empty, column metadata is read for labels") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) From cd5d41881d1ff35ffd3352a9e8a44ff95dfb2b04 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Sep 2015 17:11:53 -0700 Subject: [PATCH 20/21] Add missing period --- .../main/scala/org/apache/spark/ml/feature/StringIndexer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 24250e4c4cf9..e0eeb636889a 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -102,7 +102,7 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod * [[StringIndexerModel.transform]] would return the input dataset unmodified. * This is a temporary fix for the case when target labels do not exist during prediction. * - * @param labels Ordered list of labels, corresponding to indices to be assigned + * @param labels Ordered list of labels, corresponding to indices to be assigned. */ @Experimental class StringIndexerModel ( From 4f56b1781ac48571dd52326783f1485dbd059530 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Sep 2015 17:12:28 -0700 Subject: [PATCH 21/21] Fix link to transformer class, copy scala doc for labels --- python/pyspark/ml/feature.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 6420cf7efdaa..2d5f1c49bb5c 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -939,6 +939,9 @@ class StringIndexerModel(JavaModel): """ @property def labels(self): + """ + Ordered list of labels, corresponding to indices to be assigned. + """ return self._java_obj.labels @@ -947,9 +950,9 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): """ .. note:: Experimental - A [[Transformer]] that maps a column of string indices back to a new column of corresponding - string values using either the ML attributes of the input column, or if provided using the - labels supplied by the user. + A :py:class:`Transformer` that maps a column of string indices back to a new column of + corresponding string values using either the ML attributes of the input column, or if + provided using the labels supplied by the user. All original columns are kept during transformation. See L{StringIndexer} for converting strings into indices. """