Closed
Changes from 1 commit
Commits
25 commits
1dc4579
SPARK-9654 Add string indexer inverse in PySpark
holdenk Aug 5, 2015
0445fcc
doc fix
holdenk Aug 5, 2015
af2f869
Don't change the base class init, fill out the doctest for the invert.
holdenk Aug 6, 2015
510bce5
remove extra blank line
holdenk Aug 6, 2015
c6da160
get rid of unicode specifiers in doctest
holdenk Aug 6, 2015
9f5af3a
Deal with the difference between 2.X and 3.X with the output by just …
holdenk Aug 6, 2015
7b3b5ca
Use the standard constructor method for the StringIndexInverse
holdenk Aug 12, 2015
244e083
Update for index to string changeover
holdenk Aug 14, 2015
e95b61b
Move the property on to the model, remove references to old class name
holdenk Aug 14, 2015
b1795aa
CR feedback
holdenk Aug 18, 2015
ab90dcd
switch link to pydoc style
holdenk Aug 18, 2015
43ae197
Merge in master
holdenk Aug 18, 2015
c400e16
remove getLabels function (CR feedback) now that labels is public.
holdenk Aug 18, 2015
64de5c9
Some CR feedback
holdenk Aug 28, 2015
2316a90
Use None instead of empty array
holdenk Aug 28, 2015
15390bb
merge in master
holdenk Sep 1, 2015
28afcfd
Some CR feedback (note: still sorting out one of the params)
holdenk Sep 1, 2015
f19445d
Change description text
holdenk Sep 1, 2015
51ae7ee
merge in master
holdenk Sep 1, 2015
ed0ca91
moar merge
holdenk Sep 1, 2015
8fca8b3
punctuation
holdenk Sep 1, 2015
3ef852f
remove unrelated change
holdenk Sep 1, 2015
41d0d27
long line fix
holdenk Sep 1, 2015
cd5d418
Add missing period
holdenk Sep 9, 2015
4f56b17
Fix link to transformer class, copy scala doc for labels
holdenk Sep 9, 2015
Update for index to string changeover
holdenk committed Aug 14, 2015
commit 244e0833a2c62f0b5d7deecb723d7eaa8b90f79b
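For orientation before the diff: this commit tracks the Scala-side rename of the inverse transformer to IndexToString, so instead of asking the fitted StringIndexerModel for an inverter, callers construct the transformer directly and hand it the fitted labels. Below is a minimal end-to-end sketch of that flow, not taken from the patch: it reuses the toy data from the StringIndexer doctest and assumes a Spark 1.x-style SQLContext, keyword-argument constructors, and labels exposed as a property on the fitted model (which is where later commits in this PR move it; the doctest in this particular commit still calls model.labels()).

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer, IndexToString

sc = SparkContext(appName="IndexToStringSketch")
sqlContext = SQLContext(sc)

# Same toy frame as the StringIndexer doctest: a string label column to index.
df = sqlContext.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "label"])

model = StringIndexer(inputCol="label", outputCol="indexed").fit(df)
td = model.transform(df)

# Before this commit the inverse came from the fitted model:
#     inverter = model.invert("indexed", "label2")
# After it, IndexToString is constructed directly with the fitted labels
# (assumed here to be a property on the model, per the later commits).
inverter = IndexToString(inputCol="indexed", outputCol="label2",
                         labels=model.labels)
inverter.transform(td).select("id", "indexed", "label2").show()

sc.stop()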
30 changes: 11 additions & 19 deletions python/pyspark/ml/feature.py
@@ -26,13 +26,12 @@
from pyspark.mllib.common import inherit_doc
from pyspark.mllib.linalg import _convert_to_vector

__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer',
'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler',
__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'NGram',
'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler',
'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA',
'PCAModel', 'RFormula', 'RFormulaModel']


@inherit_doc
class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
"""
@@ -731,7 +730,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol):
>>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]),
... key=lambda x: x[0])
[(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)]
>>> inverter = model.invert("indexed", "label2")
>>> inverter = IndexToString("indexed", "label2", model.labels())
>>> itd = inverter.transform(td)
>>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]),
... key=lambda x: x[0])
@@ -757,6 +756,10 @@ def setParams(self, inputCol=None, outputCol=None):
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)

@property
def labels(self):
return self._java_obj.labels

def _create_model(self, java_model):
return StringIndexerModel(java_model)

@@ -766,21 +769,10 @@ class StringIndexerModel(JavaModel):
Model fitted by StringIndexer.
"""

def invert(self, inputCol, outputCol):
"""
Return a model to perform the inverse transformation.
Note: By default we keep the original columns during this transformation, so the inverse
should only be used on new columns such as predicted labels.
"""
labels = self._java_obj.getLabels()
return StringIndexerInverse(inputCol=inputCol, outputCol=outputCol,
labels=labels)


class StringIndexerInverse(JavaTransformer, HasInputCol, HasOutputCol):
class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):

Review comment (Member): use inherit_doc tag

"""
Transform a provided column back to the original input types using the metadata on
the input column.
Convert provided indexes back to strings using either the metadata on the input column
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please copy updated Scala doc here.

Also, please mark as Experimental (as in, e.g., RFormula)

or user provided labels.
Note: By default we keep the original columns during StringIndexerModel's transformation,
so the inverse should only be used on new columns such as predicted labels.
"""
@@ -796,7 +788,7 @@ def __init__(self, inputCol=None, outputCol=None, labels=[]):
Initialize this instace of the StringIndexerInverse using the provided java_obj.
"""
super(StringIndexerInverse, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexerInverse",
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString",
self.uid)
self.labels = Param(self, "labels",
"Optional labels to be provided by the user, if not supplied column " +

Review comment (Contributor): Same comment as L957

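The two review comments above ask for the @inherit_doc tag on the new class and for its docstring to follow the updated Scala doc with an Experimental marker, as RFormula does. A rough sketch of that shape follows; the docstring wording is paraphrased from the Scala-side IndexToString doc rather than copied from this diff, the Experimental note follows the existing PySpark convention the reviewer points to, and the constructor, labels Param, and setParams are elided because they stay as shown in the diff above.

from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.wrapper import JavaTransformer
from pyspark.mllib.common import inherit_doc


@inherit_doc
class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):
    """
    .. note:: Experimental

    A :py:class:`Transformer` that maps a column of indices back to a new column
    of corresponding string values, using either the ML attributes of the input
    column or user-supplied labels.

    Note: By default we keep the original columns during StringIndexerModel's
    transformation, so the inverse should only be used on new columns such as
    predicted labels.
    """
    # Constructor, the `labels` Param, and setParams as in the diff above
    # (elided); only the decorator and docstring shape are illustrated here.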