-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-9654][ML][PYSPARK] Add IndexToString to PySpark #7976
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
1dc4579
0445fcc
af2f869
510bce5
c6da160
9f5af3a
7b3b5ca
244e083
e95b61b
b1795aa
ab90dcd
43ae197
c400e16
64de5c9
2316a90
15390bb
28afcfd
f19445d
51ae7ee
ed0ca91
8fca8b3
3ef852f
41d0d27
cd5d418
4f56b17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -731,7 +731,8 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): | |
| >>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), | ||
| ... key=lambda x: x[0]) | ||
| [(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] | ||
| >>> itd = model.invert("indexed", "label2").transform(td) | ||
| >>> inverter = model.invert("indexed", "label2") | ||
| >>> itd = inverter.transform(td) | ||
| >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]), | ||
| ... key=lambda x: x[0]) | ||
| [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')] | ||
|
|
@@ -771,22 +772,60 @@ def invert(self, inputCol, outputCol): | |
| Note: By default we keep the original columns during this transformation, so the inverse | ||
| should only be used on new columns such as predicted labels. | ||
| """ | ||
| return StringIndexerInverse(self._java_obj.invert(inputCol, outputCol)) | ||
| labels = self._java_obj.getLabels() | ||
| return StringIndexerInverse(inputCol=inputCol, outputCol=outputCol, | ||
| labels=labels) | ||
|
|
||
|
|
||
| class StringIndexerInverse(JavaTransformer): | ||
| class StringIndexerInverse(JavaTransformer, HasInputCol, HasOutputCol): | ||
| """ | ||
| Transform a provided column back to the original input types using the metadata on | ||
| the input column. | ||
| Note: By default we keep the original columns during StringIndexerModel's transformation, | ||
| so the inverse should only be used on new columns such as predicted labels. | ||
| """ | ||
| # a placeholder to make the labels show up in generated doc | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. insert newline above |
||
| labels = Param(Params._dummy(), "lables", | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. typo: "lables" |
||
| "Optional labels to be provided by the user, if not supplied column " + | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: "if not supplied" -> "if equal to the empty array then"
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That makes less sense, if it isn't supplied then it uses the column metadata. |
||
| "metadata is read for labels. The default value is an empty array, " + | ||
| "but the empty array is ignored and column metadata used instead.") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After the above nit, this becomes redundant IMO. Since this is a matter of taste, feel free to keep or cut |
||
|
|
||
| def __init__(self, java_obj): | ||
| @keyword_only | ||
| def __init__(self, inputCol=None, outputCol=None, labels=[]): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should avoid using mutable values
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My concern is the underlying Scala code uses an empty array as the default. |
||
| """ | ||
| Initialize this instace of the StringIndexerInverse using the provided java_obj. | ||
| """ | ||
| self._java_obj = java_obj | ||
| super(StringIndexerInverse, self).__init__() | ||
| self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexerInverse", | ||
| self.uid) | ||
| self.labels = Param(self, "labels", | ||
| "Optional labels to be provided by the user, if not supplied column " + | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment as L957 |
||
| "metadata is read for labels. The default value is an empty array, " + | ||
| "but the empty array is ignored and column metadata used instead.") | ||
| kwargs = self.__init__._input_kwargs | ||
| self.setParams(**kwargs) | ||
|
|
||
| @keyword_only | ||
| def setParams(self, inputCol=None, outputCol=None, labels=[]): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. same here, using
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My concern is the underlying Scala code uses an empty array as the default. |
||
| """ | ||
| setParams(self, inputCol="input", outputCol="output", labels=[]) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. correct col defaults: None |
||
| Sets params for this StringIndexerInverse | ||
| """ | ||
| kwargs = self.setParams._input_kwargs | ||
| return self._set(**kwargs) | ||
|
|
||
| def setLabels(self, value): | ||
| """ | ||
| Specify the labels to be used. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| """ | ||
| self._paramMap[self.labels] = value | ||
| return self | ||
|
|
||
| def getLabels(self): | ||
| """ | ||
| Get the labels. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| """ | ||
| return self.getOrDefault(self.labels) | ||
|
|
||
|
|
||
| @inherit_doc | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no longer needed since "label" is a public val