Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Move doc tests to tests.py.
  • Loading branch information
yanboliang committed Jul 1, 2017
commit 6f3bea8f1e4fcb9c45dc22a542c8e43801bfa5f8
13 changes: 2 additions & 11 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -2097,14 +2097,6 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
>>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]),
... key=lambda x: x[0])
[(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')]
>>> testData2 = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="d"),
... Row(id=2, label=None)], 2)
>>> dfKeep= spark.createDataFrame(testData2)
>>> modelKeep = stringIndexer.setHandleInvalid("keep").fit(stringIndDf)
>>> tdK = modelKeep.transform(dfKeep)
>>> sorted(set([(i[0], i[1]) for i in tdK.select(tdK.id, tdK.indexed).collect()]),
... key=lambda x: x[0])
[(0, 0.0), (1, 3.0), (2, 3.0)]
>>> stringIndexerPath = temp_path + "/string-indexer"
>>> stringIndexer.save(stringIndexerPath)
>>> loadedIndexer = StringIndexer.load(stringIndexerPath)
Expand Down Expand Up @@ -2133,7 +2125,6 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
.. versionadded:: 1.4.0
"""


stringOrderType = Param(Params._dummy(), "stringOrderType",
"How to order labels of string column. The first label after " +
"ordering is assigned an index of 0. Supported options: " +
Expand Down Expand Up @@ -2188,14 +2179,14 @@ def getStringOrderType(self):
"""
return self.getOrDefault(self.stringOrderType)

# Single version tag: this block previously stacked @since("2.2.0") and
# @since("2.3.0"); only the newer one is kept so the method is not
# annotated with two conflicting versions.
@since("2.3.0")
def setHandleInvalid(self, value):
    """
    Sets the value of :py:attr:`handleInvalid`.

    :param value: new setting, forwarded to ``self._set`` as the
                  ``handleInvalid`` keyword argument.
    :return: whatever ``self._set`` returns.
    """
    return self._set(handleInvalid=value)

@since("2.2.0")
@since("2.3.0")
def getHandleInvalid(self):
"""
Gets the value of :py:attr:`handleInvalid` or its default value.
Expand Down
21 changes: 21 additions & 0 deletions python/pyspark/ml/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,27 @@ def test_rformula_string_indexer_order_type(self):
for i in range(0, len(expected)):
self.assertTrue(all(observed[i]["features"].toArray() == expected[i]))

def test_string_indexer_handle_invalid(self):
    """
    Check the ``indexed`` column produced by StringIndexer for the
    ``handleInvalid`` settings "keep" and "skip" on a frame whose
    ``label`` column contains a null.
    """
    frame = self.spark.createDataFrame(
        [(0, "a"), (1, "d"), (2, None)],
        ["id", "label"])

    # "keep": all three rows are expected back, the null-labelled row
    # carrying index 2.0 after "a" -> 0.0 and "d" -> 1.0.
    indexer = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                            stringOrderType="alphabetAsc")
    kept_rows = indexer.fit(frame).transform(frame).select("id", "indexed").collect()
    self.assertEqual(
        kept_rows,
        [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)])

    # "skip": the setting is changed through the setter on the existing
    # estimator (presumably the setter hands back that same estimator --
    # TODO confirm); the null-labelled row is expected to be absent.
    skip_indexer = indexer.setHandleInvalid("skip")
    skipped_rows = skip_indexer.fit(frame).transform(frame).select("id", "indexed").collect()
    self.assertEqual(
        skipped_rows,
        [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)])


class HasInducedError(Params):

Expand Down