Move the StringIndexer handleInvalid doc tests to tests.py and update the handleInvalid @since annotations to 2.3.0.

apache · yanboliang · Mar 10, 2017 · Mar 10, 2017 · Mar 10, 2017 · Mar 10, 2017
commit 6f3bea8f1e4fcb9c45dc22a542c8e43801bfa5f8
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
@@ -2097,14 +2097,6 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
     >>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]),
     ...     key=lambda x: x[0])
     [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')]
-    >>> testData2 = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="d"),
-    ...     Row(id=2, label=None)], 2)
-    >>> dfKeep= spark.createDataFrame(testData2)
-    >>> modelKeep = stringIndexer.setHandleInvalid("keep").fit(stringIndDf)
-    >>> tdK = modelKeep.transform(dfKeep)
-    >>> sorted(set([(i[0], i[1]) for i in tdK.select(tdK.id, tdK.indexed).collect()]),
-    ...     key=lambda x: x[0])
-    [(0, 0.0), (1, 3.0), (2, 3.0)]
     >>> stringIndexerPath = temp_path + "/string-indexer"
     >>> stringIndexer.save(stringIndexerPath)
     >>> loadedIndexer = StringIndexer.load(stringIndexerPath)
@@ -2133,7 +2125,6 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
     .. versionadded:: 1.4.0
     """
 
-
     stringOrderType = Param(Params._dummy(), "stringOrderType",
                             "How to order labels of string column. The first label after " +
                             "ordering is assigned an index of 0. Supported options: " +
@@ -2188,14 +2179,14 @@ def getStringOrderType(self):
         """
         return self.getOrDefault(self.stringOrderType)
 
-    @since("2.2.0")
+    @since("2.3.0")
     def setHandleInvalid(self, value):
         """
         Sets the value of :py:attr:`handleInvalid`.
         """
         return self._set(handleInvalid=value)
 
-    @since("2.2.0")
+    @since("2.3.0")
     def getHandleInvalid(self):
         """
         Gets the value of :py:attr:`handleInvalid` or its default value.

diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
@@ -551,6 +551,27 @@ def test_rformula_string_indexer_order_type(self):
         for i in range(0, len(expected)):
             self.assertTrue(all(observed[i]["features"].toArray() == expected[i]))
 
+    def test_string_indexer_handle_invalid(self):
+        df = self.spark.createDataFrame([
+            (0, "a"),
+            (1, "d"),
+            (2, None)], ["id", "label"])
+
+        si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
+                            stringOrderType="alphabetAsc")
+        model1 = si1.fit(df)
+        td1 = model1.transform(df)
+        actual1 = td1.select("id", "indexed").collect()
+        expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
+        self.assertEqual(actual1, expected1)
+
+        si2 = si1.setHandleInvalid("skip")
+        model2 = si2.fit(df)
+        td2 = model2.transform(df)
+        actual2 = td2.select("id", "indexed").collect()
+        expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
+        self.assertEqual(actual2, expected2)
+
 
 class HasInducedError(Params):