[SPARK-21468][PYSPARK][ML] Python API for FeatureHasher

Nick Pentreath · Nick Pentreath · commit 988b84d7ed43 · 2017-08-21T14:35:38.000+02:00
Add Python API for `FeatureHasher` transformer. ## How was this patch tested? New doc test. Author: Nick Pentreath <nickp@za.ibm.com> Closes #18970 from MLnick/SPARK-21468-pyspark-hasher.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
@@ -64,17 +64,17 @@ import org.apache.spark.util.collection.OpenHashMap
  *   ).toDF("real", "bool", "stringNum", "string")
  *
  *   val hasher = new FeatureHasher()
- *    .setInputCols("real", "bool", "stringNum", "num")
+ *    .setInputCols("real", "bool", "stringNum", "string")
  *    .setOutputCol("features")
  *
- *   hasher.transform(df).show()
+ *   hasher.transform(df).show(false)
  *
- *   +----+-----+---------+------+--------------------+
- *   |real| bool|stringNum|string|            features|
- *   +----+-----+---------+------+--------------------+
- *   | 2.0| true|        1|   foo|(262144,[51871,63...|
- *   | 3.0|false|        2|   bar|(262144,[6031,806...|
- *   +----+-----+---------+------+--------------------+
+ *   +----+-----+---------+------+------------------------------------------------------+
+ *   |real|bool |stringNum|string|features                                              |
+ *   +----+-----+---------+------+------------------------------------------------------+
+ *   |2.0 |true |1        |foo   |(262144,[51871,63643,174475,253195],[1.0,1.0,2.0,1.0])|
+ *   |3.0 |false|2        |bar   |(262144,[6031,80619,140467,174475],[1.0,1.0,1.0,3.0]) |
+ *   +----+-----+---------+------+------------------------------------------------------+
  * }}}
  */
 @Experimental
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
@@ -34,6 +34,7 @@
            'CountVectorizer', 'CountVectorizerModel',
            'DCT',
            'ElementwiseProduct',
+           'FeatureHasher',
            'HashingTF',
            'IDF', 'IDFModel',
            'Imputer', 'ImputerModel',
@@ -696,6 +697,82 @@ def getScalingVec(self):
         return self.getOrDefault(self.scalingVec)
 
 
+@inherit_doc
+class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable,
+                    JavaMLWritable):
+    """
+    .. note:: Experimental
+
+    Feature hashing projects a set of categorical or numerical features into a feature vector of
+    specified dimension (typically substantially smaller than that of the original feature
+    space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)
+    to map features to indices in the feature vector.
+
+    The FeatureHasher transformer operates on multiple columns. Each column may contain either
+    numeric or categorical features. Behavior and handling of column data types is as follows:
+
+    * Numeric columns:
+        For numeric features, the hash value of the column name is used to map the
+        feature value to its index in the feature vector. Numeric features are never
+        treated as categorical, even when they are integers. You must explicitly
+        convert numeric columns containing categorical features to strings first.
+
+    * String columns:
+        For categorical features, the hash value of the string "column_name=value"
+        is used to map to the vector index, with an indicator value of `1.0`.
+        Thus, categorical features are "one-hot" encoded
+        (similarly to using :py:class:`OneHotEncoder` with `dropLast=false`).
+
+    * Boolean columns:
+        Boolean values are treated in the same way as string columns. That is,
+        boolean features are represented as "column_name=true" or "column_name=false",
+        with an indicator value of `1.0`.
+
+    Null (missing) values are ignored (implicitly zero in the resulting feature vector).
+
+    Since a simple modulo is used to transform the hash function to a vector index,
+    it is advisable to use a power of two as the `numFeatures` parameter;
+    otherwise the features will not be mapped evenly to the vector indices.
+
+    >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
+    >>> cols = ["real", "bool", "stringNum", "string"]
+    >>> df = spark.createDataFrame(data, cols)
+    >>> hasher = FeatureHasher(inputCols=cols, outputCol="features")
+    >>> hasher.transform(df).head().features
+    SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0})
+    >>> hasherPath = temp_path + "/hasher"
+    >>> hasher.save(hasherPath)
+    >>> loadedHasher = FeatureHasher.load(hasherPath)
+    >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures()
+    True
+    >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features
+    True
+
+    .. versionadded:: 2.3.0
+    """
+
+    @keyword_only
+    def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+        """
+        __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        """
+        super(FeatureHasher, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid)
+        self._setDefault(numFeatures=1 << 18)
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("2.3.0")
+    def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+        """
+        setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        Sets params for this FeatureHasher.
+        """
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+
 @inherit_doc
 class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
                 JavaMLWritable):