@@ -64,17 +64,17 @@ import org.apache.spark.util.collection.OpenHashMap
  *   ).toDF("real", "bool", "stringNum", "string")
  *
  *   val hasher = new FeatureHasher()
- *     .setInputCols("real", "bool", "stringNum", "num")
+ *     .setInputCols("real", "bool", "stringNum", "string")
  *     .setOutputCol("features")
  *
- *   hasher.transform(df).show()
+ *   hasher.transform(df).show(false)
  *
- *   +----+-----+---------+------+--------------------+
- *   |real| bool|stringNum|string|            features|
- *   +----+-----+---------+------+--------------------+
- *   | 2.0| true|        1|   foo|(262144,[51871,63...|
- *   | 3.0|false|        2|   bar|(262144,[6031,806...|
- *   +----+-----+---------+------+--------------------+
+ *   +----+-----+---------+------+------------------------------------------------------+
+ *   |real|bool |stringNum|string|features                                              |
+ *   +----+-----+---------+------+------------------------------------------------------+
+ *   |2.0 |true |1        |foo   |(262144,[51871,63643,174475,253195],[1.0,1.0,2.0,1.0])|
+ *   |3.0 |false|2        |bar   |(262144,[6031,80619,140467,174475],[1.0,1.0,1.0,3.0]) |
+ *   +----+-----+---------+------+------------------------------------------------------+
  * }}}
  */
 @Experimental
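For intuition, the mapping behind the example output above can be sketched in a few lines of plain Python. This is a minimal illustration, not Spark's implementation: Spark hashes with MurmurHash3, while zlib.crc32 stands in here, so the indices will not match the ones shown above. The weight logic (raw value for numeric columns, indicator 1.0 for string/boolean columns, values summed on collision) follows the behavior documented in the Python docstring below.

import zlib

def hash_features(row, num_features=1 << 18):
    # Illustrative stand-in for the FeatureHasher logic; `row` maps
    # column name -> value. Spark uses MurmurHash3, not crc32.
    vec = {}
    for name, value in row.items():
        if value is None:
            continue  # nulls are ignored (implicitly zero)
        if isinstance(value, (bool, str)):
            # categorical: hash "column_name=value", indicator 1.0
            key, weight = "%s=%s" % (name, str(value).lower()), 1.0
        else:
            # numeric: hash the column name, keep the raw value
            key, weight = name, float(value)
        idx = zlib.crc32(key.encode("utf-8")) % num_features  # simple modulo
        vec[idx] = vec.get(idx, 0.0) + weight  # accumulate on collision
    return vec

print(hash_features({"real": 2.0, "bool": True, "stringNum": "1", "string": "foo"}))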
python/pyspark/ml/feature.py (77 additions, 0 deletions)

@@ -34,6 +34,7 @@
            'CountVectorizer', 'CountVectorizerModel',
            'DCT',
            'ElementwiseProduct',
+           'FeatureHasher',
            'HashingTF',
            'IDF', 'IDFModel',
            'Imputer', 'ImputerModel',
@@ -696,6 +697,82 @@ def getScalingVec(self):
         return self.getOrDefault(self.scalingVec)
 
 
+@inherit_doc
+class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable,
+                    JavaMLWritable):
+    """
+    .. note:: Experimental
+
+    Feature hashing projects a set of categorical or numerical features into a feature vector of
+    specified dimension (typically substantially smaller than that of the original feature
+    space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)
+    to map features to indices in the feature vector.
+
+    The FeatureHasher transformer operates on multiple columns. Each column may contain either
+    numeric or categorical features. Behavior and handling of column data types is as follows:
+
+    * Numeric columns:
+        For numeric features, the hash value of the column name is used to map the
+        feature value to its index in the feature vector. Numeric features are never
+        treated as categorical, even when they are integers. You must explicitly
+        convert numeric columns containing categorical features to strings first.
+
+    * String columns:
+        For categorical features, the hash value of the string "column_name=value"
+        is used to map to the vector index, with an indicator value of `1.0`.
+        Thus, categorical features are "one-hot" encoded
+        (similarly to using :py:class:`OneHotEncoder` with `dropLast=False`).
+
+    * Boolean columns:
+        Boolean values are treated in the same way as string columns. That is,
+        boolean features are represented as "column_name=true" or "column_name=false",
+        with an indicator value of `1.0`.
+
+    Null (missing) values are ignored (implicitly zero in the resulting feature vector).
+
+    Since a simple modulo is used to map the hash code to a vector index,
+    it is advisable to use a power of two as the `numFeatures` parameter;
+    otherwise the features will not be mapped evenly to the vector indices.
+
+    >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
+    >>> cols = ["real", "bool", "stringNum", "string"]
+    >>> df = spark.createDataFrame(data, cols)
+    >>> hasher = FeatureHasher(inputCols=cols, outputCol="features")
+    >>> hasher.transform(df).head().features
+    SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0})
+    >>> hasherPath = temp_path + "/hasher"
+    >>> hasher.save(hasherPath)
+    >>> loadedHasher = FeatureHasher.load(hasherPath)
+    >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures()
+    True
+    >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features
+    True
+
+    .. versionadded:: 2.3.0
+    """
+
+    @keyword_only
+    def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+        """
+        __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        """
+        super(FeatureHasher, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid)
+        self._setDefault(numFeatures=1 << 18)
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("2.3.0")
+    def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+        """
+        setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        Sets params for this FeatureHasher.
+        """
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
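
The numeric-columns caveat in the docstring above matters in practice: an integer id column is hashed as a single numeric feature unless it is cast to string first. A minimal usage sketch, assuming an active SparkSession bound to `spark` (the `categoryId` column name is hypothetical):

from pyspark.sql.functions import col
from pyspark.ml.feature import FeatureHasher

df = spark.createDataFrame([(2.0, 1, "foo"), (3.0, 2, "bar")],
                           ["real", "categoryId", "string"])
# Cast the categorical integer column to string: it is then hashed as
# "categoryId=1" / "categoryId=2" with indicator 1.0, instead of as a
# single numeric feature whose value is the id itself.
df = df.withColumn("categoryId", col("categoryId").cast("string"))

hasher = FeatureHasher(inputCols=["real", "categoryId", "string"], outputCol="features")
hasher.transform(df).show(truncate=False)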


Review comment (Member): Should there be a getNumFeatures() method to return the param?

Reply (Member): Nevermind, I forgot it's in the shared param HasNumFeatures.
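
As the reply notes, the getter is inherited from the shared HasNumFeatures param mixin, so no separate method is needed. A quick check in a hypothetical session:

hasher = FeatureHasher(inputCols=["real"], outputCol="features")
hasher.getNumFeatures()  # 262144, i.e. the default 1 << 18, via HasNumFeatures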

 @inherit_doc
 class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
                 JavaMLWritable):