Skip to content

Commit 3ead289

Browse files
author
Nick Pentreath
committed
Add Python API for FeatureHasher
1 parent 4ebd41e commit 3ead289

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed

python/pyspark/ml/feature.py

Lines changed: 77 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@
3434
'CountVectorizer', 'CountVectorizerModel',
3535
'DCT',
3636
'ElementwiseProduct',
37+
'FeatureHasher',
3738
'HashingTF',
3839
'IDF', 'IDFModel',
3940
'Imputer', 'ImputerModel',
@@ -696,6 +697,82 @@ def getScalingVec(self):
696697
return self.getOrDefault(self.scalingVec)
697698

698699

700+
@inherit_doc
701+
class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable,
702+
JavaMLWritable):
703+
"""
704+
.. note:: Experimental
705+
706+
Feature hashing projects a set of categorical or numerical features into a feature vector of
707+
specified dimension (typically substantially smaller than that of the original feature
708+
space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)
709+
to map features to indices in the feature vector.
710+
711+
The FeatureHasher transformer operates on multiple columns. Each column may contain either
712+
numeric or categorical features. Behavior and handling of column data types are as follows:
713+
714+
* Numeric columns:
715+
For numeric features, the hash value of the column name is used to map the
716+
feature value to its index in the feature vector. Numeric features are never
717+
treated as categorical, even when they are integers. You must explicitly
718+
convert numeric columns containing categorical features to strings first.
719+
720+
* String columns:
721+
For categorical features, the hash value of the string "column_name=value"
722+
is used to map to the vector index, with an indicator value of `1.0`.
723+
Thus, categorical features are "one-hot" encoded
724+
(similarly to using :py:class:`OneHotEncoder` with `dropLast=False`).
725+
726+
* Boolean columns:
727+
Boolean values are treated in the same way as string columns. That is,
728+
boolean features are represented as "column_name=true" or "column_name=false",
729+
with an indicator value of `1.0`.
730+
731+
Null (missing) values are ignored (implicitly zero in the resulting feature vector).
732+
733+
Since a simple modulo on the hashed value is used to determine the vector index,
734+
it is advisable to use a power of two as the `numFeatures` parameter;
735+
otherwise the features will not be mapped evenly to the vector indices.
736+
737+
>>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
738+
>>> cols = ["real", "bool", "stringNum", "string"]
739+
>>> df = spark.createDataFrame(data, cols)
740+
>>> hasher = FeatureHasher(inputCols=cols, outputCol="features")
741+
>>> hasher.transform(df).head().features
742+
SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0})
743+
>>> hasherPath = temp_path + "/hasher"
744+
>>> hasher.save(hasherPath)
745+
>>> loadedHasher = FeatureHasher.load(hasherPath)
746+
>>> loadedHasher.getNumFeatures() == hasher.getNumFeatures()
747+
True
748+
>>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features
749+
True
750+
751+
.. versionadded:: 2.3.0
752+
"""
753+
754+
@keyword_only
755+
def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
756+
"""
757+
__init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
758+
"""
759+
super(FeatureHasher, self).__init__()
760+
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid)
761+
self._setDefault(numFeatures=1 << 18)
762+
kwargs = self._input_kwargs
763+
self.setParams(**kwargs)
764+
765+
@keyword_only
766+
@since("2.3.0")
767+
def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
768+
"""
769+
setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
770+
Sets params for this FeatureHasher.
771+
"""
772+
kwargs = self._input_kwargs
773+
return self._set(**kwargs)
774+
775+
699776
@inherit_doc
700777
class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
701778
JavaMLWritable):

0 commit comments

Comments
 (0)