|
34 | 34 | 'CountVectorizer', 'CountVectorizerModel', |
35 | 35 | 'DCT', |
36 | 36 | 'ElementwiseProduct', |
| 37 | + 'FeatureHasher', |
37 | 38 | 'HashingTF', |
38 | 39 | 'IDF', 'IDFModel', |
39 | 40 | 'Imputer', 'ImputerModel', |
@@ -696,6 +697,82 @@ def getScalingVec(self): |
696 | 697 | return self.getOrDefault(self.scalingVec) |
697 | 698 |
|
698 | 699 |
|
| 700 | +@inherit_doc |
| 701 | +class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable, |
| 702 | + JavaMLWritable): |
| 703 | + """ |
| 704 | + .. note:: Experimental |
| 705 | +
|
| 706 | + Feature hashing projects a set of categorical or numerical features into a feature vector of |
| 707 | + specified dimension (typically substantially smaller than that of the original feature |
| 708 | + space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing) |
| 709 | + to map features to indices in the feature vector. |
| 710 | +
|
| 711 | + The FeatureHasher transformer operates on multiple columns. Each column may contain either |
| 712 | + numeric or categorical features. Behavior and handling of column data types are as follows: |
| 713 | +
|
| 714 | + * Numeric columns: |
| 715 | + For numeric features, the hash value of the column name is used to map the |
| 716 | + feature value to its index in the feature vector. Numeric features are never |
| 717 | + treated as categorical, even when they are integers. You must explicitly |
| 718 | + convert numeric columns containing categorical features to strings first. |
| 719 | +
|
| 720 | + * String columns: |
| 721 | + For categorical features, the hash value of the string "column_name=value" |
| 722 | + is used to map to the vector index, with an indicator value of `1.0`. |
| 723 | + Thus, categorical features are "one-hot" encoded |
| 724 | + (similarly to using :py:class:`OneHotEncoder` with `dropLast=False`). |
| 725 | +
|
| 726 | + * Boolean columns: |
| 727 | + Boolean values are treated in the same way as string columns. That is, |
| 728 | + boolean features are represented as "column_name=true" or "column_name=false", |
| 729 | + with an indicator value of `1.0`. |
| 730 | +
|
| 731 | + Null (missing) values are ignored (implicitly zero in the resulting feature vector). |
| 732 | +
|
| 733 | + Since a simple modulo on the hashed value is used to determine the vector index, |
| 734 | + it is advisable to use a power of two as the `numFeatures` parameter; |
| 735 | + otherwise the features will not be mapped evenly to the vector indices. |
| 736 | +
|
| 737 | + >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")] |
| 738 | + >>> cols = ["real", "bool", "stringNum", "string"] |
| 739 | + >>> df = spark.createDataFrame(data, cols) |
| 740 | + >>> hasher = FeatureHasher(inputCols=cols, outputCol="features") |
| 741 | + >>> hasher.transform(df).head().features |
| 742 | + SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0}) |
| 743 | + >>> hasherPath = temp_path + "/hasher" |
| 744 | + >>> hasher.save(hasherPath) |
| 745 | + >>> loadedHasher = FeatureHasher.load(hasherPath) |
| 746 | + >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures() |
| 747 | + True |
| 748 | + >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features |
| 749 | + True |
| 750 | +
|
| 751 | + .. versionadded:: 2.3.0 |
| 752 | + """ |
| 753 | + |
| 754 | + @keyword_only |
| 755 | + def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None): |
| 756 | + """ |
| 757 | + __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None) |
| 758 | + """ |
| 759 | + super(FeatureHasher, self).__init__() |
| 760 | + self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid) |
| 761 | + self._setDefault(numFeatures=1 << 18) |
| 762 | + kwargs = self._input_kwargs |
| 763 | + self.setParams(**kwargs) |
| 764 | + |
| 765 | + @keyword_only |
| 766 | + @since("2.3.0") |
| 767 | + def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None): |
| 768 | + """ |
| 769 | + setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None) |
| 770 | + Sets params for this FeatureHasher. |
| 771 | + """ |
| 772 | + kwargs = self._input_kwargs |
| 773 | + return self._set(**kwargs) |
| 774 | + |
| 775 | + |
699 | 776 | @inherit_doc |
700 | 777 | class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable, |
701 | 778 | JavaMLWritable): |
|
0 commit comments