Additional Feature Selection operators #50
Changes from 1 commit
Commits: 97b88d7, c751bbe, 7afba07, fa2178f, 55995ba, 5b02770
Commit message: …checks, cleaning up
```diff
@@ -412,11 +412,11 @@ def export(self, output_file_name):
 from sklearn.cross_validation import StratifiedShuffleSplit
 '''
 if '_dt_feature_selection' in operators_used: pipeline_text += 'from itertools import combinations\n'
-if '_variance_threshold' in operators_used: pipeline_text += 'from sklearn.feature_selection import VarianceThreshold'
-if '_select_kbest' in operators_used: pipeline_text += 'from sklearn.feature_selection import SelectKBest'
-if '_select_percentile' in operators_used: pipeline_text += 'from sklearn.feature_selection import SelectPercentile'
-if '_select_percentile' or '_select_kbest' in operators_used: pipeline_text += 'from sklearn.feature_selection import chi2'
-if '_rfe' in operators_used: pipeline_text += 'from sklearn.feature_selection import RFE'
+if '_variance_threshold' in operators_used: pipeline_text += 'from sklearn.feature_selection import VarianceThreshold\n'
+if '_select_kbest' in operators_used: pipeline_text += 'from sklearn.feature_selection import SelectKBest\n'
+if '_select_percentile' in operators_used: pipeline_text += 'from sklearn.feature_selection import SelectPercentile\n'
+if '_select_percentile' or '_select_kbest' in operators_used: pipeline_text += 'from sklearn.feature_selection import chi2\n'
+if '_rfe' in operators_used: pipeline_text += 'from sklearn.feature_selection import RFE\n'
 if 'decision_tree' in operators_used: pipeline_text += 'from sklearn.tree import DecisionTreeClassifier\n'
 if 'random_forest' in operators_used: pipeline_text += 'from sklearn.ensemble import RandomForestClassifier\n'
 if 'logistic_regression' in operators_used: pipeline_text += 'from sklearn.linear_model import LogisticRegression\n'
```
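One side observation on this hunk (present in both the old and new lines): the condition `'_select_percentile' or '_select_kbest' in operators_used` is always truthy, because the non-empty string literal short-circuits the `or` before the `in` test runs, so the `chi2` import is emitted even when neither selector is used. A minimal sketch of an explicit membership check, with a hypothetical `operators_used` value:

```python
# Illustration only, not TPOT code: check both operator names explicitly.
operators_used = ['_variance_threshold']   # hypothetical operator list
pipeline_text = ''

if any(op in operators_used for op in ('_select_percentile', '_select_kbest')):
    pipeline_text += 'from sklearn.feature_selection import chi2\n'

print(repr(pipeline_text))   # '' -- no spurious chi2 import for this operator list
```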
```diff
@@ -593,39 +593,77 @@ def export(self, output_file_name):
 training_features = {0}.loc[training_indeces].drop('class', axis=1)
 
 selector = VarianceThreshold(threshold={1})
-selector.fit(training_features.values)
-mask = selector.get_support(True)
-{2} = {0}[mask + ['class']]
+try:
+    selector.fit(training_features.values)
+except ValueError:
+    {2} = {0}[['guess', 'class', 'group']]
```
Review comment: There is no …
```diff
+try:
+    mask = selector.get_support(True)
+    mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
```
Review comment: There is no …
```diff
+    {2} = {0}[mask_cols]
+except:
+    pass
 '''.format(operator[2], operator[3], result_name)
```
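For reference, a standalone sketch of the selection pattern this template generates: fit `VarianceThreshold`, take the surviving column indices from `get_support`, and keep those columns plus the bookkeeping columns. The data, column names, and threshold below are invented for illustration and are not TPOT's generated code verbatim.

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Hypothetical frame: two informative features, one constant feature, plus bookkeeping columns.
df = pd.DataFrame({'f1': [0, 1, 0, 1], 'f2': [3, 1, 2, 0], 'f3': [7, 7, 7, 7],
                   'guess': 0, 'class': [0, 1, 0, 1], 'group': 'training'})
training_features = df.drop(['guess', 'class', 'group'], axis=1)

selector = VarianceThreshold(threshold=0.1)
try:
    selector.fit(training_features.values)
    mask = selector.get_support(indices=True)   # integer indices of the kept columns
    mask_cols = list(training_features.columns[mask]) + ['guess', 'class', 'group']
    result = df[mask_cols]
except ValueError:
    # Raised when no feature meets the threshold; fall back to bookkeeping columns only.
    result = df[['guess', 'class', 'group']]

print(list(result.columns))   # ['f1', 'f2', 'guess', 'class', 'group'] -- constant 'f3' dropped
```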
```diff
 elif operator_name == '_select_kbest':
     operator_text += '''
 #Using Scikit-learn's SelectKBest for feature selection
 training_features = {0}.loc[training_indeces].drop('class', axis=1)
 
-selector = SelectKBest(chi2, k={1})
-selector.fit(training_features.values)
-mask = selector.get_support(True)
-{2} = {0}[mask + ['class']]
+training_class_vals = {0}.loc[training_indeces, 'class'].values
+if {1} <= 0 or {1} >= len(training_features.columns):
+    {1} = 'all'
```
Review comment: This should be the variable …
```diff
+if len(training_features.columns) == 0:
+    {2} = {0}.copy()
+else:
+    selector = SelectKBest(chi2, k={1})
+    selector.fit(training_features.values, training_class_vals)
+    mask = selector.get_support(True)
+    mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
```
Review comment: There is no …
```diff
+    {2} = {0}[mask_cols]
 '''.format(operator[2], operator[3], result_name)
```
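A similar standalone sketch of the SelectKBest branch, including the `'all'` fallback the new template uses when `k` is out of range (scikit-learn accepts the string `'all'` to keep every feature). Data and names are illustrative only.

```python
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

df = pd.DataFrame({'f1': [0, 1, 0, 1], 'f2': [5, 0, 4, 1], 'guess': 0,
                   'class': [0, 1, 0, 1], 'group': 'training'})
training_features = df.drop(['guess', 'class', 'group'], axis=1)
training_class_vals = df['class'].values

k = 10
if k <= 0 or k >= len(training_features.columns):
    k = 'all'                      # out-of-range k: keep all features

selector = SelectKBest(chi2, k=k)  # chi2 scoring requires non-negative feature values
selector.fit(training_features.values, training_class_vals)
mask = selector.get_support(indices=True)
mask_cols = list(training_features.columns[mask]) + ['guess', 'class', 'group']
result = df[mask_cols]
print(list(result.columns))        # every feature kept because k == 'all'
```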
```diff
 elif operator_name == '_select_percentile':
     operator_text += '''
 #Using Scikit-learn's SelectPercentile for feature selection
 training_features = {0}.loc[training_indeces].drop('class', axis=1)
 training_class_vals = {0}.loc[training_indeces, 'class'].values
 
-selector = SelectPercentile(chi2, k={1})
-selector.fit(training_features.values)
-mask = selector.get_support(True)
-{2} = {0}[mask + ['class']]
+if {1} < 0:
+    {1} = 0
```
Review comment: This should be the variable …
```diff
+if {1} > 100:
+    {1} = 100
```
Review comment: This should be the variable …
```diff
+if len(training_features.columns) == 0:
+    {2} = {0}.copy()
+else:
+    selector = SelectPercentile(chi2, percentile={1})
+    selector.fit(training_features.values, training_class_vals)
+    mask = selector.get_support(True)
+    mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
```
Review comment: There is no …
```diff
+    {2} = {0}[mask_cols]
 '''.format(operator[2], operator[3], result_name)
```
Review comment: Looks like this export case is incomplete and should read something like:
`mask_cols = list(training_features[mask].columns) + ['class']`
`{2} = {0}[mask_cols + ['class']]`
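Purely for illustration, here is how the SelectPercentile branch behaves once the percentile is clamped to the valid [0, 100] range. The clamp mirrors the template's intent, but the data and values are made up and this is not TPOT's generated code.

```python
import pandas as pd
from sklearn.feature_selection import SelectPercentile, chi2

df = pd.DataFrame({'f1': [0, 1, 0, 1], 'f2': [9, 0, 8, 1], 'f3': [1, 1, 2, 1],
                   'guess': 0, 'class': [0, 1, 0, 1], 'group': 'training'})
training_features = df.drop(['guess', 'class', 'group'], axis=1)
training_class_vals = df['class'].values

percentile = 150
percentile = min(max(percentile, 0), 100)   # clamp to a valid percentile

selector = SelectPercentile(chi2, percentile=percentile)
selector.fit(training_features.values, training_class_vals)
mask = selector.get_support(indices=True)
mask_cols = list(training_features.columns[mask]) + ['guess', 'class', 'group']
result = df[mask_cols]
print(list(result.columns))                 # percentile=100 keeps every feature
```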
```diff
 elif operator_name == '_rfe':
     operator_text += '''
 #Using Scikit-learn's Recursive Feature Elimination
 training_features = {0}.loc[training_indeces].drop('class', axis=1)
 training_class_vals = {0}.loc[training_indeces, 'class'].values
 
-selector = RFE(SVC(kernel='linear'), {1}, {2})
-selector.fit(training_features.values)
-mask = selector.get_support(True)
-{3} = {0}[mask + ['class']]
+if {2} <= 0.05:
+    {2} = 0.05
+if {2} > 1:
+    {2} = 1.0
+if {1} < 0:
+    {1} = 1
+if {1} > len(training_features.columns):
+    {1} = len(training_features.columns)
+if len(training_features.columns) == 0:
+    {3} = {0}.copy()
+else:
+    selector = RFE(SVC(kernel='linear'), n_features_to_select={1}, step={2})
+    try:
+        selector.fit(training_features.values, training_class_vals)
+        mask = selector.get_support(True)
+        mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
```
Review comment: There is no …
```diff
+        {3} = {0}[mask_cols]
+    except ValueError:
+        {3} = {0}[['guess', 'class', 'group']]
```
Review comment: There is no …
```diff
 '''.format(operator[2], operator[3], operator[4], result_name)
 pipeline_text += operator_text
```
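For completeness, a minimal standalone sketch of the RFE branch with the bounds checks applied. It imports `SVC` from `sklearn.svm`, which, as the final review comment on this PR points out, the exported pipeline would also need. Data, values, and names are invented for illustration.

```python
import pandas as pd
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

df = pd.DataFrame({'f1': [0., 1, 0, 1], 'f2': [5., 0, 4, 1], 'f3': [1., 1, 2, 1],
                   'guess': 0, 'class': [0, 1, 0, 1], 'group': 'training'})
training_features = df.drop(['guess', 'class', 'group'], axis=1)
training_class_vals = df['class'].values

num_features, step = 2, 0.5
step = min(max(step, 0.05), 1.0)                              # keep step in a sane range
num_features = min(max(num_features, 1), len(training_features.columns))

selector = RFE(SVC(kernel='linear'), n_features_to_select=num_features, step=step)
try:
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(indices=True)
    mask_cols = list(training_features.columns[mask]) + ['guess', 'class', 'group']
    result = df[mask_cols]
except ValueError:
    result = df[['guess', 'class', 'group']]                  # e.g. no usable features

print(list(result.columns))
```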
```diff
@@ -1007,12 +1045,11 @@ def _rfe(self, input_df, num_features, step):
         Returns a DataFrame containing the the num_pairs best feature pairs
 
         """
 
         training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
         training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values
 
-        if step <= 0:
-            step = 0.1
+        if step <= 0.05:
+            step = 0.05
         if step > 1:
             step = 1.0
         if num_features < 0:
```
```diff
@@ -1024,11 +1061,13 @@ def _rfe(self, input_df, num_features, step):
         estimator = SVC(kernel='linear')
         selector = RFE(estimator, n_features_to_select=num_features, step=step)
-        selector.fit(training_features, training_class_vals)#.reshape((training_class_vals.shape[0], 1)))
-        mask = selector.get_support(True)
-        mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
-        return input_df[mask_cols].copy()
+        try:
+            selector.fit(training_features, training_class_vals)
+            mask = selector.get_support(True)
+            mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
+            return input_df[mask_cols].copy()
+        except ValueError:
+            return input_df[['guess', 'class', 'group']].copy()
 
     def _select_percentile(self, input_df, percentile):
         """Uses Scikit-learn's SelectPercentile feature selection to learn the subset of features that belong in the highest <percentile> percentile according to some scoring function
```
```diff
@@ -1046,7 +1085,6 @@ def _select_percentile(self, input_df, percentile):
         Returns a DataFrame containing the the num_pairs best feature pairs
 
         """
 
         training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
         training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values
 
@@ -1058,7 +1096,7 @@ def _select_percentile(self, input_df, percentile):
             return input_df.copy()
 
         selector = SelectPercentile(chi2, percentile=percentile)
-        selector.fit(training_features, training_class_vals)#.reshape((training_class_vals.shape[0], 1)))
+        selector.fit(training_features, training_class_vals)
         mask = selector.get_support(True)
         mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
         return input_df[mask_cols].copy()
```
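One caveat worth keeping in mind for both `_select_percentile` and `_select_kbest`: scikit-learn's `chi2` scoring rejects negative feature values, so feeding signed features into these operators raises a ValueError at fit time. A tiny illustration with made-up data:

```python
import numpy as np
from sklearn.feature_selection import SelectPercentile, chi2

X = np.array([[1.0, -0.5], [2.0, 0.5], [0.0, -1.0], [3.0, 1.0]])
y = np.array([0, 1, 0, 1])

try:
    SelectPercentile(chi2, percentile=50).fit(X, y)
except ValueError as exc:
    print(exc)   # chi2 requires non-negative feature values
```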
```diff
@@ -1082,7 +1120,6 @@ def _select_kbest(self, input_df, k):
         Returns a DataFrame containing the the num_pairs best feature pairs
 
         """
 
         training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
         training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values
 
@@ -1092,7 +1129,7 @@ def _select_kbest(self, input_df, k):
             return input_df.copy()
 
         selector = SelectKBest(chi2, k=k)
-        selector.fit(training_features, training_class_vals)#.reshape((training_class_vals.shape[0], 1)))
+        selector.fit(training_features, training_class_vals)
         mask = selector.get_support(True)
         mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
         return input_df[mask_cols].copy()
```
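A small side note on the `get_support(True)` call used throughout these methods: the positional `True` is `indices=True`, so it returns integer column positions rather than a boolean mask, and those indices are what the `mask_cols` construction relies on. A quick sketch showing the two forms:

```python
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

X = np.array([[1, 9, 3], [2, 0, 3], [1, 8, 4], [3, 1, 3]])
y = np.array([0, 1, 0, 1])

selector = SelectKBest(chi2, k=2).fit(X, y)
print(selector.get_support())       # boolean mask, e.g. [ True  True False]
print(selector.get_support(True))   # integer indices, e.g. [0 1]
```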
```diff
@@ -1121,11 +1158,10 @@ def _variance_threshold(self, input_df, threshold):
         selector = VarianceThreshold(threshold=threshold)
         try:
             selector.fit(training_features)
-        except ValueError:
-            return input_df.copy()
+        except ValueError: #when none features are above the variance threshold
+            return input_df[['guess', 'class', 'group']].copy()
         mask = selector.get_support(True)
         mask_cols = list(training_features[mask].columns) + ['guess', 'class', 'group']
-        #return input_df[[mask , 'guess', 'class', 'group']].copy()
         return input_df[mask_cols].copy()
 
     def _dt_feature_selection(self, input_df, num_pairs):
```
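The `except ValueError` path this hunk retargets matches scikit-learn's behavior: `VarianceThreshold.fit` raises when no feature's variance exceeds the threshold, rather than returning an empty selection. A small sketch with invented data:

```python
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[1.0, 2.0], [1.0, 2.0], [1.0, 2.0]])   # every feature is constant

try:
    VarianceThreshold(threshold=0.0).fit(X)
except ValueError as exc:
    print(exc)   # no feature in X meets the variance threshold
```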
Review comment: RFE should also import SVC.
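Following up on that comment, a sketch of the extra import the export step would presumably need when `_rfe` is in the pipeline. The exact wording of the generated import line is my guess, not code from this PR.

```python
operators_used = ['_rfe']          # hypothetical operator list
pipeline_text = ''

# Alongside the other operator-conditional imports, the RFE case would also need
# SVC, since the generated snippet constructs RFE(SVC(kernel='linear'), ...).
if '_rfe' in operators_used:
    pipeline_text += 'from sklearn.feature_selection import RFE\n'
    pipeline_text += 'from sklearn.svm import SVC\n'

print(pipeline_text)
```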