Add OneHotEncoder and create configs that support sparse matrices
teaearlgraycold committed May 24, 2017
commit 00f9abefbdcf5f490074988f2d265722257352c2
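For context, a minimal usage sketch (not part of this commit) of how a custom configuration dictionary such as the classifier_config_sparse added below could be handed to TPOT. The config_dict parameter and the fit call are assumptions about the surrounding TPOT API, not changes made here.

# Sketch only: restrict TPOT's search to the sparse-friendly operators below.
from tpot import TPOTClassifier
from tpot.config_classifier_sparse import classifier_config_sparse

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      config_dict=classifier_config_sparse)
# X_train may be a scipy.sparse matrix when using this configuration.
# tpot.fit(X_train, y_train)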
1 change: 1 addition & 0 deletions tpot/built_in_operators.py
@@ -20,6 +20,7 @@
"""

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.utils import check_array

114 changes: 114 additions & 0 deletions tpot/config_classifier_sparse.py
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-

"""Copyright 2015-Present Randal S. Olson.

This file is part of the TPOT library.

TPOT is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.

TPOT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
"""

import numpy as np

classifier_config_sparse = {
    'tpot.one_hot_encoder.OneHotEncoder': {
    },

    'sklearn.neighbors.KNeighborsClassifier': {
        'n_neighbors': range(1, 101),
        'weights': ["uniform", "distance"],
        'p': [1, 2]
    },

    'sklearn.ensemble.RandomForestClassifier': {
        'n_estimators': [100],
        'criterion': ["gini", "entropy"],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'bootstrap': [True, False]
    },

    'sklearn.feature_selection.SelectFwe': {
        'alpha': np.arange(0, 0.05, 0.001),
        'score_func': {
            'sklearn.feature_selection.f_classif': None
        }
    },

    'sklearn.feature_selection.SelectPercentile': {
        'percentile': range(1, 100),
        'score_func': {
            'sklearn.feature_selection.f_classif': None
        }
    },

    'sklearn.feature_selection.VarianceThreshold': {
        'threshold': np.arange(0.05, 1.01, 0.05)
    },

    'sklearn.feature_selection.RFE': {
        'step': np.arange(0.05, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesClassifier': {
                'n_estimators': [100],
                'criterion': ['gini', 'entropy'],
                'max_features': np.arange(0.05, 1.01, 0.05)
            }
        }
    },

    'sklearn.feature_selection.SelectFromModel': {
        'threshold': np.arange(0, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesClassifier': {
                'n_estimators': [100],
                'criterion': ['gini', 'entropy'],
                'max_features': np.arange(0.05, 1.01, 0.05)
            }
        }
    },

    'sklearn.linear_model.LogisticRegression': {
        'penalty': ["l1", "l2"],
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
        'dual': [True, False]
    },

    'sklearn.naive_bayes.BernoulliNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },

    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },

    'sklearn.svm.LinearSVC': {
        'penalty': ["l1", "l2"],
        'loss': ["hinge", "squared_hinge"],
        'dual': [True, False],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]
    },

    'xgboost.XGBClassifier': {
        'n_estimators': [100],
        'max_depth': range(1, 11),
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'subsample': np.arange(0.05, 1.01, 0.05),
        'min_child_weight': range(1, 21),
        'nthread': [1]
    }
}
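As a reading aid (illustration only, not TPOT's actual instantiation code): each top-level key names an operator by its import path, each value maps hyperparameter names to candidate values, and a nested dictionary such as 'estimator' or 'score_func' describes a sub-estimator or scoring callable in the same format. One possible concrete sample from the SelectFromModel entry above:

# Hypothetical sample drawn from the search space declared above:
# threshold from np.arange(0, 1.01, 0.05) and an ExtraTreesClassifier
# sub-estimator with criterion 'gini' and max_features 0.5.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(
    estimator=ExtraTreesClassifier(n_estimators=100, criterion='gini',
                                   max_features=0.5),
    threshold=0.25)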
2 changes: 0 additions & 2 deletions tpot/config_regressor.py
@@ -23,7 +23,6 @@

regressor_config_dict = {


    'sklearn.linear_model.ElasticNetCV': {
        'l1_ratio': np.arange(0.0, 1.01, 0.05),
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
@@ -91,7 +90,6 @@
    'sklearn.linear_model.RidgeCV': {
    },


    'xgboost.XGBRegressor': {
        'n_estimators': [100],
        'max_depth': range(1, 11),
92 changes: 92 additions & 0 deletions tpot/config_regressor_sparse.py
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-

"""Copyright 2015-Present Randal S. Olson.

This file is part of the TPOT library.

TPOT is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of
the License, or (at your option) any later version.

TPOT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
"""

import numpy as np

regressor_config_sparse = {
    'tpot.one_hot_encoder.OneHotEncoder': {
    },

    'sklearn.neighbors.KNeighborsRegressor': {
        'n_neighbors': range(1, 101),
        'weights': ["uniform", "distance"],
        'p': [1, 2]
    },

    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [100],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'bootstrap': [True, False]
    },

    'sklearn.feature_selection.SelectFwe': {
        'alpha': np.arange(0, 0.05, 0.001),
        'score_func': {
            'sklearn.feature_selection.f_classif': None
        }
    },

    'sklearn.feature_selection.SelectPercentile': {
        'percentile': range(1, 100),
        'score_func': {
            'sklearn.feature_selection.f_classif': None
        }
    },

    'sklearn.feature_selection.VarianceThreshold': {
        'threshold': np.arange(0.05, 1.01, 0.05)
    },

    'sklearn.feature_selection.SelectFromModel': {
        'threshold': np.arange(0, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesRegressor': {
                'n_estimators': [100],
                'max_features': np.arange(0.05, 1.01, 0.05)
            }
        }
    },

    'sklearn.linear_model.ElasticNetCV': {
        'l1_ratio': np.arange(0.0, 1.01, 0.05),
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    },

    'sklearn.linear_model.RidgeCV': {
    },

    'sklearn.svm.LinearSVR': {
        'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"],
        'dual': [True, False],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
        'epsilon': [1e-4, 1e-3, 1e-2, 1e-1, 1.]
    },

    'xgboost.XGBRegressor': {
        'n_estimators': [100],
        'max_depth': range(1, 11),
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'subsample': np.arange(0.05, 1.01, 0.05),
        'min_child_weight': range(1, 21),
        'nthread': [1]
    }
}