diff --git a/setup.py b/setup.py
index 2d61ea0b3..efaf74364 100644
--- a/setup.py
+++ b/setup.py
@@ -8,6 +8,7 @@ def calculate_version():
version = list(filter(lambda x: '__version__' in x, initpy))[0].split('\'')[1]
return version
+
package_version = calculate_version()
setup(
@@ -35,9 +36,11 @@ def calculate_version():
''',
zip_safe=True,
install_requires=['numpy>=1.12.1', 'scipy>=0.19.0', 'scikit-learn>=0.18.1', 'deap>=1.0', 'update_checker>=0.16', 'tqdm>=4.11.2'],
- extras_require={'xgboost': ['xgboost>=0.6'],
- 'skrebate': ['skrebate>=0.3.4'],
- 'mdr': ['scikit-mdr>=0.4.2']},
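+ # Optional extras, installable via e.g. "pip install tpot[xgboost]"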
+ extras_require={
+ 'xgboost': ['xgboost>=0.6'],
+ 'skrebate': ['skrebate>=0.3.4'],
+ 'mdr': ['scikit-mdr>=0.4.2']
+ },
classifiers=[
'Intended Audience :: Science/Research',
'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)',
diff --git a/tests.py b/tests.py
index 21163100d..174d0cd03 100644
--- a/tests.py
+++ b/tests.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -36,14 +35,13 @@
import numpy as np
import inspect
import random
-import time
-from datetime import datetime
import subprocess
from sklearn.datasets import load_digits, load_boston
from sklearn.model_selection import train_test_split, cross_val_score
from deap import creator
from tqdm import tqdm
+from nose.tools import assert_raises
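+# assert_raises replaces the hand-rolled try/except/assert-False blocks used previously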
# Set up the MNIST data set for testing
mnist_data = load_digits()
@@ -59,28 +57,40 @@
random.seed(42)
test_operator_key = 'sklearn.feature_selection.SelectKBest'
-TPOTSelectKBest,TPOTSelectKBest_args = TPOTOperatorClassFactory(test_operator_key,
- classifier_config_dict[test_operator_key])
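+# TPOTOperatorClassFactory returns the generated operator class and its argument-type classes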
+TPOTSelectKBest, TPOTSelectKBest_args = TPOTOperatorClassFactory(
+ test_operator_key,
+ classifier_config_dict[test_operator_key]
+)
+
def test_driver():
- """Assert that the TPOT driver output normal result"""
+ """Assert that the TPOT driver output normal result."""
batcmd = "python -m tpot.driver tests.csv -is , -target class -g 2 -p 2 -os 4 -cv 5 -s 45 -v 1"
ret_stdout = subprocess.check_output(batcmd, shell=True)
+
try:
ret_val = float(ret_stdout.decode('UTF-8').split('\n')[-2].split(': ')[-1])
- except:
+ except Exception:
ret_val = -float('inf')
+
assert ret_val > 0.0
def test_init_custom_parameters():
- """Assert that the TPOT instantiator stores the TPOT variables properly"""
-
- tpot_obj = TPOTClassifier(population_size=500, generations=1000, offspring_size=2000,
- mutation_rate=0.05, crossover_rate=0.9,
- scoring='accuracy', cv=10,
- verbosity=1, random_state=42,
- disable_update_check=True, warm_start=True)
+ """Assert that the TPOT instantiator stores the TPOT variables properly."""
+ tpot_obj = TPOTClassifier(
+ population_size=500,
+ generations=1000,
+ offspring_size=2000,
+ mutation_rate=0.05,
+ crossover_rate=0.9,
+ scoring='accuracy',
+ cv=10,
+ verbosity=1,
+ random_state=42,
+ disable_update_check=True,
+ warm_start=True
+ )
assert tpot_obj.population_size == 500
assert tpot_obj.generations == 1000
@@ -99,40 +109,38 @@ def test_init_custom_parameters():
def test_init_default_scoring():
- """Assert that TPOT intitializes with the correct default scoring function"""
-
+ """Assert that TPOT intitializes with the correct default scoring function."""
tpot_obj = TPOTRegressor()
assert tpot_obj.scoring_function == 'neg_mean_squared_error'
tpot_obj = TPOTClassifier()
assert tpot_obj.scoring_function == 'accuracy'
+
def test_invaild_score_warning():
- """Assert that the TPOT fit function raises a ValueError when the scoring metrics is not available in SCORERS"""
- try:
- tpot_obj = TPOTClassifier(scoring='balanced_accuray') # typo for balanced_accuracy
- assert False
- except ValueError:
- pass
- try:
- tpot_obj = TPOTClassifier(scoring='balanced_accuracy') # correct one
- assert True
- except:
- assert False
+ """Assert that the TPOT fit function raises a ValueError when the scoring metrics is not available in SCORERS."""
+ # Mis-spelled scorer
+ assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray')
+ # Correctly spelled
+ TPOTClassifier(scoring='balanced_accuracy')
+
def test_invaild_dataset_warning():
- """Assert that the TPOT fit function raises a ValueError when dataset is not in right format"""
- tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0)
- bad_training_classes = training_classes.reshape((1, len(training_classes)))# common mistake in classes
- try:
- tpot_obj.fit(training_features ,bad_training_classes) # typo for balanced_accuracy
- assert False
- except ValueError:
- pass
+ """Assert that the TPOT fit function raises a ValueError when dataset is not in right format."""
+ tpot_obj = TPOTClassifier(
+ random_state=42,
+ population_size=1,
+ offspring_size=2,
+ generations=1,
+ verbosity=0
+ )
+ # Reshaping class labels into a single row is a common mistake
+ bad_training_classes = training_classes.reshape((1, len(training_classes)))
+ assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_classes)
-def test_init_max_time_mins():
- """Assert that the TPOT init stores max run time and sets generations to 1000000"""
+def test_init_max_time_mins():
+ """Assert that the TPOT init stores max run time and sets generations to 1000000."""
tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000)
assert tpot_obj.generations == 1000000
@@ -140,8 +148,7 @@ def test_init_max_time_mins():
def test_get_params():
- """Assert that get_params returns the exact dictionary of parameters used by TPOT"""
-
+ """Assert that get_params returns the exact dictionary of parameters used by TPOT."""
kwargs = {
'population_size': 500,
'generations': 1000,
@@ -162,21 +169,21 @@ def test_get_params():
def test_set_params():
- """Assert that set_params returns a reference to the TPOT instance"""
-
+ """Assert that set_params returns a reference to the TPOT instance."""
tpot_obj = TPOTClassifier()
assert tpot_obj.set_params() is tpot_obj
def test_set_params_2():
- """Assert that set_params updates TPOT's instance variables"""
+ """Assert that set_params updates TPOT's instance variables."""
tpot_obj = TPOTClassifier(generations=2)
tpot_obj.set_params(generations=3)
assert tpot_obj.generations == 3
+
def test_lite_params():
- """Assert that TPOT uses TPOT's lite dictionary of operators when config_dict is \'TPOT light\' or \'TPOT MDR\'"""
+ """Assert that TPOT uses TPOT's lite dictionary of operators when config_dict is 'TPOT light' or 'TPOT MDR'."""
tpot_obj = TPOTClassifier(config_dict='TPOT light')
assert tpot_obj.config_dict == classifier_config_dict_light
@@ -186,24 +193,20 @@ def test_lite_params():
tpot_obj = TPOTRegressor(config_dict='TPOT light')
assert tpot_obj.config_dict == regressor_config_dict_light
- try:
- tpot_obj = TPOTRegressor(config_dict='TPOT MDR')
- assert False
- except TypeError:
- assert True
+ assert_raises(TypeError, TPOTRegressor, config_dict='TPOT MDR')
def test_random_ind():
- """Assert that the TPOTClassifier can generate the same pipeline with same random seed"""
+ """Assert that the TPOTClassifier can generate the same pipeline with same random seed."""
tpot_obj = TPOTClassifier(random_state=43)
pipeline1 = str(tpot_obj._toolbox.individual())
tpot_obj = TPOTClassifier(random_state=43)
pipeline2 = str(tpot_obj._toolbox.individual())
assert pipeline1 == pipeline2
-def test_random_ind_2():
- """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 45"""
+def test_random_ind_2():
+ """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 45."""
tpot_obj = TPOTClassifier(random_state=45)
tpot_obj._pbar = tqdm(total=1, disable=True)
pipeline = tpot_obj._toolbox.individual()
@@ -228,50 +231,47 @@ def test_random_ind_2():
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""
+
assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
-def test_score():
- """Assert that the TPOT score function raises a RuntimeError when no optimized pipeline exists"""
+def test_score():
+ """Assert that the TPOT score function raises a RuntimeError when no optimized pipeline exists."""
tpot_obj = TPOTClassifier()
-
- try:
- tpot_obj.score(testing_features, testing_classes)
- assert False # Should be unreachable
- except RuntimeError:
- pass
+ assert_raises(RuntimeError, tpot_obj.score, testing_features, testing_classes)
def test_score_2():
- """Assert that the TPOTClassifier score function outputs a known score for a fix pipeline"""
-
+ """Assert that the TPOTClassifier score function outputs a known score for a fix pipeline."""
tpot_obj = TPOTClassifier()
- known_score = 0.977777777778 # Assumes use of the TPOT balanced_accuracy function
+ known_score = 0.977777777778 # Assumes use of the TPOT accuracy function
# Reify pipeline with known score
- pipeline_string= ('KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
- 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)')
+ pipeline_string = (
+ 'KNeighborsClassifier('
+ 'input_matrix, '
+ 'KNeighborsClassifier__n_neighbors=10, '
+ 'KNeighborsClassifier__p=1, '
+ 'KNeighborsClassifier__weights=uniform'
+ ')'
+ )
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)
# Get score from TPOT
score = tpot_obj.score(testing_features, testing_classes)
- # http://stackoverflow.com/questions/5595425/
- def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
- return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
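+ # np.allclose defaults to rtol=1e-05 and atol=1e-08, a looser tolerance than the removed helper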
+ assert np.allclose(known_score, score)
- assert isclose(known_score, score)
def test_score_3():
- """Assert that the TPOTRegressor score function outputs a known score for a fix pipeline"""
-
+ """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline."""
tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')
- known_score = 12.3727966005 # Assumes use of mse
+ known_score = 12.3727966005 # Assumes use of mse
# Reify pipeline with known score
-
- pipeline_string = ("ExtraTreesRegressor("
+ pipeline_string = (
+ "ExtraTreesRegressor("
"GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
"GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
"GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
@@ -279,28 +279,25 @@ def test_score_3():
"GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),"
"ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
"ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
- "ExtraTreesRegressor__n_estimators=100)")
+ "ExtraTreesRegressor__n_estimators=100)"
+ )
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)
+
# Get score from TPOT
score = tpot_obj.score(testing_features_r, testing_classes_r)
+ assert np.allclose(known_score, score)
- # http://stackoverflow.com/questions/5595425/
- def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
- return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
-
- assert isclose(known_score, score)
def test_sample_weight_func():
- """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights"""
-
+ """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights."""
tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')
# Reify pipeline with known score
-
- pipeline_string = ("ExtraTreesRegressor("
+ pipeline_string = (
+ "ExtraTreesRegressor("
"GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
"GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
"GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
@@ -308,7 +305,8 @@ def test_sample_weight_func():
"GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),"
"ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
"ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
- "ExtraTreesRegressor__n_estimators=100)")
+ "ExtraTreesRegressor__n_estimators=100)"
+ )
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)
@@ -332,50 +330,51 @@ def test_sample_weight_func():
np.random.seed(42)
tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict)
# Get score from TPOT
- known_score = 12.643383517 # Assumes use of mse
+ known_score = 12.643383517 # Assumes use of mse
score = tpot_obj.score(testing_features_r, testing_classes_r)
- # http://stackoverflow.com/questions/5595425/
- def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
- return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
assert np.allclose(cv_score1, cv_score2)
assert not np.allclose(cv_score1, cv_score_weight)
- assert isclose(known_score, score)
+ assert np.allclose(known_score, score)
-def test_predict():
- """Assert that the TPOT predict function raises a RuntimeError when no optimized pipeline exists"""
+def test_predict():
+ """Assert that the TPOT predict function raises a RuntimeError when no optimized pipeline exists."""
tpot_obj = TPOTClassifier()
-
- try:
- tpot_obj.predict(testing_features)
- assert False # Should be unreachable
- except RuntimeError:
- pass
+ assert_raises(RuntimeError, tpot_obj.predict, testing_features)
def test_predict_2():
- """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,)"""
-
+ """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,)."""
tpot_obj = TPOTClassifier()
- pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
- ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
- 'DecisionTreeClassifier__min_samples_split=5)')
+ pipeline_string = (
+ 'DecisionTreeClassifier('
+ 'input_matrix, '
+ 'DecisionTreeClassifier__criterion=gini, '
+ 'DecisionTreeClassifier__max_depth=8, '
+ 'DecisionTreeClassifier__min_samples_leaf=5, '
+ 'DecisionTreeClassifier__min_samples_split=5'
+ ')'
+ )
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)
-
result = tpot_obj.predict(testing_features)
assert result.shape == (testing_features.shape[0],)
-def test_predict_proba():
- """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_classes)"""
+def test_predict_proba():
+ """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_classes)."""
tpot_obj = TPOTClassifier()
- pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
- ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
- 'DecisionTreeClassifier__min_samples_split=5)')
+ pipeline_string = (
+ 'DecisionTreeClassifier('
+ 'input_matrix, '
+ 'DecisionTreeClassifier__criterion=gini, '
+ 'DecisionTreeClassifier__max_depth=8, '
+ 'DecisionTreeClassifier__min_samples_leaf=5, '
+ 'DecisionTreeClassifier__min_samples_split=5)'
+ )
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)
@@ -387,63 +386,76 @@ def test_predict_proba():
def test_predict_proba2():
- """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float)"""
-
+ """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float)."""
tpot_obj = TPOTClassifier()
- pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
- ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
- 'DecisionTreeClassifier__min_samples_split=5)')
+ pipeline_string = (
+ 'DecisionTreeClassifier('
+ 'input_matrix, '
+ 'DecisionTreeClassifier__criterion=gini, '
+ 'DecisionTreeClassifier__max_depth=8, '
+ 'DecisionTreeClassifier__min_samples_leaf=5, '
+ 'DecisionTreeClassifier__min_samples_split=5)'
+ )
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)
result = tpot_obj.predict_proba(testing_features)
+ rows, columns = result.shape
- rows = result.shape[0]
- columns = result.shape[1]
+ for i in range(rows):
+ for j in range(columns):
+ float_range(result[i][j])
- try:
- for i in range(rows):
- for j in range(columns):
- float_range(result[i][j])
- assert True
- except Exception:
- assert False
def test_warm_start():
- """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run"""
+ """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run."""
tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, warm_start=True)
tpot_obj.fit(training_features, training_classes)
- assert tpot_obj._pop != None
- assert tpot_obj._pareto_front != None
+ assert tpot_obj._pop is not None
+ assert tpot_obj._pareto_front is not None
first_pop = tpot_obj._pop
- first_pareto_front = tpot_obj._pareto_front
-
tpot_obj.random_state = 21
tpot_obj.fit(training_features, training_classes)
assert tpot_obj._pop == first_pop
+
def test_fit():
- """Assert that the TPOT fit function provides an optimized pipeline"""
- tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0)
+ """Assert that the TPOT fit function provides an optimized pipeline."""
+ tpot_obj = TPOTClassifier(
+ random_state=42,
+ population_size=1,
+ offspring_size=2,
+ generations=1,
+ verbosity=0
+ )
tpot_obj.fit(training_features, training_classes)
assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
assert not (tpot_obj._start_datetime is None)
+
def test_fit2():
- """Assert that the TPOT fit function provides an optimized pipeline when config_dict is \'TPOT light\'"""
- tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, config_dict='TPOT light')
+ """Assert that the TPOT fit function provides an optimized pipeline when config_dict is 'TPOT light'."""
+ tpot_obj = TPOTClassifier(
+ random_state=42,
+ population_size=1,
+ offspring_size=2,
+ generations=1,
+ verbosity=0,
+ config_dict='TPOT light'
+ )
tpot_obj.fit(training_features, training_classes)
assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
assert not (tpot_obj._start_datetime is None)
+
def testTPOTOperatorClassFactory():
- """Assert that the TPOT operators class factory"""
+ """Assert that the TPOT operators class factory."""
test_config_dict = {
'sklearn.svm.LinearSVC': {
'penalty': ["l1", "l2"],
@@ -463,23 +475,25 @@ def testTPOTOperatorClassFactory():
'threshold': np.arange(0.0, 1.01, 0.05)
}
}
+
tpot_operator_list = []
tpot_argument_list = []
+
for key in sorted(test_config_dict.keys()):
- op,args = TPOTOperatorClassFactory(key, test_config_dict[key])
+ op, args = TPOTOperatorClassFactory(key, test_config_dict[key])
tpot_operator_list.append(op)
tpot_argument_list += args
+
assert len(tpot_operator_list) == 3
assert len(tpot_argument_list) == 9
- assert tpot_operator_list[0].root == True
- assert tpot_operator_list[1].root == False
+ assert tpot_operator_list[0].root is True
+ assert tpot_operator_list[1].root is False
assert tpot_operator_list[2].type() == "Classifier or Regressor"
assert tpot_argument_list[1].values == [True, False]
def check_export(op, tpot_obj):
- """Assert that a TPOT operator exports as expected"""
-
+ """Assert that a TPOT operator exports as expected."""
prng = np.random.RandomState(42)
np.random.seed(42)
@@ -492,7 +506,7 @@ def check_export(op, tpot_obj):
def test_operators():
- """Assert that the TPOT operators match the output of their sklearn counterparts"""
+ """Assert that the TPOT operators match the output of their sklearn counterparts."""
tpot_obj = TPOTClassifier(random_state=42)
for op in tpot_obj.operators:
check_export.description = ("Assert that the TPOT {} operator exports "
@@ -501,22 +515,20 @@ def test_operators():
def test_export():
- """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists"""
+ """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists."""
tpot_obj = TPOTClassifier()
-
- try:
- tpot_obj.export("test_export.py")
- assert False # Should be unreachable
- except RuntimeError:
- pass
+ assert_raises(RuntimeError, tpot_obj.export, "test_export.py")
def test_generate_pipeline_code():
- """Assert that generate_pipeline_code() returns the correct code given a specific pipeline"""
+ """Assert that generate_pipeline_code() returns the correct code given a specific pipeline."""
tpot_obj = TPOTClassifier()
- pipeline = ['KNeighborsClassifier',
- ['CombineDFs',
- ['GradientBoostingClassifier',
+ pipeline = [
+ 'KNeighborsClassifier',
+ [
+ 'CombineDFs',
+ [
+ 'GradientBoostingClassifier',
'input_matrix',
38.0,
5,
@@ -524,12 +536,18 @@ def test_generate_pipeline_code():
5,
0.05,
0.5],
- ['GaussianNB',
- ['ZeroCount',
- 'input_matrix']]],
+ [
+ 'GaussianNB',
+ [
+ 'ZeroCount',
+ 'input_matrix'
+ ]
+ ]
+ ],
18,
'uniform',
- 2]
+ 2
+ ]
expected_code = """make_pipeline(
make_union(
@@ -548,9 +566,8 @@ def test_generate_pipeline_code():
assert expected_code == generate_pipeline_code(pipeline, tpot_obj.operators)
-
def test_generate_import_code():
- """Assert that generate_import_code() returns the correct set of dependancies for a given pipeline"""
+ """Assert that generate_import_code() returns the correct set of dependancies for a given pipeline."""
tpot_obj = TPOTClassifier()
pipeline = creator.Individual.from_string('GaussianNB(RobustScaler(input_matrix))', tpot_obj._pset)
@@ -569,41 +586,58 @@ def test_generate_import_code():
"""
assert expected_code == generate_import_code(pipeline, tpot_obj.operators)
+
def test_mutNodeReplacement():
- """Assert that mutNodeReplacement() returns the correct type of mutation node in a fixed pipeline"""
+ """Assert that mutNodeReplacement() returns the correct type of mutation node in a fixed pipeline."""
tpot_obj = TPOTClassifier()
- pipeline_string= ('KNeighborsClassifier(CombineDFs('
- 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
- ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
- 'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)'
- 'KNeighborsClassifier__n_neighbors=10, '
- 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform')
+ pipeline_string = (
+ 'KNeighborsClassifier(CombineDFs('
+ 'DecisionTreeClassifier(input_matrix, '
+ 'DecisionTreeClassifier__criterion=gini, '
+ 'DecisionTreeClassifier__max_depth=8, '
+ 'DecisionTreeClassifier__min_samples_leaf=5, '
+ 'DecisionTreeClassifier__min_samples_split=5'
+ '), '
+ 'SelectKBest('
+ 'input_matrix, '
+ 'SelectKBest__k=20'
+ ')'
+ 'KNeighborsClassifier__n_neighbors=10, '
+ 'KNeighborsClassifier__p=1, '
+ 'KNeighborsClassifier__weights=uniform'
+ ')'
+ )
+
pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
pipeline[0].ret = Output_Array
old_ret_type_list = [node.ret for node in pipeline]
old_prims_list = [node for node in pipeline if node.arity != 0]
- mut_ind = mutNodeReplacement(pipeline, pset = tpot_obj._pset)
+ mut_ind = mutNodeReplacement(pipeline, pset=tpot_obj._pset)
new_ret_type_list = [node.ret for node in mut_ind[0]]
new_prims_list = [node for node in mut_ind[0] if node.arity != 0]
- if new_prims_list == old_prims_list: # Terminal mutated
+
+ if new_prims_list == old_prims_list: # Terminal mutated
assert new_ret_type_list == old_ret_type_list
- else: # Primitive mutated
+ else: # Primitive mutated
diff_prims = list(set(new_prims_list).symmetric_difference(old_prims_list))
assert diff_prims[0].ret == diff_prims[1].ret
+
assert mut_ind[0][0].ret == Output_Array
def test_export_pipeline():
- """Assert that exported_pipeline() generated a compile source file as expected given a fixed pipeline"""
+ """Assert that exported_pipeline() generated a compile source file as expected given a fixed pipeline."""
tpot_obj = TPOTClassifier()
- pipeline_string= ('KNeighborsClassifier(CombineDFs('
- 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini'
- ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
- 'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)'
- 'KNeighborsClassifier__n_neighbors=10, '
- 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform')
- pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
+ pipeline_string = (
+ 'KNeighborsClassifier(CombineDFs('
+ 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
+ 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
+ 'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)'
+ 'KNeighborsClassifier__n_neighbors=10, '
+ 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
+ )
+ pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
expected_code = """import numpy as np
from copy import copy
@@ -636,11 +670,18 @@ def test_export_pipeline():
"""
assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
+
def test_export_pipeline_2():
- """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline (only one classifier)"""
+ """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline (only one classifier)."""
tpot_obj = TPOTClassifier()
- pipeline_string= ('KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, '
- 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)')
+ pipeline_string = (
+ 'KNeighborsClassifier('
+ 'input_matrix, '
+ 'KNeighborsClassifier__n_neighbors=10, '
+ 'KNeighborsClassifier__p=1, '
+ 'KNeighborsClassifier__weights=uniform'
+ ')'
+ )
pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
expected_code = """import numpy as np
@@ -660,12 +701,15 @@ def test_export_pipeline_2():
"""
assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
+
def test_export_pipeline_3():
- """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with a preprocessor"""
+ """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with a preprocessor."""
tpot_obj = TPOTClassifier()
- pipeline_string= ('DecisionTreeClassifier(SelectKBest(input_matrix, SelectKBest__k=20),'
- 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
- 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)')
+ pipeline_string = (
+ 'DecisionTreeClassifier(SelectKBest(input_matrix, SelectKBest__k=20),'
+ 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
+ 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
+ )
pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
expected_code = """import numpy as np
@@ -691,15 +735,15 @@ def test_export_pipeline_3():
"""
assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
+
def test_operator_export():
- """Assert that a TPOT operator can export properly with a function as a parameter to a classifier"""
+ """Assert that a TPOT operator can export properly with a function as a parameter to a classifier."""
export_string = TPOTSelectKBest.export(5)
assert export_string == "SelectKBest(score_func=f_classif, k=5)"
def test_indent():
- """Assert that indenting a multiline string by 4 spaces prepends 4 spaces before each new line"""
-
+ """Assert that indenting a multiline string by 4 spaces prepends 4 spaces before each new line."""
multiline_string = """test
test1
test2
@@ -714,18 +758,18 @@ def test_indent():
def test_operator_type():
- """Assert that TPOT operators return their type, e.g. "Classifier", "Preprocessor" """
+ """Assert that TPOT operators return their type, e.g. 'Classifier', 'Preprocessor'."""
assert TPOTSelectKBest.type() == "Preprocessor or Selector"
def test_get_by_name():
- """Assert that the Operator class returns operators by name appropriately"""
+ """Assert that the Operator class returns operators by name appropriately."""
tpot_obj = TPOTClassifier()
assert get_by_name("SelectKBest", tpot_obj.operators).__class__ == TPOTSelectKBest.__class__
def test_gen():
- """Assert that TPOT's gen_grow_safe function returns a pipeline of expected structure"""
+ """Assert that TPOT's gen_grow_safe function returns a pipeline of expected structure."""
tpot_obj = TPOTClassifier()
pipeline = tpot_obj._gen_grow_safe(tpot_obj._pset, 1, 3)
@@ -735,46 +779,30 @@ def test_gen():
def test_positive_integer():
- """Assert that the TPOT CLI interface's integer parsing throws an exception when n < 0"""
- try:
- positive_integer('-1')
- assert False # Should be unreachable
- except Exception:
- pass
+ """Assert that the TPOT CLI interface's integer parsing throws an exception when n < 0."""
+ assert_raises(Exception, positive_integer, '-1')
def test_positive_integer_2():
- """Assert that the TPOT CLI interface's integer parsing returns the integer value of a string encoded integer when n > 0"""
+ """Assert that the TPOT CLI interface's integer parsing returns the integer value of a string encoded integer when n > 0."""
assert 1 == positive_integer('1')
def test_positive_integer_3():
- """Assert that the TPOT CLI interface's integer parsing throws an exception when n is not an integer"""
- try:
- positive_integer('foobar')
- assert False # Should be unreachable
- except Exception:
- pass
+ """Assert that the TPOT CLI interface's integer parsing throws an exception when n is not an integer."""
+ assert_raises(Exception, positive_integer, 'foobar')
def test_float_range():
- """Assert that the TPOT CLI interface's float range returns a float with input is in 0. - 1.0"""
+ """Assert that the TPOT CLI interface's float range returns a float with input is in 0. - 1.0."""
assert 0.5 == float_range('0.5')
def test_float_range_2():
- """Assert that the TPOT CLI interface's float range throws an exception when input it out of range"""
- try:
- float_range('2.0')
- assert False # Should be unreachable
- except Exception:
- pass
+ """Assert that the TPOT CLI interface's float range throws an exception when input it out of range."""
+ assert_raises(Exception, float_range, '2.0')
def test_float_range_3():
- """Assert that the TPOT CLI interface's float range throws an exception when input is not a float"""
- try:
- float_range('foobar')
- assert False # Should be unreachable
- except Exception:
- pass
+ """Assert that the TPOT CLI interface's float range throws an exception when input is not a float."""
+ assert_raises(Exception, float_range, 'foobar')
diff --git a/tpot/__init__.py b/tpot/__init__.py
index 6a11bcd96..f1d674528 100644
--- a/tpot/__init__.py
+++ b/tpot/__init__.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
diff --git a/tpot/_version.py b/tpot/_version.py
index e154a1d7c..7c3861417 100644
--- a/tpot/_version.py
+++ b/tpot/_version.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
diff --git a/tpot/base.py b/tpot/base.py
index df1b8fc17..59cf712da 100644
--- a/tpot/base.py
+++ b/tpot/base.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -25,7 +24,6 @@
import inspect
import warnings
import sys
-import time
from functools import partial
from datetime import datetime
from multiprocessing import cpu_count
@@ -66,16 +64,18 @@
import _thread
except ImportError:
import thread as _thread
+
def handler(dwCtrlType, hook_sigint=_thread.interrupt_main):
- if dwCtrlType == 0: # CTRL_C_EVENT
+ """SIGINT handler function."""
+ if dwCtrlType == 0: # CTRL_C_EVENT
hook_sigint()
- return 1 # don't chain to the next handler
+ return 1 # don't chain to the next handler
return 0
win32api.SetConsoleCtrlHandler(handler, 1)
class TPOTBase(BaseEstimator):
- """TPOT automatically creates and optimizes machine learning pipelines using genetic programming"""
+ """Automatically creates and optimizes machine learning pipelines using GP."""
def __init__(self, generations=100, population_size=100, offspring_size=None,
mutation_rate=0.9, crossover_rate=0.1,
@@ -83,7 +83,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
max_time_mins=None, max_eval_time_mins=5,
random_state=None, config_dict=None, warm_start=False,
verbosity=0, disable_update_check=False):
- """Sets up the genetic programming algorithm for pipeline optimization.
+ """Set up the genetic programming algorithm for pipeline optimization.
Parameters
----------
@@ -194,48 +194,34 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
else:
self.offspring_size = population_size
- if config_dict:
- if config_dict == 'TPOT light':
- if self.classification:
- self.config_dict = classifier_config_dict_light
- else:
- self.config_dict = regressor_config_dict_light
- elif config_dict == 'TPOT MDR':
- if self.classification:
- self.config_dict = tpot_mdr_classifier_config_dict
- else:
- raise TypeError('The TPOT MDR operator configuration file does not currently '
- 'work with TPOTRegressor. Please use TPOTClassifier instead.')
- else:
- try:
- with open(config_dict, 'r') as input_file:
- file_string = input_file.read()
- operator_dict = eval(file_string[file_string.find('{'):(file_string.rfind('}') + 1)])
- except:
- raise TypeError('The operator configuration file is in a bad format or not available. '
- 'Please check the configuration file before running TPOT.')
- else:
- self.config_dict = self.default_config_dict
+ self._setup_config(config_dict)
self.operators = []
self.arguments = []
for key in sorted(self.config_dict.keys()):
- op_class, arg_types = TPOTOperatorClassFactory(key, self.config_dict[key],
- BaseClass=Operator, ArgBaseClass=ARGType)
+ op_class, arg_types = TPOTOperatorClassFactory(
+ key,
+ self.config_dict[key],
+ BaseClass=Operator,
+ ArgBaseClass=ARGType
+ )
if op_class:
self.operators.append(op_class)
self.arguments += arg_types
- # Schedule TPOT to run for many generations if the user specifies a run-time limit
- # TPOT will automatically interrupt itself when the timer runs out
- if not (max_time_mins is None):
+ # Schedule TPOT to run for many generations if the user specifies a
+ # run-time limit; TPOT will automatically interrupt itself when the
+ # timer runs out.
+ if max_time_mins is not None:
self.generations = 1000000
self.mutation_rate = mutation_rate
self.crossover_rate = crossover_rate
if self.mutation_rate + self.crossover_rate > 1:
- raise ValueError('The sum of the crossover and mutation probabilities must be <= 1.0.')
+ raise ValueError(
+ 'The sum of the crossover and mutation probabilities must be <= 1.0.'
+ )
self.verbosity = verbosity
self.operators_context = {
@@ -245,43 +231,42 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
'FunctionTransformer': FunctionTransformer,
'copy': copy
}
-
-
-
self._pbar = None
- # Dictionary of individuals that have already been evaluated in previous generations
+ # Dictionary of individuals that have already been evaluated in previous
+ # generations
self._evaluated_individuals = {}
-
self.random_state = random_state
- # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary
+ # If the user passed a custom scoring function, store it in the sklearn
+ # SCORERS dictionary
if scoring:
if hasattr(scoring, '__call__'):
scoring_name = scoring.__name__
-
- if 'loss' in scoring_name or 'error' in scoring_name:
- greater_is_better = False
- else:
- greater_is_better = True
-
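+ # Metric names containing "loss" or "error" indicate scores to minimize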
+ greater_is_better = 'loss' not in scoring_name and 'error' not in scoring_name
SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better)
self.scoring_function = scoring_name
else:
if scoring not in SCORERS:
- raise ValueError('The scoring function {} is not available. '
- 'Please choose a valid scoring function from the TPOT '
- 'documentation.'.format(scoring))
+ raise ValueError(
+ 'The scoring function {} is not available. Please '
+ 'choose a valid scoring function from the TPOT '
+ 'documentation.'.format(scoring)
+ )
self.scoring_function = scoring
self.cv = cv
# If the OS is Windows, warn the user that Ctrl+C cannot safely interrupt a parallelized run
if sys.platform.startswith('win') and n_jobs != 1:
- print('Warning: Although parallelization is currently supported in TPOT for Windows, '
- 'pressing Ctrl+C will freeze the optimization process without saving the best pipeline!'
- 'Thus, Please DO NOT press Ctrl+C during the optimization procss if n_jobs is not equal to 1.'
- 'For quick test in Windows, please set n_jobs to 1 for saving the best pipeline '
- 'in the middle of the optimization process via Ctrl+C.')
+ print(
+ 'Warning: Although parallelization is currently supported in '
+ 'TPOT for Windows, pressing Ctrl+C will freeze the optimization '
+ 'process without saving the best pipeline! Thus, please DO NOT '
+ 'press Ctrl+C during the optimization process if n_jobs is not '
+ 'equal to 1. For a quick test on Windows, please set n_jobs to 1 '
+ 'so that the best pipeline can be saved midway through the '
+ 'optimization process via Ctrl+C.'
+ )
if n_jobs == -1:
self.n_jobs = cpu_count()
else:
@@ -290,45 +275,76 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
self._setup_pset()
self._setup_toolbox()
+ def _setup_config(self, config_dict):
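+ """Set self.config_dict from a built-in configuration name, a config file path, or the default."""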
+ if config_dict:
+ if config_dict == 'TPOT light':
+ if self.classification:
+ self.config_dict = classifier_config_dict_light
+ else:
+ self.config_dict = regressor_config_dict_light
+ elif config_dict == 'TPOT MDR':
+ if self.classification:
+ self.config_dict = tpot_mdr_classifier_config_dict
+ else:
+ raise TypeError(
+ 'The TPOT MDR operator configuration file does not '
+ 'currently work with TPOTRegressor. Please use '
+ 'TPOTClassifier instead.'
+ )
+ else:
+ try:
+ with open(config_dict, 'r') as input_file:
+ file_string = input_file.read()
+ self.config_dict = eval(file_string[file_string.find('{'):(file_string.rfind('}') + 1)])
+ except Exception:
+ raise TypeError(
+ 'The operator configuration file is in a bad format or '
+ 'not available. Please check the configuration file '
+ 'before running TPOT.'
+ )
+ else:
+ self.config_dict = self.default_config_dict
+
def _setup_pset(self):
if self.random_state is not None:
random.seed(self.random_state)
np.random.seed(self.random_state)
self._pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_Array)
-
- # Rename pipeline input to "input_df"
self._pset.renameArguments(ARG0='input_matrix')
+ self._add_operators()
+ self._add_terminals()
+ if self.verbosity > 2:
+ print('{} operators have been imported by TPOT.'.format(len(self.operators)))
- # Add all operators to the primitive set
- for op in self.operators:
-
- if op.root:
+ def _add_operators(self):
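+ """Add all TPOT operators to the DEAP primitive set."""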
+ for operator in self.operators:
+ if operator.root:
# We need to add rooted primitives twice so that they can
# return both an Output_Array (and thus be the root of the tree),
# and return a np.ndarray so they can exist elsewhere in the tree.
- p_types = (op.parameter_types()[0], Output_Array)
- self._pset.addPrimitive(op, *p_types)
+ p_types = (operator.parameter_types()[0], Output_Array)
+ self._pset.addPrimitive(operator, *p_types)
- self._pset.addPrimitive(op, *op.parameter_types())
+ self._pset.addPrimitive(operator, *operator.parameter_types())
# Import required modules into local namespace so that pipelines
# may be evaluated directly
- for key in sorted(op.import_hash.keys()):
- module_list = ', '.join(sorted(op.import_hash[key]))
+ for key in sorted(operator.import_hash.keys()):
+ module_list = ', '.join(sorted(operator.import_hash[key]))
if key.startswith('tpot.'):
exec('from {} import {}'.format(key[4:], module_list))
else:
exec('from {} import {}'.format(key, module_list))
- for var in op.import_hash[key]:
+ for var in operator.import_hash[key]:
self.operators_context[var] = eval(var)
self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray)
- # Terminals
+ def _add_terminals(self):
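+ """Add terminal values for each operator argument type to the DEAP primitive set."""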
for _type in self.arguments:
type_values = list(_type.values)
if 'nthread' not in _type.__name__:
@@ -338,10 +354,6 @@ def _setup_pset(self):
terminal_name = _type.__name__ + "=" + str(val)
self._pset.addTerminal(val, _type, name=terminal_name)
- if self.verbosity > 2:
- print('{} operators have been imported by TPOT.'.format(len(self.operators)))
-
-
def _setup_toolbox(self):
creator.create('FitnessMulti', base.Fitness, weights=(-1.0, 1.0))
creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMulti)
@@ -357,8 +369,7 @@ def _setup_toolbox(self):
self._toolbox.register('mutate', self._random_mutation_operator)
def fit(self, features, classes, sample_weight=None):
- """Fits a machine learning pipeline that maximizes classification score
- on the provided data
+ """Fit an optimitzed machine learning pipeline.
Uses genetic programming to optimize a machine learning pipeline that
maximizes classification score on the provided features and classes.
@@ -389,7 +400,7 @@ def fit(self, features, classes, sample_weight=None):
try:
clf = clf.fit(features, classes)
- except:
+ except Exception:
raise ValueError('Error: Input data is not in a valid format. '
'Please confirm that the input data is scikit-learn compatible. '
'For example, the features must be a 2-D array and target labels '
@@ -397,7 +408,7 @@ def fit(self, features, classes, sample_weight=None):
# Set the seed for the GP run
if self.random_state is not None:
- random.seed(self.random_state) # deap uses random
+ random.seed(self.random_state) # deap uses random
np.random.seed(self.random_state)
self._start_datetime = datetime.now()
@@ -411,7 +422,7 @@ def fit(self, features, classes, sample_weight=None):
pop = self._toolbox.population(n=self.population_size)
def pareto_eq(ind1, ind2):
- """Determines whether two individuals are equal on the Pareto front
+ """Determine whether two individuals are equal on the Pareto front.
Parameters
----------
@@ -445,11 +456,19 @@ def pareto_eq(ind1, ind2):
try:
with warnings.catch_warnings():
warnings.simplefilter('ignore')
- pop, _ = eaMuPlusLambda(population=pop, toolbox=self._toolbox,
- mu=self.population_size, lambda_=self.offspring_size,
- cxpb=self.crossover_rate, mutpb=self.mutation_rate,
- ngen=self.generations, pbar=self._pbar, halloffame=self._pareto_front,
- verbose=self.verbosity, max_time_mins=self.max_time_mins)
+ pop, _ = eaMuPlusLambda(
+ population=pop,
+ toolbox=self._toolbox,
+ mu=self.population_size,
+ lambda_=self.offspring_size,
+ cxpb=self.crossover_rate,
+ mutpb=self.mutation_rate,
+ ngen=self.generations,
+ pbar=self._pbar,
+ halloffame=self._pareto_front,
+ verbose=self.verbosity,
+ max_time_mins=self.max_time_mins
+ )
# store population for the next call
if self.warm_start:
@@ -458,7 +477,7 @@ def pareto_eq(ind1, ind2):
# Allow for certain exceptions to signal a premature fit() cancellation
except (KeyboardInterrupt, SystemExit):
if self.verbosity > 0:
- self._pbar.write('') # just for better interface
+ self._pbar.write('')
self._pbar.write('TPOT closed prematurely. Will use the current best pipeline.')
finally:
# Close the progress bar
@@ -507,7 +526,7 @@ def pareto_eq(ind1, ind2):
self._pareto_front_fitted_pipelines[str(pipeline)].fit(features, classes)
def predict(self, features):
- """Uses the optimized pipeline to predict the classes for a feature set
+ """Use the optimized pipeline to predict the classes for a feature set.
Parameters
----------
@@ -525,8 +544,7 @@ def predict(self, features):
return self._fitted_pipeline.predict(features.astype(np.float64))
def fit_predict(self, features, classes):
- """Convenience function that fits a pipeline then predicts on the
- provided features
+ """Call fit and predict in sequence.
Parameters
----------
@@ -545,7 +563,7 @@ def fit_predict(self, features, classes):
return self.predict(features)
def score(self, testing_features, testing_classes):
- """Estimates the balanced testing accuracy of the optimized pipeline.
+ """Estimate the balanced testing accuracy of the optimized pipeline.
Parameters
----------
@@ -563,12 +581,17 @@ def score(self, testing_features, testing_classes):
if self._fitted_pipeline is None:
raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.')
- # If the scoring function is a string, we must adjust to use the sklearn scoring interface
- return abs(SCORERS[self.scoring_function](self._fitted_pipeline,
- testing_features.astype(np.float64), testing_classes.astype(np.float64)))
+ # If the scoring function is a string, we must adjust to use the sklearn
+ # scoring interface
+ score = SCORERS[self.scoring_function](
+ self._fitted_pipeline,
+ testing_features.astype(np.float64),
+ testing_classes.astype(np.float64)
+ )
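+ # neg_* scorers return negative values, so report the absolute value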
+ return abs(score)
def predict_proba(self, features):
- """Uses the optimized pipeline to estimate the class probabilities for a feature set
+ """Use the optimized pipeline to estimate the class probabilities for a feature set.
Parameters
----------
@@ -589,7 +612,7 @@ def predict_proba(self, features):
return self._fitted_pipeline.predict_proba(features.astype(np.float64))
def set_params(self, **params):
- """Set the parameters of a TPOT instance
+ """Set the parameters of TPOT.
Returns
-------
@@ -600,7 +623,7 @@ def set_params(self, **params):
return self
def export(self, output_file_name):
- """Exports the current optimized pipeline as Python code
+ """Export the current optimized pipeline as Python code.
Parameters
----------
@@ -619,7 +642,7 @@ def export(self, output_file_name):
output_file.write(export_pipeline(self._optimized_pipeline, self.operators, self._pset))
def _compile_to_sklearn(self, expr):
- """Compiles a DEAP pipeline into a sklearn pipeline
+ """Compile a DEAP pipeline into a sklearn pipeline.
Parameters
----------
@@ -634,7 +657,7 @@ def _compile_to_sklearn(self, expr):
return eval(sklearn_pipeline, self.operators_context)
def _set_param_recursive(self, pipeline_steps, parameter, value):
- """Recursively iterates through all objects in the pipeline and sets the given parameter to the specified value
+ """Recursively iterate through all objects in the pipeline and set a given parameter.
Parameters
----------
@@ -660,8 +683,8 @@ def _set_param_recursive(self, pipeline_steps, parameter, value):
if hasattr(obj, parameter):
setattr(obj, parameter, value)
- def _evaluate_individuals(self, individuals, features, classes, sample_weight = None):
- """Determines the `individual`'s fitness
+ def _evaluate_individuals(self, individuals, features, classes, sample_weight=None):
+ """Determine the fit of the provided individuals.
Parameters
----------
@@ -727,8 +750,7 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight =
operator_count = 0
for i in range(len(individual)):
node = individual[i]
- if ((type(node) is deap.gp.Terminal) or
- type(node) is deap.gp.Primitive and node.name == 'CombineDFs'):
+ if ((type(node) is deap.gp.Terminal) or (type(node) is deap.gp.Primitive and node.name == 'CombineDFs')):
continue
operator_count += 1
except Exception:
@@ -744,11 +766,21 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight =
# evaluate pipelines
resulting_score_list = []
# chunk size for pbar update
- for chunk_idx in range(0, len(sklearn_pipeline_list),self.n_jobs*4):
+ for chunk_idx in range(0, len(sklearn_pipeline_list), self.n_jobs * 4):
+ jobs = []
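+ # Build one delayed cross-validation job per pipeline in this chunk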
+ for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + self.n_jobs * 4]:
+ job = delayed(_wrapped_cross_val_score)(
+ sklearn_pipeline,
+ features,
+ classes,
+ self.cv,
+ self.scoring_function,
+ sample_weight,
+ self.max_eval_time_mins
+ )
+ jobs.append(job)
parallel = Parallel(n_jobs=self.n_jobs, verbose=0, pre_dispatch='2*n_jobs')
- tmp_result_score = parallel(delayed(_wrapped_cross_val_score)(sklearn_pipeline, features, classes,
- self.cv, self.scoring_function, sample_weight, self.max_eval_time_mins)
- for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx+self.n_jobs*4])
+ tmp_result_score = parallel(jobs)
# update pbar
for val in tmp_result_score:
if not self._pbar.disable:
@@ -779,7 +811,7 @@ def _mate_operator(self, ind1, ind2):
@_pre_test
def _random_mutation_operator(self, individual):
- """Perform a replacement, insertion, or shrink mutation on an individual
+ """Perform a replacement, insertion, or shrink mutation on an individual.
Parameters
----------
@@ -801,8 +833,7 @@ def _random_mutation_operator(self, individual):
return np.random.choice(mutation_techniques)(individual)
def _gen_grow_safe(self, pset, min_, max_, type_=None):
- """Generate an expression where each leaf might have a different depth
- between min_ and max_.
+ """Generate an expression where each leaf might have a different depth between min_ and max_.
Parameters
----------
@@ -822,8 +853,7 @@ def _gen_grow_safe(self, pset, min_, max_, type_=None):
A grown tree with leaves at possibly different depths.
"""
def condition(height, depth, type_):
- """Expression generation stops when the depth is equal to height or
- when it is randomly determined that a a node should be a terminal"""
+ """Stop when the depth is equal to height or when a node should be a terminal."""
return type_ not in [np.ndarray, Output_Array] or depth == height
return self._generate(pset, min_, max_, condition, type_)
@@ -831,8 +861,10 @@ def condition(height, depth, type_):
# Generate function stolen straight from deap.gp.generate
@_pre_test
def _generate(self, pset, min_, max_, condition, type_=None):
- """Generate a Tree as a list of list. The tree is build from the root to
- the leaves, and it stop growing when the condition is fulfilled.
+ """Generate a Tree as a list of lists.
+
+ The tree is build from the root to the leaves, and it stop growing when
+ the condition is fulfilled.
Parameters
----------
@@ -870,10 +902,10 @@ def _generate(self, pset, min_, max_, condition, type_=None):
term = np.random.choice(pset.terminals[type_])
except IndexError:
_, _, traceback = sys.exc_info()
- raise IndexError("The gp.generate function tried to add "
- "a terminal of type '%s', but there is "
- "none available." % (type_,)).\
- with_traceback(traceback)
+ raise IndexError(
+ 'The gp.generate function tried to add a terminal of '
+ 'type \'%s\', but there is none available.' % (type_,)
+ ).with_traceback(traceback)
if inspect.isclass(term):
term = term()
expr.append(term)
@@ -882,10 +914,10 @@ def _generate(self, pset, min_, max_, condition, type_=None):
prim = np.random.choice(pset.primitives[type_])
except IndexError:
_, _, traceback = sys.exc_info()
- raise IndexError("The gp.generate function tried to add "
- "a primitive of type '%s', but there is "
- "none available." % (type_,)).\
- with_traceback(traceback)
+ raise IndexError(
+ 'The gp.generate function tried to add a primitive of '
+ 'type \'%s\', but there is none available.' % (type_,)
+ ).with_traceback(traceback)
expr.append(prim)
for arg in reversed(prim.args):
stack.append((depth+1, arg))
diff --git a/tpot/built_in_operators.py b/tpot/built_in_operators.py
index df2a420e9..a7d49889f 100644
--- a/tpot/built_in_operators.py
+++ b/tpot/built_in_operators.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -26,18 +25,14 @@
class ZeroCount(BaseEstimator):
-
- """Preprocessor that adds two virtual features to the dataset, one for the count of zero values in the feature set, and one for the count of non-zeros in the feature set"""
-
- def __init__(self):
- pass
+ """Adds the count of zeros and count of non-zeros per sample as features."""
def fit(self, X, y=None):
- """Dummy function to fit in with the sklearn API"""
+ """Dummy function to fit in with the sklearn API."""
return self
def transform(self, X, y=None):
- """Transform data by adding two virtual features
+ """Transform data by adding two virtual features.
Parameters
----------
@@ -57,19 +52,27 @@ def transform(self, X, y=None):
X_transformed = np.copy(X)
- non_zero = np.apply_along_axis(lambda row: np.count_nonzero(row),
- axis=1, arr=X_transformed)
- zero_col = np.apply_along_axis(lambda row: (n_features - np.count_nonzero(row)),
- axis=1, arr=X_transformed)
+ non_zero = np.apply_along_axis(
+ lambda row: np.count_nonzero(row),
+ axis=1,
+ arr=X_transformed
+ )
+ zero_col = np.apply_along_axis(
+ lambda row: (n_features - np.count_nonzero(row)),
+ axis=1,
+ arr=X_transformed
+ )
X_transformed = np.insert(X_transformed, n_features, non_zero, axis=1)
X_transformed = np.insert(X_transformed, n_features + 1, zero_col, axis=1)
return X_transformed
+
class CombineDFs(object):
- """Operator to combine two DataFrames"""
+ """Combine two DataFrames."""
@property
def __name__(self):
+ """Instance name is the same as the class name."""
return self.__class__.__name__
diff --git a/tpot/config_classifier.py b/tpot/config_classifier.py
index b3595d77e..7fdcb6229 100644
--- a/tpot/config_classifier.py
+++ b/tpot/config_classifier.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -17,18 +16,11 @@
You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see .
-
-
-dictionary format (json-like format):
-key:
- operator name
-value:
- source: module source (e.g sklearn.tree)
- dependencies: depended module (e.g. ExtraTreesClassifier in selectors RFE); None for no dependency
- params: a dictionary of parameter names (keys) and parameter ranges (values); None for no dependency
"""
import numpy as np
+# Check the TPOT documentation for information on the structure of config dicts
+
classifier_config_dict = {
# Classifiers
@@ -168,22 +160,22 @@
'alpha': np.arange(0, 0.05, 0.001),
'score_func': {
'sklearn.feature_selection.f_classif': None
- } # read from dependencies ! need add an exception in preprocess_args
+ } # read from dependencies! TODO: add an exception in preprocess_args
},
'sklearn.feature_selection.SelectKBest': {
- 'k': range(1, 100), # need check range!
+ 'k': range(1, 100), # TODO: Check range
'score_func': {
'sklearn.feature_selection.f_classif': None
- }
+ }
},
'sklearn.feature_selection.SelectPercentile': {
'percentile': range(1, 100),
'score_func': {
'sklearn.feature_selection.f_classif': None
- }
+ }
},
'sklearn.feature_selection.VarianceThreshold': {
@@ -197,18 +189,18 @@
'n_estimators': [100],
'criterion': ['gini', 'entropy'],
'max_features': np.arange(0.05, 1.01, 0.05)
- }
+ }
}
},
- 'sklearn.feature_selection.SelectFromModel': {
+ 'sklearn.feature_selection.SelectFromModel': {
'threshold': np.arange(0, 1.01, 0.05),
'estimator': {
'sklearn.ensemble.ExtraTreesClassifier': {
'n_estimators': [100],
'criterion': ['gini', 'entropy'],
'max_features': np.arange(0.05, 1.01, 0.05)
- }
+ }
}
}
diff --git a/tpot/config_classifier_light.py b/tpot/config_classifier_light.py
index d06115489..a4006d567 100644
--- a/tpot/config_classifier_light.py
+++ b/tpot/config_classifier_light.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -17,18 +16,11 @@
You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
-
-
-dictionary format (json-like format):
-key:
- operator name
-value:
- source: module source (e.g sklearn.tree)
- dependencies: depended module (e.g. ExtraTreesClassifier in selectors RFE); None for no dependency
- params: a dictionary of parameter names (keys) and parameter ranges (values); None for no dependency
"""
+
import numpy as np
+# Check the TPOT documentation for information on the structure of config dicts
classifier_config_dict_light = {
# Classifiers
@@ -109,15 +101,15 @@
'alpha': np.arange(0, 0.05, 0.001),
'score_func': {
'sklearn.feature_selection.f_classif': None
- } # read from dependencies ! need add an exception in preprocess_args
+            }  # Read from dependencies; TODO: add an exception in preprocess_args
},
'sklearn.feature_selection.SelectKBest': {
- 'k': range(1, 100), # need check range!
+ 'k': range(1, 100), # TODO: Check range
'score_func': {
'sklearn.feature_selection.f_classif': None
- }
+ }
},
'sklearn.feature_selection.SelectPercentile': {
diff --git a/tpot/config_classifier_mdr.py b/tpot/config_classifier_mdr.py
index a90a9a702..8404cefdd 100644
--- a/tpot/config_classifier_mdr.py
+++ b/tpot/config_classifier_mdr.py
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -16,16 +15,10 @@
You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
-
-dictionary format (json-like format):
-key:
- operator name
-value:
- source: module source (e.g sklearn.tree)
- dependencies: depended module (e.g. SVC in selectors RFE); None for no dependency
- params: a dictionary of parameter names (keys) and parameter ranges (values); None for no dependency
"""
+# Check the TPOT documentation for information on the structure of config dicts
+
tpot_mdr_classifier_config_dict = {
# Classifiers
diff --git a/tpot/config_regressor.py b/tpot/config_regressor.py
index 99d21ce3b..7bf86c633 100644
--- a/tpot/config_regressor.py
+++ b/tpot/config_regressor.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -17,18 +16,11 @@
You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
-
-
-dictionary format (json-like format):
-key:
- operator name
-value:
- source: module source (e.g sklearn.tree)
- dependencies: depended module (e.g. ExtraTreesClassifier in selectors RFE); None for no dependency
- params: a dictionary of parameter names (keys) and parameter ranges (values); None for no params
"""
import numpy as np
+# Check the TPOT documentation for information on the structure of config dicts
+
regressor_config_dict = {
@@ -168,22 +160,22 @@
'alpha': np.arange(0, 0.05, 0.001),
'score_func': {
'sklearn.feature_selection.f_classif': None
- } # read from dependencies ! need add an exception in preprocess_args
+            }  # Read from dependencies; TODO: add an exception in preprocess_args
},
'sklearn.feature_selection.SelectKBest': {
- 'k': range(1, 100), # need check range!
+ 'k': range(1, 100), # TODO: Check range
'score_func': {
'sklearn.feature_selection.f_classif': None
- }
+ }
},
'sklearn.feature_selection.SelectPercentile': {
'percentile': range(1, 100),
'score_func': {
'sklearn.feature_selection.f_classif': None
- }
+ }
},
'sklearn.feature_selection.VarianceThreshold': {
@@ -193,12 +185,11 @@
'sklearn.feature_selection.SelectFromModel': {
'threshold': np.arange(0, 1.01, 0.05),
'estimator': {
- 'sklearn.ensemble.ExtraTreesRegressor': {
- 'n_estimators': [100],
- 'max_features': np.arange(0.05, 1.01, 0.05)
- }
- }
-
+ 'sklearn.ensemble.ExtraTreesRegressor': {
+ 'n_estimators': [100],
+ 'max_features': np.arange(0.05, 1.01, 0.05)
+ }
+ }
}
}
diff --git a/tpot/config_regressor_light.py b/tpot/config_regressor_light.py
index b0e8b73d8..3fd4c237b 100644
--- a/tpot/config_regressor_light.py
+++ b/tpot/config_regressor_light.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -17,20 +16,12 @@
You should have received a copy of the GNU Lesser General Public
License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
-
-
-dictionary format (json-like format):
-key:
- operator name
-value:
- source: module source (e.g sklearn.tree)
- dependencies: depended module (e.g. ExtraTreesClassifier in selectors RFE); None for no dependency
- params: a dictionary of parameter names (keys) and parameter ranges (values); None for no params
"""
import numpy as np
-regressor_config_dict_light = {
+# Check the TPOT documentation for information on the structure of config dicts
+regressor_config_dict_light = {
'sklearn.linear_model.ElasticNetCV': {
'l1_ratio': np.arange(0.0, 1.01, 0.05),
@@ -115,22 +106,22 @@
'alpha': np.arange(0, 0.05, 0.001),
'score_func': {
'sklearn.feature_selection.f_classif': None
- } # read from dependencies ! need add an exception in preprocess_args
+            }  # Read from dependencies; TODO: add an exception in preprocess_args
},
'sklearn.feature_selection.SelectKBest': {
- 'k': range(1, 100), # need check range!
+ 'k': range(1, 100), # TODO: Check range
'score_func': {
'sklearn.feature_selection.f_classif': None
- }
+ }
},
'sklearn.feature_selection.SelectPercentile': {
'percentile': range(1, 100),
'score_func': {
'sklearn.feature_selection.f_classif': None
- }
+ }
},
'sklearn.feature_selection.VarianceThreshold': {
diff --git a/tpot/decorators.py b/tpot/decorators.py
index fbe4ac09a..102b0c09a 100644
--- a/tpot/decorators.py
+++ b/tpot/decorators.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -26,6 +25,9 @@
from sklearn.datasets import make_classification, make_regression
from .export_utils import expr_to_tree, generate_pipeline_code
from deap import creator
+
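+# Maximum number of attempts the _pre_test decorator makes to generate a working pipeline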
+NUM_TESTS = 10
+
# generate a small data set for a new pipeline, in order to check if the pipeline
# has unsupported combinations in params
pretest_X, pretest_y = make_classification(n_samples=50, n_features=10, random_state=42)
@@ -33,35 +35,48 @@
def _pre_test(func):
- """Decorator that wraps functions to check if the pipeline works with a pretest data set
- If not, then rerun the func until it generates a good pipeline
+ """Check if the wrapped function works with a pretest data set.
+
+    Reruns the wrapped function until it generates a good pipeline, up to
+    NUM_TESTS times.
Parameters
----------
func: function
- The function being decorated
+ The decorated function.
Returns
-------
- wrapped_func: function
+ check_pipeline: function
A wrapper function around the func parameter
"""
@wraps(func)
def check_pipeline(self, *args, **kwargs):
bad_pipeline = True
- num_test = 0 # number of tests
- while bad_pipeline and num_test < 10: # a pool for workable pipeline
- # clone individual before each func call so it is not altered for the possible next cycle loop
+ num_test = 0 # number of tests
+
+        # Retry until a workable pipeline is generated
+ while bad_pipeline and num_test < NUM_TESTS:
+ # clone individual before each func call so it is not altered for
+ # the possible next cycle loop
args = [self._toolbox.clone(arg) if isinstance(arg, creator.Individual) else arg for arg in args]
+
try:
with warnings.catch_warnings():
warnings.simplefilter('ignore')
+
expr = func(self, *args, **kwargs)
- # mutation operator returns tuple (ind,); crossover operator returns tuple (ind1, ind2)
+ # mutation operator returns tuple (ind,); crossover operator
+ # returns tuple of (ind1, ind2)
expr_tuple = expr if isinstance(expr, tuple) else (expr,)
+
for expr_test in expr_tuple:
- #print(num_test, generate_pipeline_code(expr_to_tree(expr), self.operators)) # debug
- sklearn_pipeline = eval(generate_pipeline_code(expr_to_tree(expr_test, self._pset), self.operators), self.operators_context)
+ pipeline_code = generate_pipeline_code(
+ expr_to_tree(expr_test, self._pset),
+ self.operators
+ )
+ sklearn_pipeline = eval(pipeline_code, self.operators_context)
+
if self.classification:
sklearn_pipeline.fit(pretest_X, pretest_y)
else:
@@ -69,11 +84,16 @@ def check_pipeline(self, *args, **kwargs):
bad_pipeline = False
except BaseException as e:
if self.verbosity > 2:
- print_function = print
+ message = '_pre_test decorator: {fname}: num_test={n} {e}'.format(
+ n=num_test,
+ fname=func.__name__,
+ e=e
+ )
# Use the pbar output stream if it's active
if not isinstance(self._pbar, type(None)):
- print_function = self._pbar.write
- print_function('_pre_test decorator: {fname}: num_test={n} {e}'.format(n=num_test, fname=func.__name__, e=e))
+ self._pbar.write(message)
+ else:
+ print(message)
finally:
num_test += 1
diff --git a/tpot/driver.py b/tpot/driver.py
old mode 100644
new mode 100755
index 69c63cbb8..d1e5fb176
--- a/tpot/driver.py
+++ b/tpot/driver.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -29,7 +28,7 @@
def positive_integer(value):
- """Ensures that the provided value is a positive integer. Throws an exception otherwise.
+ """Ensure that the provided value is a positive integer.
Parameters
----------
@@ -51,7 +50,7 @@ def positive_integer(value):
def float_range(value):
- """Ensures that the provided value is a float integer in the range [0., 1.]. Throws an exception otherwise.
+ """Ensure that the provided value is a float integer in the range [0., 1.].
Parameters
----------
@@ -65,176 +64,387 @@ def float_range(value):
"""
try:
value = float(value)
- except:
+ except Exception:
raise argparse.ArgumentTypeError('Invalid float value: \'{}\''.format(value))
if value < 0.0 or value > 1.0:
raise argparse.ArgumentTypeError('Invalid float value: \'{}\''.format(value))
return value
-def main():
- """Main function that is called when TPOT is run on the command line"""
- parser = argparse.ArgumentParser(description='A Python tool that '
- 'automatically creates and optimizes machine learning pipelines using '
- 'genetic programming.', add_help=False)
-
- parser.add_argument('INPUT_FILE', type=str, help='Data file to use in the TPOT '
- 'optimization process. Ensure that the class label column is labeled as "class".')
-
- parser.add_argument('-h', '--help', action='help',
- help='Show this help message and exit.')
-
- parser.add_argument('-is', action='store', dest='INPUT_SEPARATOR', default='\t',
- type=str, help='Character used to separate columns in the input file.')
-
- parser.add_argument('-target', action='store', dest='TARGET_NAME', default='class',
- type=str, help='Name of the target column in the input file.')
-
- parser.add_argument('-mode', action='store', dest='TPOT_MODE',
- choices=['classification', 'regression'], default='classification', type=str,
- help='Whether TPOT is being used for a supervised classification or regression problem.')
-
- parser.add_argument('-o', action='store', dest='OUTPUT_FILE', default='',
- type=str, help='File to export the code for the final optimized pipeline.')
-
- parser.add_argument('-g', action='store', dest='GENERATIONS', default=100,
- type=positive_integer, help='Number of iterations to run the pipeline optimization process.\n'
- 'Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. '
- 'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.')
-
- parser.add_argument('-p', action='store', dest='POPULATION_SIZE', default=100,
- type=positive_integer, help='Number of individuals to retain in the GP population every generation.\n'
- 'Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. '
- 'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.')
-
- parser.add_argument('-os', action='store', dest='OFFSPRING_SIZE', default=None,
- type=positive_integer, help='Number of offspring to produce in each GP generation. '
- 'By default, OFFSPRING_SIZE = POPULATION_SIZE.')
-
- parser.add_argument('-mr', action='store', dest='MUTATION_RATE', default=0.9,
- type=float_range, help='GP mutation rate in the range [0.0, 1.0]. This tells the '
- 'GP algorithm how many pipelines to apply random changes to every generation. '
- 'We recommend using the default parameter unless you understand how the mutation '
- 'rate affects GP algorithms.')
-
- parser.add_argument('-xr', action='store', dest='CROSSOVER_RATE', default=0.1,
- type=float_range, help='GP crossover rate in the range [0.0, 1.0]. This tells the '
- 'GP algorithm how many pipelines to "breed" every generation. '
- 'We recommend using the default parameter unless you understand how the crossover '
- 'rate affects GP algorithms.')
-
- parser.add_argument('-scoring', action='store', dest='SCORING_FN', default=None,
- type=str, help='Function used to evaluate the quality of a given pipeline for '
- 'the problem. By default, accuracy is used for classification problems and mean '
- 'squared error (mse) is used for regression problems. '
- 'TPOT assumes that any function with "error" or "loss" in the name is meant to '
- 'be minimized, whereas any other functions will be maximized. '
- 'Offers the same options as cross_val_score: '
- '"accuracy", "adjusted_rand_score", "average_precision", "f1", "f1_macro", '
- '"f1_micro", "f1_samples", "f1_weighted", "log_loss", "mean_absolute_error", '
- '"mean_squared_error", "median_absolute_error", "precision", "precision_macro", '
- '"precision_micro", "precision_samples", "precision_weighted", "r2", "recall", '
- '"recall_macro", "recall_micro", "recall_samples", "recall_weighted", "roc_auc"')
-
- parser.add_argument('-cv', action='store', dest='CV', default=5,
- type=int, help='Number of folds to evaluate each pipeline over in '
- 'k-fold cross-validation during the TPOT optimization process.')
-
- parser.add_argument('-njobs', action='store', dest='NUM_JOBS', default=1,
- type=int, help='Number of CPUs for evaluating pipelines in parallel '
- ' during the TPOT optimization process. Assigning this to -1 will use as many '
- 'cores as available on the computer.')
-
- parser.add_argument('-maxtime', action='store', dest='MAX_TIME_MINS', default=None,
- type=int, help='How many minutes TPOT has to optimize the pipeline. This '
- 'setting will override the GENERATIONS parameter '
- 'and allow TPOT to run until it runs out of time.')
-
- parser.add_argument('-maxeval', action='store', dest='MAX_EVAL_MINS', default=5,
- type=float, help='How many minutes TPOT has to evaluate a single pipeline. '
- 'Setting this parameter to higher values will allow TPOT to explore more complex '
- 'pipelines but will also allow TPOT to run longer.')
-
- parser.add_argument('-s', action='store', dest='RANDOM_STATE', default=None,
- type=int, help='Random number generator seed for reproducibility. Set '
- 'this seed if you want your TPOT run to be reproducible with the same '
- 'seed and data set in the future.')
-
- parser.add_argument('-config', action='store', dest='CONFIG_FILE', default='',
- type=str, help='Configuration file for customizing the operators and parameters '
- 'that TPOT uses in the optimization process.')
-
- parser.add_argument('-v', action='store', dest='VERBOSITY', default=1,
- choices=[0, 1, 2, 3], type=int, help='How much information TPOT communicates '
- 'while it is running: 0 = none, 1 = minimal, 2 = high, 3 = all. '
- 'A setting of 2 or higher will add a progress bar during the optimization procedure.')
-
- parser.add_argument('--no-update-check', action='store_true',
- dest='DISABLE_UPDATE_CHECK', default=False,
- help='Flag indicating whether the TPOT version checker should be disabled.')
-
- parser.add_argument('--version', action='version',
+def _get_arg_parser():
+ """Main function that is called when TPOT is run on the command line."""
+ parser = argparse.ArgumentParser(
+ description=(
+ 'A Python tool that automatically creates and optimizes machine '
+ 'learning pipelines using genetic programming.'
+ ),
+ add_help=False
+ )
+
+ parser.add_argument(
+ 'INPUT_FILE',
+ type=str,
+ help=(
+ 'Data file to use in the TPOT optimization process. Ensure that '
+ 'the class label column is labeled as "class".'
+ )
+ )
+
+ parser.add_argument(
+ '-h',
+ '--help',
+ action='help',
+ help='Show this help message and exit.'
+ )
+
+ parser.add_argument(
+ '-is',
+ action='store',
+ dest='INPUT_SEPARATOR',
+ default='\t',
+ type=str,
+ help='Character used to separate columns in the input file.'
+ )
+
+ parser.add_argument(
+ '-target',
+ action='store',
+ dest='TARGET_NAME',
+ default='class',
+ type=str,
+ help='Name of the target column in the input file.'
+ )
+
+ parser.add_argument(
+ '-mode',
+ action='store',
+ dest='TPOT_MODE',
+ choices=['classification', 'regression'],
+ default='classification',
+ type=str,
+ help=(
+ 'Whether TPOT is being used for a supervised classification or '
+ 'regression problem.'
+ )
+ )
+
+ parser.add_argument(
+ '-o',
+ action='store',
+ dest='OUTPUT_FILE',
+ default='',
+ type=str,
+ help='File to export the code for the final optimized pipeline.'
+ )
+
+ parser.add_argument(
+ '-g',
+ action='store',
+ dest='GENERATIONS',
+ default=100,
+ type=positive_integer,
+ help=(
+ 'Number of iterations to run the pipeline optimization process. '
+ 'Generally, TPOT will work better when you give it more '
+ 'generations (and therefore time) to optimize the pipeline. TPOT '
+ 'will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE '
+ 'pipelines in total.'
+ )
+ )
+
+ parser.add_argument(
+ '-p',
+ action='store',
+ dest='POPULATION_SIZE',
+ default=100,
+ type=positive_integer,
+ help=(
+ 'Number of individuals to retain in the GP population every '
+ 'generation. Generally, TPOT will work better when you give it '
+ 'more individuals (and therefore time) to optimize the pipeline. '
+ 'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE '
+ 'pipelines in total.'
+ )
+ )
+
+ parser.add_argument(
+ '-os',
+ action='store',
+ dest='OFFSPRING_SIZE',
+ default=None,
+ type=positive_integer,
+ help=(
+            'Number of offspring to produce in each GP generation. By default, '
+ 'OFFSPRING_SIZE = POPULATION_SIZE.'
+ )
+ )
+
+ parser.add_argument(
+ '-mr',
+ action='store',
+ dest='MUTATION_RATE',
+ default=0.9,
+ type=float_range,
+ help=(
+ 'GP mutation rate in the range [0.0, 1.0]. This tells the GP '
+ 'algorithm how many pipelines to apply random changes to every '
+ 'generation. We recommend using the default parameter unless you '
+ 'understand how the mutation rate affects GP algorithms.'
+ )
+ )
+
+ parser.add_argument(
+ '-xr',
+ action='store',
+ dest='CROSSOVER_RATE',
+ default=0.1,
+ type=float_range,
+ help=(
+ 'GP crossover rate in the range [0.0, 1.0]. This tells the GP '
+ 'algorithm how many pipelines to "breed" every generation. We '
+ 'recommend using the default parameter unless you understand how '
+ 'the crossover rate affects GP algorithms.'
+ )
+ )
+
+ parser.add_argument(
+ '-scoring',
+ action='store',
+ dest='SCORING_FN',
+ default=None,
+ type=str,
+ help=(
+ 'Function used to evaluate the quality of a given pipeline for the '
+ 'problem. By default, accuracy is used for classification problems '
+ 'and mean squared error (mse) is used for regression problems. '
+ 'TPOT assumes that any function with "error" or "loss" in the name '
+ 'is meant to be minimized, whereas any other functions will be '
+ 'maximized. Offers the same options as cross_val_score: '
+ 'accuracy, '
+ 'adjusted_rand_score, '
+ 'average_precision, '
+ 'f1, '
+ 'f1_macro, '
+ 'f1_micro, '
+ 'f1_samples, '
+ 'f1_weighted, '
+ 'log_loss, '
+ 'mean_absolute_error, '
+ 'mean_squared_error, '
+ 'median_absolute_error, '
+ 'precision, '
+ 'precision_macro, '
+ 'precision_micro, '
+ 'precision_samples, '
+ 'precision_weighted, '
+ 'r2, '
+ 'recall, '
+ 'recall_macro, '
+ 'recall_micro, '
+ 'recall_samples, '
+ 'recall_weighted, '
+ 'roc_auc'
+ )
+ )
+
+ parser.add_argument(
+ '-cv',
+ action='store',
+ dest='NUM_CV_FOLDS',
+ default=5,
+ type=int,
+ help=(
+ 'Number of folds to evaluate each pipeline over in k-fold '
+ 'cross-validation during the TPOT optimization process.'
+ )
+ )
+
+ parser.add_argument(
+ '-njobs',
+ action='store',
+ dest='NUM_JOBS',
+ default=1,
+ type=int,
+ help=(
+ 'Number of CPUs for evaluating pipelines in parallel during the '
+ 'TPOT optimization process. Assigning this to -1 will use as many '
+ 'cores as available on the computer.'
+ )
+ )
+
+ parser.add_argument(
+ '-maxtime',
+ action='store',
+ dest='MAX_TIME_MINS',
+ default=None,
+ type=int,
+ help=(
+ 'How many minutes TPOT has to optimize the pipeline. This setting '
+ 'will override the GENERATIONS parameter and allow TPOT to run '
+ 'until it runs out of time.'
+ )
+ )
+
+ parser.add_argument(
+ '-maxeval',
+ action='store',
+ dest='MAX_EVAL_MINS',
+ default=5,
+ type=float,
+ help=(
+ 'How many minutes TPOT has to evaluate a single pipeline. Setting '
+ 'this parameter to higher values will allow TPOT to explore more '
+ 'complex pipelines but will also allow TPOT to run longer.'
+ )
+ )
+
+ parser.add_argument(
+ '-s',
+ action='store',
+ dest='RANDOM_STATE',
+ default=None,
+ type=int,
+ help=(
+ 'Random number generator seed for reproducibility. Set this seed '
+ 'if you want your TPOT run to be reproducible with the same seed '
+ 'and data set in the future.'
+ )
+ )
+
+ parser.add_argument(
+ '-config',
+ action='store',
+ dest='CONFIG_FILE',
+ default='',
+ type=str,
+ help=(
+ 'Configuration file for customizing the operators and parameters '
+ 'that TPOT uses in the optimization process. Must be a python '
+ 'module containing a dict export named "tpot_config".'
+ )
+ )
+
+ parser.add_argument(
+ '-v',
+ action='store',
+ dest='VERBOSITY',
+ default=1,
+ choices=[0, 1, 2, 3],
+ type=int,
+ help=(
+ 'How much information TPOT communicates while it is running: '
+ '0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or '
+ 'higher will add a progress bar during the optimization procedure.'
+ )
+ )
+
+ parser.add_argument(
+ '--no-update-check',
+ action='store_true',
+ dest='DISABLE_UPDATE_CHECK',
+ default=False,
+ help='Flag indicating whether the TPOT version checker should be disabled.'
+ )
+
+ parser.add_argument(
+ '--version',
+ action='version',
version='TPOT {version}'.format(version=__version__),
- help='Show the TPOT version number and exit.')
+ help='Show the TPOT version number and exit.'
+ )
+
+ return parser
+
+
+def _print_args(args):
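+    """Print the TPOT settings, resolving defaults that depend on other arguments."""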
+ print('\nTPOT settings:')
+
+    for arg, arg_val in sorted(args.__dict__.items()):
+ if arg == 'DISABLE_UPDATE_CHECK':
+ continue
+ elif arg == 'SCORING_FN' and arg_val is None:
+ if args.TPOT_MODE == 'classification':
+ arg_val = 'accuracy'
+ else:
+ arg_val = 'mean_squared_error'
+ elif arg == 'OFFSPRING_SIZE' and arg_val is None:
+ arg_val = args.__dict__['POPULATION_SIZE']
+ print('{}\t=\t{}'.format(arg, arg_val))
+ print('')
- args = parser.parse_args()
- if args.VERBOSITY >= 2:
- print('\nTPOT settings:')
- for arg in sorted(args.__dict__):
- arg_val = args.__dict__[arg]
- if arg == 'DISABLE_UPDATE_CHECK':
- continue
- elif arg == 'SCORING_FN' and arg_val is None:
- if args.TPOT_MODE == 'classification':
- arg_val = 'accuracy'
- else:
- arg_val = 'mean_squared_error'
- elif arg == 'OFFSPRING_SIZE' and arg_val is None:
- arg_val = args.__dict__['POPULATION_SIZE']
- print('{}\t=\t{}'.format(arg, arg_val))
- print('')
-
- input_data = np.recfromcsv(args.INPUT_FILE, delimiter=args.INPUT_SEPARATOR, dtype=np.float64, case_sensitive=True)
+def _read_data_file(args):
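+    """Read the input CSV into a NumPy record array, checking for the target column."""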
+ input_data = np.recfromcsv(
+ args.INPUT_FILE,
+ delimiter=args.INPUT_SEPARATOR,
+ dtype=np.float64,
+ case_sensitive=True
+ )
+
if args.TARGET_NAME not in input_data.dtype.names:
- raise ValueError('The provided data file does not seem to have a target column. '
- 'Please make sure to specify the target column using the -target parameter.')
+ raise ValueError(
+ 'The provided data file does not seem to have a target column. '
+ 'Please make sure to specify the target column using the -target '
+ 'parameter.'
+ )
- features = np.delete(input_data.view(np.float64).reshape(input_data.size, -1),
- input_data.dtype.names.index(args.TARGET_NAME), axis=1)
+ return input_data
- training_features, testing_features, training_classes, testing_classes = \
- train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE)
- if args.TPOT_MODE == 'classification':
- tpot_type = TPOTClassifier
- else:
- tpot_type = TPOTRegressor
+def main():
+ """Perform a TPOT run."""
+ args = _get_arg_parser().parse_args()
+
+ if args.VERBOSITY >= 2:
+ _print_args(args)
- tpot = tpot_type(generations=args.GENERATIONS, population_size=args.POPULATION_SIZE,
- offspring_size=args.OFFSPRING_SIZE, mutation_rate=args.MUTATION_RATE, crossover_rate=args.CROSSOVER_RATE,
- cv=args.CV, n_jobs=args.NUM_JOBS, scoring=args.SCORING_FN,
- max_time_mins=args.MAX_TIME_MINS, max_eval_time_mins=args.MAX_EVAL_MINS,
- random_state=args.RANDOM_STATE, config_dict=args.CONFIG_FILE,
- verbosity=args.VERBOSITY, disable_update_check=args.DISABLE_UPDATE_CHECK)
+ input_data = _read_data_file(args)
+ features = np.delete(
+ input_data.view(np.float64).reshape(input_data.size, -1),
+ input_data.dtype.names.index(args.TARGET_NAME),
+ axis=1
+ )
+
+ training_features, testing_features, training_classes, testing_classes = \
+ train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE)
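+
+    # Choose the estimator class that matches the problem type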
+ tpot_type = TPOTClassifier if args.TPOT_MODE == 'classification' else TPOTRegressor
+ tpot = tpot_type(
+ generations=args.GENERATIONS,
+ population_size=args.POPULATION_SIZE,
+ offspring_size=args.OFFSPRING_SIZE,
+ mutation_rate=args.MUTATION_RATE,
+ crossover_rate=args.CROSSOVER_RATE,
+ cv=args.NUM_CV_FOLDS,
+ n_jobs=args.NUM_JOBS,
+ scoring=args.SCORING_FN,
+ max_time_mins=args.MAX_TIME_MINS,
+ max_eval_time_mins=args.MAX_EVAL_MINS,
+ random_state=args.RANDOM_STATE,
+ config_dict=args.CONFIG_FILE,
+ verbosity=args.VERBOSITY,
+ disable_update_check=args.DISABLE_UPDATE_CHECK
+ )
print('')
tpot.fit(training_features, training_classes)
if args.VERBOSITY in [1, 2] and tpot._optimized_pipeline:
- training_score = max([tpot._pareto_front.keys[x].wvalues[1] for x in range(len(tpot._pareto_front.keys))])
+ training_score = max([x.wvalues[1] for x in tpot._pareto_front.keys])
print('\nTraining score: {}'.format(abs(training_score)))
print('Holdout score: {}'.format(tpot.score(testing_features, testing_classes)))
elif args.VERBOSITY >= 3 and tpot._pareto_front:
print('Final Pareto front testing scores:')
-
- for pipeline, pipeline_scores in zip(tpot._pareto_front.items, reversed(tpot._pareto_front.keys)):
+ pipelines = zip(tpot._pareto_front.items, reversed(tpot._pareto_front.keys))
+ for pipeline, pipeline_scores in pipelines:
tpot._fitted_pipeline = tpot._pareto_front_fitted_pipelines[str(pipeline)]
- print('{}\t{}\t{}'.format(int(abs(pipeline_scores.wvalues[0])),
- tpot.score(testing_features, testing_classes),
- pipeline))
+ print('{TRAIN_SCORE}\t{TEST_SCORE}\t{PIPELINE}'.format(
+ TRAIN_SCORE=int(abs(pipeline_scores.wvalues[0])),
+ TEST_SCORE=tpot.score(testing_features, testing_classes),
+ PIPELINE=pipeline
+ )
+ )
if args.OUTPUT_FILE != '':
tpot.export(args.OUTPUT_FILE)
diff --git a/tpot/export_utils.py b/tpot/export_utils.py
index 19c63e12e..f5a98cf79 100644
--- a/tpot/export_utils.py
+++ b/tpot/export_utils.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -22,8 +21,9 @@
import deap
+
def get_by_name(opname, operators):
- """Returns operator class instance by name
+ """Return operator class instance by name.
Parameters
----------
@@ -39,16 +39,20 @@ def get_by_name(opname, operators):
"""
ret_op_classes = [op for op in operators if op.__name__ == opname]
+
if len(ret_op_classes) == 0:
         raise TypeError('Could not find operator {} in the operator dictionary'.format(opname))
elif len(ret_op_classes) > 1:
- print('Found multiple operator {} in operator dictionary'.format(opname),
- 'Please check your dictionary file.')
+ print(
+            'Found multiple operators named {} in the operator dictionary. '
+            'Please check your dictionary file.'.format(opname)
+ )
ret_op_class = ret_op_classes[0]
return ret_op_class
+
def export_pipeline(exported_pipeline, operators, pset):
- """Generates the source code of a TPOT Pipeline
+ """Generate source code for a TPOT Pipeline.
Parameters
----------
@@ -76,7 +80,7 @@ def export_pipeline(exported_pipeline, operators, pset):
def expr_to_tree(ind, pset):
- """Convert the unstructured DEAP pipeline into a tree data-structure
+ """Convert the unstructured DEAP pipeline into a tree data-structure.
Parameters
----------
@@ -98,9 +102,9 @@ def expr_to_tree(ind, pset):
def prim_to_list(prim, args):
if isinstance(prim, deap.gp.Terminal):
if prim.name in pset.context:
- return pset.context[prim.name]
+ return pset.context[prim.name]
else:
- return prim.value
+ return prim.value
return [prim.name] + args
@@ -119,7 +123,7 @@ def prim_to_list(prim, args):
def generate_import_code(pipeline, operators):
- """Generate all library import calls for use in TPOT.export()
+ """Generate all library import calls for use in TPOT.export().
Parameters
----------
@@ -135,19 +139,50 @@ def generate_import_code(pipeline, operators):
optimized pipeline
"""
- # operator[1] is the name of the operator
- operators_used = [x.name for x in pipeline if isinstance(x, deap.gp.Primitive)]
+ def merge_imports(old_dict, new_dict):
+ # Key is a module name
+ for key in new_dict.keys():
+ if key in old_dict.keys():
+ # Union imports from the same module
+ old_dict[key] = set(old_dict[key]) | set(new_dict[key])
+ else:
+ old_dict[key] = set(new_dict[key])
+ operators_used = [x.name for x in pipeline if isinstance(x, deap.gp.Primitive)]
pipeline_text = 'import numpy as np\n\n'
+ pipeline_imports = _starting_imports(pipeline, operators, operators_used)
+
+    # Build dict of import requirements from list of operators
+ import_relations = {op.__name__: op.import_hash for op in operators}
+
+ # Build import dict from operators used
+ for op in operators_used:
+ try:
+ operator_import = import_relations[op]
+ merge_imports(pipeline_imports, operator_import)
+ except KeyError:
+ pass # Operator does not require imports
+
+ # Build import string
+ for key in sorted(pipeline_imports.keys()):
+ module_list = ', '.join(sorted(pipeline_imports[key]))
+ pipeline_text += 'from {} import {}\n'.format(key, module_list)
+
+ pipeline_text += """
+# NOTE: Make sure that the class is labeled 'class' in the data file
+tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
+features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
+training_features, testing_features, training_classes, testing_classes = \\
+ train_test_split(features, tpot_data['class'], random_state=42)
+"""
+
+ return pipeline_text
+
+def _starting_imports(pipeline, operators, operators_used):
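+    """Return the base imports every exported pipeline needs, based on the operators used."""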
# number of operators
num_op = len(operators_used)
- # Build dict of import requirments from list of operators
- import_relations = {}
- for op in operators:
- import_relations[op.__name__] = op.import_hash
-
# number of classifier/regressor or CombineDFs
num_op_root = 0
for op in operators_used:
@@ -158,9 +193,8 @@ def generate_import_code(pipeline, operators):
else:
num_op_root += 1
- # Always start with these imports
if num_op_root > 1:
- pipeline_imports = {
+ return {
'sklearn.model_selection': ['train_test_split'],
'sklearn.pipeline': ['make_pipeline', 'make_union'],
'sklearn.preprocessing': ['FunctionTransformer'],
@@ -168,50 +202,19 @@ def generate_import_code(pipeline, operators):
'copy': ['copy']
}
elif num_op > 1:
- pipeline_imports = {
+ return {
'sklearn.model_selection': ['train_test_split'],
'sklearn.pipeline': ['make_pipeline']
}
- else: # if operators # == 1 and classifier/regressor # == 1, this import statement is simpler
- pipeline_imports = {
+    # If there is only one operator and one classifier/regressor, the import statement is simpler
+ else:
+ return {
'sklearn.model_selection': ['train_test_split']
}
- # Build import dict from operators used
- for op in operators_used:
- def merge_imports(old_dict, new_dict):
- # Key is a module name
- for key in new_dict.keys():
- if key in old_dict.keys():
- # Union imports from the same module
- old_dict[key] = set(old_dict[key]) | set(new_dict[key])
- else:
- old_dict[key] = set(new_dict[key])
-
- try:
- operator_import = import_relations[op]
- merge_imports(pipeline_imports, operator_import)
- except KeyError:
- pass # Operator does not require imports
-
- # Build import string
- for key in sorted(pipeline_imports.keys()):
- module_list = ', '.join(sorted(pipeline_imports[key]))
- pipeline_text += 'from {} import {}\n'.format(key, module_list)
-
- pipeline_text += """
-# NOTE: Make sure that the class is labeled 'class' in the data file
-tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
-features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
-training_features, testing_features, training_classes, testing_classes = \\
- train_test_split(features, tpot_data['class'], random_state=42)
-"""
-
- return pipeline_text
-
def pipeline_code_wrapper(pipeline_code):
- """Generate code specific to the execution of the sklearn pipeline
+ """Generate code specific to the execution of the sklearn pipeline.
Parameters
----------
@@ -232,7 +235,7 @@ def pipeline_code_wrapper(pipeline_code):
def generate_pipeline_code(pipeline_tree, operators):
- """Generate code specific to the construction of the sklearn Pipeline
+ """Generate code specific to the construction of the sklearn Pipeline.
Parameters
----------
@@ -244,12 +247,13 @@ def generate_pipeline_code(pipeline_tree, operators):
Source code for the sklearn pipeline
"""
- steps = process_operator(pipeline_tree, operators)
+ steps = _process_operator(pipeline_tree, operators)
pipeline_text = "make_pipeline(\n{STEPS}\n)".format(STEPS=_indent(",\n".join(steps), 4))
return pipeline_text
+
def generate_export_pipeline_code(pipeline_tree, operators):
- """Generate code specific to the construction of the sklearn Pipeline for export_pipeline
+ """Generate code specific to the construction of the sklearn Pipeline for export_pipeline.
Parameters
----------
@@ -261,17 +265,19 @@ def generate_export_pipeline_code(pipeline_tree, operators):
Source code for the sklearn pipeline
"""
- steps = process_operator(pipeline_tree, operators)
+ steps = _process_operator(pipeline_tree, operators)
# number of steps in a pipeline
num_step = len(steps)
if num_step > 1:
pipeline_text = "make_pipeline(\n{STEPS}\n)".format(STEPS=_indent(",\n".join(steps), 4))
- else: # only one operator (root = True)
- pipeline_text = "{STEPS}".format(STEPS=_indent(",\n".join(steps), 0))
+ # only one operator (root = True)
+ else:
+ pipeline_text = "{STEPS}".format(STEPS=_indent(",\n".join(steps), 0))
return pipeline_text
-def process_operator(operator, operators, depth=0):
+
+def _process_operator(operator, operators, depth=0):
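+    """Recursively convert a pipeline tree into a list of sklearn pipeline step strings."""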
steps = []
op_name = operator[0]
@@ -284,7 +290,7 @@ def process_operator(operator, operators, depth=0):
tpot_op = get_by_name(op_name, operators)
if input_name != 'input_matrix':
- steps.extend(process_operator(input_name, operators, depth + 1))
+ steps.extend(_process_operator(input_name, operators, depth + 1))
# If the step is an estimator and is not the last step then we must
# add its guess as a synthetic feature
@@ -299,7 +305,7 @@ def process_operator(operator, operators, depth=0):
def _indent(text, amount):
- """Indent a multiline string by some number of spaces
+ """Indent a multiline string by some number of spaces.
Parameters
----------
@@ -329,9 +335,9 @@ def _make_branch(branch):
if tpot_op.root:
return """make_union(VotingClassifier([('branch',
{}
-)]), FunctionTransformer(copy))""".format(_indent(process_operator(branch, operators)[0], 4))
+)]), FunctionTransformer(copy))""".format(_indent(_process_operator(branch, operators)[0], 4))
else:
- return process_operator(branch, operators)[0]
+ return _process_operator(branch, operators)[0]
else: # We're going to have to make a pipeline
tpot_op = get_by_name(branch[0], operators)
diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py
index 8714a16de..ef3ac4385 100644
--- a/tpot/gp_deap.py
+++ b/tpot/gp_deap.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -35,6 +34,10 @@
import warnings
import threading
+# Maximum number of attempts to generate a different individual via crossover/mutation
+MAX_MUT_LOOPS = 50
+
+
def varOr(population, toolbox, lambda_, cxpb, mutpb):
"""Part of an evolutionary algorithm applying only the variation part
(crossover, mutation **or** reproduction). The modified individuals have
@@ -70,15 +73,15 @@ def varOr(population, toolbox, lambda_, cxpb, mutpb):
offspring = []
for _ in range(lambda_):
op_choice = np.random.random()
- if op_choice < cxpb: # Apply crossover
- idxs = np.random.randint(0, len(population),size=2)
+ if op_choice < cxpb: # Apply crossover
+ idxs = np.random.randint(0, len(population), size=2)
ind1, ind2 = toolbox.clone(population[idxs[0]]), toolbox.clone(population[idxs[1]])
ind_str = str(ind1)
num_loop = 0
- while ind_str == str(ind1) and num_loop < 50 : # 50 loops at most to generate a different individual by crossover
+ while ind_str == str(ind1) and num_loop < MAX_MUT_LOOPS:
ind1, ind2 = toolbox.mate(ind1, ind2)
num_loop += 1
- if ind_str != str(ind1): # check if crossover happened
+ if ind_str != str(ind1): # check if crossover happened
del ind1.fitness.values
offspring.append(ind1)
elif op_choice < cxpb + mutpb: # Apply mutation
@@ -86,20 +89,21 @@ def varOr(population, toolbox, lambda_, cxpb, mutpb):
ind = toolbox.clone(population[idx])
ind_str = str(ind)
num_loop = 0
- while ind_str == str(ind) and num_loop < 50 : # 50 loops at most to generate a different individual by mutation
+ while ind_str == str(ind) and num_loop < MAX_MUT_LOOPS:
ind, = toolbox.mutate(ind)
num_loop += 1
- if ind_str != str(ind): # check if mutation happened
+ if ind_str != str(ind): # check if mutation happened
del ind.fitness.values
offspring.append(ind)
- else: # Apply reproduction
+ else: # Apply reproduction
idx = np.random.randint(0, len(population))
offspring.append(toolbox.clone(population[idx]))
return offspring
+
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
- stats=None, halloffame=None, verbose=0, max_time_mins = None):
+ stats=None, halloffame=None, verbose=0, max_time_mins=None):
"""This is the :math:`(\mu + \lambda)` evolutionary algorithm.
:param population: A list of individuals.
:param toolbox: A :class:`~deap.base.Toolbox` that contains the evolution
@@ -177,7 +181,6 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
for ind, fit in zip(invalid_ind, fitnesses):
ind.fitness.values = fit
-
# Update the hall of fame with the generated individuals
if halloffame is not None:
halloffame.update(offspring)
@@ -196,9 +199,12 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar,
elif verbose == 3:
pbar.write('Generation {} - Current Pareto front scores:'.format(gen))
for pipeline, pipeline_scores in zip(halloffame.items, reversed(halloffame.keys)):
- pbar.write('{}\t{}\t{}'.format(int(abs(pipeline_scores.wvalues[0])),
- abs(pipeline_scores.wvalues[1]),
- pipeline))
+ pbar.write('{}\t{}\t{}'.format(
+ int(abs(pipeline_scores.wvalues[0])),
+ abs(pipeline_scores.wvalues[1]),
+ pipeline
+ )
+ )
pbar.write('')
# Update the statistics with the new population
@@ -235,7 +241,7 @@ def cxOnePoint(ind1, ind2):
types1[node.ret].append(idx)
common_types = []
for idx, node in enumerate(ind2[1:], 1):
- if node.ret in types1 and not node.ret in types2:
+ if node.ret in types1 and node.ret not in types2:
common_types.append(node.ret)
types2[node.ret].append(idx)
@@ -283,10 +289,11 @@ def mutNodeReplacement(individual, pset):
# find next primitive if any
rindex = None
if index + 1 < len(individual):
- for i, tmpnode in enumerate(individual[index+1:], index+ 1):
+ for i, tmpnode in enumerate(individual[index + 1:], index + 1):
if isinstance(tmpnode, gp.Primitive) and tmpnode.ret in tmpnode.args:
rindex = i
- #pset.primitives[node.ret] can get a list of the type of node
+
+    # pset.primitives[node.ret] gives the list of primitives matching the node's return type
     # for example: if op.root is True then the node.ret is the Output_Array object
     # based on the function _setup_pset. Then primitives is the list of classifiers or regressors
primitives = pset.primitives[node.ret]
@@ -323,9 +330,11 @@ def __init__(self, *args, **kwargs):
self.result = -float('inf')
self._stopevent = threading.Event()
self.daemon = True
+
def stop(self):
self._stopevent.set()
threading.Thread.join(self)
+
def run(self):
# Note: changed name of the thread to "MainThread" to avoid such warning from joblib (maybe bugs)
# Note: Need attention if using parallel execution model of scikit-learn
@@ -337,20 +346,29 @@ def run(self):
except Exception as e:
pass
+
def _wrapped_cross_val_score(sklearn_pipeline, features, classes,
cv, scoring_function, sample_weight, max_eval_time_mins):
- #sys.tracebacklimit = 0
max_time_seconds = max(int(max_eval_time_mins * 60), 1)
sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)
# build a job for cross_val_score
- tmp_it = Interruptable_cross_val_score(clone(sklearn_pipeline), features, classes,
- scoring=scoring_function, cv=cv, n_jobs=1, verbose=0, fit_params=sample_weight_dict)
+ tmp_it = Interruptable_cross_val_score(
+ clone(sklearn_pipeline),
+ features,
+ classes,
+ scoring=scoring_function,
+ cv=cv,
+ n_jobs=1,
+ verbose=0,
+ fit_params=sample_weight_dict
+ )
tmp_it.start()
tmp_it.join(max_time_seconds)
+
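+    # A thread that is still alive after the timed join means the evaluation timed out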
if tmp_it.isAlive():
resulting_score = 'Timeout'
else:
resulting_score = np.mean(tmp_it.result)
- #sys.tracebacklimit = 1000
+
tmp_it.stop()
return resulting_score
diff --git a/tpot/gp_types.py b/tpot/gp_types.py
index 8b2b6608c..8cf9a44d9 100644
--- a/tpot/gp_types.py
+++ b/tpot/gp_types.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -20,8 +19,8 @@
"""
-class Output_Array(object):
- """Output data type of pipelines"""
+class Output_Array(object):
+ """Output data type of pipelines."""
pass
diff --git a/tpot/metrics.py b/tpot/metrics.py
index 98df0666c..6c35af84c 100644
--- a/tpot/metrics.py
+++ b/tpot/metrics.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -25,7 +24,7 @@
def balanced_accuracy(y_true, y_pred):
- """Default scoring function: balanced accuracy
+ """Default scoring function: balanced accuracy.
Balanced accuracy computes each class' accuracy on a per-class basis using a
one-vs-rest encoding, then computes an unweighted average of the class accuracies.
@@ -59,4 +58,5 @@ def balanced_accuracy(y_true, y_pred):
return np.mean(all_class_accuracies)
+
SCORERS['balanced_accuracy'] = make_scorer(balanced_accuracy)
diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py
index dfc0cc294..f5228a528 100644
--- a/tpot/operator_utils.py
+++ b/tpot/operator_utils.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -27,24 +26,24 @@
class Operator(object):
- """Base class for operators in TPOT"""
- def __init__(self):
- pass
+ """Base class for operators in TPOT."""
+
root = False # Whether this operator type can be the root of the tree
import_hash = None
sklearn_class = None
arg_types = None
- dep_op_list = {} # the estimator or score_func as params in this operators
+    dep_op_list = {}  # the estimator or score_func used as params in this operator
class ARGType(object):
- """Base class for parameter specifications"""
- def __init__(self):
- pass
+ """Base class for parameter specifications."""
+
+ pass
def source_decode(sourcecode):
- """ Decode operator source and import operator class
+ """Decode operator source and import operator class.
+
Parameters
----------
sourcecode: string
@@ -73,10 +72,12 @@ def source_decode(sourcecode):
except ImportError:
print('Warning: {} is not available and will not be used by TPOT.'.format(sourcecode))
op_obj = None
+
return import_str, op_str, op_obj
+
def set_sample_weight(pipeline_steps, sample_weight=None):
- """Recursively iterates through all objects in the pipeline and sets sample weight
+ """Recursively iterates through all objects in the pipeline and sets sample weight.
Parameters
----------
@@ -96,19 +97,21 @@ def set_sample_weight(pipeline_steps, sample_weight=None):
if inspect.getargspec(obj.fit).args.count('sample_weight'):
step_sw = pname + '__sample_weight'
sample_weight_dict[step_sw] = sample_weight
+
if sample_weight_dict:
return sample_weight_dict
else:
return None
+
def ARGTypeClassFactory(classname, prange, BaseClass=ARGType):
- """
- Dynamically create parameter type class
- """
- return type(classname, (BaseClass,), {'values':prange})
+ """Dynamically create parameter type class."""
+ return type(classname, (BaseClass,), {'values': prange})
+
def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass=ARGType):
- """Dynamically create operator class
+ """Dynamically create operator class.
+
Parameters
----------
opsourse: string
@@ -130,14 +133,12 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass=
a list of parameter class
"""
-
-
class_profile = {}
-
dep_op_list = {}
import_str, op_str, op_obj = source_decode(opsourse)
+
if not op_obj:
- return None, None # nothing return
+ return None, None
else:
# define if the operator can be the root of a pipeline
if issubclass(op_obj, ClassifierMixin) or issubclass(op_obj, RegressorMixin):
@@ -145,20 +146,22 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass=
optype = "Classifier or Regressor"
else:
optype = "Preprocessor or Selector"
+
@classmethod
def op_type(cls):
- """Returns the type of the operator, e.g:
- ("Classifier", "Regressor", "Selector", "Preprocessor")
+ """Return the operator type.
+
+ Possible values:
+ "Classifier", "Regressor", "Selector", "Preprocessor"
"""
return optype
class_profile['type'] = op_type
-
class_profile['sklearn_class'] = op_obj
-
import_hash = {}
import_hash[import_str] = [op_str]
arg_types = []
+
for pname in sorted(opdict.keys()):
prange = opdict[pname]
if not isinstance(prange, dict):
@@ -171,7 +174,7 @@ def op_type(cls):
import_hash[import_str].append(dep_op_str)
else:
import_hash[dep_import_str] = [dep_op_str]
- dep_op_list[pname]=dep_op_str
+ dep_op_list[pname] = dep_op_str
if dval:
for dpname in sorted(dval.keys()):
dprange = dval[dpname]
@@ -180,10 +183,10 @@ def op_type(cls):
class_profile['arg_types'] = tuple(arg_types)
class_profile['import_hash'] = import_hash
class_profile['dep_op_list'] = dep_op_list
+
@classmethod
def parameter_types(cls):
- """Return tuple of argument types for calling of the operator and the
- return type of the operator
+ """Return the argument and return types of an operator.
Parameters
----------
@@ -198,12 +201,11 @@ def parameter_types(cls):
"""
return ([np.ndarray] + arg_types, np.ndarray)
-
class_profile['parameter_types'] = parameter_types
+
@classmethod
def export(cls, *args):
- """Represent the operator as a string so that it can be exported to a
- file
+ """Represent the operator as a string so that it can be exported to a file.
Parameters
----------
@@ -218,28 +220,33 @@ def export(cls, *args):
SklearnClassName(param1="val1", param2=val2)
"""
-
op_arguments = []
+
if dep_op_list:
dep_op_arguments = {}
+
for arg_class, arg_value in zip(arg_types, args):
if arg_value == "DEFAULT":
continue
aname_split = arg_class.__name__.split('__')
if isinstance(arg_value, str):
arg_value = '\"{}\"'.format(arg_value)
- if len(aname_split) == 2: # simple parameter
+ if len(aname_split) == 2: # simple parameter
op_arguments.append("{}={}".format(aname_split[-1], arg_value))
- else: # parameter of internal operator as a parameter in the operator, usually in Selector
+                # Parameter of an internal operator used as a parameter of
+                # this operator, usually in a Selector
+ else:
if not list(dep_op_list.values()).count(aname_split[1]):
                     raise TypeError('Warning: the operator {} is not in the right format in the operator dictionary'.format(aname_split[0]))
else:
if aname_split[1] not in dep_op_arguments:
dep_op_arguments[aname_split[1]] = []
dep_op_arguments[aname_split[1]].append("{}={}".format(aname_split[-1], arg_value))
+
tmp_op_args = []
if dep_op_list:
- # to make sure the inital operators is the first parameter just for better persentation
+            # Make sure the initial operator is the first parameter, just
+            # for better presentation
for dep_op_pname, dep_op_str in dep_op_list.items():
if dep_op_str == 'f_classif':
arg_value = dep_op_str
diff --git a/tpot/tpot.py b/tpot/tpot.py
index 41a6f8c21..8f85a23cc 100644
--- a/tpot/tpot.py
+++ b/tpot/tpot.py
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
-"""
-Copyright 2015-Present Randal S. Olson
+"""Copyright 2015-Present Randal S. Olson.
This file is part of the TPOT library.
@@ -26,18 +25,18 @@
class TPOTClassifier(TPOTBase):
- """TPOT estimator for classification problems"""
+ """TPOT estimator for classification problems."""
scoring_function = 'accuracy' # Classification scoring
- default_config_dict = classifier_config_dict # Classification dictionary
+ default_config_dict = classifier_config_dict # Classification dictionary
classification = True
regression = False
class TPOTRegressor(TPOTBase):
- """TPOT estimator for regression problems"""
+ """TPOT estimator for regression problems."""
scoring_function = 'neg_mean_squared_error' # Regression scoring
- default_config_dict = regressor_config_dict # Regression dictionary
+ default_config_dict = regressor_config_dict # Regression dictionary
classification = False
regression = True