From 0b9ea715c1f0708dd6be05c3c2af3cd1aea12ec7 Mon Sep 17 00:00:00 2001 From: teaearlgraycold Date: Fri, 21 Apr 2017 21:22:43 -0400 Subject: [PATCH 1/3] Conform to PEP8 standards --- setup.py | 9 +- tests.py | 373 +++++++++++++--------- tpot/__init__.py | 3 +- tpot/_version.py | 3 +- tpot/base.py | 268 +++++++++------- tpot/built_in_operators.py | 31 +- tpot/config_classifier.py | 28 +- tpot/config_classifier_light.py | 20 +- tpot/config_classifier_mdr.py | 13 +- tpot/config_regressor.py | 33 +- tpot/config_regressor_light.py | 23 +- tpot/decorators.py | 50 ++- tpot/driver.py | 528 +++++++++++++++++++++++--------- tpot/export_utils.py | 138 +++++---- tpot/gp_deap.py | 55 ++-- tpot/gp_types.py | 7 +- tpot/metrics.py | 6 +- tpot/operator_utils.py | 75 +++-- tpot/tpot.py | 11 +- 19 files changed, 1011 insertions(+), 663 deletions(-) mode change 100644 => 100755 tpot/driver.py diff --git a/setup.py b/setup.py index 2d61ea0b3..efaf74364 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ def calculate_version(): version = list(filter(lambda x: '__version__' in x, initpy))[0].split('\'')[1] return version + package_version = calculate_version() setup( @@ -35,9 +36,11 @@ def calculate_version(): ''', zip_safe=True, install_requires=['numpy>=1.12.1', 'scipy>=0.19.0', 'scikit-learn>=0.18.1', 'deap>=1.0', 'update_checker>=0.16', 'tqdm>=4.11.2'], - extras_require={'xgboost': ['xgboost>=0.6'], - 'skrebate': ['skrebate>=0.3.4'], - 'mdr': ['scikit-mdr>=0.4.2']}, + extras_require={ + 'xgboost': ['xgboost>=0.6'], + 'skrebate': ['skrebate>=0.3.4'], + 'mdr': ['scikit-mdr>=0.4.2'] + }, classifiers=[ 'Intended Audience :: Science/Research', 'License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)', diff --git a/tests.py b/tests.py index 21163100d..c081d3ff2 100644 --- a/tests.py +++ b/tests.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. 
@@ -36,8 +35,6 @@ import numpy as np import inspect import random -import time -from datetime import datetime import subprocess from sklearn.datasets import load_digits, load_boston @@ -59,28 +56,46 @@ random.seed(42) test_operator_key = 'sklearn.feature_selection.SelectKBest' -TPOTSelectKBest,TPOTSelectKBest_args = TPOTOperatorClassFactory(test_operator_key, - classifier_config_dict[test_operator_key]) +TPOTSelectKBest, TPOTSelectKBest_args = TPOTOperatorClassFactory( + test_operator_key, + classifier_config_dict[test_operator_key] +) + + +# http://stackoverflow.com/questions/5595425/ +def is_close(a, b, rel_tol=1e-09, abs_tol=0.0): + """Determine if two floats are close in value, but not necessarily equal.""" + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + def test_driver(): - """Assert that the TPOT driver output normal result""" + """Assert that the TPOT driver outputs a normal result.""" batcmd = "python -m tpot.driver tests.csv -is , -target class -g 2 -p 2 -os 4 -cv 5 -s 45 -v 1" ret_stdout = subprocess.check_output(batcmd, shell=True) + try: ret_val = float(ret_stdout.decode('UTF-8').split('\n')[-2].split(': ')[-1]) - except: + except Exception: ret_val = -float('inf') + assert ret_val > 0.0 def test_init_custom_parameters(): - """Assert that the TPOT instantiator stores the TPOT variables properly""" - - tpot_obj = TPOTClassifier(population_size=500, generations=1000, offspring_size=2000, - mutation_rate=0.05, crossover_rate=0.9, - scoring='accuracy', cv=10, - verbosity=1, random_state=42, - disable_update_check=True, warm_start=True) + """Assert that the TPOT instantiator stores the TPOT variables properly.""" + tpot_obj = TPOTClassifier( + population_size=500, + generations=1000, + offspring_size=2000, + mutation_rate=0.05, + crossover_rate=0.9, + scoring='accuracy', + cv=10, + verbosity=1, + random_state=42, + disable_update_check=True, + warm_start=True + ) assert tpot_obj.population_size == 500 assert tpot_obj.generations == 1000 @@ -99,40 +114,41 @@ def test_init_custom_parameters(): def test_init_default_scoring(): - """Assert that TPOT intitializes with the correct default scoring function""" - + """Assert that TPOT initializes with the correct default scoring function.""" tpot_obj = TPOTRegressor() assert tpot_obj.scoring_function == 'neg_mean_squared_error' tpot_obj = TPOTClassifier() assert tpot_obj.scoring_function == 'accuracy' + def test_invaild_score_warning(): - """Assert that the TPOT fit function raises a ValueError when the scoring metrics is not available in SCORERS""" + """Assert that the TPOT fit function raises a ValueError when the scoring metric is not available in SCORERS.""" try: - tpot_obj = TPOTClassifier(scoring='balanced_accuray') # typo for balanced_accuracy + TPOTClassifier(scoring='balanced_accuray') # typo for balanced_accuracy assert False except ValueError: pass try: - tpot_obj = TPOTClassifier(scoring='balanced_accuracy') # correct one + TPOTClassifier(scoring='balanced_accuracy') # correct one assert True - except: + except Exception: assert False + def test_invaild_dataset_warning(): - """Assert that the TPOT fit function raises a ValueError when dataset is not in right format""" + """Assert that the TPOT fit function raises a ValueError when the dataset is not in the right format.""" tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0) - bad_training_classes = training_classes.reshape((1, len(training_classes)))# common mistake in classes + bad_training_classes = 
training_classes.reshape((1, len(training_classes))) # common mistake in classes try: - tpot_obj.fit(training_features ,bad_training_classes) # typo for balanced_accuracy + tpot_obj.fit(training_features, bad_training_classes) # fit with incorrectly shaped classes assert False except ValueError: pass -def test_init_max_time_mins(): - """Assert that the TPOT init stores max run time and sets generations to 1000000""" +def test_init_max_time_mins(): + """Assert that the TPOT init stores max run time and sets generations to 1000000.""" tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000) assert tpot_obj.generations == 1000000 @@ -140,8 +156,7 @@ def test_get_params(): - """Assert that get_params returns the exact dictionary of parameters used by TPOT""" - + """Assert that get_params returns the exact dictionary of parameters used by TPOT.""" kwargs = { 'population_size': 500, 'generations': 1000, @@ -162,21 +177,21 @@ def test_set_params(): - """Assert that set_params returns a reference to the TPOT instance""" - + """Assert that set_params returns a reference to the TPOT instance.""" tpot_obj = TPOTClassifier() assert tpot_obj.set_params() is tpot_obj def test_set_params_2(): - """Assert that set_params updates TPOT's instance variables""" + """Assert that set_params updates TPOT's instance variables.""" tpot_obj = TPOTClassifier(generations=2) tpot_obj.set_params(generations=3) assert tpot_obj.generations == 3 + def test_lite_params(): - """Assert that TPOT uses TPOT's lite dictionary of operators when config_dict is \'TPOT light\' or \'TPOT MDR\'""" + """Assert that TPOT uses TPOT's lite dictionary of operators when config_dict is 'TPOT light' or 'TPOT MDR'.""" tpot_obj = TPOTClassifier(config_dict='TPOT light') assert tpot_obj.config_dict == classifier_config_dict_light @@ -194,16 +209,16 @@ def test_random_ind(): - """Assert that the TPOTClassifier can generate the same pipeline with same random seed""" + """Assert that the TPOTClassifier can generate the same pipeline with the same random seed.""" tpot_obj = TPOTClassifier(random_state=43) pipeline1 = str(tpot_obj._toolbox.individual()) tpot_obj = TPOTClassifier(random_state=43) pipeline2 = str(tpot_obj._toolbox.individual()) assert pipeline1 == pipeline2 -def test_random_ind_2(): - """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 45""" +def test_random_ind_2(): + """Assert that the TPOTClassifier can generate the same pipeline export with a random seed of 45.""" tpot_obj = TPOTClassifier(random_state=45) tpot_obj._pbar = tqdm(total=1, disable=True) pipeline = tpot_obj._toolbox.individual() @@ -228,11 +243,12 @@ def test_random_ind_2(): exported_pipeline.fit(training_features, training_classes) results = exported_pipeline.predict(testing_features) """ + assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) -def test_score(): - """Assert that the TPOT score function raises a RuntimeError when no optimized pipeline exists""" +def test_score(): + """Assert that the TPOT score function raises a RuntimeError when no optimized pipeline exists.""" tpot_obj = TPOTClassifier() try: @@ -243,35 +259,36 @@ def test_score_2(): - """Assert that the TPOTClassifier score function outputs a known score for a fix pipeline""" - + """Assert that the TPOTClassifier score function outputs a known score for a fixed pipeline.""" tpot_obj = TPOTClassifier() known_score = 0.977777777778 # Assumes use 
of the TPOT balanced_accuracy function # Reify pipeline with known score - pipeline_string= ('KNeighborsClassifier(input_matrix, KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)') + pipeline_string = ( + 'KNeighborsClassifier(' + 'input_matrix, ' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) tpot_obj._fitted_pipeline.fit(training_features, training_classes) # Get score from TPOT score = tpot_obj.score(testing_features, testing_classes) - # http://stackoverflow.com/questions/5595425/ - def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): - return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + assert is_close(known_score, score) - assert isclose(known_score, score) def test_score_3(): - """Assert that the TPOTRegressor score function outputs a known score for a fix pipeline""" - + """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline.""" tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error') - known_score = 12.3727966005 # Assumes use of mse + known_score = 12.3727966005 # Assumes use of mse # Reify pipeline with known score - - pipeline_string = ("ExtraTreesRegressor(" + pipeline_string = ( + "ExtraTreesRegressor(" "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8," "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber," "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5," @@ -279,28 +296,25 @@ def test_score_3(): "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25)," "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5," "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, " - "ExtraTreesRegressor__n_estimators=100)") + "ExtraTreesRegressor__n_estimators=100)" + ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r) + # Get score from TPOT score = tpot_obj.score(testing_features_r, testing_classes_r) + assert is_close(known_score, score) - # http://stackoverflow.com/questions/5595425/ - def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): - return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) - - assert isclose(known_score, score) def test_sample_weight_func(): - """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights""" - + """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights.""" tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error') # Reify pipeline with known score - - pipeline_string = ( + "ExtraTreesRegressor(" "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8," "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber," "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5," @@ -308,7 +322,8 @@ def test_sample_weight_func(): "GradientBoostingRegressor__n_estimators=100, 
GradientBoostingRegressor__subsample=0.25)," "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5," "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, " - "ExtraTreesRegressor__n_estimators=100)") + "ExtraTreesRegressor__n_estimators=100)" + ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r) @@ -332,19 +347,16 @@ def test_sample_weight_func(): np.random.seed(42) tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict) # Get score from TPOT - known_score = 12.643383517 # Assumes use of mse + known_score = 12.643383517 # Assumes use of mse score = tpot_obj.score(testing_features_r, testing_classes_r) - # http://stackoverflow.com/questions/5595425/ - def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): - return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) assert np.allclose(cv_score1, cv_score2) assert not np.allclose(cv_score1, cv_score_weight) - assert isclose(known_score, score) + assert is_close(known_score, score) -def test_predict(): - """Assert that the TPOT predict function raises a RuntimeError when no optimized pipeline exists""" +def test_predict(): + """Assert that the TPOT predict function raises a RuntimeError when no optimized pipeline exists.""" tpot_obj = TPOTClassifier() try: @@ -355,12 +367,17 @@ def test_predict(): def test_predict_2(): - """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,)""" - + """Assert that the TPOT predict function returns a numpy matrix of shape (num_testing_rows,).""" tpot_obj = TPOTClassifier() - pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini' - ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5)') + pipeline_string = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5' + ')' + ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) tpot_obj._fitted_pipeline.fit(training_features, training_classes) @@ -369,13 +386,18 @@ def test_predict_2(): assert result.shape == (testing_features.shape[0],) -def test_predict_proba(): - """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_classes)""" +def test_predict_proba(): + """Assert that the TPOT predict_proba function returns a numpy matrix of shape (num_testing_rows, num_testing_classes).""" tpot_obj = TPOTClassifier() - pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini' - ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5)') + pipeline_string = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5)' + ) tpot_obj._optimized_pipeline = 
creator.Individual.from_string(pipeline_string, tpot_obj._pset) tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) tpot_obj._fitted_pipeline.fit(training_features, training_classes) @@ -387,18 +409,21 @@ def test_predict_proba(): def test_predict_proba2(): - """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float)""" - + """Assert that the TPOT predict_proba function returns a numpy matrix filled with probabilities (float).""" tpot_obj = TPOTClassifier() - pipeline_string= ('DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini' - ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5)') + pipeline_string = ( + 'DecisionTreeClassifier(' + 'input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5)' + ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) tpot_obj._fitted_pipeline.fit(training_features, training_classes) result = tpot_obj.predict_proba(testing_features) - rows = result.shape[0] columns = result.shape[1] @@ -410,40 +435,55 @@ def test_predict_proba2(): except Exception: assert False + def test_warm_start(): - """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run""" + """Assert that the TPOT warm_start flag stores the pop and pareto_front from the first run.""" tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, warm_start=True) tpot_obj.fit(training_features, training_classes) - assert tpot_obj._pop != None - assert tpot_obj._pareto_front != None + assert tpot_obj._pop is not None + assert tpot_obj._pareto_front is not None first_pop = tpot_obj._pop - first_pareto_front = tpot_obj._pareto_front - tpot_obj.random_state = 21 tpot_obj.fit(training_features, training_classes) assert tpot_obj._pop == first_pop + def test_fit(): - """Assert that the TPOT fit function provides an optimized pipeline""" - tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0) + """Assert that the TPOT fit function provides an optimized pipeline.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0 + ) tpot_obj.fit(training_features, training_classes) assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) assert not (tpot_obj._start_datetime is None) + def test_fit2(): - """Assert that the TPOT fit function provides an optimized pipeline when config_dict is \'TPOT light\'""" - tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0, config_dict='TPOT light') + """Assert that the TPOT fit function provides an optimized pipeline when config_dict is 'TPOT light'.""" + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0, + config_dict='TPOT light' + ) tpot_obj.fit(training_features, training_classes) assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) assert not (tpot_obj._start_datetime is None) + def testTPOTOperatorClassFactory(): - """Assert that the TPOT operators class factory""" + """Assert that 
the TPOT operator class factory builds the expected operator classes and argument types.""" test_config_dict = { 'sklearn.svm.LinearSVC': { 'penalty': ["l1", "l2"], @@ -463,23 +503,25 @@ def testTPOTOperatorClassFactory(): 'threshold': np.arange(0.0, 1.01, 0.05) } } + tpot_operator_list = [] tpot_argument_list = [] + for key in sorted(test_config_dict.keys()): - op,args = TPOTOperatorClassFactory(key, test_config_dict[key]) + op, args = TPOTOperatorClassFactory(key, test_config_dict[key]) tpot_operator_list.append(op) tpot_argument_list += args + assert len(tpot_operator_list) == 3 assert len(tpot_argument_list) == 9 - assert tpot_operator_list[0].root == True - assert tpot_operator_list[1].root == False + assert tpot_operator_list[0].root is True + assert tpot_operator_list[1].root is False assert tpot_operator_list[2].type() == "Classifier or Regressor" assert tpot_argument_list[1].values == [True, False] def check_export(op, tpot_obj): - """Assert that a TPOT operator exports as expected""" - + """Assert that a TPOT operator exports as expected.""" prng = np.random.RandomState(42) np.random.seed(42) @@ -492,7 +534,7 @@ def check_export(op, tpot_obj): def test_operators(): - """Assert that the TPOT operators match the output of their sklearn counterparts""" + """Assert that the TPOT operators match the output of their sklearn counterparts.""" tpot_obj = TPOTClassifier(random_state=42) for op in tpot_obj.operators: check_export.description = ("Assert that the TPOT {} operator exports " @@ -501,7 +543,7 @@ def test_operators(): def test_export(): - """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists""" + """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists.""" tpot_obj = TPOTClassifier() try: @@ -512,11 +554,14 @@ def test_export(): def test_generate_pipeline_code(): - """Assert that generate_pipeline_code() returns the correct code given a specific pipeline""" + """Assert that generate_pipeline_code() returns the correct code given a specific pipeline.""" tpot_obj = TPOTClassifier() - pipeline = ['KNeighborsClassifier', - ['CombineDFs', - ['GradientBoostingClassifier', + pipeline = [ + 'KNeighborsClassifier', + [ + 'CombineDFs', + [ + 'GradientBoostingClassifier', 'input_matrix', 38.0, 5, @@ -524,12 +569,18 @@ def test_generate_pipeline_code(): 5, 0.05, 0.5], - ['GaussianNB', - ['ZeroCount', - 'input_matrix']]], + [ + 'GaussianNB', + [ + 'ZeroCount', + 'input_matrix' + ] + ] + ], 18, 'uniform', - 2] + 2 + ] expected_code = """make_pipeline( make_union( @@ -548,9 +599,8 @@ def test_generate_pipeline_code(): assert expected_code == generate_pipeline_code(pipeline, tpot_obj.operators) - def test_generate_import_code(): - """Assert that generate_import_code() returns the correct set of dependancies for a given pipeline""" + """Assert that generate_import_code() returns the correct set of dependencies for a given pipeline.""" tpot_obj = TPOTClassifier() pipeline = creator.Individual.from_string('GaussianNB(RobustScaler(input_matrix))', tpot_obj._pset) @@ -569,41 +619,58 @@ def test_generate_import_code(): """ assert expected_code == generate_import_code(pipeline, tpot_obj.operators) + def test_mutNodeReplacement(): - """Assert that mutNodeReplacement() returns the correct type of mutation node in a fixed pipeline""" + """Assert that mutNodeReplacement() returns the correct type of mutation node in a fixed pipeline.""" tpot_obj = TPOTClassifier() - pipeline_string= ('KNeighborsClassifier(CombineDFs(' - 'DecisionTreeClassifier(input_matrix, 
DecisionTreeClassifier__criterion=gini' - ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)' - 'KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform') + pipeline_string = ( + 'KNeighborsClassifier(CombineDFs(' + 'DecisionTreeClassifier(input_matrix, ' + 'DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8, ' + 'DecisionTreeClassifier__min_samples_leaf=5, ' + 'DecisionTreeClassifier__min_samples_split=5' + '), ' + 'SelectKBest(' + 'input_matrix, ' + 'SelectKBest__k=20' + ')' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) pipeline[0].ret = Output_Array old_ret_type_list = [node.ret for node in pipeline] old_prims_list = [node for node in pipeline if node.arity != 0] - mut_ind = mutNodeReplacement(pipeline, pset = tpot_obj._pset) + mut_ind = mutNodeReplacement(pipeline, pset=tpot_obj._pset) new_ret_type_list = [node.ret for node in mut_ind[0]] new_prims_list = [node for node in mut_ind[0] if node.arity != 0] + + if new_prims_list == old_prims_list: # Terminal mutated assert new_ret_type_list == old_ret_type_list - else: # Primitive mutated + else: # Primitive mutated diff_prims = list(set(new_prims_list).symmetric_difference(old_prims_list)) assert diff_prims[0].ret == diff_prims[1].ret + assert mut_ind[0][0].ret == Output_Array def test_export_pipeline(): """Assert that exported_pipeline() generates a compiled source file as expected given a fixed pipeline.""" tpot_obj = TPOTClassifier() - pipeline_string= ('KNeighborsClassifier(CombineDFs(' - 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini' - ', DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' - 'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)' - 'KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform') - pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) + pipeline_string = ( + 'KNeighborsClassifier(CombineDFs(' + 'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, ' + 'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,' + 'DecisionTreeClassifier__min_samples_split=5),SelectKBest(input_matrix, SelectKBest__k=20)' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform' + ) + pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) expected_code = """import numpy as np from copy import copy @@ -636,11 +703,18 @@ def test_export_pipeline(): """ assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + def test_export_pipeline_2(): """Assert that exported_pipeline() generates a compiled source file as expected given a fixed simple pipeline (only one classifier).""" tpot_obj = TPOTClassifier() - pipeline_string= ('KNeighborsClassifier(input_matrix, 
KNeighborsClassifier__n_neighbors=10, ' - 'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform)') + pipeline_string = ( + 'KNeighborsClassifier(' + 'input_matrix, ' + 'KNeighborsClassifier__n_neighbors=10, ' + 'KNeighborsClassifier__p=1, ' + 'KNeighborsClassifier__weights=uniform' + ')' + ) pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) expected_code = """import numpy as np @@ -660,12 +734,15 @@ def test_export_pipeline_2(): """ assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + def test_export_pipeline_3(): """Assert that exported_pipeline() generates a compiled source file as expected given a fixed simple pipeline with a preprocessor.""" tpot_obj = TPOTClassifier() - pipeline_string= ('DecisionTreeClassifier(SelectKBest(input_matrix, SelectKBest__k=20),' - 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,' - 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)') + pipeline_string = ( + 'DecisionTreeClassifier(SelectKBest(input_matrix, SelectKBest__k=20),' + 'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,' + 'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)' + ) pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) expected_code = """import numpy as np @@ -691,15 +768,15 @@ def test_export_pipeline_3(): """ assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset) + def test_operator_export(): - """Assert that a TPOT operator can export properly with a function as a parameter to a classifier""" + """Assert that a TPOT operator can export properly with a function as a parameter to a classifier.""" export_string = TPOTSelectKBest.export(5) assert export_string == "SelectKBest(score_func=f_classif, k=5)" def test_indent(): - """Assert that indenting a multiline string by 4 spaces prepends 4 spaces before each new line""" - + """Assert that indenting a multiline string by 4 spaces prepends 4 spaces before each new line.""" multiline_string = """test test1 test2 @@ -714,18 +791,18 @@ def test_indent(): def test_operator_type(): - """Assert that TPOT operators return their type, e.g. "Classifier", "Preprocessor" """ + """Assert that TPOT operators return their type, e.g. 
'Classifier', 'Preprocessor'.""" assert TPOTSelectKBest.type() == "Preprocessor or Selector" def test_get_by_name(): - """Assert that the Operator class returns operators by name appropriately""" + """Assert that the Operator class returns operators by name appropriately.""" tpot_obj = TPOTClassifier() assert get_by_name("SelectKBest", tpot_obj.operators).__class__ == TPOTSelectKBest.__class__ def test_gen(): - """Assert that TPOT's gen_grow_safe function returns a pipeline of expected structure""" + """Assert that TPOT's gen_grow_safe function returns a pipeline of expected structure.""" tpot_obj = TPOTClassifier() pipeline = tpot_obj._gen_grow_safe(tpot_obj._pset, 1, 3) @@ -735,7 +812,7 @@ def test_gen(): def test_positive_integer(): - """Assert that the TPOT CLI interface's integer parsing throws an exception when n < 0""" + """Assert that the TPOT CLI interface's integer parsing throws an exception when n < 0.""" try: positive_integer('-1') assert False # Should be unreachable @@ -744,12 +821,12 @@ def test_positive_integer(): def test_positive_integer_2(): - """Assert that the TPOT CLI interface's integer parsing returns the integer value of a string encoded integer when n > 0""" + """Assert that the TPOT CLI interface's integer parsing returns the integer value of a string encoded integer when n > 0.""" assert 1 == positive_integer('1') def test_positive_integer_3(): - """Assert that the TPOT CLI interface's integer parsing throws an exception when n is not an integer""" + """Assert that the TPOT CLI interface's integer parsing throws an exception when n is not an integer.""" try: positive_integer('foobar') assert False # Should be unreachable @@ -758,12 +835,12 @@ def test_positive_integer_3(): def test_float_range(): - """Assert that the TPOT CLI interface's float range returns a float with input is in 0. - 1.0""" + """Assert that the TPOT CLI interface's float range returns a float when the input is in the range 0.0 - 1.0.""" assert 0.5 == float_range('0.5') def test_float_range_2(): - """Assert that the TPOT CLI interface's float range throws an exception when input it out of range""" + """Assert that the TPOT CLI interface's float range throws an exception when the input is out of range.""" try: float_range('2.0') assert False # Should be unreachable @@ -772,7 +849,7 @@ def test_float_range_2(): def test_float_range_3(): - """Assert that the TPOT CLI interface's float range throws an exception when input is not a float""" + """Assert that the TPOT CLI interface's float range throws an exception when the input is not a float.""" try: float_range('foobar') assert False # Should be unreachable diff --git a/tpot/__init__.py b/tpot/__init__.py index 6a11bcd96..f1d674528 100644 --- a/tpot/__init__.py +++ b/tpot/__init__.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. diff --git a/tpot/_version.py b/tpot/_version.py index e154a1d7c..7c3861417 100644 --- a/tpot/_version.py +++ b/tpot/_version.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. diff --git a/tpot/base.py b/tpot/base.py index df1b8fc17..59cf712da 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. 
@@ -25,7 +24,6 @@ import inspect import warnings import sys -import time from functools import partial from datetime import datetime from multiprocessing import cpu_count @@ -66,16 +64,18 @@ import _thread except ImportError: import thread as _thread + def handler(dwCtrlType, hook_sigint=_thread.interrupt_main): - if dwCtrlType == 0: # CTRL_C_EVENT + """SIGINT handler function.""" + if dwCtrlType == 0: # CTRL_C_EVENT hook_sigint() - return 1 # don't chain to the next handler + return 1 # don't chain to the next handler return 0 win32api.SetConsoleCtrlHandler(handler, 1) class TPOTBase(BaseEstimator): - """TPOT automatically creates and optimizes machine learning pipelines using genetic programming""" + """Automatically creates and optimizes machine learning pipelines using GP.""" def __init__(self, generations=100, population_size=100, offspring_size=None, mutation_rate=0.9, crossover_rate=0.1, @@ -83,7 +83,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, warm_start=False, verbosity=0, disable_update_check=False): - """Sets up the genetic programming algorithm for pipeline optimization. + """Set up the genetic programming algorithm for pipeline optimization. Parameters ---------- @@ -194,48 +194,34 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, else: self.offspring_size = population_size - if config_dict: - if config_dict == 'TPOT light': - if self.classification: - self.config_dict = classifier_config_dict_light - else: - self.config_dict = regressor_config_dict_light - elif config_dict == 'TPOT MDR': - if self.classification: - self.config_dict = tpot_mdr_classifier_config_dict - else: - raise TypeError('The TPOT MDR operator configuration file does not currently ' - 'work with TPOTRegressor. Please use TPOTClassifier instead.') - else: - try: - with open(config_dict, 'r') as input_file: - file_string = input_file.read() - operator_dict = eval(file_string[file_string.find('{'):(file_string.rfind('}') + 1)]) - except: - raise TypeError('The operator configuration file is in a bad format or not available. ' - 'Please check the configuration file before running TPOT.') - else: - self.config_dict = self.default_config_dict + self._setup_config(config_dict) self.operators = [] self.arguments = [] for key in sorted(self.config_dict.keys()): - op_class, arg_types = TPOTOperatorClassFactory(key, self.config_dict[key], - BaseClass=Operator, ArgBaseClass=ARGType) + op_class, arg_types = TPOTOperatorClassFactory( + key, + self.config_dict[key], + BaseClass=Operator, + ArgBaseClass=ARGType + ) if op_class: self.operators.append(op_class) self.arguments += arg_types - # Schedule TPOT to run for many generations if the user specifies a run-time limit - # TPOT will automatically interrupt itself when the timer runs out - if not (max_time_mins is None): + # Schedule TPOT to run for many generations if the user specifies a + # run-time limit. TPOT will automatically interrupt itself when the timer + # runs out. + if max_time_mins is not None: self.generations = 1000000 self.mutation_rate = mutation_rate self.crossover_rate = crossover_rate if self.mutation_rate + self.crossover_rate > 1: - raise ValueError('The sum of the crossover and mutation probabilities must be <= 1.0.') + raise ValueError( + 'The sum of the crossover and mutation probabilities must be <= 1.0.' 
+ ) self.verbosity = verbosity self.operators_context = { @@ -245,43 +231,42 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, 'FunctionTransformer': FunctionTransformer, 'copy': copy } - - - self._pbar = None - # Dictionary of individuals that have already been evaluated in previous generations + # Dictionary of individuals that have already been evaluated in previous + # generations self._evaluated_individuals = {} - self.random_state = random_state - # If the user passed a custom scoring function, store it in the sklearn SCORERS dictionary + # If the user passed a custom scoring function, store it in the sklearn + # SCORERS dictionary if scoring: if hasattr(scoring, '__call__'): scoring_name = scoring.__name__ - - if 'loss' in scoring_name or 'error' in scoring_name: - greater_is_better = False - else: - greater_is_better = True - + greater_is_better = 'loss' not in scoring_name and 'error' not in scoring_name SCORERS[scoring_name] = make_scorer(scoring, greater_is_better=greater_is_better) self.scoring_function = scoring_name else: if scoring not in SCORERS: - raise ValueError('The scoring function {} is not available. ' - 'Please choose a valid scoring function from the TPOT ' - 'documentation.'.format(scoring)) + raise ValueError( + 'The scoring function {} is not available. Please ' + 'choose a valid scoring function from the TPOT ' + 'documentation.'.format(scoring) + ) self.scoring_function = scoring self.cv = cv # If the OS is Windows, warn the user that Ctrl+C cannot safely interrupt a parallelized run if sys.platform.startswith('win') and n_jobs != 1: - print('Warning: Although parallelization is currently supported in TPOT for Windows, ' - 'pressing Ctrl+C will freeze the optimization process without saving the best pipeline!' - 'Thus, Please DO NOT press Ctrl+C during the optimization procss if n_jobs is not equal to 1.' - 'For quick test in Windows, please set n_jobs to 1 for saving the best pipeline ' - 'in the middle of the optimization process via Ctrl+C.') + print( + 'Warning: Although parallelization is currently supported in ' + 'TPOT for Windows, pressing Ctrl+C will freeze the optimization ' + 'process without saving the best pipeline! Thus, please DO NOT ' + 'press Ctrl+C during the optimization process if n_jobs is not ' + 'equal to 1. For a quick test on Windows, please set n_jobs to 1 ' + 'so that the best pipeline can be saved in the middle of the ' + 'optimization process via Ctrl+C.' + ) if n_jobs == -1: self.n_jobs = cpu_count() else: @@ -290,45 +275,76 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._setup_pset() self._setup_toolbox() + def _setup_config(self, config_dict): + if config_dict: + if config_dict == 'TPOT light': + if self.classification: + self.config_dict = classifier_config_dict_light + else: + self.config_dict = regressor_config_dict_light + elif config_dict == 'TPOT MDR': + if self.classification: + self.config_dict = tpot_mdr_classifier_config_dict + else: + raise TypeError( + 'The TPOT MDR operator configuration file does not ' + 'currently work with TPOTRegressor. Please use ' + 'TPOTClassifier instead.' + ) + else: + try: + with open(config_dict, 'r') as input_file: + file_string = input_file.read() + self.config_dict = eval(file_string[file_string.find('{'):(file_string.rfind('}') + 1)]) + except Exception: + raise TypeError( + 'The operator configuration file is in a bad format or ' + 'not available. Please check the configuration file ' + 'before running TPOT.' 
+ ) + else: + self.config_dict = self.default_config_dict + def _setup_pset(self): if self.random_state is not None: random.seed(self.random_state) np.random.seed(self.random_state) self._pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_Array) - - # Rename pipeline input to "input_df" self._pset.renameArguments(ARG0='input_matrix') + self._add_operators() + self._add_terminals() + if self.verbosity > 2: + print('{} operators have been imported by TPOT.'.format(len(self.operators))) - # Add all operators to the primitive set - for op in self.operators: - - if op.root: + def _add_operators(self): + for operator in self.operators: + if operator.root: # We need to add rooted primitives twice so that they can # return both an Output_Array (and thus be the root of the tree), # and return a np.ndarray so they can exist elsewhere in the tree. - p_types = (op.parameter_types()[0], Output_Array) - self._pset.addPrimitive(op, *p_types) + p_types = (operator.parameter_types()[0], Output_Array) + self._pset.addPrimitive(operator, *p_types) - self._pset.addPrimitive(op, *op.parameter_types()) + self._pset.addPrimitive(operator, *operator.parameter_types()) # Import required modules into local namespace so that pipelines # may be evaluated directly - for key in sorted(op.import_hash.keys()): - module_list = ', '.join(sorted(op.import_hash[key])) + for key in sorted(operator.import_hash.keys()): + module_list = ', '.join(sorted(operator.import_hash[key])) if key.startswith('tpot.'): exec('from {} import {}'.format(key[4:], module_list)) else: exec('from {} import {}'.format(key, module_list)) - for var in op.import_hash[key]: + for var in operator.import_hash[key]: self.operators_context[var] = eval(var) self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) - # Terminals + def _add_terminals(self): for _type in self.arguments: type_values = list(_type.values) if 'nthread' not in _type.__name__: @@ -338,10 +354,6 @@ def _setup_pset(self): terminal_name = _type.__name__ + "=" + str(val) self._pset.addTerminal(val, _type, name=terminal_name) - if self.verbosity > 2: - print('{} operators have been imported by TPOT.'.format(len(self.operators))) - - def _setup_toolbox(self): creator.create('FitnessMulti', base.Fitness, weights=(-1.0, 1.0)) creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMulti) @@ -357,8 +369,7 @@ def _setup_toolbox(self): self._toolbox.register('mutate', self._random_mutation_operator) def fit(self, features, classes, sample_weight=None): - """Fits a machine learning pipeline that maximizes classification score - on the provided data + """Fit an optimized machine learning pipeline. Uses genetic programming to optimize a machine learning pipeline that maximizes classification score on the provided features and classes. @@ -389,7 +400,7 @@ def fit(self, features, classes, sample_weight=None): try: clf = clf.fit(features, classes) - except: + except Exception: raise ValueError('Error: Input data is not in a valid format. ' 'Please confirm that the input data is scikit-learn compatible. 
' 'For example, the features must be a 2-D array and target labels ' @@ -397,7 +408,7 @@ def fit(self, features, classes, sample_weight=None): # Set the seed for the GP run if self.random_state is not None: - random.seed(self.random_state) # deap uses random + random.seed(self.random_state) # deap uses random np.random.seed(self.random_state) self._start_datetime = datetime.now() @@ -411,7 +422,7 @@ def fit(self, features, classes, sample_weight=None): pop = self._toolbox.population(n=self.population_size) def pareto_eq(ind1, ind2): - """Determines whether two individuals are equal on the Pareto front + """Determine whether two individuals are equal on the Pareto front. Parameters ---------- @@ -445,11 +456,19 @@ def pareto_eq(ind1, ind2): try: with warnings.catch_warnings(): warnings.simplefilter('ignore') - pop, _ = eaMuPlusLambda(population=pop, toolbox=self._toolbox, - mu=self.population_size, lambda_=self.offspring_size, - cxpb=self.crossover_rate, mutpb=self.mutation_rate, - ngen=self.generations, pbar=self._pbar, halloffame=self._pareto_front, - verbose=self.verbosity, max_time_mins=self.max_time_mins) + pop, _ = eaMuPlusLambda( + population=pop, + toolbox=self._toolbox, + mu=self.population_size, + lambda_=self.offspring_size, + cxpb=self.crossover_rate, + mutpb=self.mutation_rate, + ngen=self.generations, + pbar=self._pbar, + halloffame=self._pareto_front, + verbose=self.verbosity, + max_time_mins=self.max_time_mins + ) # store population for the next call if self.warm_start: @@ -458,7 +477,7 @@ def pareto_eq(ind1, ind2): # Allow for certain exceptions to signal a premature fit() cancellation except (KeyboardInterrupt, SystemExit): if self.verbosity > 0: - self._pbar.write('') # just for better interface + self._pbar.write('') self._pbar.write('TPOT closed prematurely. Will use the current best pipeline.') finally: # Close the progress bar @@ -507,7 +526,7 @@ def pareto_eq(ind1, ind2): self._pareto_front_fitted_pipelines[str(pipeline)].fit(features, classes) def predict(self, features): - """Uses the optimized pipeline to predict the classes for a feature set + """Use the optimized pipeline to predict the classes for a feature set. Parameters ---------- @@ -525,8 +544,7 @@ def predict(self, features): return self._fitted_pipeline.predict(features.astype(np.float64)) def fit_predict(self, features, classes): - """Convenience function that fits a pipeline then predicts on the - provided features + """Call fit and predict in sequence. Parameters ---------- @@ -545,7 +563,7 @@ def fit_predict(self, features, classes): return self.predict(features) def score(self, testing_features, testing_classes): - """Estimates the balanced testing accuracy of the optimized pipeline. + """Estimate the balanced testing accuracy of the optimized pipeline. Parameters ---------- @@ -563,12 +581,17 @@ def score(self, testing_features, testing_classes): if self._fitted_pipeline is None: raise RuntimeError('A pipeline has not yet been optimized. 
Please call fit() first.') - # If the scoring function is a string, we must adjust to use the sklearn scoring interface - return abs(SCORERS[self.scoring_function](self._fitted_pipeline, - testing_features.astype(np.float64), testing_classes.astype(np.float64))) + # If the scoring function is a string, we must adjust to use the sklearn + # scoring interface + score = SCORERS[self.scoring_function]( + self._fitted_pipeline, + testing_features.astype(np.float64), + testing_classes.astype(np.float64) + ) + return abs(score) def predict_proba(self, features): - """Uses the optimized pipeline to estimate the class probabilities for a feature set + """Use the optimized pipeline to estimate the class probabilities for a feature set. Parameters ---------- @@ -589,7 +612,7 @@ def predict_proba(self, features): return self._fitted_pipeline.predict_proba(features.astype(np.float64)) def set_params(self, **params): - """Set the parameters of a TPOT instance + """Set the parameters of TPOT. Returns ------- @@ -600,7 +623,7 @@ def set_params(self, **params): return self def export(self, output_file_name): - """Exports the current optimized pipeline as Python code + """Export the current optimized pipeline as Python code. Parameters ---------- @@ -619,7 +642,7 @@ def export(self, output_file_name): output_file.write(export_pipeline(self._optimized_pipeline, self.operators, self._pset)) def _compile_to_sklearn(self, expr): - """Compiles a DEAP pipeline into a sklearn pipeline + """Compile a DEAP pipeline into a sklearn pipeline. Parameters ---------- @@ -634,7 +657,7 @@ def _compile_to_sklearn(self, expr): return eval(sklearn_pipeline, self.operators_context) def _set_param_recursive(self, pipeline_steps, parameter, value): - """Recursively iterates through all objects in the pipeline and sets the given parameter to the specified value + """Recursively iterate through all objects in the pipeline and set a given parameter. Parameters ---------- @@ -660,8 +683,8 @@ def _set_param_recursive(self, pipeline_steps, parameter, value): if hasattr(obj, parameter): setattr(obj, parameter, value) - def _evaluate_individuals(self, individuals, features, classes, sample_weight = None): - """Determines the `individual`'s fitness + def _evaluate_individuals(self, individuals, features, classes, sample_weight=None): + """Determine the fitness of the provided individuals. 
Parameters ---------- @@ -727,8 +750,7 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight = operator_count = 0 for i in range(len(individual)): node = individual[i] - if ((type(node) is deap.gp.Terminal) or - type(node) is deap.gp.Primitive and node.name == 'CombineDFs'): + if ((type(node) is deap.gp.Terminal) or (type(node) is deap.gp.Primitive and node.name == 'CombineDFs')): continue operator_count += 1 except Exception: @@ -744,11 +766,21 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight = # evaluate pipelines resulting_score_list = [] # chunk size for pbar update - for chunk_idx in range(0, len(sklearn_pipeline_list),self.n_jobs*4): + for chunk_idx in range(0, len(sklearn_pipeline_list), self.n_jobs * 4): + jobs = [] + for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + self.n_jobs * 4]: + job = delayed(_wrapped_cross_val_score)( + sklearn_pipeline, + features, + classes, + self.cv, + self.scoring_function, + sample_weight, + self.max_eval_time_mins + ) + jobs.append(job) parallel = Parallel(n_jobs=self.n_jobs, verbose=0, pre_dispatch='2*n_jobs') - tmp_result_score = parallel(delayed(_wrapped_cross_val_score)(sklearn_pipeline, features, classes, - self.cv, self.scoring_function, sample_weight, self.max_eval_time_mins) - for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx+self.n_jobs*4]) + tmp_result_score = parallel(jobs) # update pbar for val in tmp_result_score: if not self._pbar.disable: @@ -779,7 +811,7 @@ def _mate_operator(self, ind1, ind2): @_pre_test def _random_mutation_operator(self, individual): - """Perform a replacement, insertion, or shrink mutation on an individual + """Perform a replacement, insertion, or shrink mutation on an individual. Parameters ---------- @@ -801,8 +833,7 @@ def _random_mutation_operator(self, individual): return np.random.choice(mutation_techniques)(individual) def _gen_grow_safe(self, pset, min_, max_, type_=None): - """Generate an expression where each leaf might have a different depth - between min_ and max_. + """Generate an expression where each leaf might have a different depth between min_ and max_. Parameters ---------- @@ -822,8 +853,7 @@ def _gen_grow_safe(self, pset, min_, max_, type_=None): A grown tree with leaves at possibly different depths. """ def condition(height, depth, type_): - """Expression generation stops when the depth is equal to height or - when it is randomly determined that a a node should be a terminal""" + """Stop when the depth is equal to height or when a node should be a terminal.""" return type_ not in [np.ndarray, Output_Array] or depth == height return self._generate(pset, min_, max_, condition, type_) @@ -831,8 +861,10 @@ def condition(height, depth, type_): # Generate function stolen straight from deap.gp.generate @_pre_test def _generate(self, pset, min_, max_, condition, type_=None): - """Generate a Tree as a list of list. The tree is build from the root to - the leaves, and it stop growing when the condition is fulfilled. + """Generate a Tree as a list of lists. + + The tree is built from the root to the leaves, and it stops growing when + the condition is fulfilled. Parameters ---------- @@ -870,10 +902,10 @@ def _generate(self, pset, min_, max_, condition, type_=None): term = np.random.choice(pset.terminals[type_]) except IndexError: _, _, traceback = sys.exc_info() - raise IndexError("The gp.generate function tried to add " "a terminal of type '%s', but there is " "none available." 
% (type_,)).\ with_traceback(traceback) + raise IndexError( + 'The gp.generate function tried to add a terminal of ' + 'type \'%s\', but there is none available.' % (type_,) + ).with_traceback(traceback) if inspect.isclass(term): term = term() expr.append(term) @@ -882,10 +914,10 @@ def _generate(self, pset, min_, max_, condition, type_=None): prim = np.random.choice(pset.primitives[type_]) except IndexError: _, _, traceback = sys.exc_info() - raise IndexError("The gp.generate function tried to add " - "a primitive of type '%s', but there is " - "none available." % (type_,)).\ - with_traceback(traceback) + raise IndexError( + 'The gp.generate function tried to add a primitive of ' + 'type \'%s\', but there is none available.' % (type_,) + ).with_traceback(traceback) expr.append(prim) for arg in reversed(prim.args): stack.append((depth+1, arg)) diff --git a/tpot/built_in_operators.py b/tpot/built_in_operators.py index df2a420e9..df0d8b59c 100644 --- a/tpot/built_in_operators.py +++ b/tpot/built_in_operators.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -26,18 +25,14 @@ class ZeroCount(BaseEstimator): - - """Preprocessor that adds two virtual features to the dataset, one for the count of zero values in the feature set, and one for the count of non-zeros in the feature set""" - - def __init__(self): - pass + """Adds the count of zeros and count of non-zeros per sample as features.""" def fit(self, X, y=None): - """Dummy function to fit in with the sklearn API""" + """Dummy function to fit in with the sklearn API.""" return self def transform(self, X, y=None): - """Transform data by adding two virtual features + """Transform data by adding two virtual features. Parameters ---------- @@ -57,19 +52,27 @@ def transform(self, X, y=None): X_transformed = np.copy(X) - non_zero = np.apply_along_axis(lambda row: np.count_nonzero(row), - axis=1, arr=X_transformed) - zero_col = np.apply_along_axis(lambda row: (n_features - np.count_nonzero(row)), - axis=1, arr=X_transformed) + non_zero = np.apply_along_axis( + lambda row: np.count_nonzero(row), + axis=1, + arr=X_transformed + ) + zero_col = np.apply_along_axis( + lambda row: (n_features - np.count_nonzero(row)), + axis=1, + arr=X_transformed + ) X_transformed = np.insert(X_transformed, n_features, non_zero, axis=1) X_transformed = np.insert(X_transformed, n_features + 1, zero_col, axis=1) return X_transformed + class CombineDFs(object): - """Operator to combine two DataFrames""" + """Combine two DataFrames.""" @property def __name__(self): + """Instance name is the same as the class name.""" return self.__class__.__name__ diff --git a/tpot/config_classifier.py b/tpot/config_classifier.py index b3595d77e..7fdcb6229 100644 --- a/tpot/config_classifier.py +++ b/tpot/config_classifier.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -17,18 +16,11 @@ You should have received a copy of the GNU Lesser General Public License along with TPOT. If not, see . - - -dictionary format (json-like format): -key: - operator name -value: - source: module source (e.g sklearn.tree) - dependencies: depended module (e.g. 
ExtraTreesClassifier in selectors RFE); None for no dependency - params: a dictionary of parameter names (keys) and parameter ranges (values); None for no dependency """ import numpy as np +# Check the TPOT documentation for information on the structure of config dicts + classifier_config_dict = { # Classifiers @@ -168,22 +160,22 @@ 'alpha': np.arange(0, 0.05, 0.001), 'score_func': { 'sklearn.feature_selection.f_classif': None - } # read from dependencies ! need add an exception in preprocess_args + } # read from dependencies ! need add an exception in preprocess_args }, 'sklearn.feature_selection.SelectKBest': { - 'k': range(1, 100), # need check range! + 'k': range(1, 100), # TODO: Check range 'score_func': { 'sklearn.feature_selection.f_classif': None - } + } }, 'sklearn.feature_selection.SelectPercentile': { 'percentile': range(1, 100), 'score_func': { 'sklearn.feature_selection.f_classif': None - } + } }, 'sklearn.feature_selection.VarianceThreshold': { @@ -197,18 +189,18 @@ 'n_estimators': [100], 'criterion': ['gini', 'entropy'], 'max_features': np.arange(0.05, 1.01, 0.05) - } + } } }, - 'sklearn.feature_selection.SelectFromModel': { + 'sklearn.feature_selection.SelectFromModel': { 'threshold': np.arange(0, 1.01, 0.05), 'estimator': { 'sklearn.ensemble.ExtraTreesClassifier': { 'n_estimators': [100], 'criterion': ['gini', 'entropy'], 'max_features': np.arange(0.05, 1.01, 0.05) - } + } } } diff --git a/tpot/config_classifier_light.py b/tpot/config_classifier_light.py index d06115489..a4006d567 100644 --- a/tpot/config_classifier_light.py +++ b/tpot/config_classifier_light.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -17,18 +16,11 @@ You should have received a copy of the GNU Lesser General Public License along with TPOT. If not, see . - - -dictionary format (json-like format): -key: - operator name -value: - source: module source (e.g sklearn.tree) - dependencies: depended module (e.g. ExtraTreesClassifier in selectors RFE); None for no dependency - params: a dictionary of parameter names (keys) and parameter ranges (values); None for no dependency """ + import numpy as np +# Check the TPOT documentation for information on the structure of config dicts classifier_config_dict_light = { # Classifiers @@ -109,15 +101,15 @@ 'alpha': np.arange(0, 0.05, 0.001), 'score_func': { 'sklearn.feature_selection.f_classif': None - } # read from dependencies ! need add an exception in preprocess_args + } # read from dependencies ! need add an exception in preprocess_args }, 'sklearn.feature_selection.SelectKBest': { - 'k': range(1, 100), # need check range! + 'k': range(1, 100), # TODO: Check range 'score_func': { 'sklearn.feature_selection.f_classif': None - } + } }, 'sklearn.feature_selection.SelectPercentile': { diff --git a/tpot/config_classifier_mdr.py b/tpot/config_classifier_mdr.py index a90a9a702..8404cefdd 100644 --- a/tpot/config_classifier_mdr.py +++ b/tpot/config_classifier_mdr.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -16,16 +15,10 @@ You should have received a copy of the GNU Lesser General Public License along with TPOT. If not, see . - -dictionary format (json-like format): -key: - operator name -value: - source: module source (e.g sklearn.tree) - dependencies: depended module (e.g. 
SVC in selectors RFE); None for no dependency - params: a dictionary of parameter names (keys) and parameter ranges (values); None for no dependency """ +# Check the TPOT documentation for information on the structure of config dicts + tpot_mdr_classifier_config_dict = { # Classifiers diff --git a/tpot/config_regressor.py b/tpot/config_regressor.py index 99d21ce3b..7bf86c633 100644 --- a/tpot/config_regressor.py +++ b/tpot/config_regressor.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -17,18 +16,11 @@ You should have received a copy of the GNU Lesser General Public License along with TPOT. If not, see . - - -dictionary format (json-like format): -key: - operator name -value: - source: module source (e.g sklearn.tree) - dependencies: depended module (e.g. ExtraTreesClassifier in selectors RFE); None for no dependency - params: a dictionary of parameter names (keys) and parameter ranges (values); None for no params """ import numpy as np +# Check the TPOT documentation for information on the structure of config dicts + regressor_config_dict = { @@ -168,22 +160,22 @@ 'alpha': np.arange(0, 0.05, 0.001), 'score_func': { 'sklearn.feature_selection.f_classif': None - } # read from dependencies ! need add an exception in preprocess_args + } # read from dependencies ! need add an exception in preprocess_args }, 'sklearn.feature_selection.SelectKBest': { - 'k': range(1, 100), # need check range! + 'k': range(1, 100), # TODO: Check range 'score_func': { 'sklearn.feature_selection.f_classif': None - } + } }, 'sklearn.feature_selection.SelectPercentile': { 'percentile': range(1, 100), 'score_func': { 'sklearn.feature_selection.f_classif': None - } + } }, 'sklearn.feature_selection.VarianceThreshold': { @@ -193,12 +185,11 @@ 'sklearn.feature_selection.SelectFromModel': { 'threshold': np.arange(0, 1.01, 0.05), 'estimator': { - 'sklearn.ensemble.ExtraTreesRegressor': { - 'n_estimators': [100], - 'max_features': np.arange(0.05, 1.01, 0.05) - } - } - + 'sklearn.ensemble.ExtraTreesRegressor': { + 'n_estimators': [100], + 'max_features': np.arange(0.05, 1.01, 0.05) + } + } } } diff --git a/tpot/config_regressor_light.py b/tpot/config_regressor_light.py index b0e8b73d8..3fd4c237b 100644 --- a/tpot/config_regressor_light.py +++ b/tpot/config_regressor_light.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -17,20 +16,12 @@ You should have received a copy of the GNU Lesser General Public License along with TPOT. If not, see . - - -dictionary format (json-like format): -key: - operator name -value: - source: module source (e.g sklearn.tree) - dependencies: depended module (e.g. ExtraTreesClassifier in selectors RFE); None for no dependency - params: a dictionary of parameter names (keys) and parameter ranges (values); None for no params """ import numpy as np -regressor_config_dict_light = { +# Check the TPOT documentation for information on the structure of config dicts +regressor_config_dict_light = { 'sklearn.linear_model.ElasticNetCV': { 'l1_ratio': np.arange(0.0, 1.01, 0.05), @@ -115,22 +106,22 @@ 'alpha': np.arange(0, 0.05, 0.001), 'score_func': { 'sklearn.feature_selection.f_classif': None - } # read from dependencies ! need add an exception in preprocess_args + } # read from dependencies ! 
need add an exception in preprocess_args }, 'sklearn.feature_selection.SelectKBest': { - 'k': range(1, 100), # need check range! + 'k': range(1, 100), # TODO: Check range 'score_func': { 'sklearn.feature_selection.f_classif': None - } + } }, 'sklearn.feature_selection.SelectPercentile': { 'percentile': range(1, 100), 'score_func': { 'sklearn.feature_selection.f_classif': None - } + } }, 'sklearn.feature_selection.VarianceThreshold': { diff --git a/tpot/decorators.py b/tpot/decorators.py index fbe4ac09a..102b0c09a 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -26,6 +25,9 @@ from sklearn.datasets import make_classification, make_regression from .export_utils import expr_to_tree, generate_pipeline_code from deap import creator + +NUM_TESTS = 10 + # generate a small data set for a new pipeline, in order to check if the pipeline # has unsuppported combinations in params pretest_X, pretest_y = make_classification(n_samples=50, n_features=10, random_state=42) @@ -33,35 +35,48 @@ def _pre_test(func): - """Decorator that wraps functions to check if the pipeline works with a pretest data set - If not, then rerun the func until it generates a good pipeline + """Check if the wrapped function works with a pretest data set. + + Reruns the wrapped function until it generates a good pipeline, for a max of + NUM_TESTS times. Parameters ---------- func: function - The function being decorated + The decorated function. Returns ------- - wrapped_func: function + check_pipeline: function A wrapper function around the func parameter """ @wraps(func) def check_pipeline(self, *args, **kwargs): bad_pipeline = True - num_test = 0 # number of tests - while bad_pipeline and num_test < 10: # a pool for workable pipeline - # clone individual before each func call so it is not altered for the possible next cycle loop + num_test = 0 # number of tests + + # a pool for workable pipeline + while bad_pipeline and num_test < NUM_TESTS: + # clone individual before each func call so it is not altered for + # the possible next cycle loop args = [self._toolbox.clone(arg) if isinstance(arg, creator.Individual) else arg for arg in args] + try: with warnings.catch_warnings(): warnings.simplefilter('ignore') + expr = func(self, *args, **kwargs) - # mutation operator returns tuple (ind,); crossover operator returns tuple (ind1, ind2) + # mutation operator returns tuple (ind,); crossover operator + # returns tuple of (ind1, ind2) expr_tuple = expr if isinstance(expr, tuple) else (expr,) + for expr_test in expr_tuple: - #print(num_test, generate_pipeline_code(expr_to_tree(expr), self.operators)) # debug - sklearn_pipeline = eval(generate_pipeline_code(expr_to_tree(expr_test, self._pset), self.operators), self.operators_context) + pipeline_code = generate_pipeline_code( + expr_to_tree(expr_test, self._pset), + self.operators + ) + sklearn_pipeline = eval(pipeline_code, self.operators_context) + if self.classification: sklearn_pipeline.fit(pretest_X, pretest_y) else: @@ -69,11 +84,16 @@ def check_pipeline(self, *args, **kwargs): bad_pipeline = False except BaseException as e: if self.verbosity > 2: - print_function = print + message = '_pre_test decorator: {fname}: num_test={n} {e}'.format( + n=num_test, + fname=func.__name__, + e=e + ) # Use the pbar output stream if it's active if not isinstance(self._pbar, type(None)): - print_function = self._pbar.write - 
print_function('_pre_test decorator: {fname}: num_test={n} {e}'.format(n=num_test, fname=func.__name__, e=e)) + self._pbar.write(message) + else: + print(message) finally: num_test += 1 diff --git a/tpot/driver.py b/tpot/driver.py old mode 100644 new mode 100755 index 69c63cbb8..78d576478 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -23,13 +22,14 @@ import numpy as np import argparse from sklearn.model_selection import train_test_split +from sklearn.preprocessing import Imputer from .tpot import TPOTClassifier, TPOTRegressor from ._version import __version__ def positive_integer(value): - """Ensures that the provided value is a positive integer. Throws an exception otherwise. + """Ensure that the provided value is a positive integer. Parameters ---------- @@ -51,7 +51,7 @@ def positive_integer(value): def float_range(value): - """Ensures that the provided value is a float integer in the range [0., 1.]. Throws an exception otherwise. + """Ensure that the provided value is a float integer in the range [0., 1.]. Parameters ---------- @@ -65,176 +65,406 @@ def float_range(value): """ try: value = float(value) - except: + except Exception: raise argparse.ArgumentTypeError('Invalid float value: \'{}\''.format(value)) if value < 0.0 or value > 1.0: raise argparse.ArgumentTypeError('Invalid float value: \'{}\''.format(value)) return value -def main(): - """Main function that is called when TPOT is run on the command line""" - parser = argparse.ArgumentParser(description='A Python tool that ' - 'automatically creates and optimizes machine learning pipelines using ' - 'genetic programming.', add_help=False) - - parser.add_argument('INPUT_FILE', type=str, help='Data file to use in the TPOT ' - 'optimization process. Ensure that the class label column is labeled as "class".') - - parser.add_argument('-h', '--help', action='help', - help='Show this help message and exit.') - - parser.add_argument('-is', action='store', dest='INPUT_SEPARATOR', default='\t', - type=str, help='Character used to separate columns in the input file.') - - parser.add_argument('-target', action='store', dest='TARGET_NAME', default='class', - type=str, help='Name of the target column in the input file.') - - parser.add_argument('-mode', action='store', dest='TPOT_MODE', - choices=['classification', 'regression'], default='classification', type=str, - help='Whether TPOT is being used for a supervised classification or regression problem.') - - parser.add_argument('-o', action='store', dest='OUTPUT_FILE', default='', - type=str, help='File to export the code for the final optimized pipeline.') - - parser.add_argument('-g', action='store', dest='GENERATIONS', default=100, - type=positive_integer, help='Number of iterations to run the pipeline optimization process.\n' - 'Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. ' - 'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.') - - parser.add_argument('-p', action='store', dest='POPULATION_SIZE', default=100, - type=positive_integer, help='Number of individuals to retain in the GP population every generation.\n' - 'Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. 
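# A standalone sketch (illustrative, not from this diff) of the retry shape
# that _pre_test implements: keep calling the wrapped generator until the
# hypothetical is_valid check passes, for at most NUM_TESTS attempts. The
# real decorator "validates" by fitting the generated pipeline on the
# pretest data and catches BaseException rather than testing a predicate.
from functools import wraps

NUM_TESTS = 10


def pre_test_like(is_valid):
    def decorator(func):
        @wraps(func)
        def check_result(*args, **kwargs):
            result = None
            for _ in range(NUM_TESTS):
                result = func(*args, **kwargs)
                if is_valid(result):
                    break
            return result
        return check_result
    return decorator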
' - 'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.') - - parser.add_argument('-os', action='store', dest='OFFSPRING_SIZE', default=None, - type=positive_integer, help='Number of offspring to produce in each GP generation. ' - 'By default, OFFSPRING_SIZE = POPULATION_SIZE.') - - parser.add_argument('-mr', action='store', dest='MUTATION_RATE', default=0.9, - type=float_range, help='GP mutation rate in the range [0.0, 1.0]. This tells the ' - 'GP algorithm how many pipelines to apply random changes to every generation. ' - 'We recommend using the default parameter unless you understand how the mutation ' - 'rate affects GP algorithms.') - - parser.add_argument('-xr', action='store', dest='CROSSOVER_RATE', default=0.1, - type=float_range, help='GP crossover rate in the range [0.0, 1.0]. This tells the ' - 'GP algorithm how many pipelines to "breed" every generation. ' - 'We recommend using the default parameter unless you understand how the crossover ' - 'rate affects GP algorithms.') - - parser.add_argument('-scoring', action='store', dest='SCORING_FN', default=None, - type=str, help='Function used to evaluate the quality of a given pipeline for ' - 'the problem. By default, accuracy is used for classification problems and mean ' - 'squared error (mse) is used for regression problems. ' - 'TPOT assumes that any function with "error" or "loss" in the name is meant to ' - 'be minimized, whereas any other functions will be maximized. ' - 'Offers the same options as cross_val_score: ' - '"accuracy", "adjusted_rand_score", "average_precision", "f1", "f1_macro", ' - '"f1_micro", "f1_samples", "f1_weighted", "log_loss", "mean_absolute_error", ' - '"mean_squared_error", "median_absolute_error", "precision", "precision_macro", ' - '"precision_micro", "precision_samples", "precision_weighted", "r2", "recall", ' - '"recall_macro", "recall_micro", "recall_samples", "recall_weighted", "roc_auc"') - - parser.add_argument('-cv', action='store', dest='CV', default=5, - type=int, help='Number of folds to evaluate each pipeline over in ' - 'k-fold cross-validation during the TPOT optimization process.') - - parser.add_argument('-njobs', action='store', dest='NUM_JOBS', default=1, - type=int, help='Number of CPUs for evaluating pipelines in parallel ' - ' during the TPOT optimization process. Assigning this to -1 will use as many ' - 'cores as available on the computer.') - - parser.add_argument('-maxtime', action='store', dest='MAX_TIME_MINS', default=None, - type=int, help='How many minutes TPOT has to optimize the pipeline. This ' - 'setting will override the GENERATIONS parameter ' - 'and allow TPOT to run until it runs out of time.') - - parser.add_argument('-maxeval', action='store', dest='MAX_EVAL_MINS', default=5, - type=float, help='How many minutes TPOT has to evaluate a single pipeline. ' - 'Setting this parameter to higher values will allow TPOT to explore more complex ' - 'pipelines but will also allow TPOT to run longer.') - - parser.add_argument('-s', action='store', dest='RANDOM_STATE', default=None, - type=int, help='Random number generator seed for reproducibility. 
Set ' - 'this seed if you want your TPOT run to be reproducible with the same ' - 'seed and data set in the future.') - - parser.add_argument('-config', action='store', dest='CONFIG_FILE', default='', - type=str, help='Configuration file for customizing the operators and parameters ' - 'that TPOT uses in the optimization process.') - - parser.add_argument('-v', action='store', dest='VERBOSITY', default=1, - choices=[0, 1, 2, 3], type=int, help='How much information TPOT communicates ' - 'while it is running: 0 = none, 1 = minimal, 2 = high, 3 = all. ' - 'A setting of 2 or higher will add a progress bar during the optimization procedure.') - - parser.add_argument('--no-update-check', action='store_true', - dest='DISABLE_UPDATE_CHECK', default=False, - help='Flag indicating whether the TPOT version checker should be disabled.') - - parser.add_argument('--version', action='version', +def _get_arg_parser(): + """Main function that is called when TPOT is run on the command line.""" + parser = argparse.ArgumentParser( + description=( + 'A Python tool that automatically creates and optimizes machine ' + 'learning pipelines using genetic programming.' + ), + add_help=False + ) + + parser.add_argument( + 'INPUT_FILE', + type=str, + help=( + 'Data file to use in the TPOT optimization process. Ensure that ' + 'the class label column is labeled as "class".' + ) + ) + + parser.add_argument( + '-h', + '--help', + action='help', + help='Show this help message and exit.' + ) + + parser.add_argument( + '-is', + action='store', + dest='INPUT_SEPARATOR', + default='\t', + type=str, + help='Character used to separate columns in the input file.' + ) + + parser.add_argument( + '-target', + action='store', + dest='TARGET_NAME', + default='class', + type=str, + help='Name of the target column in the input file.' + ) + + parser.add_argument( + '-mode', + action='store', + dest='TPOT_MODE', + choices=['classification', 'regression'], + default='classification', + type=str, + help=( + 'Whether TPOT is being used for a supervised classification or ' + 'regression problem.' + ) + ) + + parser.add_argument( + '-o', + action='store', + dest='OUTPUT_FILE', + default='', + type=str, + help='File to export the code for the final optimized pipeline.' + ) + + parser.add_argument( + '-g', + action='store', + dest='GENERATIONS', + default=100, + type=positive_integer, + help=( + 'Number of iterations to run the pipeline optimization process. ' + 'Generally, TPOT will work better when you give it more ' + 'generations (and therefore time) to optimize the pipeline. TPOT ' + 'will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE ' + 'pipelines in total.' + ) + ) + + parser.add_argument( + '-p', + action='store', + dest='POPULATION_SIZE', + default=100, + type=positive_integer, + help=( + 'Number of individuals to retain in the GP population every ' + 'generation. Generally, TPOT will work better when you give it ' + 'more individuals (and therefore time) to optimize the pipeline. ' + 'TPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE ' + 'pipelines in total.' + ) + ) + + parser.add_argument( + '-os', + action='store', + dest='OFFSPRING_SIZE', + default=None, + type=positive_integer, + help=( + 'Number of offspring to produce in each GP generation. By default,' + 'OFFSPRING_SIZE = POPULATION_SIZE.' + ) + ) + + parser.add_argument( + '-mr', + action='store', + dest='MUTATION_RATE', + default=0.9, + type=float_range, + help=( + 'GP mutation rate in the range [0.0, 1.0]. 
This tells the GP ' + 'algorithm how many pipelines to apply random changes to every ' + 'generation. We recommend using the default parameter unless you ' + 'understand how the mutation rate affects GP algorithms.' + ) + ) + + parser.add_argument( + '-xr', + action='store', + dest='CROSSOVER_RATE', + default=0.1, + type=float_range, + help=( + 'GP crossover rate in the range [0.0, 1.0]. This tells the GP ' + 'algorithm how many pipelines to "breed" every generation. We ' + 'recommend using the default parameter unless you understand how ' + 'the crossover rate affects GP algorithms.' + ) + ) + + parser.add_argument( + '-scoring', + action='store', + dest='SCORING_FN', + default=None, + type=str, + help=( + 'Function used to evaluate the quality of a given pipeline for the ' + 'problem. By default, accuracy is used for classification problems ' + 'and mean squared error (mse) is used for regression problems. ' + 'TPOT assumes that any function with "error" or "loss" in the name ' + 'is meant to be minimized, whereas any other functions will be ' + 'maximized. Offers the same options as cross_val_score: ' + 'accuracy, ' + 'adjusted_rand_score, ' + 'average_precision, ' + 'f1, ' + 'f1_macro, ' + 'f1_micro, ' + 'f1_samples, ' + 'f1_weighted, ' + 'log_loss, ' + 'mean_absolute_error, ' + 'mean_squared_error, ' + 'median_absolute_error, ' + 'precision, ' + 'precision_macro, ' + 'precision_micro, ' + 'precision_samples, ' + 'precision_weighted, ' + 'r2, ' + 'recall, ' + 'recall_macro, ' + 'recall_micro, ' + 'recall_samples, ' + 'recall_weighted, ' + 'roc_auc' + ) + ) + + parser.add_argument( + '-cv', + action='store', + dest='NUM_CV_FOLDS', + default=5, + type=int, + help=( + 'Number of folds to evaluate each pipeline over in k-fold ' + 'cross-validation during the TPOT optimization process.' + ) + ) + + parser.add_argument( + '-njobs', + action='store', + dest='NUM_JOBS', + default=1, + type=int, + help=( + 'Number of CPUs for evaluating pipelines in parallel during the ' + 'TPOT optimization process. Assigning this to -1 will use as many ' + 'cores as available on the computer.' + ) + ) + + parser.add_argument( + '-maxtime', + action='store', + dest='MAX_TIME_MINS', + default=None, + type=int, + help=( + 'How many minutes TPOT has to optimize the pipeline. This setting ' + 'will override the GENERATIONS parameter and allow TPOT to run ' + 'until it runs out of time.' + ) + ) + + parser.add_argument( + '-maxeval', + action='store', + dest='MAX_EVAL_MINS', + default=5, + type=float, + help=( + 'How many minutes TPOT has to evaluate a single pipeline. Setting ' + 'this parameter to higher values will allow TPOT to explore more ' + 'complex pipelines but will also allow TPOT to run longer.' + ) + ) + + parser.add_argument( + '-s', + action='store', + dest='RANDOM_STATE', + default=None, + type=int, + help=( + 'Random number generator seed for reproducibility. Set this seed ' + 'if you want your TPOT run to be reproducible with the same seed ' + 'and data set in the future.' + ) + ) + + parser.add_argument( + '-config', + action='store', + dest='CONFIG_FILE', + default='', + type=str, + help=( + 'Configuration file for customizing the operators and parameters ' + 'that TPOT uses in the optimization process. Must be a python ' + 'module containing a dict export named "tpot_config".' 
+        )
+    )
+
+    parser.add_argument(
+        '-impute',
+        action='store',
+        dest='IMPUTE',
+        default=None,
+        help=(
+            'If set, TPOT will take the provided missing value string and '
+            'impute the value of all data points with that value.'
+        )
+    )
+
+    parser.add_argument(
+        '-v',
+        action='store',
+        dest='VERBOSITY',
+        default=1,
+        choices=[0, 1, 2, 3],
+        type=int,
+        help=(
+            'How much information TPOT communicates while it is running: '
+            '0 = none, 1 = minimal, 2 = high, 3 = all. A setting of 2 or '
+            'higher will add a progress bar during the optimization procedure.'
+        )
+    )
+
+    parser.add_argument(
+        '--no-update-check',
+        action='store_true',
+        dest='DISABLE_UPDATE_CHECK',
+        default=False,
+        help='Flag indicating whether the TPOT version checker should be disabled.'
+    )
+
+    parser.add_argument(
+        '--version',
+        action='version',
         version='TPOT {version}'.format(version=__version__),
-        help='Show the TPOT version number and exit.')
+        help='Show the TPOT version number and exit.'
+    )
+
+    return parser
+
+
+def _print_args(args):
+    print('\nTPOT settings:')
+    # Substitute derived defaults for a few settings before printing
+    for arg, arg_val in sorted(args.__dict__.items()):
+        if arg == 'DISABLE_UPDATE_CHECK':
+            continue
+        elif arg == 'SCORING_FN' and arg_val is None:
+            if args.TPOT_MODE == 'classification':
+                arg_val = 'accuracy'
+            else:
+                arg_val = 'mean_squared_error'
+        elif arg == 'OFFSPRING_SIZE' and arg_val is None:
+            arg_val = args.__dict__['POPULATION_SIZE']
+        print('{}\t=\t{}'.format(arg, arg_val))
+    print('')
 
-    args = parser.parse_args()
-    if args.VERBOSITY >= 2:
-        print('\nTPOT settings:')
-        for arg in sorted(args.__dict__):
-            arg_val = args.__dict__[arg]
-            if arg == 'DISABLE_UPDATE_CHECK':
-                continue
-            elif arg == 'SCORING_FN' and arg_val is None:
-                if args.TPOT_MODE == 'classification':
-                    arg_val = 'accuracy'
-                else:
-                    arg_val = 'mean_squared_error'
-            elif arg == 'OFFSPRING_SIZE' and arg_val is None:
-                arg_val = args.__dict__['POPULATION_SIZE']
-            print('{}\t=\t{}'.format(arg, arg_val))
-        print('')
-
-    input_data = np.recfromcsv(args.INPUT_FILE, delimiter=args.INPUT_SEPARATOR, dtype=np.float64, case_sensitive=True)
+def _read_data_file(args):
+    input_data = np.recfromcsv(
+        args.INPUT_FILE,
+        delimiter=args.INPUT_SEPARATOR,
+        dtype=np.float64,
+        case_sensitive=True
+    )
+
     if args.TARGET_NAME not in input_data.dtype.names:
-        raise ValueError('The provided data file does not seem to have a target column. '
-                         'Please make sure to specify the target column using the -target parameter.')
+        raise ValueError(
+            'The provided data file does not seem to have a target column. '
+            'Please make sure to specify the target column using the -target '
+            'parameter.'
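# Illustration (not from this diff): positive_integer and float_range, the
# validators defined near the top of this file, plug into argparse as plain
# `type=` callables, so malformed values fail at parse time.
import argparse

demo_parser = argparse.ArgumentParser()
demo_parser.add_argument('-g', type=positive_integer, default=100)
demo_parser.add_argument('-mr', type=float_range, default=0.9)

demo_parser.parse_args(['-g', '10', '-mr', '0.5'])  # accepted
# demo_parser.parse_args(['-mr', '1.5'])  # error: Invalid float value: '1.5'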
+ ) - features = np.delete(input_data.view(np.float64).reshape(input_data.size, -1), - input_data.dtype.names.index(args.TARGET_NAME), axis=1) + return input_data - training_features, testing_features, training_classes, testing_classes = \ - train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE) - if args.TPOT_MODE == 'classification': - tpot_type = TPOTClassifier - else: - tpot_type = TPOTRegressor +def _impute_missing_values(features, missing_value): + imputer = Imputer(missing_values=missing_value) + return imputer.fit_transform(features) + - tpot = tpot_type(generations=args.GENERATIONS, population_size=args.POPULATION_SIZE, - offspring_size=args.OFFSPRING_SIZE, mutation_rate=args.MUTATION_RATE, crossover_rate=args.CROSSOVER_RATE, - cv=args.CV, n_jobs=args.NUM_JOBS, scoring=args.SCORING_FN, - max_time_mins=args.MAX_TIME_MINS, max_eval_time_mins=args.MAX_EVAL_MINS, - random_state=args.RANDOM_STATE, config_dict=args.CONFIG_FILE, - verbosity=args.VERBOSITY, disable_update_check=args.DISABLE_UPDATE_CHECK) +def main(): + """Perform a TPOT run.""" + args = _get_arg_parser().parse_args() + + if args.VERBOSITY >= 2: + _print_args(args) + + input_data = _read_data_file(args) + features = np.delete( + input_data.view(np.float64).reshape(input_data.size, -1), + input_data.dtype.names.index(args.TARGET_NAME), + axis=1 + ) + + if args.IMPUTE: + features = _impute_missing_values(features, args.IMPUTE) + + training_features, testing_features, training_classes, testing_classes = \ + train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE) + tpot_type = TPOTClassifier if args.TPOT_MODE == 'classification' else TPOTRegressor + tpot = tpot_type( + generations=args.GENERATIONS, + population_size=args.POPULATION_SIZE, + offspring_size=args.OFFSPRING_SIZE, + mutation_rate=args.MUTATION_RATE, + crossover_rate=args.CROSSOVER_RATE, + cv=args.NUM_CV_FOLDS, + n_jobs=args.NUM_JOBS, + scoring=args.SCORING_FN, + max_time_mins=args.MAX_TIME_MINS, + max_eval_time_mins=args.MAX_EVAL_MINS, + random_state=args.RANDOM_STATE, + config_dict=args.CONFIG_FILE, + verbosity=args.VERBOSITY, + disable_update_check=args.DISABLE_UPDATE_CHECK + ) print('') tpot.fit(training_features, training_classes) - if args.VERBOSITY in [1, 2] and tpot._optimized_pipeline: - training_score = max([tpot._pareto_front.keys[x].wvalues[1] for x in range(len(tpot._pareto_front.keys))]) + if args.VERBOSITY < 3 and tpot._optimized_pipeline: + training_score = max([x.wvalues[1] for x in tpot._pareto_front.keys]) print('\nTraining score: {}'.format(abs(training_score))) print('Holdout score: {}'.format(tpot.score(testing_features, testing_classes))) elif args.VERBOSITY >= 3 and tpot._pareto_front: print('Final Pareto front testing scores:') - - for pipeline, pipeline_scores in zip(tpot._pareto_front.items, reversed(tpot._pareto_front.keys)): + pipelines = zip(tpot._pareto_front.items, reversed(tpot._pareto_front.keys)) + for pipeline, pipeline_scores in pipelines: tpot._fitted_pipeline = tpot._pareto_front_fitted_pipelines[str(pipeline)] - print('{}\t{}\t{}'.format(int(abs(pipeline_scores.wvalues[0])), - tpot.score(testing_features, testing_classes), - pipeline)) + print('{TRAIN_SCORE}\t{TEST_SCORE}\t{PIPELINE}'.format( + TRAIN_SCORE=int(abs(pipeline_scores.wvalues[0])), + TEST_SCORE=tpot.score(testing_features, testing_classes), + PIPELINE=pipeline + ) + ) if args.OUTPUT_FILE != '': tpot.export(args.OUTPUT_FILE) diff --git a/tpot/export_utils.py b/tpot/export_utils.py index 
19c63e12e..f5a98cf79 100644 --- a/tpot/export_utils.py +++ b/tpot/export_utils.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -22,8 +21,9 @@ import deap + def get_by_name(opname, operators): - """Returns operator class instance by name + """Return operator class instance by name. Parameters ---------- @@ -39,16 +39,20 @@ def get_by_name(opname, operators): """ ret_op_classes = [op for op in operators if op.__name__ == opname] + if len(ret_op_classes) == 0: raise TypeError('Cannot found operator {} in operator dictionary'.format(opname)) elif len(ret_op_classes) > 1: - print('Found multiple operator {} in operator dictionary'.format(opname), - 'Please check your dictionary file.') + print( + 'Found multiple operator {} in operator dictionary. Please check ' + 'your dictionary file.'.format(opname) + ) ret_op_class = ret_op_classes[0] return ret_op_class + def export_pipeline(exported_pipeline, operators, pset): - """Generates the source code of a TPOT Pipeline + """Generate source code for a TPOT Pipeline. Parameters ---------- @@ -76,7 +80,7 @@ def export_pipeline(exported_pipeline, operators, pset): def expr_to_tree(ind, pset): - """Convert the unstructured DEAP pipeline into a tree data-structure + """Convert the unstructured DEAP pipeline into a tree data-structure. Parameters ---------- @@ -98,9 +102,9 @@ def expr_to_tree(ind, pset): def prim_to_list(prim, args): if isinstance(prim, deap.gp.Terminal): if prim.name in pset.context: - return pset.context[prim.name] + return pset.context[prim.name] else: - return prim.value + return prim.value return [prim.name] + args @@ -119,7 +123,7 @@ def prim_to_list(prim, args): def generate_import_code(pipeline, operators): - """Generate all library import calls for use in TPOT.export() + """Generate all library import calls for use in TPOT.export(). 
Parameters ---------- @@ -135,19 +139,50 @@ def generate_import_code(pipeline, operators): optimized pipeline """ - # operator[1] is the name of the operator - operators_used = [x.name for x in pipeline if isinstance(x, deap.gp.Primitive)] + def merge_imports(old_dict, new_dict): + # Key is a module name + for key in new_dict.keys(): + if key in old_dict.keys(): + # Union imports from the same module + old_dict[key] = set(old_dict[key]) | set(new_dict[key]) + else: + old_dict[key] = set(new_dict[key]) + operators_used = [x.name for x in pipeline if isinstance(x, deap.gp.Primitive)] pipeline_text = 'import numpy as np\n\n' + pipeline_imports = _starting_imports(pipeline, operators, operators_used) + + # Build dict of import requirments from list of operators + import_relations = {op.__name__: op.import_hash for op in operators} + + # Build import dict from operators used + for op in operators_used: + try: + operator_import = import_relations[op] + merge_imports(pipeline_imports, operator_import) + except KeyError: + pass # Operator does not require imports + + # Build import string + for key in sorted(pipeline_imports.keys()): + module_list = ', '.join(sorted(pipeline_imports[key])) + pipeline_text += 'from {} import {}\n'.format(key, module_list) + + pipeline_text += """ +# NOTE: Make sure that the class is labeled 'class' in the data file +tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) +features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) +training_features, testing_features, training_classes, testing_classes = \\ + train_test_split(features, tpot_data['class'], random_state=42) +""" + + return pipeline_text + +def _starting_imports(pipeline, operators, operators_used): # number of operators num_op = len(operators_used) - # Build dict of import requirments from list of operators - import_relations = {} - for op in operators: - import_relations[op.__name__] = op.import_hash - # number of classifier/regressor or CombineDFs num_op_root = 0 for op in operators_used: @@ -158,9 +193,8 @@ def generate_import_code(pipeline, operators): else: num_op_root += 1 - # Always start with these imports if num_op_root > 1: - pipeline_imports = { + return { 'sklearn.model_selection': ['train_test_split'], 'sklearn.pipeline': ['make_pipeline', 'make_union'], 'sklearn.preprocessing': ['FunctionTransformer'], @@ -168,50 +202,19 @@ def generate_import_code(pipeline, operators): 'copy': ['copy'] } elif num_op > 1: - pipeline_imports = { + return { 'sklearn.model_selection': ['train_test_split'], 'sklearn.pipeline': ['make_pipeline'] } - else: # if operators # == 1 and classifier/regressor # == 1, this import statement is simpler - pipeline_imports = { + # if operators # == 1 and classifier/regressor # == 1, this import statement is simpler + else: + return { 'sklearn.model_selection': ['train_test_split'] } - # Build import dict from operators used - for op in operators_used: - def merge_imports(old_dict, new_dict): - # Key is a module name - for key in new_dict.keys(): - if key in old_dict.keys(): - # Union imports from the same module - old_dict[key] = set(old_dict[key]) | set(new_dict[key]) - else: - old_dict[key] = set(new_dict[key]) - - try: - operator_import = import_relations[op] - merge_imports(pipeline_imports, operator_import) - except KeyError: - pass # Operator does not require imports - - # Build import string - for key in sorted(pipeline_imports.keys()): - module_list = ', 
'.join(sorted(pipeline_imports[key])) - pipeline_text += 'from {} import {}\n'.format(key, module_list) - - pipeline_text += """ -# NOTE: Make sure that the class is labeled 'class' in the data file -tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64) -features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1) -training_features, testing_features, training_classes, testing_classes = \\ - train_test_split(features, tpot_data['class'], random_state=42) -""" - - return pipeline_text - def pipeline_code_wrapper(pipeline_code): - """Generate code specific to the execution of the sklearn pipeline + """Generate code specific to the execution of the sklearn pipeline. Parameters ---------- @@ -232,7 +235,7 @@ def pipeline_code_wrapper(pipeline_code): def generate_pipeline_code(pipeline_tree, operators): - """Generate code specific to the construction of the sklearn Pipeline + """Generate code specific to the construction of the sklearn Pipeline. Parameters ---------- @@ -244,12 +247,13 @@ def generate_pipeline_code(pipeline_tree, operators): Source code for the sklearn pipeline """ - steps = process_operator(pipeline_tree, operators) + steps = _process_operator(pipeline_tree, operators) pipeline_text = "make_pipeline(\n{STEPS}\n)".format(STEPS=_indent(",\n".join(steps), 4)) return pipeline_text + def generate_export_pipeline_code(pipeline_tree, operators): - """Generate code specific to the construction of the sklearn Pipeline for export_pipeline + """Generate code specific to the construction of the sklearn Pipeline for export_pipeline. Parameters ---------- @@ -261,17 +265,19 @@ def generate_export_pipeline_code(pipeline_tree, operators): Source code for the sklearn pipeline """ - steps = process_operator(pipeline_tree, operators) + steps = _process_operator(pipeline_tree, operators) # number of steps in a pipeline num_step = len(steps) if num_step > 1: pipeline_text = "make_pipeline(\n{STEPS}\n)".format(STEPS=_indent(",\n".join(steps), 4)) - else: # only one operator (root = True) - pipeline_text = "{STEPS}".format(STEPS=_indent(",\n".join(steps), 0)) + # only one operator (root = True) + else: + pipeline_text = "{STEPS}".format(STEPS=_indent(",\n".join(steps), 0)) return pipeline_text -def process_operator(operator, operators, depth=0): + +def _process_operator(operator, operators, depth=0): steps = [] op_name = operator[0] @@ -284,7 +290,7 @@ def process_operator(operator, operators, depth=0): tpot_op = get_by_name(op_name, operators) if input_name != 'input_matrix': - steps.extend(process_operator(input_name, operators, depth + 1)) + steps.extend(_process_operator(input_name, operators, depth + 1)) # If the step is an estimator and is not the last step then we must # add its guess as a synthetic feature @@ -299,7 +305,7 @@ def process_operator(operator, operators, depth=0): def _indent(text, amount): - """Indent a multiline string by some number of spaces + """Indent a multiline string by some number of spaces. 
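# A standalone sketch (illustrative, not from this diff) of the import
# merging done by merge_imports earlier in this file: per-module name sets
# are unioned so each module contributes a single import line. The
# cross_val_score entry is a made-up example value.
pipeline_imports = {'sklearn.model_selection': {'train_test_split'}}
new_imports = {'sklearn.model_selection': ['cross_val_score'], 'copy': ['copy']}

for key, names in new_imports.items():
    if key in pipeline_imports:
        pipeline_imports[key] = set(pipeline_imports[key]) | set(names)
    else:
        pipeline_imports[key] = set(names)

for key in sorted(pipeline_imports):
    print('from {} import {}'.format(key, ', '.join(sorted(pipeline_imports[key]))))
# from copy import copy
# from sklearn.model_selection import cross_val_score, train_test_split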
Parameters ---------- @@ -329,9 +335,9 @@ def _make_branch(branch): if tpot_op.root: return """make_union(VotingClassifier([('branch', {} -)]), FunctionTransformer(copy))""".format(_indent(process_operator(branch, operators)[0], 4)) +)]), FunctionTransformer(copy))""".format(_indent(_process_operator(branch, operators)[0], 4)) else: - return process_operator(branch, operators)[0] + return _process_operator(branch, operators)[0] else: # We're going to have to make a pipeline tpot_op = get_by_name(branch[0], operators) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index 8714a16de..c649f506d 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -35,6 +34,7 @@ import warnings import threading + def varOr(population, toolbox, lambda_, cxpb, mutpb): """Part of an evolutionary algorithm applying only the variation part (crossover, mutation **or** reproduction). The modified individuals have @@ -71,14 +71,14 @@ def varOr(population, toolbox, lambda_, cxpb, mutpb): for _ in range(lambda_): op_choice = np.random.random() if op_choice < cxpb: # Apply crossover - idxs = np.random.randint(0, len(population),size=2) + idxs = np.random.randint(0, len(population), size=2) ind1, ind2 = toolbox.clone(population[idxs[0]]), toolbox.clone(population[idxs[1]]) ind_str = str(ind1) num_loop = 0 - while ind_str == str(ind1) and num_loop < 50 : # 50 loops at most to generate a different individual by crossover + while ind_str == str(ind1) and num_loop < 50: # 50 loops at most to generate a different individual by crossover ind1, ind2 = toolbox.mate(ind1, ind2) num_loop += 1 - if ind_str != str(ind1): # check if crossover happened + if ind_str != str(ind1): # check if crossover happened del ind1.fitness.values offspring.append(ind1) elif op_choice < cxpb + mutpb: # Apply mutation @@ -86,20 +86,21 @@ def varOr(population, toolbox, lambda_, cxpb, mutpb): ind = toolbox.clone(population[idx]) ind_str = str(ind) num_loop = 0 - while ind_str == str(ind) and num_loop < 50 : # 50 loops at most to generate a different individual by mutation + while ind_str == str(ind) and num_loop < 50: # 50 loops at most to generate a different individual by mutation ind, = toolbox.mutate(ind) num_loop += 1 - if ind_str != str(ind): # check if mutation happened + if ind_str != str(ind): # check if mutation happened del ind.fitness.values offspring.append(ind) - else: # Apply reproduction + else: # Apply reproduction idx = np.random.randint(0, len(population)) offspring.append(toolbox.clone(population[idx])) return offspring + def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, - stats=None, halloffame=None, verbose=0, max_time_mins = None): + stats=None, halloffame=None, verbose=0, max_time_mins=None): """This is the :math:`(\mu + \lambda)` evolutionary algorithm. :param population: A list of individuals. 
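# A minimal sketch (illustrative, not from this diff) of the single-draw
# operator choice used in varOr above: crossover with probability cxpb,
# mutation with probability mutpb, reproduction with the remainder. The
# probabilities below are example values, not TPOT's defaults.
import numpy as np

cxpb, mutpb = 0.1, 0.8
op_choice = np.random.random()
if op_choice < cxpb:
    op = 'crossover'
elif op_choice < cxpb + mutpb:
    op = 'mutation'
else:
    op = 'reproduction'
print(op)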
:param toolbox: A :class:`~deap.base.Toolbox` that contains the evolution @@ -177,7 +178,6 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, for ind, fit in zip(invalid_ind, fitnesses): ind.fitness.values = fit - # Update the hall of fame with the generated individuals if halloffame is not None: halloffame.update(offspring) @@ -196,9 +196,12 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, elif verbose == 3: pbar.write('Generation {} - Current Pareto front scores:'.format(gen)) for pipeline, pipeline_scores in zip(halloffame.items, reversed(halloffame.keys)): - pbar.write('{}\t{}\t{}'.format(int(abs(pipeline_scores.wvalues[0])), - abs(pipeline_scores.wvalues[1]), - pipeline)) + pbar.write('{}\t{}\t{}'.format( + int(abs(pipeline_scores.wvalues[0])), + abs(pipeline_scores.wvalues[1]), + pipeline + ) + ) pbar.write('') # Update the statistics with the new population @@ -235,7 +238,7 @@ def cxOnePoint(ind1, ind2): types1[node.ret].append(idx) common_types = [] for idx, node in enumerate(ind2[1:], 1): - if node.ret in types1 and not node.ret in types2: + if node.ret in types1 and node.ret not in types2: common_types.append(node.ret) types2[node.ret].append(idx) @@ -283,10 +286,11 @@ def mutNodeReplacement(individual, pset): # find next primitive if any rindex = None if index + 1 < len(individual): - for i, tmpnode in enumerate(individual[index+1:], index+ 1): + for i, tmpnode in enumerate(individual[index + 1:], index + 1): if isinstance(tmpnode, gp.Primitive) and tmpnode.ret in tmpnode.args: rindex = i - #pset.primitives[node.ret] can get a list of the type of node + + # pset.primitives[node.ret] can get a list of the type of node # for example: if op.root is True then the node.ret is Output_DF object # based on the function _setup_pset. Then primitives is the list of classifor or regressor primitives = pset.primitives[node.ret] @@ -323,9 +327,11 @@ def __init__(self, *args, **kwargs): self.result = -float('inf') self._stopevent = threading.Event() self.daemon = True + def stop(self): self._stopevent.set() threading.Thread.join(self) + def run(self): # Note: changed name of the thread to "MainThread" to avoid such warning from joblib (maybe bugs) # Note: Need attention if using parallel execution model of scikit-learn @@ -337,20 +343,29 @@ def run(self): except Exception as e: pass + def _wrapped_cross_val_score(sklearn_pipeline, features, classes, cv, scoring_function, sample_weight, max_eval_time_mins): - #sys.tracebacklimit = 0 max_time_seconds = max(int(max_eval_time_mins * 60), 1) sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight) # build a job for cross_val_score - tmp_it = Interruptable_cross_val_score(clone(sklearn_pipeline), features, classes, - scoring=scoring_function, cv=cv, n_jobs=1, verbose=0, fit_params=sample_weight_dict) + tmp_it = Interruptable_cross_val_score( + clone(sklearn_pipeline), + features, + classes, + scoring=scoring_function, + cv=cv, + n_jobs=1, + verbose=0, + fit_params=sample_weight_dict + ) tmp_it.start() tmp_it.join(max_time_seconds) + if tmp_it.isAlive(): resulting_score = 'Timeout' else: resulting_score = np.mean(tmp_it.result) - #sys.tracebacklimit = 1000 + tmp_it.stop() return resulting_score diff --git a/tpot/gp_types.py b/tpot/gp_types.py index 8b2b6608c..8cf9a44d9 100644 --- a/tpot/gp_types.py +++ b/tpot/gp_types.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. 
This file is part of the TPOT library. @@ -20,8 +19,8 @@ """ -class Output_Array(object): - """Output data type of pipelines""" +class Output_Array(object): + """Output data type of pipelines.""" pass diff --git a/tpot/metrics.py b/tpot/metrics.py index 98df0666c..6c35af84c 100644 --- a/tpot/metrics.py +++ b/tpot/metrics.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -25,7 +24,7 @@ def balanced_accuracy(y_true, y_pred): - """Default scoring function: balanced accuracy + """Default scoring function: balanced accuracy. Balanced accuracy computes each class' accuracy on a per-class basis using a one-vs-rest encoding, then computes an unweighted average of the class accuracies. @@ -59,4 +58,5 @@ def balanced_accuracy(y_true, y_pred): return np.mean(all_class_accuracies) + SCORERS['balanced_accuracy'] = make_scorer(balanced_accuracy) diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index dfc0cc294..f5228a528 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -27,24 +26,24 @@ class Operator(object): - """Base class for operators in TPOT""" - def __init__(self): - pass + """Base class for operators in TPOT.""" + root = False # Whether this operator type can be the root of the tree import_hash = None sklearn_class = None arg_types = None - dep_op_list = {} # the estimator or score_func as params in this operators + dep_op_list = {} # the estimator or score_func as params in this operators class ARGType(object): - """Base class for parameter specifications""" - def __init__(self): - pass + """Base class for parameter specifications.""" + + pass def source_decode(sourcecode): - """ Decode operator source and import operator class + """Decode operator source and import operator class. + Parameters ---------- sourcecode: string @@ -73,10 +72,12 @@ def source_decode(sourcecode): except ImportError: print('Warning: {} is not available and will not be used by TPOT.'.format(sourcecode)) op_obj = None + return import_str, op_str, op_obj + def set_sample_weight(pipeline_steps, sample_weight=None): - """Recursively iterates through all objects in the pipeline and sets sample weight + """Recursively iterates through all objects in the pipeline and sets sample weight. Parameters ---------- @@ -96,19 +97,21 @@ def set_sample_weight(pipeline_steps, sample_weight=None): if inspect.getargspec(obj.fit).args.count('sample_weight'): step_sw = pname + '__sample_weight' sample_weight_dict[step_sw] = sample_weight + if sample_weight_dict: return sample_weight_dict else: return None + def ARGTypeClassFactory(classname, prange, BaseClass=ARGType): - """ - Dynamically create parameter type class - """ - return type(classname, (BaseClass,), {'values':prange}) + """Dynamically create parameter type class.""" + return type(classname, (BaseClass,), {'values': prange}) + def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass=ARGType): - """Dynamically create operator class + """Dynamically create operator class. 
+ Parameters ---------- opsourse: string @@ -130,14 +133,12 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass= a list of parameter class """ - - class_profile = {} - dep_op_list = {} import_str, op_str, op_obj = source_decode(opsourse) + if not op_obj: - return None, None # nothing return + return None, None else: # define if the operator can be the root of a pipeline if issubclass(op_obj, ClassifierMixin) or issubclass(op_obj, RegressorMixin): @@ -145,20 +146,22 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass= optype = "Classifier or Regressor" else: optype = "Preprocessor or Selector" + @classmethod def op_type(cls): - """Returns the type of the operator, e.g: - ("Classifier", "Regressor", "Selector", "Preprocessor") + """Return the operator type. + + Possible values: + "Classifier", "Regressor", "Selector", "Preprocessor" """ return optype class_profile['type'] = op_type - class_profile['sklearn_class'] = op_obj - import_hash = {} import_hash[import_str] = [op_str] arg_types = [] + for pname in sorted(opdict.keys()): prange = opdict[pname] if not isinstance(prange, dict): @@ -171,7 +174,7 @@ def op_type(cls): import_hash[import_str].append(dep_op_str) else: import_hash[dep_import_str] = [dep_op_str] - dep_op_list[pname]=dep_op_str + dep_op_list[pname] = dep_op_str if dval: for dpname in sorted(dval.keys()): dprange = dval[dpname] @@ -180,10 +183,10 @@ def op_type(cls): class_profile['arg_types'] = tuple(arg_types) class_profile['import_hash'] = import_hash class_profile['dep_op_list'] = dep_op_list + @classmethod def parameter_types(cls): - """Return tuple of argument types for calling of the operator and the - return type of the operator + """Return the argument and return types of an operator. Parameters ---------- @@ -198,12 +201,11 @@ def parameter_types(cls): """ return ([np.ndarray] + arg_types, np.ndarray) - class_profile['parameter_types'] = parameter_types + @classmethod def export(cls, *args): - """Represent the operator as a string so that it can be exported to a - file + """Represent the operator as a string so that it can be exported to a file. 
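# Illustration (assumption, not from this diff) of the dynamic class
# factories above: ARGTypeClassFactory turns each hyperparameter range into
# its own ARGType subclass whose `values` attribute carries the candidates.
# The Operator__parameter name below is a hypothetical example.
DemoKType = ARGTypeClassFactory('SelectKBest__k', range(1, 100))
print(DemoKType.__name__)          # SelectKBest__k
print(list(DemoKType.values)[:3])  # [1, 2, 3]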
Parameters ---------- @@ -218,28 +220,33 @@ def export(cls, *args): SklearnClassName(param1="val1", param2=val2) """ - op_arguments = [] + if dep_op_list: dep_op_arguments = {} + for arg_class, arg_value in zip(arg_types, args): if arg_value == "DEFAULT": continue aname_split = arg_class.__name__.split('__') if isinstance(arg_value, str): arg_value = '\"{}\"'.format(arg_value) - if len(aname_split) == 2: # simple parameter + if len(aname_split) == 2: # simple parameter op_arguments.append("{}={}".format(aname_split[-1], arg_value)) - else: # parameter of internal operator as a parameter in the operator, usually in Selector + # Parameter of internal operator as a parameter in the + # operator, usually in Selector + else: if not list(dep_op_list.values()).count(aname_split[1]): raise TypeError('Warning: the operator {} is not in right format in the operator dictionary'.format(aname_split[0])) else: if aname_split[1] not in dep_op_arguments: dep_op_arguments[aname_split[1]] = [] dep_op_arguments[aname_split[1]].append("{}={}".format(aname_split[-1], arg_value)) + tmp_op_args = [] if dep_op_list: - # to make sure the inital operators is the first parameter just for better persentation + # To make sure the inital operators is the first parameter just + # for better persentation for dep_op_pname, dep_op_str in dep_op_list.items(): if dep_op_str == 'f_classif': arg_value = dep_op_str diff --git a/tpot/tpot.py b/tpot/tpot.py index 41a6f8c21..8f85a23cc 100644 --- a/tpot/tpot.py +++ b/tpot/tpot.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -""" -Copyright 2015-Present Randal S. Olson +"""Copyright 2015-Present Randal S. Olson. This file is part of the TPOT library. @@ -26,18 +25,18 @@ class TPOTClassifier(TPOTBase): - """TPOT estimator for classification problems""" + """TPOT estimator for classification problems.""" scoring_function = 'accuracy' # Classification scoring - default_config_dict = classifier_config_dict # Classification dictionary + default_config_dict = classifier_config_dict # Classification dictionary classification = True regression = False class TPOTRegressor(TPOTBase): - """TPOT estimator for regression problems""" + """TPOT estimator for regression problems.""" scoring_function = 'neg_mean_squared_error' # Regression scoring - default_config_dict = regressor_config_dict # Regression dictionary + default_config_dict = regressor_config_dict # Regression dictionary classification = False regression = True From 2b8636d6b3d2c8afbc2960b24f15759e0b0d04ea Mon Sep 17 00:00:00 2001 From: teaearlgraycold Date: Fri, 21 Apr 2017 22:16:01 -0400 Subject: [PATCH 2/3] Use assert_raises in tests --- tests.py | 97 ++++++++++++++++---------------------------------------- 1 file changed, 27 insertions(+), 70 deletions(-) diff --git a/tests.py b/tests.py index c081d3ff2..c277b5cb7 100644 --- a/tests.py +++ b/tests.py @@ -41,6 +41,7 @@ from sklearn.model_selection import train_test_split, cross_val_score from deap import creator from tqdm import tqdm +from nose.tools import assert_raises # Set up the MNIST data set for testing mnist_data = load_digits() @@ -124,27 +125,24 @@ def test_init_default_scoring(): def test_invaild_score_warning(): """Assert that the TPOT fit function raises a ValueError when the scoring metrics is not available in SCORERS.""" - try: - TPOTClassifier(scoring='balanced_accuray') # typo for balanced_accuracy - assert False - except ValueError: - pass - try: - TPOTClassifier(scoring='balanced_accuracy') # correct one - assert True - except Exception: - assert False + # Mis-spelled 
scorer + assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray') + # Correctly spelled + TPOTClassifier(scoring='balanced_accuracy') def test_invaild_dataset_warning(): """Assert that the TPOT fit function raises a ValueError when dataset is not in right format.""" - tpot_obj = TPOTClassifier(random_state=42, population_size=1, offspring_size=2, generations=1, verbosity=0) - bad_training_classes = training_classes.reshape((1, len(training_classes))) # common mistake in classes - try: - tpot_obj.fit(training_features, bad_training_classes) # typo for balanced_accuracy - assert False - except ValueError: - pass + tpot_obj = TPOTClassifier( + random_state=42, + population_size=1, + offspring_size=2, + generations=1, + verbosity=0 + ) + # common mistake in classes + bad_training_classes = training_classes.reshape((1, len(training_classes))) + assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_classes) def test_init_max_time_mins(): @@ -201,11 +199,7 @@ def test_lite_params(): tpot_obj = TPOTRegressor(config_dict='TPOT light') assert tpot_obj.config_dict == regressor_config_dict_light - try: - tpot_obj = TPOTRegressor(config_dict='TPOT MDR') - assert False - except TypeError: - assert True + assert_raises(TypeError, TPOTRegressor, config_dict='TPOT MDR') def test_random_ind(): @@ -250,12 +244,7 @@ def test_random_ind_2(): def test_score(): """Assert that the TPOT score function raises a RuntimeError when no optimized pipeline exists.""" tpot_obj = TPOTClassifier() - - try: - tpot_obj.score(testing_features, testing_classes) - assert False # Should be unreachable - except RuntimeError: - pass + assert_raises(RuntimeError, tpot_obj.score, testing_features, testing_classes) def test_score_2(): @@ -358,12 +347,7 @@ def test_sample_weight_func(): def test_predict(): """Assert that the TPOT predict function raises a RuntimeError when no optimized pipeline exists.""" tpot_obj = TPOTClassifier() - - try: - tpot_obj.predict(testing_features) - assert False # Should be unreachable - except RuntimeError: - pass + assert_raises(RuntimeError, tpot_obj.predict, testing_features) def test_predict_2(): @@ -381,7 +365,6 @@ def test_predict_2(): tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) tpot_obj._fitted_pipeline.fit(training_features, training_classes) - result = tpot_obj.predict(testing_features) assert result.shape == (testing_features.shape[0],) @@ -424,16 +407,11 @@ def test_predict_proba2(): tpot_obj._fitted_pipeline.fit(training_features, training_classes) result = tpot_obj.predict_proba(testing_features) - rows = result.shape[0] - columns = result.shape[1] + rows, columns = result.shape - try: - for i in range(rows): - for j in range(columns): - float_range(result[i][j]) - assert True - except Exception: - assert False + for i in range(rows): + for j in range(columns): + float_range(result[i][j]) def test_warm_start(): @@ -545,12 +523,7 @@ def test_operators(): def test_export(): """Assert that TPOT's export function throws a RuntimeError when no optimized pipeline exists.""" tpot_obj = TPOTClassifier() - - try: - tpot_obj.export("test_export.py") - assert False # Should be unreachable - except RuntimeError: - pass + assert_raises(RuntimeError, tpot_obj.export, "test_export.py") def test_generate_pipeline_code(): @@ -813,11 +786,7 @@ def test_gen(): def test_positive_integer(): """Assert that the TPOT CLI interface's integer 
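# The pattern this commit applies throughout (illustrative, not from this
# diff): nose's assert_raises replaces try/except/assert-False blocks and
# fails the test unless the callable raises the named exception.
from nose.tools import assert_raises


def _must_be_non_negative(n):
    if n < 0:
        raise ValueError('negative input')
    return n


assert_raises(ValueError, _must_be_non_negative, -1)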
parsing throws an exception when n < 0.""" - try: - positive_integer('-1') - assert False # Should be unreachable - except Exception: - pass + assert_raises(Exception, positive_integer, '-1') def test_positive_integer_2(): @@ -827,11 +796,7 @@ def test_positive_integer_2(): def test_positive_integer_3(): """Assert that the TPOT CLI interface's integer parsing throws an exception when n is not an integer.""" - try: - positive_integer('foobar') - assert False # Should be unreachable - except Exception: - pass + assert_raises(Exception, positive_integer, 'foobar') def test_float_range(): @@ -841,17 +806,9 @@ def test_float_range(): def test_float_range_2(): """Assert that the TPOT CLI interface's float range throws an exception when input it out of range.""" - try: - float_range('2.0') - assert False # Should be unreachable - except Exception: - pass + assert_raises(Exception, float_range, '2.0') def test_float_range_3(): """Assert that the TPOT CLI interface's float range throws an exception when input is not a float.""" - try: - float_range('foobar') - assert False # Should be unreachable - except Exception: - pass + assert_raises(Exception, float_range, 'foobar') From e67714d80fd5d66cdc6d07d6f3bc0183c796d218 Mon Sep 17 00:00:00 2001 From: teaearlgraycold Date: Tue, 25 Apr 2017 16:44:26 -0400 Subject: [PATCH 3/3] Use np.allclose instead of is_close function in tests --- tests.py | 14 ++++---------- tpot/built_in_operators.py | 2 +- tpot/driver.py | 22 +--------------------- tpot/gp_deap.py | 9 ++++++--- 4 files changed, 12 insertions(+), 35 deletions(-) diff --git a/tests.py b/tests.py index c277b5cb7..174d0cd03 100644 --- a/tests.py +++ b/tests.py @@ -63,12 +63,6 @@ ) -# http://stackoverflow.com/questions/5595425/ -def is_close(a, b, rel_tol=1e-09, abs_tol=0.0): - """Determine if two floats are close in value, but not necessarily equal.""" - return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) - - def test_driver(): """Assert that the TPOT driver output normal result.""" batcmd = "python -m tpot.driver tests.csv -is , -target class -g 2 -p 2 -os 4 -cv 5 -s 45 -v 1" @@ -250,7 +244,7 @@ def test_score(): def test_score_2(): """Assert that the TPOTClassifier score function outputs a known score for a fix pipeline.""" tpot_obj = TPOTClassifier() - known_score = 0.977777777778 # Assumes use of the TPOT balanced_accuracy function + known_score = 0.977777777778 # Assumes use of the TPOT accuracy function # Reify pipeline with known score pipeline_string = ( @@ -267,7 +261,7 @@ def test_score_2(): # Get score from TPOT score = tpot_obj.score(testing_features, testing_classes) - assert is_close(known_score, score) + assert np.allclose(known_score, score) def test_score_3(): @@ -294,7 +288,7 @@ def test_score_3(): # Get score from TPOT score = tpot_obj.score(testing_features_r, testing_classes_r) - assert is_close(known_score, score) + assert np.allclose(known_score, score) def test_sample_weight_func(): @@ -341,7 +335,7 @@ def test_sample_weight_func(): assert np.allclose(cv_score1, cv_score2) assert not np.allclose(cv_score1, cv_score_weight) - assert is_close(known_score, score) + assert np.allclose(known_score, score) def test_predict(): diff --git a/tpot/built_in_operators.py b/tpot/built_in_operators.py index df0d8b59c..a7d49889f 100644 --- a/tpot/built_in_operators.py +++ b/tpot/built_in_operators.py @@ -74,5 +74,5 @@ class CombineDFs(object): @property def __name__(self): - """Instance ame is the same as the class name.""" + """Instance name is the same as the class name.""" 
return self.__class__.__name__ diff --git a/tpot/driver.py b/tpot/driver.py index 78d576478..d1e5fb176 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -22,7 +22,6 @@ import numpy as np import argparse from sklearn.model_selection import train_test_split -from sklearn.preprocessing import Imputer from .tpot import TPOTClassifier, TPOTRegressor from ._version import __version__ @@ -325,17 +324,6 @@ def _get_arg_parser(): ) ) - parser.add_argument( - '-impute', - action='store', - dest='IMPUTE', - default=None, - help=( - 'If set, TPOT will take the provided missing value string and ' - 'impute the value of all data points with that value.' - ) - ) - parser.add_argument( '-v', action='store', @@ -403,11 +391,6 @@ def _read_data_file(args): return input_data -def _impute_missing_values(features, missing_value): - imputer = Imputer(missing_values=missing_value) - return imputer.fit_transform(features) - - def main(): """Perform a TPOT run.""" args = _get_arg_parser().parse_args() @@ -422,9 +405,6 @@ def main(): axis=1 ) - if args.IMPUTE: - features = _impute_missing_values(features, args.IMPUTE) - training_features, testing_features, training_classes, testing_classes = \ train_test_split(features, input_data[args.TARGET_NAME], random_state=args.RANDOM_STATE) tpot_type = TPOTClassifier if args.TPOT_MODE == 'classification' else TPOTRegressor @@ -449,7 +429,7 @@ def main(): tpot.fit(training_features, training_classes) - if args.VERBOSITY < 3 and tpot._optimized_pipeline: + if args.VERBOSITY in [1, 2] and tpot._optimized_pipeline: training_score = max([x.wvalues[1] for x in tpot._pareto_front.keys]) print('\nTraining score: {}'.format(abs(training_score))) print('Holdout score: {}'.format(tpot.score(testing_features, testing_classes))) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index c649f506d..ef3ac4385 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -34,6 +34,9 @@ import warnings import threading +# Limit loops to generate a different individual by crossover/mutation +MAX_MUT_LOOPS = 50 + def varOr(population, toolbox, lambda_, cxpb, mutpb): """Part of an evolutionary algorithm applying only the variation part @@ -70,12 +73,12 @@ def varOr(population, toolbox, lambda_, cxpb, mutpb): offspring = [] for _ in range(lambda_): op_choice = np.random.random() - if op_choice < cxpb: # Apply crossover + if op_choice < cxpb: # Apply crossover idxs = np.random.randint(0, len(population), size=2) ind1, ind2 = toolbox.clone(population[idxs[0]]), toolbox.clone(population[idxs[1]]) ind_str = str(ind1) num_loop = 0 - while ind_str == str(ind1) and num_loop < 50: # 50 loops at most to generate a different individual by crossover + while ind_str == str(ind1) and num_loop < MAX_MUT_LOOPS: ind1, ind2 = toolbox.mate(ind1, ind2) num_loop += 1 if ind_str != str(ind1): # check if crossover happened @@ -86,7 +89,7 @@ def varOr(population, toolbox, lambda_, cxpb, mutpb): ind = toolbox.clone(population[idx]) ind_str = str(ind) num_loop = 0 - while ind_str == str(ind) and num_loop < 50: # 50 loops at most to generate a different individual by mutation + while ind_str == str(ind) and num_loop < MAX_MUT_LOOPS: ind, = toolbox.mutate(ind) num_loop += 1 if ind_str != str(ind): # check if mutation happened
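# A closing illustration (not from this diff) of why the hand-rolled
# is_close helper could be dropped: np.allclose already handles the scalar
# comparisons these tests perform, with rtol=1e-05 and atol=1e-08 defaults.
import numpy as np

assert np.allclose(0.977777777778, 0.977777777779)
assert not np.allclose(0.9777, 0.9778)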