From 0c71438a838e524fbf825fe68016aa853b08138f Mon Sep 17 00:00:00 2001 From: Weixuan Date: Fri, 12 May 2017 11:22:20 -0400 Subject: [PATCH] Revert "Add subsample option in TPOT" --- docs_sources/using.md | 10 ---------- tests.py | 26 +------------------------- tpot/base.py | 41 +++++++++-------------------------------- tpot/driver.py | 13 ------------- 4 files changed, 10 insertions(+), 80 deletions(-) diff --git a/docs_sources/using.md b/docs_sources/using.md index cbe219ad9..4e0e13ae8 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -95,11 +95,6 @@ See the section on scoring functions for more de Any integer >1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. --sub -SUBSAMPLE -(0.0, 1.0] -Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. - -njobs NUM_JOBS @@ -248,11 +243,6 @@ See the section on scoring functions for more de Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -subsample -(0.0, 1.0] -Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. - - n_jobs Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. diff --git a/tests.py b/tests.py index b9924cd06..92fdfc030 100644 --- a/tests.py +++ b/tests.py @@ -239,7 +239,7 @@ def test_init_default_scoring(): def test_invaild_score_warning(): - """Assert that the TPOT intitializes raises a ValueError when the scoring metrics is not available in SCORERS.""" + """Assert that the TPOT fit function raises a ValueError when the scoring metrics is not available in SCORERS.""" # Mis-spelled scorer assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray') # Correctly spelled @@ -260,14 +260,6 @@ def test_invaild_dataset_warning(): assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_classes) -def test_invaild_subsample_ratio_warning(): - """Assert that the TPOT intitializes raises a ValueError when subsample ratio is not in the range (0.0, 1.0].""" - # Invalid ratio - assert_raises(ValueError, TPOTClassifier, subsample=0.0) - # Valid ratio - TPOTClassifier(subsample=0.1) - - def test_init_max_time_mins(): """Assert that the TPOT init stores max run time and sets generations to 1000000.""" tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000) @@ -622,22 +614,6 @@ def test_fit2(): assert not (tpot_obj._start_datetime is None) -def test_fit3(): - """Assert that the TPOT fit function provides an optimized pipeline with subsample is 0.8""" - tpot_obj = TPOTClassifier( - random_state=42, - population_size=1, - offspring_size=2, - generations=1, - subsample=0.8, - verbosity=0 - ) - tpot_obj.fit(training_features, training_classes) - - assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) - assert not (tpot_obj._start_datetime is None) - - def test_evaluated_individuals(): """Assert that _evaluated_individuals stores corrent pipelines and their CV scores.""" tpot_obj = TPOTClassifier( diff --git a/tpot/base.py b/tpot/base.py index cddc94fda..a2a1f72c4 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -40,7 +40,6 @@ from sklearn.pipeline import make_pipeline, make_union from sklearn.preprocessing import FunctionTransformer from sklearn.ensemble import VotingClassifier -from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier @@ -82,7 +81,7 @@ class TPOTBase(BaseEstimator): def __init__(self, generations=100, population_size=100, offspring_size=None, mutation_rate=0.9, crossover_rate=0.1, - scoring=None, cv=5, subsample=1.0, n_jobs=1, + scoring=None, cv=5, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, warm_start=False, verbosity=0, disable_update_check=False): @@ -123,7 +122,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, Offers the same options as sklearn.model_selection.cross_val_score as well as a built-in score "balanced_accuracy": - ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', + ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', @@ -131,9 +130,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, cv: int (default: 5) Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. - subsample: float (default: 1.0) - Subsample ratio of the training instance. Setting it to 0.5 means that TPOT - randomly collects half of training samples for pipeline optimization process. n_jobs: int (default: 1) Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available @@ -266,11 +262,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self.scoring_function = scoring self.cv = cv - self.subsample = subsample - if self.subsample <= 0.0 or self.subsample > 1.0: - raise ValueError( - 'The subsample ratio of the training instance must be in the range (0.0, 1.0].' - ) # If the OS is windows, reset cpu number to 1 since the OS did not have multiprocessing module if sys.platform.startswith('win') and n_jobs != 1: print( @@ -401,9 +392,9 @@ def fit(self, features, classes, sample_weight=None): """Fit an optimitzed machine learning pipeline. Uses genetic programming to optimize a machine learning pipeline that - maximizes score on the provided features and classes. Performs an internal - stratified training/testing cross-validaton split to avoid overfitting - on the provided data. + maximizes classification score on the provided features and classes. + Performs an internal stratified training/testing cross-validaton split + to avoid overfitting on the provided data. Parameters ---------- @@ -430,24 +421,10 @@ def fit(self, features, classes, sample_weight=None): try: clf = clf.fit(features, classes) except Exception: - raise ValueError( - 'Error: Input data is not in a valid format. ' - 'Please confirm that the input data is scikit-learn compatible. ' - 'For example, the features must be a 2-D array and target labels ' - 'must be a 1-D array.' - ) - - # Randomly collect a subsample of training samples for pipeline optimization process. - if self.subsample < 1.0: - features, _, classes, _ = train_test_split(features, classes, train_size=self.subsample, random_state=self.random_state) - # Raise a warning message if the training size is less than 1500 when subsample is not default value - if features.shape[0] < 1500: - print( - 'Warning: Although subsample can accelerate pipeline optimization process, ' - 'too small training sample size may cause unpredictable effect on maximizing ' - 'score in pipeline optimization process. Increasing subsample ratio may get ' - 'a more reasonable outcome from optimization process in TPOT.' - ) + raise ValueError('Error: Input data is not in a valid format. ' + 'Please confirm that the input data is scikit-learn compatible. ' + 'For example, the features must be a 2-D array and target labels ' + 'must be a 1-D array.') # Set the seed for the GP run if self.random_state is not None: diff --git a/tpot/driver.py b/tpot/driver.py index 3c7faba8f..fcac41601 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -259,18 +259,6 @@ def _get_arg_parser(): ) ) - parser.add_argument( - '-sub', - action='store', - dest='SUBSAMPLE', - default=1.0, - type=float, - help=( - 'Subsample ratio of the training instance. Setting it to 0.5 means that TPOT' - 'randomly collects half of training samples for pipeline optimization process.' - ) - ) - parser.add_argument( '-njobs', action='store', @@ -426,7 +414,6 @@ def main(args): mutation_rate=args.MUTATION_RATE, crossover_rate=args.CROSSOVER_RATE, cv=args.NUM_CV_FOLDS, - subsample=args.SUBSAMPLE, n_jobs=args.NUM_JOBS, scoring=args.SCORING_FN, max_time_mins=args.MAX_TIME_MINS,