From 0c71438a838e524fbf825fe68016aa853b08138f Mon Sep 17 00:00:00 2001
From: Weixuan <weixuanf@upenn.edu>
Date: Fri, 12 May 2017 11:22:20 -0400
Subject: [PATCH] Revert "Add subsample option in TPOT"

---
 docs_sources/using.md | 10 ----------
 tests.py              | 26 +-------------------------
 tpot/base.py          | 41 +++++++++--------------------------------
 tpot/driver.py        | 13 -------------
 4 files changed, 10 insertions(+), 80 deletions(-)
diff --git a/docs_sources/using.md b/docs_sources/using.md
index cbe219ad9..4e0e13ae8 100644
--- a/docs_sources/using.md
+++ b/docs_sources/using.md
@@ -95,11 +95,6 @@ See the section on <a href="#scoringfunctions">scoring functions</a> for more de
 <td>Any integer >1</td>
 <td>Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.</td>
 </tr>
-<td>-sub</td>
-<td>SUBSAMPLE</td>
-<td>(0.0, 1.0]</td>
-<td>Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.</td>
-</tr>
 <tr>
 <td>-njobs</td>
 <td>NUM_JOBS</td>
@@ -248,11 +243,6 @@ See the section on <a href="#scoringfunctions">scoring functions</a> for more de
 <td>Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.</td>
 </tr>
 <tr>
-<td>subsample</td>
-<td>(0.0, 1.0]</td>
-<td>Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.</td>
-</tr>
-<tr>
 <td>n_jobs</td>
 <td>Any positive integer or -1</td>
 <td>Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process.
diff --git a/tests.py b/tests.py
index b9924cd06..92fdfc030 100644
--- a/tests.py
+++ b/tests.py
@@ -239,7 +239,7 @@ def test_init_default_scoring():
 
 
 def test_invaild_score_warning():
-    """Assert that the TPOT intitializes raises a ValueError when the scoring metrics is not available in SCORERS."""
+    """Assert that the TPOT initializer raises a ValueError when the scoring metric is not available in SCORERS."""
     # Mis-spelled scorer
     assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray')
     # Correctly spelled
@@ -260,14 +260,6 @@ def test_invaild_dataset_warning():
     assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_classes)
 
 
-def test_invaild_subsample_ratio_warning():
-    """Assert that the TPOT intitializes raises a ValueError when subsample ratio is not in the range (0.0, 1.0]."""
-    # Invalid ratio
-    assert_raises(ValueError, TPOTClassifier, subsample=0.0)
-    # Valid ratio
-    TPOTClassifier(subsample=0.1)
-
-
 def test_init_max_time_mins():
     """Assert that the TPOT init stores max run time and sets generations to 1000000."""
     tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000)
@@ -622,22 +614,6 @@ def test_fit2():
     assert not (tpot_obj._start_datetime is None)
 
 
-def test_fit3():
-    """Assert that the TPOT fit function provides an optimized pipeline with subsample is 0.8"""
-    tpot_obj = TPOTClassifier(
-        random_state=42,
-        population_size=1,
-        offspring_size=2,
-        generations=1,
-        subsample=0.8,
-        verbosity=0
-    )
-    tpot_obj.fit(training_features, training_classes)
-
-    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
-    assert not (tpot_obj._start_datetime is None)
-
-
 def test_evaluated_individuals():
     """Assert that _evaluated_individuals stores corrent pipelines and their CV scores."""
     tpot_obj = TPOTClassifier(
diff --git a/tpot/base.py b/tpot/base.py
index cddc94fda..a2a1f72c4 100644
--- a/tpot/base.py
+++ b/tpot/base.py
@@ -40,7 +40,6 @@
 from sklearn.pipeline import make_pipeline, make_union
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.ensemble import VotingClassifier
-from sklearn.model_selection import train_test_split
 from sklearn.metrics.scorer import make_scorer
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 
@@ -82,7 +81,7 @@ class TPOTBase(BaseEstimator):
 
     def __init__(self, generations=100, population_size=100, offspring_size=None,
                  mutation_rate=0.9, crossover_rate=0.1,
-                 scoring=None, cv=5, subsample=1.0, n_jobs=1,
+                 scoring=None, cv=5, n_jobs=1,
                  max_time_mins=None, max_eval_time_mins=5,
                  random_state=None, config_dict=None, warm_start=False,
                  verbosity=0, disable_update_check=False):
@@ -123,7 +122,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
             Offers the same options as sklearn.model_selection.cross_val_score as well as
             a built-in score "balanced_accuracy":
 
-            ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
+            ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
             'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
             'precision', 'precision_macro', 'precision_micro', 'precision_samples',
             'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
@@ -131,9 +130,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
         cv: int (default: 5)
             Number of folds to evaluate each pipeline over in k-fold cross-validation
             during the TPOT optimization process.
-        subsample: float (default: 1.0)
-            Subsample ratio of the training instance. Setting it to 0.5 means that TPOT
-            randomly collects half of training samples for pipeline optimization process.
         n_jobs: int (default: 1)
             Number of CPUs for evaluating pipelines in parallel during the TPOT
             optimization process. Assigning this to -1 will use as many cores as available
@@ -266,11 +262,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
                 self.scoring_function = scoring
 
         self.cv = cv
-        self.subsample = subsample
-        if self.subsample <= 0.0 or self.subsample > 1.0:
-            raise ValueError(
-                'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
-            )
         # If the OS is windows, reset cpu number to 1 since the OS did not have multiprocessing module
         if sys.platform.startswith('win') and n_jobs != 1:
             print(
@@ -401,9 +392,9 @@ def fit(self, features, classes, sample_weight=None):
         """Fit an optimitzed machine learning pipeline.
 
         Uses genetic programming to optimize a machine learning pipeline that
-        maximizes score on the provided features and classes. Performs an internal
-        stratified training/testing cross-validaton split to avoid overfitting
-        on the provided data.
+        maximizes classification score on the provided features and classes.
+        Performs an internal stratified training/testing cross-validation split
+        to avoid overfitting on the provided data.
 
         Parameters
         ----------
@@ -430,24 +421,10 @@ def fit(self, features, classes, sample_weight=None):
         try:
             clf = clf.fit(features, classes)
         except Exception:
-            raise ValueError(
-                            'Error: Input data is not in a valid format. '
-                            'Please confirm that the input data is scikit-learn compatible. '
-                            'For example, the features must be a 2-D array and target labels '
-                            'must be a 1-D array.'
-                            )
-
-        # Randomly collect a subsample of training samples for pipeline optimization process.
-        if self.subsample < 1.0:
-            features, _, classes, _ = train_test_split(features, classes, train_size=self.subsample, random_state=self.random_state)
-            # Raise a warning message if the training size is less than 1500 when subsample is not default value
-            if features.shape[0] < 1500:
-                print(
-                    'Warning: Although subsample can accelerate pipeline optimization process, '
-                    'too small training sample size may cause unpredictable effect on maximizing '
-                    'score in pipeline optimization process. Increasing subsample ratio may get '
-                    'a more reasonable outcome from optimization process in TPOT.'
-                    )
+            raise ValueError('Error: Input data is not in a valid format. '
+                             'Please confirm that the input data is scikit-learn compatible. '
+                             'For example, the features must be a 2-D array and target labels '
+                             'must be a 1-D array.')
 
         # Set the seed for the GP run
         if self.random_state is not None:
diff --git a/tpot/driver.py b/tpot/driver.py
index 3c7faba8f..fcac41601 100755
--- a/tpot/driver.py
+++ b/tpot/driver.py
@@ -259,18 +259,6 @@ def _get_arg_parser():
         )
     )
 
-    parser.add_argument(
-        '-sub',
-        action='store',
-        dest='SUBSAMPLE',
-        default=1.0,
-        type=float,
-        help=(
-            'Subsample ratio of the training instance. Setting it to 0.5 means that TPOT'
-            'randomly collects half of training samples for pipeline optimization process.'
-        )
-    )
-
     parser.add_argument(
         '-njobs',
         action='store',
@@ -426,7 +414,6 @@ def main(args):
         mutation_rate=args.MUTATION_RATE,
         crossover_rate=args.CROSSOVER_RATE,
         cv=args.NUM_CV_FOLDS,
-        subsample=args.SUBSAMPLE,
         n_jobs=args.NUM_JOBS,
         scoring=args.SCORING_FN,
         max_time_mins=args.MAX_TIME_MINS,