EpistasisLab · rhiever · May 12, 2017 · May 3, 2017 · May 3, 2017 · May 3, 2017
diff --git a/docs_sources/using.md b/docs_sources/using.md
@@ -95,6 +95,11 @@ See the section on <a href="#scoringfunctions">scoring functions</a> for more de
 <td>Any integer >1</td>
 <td>Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.</td>
 </tr>
+<tr><td>-sub</td>
+<td>SUBSAMPLE</td>
+<td>(0.0, 1.0]</td>
+<td>Subsample ratio of the training instances. Setting it to 0.5 means that TPOT randomly collects half of the training samples for the pipeline optimization process.</td>
+</tr>
 <tr>
 <td>-njobs</td>
 <td>NUM_JOBS</td>
@@ -243,6 +248,11 @@ See the section on <a href="#scoringfunctions">scoring functions</a> for more de
 <td>Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.</td>
 </tr>
 <tr>
+<td>subsample</td>
+<td>(0.0, 1.0]</td>
+<td>Subsample ratio of the training instances. Setting it to 0.5 means that TPOT randomly collects half of the training samples for the pipeline optimization process.</td>
+</tr>
+<tr>
 <td>n_jobs</td>
 <td>Any positive integer or -1</td>
 <td>Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process.

diff --git a/tests.py b/tests.py
@@ -238,7 +238,7 @@ def test_init_default_scoring():
 
 
 def test_invaild_score_warning():
-    """Assert that the TPOT fit function raises a ValueError when the scoring metrics is not available in SCORERS."""
+    """Assert that the TPOT initializer raises a ValueError when the scoring metric is not available in SCORERS."""
     # Mis-spelled scorer
     assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray')
     # Correctly spelled
@@ -259,6 +259,14 @@ def test_invaild_dataset_warning():
     assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_classes)
 
 
+def test_invaild_subsample_ratio_warning():
+    """Assert that the TPOT initializer raises a ValueError when the subsample ratio is not in the range (0.0, 1.0]."""
+    # Invalid ratios: 0.0 is excluded by the open lower bound, 1.1 exceeds the upper bound
+    assert_raises(ValueError, TPOTClassifier, subsample=0.0)
+    assert_raises(ValueError, TPOTClassifier, subsample=1.1)
+    TPOTClassifier(subsample=0.1)  # Valid ratio: must construct without raising
+
+
 def test_init_max_time_mins():
     """Assert that the TPOT init stores max run time and sets generations to 1000000."""
     tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000)
@@ -612,6 +620,22 @@ def test_fit2():
     assert not (tpot_obj._start_datetime is None)
 
 
+def test_fit3():
+    """Assert that fit() yields an optimized pipeline when subsample is 0.8."""
+    gp_settings = dict(
+        random_state=42,
+        population_size=1,
+        offspring_size=2,
+        generations=1,
+        subsample=0.8,
+        verbosity=0,
+    )
+    clf = TPOTClassifier(**gp_settings)
+    clf.fit(training_features, training_classes)
+    assert isinstance(clf._optimized_pipeline, creator.Individual)
+    assert clf._start_datetime is not None
+
+
 def test_evaluated_individuals():
     """Assert that _evaluated_individuals stores corrent pipelines and their CV scores."""
     tpot_obj = TPOTClassifier(

diff --git a/tpot/base.py b/tpot/base.py
@@ -40,6 +40,7 @@
 from sklearn.pipeline import make_pipeline, make_union
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.ensemble import VotingClassifier
+from sklearn.model_selection import train_test_split
 from sklearn.metrics.scorer import make_scorer
 from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
 
@@ -80,7 +81,7 @@ class TPOTBase(BaseEstimator):
 
     def __init__(self, generations=100, population_size=100, offspring_size=None,
                  mutation_rate=0.9, crossover_rate=0.1,
-                 scoring=None, cv=5, n_jobs=1,
+                 scoring=None, cv=5, subsample=1.0, n_jobs=1,
                  max_time_mins=None, max_eval_time_mins=5,
                  random_state=None, config_dict=None, warm_start=False,
                  verbosity=0, disable_update_check=False):
@@ -121,14 +122,17 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
             Offers the same options as sklearn.model_selection.cross_val_score as well as
             a built-in score "balanced_accuracy":
 
-            ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced accuracy',
+            ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
             'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
             'precision', 'precision_macro', 'precision_micro', 'precision_samples',
             'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
             'recall_samples', 'recall_weighted', 'roc_auc']
         cv: int (default: 5)
             Number of folds to evaluate each pipeline over in k-fold cross-validation
             during the TPOT optimization process.
+        subsample: float (default: 1.0)
+            Subsample ratio of the training instances. Setting it to 0.5 means that TPOT
+            randomly collects half of the training samples for the pipeline optimization process.
         n_jobs: int (default: 1)
             Number of CPUs for evaluating pipelines in parallel during the TPOT
             optimization process. Assigning this to -1 will use as many cores as available
@@ -261,6 +265,11 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
                 self.scoring_function = scoring
 
         self.cv = cv
+        self.subsample = subsample
+        if self.subsample <= 0.0 or self.subsample > 1.0:
+            raise ValueError(
+                'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
+            )
         # If the OS is windows, reset cpu number to 1 since the OS did not have multiprocessing module
         if sys.platform.startswith('win') and n_jobs != 1:
             print(
@@ -395,9 +404,9 @@ def fit(self, features, classes, sample_weight=None):
         """Fit an optimitzed machine learning pipeline.
 
         Uses genetic programming to optimize a machine learning pipeline that
-        maximizes classification score on the provided features and classes.
-        Performs an internal stratified training/testing cross-validaton split
-        to avoid overfitting on the provided data.
+        maximizes score on the provided features and classes. Performs an internal
+        stratified training/testing cross-validation split to avoid overfitting
+        on the provided data.
 
         Parameters
         ----------
@@ -424,10 +433,24 @@ def fit(self, features, classes, sample_weight=None):
         try:
             clf = clf.fit(features, classes)
         except Exception:
-            raise ValueError('Error: Input data is not in a valid format. '
-                             'Please confirm that the input data is scikit-learn compatible. '
-                             'For example, the features must be a 2-D array and target labels '
-                             'must be a 1-D array.')
+            raise ValueError(
+                            'Error: Input data is not in a valid format. '
+                            'Please confirm that the input data is scikit-learn compatible. '
+                            'For example, the features must be a 2-D array and target labels '
+                            'must be a 1-D array.'
+                            )
+
+        # Randomly collect a subsample of training samples for pipeline optimization process.
+        if self.subsample < 1.0:
+            features, _, classes, _ = train_test_split(features, classes, train_size=self.subsample, random_state=self.random_state)
+            # Warn when fewer than 1500 samples remain after subsampling. NOTE(review): sample_weight is not subsampled with features/classes here; confirm it still lines up downstream.
+            if features.shape[0] < 1500:
+                print(
+                    'Warning: Although subsample can accelerate pipeline optimization process, '
+                    'too small training sample size may cause unpredictable effect on maximizing '
+                    'score in pipeline optimization process. Increasing subsample ratio may get '
+                    'a more reasonable outcome from optimization process in TPOT.'
+                    )
 
         # Set the seed for the GP run
         if self.random_state is not None:

diff --git a/tpot/driver.py b/tpot/driver.py
@@ -259,6 +259,18 @@ def _get_arg_parser():
         )
     )
 
+    parser.add_argument(
+        '-sub',
+        action='store',
+        dest='SUBSAMPLE',
+        default=1.0,
+        type=float,  # range (0.0, 1.0] is enforced by the TPOT constructor, not here
+        help=(
+            'Subsample ratio of the training instance. Setting it to 0.5 means that TPOT '
+            'randomly collects half of training samples for pipeline optimization process.'
+        )
+    )
+
     parser.add_argument(
         '-njobs',
         action='store',
@@ -414,6 +426,7 @@ def main(args):
         mutation_rate=args.MUTATION_RATE,
         crossover_rate=args.CROSSOVER_RATE,
         cv=args.NUM_CV_FOLDS,
+        subsample=args.SUBSAMPLE,
         n_jobs=args.NUM_JOBS,
         scoring=args.SCORING_FN,
         max_time_mins=args.MAX_TIME_MINS,