Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs_sources/using.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ See the section on <a href="#scoringfunctions">scoring functions</a> for more de
<td>Any integer >1</td>
<td>Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.</td>
</tr>
<tr>
<td>-sub</td>
<td>SUBSAMPLE</td>
<td>(0.0, 1.0]</td>
<td>Subsample ratio of the training instances. Setting it to 0.5 means that TPOT randomly selects half of the training samples for the pipeline optimization process.</td>
</tr>
<tr>
<td>-njobs</td>
<td>NUM_JOBS</td>
Expand Down Expand Up @@ -243,6 +248,11 @@ See the section on <a href="#scoringfunctions">scoring functions</a> for more de
<td>Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.</td>
</tr>
<tr>
<td>subsample</td>
<td>(0.0, 1.0]</td>
<td>Subsample ratio of the training instances. Setting it to 0.5 means that TPOT randomly selects half of the training samples for the pipeline optimization process.</td>
</tr>
<tr>
<td>n_jobs</td>
<td>Any positive integer or -1</td>
<td>Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process.
Expand Down
26 changes: 25 additions & 1 deletion tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def test_init_default_scoring():


def test_invaild_score_warning():
"""Assert that the TPOT fit function raises a ValueError when the scoring metrics is not available in SCORERS."""
"""Assert that the TPOT intitializes raises a ValueError when the scoring metrics is not available in SCORERS."""
# Mis-spelled scorer
assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray')
# Correctly spelled
Expand All @@ -259,6 +259,14 @@ def test_invaild_dataset_warning():
assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_classes)


def test_invaild_subsample_ratio_warning():
    """Assert that TPOT initialization raises a ValueError when the subsample ratio is not in the range (0.0, 1.0]."""
    # Invalid ratios: the lower bound is exclusive, and values below 0.0 or
    # above 1.0 are out of range.
    for bad_ratio in (0.0, -0.1, 1.1):
        assert_raises(ValueError, TPOTClassifier, subsample=bad_ratio)
    # Valid ratios: an interior value and the inclusive upper bound.
    for good_ratio in (0.1, 1.0):
        TPOTClassifier(subsample=good_ratio)


def test_init_max_time_mins():
"""Assert that the TPOT init stores max run time and sets generations to 1000000."""
tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000)
Expand Down Expand Up @@ -612,6 +620,22 @@ def test_fit2():
assert not (tpot_obj._start_datetime is None)


def test_fit3():
    """Assert that the TPOT fit function provides an optimized pipeline when subsample is 0.8."""
    # Smallest possible GP run that still exercises the subsampling path.
    settings = dict(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        subsample=0.8,
        verbosity=0,
    )
    tpot_obj = TPOTClassifier(**settings)
    tpot_obj.fit(training_features, training_classes)

    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None


def test_evaluated_individuals():
"""Assert that _evaluated_individuals stores corrent pipelines and their CV scores."""
tpot_obj = TPOTClassifier(
Expand Down
41 changes: 32 additions & 9 deletions tpot/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.scorer import make_scorer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

Expand Down Expand Up @@ -80,7 +81,7 @@ class TPOTBase(BaseEstimator):

def __init__(self, generations=100, population_size=100, offspring_size=None,
mutation_rate=0.9, crossover_rate=0.1,
scoring=None, cv=5, n_jobs=1,
scoring=None, cv=5, subsample=1.0, n_jobs=1,
max_time_mins=None, max_eval_time_mins=5,
random_state=None, config_dict=None, warm_start=False,
verbosity=0, disable_update_check=False):
Expand Down Expand Up @@ -121,14 +122,17 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
Offers the same options as sklearn.model_selection.cross_val_score as well as
a built-in score "balanced_accuracy":

['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced accuracy',
['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
'precision', 'precision_macro', 'precision_micro', 'precision_samples',
'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
'recall_samples', 'recall_weighted', 'roc_auc']
cv: int (default: 5)
Number of folds to evaluate each pipeline over in k-fold cross-validation
during the TPOT optimization process.
subsample: float (default: 1.0)
Subsample ratio of the training instance. Setting it to 0.5 means that TPOT
randomly collects half of training samples for pipeline optimization process.
n_jobs: int (default: 1)
Number of CPUs for evaluating pipelines in parallel during the TPOT
optimization process. Assigning this to -1 will use as many cores as available
Expand Down Expand Up @@ -261,6 +265,11 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
self.scoring_function = scoring

self.cv = cv
self.subsample = subsample
if self.subsample <= 0.0 or self.subsample > 1.0:
raise ValueError(
'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
)
# If the OS is windows, reset cpu number to 1 since the OS did not have multiprocessing module
if sys.platform.startswith('win') and n_jobs != 1:
print(
Expand Down Expand Up @@ -395,9 +404,9 @@ def fit(self, features, classes, sample_weight=None):
"""Fit an optimitzed machine learning pipeline.

Uses genetic programming to optimize a machine learning pipeline that
maximizes classification score on the provided features and classes.
Performs an internal stratified training/testing cross-validaton split
to avoid overfitting on the provided data.
maximizes score on the provided features and classes. Performs an internal
stratified training/testing cross-validaton split to avoid overfitting
on the provided data.

Parameters
----------
Expand All @@ -424,10 +433,24 @@ def fit(self, features, classes, sample_weight=None):
try:
clf = clf.fit(features, classes)
except Exception:
raise ValueError('Error: Input data is not in a valid format. '
'Please confirm that the input data is scikit-learn compatible. '
'For example, the features must be a 2-D array and target labels '
'must be a 1-D array.')
raise ValueError(
'Error: Input data is not in a valid format. '
'Please confirm that the input data is scikit-learn compatible. '
'For example, the features must be a 2-D array and target labels '
'must be a 1-D array.'
)

# Randomly collect a subsample of training samples for pipeline optimization process.
if self.subsample < 1.0:
features, _, classes, _ = train_test_split(features, classes, train_size=self.subsample, random_state=self.random_state)
# Raise a warning message if the training size is less than 1500 when subsample is not default value
if features.shape[0] < 1500:
print(
'Warning: Although subsample can accelerate pipeline optimization process, '
'too small training sample size may cause unpredictable effect on maximizing '
'score in pipeline optimization process. Increasing subsample ratio may get '
'a more reasonable outcome from optimization process in TPOT.'
)

# Set the seed for the GP run
if self.random_state is not None:
Expand Down
13 changes: 13 additions & 0 deletions tpot/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,18 @@ def _get_arg_parser():
)
)

parser.add_argument(
    '-sub',
    action='store',
    dest='SUBSAMPLE',
    default=1.0,
    type=float,
    # Adjacent string literals are joined with no separator, so the first
    # fragment must end with a space to keep "TPOT" and "randomly" apart.
    help=(
        'Subsample ratio of the training instance. Setting it to 0.5 means that TPOT '
        'randomly collects half of training samples for pipeline optimization process.'
    )
)

parser.add_argument(
'-njobs',
action='store',
Expand Down Expand Up @@ -414,6 +426,7 @@ def main(args):
mutation_rate=args.MUTATION_RATE,
crossover_rate=args.CROSSOVER_RATE,
cv=args.NUM_CV_FOLDS,
subsample=args.SUBSAMPLE,
n_jobs=args.NUM_JOBS,
scoring=args.SCORING_FN,
max_time_mins=args.MAX_TIME_MINS,
Expand Down