diff --git a/docs_sources/using.md b/docs_sources/using.md
index cbe219ad9..4e0e13ae8 100644
--- a/docs_sources/using.md
+++ b/docs_sources/using.md
@@ -95,11 +95,6 @@ See the section on scoring functions for more de
-| subsample |
-Float in the range (0.0, 1.0] |
-Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.
| n_jobs |
Any positive integer or -1 |
Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process.
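Note: the surviving n_jobs row above covers parallel pipeline evaluation. A minimal usage sketch; the digits dataset and the train/test split are illustrative stand-ins, not part of this change:

```python
# Minimal sketch: parallel pipeline evaluation via n_jobs.
# TPOTClassifier and its parameters come from the TPOT API shown in this diff.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

tpot = TPOTClassifier(generations=2, population_size=10, n_jobs=-1, verbosity=2)
tpot.fit(X_train, y_train)  # candidate pipelines are scored on all available CPUs
print(tpot.score(X_test, y_test))
```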
diff --git a/tests.py b/tests.py
index b9924cd06..92fdfc030 100644
--- a/tests.py
+++ b/tests.py
@@ -239,7 +239,7 @@ def test_init_default_scoring():
def test_invaild_score_warning():
- """Assert that the TPOT intitializes raises a ValueError when the scoring metrics is not available in SCORERS."""
+ """Assert that the TPOT fit function raises a ValueError when the scoring metrics is not available in SCORERS."""
# Mis-spelled scorer
assert_raises(ValueError, TPOTClassifier, scoring='balanced_accuray')
# Correctly spelled
@@ -260,14 +260,6 @@ def test_invaild_dataset_warning():
assert_raises(ValueError, tpot_obj.fit, training_features, bad_training_classes)
-def test_invaild_subsample_ratio_warning():
- """Assert that the TPOT intitializes raises a ValueError when subsample ratio is not in the range (0.0, 1.0]."""
- # Invalid ratio
- assert_raises(ValueError, TPOTClassifier, subsample=0.0)
- # Valid ratio
- TPOTClassifier(subsample=0.1)
-
-
def test_init_max_time_mins():
"""Assert that the TPOT init stores max run time and sets generations to 1000000."""
tpot_obj = TPOTClassifier(max_time_mins=30, generations=1000)
@@ -622,22 +614,6 @@ def test_fit2():
assert not (tpot_obj._start_datetime is None)
-def test_fit3():
- """Assert that the TPOT fit function provides an optimized pipeline with subsample is 0.8"""
- tpot_obj = TPOTClassifier(
- random_state=42,
- population_size=1,
- offspring_size=2,
- generations=1,
- subsample=0.8,
- verbosity=0
- )
- tpot_obj.fit(training_features, training_classes)
-
- assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
- assert not (tpot_obj._start_datetime is None)
-
-
def test_evaluated_individuals():
"""Assert that _evaluated_individuals stores corrent pipelines and their CV scores."""
tpot_obj = TPOTClassifier(
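For anyone who relied on the deleted test_fit3, a smoke test in the same style minus the removed subsample argument could look like the sketch below. The test name is hypothetical; training_features, training_classes, and creator are the module-level fixtures and imports these tests already use:

```python
def test_fit_without_subsample():
    """Assert that fit still yields an optimized pipeline on the full (default) training set."""
    tpot_obj = TPOTClassifier(
        random_state=42,
        population_size=1,
        offspring_size=2,
        generations=1,
        verbosity=0
    )
    tpot_obj.fit(training_features, training_classes)

    # Same assertions as the deleted test, just without subsample=0.8.
    assert isinstance(tpot_obj._optimized_pipeline, creator.Individual)
    assert tpot_obj._start_datetime is not None
```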
diff --git a/tpot/base.py b/tpot/base.py
index cddc94fda..a2a1f72c4 100644
--- a/tpot/base.py
+++ b/tpot/base.py
@@ -40,7 +40,6 @@
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import VotingClassifier
-from sklearn.model_selection import train_test_split
from sklearn.metrics.scorer import make_scorer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
@@ -82,7 +81,7 @@ class TPOTBase(BaseEstimator):
def __init__(self, generations=100, population_size=100, offspring_size=None,
mutation_rate=0.9, crossover_rate=0.1,
- scoring=None, cv=5, subsample=1.0, n_jobs=1,
+ scoring=None, cv=5, n_jobs=1,
max_time_mins=None, max_eval_time_mins=5,
random_state=None, config_dict=None, warm_start=False,
verbosity=0, disable_update_check=False):
@@ -123,7 +122,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
Offers the same options as sklearn.model_selection.cross_val_score as well as
a built-in score "balanced_accuracy":
- ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
+        ['accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',
'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted',
'precision', 'precision_macro', 'precision_micro', 'precision_samples',
'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro',
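Aside: the docstring above enumerates the accepted scoring strings. A minimal sketch of selecting one by name:

```python
from tpot import TPOTClassifier

# Any string from the list above is accepted; 'balanced_accuracy' is the
# TPOT built-in the docstring calls out.
tpot = TPOTClassifier(scoring='balanced_accuracy', cv=5)
```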
@@ -131,9 +130,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
cv: int (default: 5)
Number of folds to evaluate each pipeline over in k-fold cross-validation
during the TPOT optimization process.
- subsample: float (default: 1.0)
- Subsample ratio of the training instance. Setting it to 0.5 means that TPOT
- randomly collects half of training samples for pipeline optimization process.
n_jobs: int (default: 1)
Number of CPUs for evaluating pipelines in parallel during the TPOT
optimization process. Assigning this to -1 will use as many cores as available
@@ -266,11 +262,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
self.scoring_function = scoring
self.cv = cv
- self.subsample = subsample
- if self.subsample <= 0.0 or self.subsample > 1.0:
- raise ValueError(
- 'The subsample ratio of the training instance must be in the range (0.0, 1.0].'
- )
# If the OS is windows, reset cpu number to 1 since the OS did not have multiprocessing module
if sys.platform.startswith('win') and n_jobs != 1:
print(
@@ -401,9 +392,9 @@ def fit(self, features, classes, sample_weight=None):
"""Fit an optimitzed machine learning pipeline.
Uses genetic programming to optimize a machine learning pipeline that
- maximizes score on the provided features and classes. Performs an internal
- stratified training/testing cross-validaton split to avoid overfitting
- on the provided data.
+ maximizes classification score on the provided features and classes.
+        Performs an internal stratified training/testing cross-validation split
+ to avoid overfitting on the provided data.
Parameters
----------
@@ -430,24 +421,10 @@ def fit(self, features, classes, sample_weight=None):
try:
clf = clf.fit(features, classes)
except Exception:
- raise ValueError(
- 'Error: Input data is not in a valid format. '
- 'Please confirm that the input data is scikit-learn compatible. '
- 'For example, the features must be a 2-D array and target labels '
- 'must be a 1-D array.'
- )
-
- # Randomly collect a subsample of training samples for pipeline optimization process.
- if self.subsample < 1.0:
- features, _, classes, _ = train_test_split(features, classes, train_size=self.subsample, random_state=self.random_state)
- # Raise a warning message if the training size is less than 1500 when subsample is not default value
- if features.shape[0] < 1500:
- print(
- 'Warning: Although subsample can accelerate pipeline optimization process, '
- 'too small training sample size may cause unpredictable effect on maximizing '
- 'score in pipeline optimization process. Increasing subsample ratio may get '
- 'a more reasonable outcome from optimization process in TPOT.'
- )
+ raise ValueError('Error: Input data is not in a valid format. '
+ 'Please confirm that the input data is scikit-learn compatible. '
+ 'For example, the features must be a 2-D array and target labels '
+ 'must be a 1-D array.')
# Set the seed for the GP run
if self.random_state is not None:
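With the internal subsampling path deleted, callers who want the old behavior can reproduce it before calling fit, using the same sklearn helper the removed code called. A minimal sketch, assuming a ratio of 0.8 (the value the deleted test_fit3 used); the digits dataset is an illustrative stand-in:

```python
# Minimal sketch: reproduce the deleted internal subsampling outside TPOT.
# The removed code ran train_test_split(features, classes,
# train_size=self.subsample, random_state=self.random_state) before the
# optimization loop; doing the same by hand is equivalent.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

features, classes = load_digits(return_X_y=True)

subsample = 0.8  # must lie in (0.0, 1.0], as the removed __init__ check enforced
features, _, classes, _ = train_test_split(
    features, classes, train_size=subsample, random_state=42
)

tpot_obj = TPOTClassifier(generations=1, population_size=1, offspring_size=2,
                          random_state=42, verbosity=0)
tpot_obj.fit(features, classes)
```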
diff --git a/tpot/driver.py b/tpot/driver.py
index 3c7faba8f..fcac41601 100755
--- a/tpot/driver.py
+++ b/tpot/driver.py
@@ -259,18 +259,6 @@ def _get_arg_parser():
)
)
- parser.add_argument(
- '-sub',
- action='store',
- dest='SUBSAMPLE',
- default=1.0,
- type=float,
- help=(
- 'Subsample ratio of the training instance. Setting it to 0.5 means that TPOT'
- 'randomly collects half of training samples for pipeline optimization process.'
- )
- )
-
parser.add_argument(
'-njobs',
action='store',
@@ -426,7 +414,6 @@ def main(args):
mutation_rate=args.MUTATION_RATE,
crossover_rate=args.CROSSOVER_RATE,
cv=args.NUM_CV_FOLDS,
- subsample=args.SUBSAMPLE,
n_jobs=args.NUM_JOBS,
scoring=args.SCORING_FN,
max_time_mins=args.MAX_TIME_MINS,
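Incidentally, the deleted -sub help string was missing a space between its two adjacent literals ('...TPOT' 'randomly...' would concatenate to 'TPOTrandomly'), which is moot now. For reference, a standalone sketch of the add_argument pattern this parser uses, shown with the surviving -njobs flag; the dest name is taken from the main() call above:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '-njobs',
    action='store',
    dest='NUM_JOBS',
    default=1,
    type=int,
    help='Number of CPUs for evaluating pipelines in parallel.'
)

args = parser.parse_args(['-njobs', '2'])
print(args.NUM_JOBS)  # -> 2
```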