Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs_sources/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,21 +168,21 @@ The update checker will tell you when a new version of TPOT has been released.
<tr>
<td width="20%" style="vertical-align:top; background:#F5F5F5;"><strong>Attributes:</strong></td>
<td width="80%" style="background:white;">
<strong>_fitted_pipeline</strong>: scikit-learn Pipeline object
<strong>fitted_pipeline_</strong>: scikit-learn Pipeline object
<blockquote>
The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.
</blockquote>

<strong>_pareto_front_fitted_pipelines</strong>: Python dictionary
<strong>pareto_front_fitted_pipelines_</strong>: Python dictionary
<blockquote>
Dictionary containing all the pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.
<br /><br />
The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.
<br /><br />
Note: <em>_pareto_front_fitted_pipelines</em> is only available when <em>verbosity</em>=3.
Note: <em>pareto_front_fitted_pipelines_</em> is only available when <em>verbosity</em>=3.
</blockquote>

<strong>_evaluated_individuals</strong>: Python dictionary
<strong>evaluated_individuals_</strong>: Python dictionary
<blockquote>
Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, quality metric for the pipeline).
<br /><br />
Expand Down
54 changes: 27 additions & 27 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def test_init_custom_parameters():
assert tpot_obj.warm_start is True
assert tpot_obj.verbosity == 1
assert tpot_obj._optimized_pipeline is None
assert tpot_obj._fitted_pipeline is None
assert tpot_obj.fitted_pipeline_ is None
assert not (tpot_obj._pset is None)
assert not (tpot_obj._toolbox is None)

Expand Down Expand Up @@ -326,9 +326,9 @@ def test_timeout():
"ExtraTreesRegressor__n_estimators=100)"
)
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
# test _wrapped_cross_val_score with cv=20 so that it is impossible to finish in 1 second
return_value = _wrapped_cross_val_score(tpot_obj._fitted_pipeline,
return_value = _wrapped_cross_val_score(tpot_obj.fitted_pipeline_,
training_features_r,
training_classes_r,
cv=20,
Expand Down Expand Up @@ -492,8 +492,8 @@ def test_score_2():
')'
)
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)
tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj.fitted_pipeline_.fit(training_features, training_classes)
# Get score from TPOT
score = tpot_obj.score(testing_features, testing_classes)

Expand All @@ -518,8 +518,8 @@ def test_score_3():
"ExtraTreesRegressor__n_estimators=100)"
)
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)
tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj.fitted_pipeline_.fit(training_features_r, training_classes_r)

# Get score from TPOT
score = tpot_obj.score(testing_features_r, testing_classes_r)
Expand All @@ -545,27 +545,27 @@ def test_sample_weight_func():
"ExtraTreesRegressor__n_estimators=100)"
)
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)
tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj.fitted_pipeline_.fit(training_features_r, training_classes_r)

tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)

# make up a sample weight
training_classes_r_weight = np.array(range(1, len(training_classes_r)+1))
training_classes_r_weight_dict = set_sample_weight(tpot_obj._fitted_pipeline.steps, training_classes_r_weight)
training_classes_r_weight_dict = set_sample_weight(tpot_obj.fitted_pipeline_.steps, training_classes_r_weight)

np.random.seed(42)
cv_score1 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')
cv_score1 = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')

np.random.seed(42)
cv_score2 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')
cv_score2 = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')

np.random.seed(42)
cv_score_weight = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_classes_r_weight_dict)
cv_score_weight = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_classes_r_weight_dict)

np.random.seed(42)
tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict)
tpot_obj.fitted_pipeline_.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict)
# Get score from TPOT
known_score = 11.5790430757
score = tpot_obj.score(testing_features_r, testing_classes_r)
Expand Down Expand Up @@ -612,8 +612,8 @@ def test_predict_2():
')'
)
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)
tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj.fitted_pipeline_.fit(training_features, training_classes)
result = tpot_obj.predict(testing_features)

assert result.shape == (testing_features.shape[0],)
Expand All @@ -630,8 +630,8 @@ def test_predict_proba():
'DecisionTreeClassifier__min_samples_split=5)'
)
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)
tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj.fitted_pipeline_.fit(training_features, training_classes)

result = tpot_obj.predict_proba(testing_features)
num_labels = np.amax(testing_classes) + 1
Expand All @@ -651,8 +651,8 @@ def test_predict_proba2():
'DecisionTreeClassifier__min_samples_split=5)'
)
tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj._fitted_pipeline.fit(training_features, training_classes)
tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
tpot_obj.fitted_pipeline_.fit(training_features, training_classes)

result = tpot_obj.predict_proba(testing_features)
rows, columns = result.shape
Expand Down Expand Up @@ -725,8 +725,8 @@ def test_fit3():
assert not (tpot_obj._start_datetime is None)


def test_evaluated_individuals():
"""Assert that _evaluated_individuals stores corrent pipelines and their CV scores."""
def test_evaluated_individuals_():
"""Assert that evaluated_individuals_ stores correct pipelines and their CV scores."""
tpot_obj = TPOTClassifier(
random_state=42,
population_size=2,
Expand All @@ -736,8 +736,8 @@ def test_evaluated_individuals():
config_dict='TPOT light'
)
tpot_obj.fit(training_features, training_classes)
assert isinstance(tpot_obj._evaluated_individuals, dict)
for pipeline_string in sorted(tpot_obj._evaluated_individuals.keys()):
assert isinstance(tpot_obj.evaluated_individuals_, dict)
for pipeline_string in sorted(tpot_obj.evaluated_individuals_.keys()):
deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42)
Expand All @@ -747,8 +747,8 @@ def test_evaluated_individuals():
mean_cv_scores = np.mean(cv_scores)
except:
mean_cv_scores = -float('inf')
assert np.allclose(tpot_obj._evaluated_individuals[pipeline_string][1], mean_cv_scores)
assert np.allclose(tpot_obj._evaluated_individuals[pipeline_string][0], operator_count)
assert np.allclose(tpot_obj.evaluated_individuals_[pipeline_string][1], mean_cv_scores)
assert np.allclose(tpot_obj.evaluated_individuals_[pipeline_string][0], operator_count)


def test_evaluate_individuals():
Expand Down
40 changes: 20 additions & 20 deletions tpot/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,

self._pareto_front = None
self._optimized_pipeline = None
self._fitted_pipeline = None
self.fitted_pipeline_ = None
self._fitted_imputer = None
self._pop = None
self.warm_start = warm_start
Expand Down Expand Up @@ -257,7 +257,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,

# Dictionary of individuals that have already been evaluated in previous
# generations
self._evaluated_individuals = {}
self.evaluated_individuals_ = {}
self.random_state = random_state

# If the user passed a custom scoring function, store it in the sklearn
Expand Down Expand Up @@ -425,7 +425,7 @@ def fit(self, features, classes, sample_weight=None, groups=None):
TPOT and all scikit-learn algorithms assume that the features will be numerical
and there will be no missing values. As such, when a feature matrix is provided
to TPOT, all missing values will automatically be replaced (i.e., imputed) using
median value imputation.
median value imputation.

If you wish to use a different imputation strategy than median imputation, please
make sure to apply imputation to your feature set prior to passing it to TPOT.
Expand Down Expand Up @@ -562,11 +562,11 @@ def pareto_eq(ind1, ind2):
'TPOTClassifier object. Please make sure you '
'passed the data to TPOT correctly.')
else:
self._fitted_pipeline = self._toolbox.compile(expr=self._optimized_pipeline)
self.fitted_pipeline_ = self._toolbox.compile(expr=self._optimized_pipeline)

with warnings.catch_warnings():
warnings.simplefilter('ignore')
self._fitted_pipeline.fit(features, classes)
self.fitted_pipeline_.fit(features, classes)

if self.verbosity in [1, 2]:
# Add an extra line of spacing if the progress bar was used
Expand All @@ -575,14 +575,14 @@ def pareto_eq(ind1, ind2):
print('Best pipeline: {}'.format(self._optimized_pipeline))

# Store and fit the entire Pareto front if sciencing
elif self.verbosity >= 3 and self._pareto_front:
self._pareto_front_fitted_pipelines = {}
elif self._pareto_front:
self.pareto_front_fitted_pipelines_ = {}

for pipeline in self._pareto_front.items:
self._pareto_front_fitted_pipelines[str(pipeline)] = self._toolbox.compile(expr=pipeline)
self.pareto_front_fitted_pipelines_[str(pipeline)] = self._toolbox.compile(expr=pipeline)
with warnings.catch_warnings():
warnings.simplefilter('ignore')
self._pareto_front_fitted_pipelines[str(pipeline)].fit(features, classes)
self.pareto_front_fitted_pipelines_[str(pipeline)].fit(features, classes)
break

except (KeyboardInterrupt, SystemExit, Exception) as e:
Expand Down Expand Up @@ -614,15 +614,15 @@ def predict(self, features):
Predicted classes for the samples in the feature matrix

"""
if not self._fitted_pipeline:
if not self.fitted_pipeline_:
raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.')

features = features.astype(np.float64)

if np.any(np.isnan(features)):
features = self._impute_values(features)

return self._fitted_pipeline.predict(features)
return self.fitted_pipeline_.predict(features)

def fit_predict(self, features, classes):
"""Call fit and predict in sequence.
Expand Down Expand Up @@ -659,13 +659,13 @@ def score(self, testing_features, testing_classes):
The estimated test set accuracy

"""
if self._fitted_pipeline is None:
if self.fitted_pipeline_ is None:
raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.')

# If the scoring function is a string, we must adjust to use the sklearn
# scoring interface
score = SCORERS[self.scoring_function](
self._fitted_pipeline,
self.fitted_pipeline_,
testing_features.astype(np.float64),
testing_classes.astype(np.float64)
)
Expand All @@ -685,12 +685,12 @@ def predict_proba(self, features):
The class probabilities of the input samples

"""
if not self._fitted_pipeline:
if not self.fitted_pipeline_:
raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.')
else:
if not(hasattr(self._fitted_pipeline, 'predict_proba')):
if not(hasattr(self.fitted_pipeline_, 'predict_proba')):
raise RuntimeError('The fitted pipeline does not have the predict_proba() function.')
return self._fitted_pipeline.predict_proba(features.astype(np.float64))
return self.fitted_pipeline_.predict_proba(features.astype(np.float64))

def set_params(self, **params):
"""Set the parameters of TPOT.
Expand Down Expand Up @@ -859,9 +859,9 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight=No
if not self._pbar.disable:
self._pbar.update(1)
# Check if the individual was evaluated before
elif individual_str in self._evaluated_individuals:
elif individual_str in self.evaluated_individuals_:
# Get fitness score from previous evaluation
fitnesses_dict[indidx] = self._evaluated_individuals[individual_str]
fitnesses_dict[indidx] = self.evaluated_individuals_[individual_str]
if self.verbosity > 2:
self._pbar.write('Pipeline encountered that has previously been evaluated during the '
'optimization process. Using the score from the previous evaluation.')
Expand Down Expand Up @@ -923,8 +923,8 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight=No

for resulting_score, operator_count, individual_str, test_idx in zip(resulting_score_list, operator_count_list, eval_individuals_str, test_idx_list):
if type(resulting_score) in [float, np.float64, np.float32]:
self._evaluated_individuals[individual_str] = (max(1, operator_count), resulting_score)
fitnesses_dict[test_idx] = self._evaluated_individuals[individual_str]
self.evaluated_individuals_[individual_str] = (max(1, operator_count), resulting_score)
fitnesses_dict[test_idx] = self.evaluated_individuals_[individual_str]
else:
raise ValueError('Scoring function does not return a float.')

Expand Down