diff --git a/docs_sources/api.md b/docs_sources/api.md index 773cd1f1b..9580d0ecf 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -168,21 +168,21 @@ The update checker will tell you when a new version of TPOT has been released.
The best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.-_pareto_front_fitted_pipelines: Python dictionary +pareto_front_fitted_pipelines_: Python dictionary
Dictionary containing all the pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.-_evaluated_individuals: Python dictionary +evaluated_individuals_: Python dictionary
The TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.
-Note: _pareto_front_fitted_pipelines is only available when verbosity=3. +Note: pareto_front_fitted_pipelines_ is only available when verbosity=3.
Dictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, quality metric for the pipeline).
diff --git a/tests.py b/tests.py index dedebc65c..f282fb780 100644 --- a/tests.py +++ b/tests.py @@ -234,7 +234,7 @@ def test_init_custom_parameters(): assert tpot_obj.warm_start is True assert tpot_obj.verbosity == 1 assert tpot_obj._optimized_pipeline is None - assert tpot_obj._fitted_pipeline is None + assert tpot_obj.fitted_pipeline_ is None assert not (tpot_obj._pset is None) assert not (tpot_obj._toolbox is None) @@ -326,9 +326,9 @@ def test_timeout(): "ExtraTreesRegressor__n_estimators=100)" ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) # test _wrapped_cross_val_score with cv=20 so that it is impossible to finish in 1 second - return_value = _wrapped_cross_val_score(tpot_obj._fitted_pipeline, + return_value = _wrapped_cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_classes_r, cv=20, @@ -492,8 +492,8 @@ def test_score_2(): ')' ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj._fitted_pipeline.fit(training_features, training_classes) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features, training_classes) # Get score from TPOT score = tpot_obj.score(testing_features, testing_classes) @@ -518,8 +518,8 @@ def test_score_3(): "ExtraTreesRegressor__n_estimators=100)" ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r) + tpot_obj.fitted_pipeline_ = 
tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features_r, training_classes_r) # Get score from TPOT score = tpot_obj.score(testing_features_r, testing_classes_r) @@ -545,27 +545,27 @@ def test_sample_weight_func(): "ExtraTreesRegressor__n_estimators=100)" ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features_r, training_classes_r) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) # make up a sample weight training_classes_r_weight = np.array(range(1, len(training_classes_r)+1)) - training_classes_r_weight_dict = set_sample_weight(tpot_obj._fitted_pipeline.steps, training_classes_r_weight) + training_classes_r_weight_dict = set_sample_weight(tpot_obj.fitted_pipeline_.steps, training_classes_r_weight) np.random.seed(42) - cv_score1 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error') + cv_score1 = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error') np.random.seed(42) - cv_score2 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error') + cv_score2 = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error') np.random.seed(42) - cv_score_weight = 
cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_classes_r_weight_dict) + cv_score_weight = cross_val_score(tpot_obj.fitted_pipeline_, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_classes_r_weight_dict) np.random.seed(42) - tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict) + tpot_obj.fitted_pipeline_.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict) # Get score from TPOT known_score = 11.5790430757 score = tpot_obj.score(testing_features_r, testing_classes_r) @@ -612,8 +612,8 @@ def test_predict_2(): ')' ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj._fitted_pipeline.fit(training_features, training_classes) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features, training_classes) result = tpot_obj.predict(testing_features) assert result.shape == (testing_features.shape[0],) @@ -630,8 +630,8 @@ def test_predict_proba(): 'DecisionTreeClassifier__min_samples_split=5)' ) tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj._fitted_pipeline.fit(training_features, training_classes) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features, training_classes) result = tpot_obj.predict_proba(testing_features) num_labels = np.amax(testing_classes) + 1 @@ -651,8 +651,8 @@ def test_predict_proba2(): 'DecisionTreeClassifier__min_samples_split=5)' ) tpot_obj._optimized_pipeline = 
creator.Individual.from_string(pipeline_string, tpot_obj._pset) - tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) - tpot_obj._fitted_pipeline.fit(training_features, training_classes) + tpot_obj.fitted_pipeline_ = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline) + tpot_obj.fitted_pipeline_.fit(training_features, training_classes) result = tpot_obj.predict_proba(testing_features) rows, columns = result.shape @@ -725,8 +725,8 @@ def test_fit3(): assert not (tpot_obj._start_datetime is None) -def test_evaluated_individuals(): - """Assert that _evaluated_individuals stores corrent pipelines and their CV scores.""" +def test_evaluated_individuals_(): + """Assert that evaluated_individuals_ stores corrent pipelines and their CV scores.""" tpot_obj = TPOTClassifier( random_state=42, population_size=2, @@ -736,8 +736,8 @@ def test_evaluated_individuals(): config_dict='TPOT light' ) tpot_obj.fit(training_features, training_classes) - assert isinstance(tpot_obj._evaluated_individuals, dict) - for pipeline_string in sorted(tpot_obj._evaluated_individuals.keys()): + assert isinstance(tpot_obj.evaluated_individuals_, dict) + for pipeline_string in sorted(tpot_obj.evaluated_individuals_.keys()): deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset) sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) tpot_obj._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) @@ -747,8 +747,8 @@ def test_evaluated_individuals(): mean_cv_scores = np.mean(cv_scores) except: mean_cv_scores = -float('inf') - assert np.allclose(tpot_obj._evaluated_individuals[pipeline_string][1], mean_cv_scores) - assert np.allclose(tpot_obj._evaluated_individuals[pipeline_string][0], operator_count) + assert np.allclose(tpot_obj.evaluated_individuals_[pipeline_string][1], mean_cv_scores) + assert np.allclose(tpot_obj.evaluated_individuals_[pipeline_string][0], operator_count) def test_evaluate_individuals(): diff 
--git a/tpot/base.py b/tpot/base.py index 10f2c1531..11e1db0da 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -201,7 +201,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._pareto_front = None self._optimized_pipeline = None - self._fitted_pipeline = None + self.fitted_pipeline_ = None self._fitted_imputer = None self._pop = None self.warm_start = warm_start @@ -257,7 +257,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, # Dictionary of individuals that have already been evaluated in previous # generations - self._evaluated_individuals = {} + self.evaluated_individuals_ = {} self.random_state = random_state # If the user passed a custom scoring function, store it in the sklearn @@ -425,7 +425,7 @@ def fit(self, features, classes, sample_weight=None, groups=None): TPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values. As such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed) using - median value imputation. + median value imputation. If you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. @@ -562,11 +562,11 @@ def pareto_eq(ind1, ind2): 'TPOTClassifier object. 
Please make sure you ' 'passed the data to TPOT correctly.') else: - self._fitted_pipeline = self._toolbox.compile(expr=self._optimized_pipeline) + self.fitted_pipeline_ = self._toolbox.compile(expr=self._optimized_pipeline) with warnings.catch_warnings(): warnings.simplefilter('ignore') - self._fitted_pipeline.fit(features, classes) + self.fitted_pipeline_.fit(features, classes) if self.verbosity in [1, 2]: # Add an extra line of spacing if the progress bar was used @@ -575,14 +575,14 @@ def pareto_eq(ind1, ind2): print('Best pipeline: {}'.format(self._optimized_pipeline)) # Store and fit the entire Pareto front if sciencing - elif self.verbosity >= 3 and self._pareto_front: - self._pareto_front_fitted_pipelines = {} + elif self._pareto_front: + self.pareto_front_fitted_pipelines_ = {} for pipeline in self._pareto_front.items: - self._pareto_front_fitted_pipelines[str(pipeline)] = self._toolbox.compile(expr=pipeline) + self.pareto_front_fitted_pipelines_[str(pipeline)] = self._toolbox.compile(expr=pipeline) with warnings.catch_warnings(): warnings.simplefilter('ignore') - self._pareto_front_fitted_pipelines[str(pipeline)].fit(features, classes) + self.pareto_front_fitted_pipelines_[str(pipeline)].fit(features, classes) break except (KeyboardInterrupt, SystemExit, Exception) as e: @@ -614,7 +614,7 @@ def predict(self, features): Predicted classes for the samples in the feature matrix """ - if not self._fitted_pipeline: + if not self.fitted_pipeline_: raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') features = features.astype(np.float64) @@ -622,7 +622,7 @@ def predict(self, features): if np.any(np.isnan(features)): features = self._impute_values(features) - return self._fitted_pipeline.predict(features) + return self.fitted_pipeline_.predict(features) def fit_predict(self, features, classes): """Call fit and predict in sequence. 
@@ -659,13 +659,13 @@ def score(self, testing_features, testing_classes): The estimated test set accuracy """ - if self._fitted_pipeline is None: + if self.fitted_pipeline_ is None: raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') # If the scoring function is a string, we must adjust to use the sklearn # scoring interface score = SCORERS[self.scoring_function]( - self._fitted_pipeline, + self.fitted_pipeline_, testing_features.astype(np.float64), testing_classes.astype(np.float64) ) @@ -685,12 +685,12 @@ def predict_proba(self, features): The class probabilities of the input samples """ - if not self._fitted_pipeline: + if not self.fitted_pipeline_: raise RuntimeError('A pipeline has not yet been optimized. Please call fit() first.') else: - if not(hasattr(self._fitted_pipeline, 'predict_proba')): + if not(hasattr(self.fitted_pipeline_, 'predict_proba')): raise RuntimeError('The fitted pipeline does not have the predict_proba() function.') - return self._fitted_pipeline.predict_proba(features.astype(np.float64)) + return self.fitted_pipeline_.predict_proba(features.astype(np.float64)) def set_params(self, **params): """Set the parameters of TPOT. @@ -859,9 +859,9 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight=No if not self._pbar.disable: self._pbar.update(1) # Check if the individual was evaluated before - elif individual_str in self._evaluated_individuals: + elif individual_str in self.evaluated_individuals_: # Get fitness score from previous evaluation - fitnesses_dict[indidx] = self._evaluated_individuals[individual_str] + fitnesses_dict[indidx] = self.evaluated_individuals_[individual_str] if self.verbosity > 2: self._pbar.write('Pipeline encountered that has previously been evaluated during the ' 'optimization process. 
Using the score from the previous evaluation.') @@ -923,8 +923,8 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight=No for resulting_score, operator_count, individual_str, test_idx in zip(resulting_score_list, operator_count_list, eval_individuals_str, test_idx_list): if type(resulting_score) in [float, np.float64, np.float32]: - self._evaluated_individuals[individual_str] = (max(1, operator_count), resulting_score) - fitnesses_dict[test_idx] = self._evaluated_individuals[individual_str] + self.evaluated_individuals_[individual_str] = (max(1, operator_count), resulting_score) + fitnesses_dict[test_idx] = self.evaluated_individuals_[individual_str] else: raise ValueError('Scoring function does not return a float.')