-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Less freezing time in multiprocessing with large arrays #440
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 32 commits
a79079b
fa1acfb
5fab09a
ed3bdf7
5e32488
39cff19
eafc240
e50814c
c10ba2e
6bc031c
3019275
0b3680c
91caa55
6b00655
d9c1863
4517abb
94b4a37
3ce128b
02fd277
af98b96
9cafac7
5447fe0
1f97655
23ca6d3
3ce4a30
6515732
633e9e8
ac77725
dd7df4e
a6ff510
dcf640e
4d87038
ec96ecd
7978f7d
24c030f
0382753
ac3a086
39ac993
224a9bc
c20d911
a4956d4
7cea3bf
454f54a
dc40489
1fc2860
18927b0
568f55d
37c1529
179fdf1
7b1eb27
fd2f1c3
c3b2167
cccf676
211eed9
00fc6ff
1e0a8c4
d8e1904
af01d55
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,3 +5,5 @@ scikit-learn==0.18.1 | |
| scipy==0.19.0 | ||
| tqdm==4.11.2 | ||
| update-checker==0.16 | ||
| stopit==1.1.1 | ||
| dask==0.14.1 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,7 @@ | |
| from functools import partial | ||
| from datetime import datetime | ||
| from multiprocessing import cpu_count | ||
| from dask import compute, delayed, multiprocessing | ||
|
|
||
| import numpy as np | ||
| import deap | ||
|
|
@@ -35,12 +36,11 @@ | |
| from copy import copy | ||
|
|
||
| from sklearn.base import BaseEstimator | ||
| from sklearn.externals.joblib import Parallel, delayed | ||
| from sklearn.pipeline import make_pipeline, make_union | ||
| from sklearn.preprocessing import FunctionTransformer | ||
| from sklearn.ensemble import VotingClassifier | ||
| from sklearn.metrics.scorer import make_scorer | ||
| from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier | ||
| from sklearn.utils import check_X_y | ||
|
|
||
| from update_checker import update_check | ||
|
|
||
|
|
@@ -191,6 +191,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, | |
| self.generations = generations | ||
| self.max_time_mins = max_time_mins | ||
| self.max_eval_time_mins = max_eval_time_mins | ||
| self.max_eval_time_seconds = max(int(self.max_eval_time_mins * 60), 1) | ||
|
|
||
| # Set offspring_size equal to population_size by default | ||
| if offspring_size: | ||
|
|
@@ -260,17 +261,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, | |
| self.scoring_function = scoring | ||
|
|
||
| self.cv = cv | ||
| # If the OS is windows, reset cpu number to 1 since the OS did not have multiprocessing module | ||
| if sys.platform.startswith('win') and n_jobs != 1: | ||
| print( | ||
| 'Warning: Although parallelization is currently supported in ' | ||
| 'TPOT for Windows, pressing Ctrl+C will freeze the optimization ' | ||
| 'process without saving the best pipeline! Thus, Please DO NOT ' | ||
| 'press Ctrl+C during the optimization procss if n_jobs is not ' | ||
| 'equal to 1. For quick test in Windows, please set n_jobs to 1 ' | ||
| 'for saving the best pipeline in the middle of the optimization ' | ||
| 'process via Ctrl+C.' | ||
| ) | ||
|
|
||
| if n_jobs == -1: | ||
| self.n_jobs = cpu_count() | ||
| else: | ||
|
|
@@ -396,22 +387,25 @@ def fit(self, features, classes, sample_weight=None): | |
| None | ||
|
|
||
| """ | ||
| features = features.astype(np.float64) | ||
|
|
||
| # Check that the input data is formatted correctly for scikit-learn | ||
| if self.classification: | ||
| clf = DecisionTreeClassifier(max_depth=5) | ||
| else: | ||
| clf = DecisionTreeRegressor(max_depth=5) | ||
|
|
||
| # Check that the input data is formatted correctly for scikit-learn and convert classes to np.float64 | ||
| try: | ||
| clf = clf.fit(features, classes) | ||
| features, classes = check_X_y(features, classes) | ||
| except Exception: | ||
| raise ValueError('Error: Input data is not in a valid format. ' | ||
| 'Please confirm that the input data is scikit-learn compatible. ' | ||
| 'For example, the features must be a 2-D array and target labels ' | ||
| 'must be a 1-D array.') | ||
|
|
||
| if (features.shape[0] > 10000 or features.shape[1] > 100) and self.n_jobs !=1: | ||
| print( | ||
| 'Warning: Although parallelization is currently supported in TPOT, ' | ||
| 'a known freezing issue in joblib has been reported with large datasets. ' | ||
| 'Parallelization with large datasets may freeze or crash the optimization ' | ||
| 'process without time controls by max_eval_time_mins! Please set n_jobs to 1 ' | ||
| 'if freezing or crash happened. However, scikit-learn also uses joblib in ' | ||
| 'multiple estimators so that freezing may also happen with n_jobs=1.' | ||
| ) | ||
|
|
||
| # Set the seed for the GP run | ||
| if self.random_state is not None: | ||
| random.seed(self.random_state) # deap uses random | ||
|
|
@@ -771,24 +765,32 @@ def _evaluate_individuals(self, individuals, features, classes, sample_weight=No | |
|
|
||
| # evaluate pipeline | ||
| resulting_score_list = [] | ||
| # chunk size for pbar update | ||
| for chunk_idx in range(0, len(sklearn_pipeline_list), self.n_jobs * 4): | ||
| jobs = [] | ||
| for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + self.n_jobs * 4]: | ||
| job = delayed(_wrapped_cross_val_score)( | ||
| sklearn_pipeline, | ||
| features, | ||
| classes, | ||
| self.cv, | ||
| self.scoring_function, | ||
| sample_weight, | ||
| self.max_eval_time_mins | ||
| ) | ||
| jobs.append(job) | ||
| parallel = Parallel(n_jobs=self.n_jobs, verbose=0, pre_dispatch='2*n_jobs') | ||
| tmp_result_score = parallel(jobs) | ||
| # update pbar | ||
| for val in tmp_result_score: | ||
| if self.n_jobs == 1: | ||
| tmp_scores = [_wrapped_cross_val_score( | ||
| sklearn_pipeline, | ||
| features, | ||
| classes, | ||
| self.cv, | ||
| self.scoring_function, | ||
| sample_weight, | ||
| timeout=self.max_eval_time_seconds | ||
| ) for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + self.n_jobs * 4]] | ||
| else: | ||
| jobs = [] | ||
| for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + self.n_jobs * 4]: | ||
| job = delayed(_wrapped_cross_val_score)( | ||
| sklearn_pipeline, | ||
| features, | ||
| classes, | ||
| self.cv, | ||
| self.scoring_function, | ||
| sample_weight, | ||
| timeout=self.max_eval_time_seconds | ||
| ) | ||
| jobs.append(job) | ||
| tmp_scores = compute(*jobs, get=multiprocessing.get, num_workers=self.n_jobs) | ||
|
||
| for val in tmp_scores: | ||
| if not self._pbar.disable: | ||
| self._pbar.update(1) | ||
| if val == 'Timeout': | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Warnings should only print if verbosity>2.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense. I fixed it.