Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit a0de0e9

Browse files
committed
Enable auto checkpointing on SIGTERM
1 parent 3a4d5b5 commit a0de0e9

File tree

6 files changed

+86
-5
lines changed

6 files changed

+86
-5
lines changed

coreneuron/apps/main1.cpp

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
* @brief File containing main driver routine for CoreNeuron
1313
*/
1414

15-
#include <unistd.h>

#include <cerrno>
#include <climits>
#include <csignal>
#include <cstring>
#include <memory>
#include <vector>
1920

@@ -114,11 +115,24 @@ char* prepare_args(int& argc, char**& argv, int use_mpi, const char* arg) {
114115
// return actual data to be freed
115116
return first;
116117
}
117-
}
118+
119+
} // extern "C"
118120

119121
namespace coreneuron {
122+
120123
void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init);
121124

125+
126+
static std::string check_restore() {
127+
auto restore_path = corenrn_param.restorepath;
128+
const auto auto_chkpt_path = corenrn_param.outpath + "/_corenrn_ckpt";
129+
if (restore_path.empty() && fs_isdir(auto_chkpt_path.c_str())) {
130+
restore_path = auto_chkpt_path;
131+
}
132+
return restore_path;
133+
}
134+
135+
122136
void nrn_init_and_load_data(int argc,
123137
char* argv[],
124138
bool is_mapping_needed = false,
@@ -168,7 +182,7 @@ void nrn_init_and_load_data(int argc,
168182
set_globals(corenrn_param.datpath.c_str(), (corenrn_param.seed >= 0), corenrn_param.seed);
169183

170184
// set global variables for start time, timestep and temperature
171-
std::string restore_path = corenrn_param.restorepath;
185+
std::string restore_path = check_restore();
172186
t = restore_time(restore_path.c_str());
173187

174188
if (corenrn_param.dt != -1000.) { // command line arg highest precedence
@@ -402,6 +416,20 @@ std::unique_ptr<ReportHandler> create_report_handler(ReportConfiguration& config
402416
return report_handler;
403417
}
404418

419+
/**
420+
* \brief Installs a SIGTERM handler so that we finish the current simulation without losing data
421+
* \return True if a checkpoint was performed. False otherwise (not enough elapsed time)
422+
*/
423+
static void install_sigterm_handler() {
424+
auto sigh = [](int) {
425+
std::cerr << "SIGTERM caught! Halting sim and dumping checkpoint" << std::endl;
426+
coreneuron::stoprun = true;
427+
};
428+
if (std::signal(SIGTERM, sigh) == SIG_ERR) {
429+
std::cerr << "Could not install SIGTERM handler" << std::endl;
430+
}
431+
}
432+
405433
} // namespace coreneuron
406434

407435
/// The following high-level functions are marked as "extern C"
@@ -482,6 +510,9 @@ extern "C" int run_solve_core(int argc, char** argv) {
482510
if (nrnmpi_myid == 0) {
483511
mkdir_p(output_dir.c_str());
484512
}
513+
514+
install_sigterm_handler();
515+
485516
#if NRNMPI
486517
nrnmpi_barrier();
487518
#endif

coreneuron/io/file_utils.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,13 @@ int mkdir_p(const char* path) {
5151
delete[] dirpath;
5252
return 0;
5353
}
54+
55+
bool fs_exists(const char* path) {
56+
struct stat buffer;
57+
return (stat(path, &buffer) == 0);
58+
}
59+
60+
bool fs_isdir(const char* path) {
61+
struct stat buffer;
62+
return (stat(path, &buffer) == 0 && S_ISDIR(buffer.st_mode));
63+
}

coreneuron/io/file_utils.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,12 @@
2121
*/
2222
int mkdir_p(const char* path);
2323

24+
/** @brief Checks whether an arbitrary path exists
25+
*/
26+
bool fs_exists(const char* path);
27+
28+
/** @brief Checks whether an arbitrary path is an existing directory
29+
*/
30+
bool fs_isdir(const char* path);
31+
2432
#endif /* ifndef NRN_FILE_UTILS */

coreneuron/nrnconf.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ extern double pi;
3737
extern double t, dt;
3838
extern int rev_dt;
3939
extern int secondorder;
40-
extern bool stoprun;
40+
extern bool volatile stoprun;
4141
extern const char* bbcore_write_version;
4242
#define tstopbit (1 << 15)
4343
#define tstopset stoprun |= tstopbit

coreneuron/sim/fadvance_core.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# =============================================================================.
77
*/
88

9+
#include <cstdlib>
910
#include <functional>
1011

1112
#include "coreneuron/coreneuron.hpp"
@@ -23,12 +24,19 @@
2324
#include "coreneuron/utils/progressbar/progressbar.h"
2425
#include "coreneuron/utils/profile/profiler_interface.h"
2526
#include "coreneuron/io/nrn2core_direct.h"
27+
#include "coreneuron/io/nrn_checkpoint.hpp"
28+
29+
// Do an auto checkpoint only if execution lasted longer than this var (secs)
30+
#define CHECKPOINT_MIN_RUNTIME (4 * 3600) // 4h
2631

2732
namespace coreneuron {
2833

2934
extern corenrn_parameters corenrn_param;
3035
static void* nrn_fixed_step_thread(NrnThread*);
3136
static void* nrn_fixed_step_group_thread(NrnThread*, int, int, int&);
37+
static bool nrn_auto_checkpoint();
38+
static time_t sim_start_time;
39+
3240

3341
void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
3442
if (adt != nrn_threads[0]._dt) {
@@ -109,6 +117,7 @@ void nrn_fixed_single_steps_minimal(int total_sim_steps, double tstop) {
109117
#endif
110118
nrn_fixed_step_minimal();
111119
if (stoprun) {
120+
nrn_auto_checkpoint();
112121
break;
113122
}
114123
current_steps++;
@@ -141,6 +150,7 @@ void nrn_fixed_step_group_minimal(int total_sim_steps) {
141150
nrn_flush_reports(nrn_threads[0]._t);
142151
#endif
143152
if (stoprun) {
153+
nrn_auto_checkpoint();
144154
break;
145155
}
146156
current_steps++;
@@ -377,4 +387,26 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {
377387

378388
return nullptr;
379389
}
390+
391+
/**
392+
* \brief Does a checkpoint of the simulation in enough time has passed
393+
* \return True if a checkpoint was performed. False otherwise (not enough elapsed time)
394+
*/
395+
static bool nrn_auto_checkpoint() {
396+
time_t cur_time = time(NULL);
397+
int elapsed_secs = difftime(sim_start_time, cur_time);
398+
if (elapsed_secs < CHECKPOINT_MIN_RUNTIME) {
399+
return false;
400+
}
401+
// Write to tmp location first because allocated time may not be enough to complete
402+
const auto ckpt_tmp = corenrn_param.outpath + "/_corenrn_ckpt_dirty",
403+
ckpt_dir = corenrn_param.outpath + "/_corenrn_ckpt";
404+
Instrumentor::phase p("Checkpointing");
405+
write_checkpoint(nrn_threads, nrn_nthread, ckpt_tmp.c_str());
406+
system(("/bin/rm -rf '" + ckpt_dir + "'; " + "/bin/mv '" + ckpt_tmp + "' '" + ckpt_dir + "'")
407+
.c_str());
408+
return true;
409+
}
410+
411+
380412
} // namespace coreneuron

coreneuron/utils/nrnoc_aux.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#include "coreneuron/utils/nrnoc_aux.hpp"
1616

1717
namespace coreneuron {
18-
bool stoprun;
18+
bool volatile stoprun;
1919
int v_structure_change;
2020
int diam_changed;
2121
#define MAXERRCOUNT 5

0 commit comments

Comments
 (0)