Skip to content

Commit 38e6c9a

Browse files
rst0git authored and avagin committed
seize: fix pause devices for frozen containers
The container checkpointing procedure in Kubernetes freezes running containers to create a consistent snapshot of both the runtime state and the rootfs of the container. However, when checkpointing a GPU container, the container must be unfrozen before invoking the cuda-checkpoint tool. This is achieved in prepare_freezer_for_interrupt_only_mode(), which needs to be called before the PAUSE_DEVICES hook.

The patch that introduced this functionality fixed the problem only for containers with multiple processes. If the container has a single process, prepare_freezer_for_interrupt_only_mode() must be invoked immediately before the PAUSE_DEVICES hook.

Fixes: checkpoint-restore#2514
Signed-off-by: Radostin Stoyanov <[email protected]>
1 parent 77c8014 commit 38e6c9a

File tree

1 file changed

+16
-6
lines changed

1 file changed

+16
-6
lines changed

criu/seize.c

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1060,22 +1060,32 @@ int collect_pstree(void)
10601060
*/
10611061
alarm(opts.timeout);
10621062

1063-
ret = run_plugins(PAUSE_DEVICES, pid);
1064-
if (ret < 0 && ret != -ENOTSUP) {
1065-
goto err;
1066-
}
1067-
10681063
if (opts.freeze_cgroup && cgroup_version())
10691064
goto err;
10701065

10711066
pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1);
10721067

10731068
if (opts.freeze_cgroup && !compel_interrupt_only_mode) {
1069+
ret = run_plugins(PAUSE_DEVICES, pid);
1070+
if (ret < 0 && ret != -ENOTSUP) {
1071+
goto err;
1072+
}
1073+
10741074
if (freeze_processes())
10751075
goto err;
10761076
} else {
10771077
if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode())
10781078
goto err;
1079+
1080+
/*
1081+
* Call PAUSE_DEVICES after prepare_freezer_for_interrupt_only_mode()
1082+
* to be able to checkpoint containers in a frozen state.
1083+
*/
1084+
ret = run_plugins(PAUSE_DEVICES, pid);
1085+
if (ret < 0 && ret != -ENOTSUP) {
1086+
goto err;
1087+
}
1088+
10791089
if (compel_interrupt_task(pid)) {
10801090
set_cr_errno(ESRCH);
10811091
goto err;
@@ -1136,4 +1146,4 @@ int checkpoint_devices(void)
11361146
exit_code = 0;
11371147
err:
11381148
return exit_code;
1139-
}
1149+
}

0 commit comments

Comments
 (0)