From 34226fd243b599b8c02dad3ef1530cef2016dabe Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sat, 18 Jan 2025 13:43:15 +0000 Subject: [PATCH 001/137] ci: try GitHub arm runners Signed-off-by: Adrian Reber --- .github/workflows/actuated-aarch64-test.yaml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml index 8b0a63fc7b..567746a5f4 100644 --- a/.github/workflows/actuated-aarch64-test.yaml +++ b/.github/workflows/actuated-aarch64-test.yaml @@ -1,4 +1,4 @@ -name: Actuated aarch64 test +name: aarch64 test on: [push, pull_request] @@ -11,32 +11,38 @@ jobs: build: # Actuated runners are not available in all repositories. if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected. - # 3GB and 4 CPUs seems to be enough according to the result from 'vmmeter'. - runs-on: actuated-arm64-4cpu-3gb + # The memory size and the number of CPUs can be freely selected for + # the actuated runners. 3GB and 4 CPUs seems to be enough according to the + # result from 'vmmeter'. 
+ runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: + os: [actuated-arm64-4cpu-3gb, ubuntu-24.04-arm] target: [GCC=1, CLANG=1] steps: # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md # vmmeter start - name: Prepare arkade + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} uses: alexellis/arkade-get@master with: crane: latest print-summary: false - name: Install vmmeter + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} run: | crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - name: Run vmmeter + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} uses: self-actuated/vmmeter-action@master # vmmeter end - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} + - name: Run Tests ${{ matrix.target }}/${{ matrix.os }} # Following tests are failing on the actuated VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) From daa548bbfb189beb3c2b632a39081f8713b5222f Mon Sep 17 00:00:00 2001 From: Yuanhong Peng Date: Wed, 2 Apr 2025 18:48:12 +0800 Subject: [PATCH 002/137] criu: Do not print failed message when there is no late stage hook This is highly confusing, and it seems that the ret variable is not handled in the subsequent process. Signed-off-by: Yuanhong Peng --- criu/cr-restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1f4881dab0..583b446e0b 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2258,7 +2258,7 @@ static int restore_root_task(struct pstree_item *init) * might actually be a true error code but that would be also * captured in the plugin so no need to print the error here. 
*/ - if (ret < 0) + if (ret < 0 && ret != -ENOTSUP) pr_debug("restore late stage hook for external plugin failed\n"); } From 9a1e979666275f2b94aa42f83bb4bd86ef00b7ea Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 2 Apr 2025 21:13:16 +0000 Subject: [PATCH 003/137] compel: fix the stack test The stack test incorrectly assumed the page immediately following the stack pointer could never be changed. This doesn't work, because this page can be a part of another mapping. This commit introduces a dedicated "stack redzone," a small guard region directly after the stack. The stack test is modified to specifically check for corruption within this redzone. Signed-off-by: Andrei Vagin --- compel/include/uapi/infect.h | 9 +++ compel/src/lib/infect.c | 6 +- compel/test/stack/spy.c | 113 +---------------------------------- 3 files changed, 12 insertions(+), 116 deletions(-) diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index ed97d64dd6..1f61876ffb 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -13,6 +13,15 @@ #define PARASITE_START_AREA_MIN (4096) +#define PARASITE_STACK_SIZE (16 << 10) +/* + * A stack redzone is a small, protected region of memory located immediately + * after a parasite stack. It is intended to remain unchanged. While it can be + * implemented as a guard page, we want to avoid the overhead of additional + * remote system calls. 
+ */ +#define PARASITE_STACK_REDZONE 128 + extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index a9bbd64004..4ea27bc633 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -38,8 +38,6 @@ #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif -#define PARASITE_STACK_SIZE (16 << 10) - #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif @@ -1064,7 +1062,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; - ctl->rstack = ctl->remote_map + p; + ctl->rstack = ctl->remote_map + p - PARASITE_STACK_REDZONE; /* * x86-64 ABI requires a 16 bytes aligned stack. @@ -1078,7 +1076,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, if (nr_threads > 1) { p += PARASITE_STACK_SIZE; - ctl->r_thread_stack = ctl->remote_map + p; + ctl->r_thread_stack = ctl->remote_map + p - PARASITE_STACK_REDZONE; } ret = arch_fetch_sas(ctl, ctl->rsigframe); diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c index 9b7c9a7f09..184c8ab318 100644 --- a/compel/test/stack/spy.c +++ b/compel/test/stack/spy.c @@ -50,70 +50,6 @@ static void *get_parasite_rstack_start(struct parasite_ctl *ctl) return rstack_start; } -static int page_writable(struct parasite_ctl *ctl, int pid, void *page) -{ - FILE *maps; - size_t maps_line_len = 0; - char *maps_line = NULL; - char victim_maps_path[6 + 11 + 5 + 1]; - int written; - int ret = 0; - - if (((uintptr_t)page & (page_size() - 1)) != 0) { - fprintf(stderr, "Page address not aligned\n"); - ret = -1; - goto done; - } - - written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); - if (written < 0 || written >= sizeof(victim_maps_path)) { - fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); - ret = -1; 
- goto done; - } - - maps = fopen(victim_maps_path, "r"); - if (maps == NULL) { - perror("Can't open victim's /proc/$pid/maps"); - ret = -1; - goto done; - } - - while (getline(&maps_line, &maps_line_len, maps) != -1) { - unsigned long vmstart, vmend; - char r, w; - - if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { - fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); - ret = -1; - goto free_linebuf; - } - - if (page >= (void *)vmstart && page < (void *)vmend) { - if (w == 'w') { - if (r != 'r') { - fprintf(stderr, "Expecting writable memory to also be readable"); - ret = -1; - goto free_linebuf; - } - ret = 1; - } - break; - } - } - - if (errno) { - perror("Can't read victim's /proc/$pid/maps"); - ret = -1; - } - -free_linebuf: - free(maps_line); - fclose(maps); -done: - return ret; -} - static void *read_proc_mem(int pid, void *offset, size_t len) { char victim_mem_path[6 + 11 + 4 + 1]; @@ -153,51 +89,6 @@ static void *read_proc_mem(int pid, void *offset, size_t len) return NULL; } -static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, - size_t *saved_data_size) -{ - size_t page_mask = page_size() - 1; - size_t saved_size = 0; - size_t stack_size_last_page = (uintptr_t)stack & page_mask; - void *next_page = stack; - - if (stack_size_last_page != 0) { - size_t empty_space_last_page = page_size() - stack_size_last_page; - saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); - next_page += page_size() - stack_size_last_page; - } - - while (saved_size < SAVED_DATA_MAX && next_page != NULL) { - switch (page_writable(ctl, pid, next_page)) { - case 1: - saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); - next_page += page_size(); - break; - case 0: - next_page = NULL; - break; - default: - return -1; - } - } - - if (saved_size > 0) { - void *sd; - - sd = read_proc_mem(pid, stack, saved_size); - if (sd == NULL) - return -1; - - 
*saved_data = sd; - } else { - *saved_data = NULL; - } - - *saved_data_size = saved_size; - - return 0; -} - static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) { if (saved_data != NULL) { @@ -221,7 +112,7 @@ static int do_infection(int pid) struct infect_ctx *ictx; int *arg; void *stack; - size_t saved_data_size; + size_t saved_data_size = PARASITE_STACK_REDZONE; int saved_data_check; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); @@ -257,8 +148,6 @@ static int do_infection(int pid) err_and_ret("Can't register cleanup function with atexit\n"); stack = get_parasite_rstack_start(ctl); - if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) - err_and_ret("Can't save data above stack\n"); if (compel_start_daemon(ctl)) err_and_ret("Can't start daemon in victim\n"); From 5ff52326e15b90dc59ed8ae317735201277a2377 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 27 Mar 2025 14:21:03 +0000 Subject: [PATCH 004/137] restore: use the new kernel interface to restore timers Thomas Gleixner introduced the new interface to create posix timers with specified timer IDs: https://github.com/torvalds/linux/commit/ec2d0c04624b3c8a7eb1682e006717fa20cfbe24 Previously, CRIU recreated timers by repeatedly creating and deleting them until the desired ID was reached. This approach isn't fast, especially for timers with large IDs. For example, restoring two timers with IDs 1000000 and 2000000 took approximately 1.5 seconds. The new `prctl()` based interface allows direct creation of timers with specified IDs, reducing the restoration time to around 3 microseconds for the same example. 
Signed-off-by: Andrei Vagin --- criu/cr-check.c | 10 ++++++++ criu/include/kerndat.h | 1 + criu/include/prctl.h | 7 ++++++ criu/include/restorer.h | 1 + criu/kerndat.c | 20 +++++++++++++++ criu/pie/restorer.c | 54 +++++++++++++++++++++++++++++++++++++---- criu/timer.c | 2 ++ 7 files changed, 90 insertions(+), 5 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 0388cbe7fe..7b4a6415a5 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1392,6 +1392,14 @@ static int check_pagemap_scan(void) return 0; } +static int check_timer_cr_ids(void) +{ + if (!kdat.has_timer_cr_ids) + return -1; + + return 0; +} + /* musl doesn't have a statx wrapper... */ struct staty { __u32 stx_dev_major; @@ -1703,6 +1711,7 @@ int cr_check(void) ret |= check_ipv6_freebind(); ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); + ret |= check_timer_cr_ids(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1825,6 +1834,7 @@ static struct feature_list feature_list[] = { { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, { "pagemap_scan", check_pagemap_scan }, + { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index e03a573419..bd8744d62b 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -89,6 +89,7 @@ struct kerndat_s { bool has_pagemap_scan; bool has_shstk; bool has_close_range; + bool has_timer_cr_ids; }; extern struct kerndat_s kdat; diff --git a/criu/include/prctl.h b/criu/include/prctl.h index f5f23c9692..2966659dad 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -97,4 +97,11 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 #endif +#ifndef PR_TIMER_CREATE_RESTORE_IDS +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +#endif + 
#endif /* __CR_PRCTL_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index a4fb7ea794..56bea0fcc0 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -170,6 +170,7 @@ struct task_restore_args { struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; + bool posix_timer_cr_ids; struct restore_timerfd *timerfd; unsigned int timerfd_n; diff --git a/criu/kerndat.c b/criu/kerndat.c index 5939005a41..930117b0a4 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1720,6 +1720,22 @@ static int kerndat_has_close_range(void) return 0; } +static int kerndat_has_timer_cr_ids(void) +{ + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) == -1) { + if (errno == EINVAL) { + pr_debug("PR_TIMER_CREATE_RESTORE_IDS isn't supported\n"); + return 0; + } + pr_perror("prctl returned unexpected error code"); + return -1; + } + + kdat.has_timer_cr_ids = true; + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1981,6 +1997,10 @@ int kerndat_init(void) pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_timer_cr_ids()) { + pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 348ce6659b..9867a3ddd5 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1235,9 +1235,23 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { - int ret, i; + int ret, i, exit_code = -1; kernel_timer_t next_id = 0, timer_id; struct sigevent sev; + bool create_restore_ids = false; + + if (!args->posix_timers_n) + return 0; + + /* prctl returns EINVAL if PR_TIMER_CREATE_RESTORE_IDS isn't supported. 
*/ + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0); + if (ret == 0) { + create_restore_ids = true; + } else if (ret != -EINVAL) { + pr_err("Can't enabled PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + return -1; + } for (i = 0; i < args->posix_timers_n; i++) { sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; @@ -1249,16 +1263,36 @@ static int create_posix_timers(struct task_restore_args *args) #endif sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; + if (create_restore_ids) { + /* + * With enabled PR_TIMER_CREATE_RESTORE_IDS, the + * timer_create syscall creates a new timer with the + * specified ID. + */ + timer_id = args->posix_timers[i].spt.it_id; + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); + if (ret < 0) { + pr_err("Can't create posix timer - %d: %d\n", i, ret); + goto out; + } + if (timer_id != args->posix_timers[i].spt.it_id) { + pr_err("Unexpected timer id %u (expected %lu)\n", + timer_id, args->posix_timers[i].spt.it_id); + goto out; + } + continue; + } + while (1) { ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); - return ret; + goto out; } if (timer_id != next_id) { pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; + goto out; } next_id++; @@ -1268,12 +1302,22 @@ static int create_posix_timers(struct task_restore_args *args) ret = sys_timer_delete(timer_id); if (ret < 0) { pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); - return ret; + goto out; } } } - return 0; + exit_code = 0; +out: + if (create_restore_ids) { + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0); + if (ret != 0) { + pr_err("Can't disable PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + exit_code = -1; + } + } + return exit_code; } static void restore_posix_timers(struct task_restore_args *args) diff --git 
a/criu/timer.c b/criu/timer.c index 0413e2a720..856501be6b 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -195,6 +195,7 @@ int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) if (!img) return -1; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; ta->posix_timers_n = 0; while (1) { PosixTimerEntry *pte; @@ -234,6 +235,7 @@ int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) return prepare_posix_timers_from_fd(pid, ta); ta->posix_timers_n = tte->n_posix; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; for (i = 0; i < ta->posix_timers_n; i++) { t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) From e7aee3c5c723e95e1c0e787f4c57919c2fc58c60 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 10 Apr 2025 20:56:23 +0100 Subject: [PATCH 005/137] cuda: use pr_perror for libc function errors When handling errors for functions such as `ptrace()`, `pipe()`, and `fork()` it would be better to use `pr_perror` instead of `pr_err` as it would include a message describing the encountered error. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 99e4caf743..1aaad6842b 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -93,7 +93,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int fd[2], buf_off; if (pipe(fd) != 0) { - pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); + pr_perror("Couldn't create pipes for reading cuda-checkpoint output"); return -1; } @@ -101,7 +101,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int child_pid = fork(); if (child_pid == -1) { - pr_err("Failed to fork to exec cuda-checkpoint\n"); + pr_perror("Failed to fork to exec cuda-checkpoint"); close(fd[READ]); close(fd[WRITE]); return -1; @@ -166,7 +166,6 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) } if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); - pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); } else if (WIFEXITED(status)) { exit_code = WEXITSTATUS(status); @@ -283,8 +282,8 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse * a compel_interrupt_task() */ if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { - pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n", - restore_tid); + pr_perror("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state", + restore_tid); return -1; } @@ -295,12 +294,12 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse } if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { - pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid); + pr_perror("Failed to set ptrace options on 
interrupt for restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { - pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid); + pr_perror("Unable to restore original sigmask to restore tid %d", restore_tid); return -1; } @@ -312,7 +311,7 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) k_rtsigset_t block; if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { - pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid); + pr_perror("Failed to get current sigmask for restore tid %d", restore_tid); return -1; } @@ -320,18 +319,18 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { - pr_err("Failed to block signals on restore tid %d\n", restore_tid); + pr_perror("Failed to block signals on restore tid %d", restore_tid); return -1; } // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { - pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid); + pr_perror("Could not clear ptrace options on restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { - pr_err("Could not resume cuda restore tid %d\n", restore_tid); + pr_perror("Could not resume cuda restore tid %d", restore_tid); return -1; } From 6805841660e741eda203ef8339a895281f2095e9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 10 Apr 2025 21:14:05 +0100 Subject: [PATCH 006/137] cuda: remove redundant goto label The `goto interrupt` label is unnecessary as the code directly returns after `cuda_process_checkpoint_action()`. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 1aaad6842b..9ccb042249 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -395,12 +395,9 @@ int cuda_plugin_checkpoint_devices(int pid) status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); - goto interrupt; } -interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? -1 : int_ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); From 74799ae023f82d99efac8d67974705087f208567 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 9 Apr 2025 13:25:44 +0000 Subject: [PATCH 007/137] aarch64: fix build with missing NT_ARM_PAC_ENABLED_KEYS On a RHEL 8 based system building CRIU fails with: criu/arch/aarch64/crtools.c: In function 'save_pac_keys': criu/arch/aarch64/crtools.c:73:39: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_PACA_KEYS'? ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); ^~~~~~~~~~~~~~~~~~~~~~~ NT_ARM_PACA_KEYS criu/arch/aarch64/crtools.c:73:39: note: each undeclared identifier is reported only once for each function it appears in criu/arch/aarch64/crtools.c: In function 'arch_ptrace_restore': criu/arch/aarch64/crtools.c:261:44: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_PACA_KEYS'? if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { ^~~~~~~~~~~~~~~~~~~~~~~ NT_ARM_PACA_KEYS This adds the missing define if it is undefined. 
Signed-off-by: Adrian Reber --- criu/arch/aarch64/crtools.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 6cde03ee38..c077dd06bc 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,6 +23,10 @@ #include "compel/infect.h" #include "pstree.h" +#ifndef NT_ARM_PAC_ENABLED_KEYS +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. */ +#endif + extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e From b9da95b0b2c5f42b24725d673bf287b3c00bbc40 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Tue, 23 Jan 2024 08:22:07 -0800 Subject: [PATCH 008/137] s390: Fix FP reg restore after parasite code runs Currently we save FP regs before parasite code runs, and restore after for --leave-running, --check-only, and in case of errors. In case of errors the error may have happened before FP regs were saved, so we should only restore them if they were actually saved. 
Signed-off-by: Younes Manton --- criu/arch/s390/crtools.c | 90 +++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 96cef819e3..e08c838783 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -142,6 +142,29 @@ static void print_core_fp_regs(const char *msg, CoreEntry *core) print_core_ri_cb(core); } +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + /* * Allocate VxrsLow registers */ @@ -294,7 +317,13 @@ int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_stru CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; - fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + /* + * We delay allocating this until now because checkpointing can fail earlier. + * When it fails we need to know if we reached here or not so that the cleanup + * code doesn't restore FPRs that were never saved in the first place. 
+ */ + fpregs = allocate_fp_regs(); + CORE_THREAD_ARCH_INFO(core)->fpregs = fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { @@ -399,36 +428,15 @@ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) return 0; } -/* - * Allocate floating point registers - */ -static UserS390FpregsEntry *allocate_fp_regs(void) -{ - UserS390FpregsEntry *fpregs; - - fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - return NULL; - user_s390_fpregs_entry__init(fpregs); - - fpregs->n_fprs = 16; - fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); - if (!fpregs->fprs) - goto fail_free_fpregs; - return fpregs; - -fail_free_fpregs: - xfree(fpregs); - return NULL; -} - /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { - xfree(fpregs->fprs); - xfree(fpregs); + if (fpregs) { + xfree(fpregs->fprs); + xfree(fpregs); + } } /* @@ -487,15 +495,17 @@ int arch_alloc_thread_info(CoreEntry *core) ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; - ti_s390->fpregs = allocate_fp_regs(); - if (!ti_s390->fpregs) - goto fail_free_gp_regs; + + /* + * Delay allocating space until needed. Checkpointing can fail before that + * and the cleanup code needs to be able to tell if FPRs were saved or not + * before trying to restore the register state. + */ + ti_s390->fpregs = NULL; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; -fail_free_gp_regs: - free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; @@ -678,14 +688,18 @@ static int set_task_regs(pid_t pid, CoreEntry *core) user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); - /* Floating point registers */ + /* + * Floating point registers + * Optional on checkpoint; checkpoint may have failed and we may reach here as part of cleanup + * so there's no guarantee that we saved FPRs for this thread. 
+ */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; - if (!cfpregs) - return -1; - fpregs.prfpreg.fpc = cfpregs->fpc; - memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); - if (set_fp_regs(pid, &fpregs) < 0) - return -1; + if (cfpregs) { + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; + } /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { From 5de61a721fbc56de68094f19ac34466d66f7374f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 21 Apr 2025 06:33:41 +0000 Subject: [PATCH 009/137] net: nftables: avoid restore failure if the CRIU nft table already exist CRIU locks the network during restore in an "empty" network namespace. However, "empty" in this context means CRIU isn't restoring the namespace. This network namespace can be the same namespace where processes have been dumped and so the network is already locked in it. Fixes #2650 Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 2 +- criu/include/net.h | 2 +- criu/net.c | 30 +++++++++++++++++------------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 583b446e0b..30932f60a2 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2119,7 +2119,7 @@ static int restore_root_task(struct pstree_item *init) * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. 
*/ - ret = network_lock_internal(); + ret = network_lock_internal(/* restore = */ true); if (ret) goto out_kill; } diff --git a/criu/include/net.h b/criu/include/net.h index 5e8a848620..7c5ede21e1 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(void); +extern int network_lock_internal(bool restore); extern struct ns_desc net_ns_desc; diff --git a/criu/net.c b/criu/net.c index ee46f1c495..300df480b0 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3206,12 +3206,12 @@ static inline FILE *redirect_nftables_output(struct nft_ctx *nft) } #endif -static inline int nftables_lock_network_internal(void) +static inline int nftables_lock_network_internal(bool restore) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) cleanup_file FILE *fp = NULL; struct nft_ctx *nft; - int ret = 0; + int ret = 0, exit_code = -1; char table[32]; char buf[128]; @@ -3224,11 +3224,16 @@ static inline int nftables_lock_network_internal(void) fp = redirect_nftables_output(nft); if (!fp) - goto out; + goto err2; snprintf(buf, sizeof(buf), "create table %s", table); - if (NFT_RUN_CMD(nft, buf)) + ret = NFT_RUN_CMD(nft, buf); + if (ret) { + /* The network has been locked on dump. 
*/ + if (restore && errno == EEXIST) + return 0; goto err2; + } snprintf(buf, sizeof(buf), "add chain %s output { type filter hook output priority 0; policy drop; }", table); if (NFT_RUN_CMD(nft, buf)) @@ -3246,17 +3251,16 @@ static inline int nftables_lock_network_internal(void) if (NFT_RUN_CMD(nft, buf)) goto err1; - goto out; - + exit_code = 0; +out: + nft_ctx_free(nft); + return exit_code; err1: snprintf(buf, sizeof(buf), "delete table %s", table); NFT_RUN_CMD(nft, buf); err2: - ret = -1; pr_err("Locking network failed using nftables\n"); -out: - nft_ctx_free(nft); - return ret; + goto out; #else pr_err("CRIU was built without libnftables support\n"); return -1; @@ -3288,7 +3292,7 @@ static int iptables_network_lock_internal(void) return ret; } -int network_lock_internal(void) +int network_lock_internal(bool restore) { int ret = 0, nsret; @@ -3301,7 +3305,7 @@ int network_lock_internal(void) if (opts.network_lock_method == NETWORK_LOCK_IPTABLES) ret = iptables_network_lock_internal(); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) - ret = nftables_lock_network_internal(); + ret = nftables_lock_network_internal(restore); if (restore_ns(nsret, &net_ns_desc)) ret = -1; @@ -3427,7 +3431,7 @@ int network_lock(void) if (run_scripts(ACT_NET_LOCK)) return -1; - return network_lock_internal(); + return network_lock_internal(false); } void network_unlock(void) From b6dca31162562385cb0657af3443666990a28c01 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Apr 2025 14:12:31 +0100 Subject: [PATCH 010/137] aarch64/crtools: fix define for missing constants Building CRIU package on Debian 11 aarch64 fails with criu/arch/aarch64/crtools.c: In function 'save_pac_keys': criu/arch/aarch64/crtools.c:32:31: error: storage size of 'paca' isn't known struct user_pac_address_keys paca; ^~~~ criu/arch/aarch64/crtools.c:33:31: error: storage size of 'pacg' isn't known struct user_pac_generic_keys pacg; ^~~~ criu/arch/aarch64/crtools.c:47:15: error: 'HWCAP_PACA' 
undeclared (first use in this function); did you mean 'HWCAP_FCMA'? if (hwcaps & HWCAP_PACA) { ^~~~~~~~~~ HWCAP_FCMA criu/arch/aarch64/crtools.c:47:15: note: each undeclared identifier is reported only once for each function it appears in criu/arch/aarch64/crtools.c:53:44: error: 'NT_ARM_PACA_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:73:39: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function) ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); ^~~~~~~~~~~~~~~~~~~~~~~ criu/arch/aarch64/crtools.c:82:15: error: 'HWCAP_PACG' undeclared (first use in this function); did you mean 'HWCAP_AES'? if (hwcaps & HWCAP_PACG) { ^~~~~~~~~~ HWCAP_AES criu/arch/aarch64/crtools.c:88:44: error: 'NT_ARM_PACG_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:33:31: error: unused variable 'pacg' [-Werror=unused-variable] struct user_pac_generic_keys pacg; ^~~~ criu/arch/aarch64/crtools.c:32:31: error: unused variable 'paca' [-Werror=unused-variable] struct user_pac_address_keys paca; ^~~~ criu/arch/aarch64/crtools.c: In function 'arch_ptrace_restore': criu/arch/aarch64/crtools.c:227:31: error: storage size of 'upaca' isn't known struct user_pac_address_keys upaca; ^~~~~ criu/arch/aarch64/crtools.c:228:31: error: storage size of 'upacg' isn't known struct user_pac_generic_keys upacg; ^~~~~ criu/arch/aarch64/crtools.c:241:18: error: 'HWCAP_PACA' undeclared (first use in this function); did you mean 'HWCAP_FCMA'? if (!(hwcaps & HWCAP_PACA)) { ^~~~~~~~~~ HWCAP_FCMA criu/arch/aarch64/crtools.c:255:44: error: 'NT_ARM_PACA_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? 
if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:261:44: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function) if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { ^~~~~~~~~~~~~~~~~~~~~~~ criu/arch/aarch64/crtools.c:268:18: error: 'HWCAP_PACG' undeclared (first use in this function); did you mean 'HWCAP_AES'? if (!(hwcaps & HWCAP_PACG)) { ^~~~~~~~~~ HWCAP_AES criu/arch/aarch64/crtools.c:275:44: error: 'NT_ARM_PACG_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:233:6: error: variable 'ret' set but not used [-Werror=unused-but-set-variable] int ret; ^~~ criu/arch/aarch64/crtools.c:228:31: error: unused variable 'upacg' [-Werror=unused-variable] struct user_pac_generic_keys upacg; ^~~~~ criu/arch/aarch64/crtools.c:227:31: error: unused variable 'upaca' [-Werror=unused-variable] struct user_pac_address_keys upaca; ^~~~~ This patch adds the missing constants and structs if undefined. 
Signed-off-by: Radostin Stoyanov --- criu/arch/aarch64/crtools.c | 47 +++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index c077dd06bc..3ed5c9d635 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,6 +23,45 @@ #include "compel/infect.h" #include "pstree.h" +/* + * cr_user_pac_* are a copy of the corresponding uapi structs + * in arch/arm64/include/uapi/asm/ptrace.h + */ +struct cr_user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; + +struct cr_user_pac_generic_keys { + __uint128_t apgakey; +}; + +/* + * The following HWCAP constants are copied from + * arch/arm64/include/uapi/asm/hwcap.h + */ +#ifndef HWCAP_PACA +#define HWCAP_PACA (1 << 30) +#endif + +#ifndef HWCAP_PACG +#define HWCAP_PACG (1UL << 31) +#endif + +/* + * The following NT_ARM_PAC constants are copied from + * include/uapi/linux/elf.h + */ +#ifndef NT_ARM_PACA_KEYS +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#endif + +#ifndef NT_ARM_PACG_KEYS +#define NT_ARM_PACG_KEYS 0x408 +#endif + #ifndef NT_ARM_PAC_ENABLED_KEYS #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. 
*/ #endif @@ -33,8 +72,8 @@ extern unsigned long getauxval(unsigned long type); static int save_pac_keys(int pid, CoreEntry *core) { - struct user_pac_address_keys paca; - struct user_pac_generic_keys pacg; + struct cr_user_pac_address_keys paca; + struct cr_user_pac_generic_keys pacg; PacKeys *pac_entry; long pac_enabled_key; struct iovec iov; @@ -228,8 +267,8 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) int arch_ptrace_restore(int pid, struct pstree_item *item) { unsigned long hwcaps = getauxval(AT_HWCAP); - struct user_pac_address_keys upaca; - struct user_pac_generic_keys upacg; + struct cr_user_pac_address_keys upaca; + struct cr_user_pac_generic_keys upacg; PacAddressKeys *paca; PacGenericKeys *pacg; long pac_enabled_keys; From 3dd3fe79984c4bc423bd38bac04f11ad7564dca1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 2 Apr 2025 12:02:46 +0800 Subject: [PATCH 011/137] mount: restore root mount flags Mount flags belong to mount and mount namespace of the Container, so we should preserve them, as Container user will not expect mounts switching between ro and rw over c/r. Fixes: #2632 v5: fix both mount-v1 and mount-v2 Signed-off-by: Pavel Tikhomirov --- criu/mount-v2.c | 6 ++++++ criu/mount.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index cdebc83182..1e33ac12a2 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -443,6 +443,7 @@ static int do_bind_mount_v2(struct mount_info *mi) /* Mounts root container mount. 
*/ static int do_mount_root_v2(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); unsigned long flags = MS_BIND; int fd; @@ -477,6 +478,11 @@ static int do_mount_root_v2(struct mount_info *mi) return -1; } + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + mi->mounted = true; return 0; diff --git a/criu/mount.c b/criu/mount.c index 82bbd52d6c..06b9595427 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -2690,9 +2690,16 @@ static bool can_mount_now(struct mount_info *mi) static int do_mount_root(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; + if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + return fetch_rt_stat(mi, service_mountpoint(mi)); } From 3b3aa766c5ad587cf079294dad1208fe3429d159 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:47:46 +0800 Subject: [PATCH 012/137] zdtm/lib: add "bind" desc option Add {'bind': 'path/to/bindmount'} zdtm descriptor option, so that in test mount namespace a directory bindmount can be created before running the test. This is useful to leave test directory writable (e.g. for logs) while the test makes root mount readonly. note: We create this bindmount early so that all test files are opened on it initially and not on the below mount. Will be used in mnt_ro_root test. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 3 +++ test/zdtm/lib/ns.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index 37ebe63b7b..e3ddc762a3 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -443,6 +443,7 @@ def __init__(self, name, desc, flavor, freezer, rootless): self._bins = [name] self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) + self._bind = desc.get('bind') self.auto_reap = True def __make_action(self, act, env=None, root=None): @@ -513,6 +514,8 @@ def start(self): if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root + if self._bind: + env['ZDTM_BIND'] = self._bind env['ZDTM_DEV'] = self.__flavor.devpath env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 3c0dbdeb80..5fe81561fe 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -28,8 +28,9 @@ extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path, *dev_path; + char *root, *criu_path, *dev_path, *zdtm_bind; char path[PATH_MAX]; + char bind_path[PATH_MAX]; root = getenv("ZDTM_ROOT"); if (!root) { @@ -52,6 +53,18 @@ static int prepare_mntns(void) return -1; } + zdtm_bind = getenv("ZDTM_BIND"); + if (zdtm_bind) { + /* + * Bindmount the directory to itself. 
+ */ + snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); + if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { + fprintf(stderr, "Can't bind-mount ZDTM_BIND: %m\n"); + return -1; + } + } + dev_path = getenv("ZDTM_DEV"); if (dev_path) { snprintf(path, sizeof(path), "%s/dev", root); From 8166a52c912f084bef7c1744a66cfbb222a71599 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:47:46 +0800 Subject: [PATCH 013/137] zdtm: add mnt_ro_root test It makes root mount readonly and checks that it is still readonly after migration. Make zdtm/static writable for logs via "bind" desc option. v2: explain why we don't have explicit rw/ro flag check v3: use new zdtm "bind" desc option Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/ns.c | 3 +++ test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ro_root.c | 32 +++++++++++++++++++++++++++++++ test/zdtm/static/mnt_ro_root.desc | 6 ++++++ 4 files changed, 42 insertions(+) create mode 100644 test/zdtm/static/mnt_ro_root.c create mode 100644 test/zdtm/static/mnt_ro_root.desc diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 5fe81561fe..822e09c928 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -57,6 +57,9 @@ static int prepare_mntns(void) if (zdtm_bind) { /* * Bindmount the directory to itself. + * e.g.: The mnt_ro_root test makes "/" mount readonly, but we + * still want to write logs to /zdtm/static/ so let's make it + * separate writable bind mount. 
*/ snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6a19cad3c2..81e44de221 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -423,6 +423,7 @@ TST_DIR = \ mntns_ghost \ mntns_ghost01 \ mntns_ro_root \ + mnt_ro_root \ mntns_link_ghost \ mntns_shared_bind \ mntns_shared_bind02 \ diff --git a/test/zdtm/static/mnt_ro_root.c b/test/zdtm/static/mnt_ro_root.c new file mode 100644 index 0000000000..2d8370150b --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.c @@ -0,0 +1,32 @@ +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if root mount remains read-only after c/r"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* + * Note: In zdtm.py:check_visible_state() we already check for all + * tests, that all mounts in the test's mount namespace remain the + * same, by comparing mountinfo before and after c/r. So rw/ro mount + * option inconsistency will be detected there and we don't need to + * check it in the test itself. + */ + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ro_root.desc b/test/zdtm/static/mnt_ro_root.desc new file mode 100644 index 0000000000..c9a8e4f186 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.desc @@ -0,0 +1,6 @@ +{ + 'flavor': 'ns uns', + 'flags': 'suid', + 'feature': 'mnt_id', + 'bind': 'zdtm/static', +} From 7eb337d63217e1ae9e22d85156b997e210411a9b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 7 May 2025 14:06:55 +0100 Subject: [PATCH 014/137] sk-inet: add message how to disable MPTCP in Go With Go version 1.24, ListenConfig now uses MPTCP by default [1]. 
Checkpoint/restore for this protocol is not currently supported and adding support requires kernel changes that are not trivial to implement. As a result, checkpointing of many containers that run Go programs is likely to fail with the following error [2]: (00.026522) Error (criu/sk-inet.c:130): inet: Unsupported proto 262 for socket 2f9bc5 This patch adds a message with suggested workaround for this problem. [1] https://go.dev/doc/go1.24#netpkgnet [2] https://github.com/checkpoint-restore/criu/issues/2655 Signed-off-by: Radostin Stoyanov --- criu/sk-inet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 92f53e5697..a191e78c48 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -128,6 +128,8 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); + if (proto == IPPROTO_MPTCP) + pr_err("For Go programs, consider using \"GODEBUG=multipathtcp=0\" to disable MPTCP\n"); return 0; } From f1608ef8f7544e4e0583b158907aa3c2217555ad Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 May 2025 15:38:26 +0000 Subject: [PATCH 015/137] kerndat: check that hardware breakpoints work In some cases, they might not work in virtual machines if the hypervisor doesn't virtualize them. For example, they don't work in AMD SEV virtual machines if the Debug Virtualization extension isn't supported or isn't enabled in SEV_FEATURES. Fixes #2658 Signed-off-by: Andrei Vagin --- criu/cr-check.c | 17 +++++++++ criu/cr-restore.c | 3 +- criu/include/kerndat.h | 1 + criu/kerndat.c | 80 +++++++++++++++++++++++++++++++++++++++++ criu/parasite-syscall.c | 2 +- 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 7b4a6415a5..9c4778490e 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1589,6 +1589,17 @@ static int check_overlayfs_maps(void) return status == 0 ? 
0 : -1; } +static int check_breakpoints(void) +{ + if (!kdat.has_breakpoints) { + pr_warn("Hardware breakpoints don't seem to work\n"); + return -1; + } + + return 0; +} + + static int (*chk_feature)(void); /* @@ -1616,6 +1627,7 @@ static int (*chk_feature)(void); return ret; \ } \ } while (0) + int cr_check(void) { struct ns_id *ns; @@ -1724,6 +1736,10 @@ int cr_check(void) ret |= check_autofs(); ret |= check_compat_cr(); } + /* + * Category 4 - optional. + */ + check_breakpoints(); pr_msg("%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); return ret; @@ -1836,6 +1852,7 @@ static struct feature_list feature_list[] = { { "pagemap_scan", check_pagemap_scan }, { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, + { "breakpoints", check_breakpoints }, { NULL, NULL }, }; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 30932f60a2..cabe2f464d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1820,6 +1820,7 @@ static int restore_rseq_cs(void) static int catch_tasks(bool root_seized) { struct pstree_item *item; + bool nobp = fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints; for_each_pstree_item(item) { int status, i, ret; @@ -1847,7 +1848,7 @@ static int catch_tasks(bool root_seized) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, nobp); if (ret < 0) return -1; } diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index bd8744d62b..c5deb32832 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -90,6 +90,7 @@ struct kerndat_s { bool has_shstk; bool has_close_range; bool has_timer_cr_ids; + bool has_breakpoints; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index 930117b0a4..fa43f7d3f2 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1736,6 +1736,83 @@ static int kerndat_has_timer_cr_ids(void) return 0; } +static void breakpoint_func(void) +{ + if 
(raise(SIGSTOP)) + pr_perror("Unable to kill itself with SIGSTOP"); + exit(1); +} + +/* + * kerndat_breakpoints checks that hardware breakpoints work as they should. + * In some cases, they might not work in virtual machines if the hypervisor + * doesn't virtualize them. For example, they don't work in AMD SEV virtual + * machines if the Debug Virtualization extension isn't supported or isn't + * enabled in SEV_FEATURES. + */ +static int kerndat_breakpoints(void) +{ + int status, ret, exit_code = -1; + pid_t pid; + + pid = fork(); + if (pid == -1) { + pr_perror("fork"); + return -1; + } + if (pid == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + pr_perror("ptrace(PTRACE_TRACEME)"); + exit(1); + } + raise(SIGSTOP); + breakpoint_func(); + exit(1); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for initial stop"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child didn't stop as expected: status=%x\n", status); + goto err; + } + ret = ptrace_set_breakpoint(pid, &breakpoint_func); + if (ret < 0) { + pr_err("Failed to set breakpoint\n"); + goto err; + } + if (ret == 0) { + pr_debug("Hardware breakpoints appear to be disabled\n"); + goto out; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for breakpoint trigger"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) { + pr_warn("Hardware breakpoints don't seem to work (status=%x)\n", status); + goto out; + } + kdat.has_breakpoints = true; +out: + exit_code = 0; +err: + if (kill(pid, SIGKILL)) { + pr_perror("Failed to kill the child process"); + exit_code = -1; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Failed to wait for the child process"); + exit_code = -1; + } + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { + pr_err("The child exited with unexpected code: %x\n", status); + exit_code = -1; + } + return exit_code; +} + /* * Some features depend on resource that can be dynamically changed * at the 
OS runtime. There are cases that we cannot determine the @@ -1999,6 +2076,9 @@ int kerndat_init(void) } if (!ret && kerndat_has_timer_cr_ids()) { pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + } + if (!ret && kerndat_breakpoints()) { + pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); ret = -1; } diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 6db9d21fee..e19847b377 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -421,7 +421,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, ictx->flags |= INFECT_NO_MEMFD; if (fault_injected(FI_PARASITE_CONNECT)) ictx->flags |= INFECT_FAIL_CONNECT; - if (fault_injected(FI_NO_BREAKPOINTS)) + if (fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints) ictx->flags |= INFECT_NO_BREAKPOINTS; if (kdat.compat_cr) ictx->flags |= INFECT_COMPATIBLE; From 77c801478c6a7f6413a6ed8db5e81e5759e83950 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Wed, 14 May 2025 19:02:06 +0200 Subject: [PATCH 016/137] make: remove checks and warnings for bsd strlcat and strlcpy In 0a7c5fd1bd8d1e49e273b51ff39af473d6c68cbc we swapped the BSD implementation of strlcat and strlcpy in favor of our own replacement. The checks and the predefined macros are not needed anymore. Signed-off-by: Lorenzo Fontana --- Makefile.config | 4 ++-- scripts/feature-tests.mak | 28 ---------------------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/Makefile.config b/Makefile.config index 5ab689d411..5cf4b8216d 100644 --- a/Makefile.config +++ b/Makefile.config @@ -9,7 +9,7 @@ ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else - $(info Note: Building without setproctitle() and strlcpy() support.) + $(info Note: Building without setproctitle() support.) $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) 
endif @@ -84,7 +84,7 @@ endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) -FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ +FEATURES_LIST := TCP_REPAIR PTRACE_PEEKSIGINFO \ SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index fb5d2ef7ad..727e9689ea 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -35,34 +35,6 @@ int main(void) } endef -define FEATURE_TEST_STRLCPY - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcpy(NULL, NULL, 0); -} -endef - -define FEATURE_TEST_STRLCAT - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcat(NULL, NULL, 0); -} -endef - define FEATURE_TEST_PTRACE_PEEKSIGINFO #include From 38e6c9a5c50bafecfb4c0912ccf598cdb122bcee Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 11 May 2025 11:33:29 +0100 Subject: [PATCH 017/137] seize: fix pause devices for frozen containers The container checkpointing procedure in Kubernetes freezes running containers to create a consistent snapshot of both the runtime state and the rootfs of the container. However, when checkpointing a GPU container, the container must be unfrozen before invoking the cuda-checkpoint tool. This is achieved in prepare_freezer_for_interrupt_only_mode(), which needs to be called before the PAUSE_DEVICES hook. The patch introducing this functionality fixes this problem for containers with multiple processes. However, if the container has a single process, prepare_freezer_for_interrupt_only_mode() must be invoked immediately before the PAUSE_DEVICES hook. 
Fixes: #2514 Signed-off-by: Radostin Stoyanov --- criu/seize.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index f56357ac7b..23f192d46d 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1060,22 +1060,32 @@ int collect_pstree(void) */ alarm(opts.timeout); - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } - if (opts.freeze_cgroup && cgroup_version()) goto err; pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1); if (opts.freeze_cgroup && !compel_interrupt_only_mode) { + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (freeze_processes()) goto err; } else { if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) goto err; + + /* + * Call PAUSE_DEVICES after prepare_freezer_for_interrupt_only_mode() + * to be able to checkpoint containers in a frozen state. + */ + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; @@ -1136,4 +1146,4 @@ int checkpoint_devices(void) exit_code = 0; err: return exit_code; -} \ No newline at end of file +} From 630064538fba810d2828c516a073f33322010c35 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 16 May 2025 12:43:14 +0100 Subject: [PATCH 018/137] sk-inet: add MPTCP definition Building CRIU on Ubuntu 20.04 fails with the following error: criu/sk-inet.c: In function 'can_dump_ipproto': criu/sk-inet.c:131:16: error: 'IPPROTO_MPTCP' undeclared (first use in this function); did you mean 'IPPROTO_MTP'? 131 | if (proto == IPPROTO_MPTCP) | ^~~~~~~~~~~~~ | IPPROTO_MTP Add definition for MPTCP to fix this error. 
Signed-off-by: Radostin Stoyanov --- criu/sk-inet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index a191e78c48..1238b03dc5 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -44,6 +44,11 @@ #define PB_ALEN_INET 1 #define PB_ALEN_INET6 4 +/* Definition for older kernels without MPTCP support (e.g. Ubuntu 20.04) */ +#ifndef IPPROTO_MPTCP +#define IPPROTO_MPTCP 262 +#endif + static LIST_HEAD(inet_ports); struct inet_port { From c12182dcbe9f291d8f2f593022027f18ef8eb2a6 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 29 Nov 2024 02:07:38 +0000 Subject: [PATCH 019/137] criu: Introduce a new device plugin hook for restore Currently, in the target process, device-related restore operations and other restore operations almost run sequentially. When the target process executes the corresponding CRIU hook functions, it can't perform other restore operations. However, for GPU applications, some device restore operations have no logical dependencies on other common restore operations and can be parallelized with other operations to speed up the process. Instead of launching a thread in child processes for parallelization, this patch chooses to add a new hook, `POST_FORKING`, in the main CRIU process to handle these restore operations. This is because the restoration of memory state in the restore blob is one of the most time-consuming parts of all restore logic. The main CRIU process can easily parallelize these operations, whereas parallelizing in threads within child processes is challenging. - POST_FORKING *POST_FORKING: Hook to enable the main CRIU process to perform some restore operations of plugins. 
Signed-off-by: Yanning Yang --- criu/cr-restore.c | 3 +++ criu/include/criu-plugin.h | 4 ++++ criu/plugin.c | 1 + 3 files changed, 8 insertions(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index cabe2f464d..9cc77b21ff 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2132,6 +2132,9 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: + ret = run_plugins(POST_FORKING); + if (ret < 0 && ret != -ENOTSUP) + goto out_kill; ret = restore_wait_inprogress_tasks(); if (ret < 0) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 392ea9f534..9fb21a4497 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -60,6 +60,8 @@ enum { CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__POST_FORKING = 12, + CR_PLUGIN_HOOK__MAX }; @@ -78,6 +80,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); enum { CR_PLUGIN_STAGE__DUMP, @@ -152,5 +155,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); typedef int(cr_plugin_resume_devices_late_t)(int pid); +typedef int(cr_plugin_post_forking_t)(void); #endif /* __CRIU_PLUGIN_H__ */ diff --git a/criu/plugin.c b/criu/plugin.c index 65e79a0692..18da0499d7 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -59,6 +59,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); __assign_hook(CHECKPOINT_DEVICES, 
"cr_plugin_checkpoint_devices"); + __assign_hook(POST_FORKING, "cr_plugin_post_forking"); #undef __assign_hook From d2f12fada4acf0010da9eefce50cf9410313219f Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 29 Nov 2024 02:13:28 +0000 Subject: [PATCH 020/137] cr-restore: Move `cr_plugin_init` after `fdstore_init` Currently, when CRIU calls `cr_plugin_init`, `fdstore` is not initialized. However, during the plugin restore procedure, there may be some common file operations used in multiple hooks. This patch moves `cr_plugin_init` after `fdstore_init`, allowing `cr_plugin_init` to use `fdstore` to place these file operations. Signed-off-by: Yanning Yang --- criu/cr-restore.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9cc77b21ff..c1d1f4b9d5 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2366,41 +2366,47 @@ int cr_restore_tasks(void) return 1; if (check_img_inventory(/* restore = */ true) < 0) - goto err; - - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) return -1; if (init_stats(RESTORE_STATS)) - goto err; + return -1; if (lsm_check_opts()) - goto err; + return -1; timing_start(TIME_RESTORE); if (cpu_init() < 0) - goto err; + return -1; if (vdso_init_restore()) - goto err; + return -1; if (tty_init_restore()) - goto err; + return -1; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_validate_cpuinfo()) - goto err; + return -1; } if (prepare_task_entries() < 0) - goto err; + return -1; if (prepare_pstree() < 0) - goto err; + return -1; if (fdstore_init()) - goto err; + return -1; + + /* + * For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store + * its socket file descriptor. This allows the main process and the target process to + * communicate with each other through this file descriptor. Therefore, cr_plugin_init + * must be initialized after fdstore_init. 
+ */ + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; if (inherit_fd_move_to_fdstore()) goto err; From fb7540e4f7745577d09f7aac11bae45ee7c57b64 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:34:14 +0000 Subject: [PATCH 021/137] pstree: Add `has_children` function Currently, parallel restore only focuses on the single-process situation. Therefore, it needs an interface to know if there is only one process to restore. This patch adds a `has_children` function in `pstree.h` and replaces some existing implementations with this function. Signed-off-by: Yanning Yang --- criu/cr-dump.c | 2 +- criu/include/pstree.h | 1 + criu/pstree.c | 9 +++++++-- criu/seize.c | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 302078caa0..b8cf7d64d9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1396,7 +1396,7 @@ static int dump_zombies(void) item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; - BUG_ON(!list_empty(&item->children)); + BUG_ON(has_children(item)); if (!item->sid) { pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 1137046d43..b750a919e6 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; +extern bool has_children(struct pstree_item *item); extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) diff --git a/criu/pstree.c b/criu/pstree.c index 660f1b9d99..75c2fc8d0a 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -182,7 +182,7 @@ void free_pstree(struct pstree_item *root_item) struct pstree_item *item = root_item, *parent; while (item) { - if (!list_empty(&item->children)) { + if (has_children(item)) { 
item = list_first_entry(&item->children, struct pstree_item, sibling); continue; } @@ -244,10 +244,15 @@ int init_pstree_helper(struct pstree_item *ret) return 0; } +bool has_children(struct pstree_item *item) +{ + return !list_empty(&item->children); +} + /* Deep first search on children */ struct pstree_item *pstree_item_next(struct pstree_item *item) { - if (!list_empty(&item->children)) + if (has_children(item)) return list_first_entry(&item->children, struct pstree_item, sibling); while (item->parent) { diff --git a/criu/seize.c b/criu/seize.c index 23f192d46d..d0cf7b36c8 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1008,7 +1008,7 @@ static int collect_task(struct pstree_item *item) if (ret < 0) goto err_close; - if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) { + if ((item->pid->state == TASK_DEAD) && has_children(item)) { pr_err("Zombie with children?! O_o Run, run, run!\n"); goto err_close; } From cde13396b4cf349416cd203cb004e25294d7eb8d Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:36:33 +0000 Subject: [PATCH 022/137] plugins/amdgpu: Add socket operations When enabling parallel restore, the target process and the main CRIU process need an IPC interface to communicate and transfer restore commands. This patch adds a Unix domain TCP socket and stores this socket in `fdstore`. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 59 ++++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 6 +++ 2 files changed, 65 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_socket_utils.c create mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 0000000000..9e957ae54b --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); + return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 0000000000..4e7aa2aa41 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,6 @@ +#ifndef
__KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +int install_parallel_sock(void); + +#endif \ No newline at end of file From 1c773a3a16898dcfee1c2721334598cc4894b6e5 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:38:48 +0000 Subject: [PATCH 023/137] plugins/amdgpu: Add parallel restore command Currently the restore of buffer object consumes a significant amount of time. However, this part has no logical dependencies with other restore operations. This patch introduces some structures and some helper functions for the target process to offload this task to the main CRIU process. Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 261 +++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 48 +++++ 2 files changed, 309 insertions(+) diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c index 9e957ae54b..c8bf6d1ba3 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.c +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "amdgpu_socket_utils.h" #include "criu-log.h" @@ -53,6 +54,266 @@ int install_parallel_sock(void) ret = -1; goto err; } +err: + close(sock_fd); + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) +{ +
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)¶llel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd 
*restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_gpu_ids(int 
sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd, restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)¶llel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + err: close(sock_fd); return ret; diff --git 
a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h index 4e7aa2aa41..d7200c6bd5 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -1,6 +1,54 @@ #ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ #define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands.*/ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer object. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. 
+ */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + int install_parallel_sock(void); +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + #endif \ No newline at end of file From 98daa045249ee08be7b10de6b4c5102b0e7ca854 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Wed, 15 Jan 2025 06:38:27 +0000 Subject: [PATCH 024/137] plugins/amdgpu: Implement parallel restore This patch implements the entire logic to enable the offloading of buffer object content restoration. The goal of this patch is to offload the buffer object content restoration to the main CRIU process so that this restoration can occur in parallel with other restoration logic (mainly the restoration of memory state in the restore blob, which is time-consuming) to speed up the restore phase. The restoration of buffer object content usually takes a significant amount of time for GPU applications, so parallelizing it with other operations can reduce the overall restore time. It has three parts: the first replaces the restoration of buffer objects in the target process by sending a parallel restore command to the main CRIU process; the second implements the POST_FORKING hook in the amdgpu plugin to enable buffer object content restoration in the main CRIU process; the third stops the parallel thread in the RESUME_DEVICES_LATE hook. This optimization only focuses on the single-process situation (common case). In other scenarios, it will turn to the original method. This is achieved with the new `parallel_disabled` flag. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 418 +++++++++++++++++++++--- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 + 4 files changed, 373 insertions(+), 50 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index a20d1d1639..4bf5e499fb 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc-c --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 96c0861628..69194fbc79 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,11 +28,13 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default + * in this case. The flag, parallel_disabled, is used to control whether the + * optimization is enabled or disabled. 
+ */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + int offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, 
e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; - } + if (!e->device_entries[i]->gpu_id) + continue; - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, 
restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - thread_i++; - } - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1546,8 +1609,8 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) 
close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if (thread_datas) + xfree(thread_datas); return ret; } @@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +{ + return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); +} + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + int offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. 
+ */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + ret = pthread_create(newthread, NULL, f, arg); + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + return -1; + } + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(¶llel_thread, restore_device_parallel_worker, ¶llel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 5b4396a0cc..730f2e0284 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index c890e3ddae..e19f8e7ce9 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); From 4375b495d12897c7044fac6d24e6de3e3782702a Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:44:35 +0000 Subject: [PATCH 
025/137] plugins/amdgpu: Update `README.md` and `criu-amdgpu-plugin.txt` Signed-off-by: Yanning Yang --- Documentation/criu-amdgpu-plugin.txt | 1 + plugins/amdgpu/README.md | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 68803f3dbc..fe76fc3bc6 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 1078eafe6f..b808fbc4f0 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). + +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to From 9440ce4cf5f6fad2b2f38b5c7e9aabb50e13d0cc Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 30 Apr 2025 11:39:18 +0800 Subject: [PATCH 026/137] zdtm.py: add an option to change pycriu import path By default zdtm expects that criu is built from source first and only then you can run zdtm tests against it. But what if you really want to run tests against a criu version installed on the system? 
Yes there is already a nice option for zdtm to change the criu binary it uses "--criu-bin", but it would still end up using the pycriu module from source and you would still have to build everything beforehand. Let's add an option to change the path where zdtm searches for pycriu module "--pycriu-search-path". This way we can run zdtm tests on the criu installed on the system directly without building criu from source, e.g. on Fedora it works like: test/zdtm.py run --criu-bin /usr/sbin/criu \ --pycriu-search-path /usr/lib/python3.13/site-packages \ -t zdtm/static/env00 Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index e3ddc762a3..d5514af712 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -22,11 +22,11 @@ import tempfile import time import uuid +import site from builtins import input, int, open, range, str, zip import yaml -import pycriu as crpc from zdtm.criu_config import criu_config # File to store content of streamed images @@ -1142,6 +1142,24 @@ def __init__(self, opts): self.__img_streamer_process = None self.__tls = self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] + + global crpc + pycriu_search_path = opts.get('pycriu_search_path') + if pycriu_search_path: + sys.path.insert(0, pycriu_search_path) + + try: + import pycriu as crpc + if pycriu_search_path: + print(f"pycriu loaded from: {crpc.__file__}") + except ImportError: + if not pycriu_search_path: + print("Consider building CRIU or using '--pycriu-search-path' option.") + raise + finally: + if pycriu_search_path: + sys.path.pop(0) + self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] self.__preload_libfault = bool(opts['preload_libfault']) @@ -2169,7 +2187,8 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 
'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint') + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint', + 'pycriu_search_path') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2860,6 +2879,9 @@ def get_cli_args(): rp.add_argument("--criu-bin", help="Path to criu binary", default='../criu/criu') + rp.add_argument("--pycriu-search-path", + help=f"Path to search for pycriu module first (e.g., {site.getsitepackages()[0]})", + default=None) rp.add_argument("--crit-bin", help="Path to crit binary", default='../crit/crit') From a1d578c58cff44c9c1ee5461dcecee6d46642006 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 19 May 2025 11:53:18 +0800 Subject: [PATCH 027/137] zdtm: fix check for criu binary The opts['action'] contains actor function and not the action name, so we should compare it with a function. While on it let's also add a comment about --criu-bin option if CRIU binary is missing. Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index d5514af712..3339dd8167 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1611,6 +1611,7 @@ def check(feature): def available(): if not os.access(opts['criu_bin'], os.X_OK): print("CRIU binary not found at %s" % opts['criu_bin']) + print("Consider building CRIU or using '--criu-bin' option.") sys.exit(1) def kill(self): @@ -2972,7 +2973,7 @@ def fork_zdtm(): if opts['debug']: sys.settrace(traceit) - if opts['action'] == 'run': + if opts['action'] == run_tests: criu.available() for tst in test_classes.values(): tst.available() From 6a23dcf4351eb7fbdbfe0d39b1c2f2e261e12bbf Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 16 May 2025 19:26:01 +0000 Subject: [PATCH 028/137] image: use `protoc` instead of `protoc-c` The new protoc 1.5.2 reports warnings: `protoc-c` is deprecated. 
Please use `protoc` instead! Signed-off-by: Andrei Vagin --- images/Makefile | 4 ++-- plugins/amdgpu/Makefile | 2 +- test/others/rpc/Makefile | 2 +- test/others/unix-callback/Makefile | 2 +- test/zdtm/static/Makefile | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/images/Makefile b/images/Makefile index 1e40b8a8f0..d966fbfca0 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,7 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto +proto-obj-y += google/protobuf/descriptor.o # To make protoc happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -96,7 +96,7 @@ makefile-deps := Makefile $(obj)/Makefile define gen-proto-rules $(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " PBCC " $$@ - $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< + $$(Q) protoc --proto_path=$(obj)/ --c_out=$(obj)/ $$< ifeq ($(PROTOUFIX),y) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 4bf5e499fb..870a039cdb 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -25,7 +25,7 @@ else endif criu-amdgpu.pb-c.c: criu-amdgpu.proto - protoc-c --proto_path=. --c_out=. criu-amdgpu.proto + protoc --proto_path=. --c_out=. 
criu-amdgpu.proto amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index b2f907abee..384eb05397 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -47,7 +47,7 @@ rpc_pb2.py: rpc.proto protoc --proto_path=. --python_out=. rpc.proto rpc.pb-c.c: rpc.proto - protoc-c --proto_path=. --c_out=. rpc.proto + protoc --proto_path=. --c_out=. rpc.proto clean: rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu diff --git a/test/others/unix-callback/Makefile b/test/others/unix-callback/Makefile index 25bcf228b3..9840440773 100644 --- a/test/others/unix-callback/Makefile +++ b/test/others/unix-callback/Makefile @@ -4,7 +4,7 @@ run: all ./run.sh unix.pb-c.c: unix.proto - protoc-c --proto_path=. --c_out=. unix.proto + protoc --proto_path=. --c_out=. unix.proto unix-lib.so: unix-lib.c unix.pb-c.c gcc -g -Werror -Wall -shared -nostartfiles unix-lib.c unix.pb-c.c -o unix-lib.so -iquote ../../../criu/include -fPIC diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 81e44de221..61cacbb4eb 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -734,7 +734,7 @@ criu-rtc.pb-c.c: criu-rtc.proto $(Q)echo $@ >> .gitignore $(Q)echo $(@:%.c=%.h) >> .gitignore $(E) " PBCC " $@ - $(Q)protoc-c --proto_path=. --c_out=. criu-rtc.proto + $(Q)protoc --proto_path=. --c_out=. 
criu-rtc.proto criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c $(E) " LD " $@ From ccea26809dbbe61f5b1b9fbb757abe34ba7ee97c Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 16:27:32 +0200 Subject: [PATCH 029/137] criu/proc_parse: support MADV_WIPEONFORK/VM_WIPEONFORK Support VM_WIPEONFORK [1] by detecting it from /proc//smaps and setting a corresponding MADV_WIPEONFORK flag on vma. [1] https://github.com/torvalds/linux/commit/d2cd9ede6e193dd7d88b6d27399e96229a551b19 Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ criu/proc_parse.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index 8ca71fadf9..a553564909 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -13,5 +13,8 @@ #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif #endif /* __CR_MMAN_H__ */ diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 99dc518a5e..a97ee11d14 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -160,6 +160,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) From 0fcea3affec34479e1300df96d176b3ffd46df65 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 16:32:01 +0200 Subject: [PATCH 030/137] test/zdtm/static/maps02: add MADV_WIPEONFORK testcase In addition to that I did small non-functional corrections. 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/get_smaps_bits.c | 6 ++++++ test/zdtm/static/maps02.c | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index 31d0d92b2f..d9ac8b1ce7 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -18,6 +18,10 @@ #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) { char *tok; @@ -57,6 +61,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* * Anything else is just ignored. diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 29f1372c9a..37c09dc71c 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -6,7 +6,11 @@ #define MADV_DONTDUMP 16 #endif -const char *test_doc = "Test shared memory with advises"; +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test private memory with advises"; const char *test_author = "Cyrill Gorcunov "; struct mmap_data { @@ -43,12 +47,12 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[5] = {}; + struct mmap_data m[6] = {}; size_t i; test_init(argc, argv); - test_msg("Alloc growsdown\n"); + test_msg("Alloc dontfork\n"); if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) return -1; @@ -64,10 +68,14 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) return -1; - test_msg("Alloc dontfork/random|mergeable\n"); + test_msg("Alloc mergeable\n"); if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, 
MADV_MERGEABLE)) return -1; + test_msg("Alloc wipeonfork\n"); + if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) From aea53cb7f997422c248fca2679c26e384162e002 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 17:11:28 +0200 Subject: [PATCH 031/137] criu/proc_parse: support MAP_DROPPABLE mappings Support MAP_DROPPABLE [1] by detecting it from /proc//smaps and restoring it as a normal private mapping flag on vma with only difference that instead of MAP_PRIVATE we should use MAP_DROPPABLE. [1] https://github.com/torvalds/linux/commit/9651fcedf7b92d3f7f1ab179e8ab55b85ee10fc1 Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ criu/mem.c | 12 ++++++++++++ criu/proc_parse.c | 16 ++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index a553564909..086753bcf5 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -4,6 +4,9 @@ #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif diff --git a/criu/mem.c b/criu/mem.c index c9578ef441..803cb545b5 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -10,6 +10,7 @@ #include "cr_options.h" #include "servicefd.h" #include "mem.h" +#include "mman.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" @@ -398,6 +399,17 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str if (vma_entry_is(vma->e, VMA_AREA_VVAR)) return 0; + /* + * 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") + * tells us that: + * Under memory pressure, mm can just drop the pages (so that they're + * zero when read back again). 
+ * + * Let's just skip MAP_DROPPABLE mappings pages dump logic. + */ + if (vma->e->flags & MAP_DROPPABLE) + return 0; + /* * To facilitate any combination of pre-dump modes to run after * one another, we need to take extra care as discussed below. diff --git a/criu/proc_parse.c b/criu/proc_parse.c index a97ee11d14..d7eb256626 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -144,6 +144,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -206,6 +208,20 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) if (vma_area->e->madv) vma_area->e->has_madv = true; + + /* + * We set MAP_PRIVATE flag on vma_area->e->flags right after parsing + * a first line of VMA entry in /proc//smaps file: + * 7fa84fa70000-7fa84fa95000 rw-p 00000000 00:00 0 + * but it's too early and we can't distinguish between MAP_DROPPABLE + * and MAP_PRIVATE mappings yet, as they both private mappings in nature + * and at this point we haven't yet read "VmFlags:" line in smaps. + * + * Let's detect this situation and drop MAP_PRIVATE flag while keep + * MAP_DROPPABLE, otherwise restorer's restore_mapping() helper will fail. 
+ */ + if ((vma_area->e->flags & MAP_PRIVATE) && (vma_area->e->flags & MAP_DROPPABLE)) + vma_area->e->flags &= ~MAP_PRIVATE; } static inline int is_anon_shmem_map(dev_t dev) From 5873082778843db75878bbe906de21c91dc9fef2 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 18:55:46 +0200 Subject: [PATCH 032/137] pycriu/images/pb2dict: add MAP_DROPPABLE flag Signed-off-by: Alexander Mikhalitsyn --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index e3dd95ac0a..6c4f688896 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -83,6 +83,7 @@ def _custom_conv(field): mmap_flags_map = [ ('MAP_SHARED', 0x1), ('MAP_PRIVATE', 0x2), + ('MAP_DROPPABLE', 0x08), ('MAP_ANON', 0x20), ('MAP_GROWSDOWN', 0x0100), ] From 87ed9e0db3b4d3fb9c3824eedc63915bd475dbd0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 17:21:23 +0200 Subject: [PATCH 033/137] test/zdtm/static/maps02: add MAP_DROPPABLE testcase Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/get_smaps_bits.c | 6 ++++++ test/zdtm/static/maps02.c | 20 +++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index d9ac8b1ce7..3d952ac95d 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -6,6 +6,10 @@ #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -45,6 +49,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 
37c09dc71c..38244f0205 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -2,6 +2,10 @@ #include "zdtmtst.h" #include "get_smaps_bits.h" +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif @@ -27,8 +31,14 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) { m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, flags, -1, 0); if (m->start == MAP_FAILED) { - pr_perror("mmap failed"); - return -1; + if (errno == EINVAL) { + test_msg("mmap failed, no kernel support\n"); + *m = (struct mmap_data){}; + return 0; + } else { + pr_perror("mmap failed"); + return -1; + } } if (madvise(m->start, MEM_SIZE, adv)) { @@ -47,7 +57,7 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[6] = {}; + struct mmap_data m[7] = {}; size_t i; test_init(argc, argv); @@ -76,6 +86,10 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) return -1; + test_msg("Alloc droppable\n"); + if (alloc_anon_mmap(&m[6], MAP_DROPPABLE | MAP_ANONYMOUS, MADV_NORMAL)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) From d8b6b1617c09473c394ea9dcd2db85aab60ec0bb Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 21:11:29 +0200 Subject: [PATCH 034/137] test/zdtm/static: add maps11 test for MAP_DROPPABLE/MADV_WIPEONFORK In this test we want to ensure that contents of droppable mappings and mappings with MADV_WIPEONFORK is properly restored in parent/child processes. 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/Makefile | 1 + test/zdtm/static/maps11.c | 205 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 test/zdtm/static/maps11.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 61cacbb4eb..34fc90513a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -150,6 +150,7 @@ TST_NOFILE := \ maps05 \ maps09 \ maps10 \ + maps11 \ mlock_setuid \ xids00 \ groups \ diff --git a/test/zdtm/static/maps11.c b/test/zdtm/static/maps11.c new file mode 100644 index 0000000000..df309714b0 --- /dev/null +++ b/test/zdtm/static/maps11.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test MAP_DROPPABLE/MADV_WIPEONFORK mappings with 2 processes"; +const char *test_author = "Alexander Mikhalitsyn "; + +bool mem_is_zero(const uint8_t *buffer, size_t length) +{ + size_t i; + + for (i = 0; i < length; i++) + if (buffer[i] != 0) + return false; + + return true; +} + +int main(int argc, char **argv) +{ + uint8_t *p1, *p2; + pid_t pid; + int status; + const char data[] = "MADV_WIPEONFORK vma data"; + bool criu_was_there = false; + struct stat st1, st2; + + test_init(argc, argv); + + p1 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_DROPPABLE | MAP_ANONYMOUS, 0, 0); + if (p1 == MAP_FAILED) { + if (errno == EINVAL) { + skip("mmap failed, no kernel support for MAP_DROPPABLE\n"); + goto skip; + } else { + pr_perror("mmap failed"); + return -1; + } + } + + p2 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (p2 == MAP_FAILED) { + pr_perror("mmap failed"); + return 1; + } + + if (madvise(p2, sizeof(data), MADV_WIPEONFORK)) { + pr_perror("madvise failed"); + return -1; + } + + /* contents of this mapping is 
supposed to be dropped after C/R */ + memcpy(p1, data, sizeof(data)); + + /* contents of this mapping is supposed to be dropped after fork() */ + memcpy(p2, data, sizeof(data)); + + /* + * Let's spawn a process before C/R so our mappings get inherited + * then, after C/R we need to ensure that CRIU memory premapping + * machinery works properly. + * + * It is important, because we restore MADV_WIPEONFORK on a later + * stages (after vma premapping happens) and we need to ensure that + * CRIU handles everything in a right way. + */ + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + test_waitsig(); + + /* + * Both mappings have VM_WIPEONFORK flag set, + * so we expect to have it null-ified after fork(). + */ + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("1st child: memory check failed\n"); + return 1; + } + + return 0; + } + + /* + * A simple way to detect if C/R happened is to compare st_ino + * fields of stat() on the procfs files of the current task. + * + * Hopefully, this terrible hack is never used in real-world + * applications ;-) Here, we only need this to make test + * to pass with/without --nocr option. + */ + if (stat("/proc/self/status", &st1)) { + pr_perror("stat"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* signal a child process to continue */ + if (kill(pid, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("1st waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("1st process didn't exit cleanly: status=%d", status); + goto err; + } + + if (stat("/proc/self/status", &st2)) { + pr_perror("stat"); + return 1; + } + + /* detect CRIU */ + criu_was_there = st1.st_ino != st2.st_ino; + + /* + * We should mark failure if one of the following happens: + * 1. MAP_DROPPABLE memory is not zero after C/R + * 2. 
MAP_DROPPABLE memory somehow changed without C/R + * (kernel issue? memory pressure?) + * 3. MADV_WIPEONFORK memory is not preserved + * + * We care about 2nd case only because we would like test + * to pass even with --nocr zdtm.py option. + */ + if ((criu_was_there && !mem_is_zero(p1, sizeof(data))) || + (!criu_was_there && memcmp(p1, data, sizeof(data))) || + memcmp(p2, data, sizeof(data))) { + fail("Data mismatch"); + return 1; + } + + /* contents of these mappings is supposed to be dropped after fork() */ + memcpy(p1, data, sizeof(data)); + memcpy(p2, data, sizeof(data)); + + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("2nd child: memory check failed\n"); + return 1; + } + + return 0; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("2nd waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("2nd process didn't exit cleanly: status=%d", status); + goto err; + } + + pass(); + + return 0; +err: + if (waitpid(-1, NULL, WNOHANG) == 0) { + kill(pid, SIGTERM); + wait(NULL); + } + return 1; + +skip: + test_daemon(); + test_waitsig(); + pass(); + return 0; +} From 4dd73555091e04681b573e1abcc99a343fbaf287 Mon Sep 17 00:00:00 2001 From: Prajwal S N Date: Mon, 14 Apr 2025 14:06:40 +0530 Subject: [PATCH 035/137] feat: introduce Nix flake CRIU currently requires a number of dependencies in order to build from source. The package names vary across distributions and package managers. A Nix flake allows developers to spin up a dev environment with `nix develop`, eliminating the hassle of manual dependency management. It also prevents polluting the global package set on the machine. 
Signed-off-by: Prajwal S N --- CONTRIBUTING.md | 2 +- flake.lock | 61 +++++++++++++++++++++++++++++++++++++++ flake.nix | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 37965e5fba..712e7b8132 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,7 +34,7 @@ To clone CRIU repo and switch to the proper branch, run: ### Compile -First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. Alternatively, you can use the Nix flake to set up a development environment by running `nix develop`. To compile CRIU, run: diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000000..90c914452b --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1744463964, + "narHash": "sha256-LWqduOgLHCFxiTNYi3Uj5Lgz0SR+Xhw3kr/3Xd0GPTM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2631b0b7abcea6e640ce31cd78ea58910d31e650", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": 
"sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000000..dc2429ffc9 --- /dev/null +++ b/flake.nix @@ -0,0 +1,77 @@ +{ + description = "CRIU development environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + # Dependencies for CRIU + criuDeps = with pkgs; [ + # Compiler and build essentials + gcc + gnumake + pkg-config + + # Protocol Buffers + protobuf + protobufc + python3Packages.protobuf + + # Other required libraries + libuuid + libbsd + iproute2 + nftables + libcap + libnet + libnl + libaio + gnutls + libdrm + + # ZDTM + python3Packages.pyyaml + ]; + + # Multilib support for 32-bit compatibility + # criuDeps32bit = with pkgs; [ + # glibc.dev + # glibc + # gcc-unwrapped + # ]; + + devShell = pkgs.mkShell { + buildInputs = criuDeps; # ++ (if pkgs.stdenv.isx86_64 then criuDeps32bit else []); + + shellHook = '' + echo "CRIU development environment" + echo "==============================" + echo "" + echo "Useful commands:" + echo " make - Build CRIU" + echo " make test - Run tests (requires ZDTM dependencies)" + echo "" + ''; + + # Add proper flags for multilib support + # NIX_CFLAGS_COMPILE = pkgs.lib.optional pkgs.stdenv.isx86_64 "-m32"; + + # Make sure the shell can find headers for multilib + # PKG_CONFIG_PATH = pkgs.lib.makeSearchPath "lib/pkgconfig" criuDeps; + }; + in + { + # Export the development shell + devShells.default = devShell; + + # Build CRIU package as well + packages.default = pkgs.criu; + } + ); +} 
From 6d86a29afb1fc08ffe6045650ac9b18f23ea4087 Mon Sep 17 00:00:00 2001 From: Liana Koleva <43767763+lianakoleva@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:41:51 +0000 Subject: [PATCH 036/137] crtools: simplify check for cpuinfo subcommands The cpuinfo command requires a "dump" or "check" subcommand. Thus, we replace `CR_CPUINFO` with `CR_CPUINFO_DUMP` and `CR_CPUINFO_CHECK`. This allows us to remove unnecessary subcommand check in `image_dir_mode()` and perform all parsing in `parse_criu_mode()`. With this change the check for validating the cpuinfo subcommand is now done only once with `CR_CPUINFO_DUMP` or `CR_CPUINFO_CHECK` enum. Signed-off-by: Liana Koleva <43767763+lianakoleva@users.noreply.github.com> Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 2 +- criu/crtools.c | 57 ++++++++++++++++++++------------------- criu/include/cr_options.h | 3 ++- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b9d11ced22..d8c5967bc9 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -1261,7 +1261,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; - opts.mode = CR_CPUINFO; + opts.mode = (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ? 
CR_CPUINFO_DUMP : CR_CPUINFO_CHECK; if (setup_opts_from_req(sk, msg->opts)) goto cout; diff --git a/criu/crtools.c b/criu/crtools.c index 6f493850b9..4734c90f2f 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -54,19 +54,17 @@ void flush_early_log_to_stderr(void) flush_early_log_buffer(STDERR_FILENO); } -static int image_dir_mode(char *argv[], int optind) +static int image_dir_mode(void) { switch (opts.mode) { case CR_DUMP: /* fallthrough */ + case CR_CPUINFO_DUMP: + /* fallthrough */ case CR_PRE_DUMP: return O_DUMP; case CR_RESTORE: return O_RSTR; - case CR_CPUINFO: - if (!strcmp(argv[optind + 1], "dump")) - return O_DUMP; - /* fallthrough */ default: return -1; } @@ -76,7 +74,7 @@ static int image_dir_mode(char *argv[], int optind) return -1; } -static int parse_criu_mode(char *mode) +static int parse_criu_mode(char *mode, char *subcommand) { if (!strcmp(mode, "dump")) opts.mode = CR_DUMP; @@ -96,8 +94,12 @@ static int parse_criu_mode(char *mode) opts.mode = CR_SWRK; else if (!strcmp(mode, "dedup")) opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo")) - opts.mode = CR_CPUINFO; + else if (!strcmp(mode, "cpuinfo") && subcommand == NULL) + return -2; + else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; else if (!strcmp(mode, "exec")) opts.mode = CR_EXEC_DEPRECATED; else if (!strcmp(mode, "show")) @@ -115,6 +117,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; + char *subcommand; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -165,9 +168,15 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - if (parse_criu_mode(argv[optind])) { + has_sub_command = (argc - optind) > 1; + subcommand = has_sub_command ? 
argv[optind + 1] : NULL; + ret = parse_criu_mode(argv[optind], subcommand); + if (ret == -1) { pr_err("unknown command: %s\n", argv[optind]); goto usage; + } else if (ret == -2) { + pr_err("cpuinfo requires an action: dump or check\n"); + goto usage; } /* * util_init initializes criu_run_id and compel_run_id so that sockets @@ -223,25 +232,20 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else { + } else if (opts.mode != CR_CPUINFO_DUMP && opts.mode != CR_CPUINFO_CHECK && has_sub_command) { /* No subcommands except for cpuinfo and restore --exec-cmd */ - if (opts.mode != CR_CPUINFO && has_sub_command) { - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); - goto usage; - } else if (opts.mode == CR_CPUINFO && !has_sub_command) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", argv[optind]); + goto usage; } - if (opts.stream && image_dir_mode(argv, optind) == -1) { + if (opts.stream && image_dir_mode() == -1) { pr_err("--stream cannot be used with the %s command\n", argv[optind]); goto usage; } /* We must not open imgs dir, if service is called */ if (opts.mode != CR_SERVICE) { - ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); + ret = open_image_dir(opts.imgs_dir, image_dir_mode()); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; @@ -335,15 +339,12 @@ int main(int argc, char *argv[], char *envp[]) if (opts.mode == CR_DEDUP) return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO) { - if (!argv[optind + 1]) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } - if (!strcmp(argv[optind + 1], "dump")) - return cpuinfo_dump(); - else if (!strcmp(argv[optind + 1], "check")) - return cpuinfo_check(); + if (opts.mode == CR_CPUINFO_DUMP) { + return cpuinfo_dump(); + } + + if (opts.mode == CR_CPUINFO_CHECK) { + return cpuinfo_check(); } if (opts.mode == CR_EXEC_DEPRECATED) { diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index ab0bd8fa36..4df8056b7b 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -125,7 +125,8 @@ enum criu_mode { CR_SERVICE, CR_SWRK, CR_DEDUP, - CR_CPUINFO, + CR_CPUINFO_DUMP, + CR_CPUINFO_CHECK, CR_EXEC_DEPRECATED, CR_SHOW_DEPRECATED, }; From 79f9740842ce4fcd2ca034eba4a89f2e83bd60fb Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 20 May 2025 14:47:55 +0000 Subject: [PATCH 037/137] crtools: do a few minor cleanups Signed-off-by: Andrei Vagin --- criu/crtools.c | 140 +++++++++++++++++++++++++------------------------ 1 file changed, 72 insertions(+), 68 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index 4734c90f2f..509e73d741 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -74,40 +74,55 @@ static int image_dir_mode(void) return -1; } -static int parse_criu_mode(char *mode, char 
*subcommand) +struct { + char *cmd; + int mode; +} commands[] = { + { "dump", CR_DUMP }, + { "pre-dump", CR_PRE_DUMP }, + { "restore", CR_RESTORE }, + { "lazy-pages", CR_LAZY_PAGES }, + { "check", CR_CHECK }, + { "page-server", CR_PAGE_SERVER }, + { "service", CR_SERVICE }, + { "swrk", CR_SWRK }, + { "dedup", CR_DEDUP }, + { "exec", CR_EXEC_DEPRECATED }, + { "show", CR_SHOW_DEPRECATED }, +}; + +static int parse_criu_mode(int argc, char **argv, int *optind) { - if (!strcmp(mode, "dump")) - opts.mode = CR_DUMP; - else if (!strcmp(mode, "pre-dump")) - opts.mode = CR_PRE_DUMP; - else if (!strcmp(mode, "restore")) - opts.mode = CR_RESTORE; - else if (!strcmp(mode, "lazy-pages")) - opts.mode = CR_LAZY_PAGES; - else if (!strcmp(mode, "check")) - opts.mode = CR_CHECK; - else if (!strcmp(mode, "page-server")) - opts.mode = CR_PAGE_SERVER; - else if (!strcmp(mode, "service")) - opts.mode = CR_SERVICE; - else if (!strcmp(mode, "swrk")) - opts.mode = CR_SWRK; - else if (!strcmp(mode, "dedup")) - opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo") && subcommand == NULL) - return -2; - else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "dump")) - opts.mode = CR_CPUINFO_DUMP; - else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "check")) - opts.mode = CR_CPUINFO_CHECK; - else if (!strcmp(mode, "exec")) - opts.mode = CR_EXEC_DEPRECATED; - else if (!strcmp(mode, "show")) - opts.mode = CR_SHOW_DEPRECATED; - else - return -1; + char *cmd = argv[*optind]; + bool has_sub_command = (argc - *optind) > 1; + char *subcommand = has_sub_command ? 
argv[*optind + 1] : NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(cmd, commands[i].cmd)) + continue; + opts.mode = commands[i].mode; + return 0; + } - return 0; + if (!strcmp(cmd, "cpuinfo")) { + if (subcommand == NULL) { + pr_err("cpuinfo requires an action: dump or check\n"); + return -1; + } + if (!strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; + else { + pr_err("unknown cpuinfo sub-command: %s\n", subcommand); + return -1; + } + (*optind)++; + return 0; + } + pr_err("unknown command: %s\n", argv[*optind]); + return -1; } int main(int argc, char *argv[], char *envp[]) @@ -117,7 +132,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; - char *subcommand; + char *cmd; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -168,16 +183,11 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - has_sub_command = (argc - optind) > 1; - subcommand = has_sub_command ? 
argv[optind + 1] : NULL; - ret = parse_criu_mode(argv[optind], subcommand); - if (ret == -1) { - pr_err("unknown command: %s\n", argv[optind]); - goto usage; - } else if (ret == -2) { - pr_err("cpuinfo requires an action: dump or check\n"); + cmd = argv[optind]; + ret = parse_criu_mode(argc, argv, &optind); + if (ret) goto usage; - } + /* * util_init initializes criu_run_id and compel_run_id so that sockets * are generated with an unique name identifying the specific process @@ -232,14 +242,13 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else if (opts.mode != CR_CPUINFO_DUMP && opts.mode != CR_CPUINFO_CHECK && has_sub_command) { - /* No subcommands except for cpuinfo and restore --exec-cmd */ - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); + } else if (has_sub_command) { + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", cmd); goto usage; } if (opts.stream && image_dir_mode() == -1) { - pr_err("--stream cannot be used with the %s command\n", argv[optind]); + pr_err("--stream cannot be used with the %s command\n", cmd); goto usage; } @@ -290,14 +299,13 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - if (opts.mode == CR_DUMP) { + switch (opts.mode) { + case CR_DUMP: if (!opts.tree_id) goto opt_pid_missing; return cr_dump_tasks(opts.tree_id); - } - - if (opts.mode == CR_PRE_DUMP) { + case CR_PRE_DUMP: if (!opts.tree_id) goto opt_pid_missing; @@ -307,9 +315,7 @@ int main(int argc, char *argv[], char *envp[]) } return cr_pre_dump_tasks(opts.tree_id) != 0; - } - - if (opts.mode == CR_RESTORE) { + case CR_RESTORE: if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -322,43 +328,41 @@ int main(int argc, char *argv[], char *envp[]) } return ret != 0; - } - if (opts.mode == CR_LAZY_PAGES) + case CR_LAZY_PAGES: return cr_lazy_pages(opts.daemon_mode) != 0; - if (opts.mode == CR_CHECK) + case CR_CHECK: return cr_check() != 0; - if (opts.mode == CR_PAGE_SERVER) + case CR_PAGE_SERVER: return cr_page_server(opts.daemon_mode, false, -1) != 0; - if (opts.mode == CR_SERVICE) + case CR_SERVICE: return cr_service(opts.daemon_mode); - if (opts.mode == CR_DEDUP) + case CR_DEDUP: return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO_DUMP) { + case CR_CPUINFO_DUMP: return cpuinfo_dump(); - } - if (opts.mode == CR_CPUINFO_CHECK) { + case CR_CPUINFO_CHECK: return cpuinfo_check(); - } - if (opts.mode == CR_EXEC_DEPRECATED) { + case CR_EXEC_DEPRECATED: pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; - } - if (opts.mode == CR_SHOW_DEPRECATED) { + case CR_SHOW_DEPRECATED: pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; - } - pr_err("unknown command: %s\n", 
argv[optind]); + case CR_UNSET: + default: + pr_err("unknown command: %s\n", cmd); + } usage: pr_msg("\n" "Usage:\n" From d8014247d38d8eeed7c25ca0e6bf9fcfa59a7de0 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 23 May 2025 08:33:20 +0100 Subject: [PATCH 038/137] cpuinfo: show error when image is missing The `criu cpuinfo check` command calls cpu_validate_cpuinfo(), which attempts to open the cpuinfo.img file using `open_image()`. If the image file is not found, `open_image()` returns an "empty image" object. As a result, `cpu_validate_cpuinfo()` tries to read from it and fails with the following error: (00.002473) Error (criu/protobuf.c:72): Unexpected EOF on (empty-image) This patch adds a check for an empty image and appropriate error message. Signed-off-by: Radostin Stoyanov --- criu/arch/ppc64/cpu.c | 6 ++++++ criu/arch/s390/cpu.c | 6 ++++++ criu/arch/x86/cpu.c | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c index bb5b7256e2..b87230f40a 100644 --- a/criu/arch/ppc64/cpu.c +++ b/criu/arch/ppc64/cpu.c @@ -64,6 +64,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/s390/cpu.c b/criu/arch/s390/cpu.c index 3f430f4550..e227fad5e1 100644 --- a/criu/arch/s390/cpu.c +++ b/criu/arch/s390/cpu.c @@ -87,6 +87,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index dfa31569fa..2e1f2de9ad 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -407,6 +407,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + 
return -1; + } + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; From b255f4686f85e223fe1f48a37849a27c27845534 Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Tue, 6 May 2025 22:40:25 -0400 Subject: [PATCH 039/137] rpc/log: return first error always Use shared first error buffer to return correct first error in rpc. Fixes: #338 Signed-off-by: Ivan Pravdin --- criu/cr-service.c | 24 +++++++++++++++++++++++- criu/log.c | 4 ++++ test/others/rpc/errno.py | 22 +++++++++++++++++++++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index d8c5967bc9..a1089ad5c7 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -895,6 +895,11 @@ static int check(int sk, CriuOpts *req) resp.type = CRIU_REQ_TYPE__CHECK; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -919,6 +924,7 @@ static int check(int sk, CriuOpts *req) resp.success = true; out: + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -927,6 +933,11 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) int pid, status; bool success = false; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1005,6 +1016,11 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; @@ -1078,6 +1094,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1252,6 +1269,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) bool success = false; int pid, status; + if (log_keep_err()) { + pr_perror("Can't tune log"); + 
goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1301,7 +1323,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) out: resp.type = msg->type; resp.success = success; - + set_resp_err(&resp); return send_criu_msg(sk, &resp); } diff --git a/criu/log.c b/criu/log.c index 70e267fd65..a02a8df204 100644 --- a/criu/log.c +++ b/criu/log.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -114,6 +115,9 @@ static struct str_and_lock *first_err; int log_keep_err(void) { + if (first_err) + return 0; + first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index 4ea6c9d441..a5a3eb54dc 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -40,7 +40,7 @@ def recv_resp(self): resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) return resp - def check_resp(self, resp, typ, err): + def check_resp(self, resp, typ, err, errmsg = None): if resp.type != typ: raise Exception('Unexpected response type ' + str(resp.type)) @@ -49,6 +49,9 @@ def check_resp(self, resp, typ, err): if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) + + if errmsg and errmsg not in resp.cr_errmsg: + raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') def no_process(self): print('Try to dump unexisting process') @@ -131,12 +134,29 @@ def bad_request(self): self.check_resp(resp, rpc.EMPTY, None) print('Success') + + def child_first_err(self): + print('Receive correct first error message') + + req = self.get_base_req() + req.type = rpc.CHECK + + # mntns_compat_mode options is only allowed on restore + req.opts.mntns_compat_mode = True + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.CHECK, None, "Option --mntns-compat-mode is only valid on restore\n") + + print('Success') def run(self): self.no_process() self.process_exists() self.bad_options() self.bad_request() + 
self.child_first_err() t = test() From e58191dd65211bee485a7cf0a23f705fa385539d Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 13:24:11 +0800 Subject: [PATCH 040/137] ipc/sysctl: fix CTL_FLAGS_IPC_EACCES_SKIP by making it a flag Having CTL_FLAGS_IPC_EACCES_SKIP == (CTL_FLAGS_OPTIONAL | CTL_FLAGS_READ_EIO_SKIP) is probably not what we want. So let's make it a real distinct flag. Fixes: 840735aa0 ("ipc_sysctl: Prioritize restoring IPC variables using non usernsd approach") Signed-off-by: Pavel Tikhomirov --- criu/include/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index cb3eba8174..2d689a9a04 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -37,6 +37,6 @@ enum { #define CTL_FLAGS_OPTIONAL 1 #define CTL_FLAGS_HAS 2 #define CTL_FLAGS_READ_EIO_SKIP 4 -#define CTL_FLAGS_IPC_EACCES_SKIP 5 +#define CTL_FLAGS_IPC_EACCES_SKIP 8 #endif /* __CR_SYSCTL_H__ */ From e767425d8d59d61782ab4ac74d89dcb8d854ece3 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 13:34:19 +0800 Subject: [PATCH 041/137] net/sysctl: fix misprint in an error message Fixes: f38e58836 ("net/sysctl: c/r ipv4/ping_group_range value") Signed-off-by: Pavel Tikhomirov --- criu/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 300df480b0..e5d2f1c4d1 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2147,7 +2147,7 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) size_t n = *pn; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctlig (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("unix: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } From ef7f5a76db3cde598aeb1055074b20797f115902 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 10 Jun 2025 11:33:59 +0800 Subject: [PATCH 042/137] net/sysctl: fix broken 
ipv4_sysctls_op We have ability to skip sysctl if there is no value, but we still give n requests to sysctl_op, that is not correct and probably can segfault on nullptr access. Fix it by adding ri to count non skipped requests. To be on the safe side, let's add a check that ri == n on read, as we should not do any skips there. While on it lets fix bad error message prefix: s/unix/ipv4/. Remove excess has_iarg set, and add sarg reset to NULL for the case sysctl_op skipped it. Signed-off-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/net.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/criu/net.c b/criu/net.c index e5d2f1c4d1..2c018ef7bb 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2144,51 +2144,53 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; SysctlEntry **sysctl = *rsysctl; - size_t n = *pn; + size_t n = *pn, ri; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("ipv4: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < n; i++) { - snprintf(path[i], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); - req[i].name = path[i]; - req[i].flags = flags; + for (i = 0, ri = 0; i < n; i++) { + snprintf(path[ri], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[ri].name = path[ri]; + req[ri].flags = flags; switch (sysctl[i]->type) { case SYSCTL_TYPE__CTL_STR: - req[i].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + req[ri].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); /* skip write if have no value */ if (op == CTL_WRITE && !sysctl[i]->sarg) continue; - req[i].arg = sysctl[i]->sarg; + req[ri].arg = 
sysctl[i]->sarg; break; default: pr_err("ipv4: Unknown sysctl type %d\n", sysctl[i]->type); return -1; } + ri++; } - ret = sysctl_op(req, n, op, CLONE_NEWNET); + ret = sysctl_op(req, ri, op, CLONE_NEWNET); if (ret < 0) { - pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); + pr_err("ipv4: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); return -1; } if (op == CTL_READ) { bool has_entries = false; + BUG_ON(ri != n); for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { - sysctl[i]->has_iarg = true; - if (!has_entries) - has_entries = true; + has_entries = true; + } else { + sysctl[i]->sarg = NULL; } } From d9893a6aa0b947d148ef982c1b9e20951fba71ed Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 14:07:13 +0800 Subject: [PATCH 043/137] net/sysctl: make ipv4/ping_group_range work in user namespaces We dump sysctls from criu user namespace, but restore from restored user namespace. So group id values should be mapped to the restored user namespace gid space to restore correctly. Signed-off-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/net.c | 44 ++++++++++++++++++++++++++ test/zdtm/static/netns_sub_sysctl.desc | 2 +- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 2c018ef7bb..e5775a3287 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2203,6 +2203,42 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) return 0; } +static int ipv4_sysctls_ping_group_range_map_gid(SysctlEntry *ent, size_t size) +{ + int start, end, ustart, uend, ret; + + if (sscanf(ent->sarg, "%d %d", &start, &end) != 2) { + pr_err("Failed to parse ping_group_range: %s\n", ent->sarg); + return -1; + } + + /* + * The default is "1 0", which means no group + * is allowed to create ICMP Echo sockets. 
+ */ + if (start == 1 && end == 0) { + pr_debug("The ping_group_range is set to default, skipping it.\n"); + ent->sarg = NULL; + return 0; + } + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + ustart = userns_gid(start); + uend = userns_gid(end); + pr_debug("Mapping ping_group_range %d %d to userns -> %d %d\n", + start, end, ustart, uend); + + ret = snprintf(ent->sarg, size, "%d\t%d\n", ustart, uend); + if (ret < 0 || ret >= size) { + pr_err("Failed to map ping_group_range: %d\t%d\n", ustart, uend); + return -1; + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2220,6 +2256,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) SysctlEntry *ipv4_sysctls = NULL; size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; + int ping_group_range_id = -1; NetnsId *ids; struct netns_id *p; @@ -2310,6 +2347,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; netns.ipv4_sysctl[i]->sarg = ping_group_range; + ping_group_range_id = i; } else { /* Need to handle this case when we have more sysctls */ BUG(); @@ -2338,6 +2376,12 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + BUG_ON(ping_group_range_id == -1); + ret = ipv4_sysctls_ping_group_range_map_gid(netns.ipv4_sysctl[ping_group_range_id], + MAX_STR_IPV4_SYSCTL_LEN + 1); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); diff --git a/test/zdtm/static/netns_sub_sysctl.desc b/test/zdtm/static/netns_sub_sysctl.desc index 5358426683..0c357aefe4 100644 --- a/test/zdtm/static/netns_sub_sysctl.desc +++ b/test/zdtm/static/netns_sub_sysctl.desc @@ -1,4 +1,4 @@ { - 'flavor': 'ns', + 'flavor': 'ns uns', 'flags': 'suid' } From 
60ac146010ecb8f0346874b30c69d19ef62cb123 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 9 Jun 2025 21:17:57 -0700 Subject: [PATCH 044/137] zdtm/netns_sub_sysctl: skip unsupported sysctls net/unix/max_dgram_qlen can't be tuned from non-root userns before: v5.17-rc1~170^2~215 ("net: Enable max_dgram_qlen unix sysctl to be configurable by non-init user namespaces") Signed-off-by: Andrei Vagin --- test/zdtm/static/netns_sub_sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 0f94c40a79..03b478b7d7 100644 --- a/test/zdtm/static/netns_sub_sysctl.c +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -1,4 +1,6 @@ #include +#include +#include #include "zdtmtst.h" #include "sysctl.h" @@ -20,6 +22,7 @@ typedef struct { int new; char s_old[MAX_STR_SYSCTL_LEN]; char s_new[MAX_STR_SYSCTL_LEN]; + bool set; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" @@ -38,6 +41,11 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { + if (access(p->path, W_OK) != 0) { + test_msg("%s doesn't exist\n", p->path); + continue; + } + p->set = true; if (p->type == SYSCTL_INT) { p->old = (((unsigned)lrand48()) % 1023) + 1; if (sysctl_write_int(p->path, p->old)) { @@ -56,6 +64,8 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { + if (!p->set) + continue; if (p->type == SYSCTL_INT) { if (sysctl_read_int(p->path, &p->new)) ret = 1; From 6ebab3d26225f8e815b8042125d3ac161086bacf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Fri, 27 Dec 2024 03:47:35 +0530 Subject: [PATCH 045/137] sk-inet: Add support for checkpoint/restore of ICMP sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is no option to checkpoint/restore 
programs that use ICMP sockets, such as `ping`. This patch adds support for the same. Fixes #2557 Signed-off-by: समीर सिंह Sameer Singh --- criu/sk-inet.c | 7 +++++-- criu/sockets.c | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 1238b03dc5..6e0acf2ce3 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -130,6 +130,8 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); @@ -922,8 +924,9 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) } if (ie->src_port) { - if (inet_bind(sk, ii)) - goto err; + if (ie->proto != IPPROTO_ICMP && ie->proto != IPPROTO_ICMPV6) + if (inet_bind(sk, ii)) + goto err; } /* diff --git a/criu/sockets.c b/criu/sockets.c index f9ce999bed..0affccad02 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -65,7 +65,7 @@ const char *socket_proto_name(unsigned int proto, char *nm, size_t size) [IPPROTO_IPV6] = __stringify_1(IPPROTO_IPV6), [IPPROTO_RSVP] = __stringify_1(IPPROTO_RSVP), [IPPROTO_GRE] = __stringify_1(IPPROTO_GRE), [IPPROTO_ESP] = __stringify_1(IPPROTO_ESP), [IPPROTO_AH] = __stringify_1(IPPROTO_AH), [IPPROTO_UDPLITE] = __stringify_1(IPPROTO_UDPLITE), - [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), + [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), [IPPROTO_ICMPV6] = __stringify_1(IPPROTO_ICMPV6), }; return __socket_const_name(nm, size, protos, ARRAY_SIZE(protos), proto); } @@ -131,10 +131,12 @@ enum socket_cl_bits { INET_UDP_CL_BIT, INET_UDPLITE_CL_BIT, INET_RAW_CL_BIT, + INET_ICMP_CL_BIT, INET6_TCP_CL_BIT, INET6_UDP_CL_BIT, INET6_UDPLITE_CL_BIT, INET6_RAW_CL_BIT, + INET6_ICMP_CL_BIT, UNIX_CL_BIT, PACKET_CL_BIT, _MAX_CL_BIT, @@ -161,6 +163,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET_UDPLITE_CL_BIT; if 
(proto == IPPROTO_RAW) return INET_RAW_CL_BIT; + if (proto == IPPROTO_ICMP) + return INET_ICMP_CL_BIT; } if (family == AF_INET6) { if (proto == IPPROTO_TCP) @@ -171,6 +175,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET6_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET6_RAW_CL_BIT; + if (proto == IPPROTO_ICMPV6) + return INET6_ICMP_CL_BIT; } pr_err("Unknown pair family %d proto %d\n", family, proto); @@ -282,6 +288,12 @@ void preload_socket_modules(void) req.r.i.sdiag_protocol = IPPROTO_RAW; probe_diag(nl, &req, -ENOENT); + req.r.i.sdiag_protocol = IPPROTO_ICMP; + probe_diag(nl, &req, -ENOENT); + + req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + probe_diag(nl, &req, -ENOENT); + close(nl); pr_info("Done probing\n"); } @@ -773,6 +785,10 @@ static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg) case IPPROTO_RAW: type = SOCK_RAW; break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + type = SOCK_DGRAM; + break; default: BUG_ON(1); return -1; @@ -797,7 +813,7 @@ static int collect_err(int err, struct ns_id *ns, void *arg) char family[32], proto[32]; char msg[256]; - snprintf(msg, sizeof(msg), "Sockects collect procedure family %s proto %s", + snprintf(msg, sizeof(msg), "Sockets collect procedure family %s proto %s", socket_family_name(gr->family, family, sizeof(family)), socket_proto_name(gr->protocol, proto, sizeof(proto))); @@ -905,6 +921,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv4 ICMP sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_ICMP; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + /* Collect IPv6 TCP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_TCP; @@ -944,6 +967,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv6 ICMP sockets */ + req.r.i.sdiag_family = AF_INET6; + 
req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + req.r.p.sdiag_family = AF_PACKET; req.r.p.sdiag_protocol = 0; req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; From 0d52a54b98ebf2dd4c33148cfedd22a79592ee63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Sat, 28 Dec 2024 09:35:11 +0530 Subject: [PATCH 046/137] test: add static tests for ICMP socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ZDTM static tests for IP4/ICMP and IP6/ICMP socket feature. Signed-off-by: समीर सिंह Sameer Singh Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 3 + test/zdtm/static/socket6_icmp.c | 1 + test/zdtm/static/socket_icmp.c | 128 ++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+) create mode 120000 test/zdtm/static/socket6_icmp.c create mode 100644 test/zdtm/static/socket_icmp.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 34fc90513a..d427659e0e 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -37,6 +37,8 @@ TST_NOFILE := \ socket_udp-corked \ socket6_udp \ socket_udp_shutdown \ + socket_icmp \ + socket6_icmp \ sk-freebind \ sk-freebind-false \ socket_udplite \ @@ -630,6 +632,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 +socket6-icmp: CFLAGS += -DZDTM_IPV6 sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO sock_tcp_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS diff --git a/test/zdtm/static/socket6_icmp.c b/test/zdtm/static/socket6_icmp.c new file mode 120000 index 0000000000..24d8fd8067 --- /dev/null +++ 
b/test/zdtm/static/socket6_icmp.c @@ -0,0 +1 @@ +socket_icmp.c \ No newline at end of file diff --git a/test/zdtm/static/socket_icmp.c b/test/zdtm/static/socket_icmp.c new file mode 100644 index 0000000000..f72e348bf4 --- /dev/null +++ b/test/zdtm/static/socket_icmp.c @@ -0,0 +1,128 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for ICMP socket\n"; +const char *test_author = "समीर सिंह Sameer Singh \n"; + +/* Description: + * Send a ping to localhost using ICMP socket + */ + +#include +#include +#include +#include +#if defined(ZDTM_IPV6) +#include +#else +#include +#endif +#include +#include +#include + +#include "sysctl.h" + +#define PACKET_SIZE 64 +#define RECV_TIMEOUT 1 + +static int echo_id = 1234; + +#if defined(ZDTM_IPV6) +#define TEST_ICMP_ECHOREPLY ICMP6_ECHOREPLY +#else +#define TEST_ICMP_ECHOREPLY ICMP_ECHOREPLY +#endif +int main(int argc, char **argv) +{ + int ret, sock, seq = 0; + char packet[PACKET_SIZE], recv_packet[PACKET_SIZE]; + + struct timeval tv; +#if defined(ZDTM_IPV6) + struct sockaddr_in6 addr, recv_addr; +#else + struct icmphdr icmp_header, *icmp_reply; +#endif + struct sockaddr_in addr, recv_addr; + socklen_t addr_len; + + // Allow GIDs 0-58468 to open an unprivileged ICMP socket + if (sysctl_write_str("/proc/sys/net/ipv4/ping_group_range", "0 58468")) + return -1; + + test_init(argc, argv); + +#if defined(ZDTM_IPV6) + sock = socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6); +#else + sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP); +#endif + if (sock < 0) { + pr_perror("Can't create socket"); + return 1; + } + + tv.tv_sec = RECV_TIMEOUT; + tv.tv_usec = 0; + if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) { + pr_perror("Can't set socket option"); + return 1; + } + + memset(&addr, 0, sizeof(addr)); + memset(&icmp_header, 0, sizeof(icmp_header)); +#if defined(ZDTM_IPV6) + addr.sin6_family = AF_INET6; + inet_pton(AF_INET6, "::1", &addr.sin6_addr); + + icmp_header.icmp6_type = ICMP6_ECHO_REQUEST; + 
icmp_header.icmp6_code = 0; + icmp_header.icmp6_id = echo_id; + icmp_header.icmp6_seq = seq; +#else + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + icmp_header.type = ICMP_ECHO; + icmp_header.code = 0; + icmp_header.un.echo.id = echo_id; + icmp_header.un.echo.sequence = seq; +#endif + + memcpy(packet, &icmp_header, sizeof(icmp_header)); + memset(packet + sizeof(icmp_header), 0xa5, + PACKET_SIZE - sizeof(icmp_header)); + + test_daemon(); + test_waitsig(); + + ret = sendto(sock, packet, PACKET_SIZE, 0, + (struct sockaddr *)&addr, sizeof(addr)); + + if (ret < 0) { + fail("Can't send"); + return 1; + } + + addr_len = sizeof(recv_addr); + + ret = recvfrom(sock, recv_packet, sizeof(recv_packet), 0, + (struct sockaddr *)&recv_addr, &addr_len); + + if (ret < 0) { + fail("Can't recv"); + return 1; + } + + icmp_reply = (struct icmphdr *)recv_packet; + + if (icmp_reply->type != ICMP_ECHOREPLY) { + fail("Got no ICMP_ECHO_REPLY"); + return 1; + } + + close(sock); + + pass(); + return 0; +} From 1436cd00937cd4d68607d92caaea8b79ef5f450c Mon Sep 17 00:00:00 2001 From: Chuan Qiu Date: Thu, 12 Jun 2025 22:49:26 -0700 Subject: [PATCH 047/137] mount: Fix trailing / when a file is bind-mounted E.g. I have a /etc/hosts in workspace mounted from the host, and get the following message. (00.141008) 1: mnt-v2: Create plain mountpoint /tmp/.criu.mntns.K1biY1/mnt-0000000938 for 938 (00.141546) 1: mnt-v2: Mounting unsupported @938 (0) (00.141887) 1: mnt-v2: Bind /tmp/agent/1-d8c746c6fda3a8b2/workspace/etc/hosts/ to /tmp/.criu.mntns.K1biY1/mnt-0000000938 (00.142179) 1: Error (criu/mount-v2.c:319): mnt-v2: Failed to open_tree /tmp/agent/1-d8c746c6fda3a8b2/workspace/etc/hosts/: Not a directory (00.143774) Error (criu/cr-restore.c:2320): Restoring FAILED. 
Signed-off-by: Chuan Qiu --- criu/mount.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/criu/mount.c b/criu/mount.c index 06b9595427..b643a7f26e 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -888,7 +888,11 @@ static int resolve_external_mounts(struct mount_info *info) cut_root = cut_root_for_bind(m->root, match->root); - p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + if (cut_root[0] == '\0') { + p = xstrdup(match->ns_mountpoint + 1); + } else { + p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + } if (!p) return -1; From 9f6f0392d54cdb8bb38a64963ff1be8ee3f249f3 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 20 Jun 2025 13:44:32 +0800 Subject: [PATCH 048/137] zdtm: Add static/mnt_ext_file_bind_auto test The test creates a file bindmount in criu mntns and binds it into test mntns; this external file bindmount is autodetected and restored via "--external mnt[]" criu option. Note: In previous patch we fix the problem on this code path where file bindmount restore fails as there is excess "/" in source path. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ext_file_bind_auto.c | 104 +++++++++++++++++++ test/zdtm/static/mnt_ext_file_bind_auto.desc | 4 + 3 files changed, 109 insertions(+) create mode 100644 test/zdtm/static/mnt_ext_file_bind_auto.c create mode 100644 test/zdtm/static/mnt_ext_file_bind_auto.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index d427659e0e..ab69f389ed 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -381,6 +381,7 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + mnt_ext_file_bind_auto \ TST_DIR = \ cwd00 \ diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.c b/test/zdtm/static/mnt_ext_file_bind_auto.c new file mode 100644 index 0000000000..0c3b9f5fbd --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if external file mount works"; +const char *test_author = "Pavel Tikhomirov "; + +char *filename = "mnt_ext_file_bind_auto_bind_auto.file"; +TEST_OPTION(filename, string, "file name", 1); + +char *source = "mnt_ext_file_bind_auto_bind_auto.source"; + +int create_file(const char *path) +{ + int fd; + + fd = open(path, O_CREAT | O_RDWR, 0644); + if (fd < 0) { + pr_perror("open"); + return -1; + } + + close(fd); + return 0; +} + +int main(int argc, char **argv) +{ + char *zdtm_newns = getenv("ZDTM_NEWNS"); + char *tmp = "/tmp/zdtm_ext_file_bind_auto.tmp"; + char *sourcefile = "/tmp/zdtm_ext_file_bind_auto.file"; + char *root, tmpfile[PATH_MAX], testfile[PATH_MAX]; + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare file bindmount in criu root (source for external file bindmount) */ + mkdir(tmp, 
0755); + if (mount(source, tmp, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + + sprintf(tmpfile, "%s/%s", tmp, filename); + if (create_file(tmpfile)) + return 1; + + if (create_file(sourcefile)) + return 1; + + if (mount(tmpfile, sourcefile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + + umount2(tmp, MNT_DETACH); + + /* Prepare file in test root (mount point for external file bindmount) */ + sprintf(testfile, "%s/%s", root, filename); + if (create_file(testfile)) + return 1; + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + * and will be inherited into test mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + if (mount(sourcefile, testfile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } +test: + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.desc b/test/zdtm/static/mnt_ext_file_bind_auto.desc new file mode 100644 index 0000000000..825b081274 --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.desc @@ -0,0 +1,4 @@ +{ 'opts': '--external mnt[]', + 'feature': 'mnt_id', + 'flavor': 'ns uns', + 'flags': 'suid'} From 0b3cba78775118d16d0306ff92ec4420316d6421 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 11 Jul 2025 22:16:49 +0100 Subject: [PATCH 049/137] images: remove symlink for descriptor.proto Currently the build scripts create the following symlink: criu-4.1/images/google/protobuf/descriptor.proto -> /usr/include/google/protobuf/descriptor.proto This symlink points to a system-wide absolute-path target. Also, this symlink ends up in the release tarball. The tarball may later be downloaded and unpacked by e.g. OS distributions. If unpacking is done using Python 3.14+, it will fail. 
This happens because Python 3.14 will switch the default behavior of extractall() from "fully trusting the content of archive" to "disallow common attack vectors while extracting the archive". With this new behavior, extractall() raises an exception when at least one file in the archive extracts or points to outside of the extraction directory (these are called path traversal attacks and zip slip attacks). Reported-by: Dmitrii Kuvaiskii Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 7 ------- .lgtm.yml | 5 ----- images/Makefile | 17 ++++++++++++++++- images/google/protobuf/descriptor.proto | 1 - 4 files changed, 16 insertions(+), 14 deletions(-) delete mode 120000 images/google/protobuf/descriptor.proto diff --git a/.cirrus.yml b/.cirrus.yml index a4b53a54b0..bddd5a3f1c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -15,7 +15,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -33,7 +32,6 @@ task: memory: 8G setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel @@ -67,7 +65,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -88,7 
+85,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-non-root @@ -101,7 +97,6 @@ task: script: uname -a build_script: | scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local task: @@ -113,7 +108,6 @@ task: script: uname -a build_script: | scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local CLANG=1 task: @@ -125,6 +119,5 @@ task: script: uname -a build_script: | scripts/ci/prepare-for-fedora-rawhide.sh - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 make -C test/zdtm -j 4 diff --git a/.lgtm.yml b/.lgtm.yml index 0dd49cda41..4beadcc637 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -23,8 +23,3 @@ extraction: - "python3-yaml" - "libnl-route-3-dev" - "gnutls-dev" - configure: - command: - - "ls -laR images/google" - - "ln -s /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto" - - "ls -laR images/google" diff --git a/images/Makefile b/images/Makefile index d966fbfca0..e94346eee9 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,6 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -91,6 +90,22 @@ endef makefile-deps := Makefile $(obj)/Makefile +# +# Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. 
+PROTOBUF_DIR := images/google +DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf +GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf +$(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto + $$(Q) echo "Generating descriptor.pb-c.c" + $$(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + +cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d + +submrproper: + $$(Q) rm -rf $(PROTOBUF_DIR) +.PHONY: submrproper +mrproper: submrproper + # # Generates rules needed to compile protobuf files. define gen-proto-rules diff --git a/images/google/protobuf/descriptor.proto b/images/google/protobuf/descriptor.proto deleted file mode 120000 index 07a4c9add6..0000000000 --- a/images/google/protobuf/descriptor.proto +++ /dev/null @@ -1 +0,0 @@ -/usr/include/google/protobuf/descriptor.proto \ No newline at end of file From 306318e06a6071852597e6ffd349cbc41d61f964 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 20:14:45 -0700 Subject: [PATCH 050/137] images/Makefile: fix using $(Q) Commit 68f92b551 used `$$(Q)` instead of `$(Q)` in the Makefile target, which resulted in the following error: $(Q) echo "Generating descriptor.pb-c.c" /bin/sh: 1: Q: not found Generating descriptor.pb-c.c $(Q) protoc --proto_path=/usr/include --proto_path=images/ --c_out=images/ /usr/include/google/protobuf/descriptor.proto /bin/sh: 1: Q: not found as well as: $(Q) rm -rf images/google /bin/sh: line 1: Q: command not found Fix it. 
Signed-off-by: Kir Kolyshkin --- images/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/images/Makefile b/images/Makefile index e94346eee9..cb30a51268 100644 --- a/images/Makefile +++ b/images/Makefile @@ -96,13 +96,13 @@ PROTOBUF_DIR := images/google DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $$(Q) echo "Generating descriptor.pb-c.c" - $$(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + $(Q) echo "Generating descriptor.pb-c.c" + $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d submrproper: - $$(Q) rm -rf $(PROTOBUF_DIR) + $(Q) rm -rf $(PROTOBUF_DIR) .PHONY: submrproper mrproper: submrproper From cae35ff2f3036400c1c236b9852a4413c0eff7c7 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 22:44:50 -0700 Subject: [PATCH 051/137] Keep images/google/protobuf directory Commit 68f92b551 removed images/google/protobuf directory, so it is re-created each time during the build process. This resulted in a weird behavior change. Previously, one could do something like this: git clone $CRURL criu (cd criu && sudo make install-criu) rm -rf criu This worked fine, including running rm -rf as a non-root user, since no new directories were created under criu -- all directories were still owned by the original user. Since commit 68f92b551 the same sequence fails: rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.c': Permission denied rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.d': Permission denied rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.h': Permission denied A workaround is to keep empty images/google/protobuf directory, which is what this commit does. 
Signed-off-by: Kir Kolyshkin --- .gitignore | 2 -- images/Makefile | 5 ++--- images/google/protobuf/.gitignore | 2 ++ 3 files changed, 4 insertions(+), 5 deletions(-) create mode 100644 images/google/protobuf/.gitignore diff --git a/.gitignore b/.gitignore index 854657d1c1..94daa13ea0 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,6 @@ compel/compel compel/compel-host-bin images/*.c images/*.h -images/google/protobuf/*.c -images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest diff --git a/images/Makefile b/images/Makefile index cb30a51268..6f310e553c 100644 --- a/images/Makefile +++ b/images/Makefile @@ -92,8 +92,7 @@ makefile-deps := Makefile $(obj)/Makefile # # Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. -PROTOBUF_DIR := images/google -DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf +DESCRIPTOR_DIR := images/google/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto $(Q) echo "Generating descriptor.pb-c.c" @@ -102,7 +101,7 @@ $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d submrproper: - $(Q) rm -rf $(PROTOBUF_DIR) + $(Q) rm -f $(DESCRIPTOR_DIR)/* .PHONY: submrproper mrproper: submrproper diff --git a/images/google/protobuf/.gitignore b/images/google/protobuf/.gitignore new file mode 100644 index 0000000000..68359a7869 --- /dev/null +++ b/images/google/protobuf/.gitignore @@ -0,0 +1,2 @@ +*.c +*.h From ef20d8814b7d0b256922d68ac75781c80a663f96 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 23:07:37 -0700 Subject: [PATCH 052/137] images/Makefile: use msg-gen In general, we use "$(E)" instead of "$(Q) echo", but we also have a msg-gen macro which can be used here. 
Signed-off-by: Kir Kolyshkin --- images/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/Makefile b/images/Makefile index 6f310e553c..2c33152e91 100644 --- a/images/Makefile +++ b/images/Makefile @@ -95,7 +95,7 @@ makefile-deps := Makefile $(obj)/Makefile DESCRIPTOR_DIR := images/google/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $(Q) echo "Generating descriptor.pb-c.c" + $(call msg-gen, $@) $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d From 474ab7ec7a768f3b9d9b6da44c8a346c232acc8b Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 16 Jul 2025 16:32:25 +0200 Subject: [PATCH 053/137] compel: flush caches after parasite injection After the CRIU process saves the parasite code for the target thread in the shared mmap, it is necessary to call __clear_cache before the target thread executes the code. Without this step, the target thread may not see the correct code to execute, which can result in a SIGILL signal. For the specific arm64 case, this is important so that the newly copied code is flushed from d-cache to RAM, so that the target thread sees the new code. The change is based on commit 6be10a2 by @fu.lin and on input received from @adrianreber.
[ avagin: tweak code comment ] Signed-off-by: Ignacio Moreno Gonzalez Signed-off-by: Andrei Vagin --- compel/src/lib/infect.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 4ea27bc633..22fcf24fad 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1054,6 +1054,16 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(ctl->local_map, ctl->local_map + ctl->pblob.hdr.bsize); p = parasite_size; From a5767c1f87313f2c17a197b7ef3e840c117c4dc2 Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 16 Jul 2025 16:38:13 +0200 Subject: [PATCH 054/137] restore: flush caches during restore See the previous commit for rationale and architecture-specific details. [ avagin: tweak code comment ] Signed-off-by: Ignacio Moreno Gonzalez Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index c1d1f4b9d5..b376035631 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2569,6 +2569,17 @@ static int remap_restorer_blob(void *addr) restorer_setup_c_header_desc(&pbd, true); compel_relocs_apply(addr, addr, &pbd); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. 
+ * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(addr, addr + pbd.hdr.bsize); + return 0; } From 94779376bdccf82a35dcccec13c8cfc852e63c86 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 07:53:55 +0100 Subject: [PATCH 055/137] vagrant: fix 'qemu' install Installing this package currently fails with the following message: Package qemu is not available, but is referred to by another package. This may mean that the package is missing, has been obsoleted, or is only available from another source E: Package 'qemu' has no installation candidate Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index ed5a011787..c3e15007c2 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -22,7 +22,7 @@ setup() { wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ + ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ openssh-client systemctl restart libvirtd From 731059b67ce2c67a302789786aa352804e3fb471 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 08:34:31 +0100 Subject: [PATCH 056/137] vagrant: update image to fedora 42 Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c3e15007c2..81af5d2e5f 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,9 +6,9 @@ set -e set -x 
-VAGRANT_VERSION=2.4.1 -FEDORA_VERSION=40 -FEDORA_BOX_VERSION=40.20240414.0 +VAGRANT_VERSION=2.4.7 +FEDORA_VERSION=42 +FEDORA_BOX_VERSION=1.1.0 setup() { if [ -n "$TRAVIS" ]; then @@ -27,7 +27,7 @@ setup() { openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt - vagrant init fedora/${FEDORA_VERSION}-cloud-base --box-version ${FEDORA_BOX_VERSION} + vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. # Travis VMs should have around 7.5GB. # Increasing it to 4GB should work. From 67cbc02c66ee75d439acddd666f6ddedc2e9e724 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 08:50:29 +0100 Subject: [PATCH 057/137] vagrant: fix tar including archive in itself The tar command was failing with the following message: $ tar cf criu.tar ../../../criu tar: Removing leading `../../../' from member names tar: ../../../criu/scripts/ci/criu.tar: archive cannot contain itself; not dumped In addition, the /vagrant directory no longer exists in the new Fedora images. bash: line 1: cd: /vagrant: No such file or directory Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 81af5d2e5f..008a01fb35 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -17,7 +17,7 @@ setup() { fi # Tar up the git checkout to have vagrant rsync it to the VM - tar cf criu.tar ../../../criu + tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate.
wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb @@ -28,10 +28,16 @@ setup() { systemctl restart libvirtd vagrant plugin install vagrant-libvirt vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} + # The default libvirt Vagrant VM uses 512MB. # Travis VMs should have around 7.5GB. # Increasing it to 4GB should work. sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' + # Sync /tmp/criu.tar into the VM + # We want to use $HOME without expansion + # shellcheck disable=SC2016 + sed -i Vagrantfile -e 's|^end$| config.vm.provision "file", source: "/tmp/criu.tar", destination: "$HOME/criu.tar"'"\n"'end|g' + vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config @@ -40,8 +46,11 @@ setup() { libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel + # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd + + ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' ssh default cat /proc/cmdline } @@ -49,7 +58,7 @@ fedora-no-vdso() { ssh default sudo grubby --update-kernel ALL --args="vdso=0" vagrant reload ssh default cat /proc/cmdline - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going' # This test (pidfd_store_sk) requires pidfd_getfd syscall which is guaranteed in Fedora 33. 
# It is also skipped from -a because it runs in RPC mode only @@ -74,12 +83,12 @@ fedora-rawhide() { # In the container it is not possible to change the state of selinux. # Let's just disable it for this test run completely. ssh default 'sudo setenforce Permissive' - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' + ssh default 'cd /vagrant/criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } fedora-non-root() { ssh default uname -a - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' # Setting the capability should be the only line needed to run as non-root on Fedora # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' From 53374a3f08a339d20dffc4dcca30946a973c4da0 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 10 Aug 2025 21:46:39 +0000 Subject: [PATCH 058/137] zdtm/socket-tcp-closing: fill socket buffers effectively Send large chunks to fill socket buffers.
Signed-off-by: Andrei Vagin --- test/zdtm/static/socket-tcp-closing.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/socket-tcp-closing.c b/test/zdtm/static/socket-tcp-closing.c index 87e1d75337..df291d4464 100644 --- a/test/zdtm/static/socket-tcp-closing.c +++ b/test/zdtm/static/socket-tcp-closing.c @@ -31,10 +31,13 @@ static int port = 8880; int fill_sock_buf(int fd) { + char zdtm[512]; int flags; int size; int ret; + memset(zdtm, 5, sizeof(zdtm)); + flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_perror("Can't get flags"); @@ -47,7 +50,6 @@ int fill_sock_buf(int fd) size = 0; while (1) { - char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) From 98b42c6d3d4dbf6208580860891d4866c232ef23 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 10 Aug 2025 21:50:41 +0000 Subject: [PATCH 059/137] CI: Consolidate arm64 tests on GitHub runners The arm64 tests are currently being executed on both actuated and GitHub runners. This change removes the actuated runner to avoid redundancy and streamline our CI process. Signed-off-by: Andrei Vagin --- .github/workflows/aarch64-test.yaml | 32 +++++++++++ .github/workflows/actuated-aarch64-test.yaml | 58 -------------------- 2 files changed, 32 insertions(+), 58 deletions(-) create mode 100644 .github/workflows/aarch64-test.yaml delete mode 100644 .github/workflows/actuated-aarch64-test.yaml diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml new file mode 100644 index 0000000000..32b19e1766 --- /dev/null +++ b/.github/workflows/aarch64-test.yaml @@ -0,0 +1,32 @@ +name: aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: aarch64-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04-arm + strategy: + matrix: + target: [GCC=1, CLANG=1] + + steps: + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} + # Following tests are failing on the VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml deleted file mode 100644 index 567746a5f4..0000000000 --- a/.github/workflows/actuated-aarch64-test.yaml +++ /dev/null @@ -1,58 +0,0 @@ -name: aarch64 test - -on: [push, pull_request] - -# Cancel any preceding run on the pull request. -concurrency: - group: actuated-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - build: - # Actuated runners are not available in all repositories. - if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected for - # the actuated runners. 3GB and 4 CPUs seems to be enough according to the - # result from 'vmmeter'. 
- runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [actuated-arm64-4cpu-3gb, ubuntu-24.04-arm] - target: [GCC=1, CLANG=1] - - steps: - # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md - # vmmeter start - - name: Prepare arkade - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - uses: alexellis/arkade-get@master - with: - crane: latest - print-summary: false - - - name: Install vmmeter - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - run: | - crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - - - name: Run vmmeter - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - uses: self-actuated/vmmeter-action@master - # vmmeter end - - - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }}/${{ matrix.os }} - # Following tests are failing on the actuated VMs: - # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out - # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) - # - # In combination with '--remote-lazy-pages' following error occurs: - # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) - run: | - # The 'sched_policy00' needs the following: - sudo sysctl -w kernel.sched_rt_runtime_us=-1 - # etc/hosts entry is needed for netns_lock_iptables - echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts - sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ - ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" From 3c166c400511ad2c94686372ce1e154880ab299a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:22:57 +0200 Subject: [PATCH 060/137] criu/include/mman: define MADV_GUARD_INSTALL Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index 086753bcf5..43e0b6cc7a 100644 
--- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -19,5 +19,8 @@ #ifndef MADV_WIPEONFORK #define MADV_WIPEONFORK 18 #endif +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif #endif /* __CR_MMAN_H__ */ From 630e8118b268413b040c4ff7c8b002e87090068d Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:37:48 +0200 Subject: [PATCH 061/137] kerndat: add madvise(MADV_GUARD_INSTALL) feature-detection Signed-off-by: Alexander Mikhalitsyn --- criu/include/kerndat.h | 1 + criu/kerndat.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index c5deb32832..66db756497 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -91,6 +91,7 @@ struct kerndat_s { bool has_close_range; bool has_timer_cr_ids; bool has_breakpoints; + bool has_madv_guard; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index fa43f7d3f2..7e2edb72d0 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -31,6 +31,7 @@ #include "kerndat.h" #include "fs-magic.h" #include "mem.h" +#include "mman.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" @@ -1813,6 +1814,33 @@ static int kerndat_breakpoints(void) return exit_code; } +static int kerndat_has_madv_guard(void) +{ + void *map; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap a page for has_madv_guard feature test"); + return -1; + } + + if (madvise(map, PAGE_SIZE, MADV_GUARD_INSTALL)) { + if (errno != EINVAL) { + pr_perror("madvise failed (has_madv_guard check)"); + goto mmap_cleanup; + } + } else { + kdat.has_madv_guard = true; + } + + munmap(map, PAGE_SIZE); + return 0; + +mmap_cleanup: + munmap(map, PAGE_SIZE); + return -1; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. 
There are cases that we cannot determine the @@ -2081,6 +2109,10 @@ int kerndat_init(void) pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_madv_guard()) { + pr_err("kerndat_has_madv_guard has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); From 427cf379bc9e680df5f86ff9b35eec8f125c2941 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:36:45 +0200 Subject: [PATCH 062/137] kerndat: add pagemap_scan_guard_pages feature check logic Signed-off-by: Alexander Mikhalitsyn --- criu/cr-check.c | 8 ++++++++ criu/include/kerndat.h | 3 +++ criu/include/pagemap_scan.h | 1 + criu/kerndat.c | 12 ++++++++++++ 4 files changed, 24 insertions(+) diff --git a/criu/cr-check.c b/criu/cr-check.c index 9c4778490e..7c3dc76dd8 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1599,6 +1599,12 @@ static int check_breakpoints(void) return 0; } +static int check_pagemap_scan_guard_pages(void) +{ + kerndat_warn_about_madv_guards(); + + return kdat.has_pagemap_scan_guard_pages ? 
0 : -1; +} static int (*chk_feature)(void); @@ -1724,6 +1730,7 @@ int cr_check(void) ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); ret |= check_timer_cr_ids(); + ret |= check_pagemap_scan_guard_pages(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1853,6 +1860,7 @@ static struct feature_list feature_list[] = { { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, { "breakpoints", check_breakpoints }, + { "pagemap_scan_guard_pages", check_pagemap_scan_guard_pages }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 66db756497..e4922f401d 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -92,6 +92,7 @@ struct kerndat_s { bool has_timer_cr_ids; bool has_breakpoints; bool has_madv_guard; + bool has_pagemap_scan_guard_pages; }; extern struct kerndat_s kdat; @@ -114,4 +115,6 @@ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_has_nspid(void); +extern void kerndat_warn_about_madv_guards(void); + #endif /* __CR_KERNDAT_H__ */ diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h index 0ad4c9bc0b..9046e01edf 100644 --- a/criu/include/pagemap_scan.h +++ b/criu/include/pagemap_scan.h @@ -14,6 +14,7 @@ #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) #define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) /* * struct page_region - Page region with flags diff --git a/criu/kerndat.c b/criu/kerndat.c index 7e2edb72d0..997181ce75 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -87,6 +87,10 @@ static int check_pagemap(void) if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { pr_debug("PAGEMAP_SCAN is supported\n"); kdat.has_pagemap_scan = true; + + args.return_mask |= PAGE_IS_GUARD; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) + kdat.has_pagemap_scan_guard_pages = true; } else { switch (errno) { case EINVAL: @@ -1841,6 +1845,14 @@ static int kerndat_has_madv_guard(void) return -1; } +void 
kerndat_warn_about_madv_guards(void) +{ + if (kdat.has_madv_guard && !kdat.has_pagemap_scan_guard_pages) + pr_warn("ioctl(PAGEMAP_SCAN) doesn't support PAGE_IS_GUARD flag. " + "CRIU dump will fail if dumped processes use madvise(MADV_GUARD_INSTALL). " + "Please, consider updating your kernel.\n"); +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the From 251cbcd3fcc621421447b77d722968015f858eeb Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:42:43 +0200 Subject: [PATCH 063/137] cr-dump: warn if MADV_GUARD is supported but isn't shown in pagemap Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index b8cf7d64d9..f02db1a57f 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2124,6 +2124,8 @@ int cr_dump_tasks(pid_t pid) int pre_dump_ret = 0; int ret = -1; + kerndat_warn_about_madv_guards(); + pr_info("========================================\n"); pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); From dd263089d869e95683bd4c8a8bbd31c2dcb3b9c4 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:51:24 +0200 Subject: [PATCH 064/137] criu/pagemap-cache: pagescan: look for PAGE_IS_GUARD pages Signed-off-by: Alexander Mikhalitsyn --- criu/pagemap-cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index f04a517de3..457c0d6497 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -194,6 +194,9 @@ int pmc_fill(pmc_t *pmc, u64 start, u64 end) }; long ret; + if (kdat.has_pagemap_scan_guard_pages) + args.return_mask |= PAGE_IS_GUARD; + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); if (ret == -1) { pr_perror("PAGEMAP_SCAN"); From b6e04c36b9b54ae03c1ea36afecadf887d3750c7 Mon Sep 17 00:00:00 2001 From: 
Alexander Mikhalitsyn Date: Thu, 1 May 2025 20:02:37 +0200 Subject: [PATCH 065/137] criu/mem: refactor should_dump_page helper Make should_dump_page to return int to indicate failure, also return useful data back through the struct page_info structure passed as a pointer. Also, correspondingly convert all call sites. No functional changes intended, except fixing a bug in should_dump_page() as it could return (-1) when pmc_fill() fails, while caller didn't expect that before. Signed-off-by: Alexander Mikhalitsyn --- criu/include/mem.h | 8 +++++- criu/mem.c | 67 ++++++++++++++++++++++++++++++---------------- criu/shmem.c | 27 ++++++++++++------- 3 files changed, 68 insertions(+), 34 deletions(-) diff --git a/criu/include/mem.h b/criu/include/mem.h index 3618c9cc3b..0ce97822b2 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -49,5 +49,11 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); +struct page_info { + u64 next; + bool softdirty; +}; + +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info); + #endif /* __CR_MEM_H__ */ diff --git a/criu/mem.c b/criu/mem.c index 803cb545b5..9fcf7a44c6 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -115,27 +115,37 @@ static bool should_dump_entire_vma(VmaEntry *vmae) } /* - * should_dump_page returns vaddr if an addressed page has to be dumped. - * Otherwise, it returns an address that has to be inspected next. + * should_dump_page writes vaddr in page_info->next if an addressed page has to be dumped. + * Otherwise, it writes an address that has to be inspected next. 
*/ -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info) { + if (!page_info) + goto err; + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) - return -1; + goto err; if (pmc->regs) { while (1) { - if (pmc->regs_idx == pmc->regs_len) - return pmc->end; + if (pmc->regs_idx == pmc->regs_len) { + page_info->next = pmc->end; + return 0; + } + if (vaddr < pmc->regs[pmc->regs_idx].end) break; pmc->regs_idx++; } - if (vaddr < pmc->regs[pmc->regs_idx].start) - return pmc->regs[pmc->regs_idx].start; - if (softdirty) - *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; - return vaddr; + + if (vaddr < pmc->regs[pmc->regs_idx].start) { + page_info->next = pmc->regs[pmc->regs_idx].start; + return 0; + } + + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + page_info->next = vaddr; + return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; @@ -143,16 +153,26 @@ u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) * Optimisation for private mapping pages, that haven't * yet being COW-ed */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return vaddr + PAGE_SIZE; + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) { + page_info->next = vaddr + PAGE_SIZE; + return 0; + } + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { - if (softdirty) - *softdirty = pme & PME_SOFT_DIRTY; - return vaddr; + page_info->softdirty = pme & PME_SOFT_DIRTY; + page_info->next = vaddr; + return 0; } - return vaddr + PAGE_SIZE; + page_info->next = vaddr + PAGE_SIZE; + return 0; } + +err: + pr_err("should_dump_page failed on vma " + "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", + vmae->start, vmae->end, vaddr); + return -1; } bool page_is_zero(u64 pme) @@ -202,14 +222,15 @@ static int generate_iovs(struct pstree_item *item, struct vma_area 
*vma, struct nr_scanned = 0; for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; - bool softdirty = false; - u64 next; + struct page_info page_info = {}; int st; /* If dump_all_pages is true, should_dump_page is called to get pme. */ - next = should_dump_page(pmc, vma->e, vaddr, &softdirty); - if (!dump_all_pages && next != vaddr) { - vaddr = next - PAGE_SIZE; + if (should_dump_page(pmc, vma->e, vaddr, &page_info)) + return -1; + + if (!dump_all_pages && page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } @@ -223,7 +244,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. */ - if (has_parent && page_in_parent(softdirty)) { + if (has_parent && page_in_parent(page_info.softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { diff --git a/criu/shmem.c b/criu/shmem.c index 9e3178352d..bc7aa36695 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,31 +206,34 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) +static int update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; u64 vaddr; if (!is_shmem_tracking_en()) - return; + return 0; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { - bool softdirty = false; - u64 next; + struct page_info page_info = {}; + + if (should_dump_page(pmc, vma, vaddr, &page_info)) + return -1; - next = should_dump_page(pmc, vma, vaddr, &softdirty); - if (next != vaddr) { - vaddr = next - PAGE_SIZE; + if (page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (softdirty) + if (page_info.softdirty) 
set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } + + return 0; } int collect_sysv_shmem(unsigned long shmid, unsigned long size) @@ -667,7 +670,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } @@ -684,7 +689,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } From c2c9ef85454ba155674b552c5e66eeda82c479f1 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:42:26 +0200 Subject: [PATCH 066/137] criu/mem: dump: skip MADV_GUARD pages content dump 1. get info about MADV_GUARD_INSTALL-protected pages with help of pagemap by looking for PME_GUARD_REGION flag if /proc//pagemap is used or by looking for PAGE_IS_GUARD flag if ioctl(PAGEMAP_SCAN) is used 2. 
skip those pages Signed-off-by: Alexander Mikhalitsyn --- criu/include/mem.h | 1 + criu/mem.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/criu/include/mem.h b/criu/include/mem.h index 0ce97822b2..b2cbd4b640 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -35,6 +35,7 @@ extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_l #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) #define PME_FILE (1ULL << 61) +#define PME_GUARD_REGION (1ULL << 58) #define PME_SOFT_DIRTY (1ULL << 55) #define PME_PSHIFT_BITS (6) #define PME_STATUS_BITS (3) diff --git a/criu/mem.c b/criu/mem.c index 9fcf7a44c6..58c4130c67 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -143,12 +143,18 @@ int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *pa return 0; } + if (pmc->regs[pmc->regs_idx].categories & PAGE_IS_GUARD) + goto skip_guard_page; + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; page_info->next = vaddr; return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + if (pme & PME_GUARD_REGION) + goto skip_guard_page; + /* * Optimisation for private mapping pages, that haven't * yet being COW-ed @@ -173,6 +179,10 @@ int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *pa "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", vmae->start, vmae->end, vaddr); return -1; + +skip_guard_page: + page_info->next = vaddr + PAGE_SIZE; + return 0; } bool page_is_zero(u64 pme) From c077b26a7ec7439ff2e7b8d1c887f85878ff1662 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 4 Aug 2025 09:42:10 +0200 Subject: [PATCH 067/137] criu/{mem, vdso, cr-restore}: introduce VMA_AREA_GUARD fake VMAs Introduce a new kind of VMA - VMA_AREA_GUARD. In fact, it is not a real VMA as it is not represented as struct vm_area_struct in the kernel. 
We want to reuse an existing vma infrastructure in CRIU to dump an information about MADV_GUARD_INSTALL-covered address space ranges as VMAs. Then, on restore, we need to carefully skip those fake VMAs everywhere we expect a normal VMAs to be processed. And only in restorer we use these VMAs to get an information about where to call MADV_GUARD_INSTALL. Suggested-by: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/cr-restore.c | 6 ++++-- criu/include/image.h | 7 +++++++ criu/mem.c | 13 +++++++++++-- criu/vdso.c | 6 ++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b376035631..1c3b364518 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2447,7 +2447,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { - if (s_vma->list.next == self_vma_list) { + if ((s_vma->list.next == self_vma_list) || + vma_area_is(vma_next(s_vma), VMA_AREA_GUARD)) { s_vma = &end_vma; continue; } @@ -2460,7 +2461,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he } if (prev_vma_end + vma_len > t_vma->e->start) { - if (t_vma->list.next == tgt_vma_list) { + if ((t_vma->list.next == tgt_vma_list) || + vma_area_is(vma_next(t_vma), VMA_AREA_GUARD)) { t_vma = &end_vma; continue; } diff --git a/criu/include/image.h b/criu/include/image.h index afa7d5e12f..934f7d4e97 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -68,6 +68,12 @@ * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. + * - guard + * stands for a fake VMA (not represented in the kernel + * by a struct vm_area_struct). Used to keep an information + * about virtual address space ranges covered by + * MADV_GUARD_INSTALL guards. These ones must be always at + * the end of the vma_area_list and properly skipped a.e. 
*/ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -87,6 +93,7 @@ #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) +#define VMA_AREA_GUARD (1 << 16) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/mem.c b/criu/mem.c index 58c4130c67..ee841aca2e 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -599,6 +599,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (vma_area_is(vma_area, VMA_AREA_GUARD)) + continue; + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); if (ret < 0) @@ -861,14 +864,14 @@ static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); - if (&vma->list == &vmas->h) + if ((&vma->list == &vmas->h) || vma_area_is(vma, VMA_AREA_GUARD)) return; } /* ... 
no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); - if (&pvma->list == &pvmas->h) + if ((&pvma->list == &pvmas->h) || vma_area_is(pvma, VMA_AREA_GUARD)) return; } } @@ -1069,6 +1072,9 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (task_size_check(vpid(t), vma->e)) { ret = -1; break; @@ -1276,6 +1282,9 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!vma_inherited(vma)) continue; diff --git a/criu/vdso.c b/criu/vdso.c index d4d3511314..2d9e57c4da 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -145,6 +145,9 @@ static void drop_rt_vdso(struct vm_area_list *vma_area_list, struct vdso_quarter * Also BTW search for rt-vvar to remove it later. 
*/ list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (vma->e->start == addr->orig_vdso) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; pr_debug("vdso: Restore orig vDSO status at %lx\n", (long)vma->e->start); @@ -276,6 +279,9 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list } list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + /* * Defer handling marked vdso until we walked over * all vmas and restore potentially remapped vDSO From f83670c18a3f55b9222287800cf2eaaae5d2e1c5 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:10:10 +0200 Subject: [PATCH 068/137] criu/pie/restorer: add madvise(MADV_GUARD_INSTALL) restore logic Signed-off-by: Alexander Mikhalitsyn --- criu/pie/restorer.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9867a3ddd5..394d3dea08 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -28,6 +28,7 @@ #include #include #include +#include "mman.h" #include "signal.h" #include "prctl.h" #include "criu-log.h" @@ -1665,6 +1666,30 @@ static int restore_membarrier_registrations(int mask) return ret; } +static int restore_madv_guard_regions(struct task_restore_args *args) +{ + int i, ret; + + for (i = 0; i < args->vmas_n; i++) { + VmaEntry *vma_entry = args->vmas + i; + size_t len; + + if (!vma_entry_is(vma_entry, VMA_AREA_GUARD)) + continue; + + len = vma_entry->end - vma_entry->start; + ret = sys_madvise(vma_entry->start, len, MADV_GUARD_INSTALL); + if (ret) { + pr_err("madvise(%" PRIx64 ", %zu, MADV_GUARD_INSTALL) " + "failed with %d\n", + vma_entry->start, len, ret); + return -1; + } + } + + return 0; +} + /* * The main routine to restore task via sigreturn. 
* This one is very special, we never return there @@ -1972,6 +1997,13 @@ __visible long __export_restore_task(struct task_restore_args *args) } } + /* + * Restore madvise(MADV_GUARD_INSTALL) + */ + ret = restore_madv_guard_regions(args); + if (ret) + goto core_restore_end; + /* * Tune up the task fields. */ From 6e403a1ffde9a84554bdf6cd99f26c88b7f63ce9 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 4 Aug 2025 10:48:47 +0200 Subject: [PATCH 069/137] criu/mem: dump: note MADV_GUARD pages as VMA_AREA_GUARD VMAs Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 17 ++++++++++++ criu/include/mem.h | 1 + criu/mem.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f02db1a57f..10c485cbe9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -130,6 +130,23 @@ int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap if (ret < 0) goto err; + /* + * In addition to real process VMAs we should keep an info about + * madvise(MADV_GUARD_INSTALL) pages. While these are not represented + * as a struct vm_area_struct in the kernel, it is convenient to treat + * them as mappings in CRIU and reuse the same VMA images but with only + * VMA_AREA_GUARD flag set. + * + * Also, we don't need to dump them during pre-dump. 
+ */ + if (dump_file) { + ret = collect_madv_guards(pid, vma_area_list); + if (ret < 0) { + pr_err("Collect MADV_GUARD_INSTALL pages (pid: %d) failed with %d\n", pid, ret); + goto err; + } + } + pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); diff --git a/criu/include/mem.h b/criu/include/mem.h index b2cbd4b640..e9ce3518ae 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -31,6 +31,7 @@ extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); +extern int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list); #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) diff --git a/criu/mem.c b/criu/mem.c index ee841aca2e..0636273cbe 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1548,3 +1548,72 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) return prepare_vma_ios(t, ta); } + +int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list) +{ + int pagemap_fd = -1; + struct page_region *regs = NULL; + long regs_len = 0; + int i, ret = -1; + + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = 0, + .end = kdat.task_size, + .walk_end = 0, + .vec_len = 1000, /* this should be enough for most cases */ + .max_pages = 0, + .category_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + }; + + if (!kdat.has_pagemap_scan_guard_pages) { + ret = 0; + goto out; + } + + pagemap_fd = open_proc(pid, "pagemap"); + if (pagemap_fd < 0) + goto out; + + regs = xmalloc(args.vec_len * sizeof(struct page_region)); + if (!regs) + goto out; + args.vec = (long)regs; + + do { + /* start from where we finished the last time */ + args.start = args.walk_end; + regs_len = ioctl(pagemap_fd, PAGEMAP_SCAN, 
&args); + if (regs_len == -1) { + pr_perror("PAGEMAP_SCAN"); + goto out; + } + + for (i = 0; i < regs_len; i++) { + struct vma_area *vma; + + BUG_ON(!(regs[i].categories & PAGE_IS_GUARD)); + + vma = alloc_vma_area(); + if (!vma) + goto out; + + vma->e->start = regs[i].start; + vma->e->end = regs[i].end; + vma->e->status = VMA_AREA_GUARD; + + list_add_tail(&vma->list, &vma_area_list->h); + vma_area_list->nr++; + } + } while (args.walk_end != kdat.task_size); + + ret = 0; + +out: + xfree(regs); + if (pagemap_fd >= 0) + close(pagemap_fd); + return ret; +} From d6d6f73744c68aff67f034020a8fa59398575371 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 20 Apr 2025 20:20:20 +0200 Subject: [PATCH 070/137] test/zdtm/static/maps12: add madv guards test Test for madvise(MADV_GUARD_INSTALL). Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/Makefile | 1 + test/zdtm/static/maps12.c | 350 +++++++++++++++++++++++++++++++++++ test/zdtm/static/maps12.desc | 1 + 3 files changed, 352 insertions(+) create mode 100644 test/zdtm/static/maps12.c create mode 100644 test/zdtm/static/maps12.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ab69f389ed..e73f964be5 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -315,6 +315,7 @@ TST_FILE = \ write_read02 \ write_read10 \ maps00 \ + maps12 \ link10 \ file_attr \ deleted_unix_sock \ diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c new file mode 100644 index 0000000000..b645595bec --- /dev/null +++ b/test/zdtm/static/maps12.c @@ -0,0 +1,350 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test madvise(MADV_GUARD_INSTALL)"; +const char *test_author = "Alexander Mikhalitsyn "; +/* some parts of code were taken from Linux kernel's kselftest guard-pages.c + written by Lorenzo Stoakes */ + +char *filename; +int fd; +TEST_OPTION(filename, string, "file name", 
1); + +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif + +uint8_t *map_base; + +struct { + unsigned int pages_num; + bool filemap; +} vmas[] = { + { 2, false }, + { 2, false }, + { 2, false }, + { 2, true }, + { 2, true }, + { 2, true }, +}; + +struct { + bool guarded; + bool wipeonfork; +} pages[] = { + { false, false }, /* vmas[0] */ + { true, false }, + { true, false }, /* vmas[1] */ + { false, false }, + { false, false }, /* vmas[2] */ + { true, true }, + { true, false }, /* vmas[3] */ + { false, false }, + { true, false }, /* vmas[4] */ + { true, false }, + { false, false }, /* vmas[5] */ + { true, false }, +}; + +static volatile sig_atomic_t signal_jump_set; +static sigjmp_buf signal_jmp_buf; + +static void handle_sigsegv(int signo) +{ + if (!signal_jump_set) + return; + + siglongjmp(signal_jmp_buf, 1); +} + +static bool try_write_to_addr(uint8_t *ptr) +{ + bool failed; + + /* Tell signal handler to jump back here on fatal signal. */ + signal_jump_set = true; + /* If a fatal signal arose, we will jump back here and failed is set. */ + failed = sigsetjmp(signal_jmp_buf, 1) != 0; + + if (!failed) + *ptr = 'x'; + + signal_jump_set = false; + return !failed; +} + +static int setup_sigsegv_handler(void) +{ + uint8_t write_me; + + if (signal(SIGSEGV, handle_sigsegv) == SIG_ERR) { + pr_perror("setting SIGSEGV handler failed"); + return 1; + } + + /* ensure that try_write_to_addr() works properly */ + if (!try_write_to_addr(&write_me)) { + pr_err("Failed to write at valid addr. Buggy try_write_to_addr()?\n"); + return 1; + } + + if (try_write_to_addr(NULL)) { + pr_err("Failed to detect an invalid write. Buggy try_write_to_addr()?\n"); + return 1; + } + + return 0; +} + +static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap) +{ + char *map; + + map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, + MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), + filemap ? fd : -1, filemap ? 
((off_t)addr_hint - (off_t)map_base) : 0); + if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) + return MAP_FAILED; + + return map; +} + +static int __check_guards(const char *when, bool in_child) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + /* + * Skip pages that were never guarded, and also those + * that were, but have MADV_WIPEONFORK which means that + * guards were removed on fork. + */ + if (!pages[i].guarded || (in_child && pages[i].wipeonfork)) + continue; + + if (try_write_to_addr(&map_base[i * PAGE_SIZE])) { + pr_err("successful write to a guarded area %d %s C/R\n", + i, when); + return 1; + } + } + + return 0; +} + +static int check_guards(const char *when) +{ + int status; + pid_t pid; + + /* + * First of all, check that guards are on their places + * in a main test process. + */ + if (__check_guards(when, false)) { + return 1; + } + + /* + * Now, check that guards are on their places + * after fork(). This allows to ensure that + * combo MADV_WIPEONFORK + MADV_GUARD_INSTALL + * is restored properly too. 
+ */ + + pid = test_fork(); + if (pid < 0) { + pr_perror("check_guards: fork failed"); + return 1; + } + + if (pid == 0) { + if (__check_guards(when, true)) { + pr_err("check_guards(\"%s\") failed in child\n", when); + exit(1); + } + + exit(0); + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("check_guards: waitpid"); + return 1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + pr_err("check_guards: process didn't exit cleanly: status=%d\n", status); + return 1; + } + + return 0; +} + +static void gen_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + datagen(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc); + } +} + +static int set_pages_madvs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + if (pages[i].guarded) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_GUARD_INSTALL)) { + pr_perror("MADV_GUARD_INSTALL failed on page %d", i); + return 1; + } + } + + if (pages[i].wipeonfork) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_WIPEONFORK)) { + pr_perror("MADV_WIPEONFORK failed on page %d", i); + return 1; + } + } + } + + return 0; +} + +static int check_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + if (datachk(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc)) { + pr_err("Page %d is corrupted\n", i); + return 1; + } + } + + return 0; +} + +static int prepare_vmas(void) +{ + char *map; + int i, shift; + + shift = 0; + for (i = 0; i < ARRAY_SIZE(vmas); i++) { + map = mmap_pages(&map_base[shift * PAGE_SIZE], + vmas[i].pages_num, vmas[i].filemap); + if (map == MAP_FAILED) { + pr_err("mmap of [%d,%d] pages failed\n", + shift, shift + vmas[i].pages_num); + return 1; + } + + shift += vmas[i].pages_num; + } + + if (shift != ARRAY_SIZE(pages)) { + pr_err("Different number of pages in vmas and pages arrays.\n"); + return 1; + } + + 
return 0; +} + +int main(int argc, char **argv) +{ + unsigned int pages_num = ARRAY_SIZE(pages); + + test_init(argc, argv); + + fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("Unable to create a test file"); + return -1; + } + + if (ftruncate(fd, pages_num * PAGE_SIZE)) { + pr_perror("Unable to ftruncate a test file"); + return -1; + } + + if (setup_sigsegv_handler()) { + pr_err("setup_sigsegv_handler() failed\n"); + return 1; + } + + /* let's find a large enough area in address space */ + map_base = mmap_pages(NULL, pages_num, false); + if (map_base == MAP_FAILED) { + pr_err("mmap of %d pages failed\n", pages_num); + return 1; + } + + /* + * Now we know that we have a free vm address space area + * [map_base, map_base + pages_num * PAGE_SIZE). + * We can use (map_base) as a hint for our further mmaps. + */ + if (prepare_vmas()) { + pr_err("prepare_vmas() failed\n"); + return 1; + } + + /* fill non-guarded pages with data and preserve checksums */ + gen_pages_data(); + + if (set_pages_madvs()) { + pr_err("set_pages_madvs() failed\n"); + return 1; + } + + /* ensure that madvise(MADV_GUARD_INSTALL) works like expected */ + if (check_guards("before")) { + pr_err("check_guards(\"before\") failed\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* ensure that guards are at their places */ + if (check_guards("after")) { + fail("check_guards(\"after\") failed"); + return 1; + } + + /* check that non-guarded pages still contain original data */ + if (check_pages_data()) { + fail("check_pages_data() failed"); + return 1; + } + + pass(); + munmap(map_base, pages_num * PAGE_SIZE); + close(fd); + return 0; +} diff --git a/test/zdtm/static/maps12.desc b/test/zdtm/static/maps12.desc new file mode 100644 index 0000000000..3f7627ff3c --- /dev/null +++ b/test/zdtm/static/maps12.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'feature': 'pagemap_scan_guard_pages'} From 3b70644368d468141bedee15a2842017bb72ab8c Mon Sep 17 00:00:00 2001 From: 
Alexander Mikhalitsyn Date: Sun, 10 Aug 2025 18:22:23 +0200 Subject: [PATCH 071/137] ci/vagrant: install vanilla kernel for Fedora Rawhide test We need at least 6.16 to test MADV_GUARD_INSTALL support, but our current Fedora Rawhide test uses only Rawhide's user space, while using Fedora 42 kernel. Let's start using a vanilla kernel. Suggested-by: Adrian Reber Signed-off-by: Alexander Mikhalitsyn --- scripts/ci/vagrant.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 008a01fb35..98942e7565 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -66,6 +66,10 @@ fedora-no-vdso() { } fedora-rawhide() { + # Upgrade the kernel to the latest vanilla one + ssh default sudo dnf -y copr enable @kernel-vanilla/stable + ssh default sudo dnf upgrade -y + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously # installed this reboots the VM. From 08e24051a4e059aa062cca3b4cb89c0da95f1d34 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 15 Aug 2025 01:44:01 +0000 Subject: [PATCH 072/137] make: Disable branch-protection for PIE code on ARM64 Branch protection uses PAC. It cryptographically "signs" a function's return address before it is stored on the stack. Upon return, the address is authenticated using a secret key. If the signature is invalid, the program will fault. The PIE code is used for the parasite and the restorer. In both cases, it runs in a foreign process. The case of the restorer is even trickier because it needs to restore the original PAC keys, which invalidates all previously "signed" pointers within the restorer itself. 
Fixes #2709 Signed-off-by: Andrei Vagin --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 5d8e89ac1b..7272cfce19 100644 --- a/Makefile +++ b/Makefile @@ -64,6 +64,8 @@ endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 + CC_MBRANCH_PROT := $(shell $(CC) -c -x c /dev/null -mbranch-protection=none -o /dev/null >/dev/null 2>&1 && echo "-mbranch-protection=none") + CFLAGS_PIE := $(CC_MBRANCH_PROT) endif ifeq ($(ARCH),ppc64) From ea1e966d24202af6bf96384ab7cc4c3817ec6d45 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 16 Aug 2025 15:45:05 +0100 Subject: [PATCH 073/137] test/zdtm/static/maps12: fix pointer-to-int cast The `offset` argument to `mmap()` was computed with a direct cast from pointer to `off_t`: `(off_t)addr_hint - (off_t)map_base` This causes a build failure when compiling since pointers and `off_t` may differ in size on some platforms. maps12.c: In function 'mmap_pages': maps12.c:114:50: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 114 | filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); | ^ maps12.c:114:69: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 114 | filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); The fix in this patch is to cast both pointers to `intptr_t`, perform the subtraction in that type, and then cast the result back to `off_t`. Signed-off-by: Radostin Stoyanov --- test/zdtm/static/maps12.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c index b645595bec..f0d6c23819 100644 --- a/test/zdtm/static/maps12.c +++ b/test/zdtm/static/maps12.c @@ -111,7 +111,8 @@ static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), - filemap ? 
fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); + filemap ? fd : -1, + filemap ? (off_t)((intptr_t)addr_hint - (intptr_t)map_base) : 0); if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) return MAP_FAILED; From 11972be56444f46b24e11f0edb50832814da81e8 Mon Sep 17 00:00:00 2001 From: Dong Sunchao Date: Wed, 20 Aug 2025 12:38:18 +0000 Subject: [PATCH 074/137] zdtm/static/sock_opts00: use unix socket to test SO_PASSCRED and SO_PASSSEC SO_PASSCRED and SO_PASSSEC are only valid for AF_UNIX and AF_NETLINK This patch updates the test logic to use a unix socket for these options, while preserving the original value consistency check Fixes: #2705 Signed-off-by: Dong Sunchao --- test/zdtm/static/sock_opts00.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index fcf00ffed8..854aaa5911 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -31,7 +31,7 @@ int main(int argc, char **argv) static const int NOPTS = sizeof(vname) / sizeof(*vname); #undef OPT - int sock, ret = 0, val[NOPTS], rval, i; + int sock, usock, sk, ret = 0, val[NOPTS], rval, i; socklen_t len = sizeof(int); test_init(argc, argv); @@ -42,8 +42,15 @@ int main(int argc, char **argv) return 1; } + usock = socket(AF_UNIX, SOCK_STREAM, 0); + if (usock < 0) { + pr_perror("can't create unix socket"); + return 1; + } + for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? 
usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { pr_perror("can't get %s", vname[i].name); return 1; @@ -51,13 +58,13 @@ int main(int argc, char **argv) val[i]++; - ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); + ret = setsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't re-get %s", vname[i].name); return 1; @@ -78,7 +85,8 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't verify %s", vname[i].name); return 1; @@ -93,6 +101,7 @@ int main(int argc, char **argv) pass(); close(sock); + close(usock); return 0; } From 610ee3c50315375545b77aac6e600caeb4d81a47 Mon Sep 17 00:00:00 2001 From: Dong Sunchao Date: Wed, 20 Aug 2025 12:38:37 +0000 Subject: [PATCH 075/137] criu/sockets: Restrict SO_PASSCRED and SO_PASSSEC to supported families Linux 6.16+ restricts SO_PASSCRED and SO_PASSSEC to AF_UNIX, AF_NETLINK, and AF_BLUETOOTH. This patch updates CRIU to check the socket family before dumping these options. Fixes: #2705 Signed-off-by: Dong Sunchao --- criu/include/sockets.h | 2 +- criu/sk-inet.c | 2 +- criu/sk-netlink.c | 2 +- criu/sk-packet.c | 2 +- criu/sk-unix.c | 2 +- criu/sockets.c | 16 +++++++++------- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/criu/include/sockets.h b/criu/include/sockets.h index c3e7c879a7..6c81d3edd7 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -25,7 +25,7 @@ struct socket_desc { }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *);
-extern int dump_socket_opts(int sk, SkOptsEntry *soe); +extern int dump_socket_opts(int sk, int family, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 6e0acf2ce3..422edc6567 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -581,7 +581,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa if (dump_ip_opts(lfd, family, type, proto, &ipopts)) goto err; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, family, &skopts)) goto err; pr_info("Dumping inet socket at %d\n", p->fd); diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index a219b69be1..dc2baa1b80 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -165,7 +165,7 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_NETLINK, &skopts)) goto err; fe.type = FD_TYPES__NETLINKSK; diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 1d2e23522a..6530bff580 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -173,7 +173,7 @@ static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) psk.fown = (FownEntry *)&p->fown; psk.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_PACKET, &skopts)) return -1; psk.protocol = sd->proto; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 70ca16be4a..6145fe7347 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -527,7 +527,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) } } dump: - if (dump_socket_opts(lfd, skopts)) + if (dump_socket_opts(lfd, AF_UNIX, skopts)) goto err; pr_info("Dumping unix socket at %d\n", p->fd); diff --git a/criu/sockets.c b/criu/sockets.c index 0affccad02..e4adae03cd 100644 --- a/criu/sockets.c +++ 
b/criu/sockets.c @@ -649,7 +649,7 @@ int do_dump_opt(int sk, int level, int name, void *val, int len) return 0; } -int dump_socket_opts(int sk, SkOptsEntry *soe) +int dump_socket_opts(int sk, int family, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; @@ -688,13 +688,15 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) soe->so_reuseport = val ? true : false; soe->has_so_reuseport = true; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); - soe->has_so_passcred = true; - soe->so_passcred = val ? true : false; + if (family == AF_UNIX || family == AF_NETLINK) { + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + soe->has_so_passcred = true; + soe->so_passcred = val ? true : false; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); - soe->has_so_passsec = true; - soe->so_passsec = val ? true : false; + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + soe->has_so_passsec = true; + soe->so_passsec = val ? true : false; + } ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); soe->has_so_dontroute = true; From 418c8fa7b443212743a6cc770cf4c167e2c45f45 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 8 Sep 2025 12:48:34 -0700 Subject: [PATCH 076/137] ci: avoid Docker 28 due to regression This change modifies the CI script to avoid Docker version 28, which has a known regression that breaks Checkpoint/Restore (C/R) functionality. The issue is tracked in the moby/moby project as https://github.com/moby/moby/issues/50750. Signed-off-by: Andrei Vagin --- scripts/ci/docker-test.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index aaf443afdc..ae7f52454d 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,6 +2,24 @@ set -x -e -o pipefail +# Workaround: Docker 28.x has a known regression that breaks the checkpoint and +# restore (C/R) feature. Let's install previous, or next major version. 
See +# https://github.com/moby/moby/issues/50750 for details on the bug. +export DEBIAN_FRONTEND=noninteractive +apt remove -y docker-ce docker-ce-cli +./apt-install -y ca-certificates curl +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +chmod a+r /etc/apt/keyrings/docker.asc +# shellcheck disable=SC1091 +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list +apt update -y +apt-cache madison docker-ce | awk '{ print $3 }' +verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" +./apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" + # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json service docker restart From 032326ce84e681d05a599bc05e09dab739fc3503 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 14 Sep 2025 19:29:16 -0700 Subject: [PATCH 077/137] zdtm: stop importing junit_xml We are dropping support for generating JUnit XML reports in zdtm.py as we've migrated testing infrastructure entirely to `GitHub Actions` and other third-party test runners. This package has been removed from some distribution repositories (e.g., Fedora), making it simpler to remove the dependency than to force installation via pip. 
Signed-off-by: Andrei Vagin --- .cirrus.yml | 2 +- scripts/build/Dockerfile.alpine | 2 -- scripts/build/Dockerfile.archlinux | 1 - scripts/build/Dockerfile.centos8 | 2 -- scripts/ci/prepare-for-fedora-rawhide.sh | 1 - scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 2 +- test/jenkins/criu-lazy-migration.pipeline | 1 - test/zdtm.py | 24 +---------------------- 9 files changed, 4 insertions(+), 33 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index bddd5a3f1c..848e141329 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -34,7 +34,7 @@ task: setup_script: | dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python3-importlib-metadata xmlto libdrm-devel libuuid-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index d843793ea2..819fda0c38 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -48,6 +48,4 @@ RUN apk add \ # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml --break-system-packages - RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 9d11194bb0..d4b432f8d6 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -32,7 +32,6 @@ RUN pacman -Syu --noconfirm \ go \ python-yaml \ asciidoctor \ - python-junit-xml \ python-importlib-metadata \ libdrm \ util-linux-libs \ diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index a672123441..5ab6c9cfa4 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -45,6 +45,4 @@ RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 test -RUN pip3 install junit_xml - RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8ad9cf978..f8f797c1e5 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -26,7 +26,6 @@ dnf install -y \ protobuf-devel \ python3-PyYAML \ python3-protobuf \ - python3-junit_xml \ python3-pip \ python3-importlib-metadata \ python-unversioned-command \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 0c4a089757..617f54fc6e 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -6,7 +6,7 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time libbsd-dev python3-yaml 
uuid-dev libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata python3-junit.xml libdrm-dev) + python3-importlib-metadata libdrm-dev) X86_64_PKGS=(gcc-multilib) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 98942e7565..c222e30e05 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -44,7 +44,7 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ + protobuf-devel python3-protobuf python3-importlib-metadata \ rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket diff --git a/test/jenkins/criu-lazy-migration.pipeline b/test/jenkins/criu-lazy-migration.pipeline index 2c863f170d..45dc2c7766 100644 --- a/test/jenkins/criu-lazy-migration.pipeline +++ b/test/jenkins/criu-lazy-migration.pipeline @@ -21,7 +21,6 @@ pipeline { stage('Test'){ steps { sh './test/jenkins/run_ct sh -c "mount --make-rprivate / && mount --rbind . 
/mnt && cd /mnt && ./test/jenkins/criu-lazy-migration.sh"' - junit 'test/report/criu-testreport*.xml' } } } diff --git a/test/zdtm.py b/test/zdtm.py index 3339dd8167..7e83aa4df9 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2078,8 +2078,6 @@ def __init__(self, opts, nr_tests): self.__subs = {} self.__fail = False self.__file_report = None - self.__junit_file = None - self.__junit_test_cases = None self.__failed = [] self.__nr_skip = 0 if self.__max > 1 and self.__total > 1: @@ -2091,22 +2089,14 @@ def __init__(self, opts, nr_tests): if opts['report'] and (opts['keep_going'] or self.__total == 1): global TestSuite, TestCase - from junit_xml import TestCase, TestSuite now = datetime.datetime.now() att = 0 reportname = os.path.join(report_dir, "criu-testreport.tap") - junitreport = os.path.join(report_dir, "criu-testreport.xml") - while os.access(reportname, os.F_OK) or os.access( - junitreport, os.F_OK): + while os.access(reportname, os.F_OK): reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) - junitreport = os.path.join(report_dir, - "criu-testreport" + ".%d.xml" % att) att += 1 - self.__junit_file = open(junitreport, 'a') - self.__junit_test_cases = [] - self.__file_report = open(reportname, 'a') print(u"TAP version 13", file=self.__file_report) print(u"# Hardware architecture: " + arch, file=self.__file_report) @@ -2141,10 +2131,6 @@ def skip(self, name, reason): self.__runtest += 1 self.__nr_skip += 1 - if self.__junit_test_cases is not None: - tc = TestCase(name) - tc.add_skipped_info(reason) - self.__junit_test_cases.append(tc) if self.__file_report: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report) @@ -2247,10 +2233,6 @@ def __wait_one(self, flags): # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() tc = None - if self.__junit_test_cases is not None: - tc = TestCase(sub['name'], - elapsed_sec=time.time() - sub['start']) - 
self.__junit_test_cases.append(tc) if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2307,10 +2289,6 @@ def finish(self): if not opts['fault'] and check_core_files(): self.__fail = True if self.__file_report: - ts = TestSuite(opts['title'], self.__junit_test_cases, - os.getenv("NODE_NAME")) - self.__junit_file.write(TestSuite.to_xml_string([ts])) - self.__junit_file.close() self.__file_report.close() if opts['keep_going']: From 75068c6c74dc0f75bdbc11b1a4af3689b0fddcd2 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Thu, 18 Sep 2025 10:01:48 +0200 Subject: [PATCH 078/137] pagemap: prevent integer overflow in pagemap_len Fixes #2738 Original-patch-by: Andrey Vagin Signed-off-by: Lorenzo Fontana --- criu/include/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9c..fae110108c 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -121,7 +121,7 @@ extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned lo static inline unsigned long pagemap_len(PagemapEntry *pe) { - return pe->nr_pages * PAGE_SIZE; + return (unsigned long)pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) From 2c08fcfb48d8669e5984bb8834ff508e769f85ab Mon Sep 17 00:00:00 2001 From: dong sunchao Date: Thu, 18 Sep 2025 03:09:30 +1000 Subject: [PATCH 079/137] compel/mips: Relax ELF magic check to support MIPS libraries On MIPS platforms, shared libraries may use EI_ABIVERSION = 5 to indicate support for .MIPS.xhash sections. The previous ELF header check in handle_binary() strictly compared e_ident against a hardcoded value, causing legitimate shared objects to be rejected. This patch replaces the memcmp-based check with a structured validation of ELF magic and class, and allows EI_ABIVERSION values other than 0.
fixes: #2745 Signed-off-by: dong sunchao --- compel/arch/mips/src/lib/handle-elf.c | 31 +++++++++++++++++++-------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/compel/arch/mips/src/lib/handle-elf.c b/compel/arch/mips/src/lib/handle-elf.c index a605a5a452..e086761c21 100644 --- a/compel/arch/mips/src/lib/handle-elf.c +++ b/compel/arch/mips/src/lib/handle-elf.c @@ -5,18 +5,31 @@ #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { - if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) - return __handle_elf(mem, size); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)mem; + + /* check ELF magic */ + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return -EINVAL; + } + + /* check ELF class and data encoding */ + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF class or data encoding\n"); + return -EINVAL; + } + + if (ehdr->e_ident[EI_ABIVERSION] != 0) { + pr_warn("Unusual ABI version: %d\n", ehdr->e_ident[EI_ABIVERSION]); + } - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; + return __handle_elf(mem, size); } From 74986b4190e6bd6999418ea38583ae9df541e73b Mon Sep 17 00:00:00 2001 From: Filip Hejsek Date: Sat, 13 Sep 2025 19:49:24 +0200 Subject: [PATCH 080/137] lsm: use attr/apparmor/current to get apparmor label On some kernels, attr/current can be intercepted by BPF LSM, causing errors (#2033). Using attr/apparmor/current is preferable, because it is guaranteed to return the apparmor label. attr/current will still be used as a fallback for older kernels. 
Fixes: #2033 Signed-off-by: Filip Hejsek --- criu/lsm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/lsm.c b/criu/lsm.c index 70b66d42ee..5faf3e5b2f 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -29,7 +29,9 @@ static int apparmor_get_label(pid_t pid, char **profile_name) FILE *f; char *space; - f = fopen_proc(pid, "attr/current"); + f = fopen_proc(pid, "attr/apparmor/current"); + if (!f) + f = fopen_proc(pid, "attr/current"); if (!f) return -1; From 630a5d81cf3d72f72b8d6978c4ad6cdc714f6670 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 14 Sep 2025 18:44:51 -0700 Subject: [PATCH 081/137] files: fork helpers without CLONE_FILES | CLONE_FS On restore, CRIU needs to change mount namespaces to properly restore files and unix sockets. However, the kernel prevents this if a process is sharing its file system information (fs) with other processes. Fixes #2687 Signed-off-by: Andrei Vagin --- criu/files.c | 1 - criu/pstree.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/criu/files.c b/criu/files.c index f16ec32a23..af4b8aeac8 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1329,7 +1329,6 @@ int prepare_fds(struct pstree_item *me) } } - BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) diff --git a/criu/pstree.c b/criu/pstree.c index 75c2fc8d0a..cee8b5741a 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -237,9 +237,8 @@ int init_pstree_helper(struct pstree_item *ret) { BUG_ON(!ret->parent); ret->pid->state = TASK_HELPER; - rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; - if (shared_fdt_prepare(ret) < 0) - return -1; + rsti(ret)->clone_flags = 0; + INIT_LIST_HEAD(&rsti(ret)->fds); task_entries->nr_helpers++; return 0; } From 21b14e191a4421852b176f93eeefbfc08bffac74 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 18 Sep 2025 14:48:42 +0000 Subject: [PATCH 082/137] pagemap: change PagemapEntry.nr_pages to uint64 to support huge mappings Update the nr_pages field in 
PagemapEntry to uint64 to prepare for checkpointing and restoring huge memory mappings. Backward compatibility with older pagemap images is preserved. Signed-off-by: Andrei Vagin --- criu/include/pagemap.h | 2 +- criu/page-xfer.c | 1 + criu/pagemap.c | 5 ++++- images/pagemap.proto | 3 ++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index fae110108c..3ae15deb9c 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -121,7 +121,7 @@ extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned lo static inline unsigned long pagemap_len(PagemapEntry *pe) { - return (unsigned long)pe->nr_pages * PAGE_SIZE; + return pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 0314963e6d..b0e04d82c5 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -326,6 +326,7 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag pe.nr_pages = iov->iov_len / PAGE_SIZE; pe.has_flags = true; pe.flags = flags; + pe.has_nr_pages = true; if (flags & PE_PRESENT) { if (opts.auto_dedup && xfer->parent != NULL) { diff --git a/criu/pagemap.c b/criu/pagemap.c index 85bb922596..d9ccc03eb6 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%u vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lu vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } @@ -682,6 +682,9 @@ static void init_compat_pagemap_entry(PagemapEntry *pe) pe->flags |= PE_PARENT; else if (!pe->has_flags) pe->flags = PE_PRESENT; + + if (!pe->has_nr_pages) + pe->nr_pages = pe->compat_nr_pages; } 
/* diff --git a/images/pagemap.proto b/images/pagemap.proto index e6d341b0f6..f2436a51ac 100644 --- a/images/pagemap.proto +++ b/images/pagemap.proto @@ -10,7 +10,8 @@ message pagemap_head { message pagemap_entry { required uint64 vaddr = 1 [(criu).hex = true]; - required uint32 nr_pages = 2; + required uint32 compat_nr_pages = 2; optional bool in_parent = 3; optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; + optional uint64 nr_pages = 5; } From dffb8543c1c3edafa299095a7bb378b9101b9f90 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 18 Sep 2025 15:20:32 +0000 Subject: [PATCH 083/137] pagemap: use unsigned long for page counts Variables storing page counts were previously `unsigned int`, limiting them to a maximum of 2^32 pages. With a 4k page size, this corresponds to a 16TB memory mapping, which is insufficient for larger mappings. This commit changes the type for these variables to `unsigned long` to support larger memory mappings. Signed-off-by: Andrei Vagin --- criu/include/page-pipe.h | 6 +++--- criu/include/page-xfer.h | 6 +++--- criu/include/pagemap.h | 6 +++--- criu/include/parasite.h | 2 +- criu/mem.c | 2 +- criu/page-pipe.c | 6 +++--- criu/page-xfer.c | 20 ++++++++++---------- criu/pagemap.c | 22 +++++++++++----------- criu/pie/parasite.c | 2 +- criu/uffd.c | 25 ++++++++++++------------- 10 files changed, 48 insertions(+), 49 deletions(-) diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index 15178c0150..65292b7ab1 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -92,9 +92,9 @@ struct kernel_pipe_buffer { struct page_pipe_buf { int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ - unsigned int pipe_off; /* where this buf is started in a pipe */ - unsigned int pages_in; /* how many pages are there */ unsigned int nr_segs; /* how many iov-s are busy */ + unsigned long pipe_off; /* where this buf is started in a pipe */ + unsigned long pages_in; /* how many 
pages are there */ #define PPB_LAZY (1 << 0) unsigned int flags; struct iovec *iov; /* vaddr:len map */ @@ -149,7 +149,7 @@ struct pipe_read_dest { }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); -extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 36fe670928..0d9b350194 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -69,9 +69,9 @@ extern int check_parent_page_xfer(int fd_type, unsigned long id); */ /* async request/receive of remote pages */ -extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); +extern int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages); -typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); -extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, unsigned long nr_pages, void *); +extern int page_server_start_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9c..4cbc87cc6d 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -44,7 +44,7 @@ struct page_read { /* reads page from current pagemap */ - int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); + int (*read_pages)(struct page_read *, unsigned long vaddr, unsigned long nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct 
page_read *pr); void (*close)(struct page_read *); @@ -52,8 +52,8 @@ struct page_read { int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); - int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); - int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); + int (*io_complete)(struct page_read *, unsigned long vaddr, unsigned long nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index b33d6710f8..1763577111 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -63,7 +63,7 @@ struct parasite_dump_pages_args { unsigned int add_prot; unsigned int off; unsigned int nr_segs; - unsigned int nr_pages; + unsigned long nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) diff --git a/criu/mem.c b/criu/mem.c index 0636273cbe..f8c5508428 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -336,7 +336,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; - pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, + pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); diff --git a/criu/page-pipe.c b/criu/page-pipe.c index aab6742be7..f8e3520f79 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -381,7 +381,7 @@ int pipe_read_dest_init(struct pipe_read_dest *prd) return 0; } -int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +int 
page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long int *nr_pages, unsigned int ppb_flags) { struct page_pipe_buf *ppb; @@ -406,7 +406,7 @@ int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned lo } /* clamp the request if it passes the end of iovec */ - len = min((unsigned long)iov->iov_base + iov->iov_len - addr, (unsigned long)(*nr_pages) * PAGE_SIZE); + len = min((unsigned long)iov->iov_base + iov->iov_len - addr, *nr_pages * PAGE_SIZE); *nr_pages = len / PAGE_SIZE; skip += ppb->pipe_off * PAGE_SIZE; @@ -446,7 +446,7 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %u pages, %u iovs, flags: %x pipe_off: %x :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lu pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; diff --git a/criu/page-xfer.c b/criu/page-xfer.c index b0e04d82c5..4d057163d9 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -32,7 +32,7 @@ static int page_server_sk = -1; struct page_server_iov { u32 cmd; - u32 nr_pages; + u64 nr_pages; u64 vaddr; u64 dst_id; }; @@ -886,7 +886,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %ld/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -1071,7 +1071,7 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%u\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 "/%lu\n", pi->vaddr, pi->nr_pages); if (prep_loc_xfer(pi)) 
return -1; @@ -1348,7 +1348,7 @@ static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) { struct page_read pr; - int nr_pages = 0; + unsigned long nr_pages = 0; if (open_page_read(pid, &pr, PR_TASK) <= 0) { pr_err("Failed to open page read for %d\n", pid); @@ -1551,13 +1551,13 @@ struct ps_async_read { static LIST_HEAD(async_reads); -static inline void async_read_set_goal(struct ps_async_read *ar, int nr_pages) +static inline void async_read_set_goal(struct ps_async_read *ar, unsigned long nr_pages) { ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; ar->nr_pages = nr_pages; } -static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages, ps_async_read_complete complete, +static void init_ps_async_read(struct ps_async_read *ar, void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { ar->pages = buf; @@ -1567,7 +1567,7 @@ static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages async_read_set_goal(ar, nr_pages); } -static int page_server_start_async_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv) +static int page_server_start_async_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { struct ps_async_read *ar; @@ -1667,7 +1667,7 @@ int connect_to_page_server_to_recv(int epfd) return epoll_add_rfd(epfd, &ps_rfd); } -int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) +int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages) { struct page_server_iov pi = { .cmd = PS_IOV_GET, @@ -1684,7 +1684,7 @@ int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) return 0; } -static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete complete, void *priv) +static int page_server_start_sync_read(void *buf, unsigned long nr, ps_async_read_complete complete, void 
*priv) { struct ps_async_read ar; int ret = 1; @@ -1695,7 +1695,7 @@ static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete return ret; } -int page_server_start_read(void *buf, int nr, ps_async_read_complete complete, void *priv, unsigned flags) +int page_server_start_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv, unsigned flags) { if (flags & PR_ASYNC) return page_server_start_async_read(buf, nr, complete, priv); diff --git a/criu/pagemap.c b/criu/pagemap.c index d9ccc03eb6..16d680fdbb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -168,15 +168,15 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) return 0; } -static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) +static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lu vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lu vs %lx:%lu\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } -static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned long int nr, void *buf, unsigned flags) { struct page_read *ppr = pr->parent; int ret; @@ -195,7 +195,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v */ do { - int p_nr; + unsigned long int p_nr; pr_debug("\tpr%lu-%u Read from parent\n", pr->img_id, pr->id); ret = ppr->seek_pagemap(ppr, vaddr); @@ -210,7 +210,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v * read as much as we can. 
*/ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; - pr_info("\tparent has %u pages in\n", p_nr); + pr_info("\tparent has %lu pages in\n", p_nr); if (p_nr > nr) p_nr = nr; @@ -374,7 +374,7 @@ int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, st return 0; } -static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; unsigned long len = nr * PAGE_SIZE; @@ -402,7 +402,7 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int * We cannot use maybe_read_page_local() for streaming images as it uses * pread(), seeking in the file. Instead, we use this custom page reader. */ -static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { unsigned long len = nr * PAGE_SIZE; int fd; @@ -445,7 +445,7 @@ static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vadd return ret; } -static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) +static int read_page_complete(unsigned long img_id, unsigned long vaddr, unsigned long int nr_pages, void *priv) { int ret = 0; struct page_read *pr = priv; @@ -463,7 +463,7 @@ static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_ return ret; } -static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; @@ -474,9 +474,9 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int return ret; } -static int 
read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { - pr_info("pr%lu-%u Read %lx %u pages\n", pr->img_id, pr->id, vaddr, nr); + pr_info("pr%lu-%u Read %lx %lu pages\n", pr->img_id, pr->id, vaddr, nr); pagemap_bound_check(pr->pe, vaddr, nr); if (pagemap_in_parent(pr->pe)) { diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 1bc03dc2a0..c966e9e62c 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -101,7 +101,7 @@ static int dump_pages(struct parasite_dump_pages_args *args) } if (spliced_bytes != args->nr_pages * PAGE_SIZE) { sys_close(p); - pr_err("Can't splice all pages to pipe (%ld/%d)\n", spliced_bytes, args->nr_pages); + pr_err("Can't splice all pages to pipe (%ld/%ld)\n", spliced_bytes, args->nr_pages); return -1; } diff --git a/criu/uffd.c b/criu/uffd.c index 98c2b7e075..8e12dcd636 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -668,12 +668,11 @@ static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, unsigned */ static int collect_iovs(struct lazy_pages_info *lpi) { + unsigned long start, end, len, nr_pages = 0; + int n_vma = 0, max_iov_len = 0, ret = -1; struct page_read *pr = &lpi->pr; struct lazy_iov *iov; MmEntry *mm; - int nr_pages = 0, n_vma = 0, max_iov_len = 0; - int ret = -1; - unsigned long start, end, len; mm = init_mm_entry(lpi); if (!mm) @@ -728,7 +727,7 @@ static int collect_iovs(struct lazy_pages_info *lpi) return ret; } -static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); +static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, unsigned long nr); static int ud_open(int client, struct lazy_pages_info **_lpi) { @@ -822,7 +821,7 @@ static bool uffd_recoverable_error(int mcopy_rc) return false; } -static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int *nr_pages, long mcopy_rc) +static 
int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsigned long *nr_pages, long mcopy_rc) { if (errno == ENOSPC || errno == ESRCH) { handle_exit(lpi); @@ -844,7 +843,7 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int return 0; } -static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) +static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages) { struct uffdio_copy uffdio_copy; unsigned long len = *nr_pages * page_size(); @@ -865,12 +864,12 @@ static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) return 0; } -static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) +static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsigned long nr) { struct lazy_pages_info *lpi; - unsigned long addr = 0; - int req_pages, ret; + unsigned long addr = 0, req_pages; struct lazy_iov *req; + int ret; lpi = container_of(pr, struct lazy_pages_info, pr); @@ -920,7 +919,7 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr return drop_iovs(lpi, addr, nr * PAGE_SIZE); } -static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) +static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages) { struct uffdio_zeropage uffdio_zeropage; unsigned long len = page_size() * nr_pages; @@ -946,7 +945,7 @@ static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) * Returns 0 for zero pages, 1 for "real" pages and negative value on * error */ -static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) +static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr) { int ret; @@ -961,7 +960,7 @@ static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) return 0; } -static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) 
+static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr, unsigned flags) { int ret; @@ -1003,7 +1002,7 @@ static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) static int xfer_pages(struct lazy_pages_info *lpi) { struct lazy_iov *iov; - unsigned int nr_pages; + unsigned long nr_pages; unsigned long len; int err; From b0a2914dfdfa85f91f3375c8edce25eaf742cf3c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 19 Sep 2025 15:10:25 +0000 Subject: [PATCH 084/137] pagemap: print page regions in the format `start - end` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During investigations, it’s much easier to read logs when regions are printed in the start - end format rather than `start/size`. In addition, all page counters and memory sizes are now printed in hexadecimal, as they are hard to read in decimal form. Signed-off-by: Andrei Vagin --- criu/cr-dedup.c | 3 ++- criu/page-pipe.c | 6 +++--- criu/page-xfer.c | 23 +++++++++++++---------- criu/pagemap.c | 2 +- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c index c0c21f53ee..feeb9ebb03 100644 --- a/criu/cr-dedup.c +++ b/criu/cr-dedup.c @@ -87,7 +87,8 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) if (ret <= 0) goto exit; - pr_debug("dedup iovec base=%" PRIx64 ", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); + pr_debug("dedup iovec %" PRIx64 " - %" PRIx64 "\n", + pr.pe->vaddr, pr.pe->vaddr + pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) diff --git a/criu/page-pipe.c b/criu/page-pipe.c index f8e3520f79..4601d8f9cd 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -446,17 +446,17 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - 
pr_debug("\tbuf %lu pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lx pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; - pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } pr_debug("* %u holes:\n", pp->free_hole); for (i = 0; i < pp->free_hole; i++) { iov = &pp->holes[i]; - pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 4d057163d9..e2913b9244 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -178,12 +178,12 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le ssize_t ret, left = len; if (opts.tls) { - pr_debug("Sending %lu bytes / %lu pages\n", len, len / PAGE_SIZE); + pr_debug("Sending %lx bytes\n", len); if (tls_send_data_from_fd(p, len)) return -1; } else { - pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); + pr_debug("Splicing %lx bytes into socket\n", len); while (left > 0) { ret = splice(p, NULL, xfer->sk, NULL, left, SPLICE_F_MOVE); @@ -192,7 +192,7 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le return -1; } - pr_debug("\tSpliced: %lu bytes sent\n", (unsigned long)ret); + pr_debug("\tSpliced: %lx bytes sent\n", (unsigned long)ret); left -= ret; } } @@ -288,7 +288,7 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) * read_pagemap_page routine. 
*/ - pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); + pr_debug("Checking %p - %p hole\n", iov->iov_base, iov->iov_base + iov->iov_len); off = (unsigned long)iov->iov_base; end = off + iov->iov_len; while (1) { @@ -300,7 +300,8 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) return -1; } - pr_debug("\tFound %" PRIx64 "/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); + pr_debug("\tFound %" PRIx64 " - %" PRIx64 "\n", + p->pe->vaddr, p->pe->vaddr + pagemap_len(p->pe)); /* * The pagemap entry in parent may happen to be @@ -340,7 +341,8 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag if (xfer->parent != NULL) { ret = check_pagehole_in_parent(xfer->parent, iov); if (ret) { - pr_err("Hole %p/%zu not found in parent\n", iov->iov_base, iov->iov_len); + pr_err("Hole %p - %p not found in parent\n", + iov->iov_base, iov->iov_base + iov->iov_len); return -1; } } @@ -850,7 +852,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\t p %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\t p %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); @@ -886,7 +888,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %ld/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %lx/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -898,7 +900,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\tp %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\tp %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); 
@@ -1071,7 +1073,8 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%lu\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 " - %" PRIx64 "\n", + pi->vaddr, pi->vaddr + pi->nr_pages * PAGE_SIZE); if (prep_loc_xfer(pi)) return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index 16d680fdbb..b6ec3e3332 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lu vs %lx:%lu\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lx vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } From 1de54cd820ecd5dce0fb5871f75e8d03ad4420c2 Mon Sep 17 00:00:00 2001 From: dong sunchao Date: Tue, 23 Sep 2025 01:00:12 +1000 Subject: [PATCH 085/137] vdso: relax EI_OSABI check to support linux in ELF header On some ARM/aarch64 systems, the VDSO ELF header sets EI_OSABI to 3 (Linux), while CRIU expects 0 (System V). This strict check causes restore to fail with "ELF header magic mismatch" This patch relaxes the check to accept both values, improving compatibility with modern toolchains and kernels (e.g. Linux 6.12+) Fixes: #2751 Signed-off-by: dong sunchao --- criu/pie/util-vdso.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 8daf5c71f3..45fb6a648b 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -98,25 +98,45 @@ static unsigned long elf_gnu_hash(const unsigned char *name) static int has_elf_identity(Ehdr_t *ehdr) { - /* - * See Elf specification for this magic values. 
- */ + /* check ELF magic */ + + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return false; + }; + + /* check ELF class */ #if defined(CONFIG_VDSO_32) - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS32) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #else - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #endif - BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); - - if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { - pr_err("ELF header magic mismatch\n"); + /* check ELF data encoding */ + if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF data encoding: %d\n", ehdr->e_ident[EI_DATA]); return false; - } + }; + /* check ELF version */ + if (ehdr->e_ident[EI_VERSION] != EV_CURRENT) { + pr_err("Unsupported ELF version: %d\n", ehdr->e_ident[EI_VERSION]); + return false; + }; + /* check ELF OSABI */ + if (ehdr->e_ident[EI_OSABI] != ELFOSABI_NONE && + ehdr->e_ident[EI_OSABI] != ELFOSABI_LINUX) { + pr_err("Unsupported OSABI version: %d\n", ehdr->e_ident[EI_OSABI]); + return false; + }; return true; } From ae9863ff98ee76ae3bc423eb456aeff9ad1695ce Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 22 Sep 2025 17:59:29 +0000 Subject: [PATCH 086/137] zdtm: Remove junit_xml leftovers The previous commit 4cd4a6b1ac15 ("zdtm: stop importing junit_xml") removed the junit_xml library, but some variables related to it were left in the code. 
This commit removes the unused `tc` variable and a call to its `add_error_info` method. Fixes: 4cd4a6b1ac15 ("zdtm: stop importing junit_xml") Signed-off-by: Andrei Vagin --- test/zdtm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index 7e83aa4df9..e21356c30a 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2232,7 +2232,6 @@ def __wait_one(self, flags): # The following wait() is not useful for our domain logic. # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() - tc = None if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2243,7 +2242,6 @@ def __wait_one(self, flags): with open(sub['log']) as sublog: output = sublog.read() details = {'output': output} - tc.add_error_info(output=output) print(testline, file=self.__file_report) print("%s" % yaml.safe_dump(details, explicit_start=True, From ebcc2676ac0a8f7c24a24e8bb674c9ced01c1b56 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 19 Sep 2025 00:34:56 +0000 Subject: [PATCH 087/137] docs: add developer overviews for AI assistants This commit adds the document to provide high-level overviews of the CRIU project for AI assistants like Claude and Gemini. These documents are intended to be used as context for AI-powered developer assistants to help them understand the project's goals, architecture, and development process. This will allow them to provide more accurate and helpful responses to developer questions. 
The documents include: - A brief introduction to CRIU - A quick start guide for checkpointing and restoring a simple process - An overview of the dump and restore process - A description of the Compel subproject - Information about the project's coding style, code layout, and tests Signed-off-by: Andrei Vagin --- CLAUDE.md | 1 + GEMINI.md | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 120000 CLAUDE.md create mode 100644 GEMINI.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 0000000000..e3c5a92d9f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +GEMINI.md \ No newline at end of file diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 0000000000..e56c1de12d --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,136 @@ +# CRIU (Checkpoint/Restore In User-space) + +CRIU is a tool for saving the state of a running application to a set of files +(checkpointing) and restoring it back to a live state. It is primarily used for +live migration of containers, in-place updates, and fast application startup. + +It is implemented as a command-line tool called `criu`. The two primary commands +are `dump` and `restore`. + +- `dump`: Saves a process tree and all its related resources (file + descriptors, IPC, sockets, namespaces, etc.) into a collection of image + files. +- `restore`: Restores processes from image files to the same state they were + in before the dump. + +## Quick Start + +To get a feel for `criu`, you can try checkpointing and restoring a simple +process. + +1. **Run a simple process:** + Open a terminal and run a command that will run for a while. Find its PID. + ```bash + sleep 1000 & + [1] 12345 + ``` + +2. **Dump the process:** + As root, use `criu dump` with the process ID (`-t`) and a directory for the + image files (`-D`). + ```bash + sudo criu dump -t 12345 -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will no longer be running. + +3. 
**Restore the process:** + Use `criu restore` to bring the process back to life from the images. + ```bash + sudo criu restore -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will be running again as if nothing happened. + +# For Developers and Contributors + +This section contains more technical details about CRIU's internals and +development process. + +## Dump Process + +On dump, CRIU uses available kernel interfaces to collect information about +processes. For properties that can only be retrieved from within the process +itself, CRIU injects a binary blob (called a "parasite") into the process's +address space and executes it in the context of one of the process's threads. +This injection is handled by a subproject called **Compel**. + +## Restore Process + +On restore, CRIU reads the image files to reconstruct the processes. The goal is +to restore them to the exact state they were in before the dump. The restore +process is divided into several stages (defined as `CR_STATE_*` in +`./criu/include/restorer.h`). + +The main `criu` process acts as a coordinator. It first restores resources with +inter-process dependencies (file descriptors, sockets, shared memory, +namespaces, etc.). It then forks the process tree and sets up namespaces. +Finally, it restores process-specific resources like file descriptors and memory +mappings. + +A key step involves a small, self-contained binary called the "restorer". All +restored processes switch to executing this code, which unmaps the CRIU-specific +memory and restores the application's original memory mappings. On the final +step, the restorer calls `sigreturn` on a prepared signal frame to resume the +process with the state it had at the moment of the dump. + +## Compel + +Compel is a subproject responsible for generating the binary blobs used for the +parasite code (for dumping) and the restorer code (for restoring). 
It provides a +library for injecting and executing this code within the target process's +address space. It is a separate project because the logic for generating and +injecting Position-Independent Executable (PIE) code is complex and +self-contained. + +## Coding Style + +The C code in the CRIU project follows the +[Linux Kernel Coding Style](https://www.kernel.org/doc/html/latest/process/coding-style.html). +Here are some of the main points: + +- **Indentation**: Use tabs, which are set to 8 characters. +- **Line Length**: The preferred line limit is 80 characters, but it can be + extended to 120 if it improves code readability. +- **Braces**: + - The opening brace for a function goes on a new line. + - The opening brace for a block (like `if`, `for`, `while`, `switch`) goes + on the same line. +- **Spaces**: Use spaces around operators (`+`, `-`, `*`, `/`, `%`, `<`, `>`, + `=`, etc.). +- **Naming**: Use descriptive names for functions and variables. +- **Comments**: Use C-style comments (`/* ... */`). For multi-line comments, + the preferred format is: + ```c + /* + * This is a multi-line + * comment. + */ + ``` + +## Code Layout + +The code is organized into the following directories: + +- `./compel`: The Compel sub-project. +- `./criu`: The main `criu` tool source code. +- `./images`: Protobuf descriptions for the image files. +- `./test`: All tests. +- `./test/zdtm`: The Zero-Downtime Migration (ZDTM) test suite. +- `./test/zdtm.py`: The executor script for ZDTM tests. +- `./scripts`: Helper scripts. +- `./scripts/build`: Docker image files used for CI and cross-compilation + checks. +- `./crit`: A tool to inspect and manipulate CRIU image files. +- `./soccr`: A library for TCP socket checkpoint/restore. + +## Tests + +The main test suite is ZDTM. Here is an example of how to run a single test: + +```bash +sudo ./test/zdtm.py run -t zdtm/static/env00 +``` + +Each ZDTM test has three stages: preparation, C/R, and results checks. 
During +the test, a process calls `test_daemon()` to signal it is ready for C/R, then +calls `test_waitsig()` to wait for the C/R stage to complete. After being +restored, the test checks that all its resources are still in a valid state. From 2b8e740aa00bf96100f22cb05ecf784310d63802 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 17 Sep 2025 19:14:36 +0900 Subject: [PATCH 088/137] ci: use package-manager dependency install scripts Currently, adding a package which is required either for development or testing requires it to be added in multiple places due to many duplicated Dockerfiles and installation scripts. This makes it difficult to ensure that all scripts are updated appropriately and can lead to some places being missed. This patch consolidates the list of dependencies and adds installation scripts for each package-manager used in our CI (apk, apt, dnf, pacman). This change also replaces the `debian/dev-packages.lst` as this subfolder conflicts with the Ubuntu/Debian packaging scripts used for CRIU: https://github.com/rst0git/criu-deb-packages This patch also removes the CentOS 8 build scripts as it is EOL and the container registry is no longer available.
Signed-off-by: Shashank Balaji Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 12 +-- .github/workflows/check-commits.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/nftables-test.yml | 2 +- CONTRIBUTING.md | 84 ++++++++++++------- Makefile | 3 +- {scripts/ci => contrib}/apt-install | 0 contrib/debian/dev-packages.lst | 19 ----- contrib/dependencies/apk-packages.sh | 38 +++++++++ contrib/dependencies/apt-cross-packages.sh | 34 ++++++++ contrib/dependencies/apt-packages.sh | 40 +++++++++ contrib/dependencies/dnf-packages.sh | 35 ++++++++ contrib/dependencies/pacman-packages.sh | 31 +++++++ scripts/build/Dockerfile.alpine | 43 +--------- scripts/build/Dockerfile.archlinux | 35 +------- scripts/build/Dockerfile.centos8 | 48 ----------- scripts/build/Dockerfile.fedora.tmpl | 5 +- scripts/build/Dockerfile.hotspot-alpine | 25 +----- scripts/build/Dockerfile.hotspot-ubuntu | 28 +------ scripts/build/Dockerfile.linux32.tmpl | 26 +----- scripts/build/Dockerfile.openj9-ubuntu | 28 +------ .../Dockerfile.riscv64-stable-cross.tmpl | 33 +------- scripts/build/Dockerfile.stable-cross.tmpl | 25 +----- scripts/build/Dockerfile.tmpl | 34 +------- scripts/build/Dockerfile.unstable-cross.tmpl | 26 +----- scripts/build/Dockerfile.x86_64.hdr | 2 +- scripts/build/Makefile | 2 +- scripts/ci/Makefile | 2 +- scripts/ci/docker-test.sh | 4 +- scripts/ci/java-test.sh | 2 + scripts/ci/loongarch64-qemu-test.sh | 4 +- scripts/ci/prepare-for-fedora-rawhide.sh | 29 +------ scripts/ci/run-ci-tests.sh | 12 +-- scripts/ci/vagrant.sh | 12 +-- scripts/install-debian-pkgs.sh | 25 ------ 35 files changed, 294 insertions(+), 458 deletions(-) rename {scripts/ci => contrib}/apt-install (100%) delete mode 100644 contrib/debian/dev-packages.lst create mode 100755 contrib/dependencies/apk-packages.sh create mode 100755 contrib/dependencies/apt-cross-packages.sh create mode 100755 contrib/dependencies/apt-packages.sh create mode 100755 contrib/dependencies/dnf-packages.sh create mode 100755 
contrib/dependencies/pacman-packages.sh delete mode 100644 scripts/build/Dockerfile.centos8 delete mode 100755 scripts/install-debian-pkgs.sh diff --git a/.cirrus.yml b/.cirrus.yml index 848e141329..99dd70d63f 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -13,7 +13,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -34,7 +34,7 @@ task: setup_script: | dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python3-importlib-metadata xmlto libdrm-devel libuuid-devel + contrib/dependencies/dnf-packages.sh # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel @@ -63,7 +63,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -83,7 +83,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-non-root @@ -96,7 +96,7 @@ task: memory: 4G script: uname -a build_script: | - scripts/ci/apt-install make + contrib/apt-install make make -C scripts/ci local task: @@ -107,7 +107,7 @@ task: memory: 4G script: uname -a build_script: | - scripts/ci/apt-install make + contrib/apt-install make make -C scripts/ci local CLANG=1 task: diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index 354873909e..bf7d06697c 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -19,7 +19,7 @@ jobs: # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev + run: sudo contrib/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 88e21d3d17..9c9e46c1b2 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ 
-34,7 +34,7 @@ jobs: - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | - sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev + sudo contrib/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL uses: github/codeql-action/init@v3 with: diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml index eb3d8e8141..7a7d8bd309 100644 --- a/.github/workflows/nftables-test.yml +++ b/.github/workflows/nftables-test.yml @@ -15,7 +15,7 @@ jobs: - name: Remove iptables run: sudo apt remove -y iptables - name: Install libnftables-dev - run: sudo scripts/ci/apt-install libnftables-dev + run: sudo contrib/apt-install libnftables-dev - name: chmod 755 /home/runner # CRIU's tests are sometimes running as some random user and need # to be able to access the test files. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 712e7b8132..3ad4aa1019 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,19 +27,43 @@ The repository may contain multiple branches. Development happens in the **criu- To clone CRIU repo and switch to the proper branch, run: ``` - git clone https://github.com/checkpoint-restore/criu criu - cd criu - git checkout criu-dev +git clone https://github.com/checkpoint-restore/criu criu +cd criu +git checkout criu-dev ``` -### Compile +### Building from source -First, you need to install compile-time dependencies. 
Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. Alternatively, you can use the Nix flake to set up a development environment by running `nix develop`. +Follow these steps to compile CRIU from source code. -To compile CRIU, run: +#### Installing build dependencies + +First, you need to install the required build dependencies. We provide scripts to simplify this process for several Linux distributions in [contrib/dependencies](contrib/dependencies). For a complete list of dependencies, please refer to the [installation guide](https://criu.org/Installation). + +##### On Ubuntu/Debian-based systems: + +``` +./contrib/dependencies/apt-packages.sh +``` + +##### On Fedora/CentOS-based systems: + +``` +./contrib/dependencies/dnf-packages.sh +``` + +##### Using Nix: + +``` +nix develop +``` + +#### Compiling CRIU + +Once the dependencies are installed, you can compile CRIU by running the `make` command from the root of the source directory: ``` - make +make ``` This should create the `./criu/criu` executable. @@ -63,7 +87,7 @@ The following command can be used to automatically run a code linter for Python text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` - make lint +make lint ``` In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) @@ -73,7 +97,7 @@ results in decreased readability, we may choose to ignore these errors. Run the following command to check if your changes are compliant with the clang-format rules: ``` - make indent +make indent ``` This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to @@ -83,7 +107,7 @@ can use `BASE=origin/criu-dev`. 
The `OPTS` option can be used to pass additional to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. ``` - make indent OPTS=--diff BASE=HEAD~N +make indent OPTS=--diff BASE=HEAD~N ``` Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected @@ -96,7 +120,7 @@ Here are some bad examples of clang-format-ing: ``` @@ -58,8 +59,7 @@ static int register_membarriers(void) } - + if (!all_ok) { - fail("can't register membarrier()s - tried %#x, kernel %#x", - barriers_registered, barriers_supported); @@ -129,7 +153,7 @@ Here are some bad examples of clang-format-ing: CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run ``` - make test +make test ``` The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. @@ -166,21 +190,21 @@ If your change fixes a bug in a specific commit, e.g. you found an issue using the SHA-1 ID, and the one line summary. For example: ``` - Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") +Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") ``` The following `git config` settings can be used to add a pretty format for outputting the above style in the `git log` or `git show` commands: ``` - [pretty] - fixes = Fixes: %h (\"%s\") +[pretty] + fixes = Fixes: %h (\"%s\") ``` If your change address an issue listed in GitHub, please use `Fixes:` tag with the number of the issue. For instance: ``` - Fixes: #339 +Fixes: #339 ``` The `Fixes:` tags should be put at the end of the detailed description. 
@@ -263,7 +287,7 @@ can certify the below: then you just add a line saying ``` - Signed-off-by: Random J Developer +Signed-off-by: Random J Developer ``` using your real name (please, no pseudonyms or anonymous contributions if @@ -275,14 +299,14 @@ commit message. To append such line to a commit you already made, use ``` From: Random J Developer - Subject: [PATCH] component: Short patch description +Subject: [PATCH] component: Short patch description - Long patch description (could be skipped if patch - is trivial enough) +Long patch description (could be skipped if patch +is trivial enough) - Signed-off-by: Random J Developer - --- - Patch body here +Signed-off-by: Random J Developer +--- +Patch body here ``` ## Submit your work upstream @@ -316,8 +340,8 @@ contains the following: revisions should be listed. For example: ``` - v3: rebase on the current criu-dev - v2: add commit to foo() and update bar() coding style +v3: rebase on the current criu-dev +v2: add commit to foo() and update bar() coding style ``` If there are only minor updates to the commits in a pull request, it is @@ -335,7 +359,7 @@ Historically, CRIU worked with mailing lists and patches so if you still prefer To create a patch, run ``` - git format-patch --signoff origin/criu-dev +git format-patch --signoff origin/criu-dev ``` You might need to read GIT documentation on how to prepare patches @@ -346,8 +370,8 @@ at all. 
We recommend to post patches using `git send-email` ``` - git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev +git send-email --cover-letter --no-chain-reply-to --annotate \ + --confirm=always --to=criu@openvz.org criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -359,14 +383,14 @@ If this is your first time using git send-email, you might need to configure it to point it to your SMTP server with something like: ``` - git config --global sendemail.smtpServer stmp.example.net +git config --global sendemail.smtpServer smtp.example.net ``` If you get tired of typing `--to=criu@openvz.org` all the time, you can configure that to be automatically handled as well: ``` - git config sendemail.to criu@openvz.org +git config sendemail.to criu@openvz.org ``` If a developer is sending another version of the patch (e.g. to address diff --git a/Makefile b/Makefile index 7272cfce19..3e5d62726c 100644 --- a/Makefile +++ b/Makefile @@ -464,7 +464,8 @@ ruff: shellcheck: shellcheck --version shellcheck scripts/*.sh - shellcheck scripts/ci/*.sh scripts/ci/apt-install + shellcheck scripts/ci/*.sh + shellcheck contrib/apt-install contrib/dependencies/*.sh shellcheck -x test/others/crit/*.sh shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh diff --git a/scripts/ci/apt-install b/contrib/apt-install similarity index 100% rename from scripts/ci/apt-install rename to contrib/apt-install diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst deleted file mode 100644 index ce45f1b7cf..0000000000 --- a/contrib/debian/dev-packages.lst +++ /dev/null @@ -1,19 +0,0 @@ -# Required packages for development in Debian -build-essential -libprotobuf-dev -libprotobuf-c-dev -protobuf-c-compiler -protobuf-compiler -python3-protobuf -libnet-dev - -# Extra packages, required for testing and building other tools -pkg-config -libnl-3-dev -libbsd0
-libbsd-dev -iproute2 -libcap-dev -libaio-dev -python3-yaml -libnl-route-3-dev diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh new file mode 100755 index 0000000000..0084dea3ab --- /dev/null +++ b/contrib/dependencies/apk-packages.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env sh + +apk add --no-cache \ + asciidoctor \ + bash \ + build-base \ + coreutils \ + e2fsprogs \ + git \ + gnutls-dev \ + go \ + ip6tables \ + iproute2 \ + iptables \ + iptables-legacy \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libcap-utils \ + libdrm-dev \ + libnet-dev \ + libnl3-dev \ + nftables \ + nftables-dev \ + pkgconfig \ + procps \ + protobuf-c-compiler \ + protobuf-c-dev \ + protobuf-dev \ + py3-importlib-metadata \ + py3-pip \ + py3-protobuf \ + py3-yaml \ + python3 \ + sudo \ + tar \ + util-linux \ + util-linux-dev diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh new file mode 100755 index 0000000000..588be40d02 --- /dev/null +++ b/contrib/dependencies/apt-cross-packages.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + crossbuild-essential-"${DEBIAN_ARCH}" \ + iproute2:"${DEBIAN_ARCH}" \ + libaio-dev:"${DEBIAN_ARCH}" \ + libbz2-dev:"${DEBIAN_ARCH}" \ + libc6-"${DEBIAN_ARCH}"-cross \ + libc6-dev-"${DEBIAN_ARCH}"-cross \ + libcap-dev:"${DEBIAN_ARCH}" \ + libexpat1-dev:"${DEBIAN_ARCH}" \ + libgnutls28-dev:"${DEBIAN_ARCH}" \ + libnet-dev:"${DEBIAN_ARCH}" \ + libnftables-dev:"${DEBIAN_ARCH}" \ + libnl-3-dev:"${DEBIAN_ARCH}" \ + libnl-route-3-dev:"${DEBIAN_ARCH}" \ + libprotobuf-c-dev:"${DEBIAN_ARCH}" \ + libprotobuf-dev:"${DEBIAN_ARCH}" \ + libssl-dev:"${DEBIAN_ARCH}" \ + ncurses-dev:"${DEBIAN_ARCH}" \ + uuid-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + build-essential \ + pkg-config \ + git \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh new file mode 100755 index 0000000000..c60ba9041c --- /dev/null +++ b/contrib/dependencies/apt-packages.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + asciidoctor \ + bash \ + bsdmainutils \ + build-essential \ + gdb \ + git-core \ + iptables \ + kmod \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libdrm-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnet-dev \ + libnl-3-dev \ + libnl-route-3-dev \ + libperl-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-importlib-metadata \ + python3-pip \ + python3-protobuf \ + python3-yaml \ + time \ + util-linux \ + uuid-dev diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh new file mode 100755 index 0000000000..efbb659c54 --- /dev/null +++ b/contrib/dependencies/dnf-packages.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env sh + +dnf install -y \ + asciidoc \ + binutils \ + gcc \ + git \ + glibc-devel \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libbpf-devel \ + libbsd-devel \ + libcap-devel \ + libdrm-devel \ + libnet-devel \ + libnl3-devel \ + libselinux-devel \ + libuuid-devel \ + make \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + protobuf-c-devel \ + protobuf-compiler \ + protobuf-devel \ + python-devel \ + python3-importlib-metadata \ + python3-protobuf \ + python3-pyyaml \ + rubygem-asciidoctor \ + xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh new file mode 100755 index 0000000000..5fe6995fb9 --- /dev/null +++ b/contrib/dependencies/pacman-packages.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env sh + +pacman -Syu --noconfirm \ + asciidoctor \ + base-devel \ + bash \ + coreutils \ + diffutils \ + git \ + gnutls \ + go \ + iproute2 \ + iptables \ + libaio \ + libbsd \ + libcap \ + libdrm \ + libnet \ + libnl \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + python-importlib-metadata \ + python-pip \ + python-protobuf \ + python-yaml \ + sudo \ + 
tar \ + util-linux \ + util-linux-libs diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 819fda0c38..ed883f3002 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -1,49 +1,12 @@ FROM alpine ARG CC=gcc -RUN apk update && apk add \ - $CC \ - bash \ - build-base \ - coreutils \ - procps \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - nftables \ - nftables-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - py3-pip \ - py3-protobuf \ - python3 \ - sudo \ - libcap-utils \ - libdrm-dev \ - util-linux \ - util-linux-dev - COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date -RUN apk add \ - ip6tables \ - iptables \ - iptables-legacy \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - e2fsprogs \ - py-yaml \ - py3-importlib-metadata \ - asciidoctor +RUN apk add --no-cache "$CC" && /criu/contrib/dependencies/apk-packages.sh + +RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index d4b432f8d6..261bd2d799 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -5,40 +5,11 @@ ARG CC=gcc # Initialize machine ID RUN systemd-machine-id-setup -RUN pacman -Syu --noconfirm \ - $CC \ - bash \ - make \ - coreutils \ - git \ - gnutls \ - libaio \ - libcap \ - libnet \ - libnl \ - nftables \ - pkgconfig \ - protobuf-c \ - protobuf \ - python-pip \ - python-protobuf \ - which \ - sudo \ - iptables \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - python-yaml \ - asciidoctor \ - python-importlib-metadata \ - libdrm \ - util-linux-libs \ - diffutils - COPY . 
/criu WORKDIR /criu + +RUN pacman -Syu --noconfirm "$CC" && contrib/dependencies/pacman-packages.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 deleted file mode 100644 index 5ab6c9cfa4..0000000000 --- a/scripts/build/Dockerfile.centos8 +++ /dev/null @@ -1,48 +0,0 @@ -FROM registry.centos.org/centos/centos:8 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core -RUN yum config-manager --set-enabled powertools -RUN yum install -y --allowerasing \ - asciidoc \ - coreutils \ - chkconfig \ - diffutils \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libselinux-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-devel \ - python3-PyYAML \ - python3-protobuf \ - python3-pip \ - sudo \ - tar \ - which \ - xmlto - -RUN alternatives --set python /usr/bin/python3 -ENV PYTHON=python3 - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index 9d3bb0f879..c26a5fd576 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ b/scripts/build/Dockerfile.fedora.tmpl @@ -1,11 +1,10 @@ ARG CC=gcc -COPY scripts/ci/prepare-for-fedora-rawhide.sh /bin/prepare-for-fedora-rawhide.sh -RUN /bin/prepare-for-fedora-rawhide.sh - COPY . 
/criu WORKDIR /criu +RUN dnf install -y "$CC" && scripts/ci/prepare-for-fedora-rawhide.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index 6caf9d0b1b..cd632dddf5 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,30 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-alpine ARG CC=gcc -RUN apk update && apk add \ - bash \ - build-base \ - coreutils \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - python3 \ - sudo \ - maven \ - ip6tables \ - iptables \ - util-linux-dev \ - bash - COPY . /criu WORKDIR /criu +RUN apk add --no-cache maven "$CC" && contrib/dependencies/apk-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 67de916acb..76aa571fac 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,33 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-focal ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index d218e06414..a37f16e495 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -1,32 +1,10 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - uuid-dev \ - python3-minimal - COPY . /criu WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN uname -m && setarch linux32 uname -m && setarch --list RUN make mrproper && date && \ diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 0ae4727d2c..8254956596 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,34 +1,12 @@ FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl index e95a433067..8933a6c828 100644 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -1,5 +1,3 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 @@ -12,33 +10,6 @@ COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ apt-get update -y -# Install required packages -RUN apt-get install -y --no-install-recommends \ - build-essential \ - pkg-config \ - git \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libnftables-dev:${DEBIAN_ARCH} \ - libgnutls28-dev:${DEBIAN_ARCH} \ - iproute2:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -55,4 +26,6 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 65ae558334..56104081f0 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -1,30 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ stable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libdrm-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -41,6 +18,8 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu +RUN contrib/dependencies/apt-cross-packages.sh + # amdgpu_plugin with armv7 is not supported RUN make mrproper && date && \ make -j $(nproc) && \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 3d6de10441..498b99be9f 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -1,39 +1,11 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install +COPY . 
/criu +WORKDIR /criu # On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default # We need to install kmod to enable iptables to load these modules for us. -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnftables-dev \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - iproute2 \ - kmod \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-minimal \ - python3-protobuf \ - uuid-dev \ - python3-yaml - -COPY . /criu -WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh RUN git clean -dfx && date && \ # Check single object build diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index 3504b0433c..7edb289b6f 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -1,29 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ unstable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -40,4 +18,6 @@ ENV 
PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 566b4c9160..a666f6c262 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,5 +1,5 @@ FROM ubuntu:24.04 -COPY scripts/ci/apt-install /bin/apt-install +COPY contrib/apt-install /bin/apt-install RUN apt-install gcc-multilib diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 3893152270..a420cea942 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,4 +1,4 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 9dc0190b37..ed30e42686 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos8 archlinux +TARGETS := alpine fedora-rawhide archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index ae7f52454d..bc5a746675 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -7,7 +7,7 @@ set -x -e -o pipefail # https://github.com/moby/moby/issues/50750 for details on the bug. 
export DEBIAN_FRONTEND=noninteractive apt remove -y docker-ce docker-ce-cli -./apt-install -y ca-certificates curl +../../contrib/apt-install -y ca-certificates curl install -m 0755 -d /etc/apt/keyrings curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc chmod a+r /etc/apt/keyrings/docker.asc @@ -18,7 +18,7 @@ echo \ apt update -y apt-cache madison docker-ce | awk '{ print $3 }' verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" -./apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" +../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh index 7cf704f074..a5b13a1071 100755 --- a/scripts/ci/java-test.sh +++ b/scripts/ci/java-test.sh @@ -2,6 +2,8 @@ cd ../.. || exit 1 +sudo modprobe iptable_filter + failures="" docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh index d5646468e8..7e00ab65a8 100755 --- a/scripts/ci/loongarch64-qemu-test.sh +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -4,7 +4,7 @@ set -o nounset set -o errexit set -x -./apt-install \ +../../contrib/apt-install \ apt-transport-https \ ca-certificates \ curl \ @@ -19,7 +19,7 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce +../../contrib/apt-install docker-ce # shellcheck source=/dev/null . 
/etc/lsb-release diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8f797c1e5..ff75717c59 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -1,43 +1,22 @@ #!/bin/bash set -e -x +contrib/dependencies/dnf-packages.sh dnf install -y \ diffutils \ + e2fsprogs \ findutils \ gawk \ - gcc \ - git \ - gnutls-devel \ gzip \ - iproute \ - iptables \ - nftables \ - nftables-devel \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libbsd-devel \ + kmod \ libselinux-utils \ - make \ procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-PyYAML \ - python3-protobuf \ python3-pip \ - python3-importlib-metadata \ python-unversioned-command \ redhat-rpm-config \ sudo \ tar \ - which \ - e2fsprogs \ - rubygem-asciidoctor \ - libdrm-devel \ - libuuid-devel \ - kmod + which # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 617f54fc6e..9fbdd8e309 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,12 +1,7 @@ #!/bin/bash set -x -e -CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev - libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev - libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev - libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata libdrm-dev) +CI_PKGS=() X86_64_PKGS=(gcc-multilib) @@ -60,7 +55,8 @@ ci_prep () { CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "${CI_PKGS[@]}" + contrib/dependencies/apt-packages.sh + contrib/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" } @@ -187,7 +183,7 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then done apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "${IA32_PKGS[@]}" + 
contrib/apt-install "${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c222e30e05..f69b113523 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -22,9 +22,8 @@ setup() { wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ - ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ - openssh-client + ../../contrib/apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ + ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} @@ -41,16 +40,13 @@ setup() { vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config - ssh default sudo dnf upgrade -y - ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ - libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' + ssh default sudo dnf upgrade -y + ssh default sudo /vagrant/criu/contrib/dependencies/dnf-packages.sh ssh default cat /proc/cmdline } diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh deleted file mode 100755 index 8be49c7871..0000000000 --- 
a/scripts/install-debian-pkgs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Install required packages for development environment in Debian Distro - -REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} - -help_msg="Install required packages for development environment in Debian Distro -Usage: - scripts/install-debian-pkgs.sh" - -function print_help() -{ - exec echo -e "$help_msg" -} - -function process() -{ - sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" -} - -if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then - print_help -else - process -fi From e22017c8a50283109252570c39d12d9c9f8419a5 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 26 Sep 2025 16:54:49 +0100 Subject: [PATCH 089/137] Remove travis-ci leftovers Travis CI stopped providing CI minutes for open-source projects some time ago and we have migrated to GitHub actions. Signed-off-by: Radostin Stoyanov --- .travis.yml | 35 ----------------------------------- CONTRIBUTING.md | 7 ------- Makefile | 2 +- Makefile.compel | 4 ++-- scripts/ci/Makefile | 4 ++-- scripts/ci/run-ci-tests.sh | 16 +++++++--------- scripts/ci/vagrant.sh | 7 +------ test/inhfd/memfd.py.checkskip | 2 +- test/zdtm/Makefile.inc | 2 +- 9 files changed, 15 insertions(+), 64 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 94841b3f3c..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,35 +0,0 @@ -language: c -os: linux -dist: bionic -services: - - docker -jobs: - include: - - os: linux - arch: ppc64le - env: TR_ARCH=local - dist: bionic - - os: linux - arch: ppc64le - env: TR_ARCH=local CLANG=1 - dist: bionic - - os: linux - arch: s390x - env: TR_ARCH=local - dist: bionic - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local RUN_TESTS=1 - dist: focal - group: edge - virt: vm - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local CLANG=1 RUN_TESTS=1 - group: edge - virt: vm - dist: bionic -script: - - sudo make -C 
scripts/ci $TR_ARCH -after_success: - - make -C scripts/ci after_success diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ad4aa1019..2d1dc8227e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -158,11 +158,6 @@ make test The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. -In case you'd rather have someone else run the tests, you can use travis-ci for your -own GitHub fork of CRIU. It will check the compilation for various supported platforms, -as well as run most of the tests from the suite. See https://travis-ci.org/checkpoint-restore/criu -for more details. - ## Describe your changes Describe your problem. Whether your change is a one-line bug fix or @@ -420,5 +415,3 @@ sometimes a patch may fly around a week before it gets reviewed. Wiki article: [Continuous integration](https://criu.org/Continuous_integration) CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. - -We also recommend you to [enable Travis CI for your repo](https://criu.org/Continuous_integration#Enable_Travis_CI_for_your_repo) to check patches in your git branch, before sending them to the mailing list. diff --git a/Makefile b/Makefile index 3e5d62726c..611bcdd5aa 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ ifeq ($(ARCH),arm) endif ifeq ($(ARMV),8) - # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. + # Running 'setarch linux32 uname -m' returns armv8l on aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. 
ARCHCFLAGS += -march=armv7-a diff --git a/Makefile.compel b/Makefile.compel index 764afadc81..a4209edc5d 100644 --- a/Makefile.compel +++ b/Makefile.compel @@ -50,8 +50,8 @@ compel/plugins/%: $(compel-deps) .FORCE # # GNU make 4.x supports targets matching via wide -# match targeting, where GNU make 3.x series (used on -# Travis) is not, so we have to write them here explicitly. +# match targeting, where GNU make 3.x series is not, +# so we have to write them here explicitly. compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index ed30e42686..bad8065f23 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -30,9 +30,9 @@ endif export CONTAINER_TERMINAL +# Here we assume that any CPU architecture besides x86_64 is running in containers +# that may not support running docker with '--privileged'. ifeq ($(UNAME),x86_64) - # On anything besides x86_64 Travis is running unprivileged LXD - # containers which do not support running docker with '--privileged'. CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run else CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 9fbdd8e309..7a8345b7c0 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -11,13 +11,11 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # For Travis only x86_64 seems to be baremetal. Other - # architectures are running in unprivileged LXD containers. - # That seems to block most of CRIU's interfaces. - - # But with the introduction of baremetal aarch64 systems in - # Travis (arch: arm64-graviton2) we can override this using - # an environment variable + # Some tests rely on kernel features that may not be available + # when running in a container.
Here we assume that x86_64 + # systems are baremetal, and skip the tests for all other + # CPU architectures. We can override this using the RUN_TESTS + # environment variable (e.g., for aarch64). [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi @@ -31,7 +29,7 @@ ci_prep () { # not run anymore with 'sudo -u \#1000' if the UID does not exist. adduser -u 1000 --disabled-password --gecos "criutest" criutest || : - # This can fail on aarch64 travis + # This can fail on aarch64 service apport stop || : # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user @@ -258,7 +256,7 @@ if [ -z "$SKIP_EXT_DEV_TEST" ]; then fi make -C test/others/make/ run CC="$CC" -if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then +if [ -n "$CIRCLECI" ]; then # GitHub Actions (and Cirrus CI) does not provide a real TTY and CRIU will fail with: # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index f69b113523..5f2de32b84 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -11,11 +11,6 @@ FEDORA_VERSION=42 FEDORA_BOX_VERSION=1.1.0 setup() { - if [ -n "$TRAVIS" ]; then - # Load the kvm modules for vagrant to use qemu - modprobe kvm kvm_intel - fi - # Tar up the git checkout to have vagrant rsync it to the VM tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. @@ -29,7 +24,7 @@ setup() { vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. - # Travis VMs should have around 7.5GB. + # VMs in our CI typically have around 16GB. # Increasing it to 4GB should work. 
sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' # Sync /tmp/criu.tar into the VM diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 27e2b7b155..32c57d929c 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -3,5 +3,5 @@ import ctypes libc = ctypes.CDLL(None) -# libc may not have memfd_create (e.g., centos on travis) +# libc may not have memfd_create (e.g., centos) libc.memfd_create("test".encode('utf8'), 0) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index c19888da31..3b349ed4d7 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -27,7 +27,7 @@ ifeq ($(ARCH),arm) else ifeq ($(ARMV),7) ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) - # To build aarch32 on armv8 Travis-CI (see criu Makefile) + # To build aarch32 on armv8 (see criu Makefile) ARCHCFLAGS += -march=armv7-a ARMV := 7 endif From db4388e014e54fe74bac3db7a0a0b63125038f0e Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 26 Sep 2025 23:38:08 +0900 Subject: [PATCH 090/137] ci/java: update base image from focal to jammy Ubuntu Focal Fossa (20.04) reached its end-of-life on 31 May 2025. So, move over to using Ubuntu Jammy (22.04) base images. Also, focal repos do not have libtracefs, which the uprobes zdtm test needs. Signed-off-by: Shashank Balaji --- scripts/build/Dockerfile.hotspot-ubuntu | 2 +- scripts/build/Dockerfile.openj9-ubuntu | 2 +- scripts/ci/run-ci-tests.sh | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 76aa571fac..a459e1ec71 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:11-focal +FROM docker.io/library/eclipse-temurin:11-jammy ARG CC=gcc COPY . 
/criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 8254956596..18664f100a 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-jammy ARG CC=gcc RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 7a8345b7c0..05a3b71e8d 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -11,11 +11,10 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # Some tests rely on kernel features that may not be availble - # when running in a container. Here we assume that x86_64 - # systems are baremetal, and skip the tests for all other - # CPU architectures. We can override this using the RUN_TESTS - # environment variable (e.g., for aarch64). + # Some tests rely on kernel features that may not be available + # when running in a container. Here we assume that x86_64 systems + # are baremetal, and skip the tests for all other CPU architectures. + # The RUN_TESTS environment variable can override this, e.g., for aarch64. [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi From 6d6828415cb0d5fc57dedb9c75ccefc7ca88846a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 27 Sep 2025 09:21:26 +0100 Subject: [PATCH 091/137] ci: consolidate aarch64 tests on GitHub runners Currently we run aarch64 tests on both Cirrus CI and GitHub runners. However, Cirrus CI fails with "Monthly compute limit exceeded!". This change removes the redundant tests to streamline our CI process. 
Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 22 ---------------------- .github/workflows/aarch64-test.yaml | 6 ++++-- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 99dd70d63f..72dbb38981 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -88,28 +88,6 @@ task: build_script: | make -C scripts/ci vagrant-fedora-non-root -task: - name: aarch64 build GCC (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - contrib/apt-install make - make -C scripts/ci local - -task: - name: aarch64 build CLANG (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - contrib/apt-install make - make -C scripts/ci local CLANG=1 - task: name: aarch64 Fedora Rawhide arm_container: diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml index 32b19e1766..ebbecadb33 100644 --- a/.github/workflows/aarch64-test.yaml +++ b/.github/workflows/aarch64-test.yaml @@ -9,14 +9,16 @@ concurrency: jobs: build: - runs-on: ubuntu-24.04-arm strategy: matrix: + os: [ubuntu-24.04-arm, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} + steps: - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} + - name: Run Tests ${{ matrix.target }} on ${{ matrix.os }} # Following tests are failing on the VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) From c398752a43b1bc2ec2be1a021ec97988e4cd6f79 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 1 Oct 2025 11:20:13 +0100 Subject: [PATCH 092/137] contributing: update links to mailing list Our previous mailing list had some technical issues and we created a new one that is hopefully more reliable. 
Signed-off-by: Radostin Stoyanov --- CONTRIBUTING.md | 12 ++++++------ crit/pyproject.toml | 2 +- crit/setup.cfg | 2 +- lib/pyproject.toml | 2 +- lib/setup.cfg | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d1dc8227e..03875639df 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,8 +8,8 @@ Here are some useful hints to get involved. * We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; -* Feedback is expected on the GitHub issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); -* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lists.openvz.org/mailman/listinfo/criu). +* Feedback is expected on the GitHub issues page and on the [mailing list](https://lore.kernel.org/criu); +* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lore.kernel.org/criu). Below we describe in more detail recommend practices for CRIU development. 
* Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); @@ -366,7 +366,7 @@ We recommend to post patches using `git send-email` ``` git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev + --confirm=always --to=criu@lists.linux.dev criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -381,11 +381,11 @@ configure it to point it to your SMTP server with something like: git config --global sendemail.smtpServer stmp.example.net ``` -If you get tired of typing `--to=criu@openvz.org` all the time, +If you get tired of typing `--to=criu@lists.linux.dev` all the time, you can configure that to be automatically handled as well: ``` -git config sendemail.to criu@openvz.org +git config sendemail.to criu@lists.linux.dev ``` If a developer is sending another version of the patch (e.g. to address @@ -398,7 +398,7 @@ version if needed though). ### Mail patches -The patches should be sent to CRIU development mailing list, `criu AT openvz.org`. Note that you need to be subscribed first in order to post. The list web interface is available at https://openvz.org/mailman/listinfo/criu; you can also use standard mailman aliases to work with it. +The patches should be sent to CRIU development mailing list, `criu AT lists.linux.dev`. Note that you need to be subscribed first in order to post. The list web interface is available at https://lore.kernel.org/criu; you can also use standard mailman aliases to work with it. Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). 
diff --git a/crit/pyproject.toml b/crit/pyproject.toml index 9089f0a394..f0b185eb7a 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "crit" description = "CRiu Image Tool" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/crit/setup.cfg b/crit/setup.cfg index fbc9a51439..37895923f3 100644 --- a/crit/setup.cfg +++ b/crit/setup.cfg @@ -7,7 +7,7 @@ name = crit description = CRiu Image Tool author = CRIU team -author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: crit.__version__ diff --git a/lib/pyproject.toml b/lib/pyproject.toml index 8eb4b7084d..c9e11551b0 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "pycriu" description = "Python bindings for CRIU" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/lib/setup.cfg b/lib/setup.cfg index 23ee48dd5b..5d75719ca9 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -7,7 +7,7 @@ name = pycriu description = Python bindings for CRIU author = CRIU team -author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: pycriu.__version__ From 5e574cdc3c50ef41dc919c894561c55bdcbc506d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 2 Oct 2025 08:39:30 +0100 Subject: [PATCH 093/137] page-xfer: fix incompatible pointer type on armv7 page_pipe_read() expects an 'unsigned long *', but pi->nr_pages is u64. On 32-bit platforms (e.g., armv7), passing &pi->nr_pages directly causes a compiler error. To fix this we introduce a temporary variable and copy the result back to pi->nr_pages. 
Fixes: #2756 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/page-xfer.c | 9 +++++++-- criu/pagemap.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index e2913b9244..463d4c506f 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -1139,13 +1139,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) { struct pstree_item *item; struct page_pipe *pp; - unsigned long len; + unsigned long len, nr_pages; int ret; item = pstree_item_by_virt(pi->dst_id); pp = dmpi(item)->mem_pp; - ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &pi->nr_pages, PPB_LAZY); + /* page_pipe_read() uses 'unsigned long *' but pi->nr_pages is u64. + * Use a temporary variable to fix the incompatible pointer type + * on 32-bit platforms (e.g. armv7). */ + nr_pages = pi->nr_pages; + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY); if (ret) return ret; @@ -1154,6 +1158,7 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) * .dst_id all remain intact. 
*/ + pi->nr_pages = nr_pages; if (pi->nr_pages == 0) { pr_debug("no iovs found, zero pages\n"); return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index b6ec3e3332..6c9c4f7feb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lx vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%" PRIx64 " vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } From 0324cb54f7dcb86f8d6b90b74ab7f3318525c542 Mon Sep 17 00:00:00 2001 From: Pepper Gray <111446242+peppergrayxyz@users.noreply.github.com> Date: Tue, 30 Sep 2025 22:58:29 +0200 Subject: [PATCH 094/137] make: prevent redefinition of 'struct sigcontext' Compilation on gentoo/arm64 (llvm+musl) fails with: In file included from compel/include/uapi/compel/asm/sigframe.h:4, from compel/plugins/std/infect.c:14: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ In file included from criu/arch/aarch64/include/asm/restorer.h:4, from criu/arch/aarch64/crtools.c:11: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ This is happening because <asm/sigcontext.h> and <signal.h> are mutually incompatible on Linux. To fix, use <signal.h> instead of <asm/sigcontext.h> for arm64 (like all other arches do).
Fixes: #2766 Signed-off-by: Pepper Gray --- compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h | 3 ++- criu/arch/aarch64/include/asm/restorer.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index 9152024fd8..a3528500db 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -1,10 +1,11 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ -#include +#include #include #include +#include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 64a9c24eb9..2174df4fa1 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ -#include +#include #include #include "asm/types.h" From b281a616f5803496457479c25d3815333a96c53f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 2 Oct 2025 12:03:57 -0700 Subject: [PATCH 095/137] ci: run alpine tests on arm64 These tests reveal the following build error: In file included from compel/include/uapi/compel/asm/sigframe.h:4, from compel/plugins/std/infect.c:14: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ In file included from criu/arch/aarch64/include/asm/restorer.h:4, from criu/arch/aarch64/crtools.c:11: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ Inspired by #2766 / #2767. 
Signed-off-by: Kir Kolyshkin Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 3 ++- contrib/dependencies/apk-packages.sh | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 73530d79ae..0f5c20f48b 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -9,10 +9,11 @@ concurrency: jobs: build: - runs-on: ubuntu-22.04 strategy: matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh index 0084dea3ab..d02704b15c 100755 --- a/contrib/dependencies/apk-packages.sh +++ b/contrib/dependencies/apk-packages.sh @@ -22,6 +22,7 @@ apk add --no-cache \ libnl3-dev \ nftables \ nftables-dev \ + perl \ pkgconfig \ procps \ protobuf-c-compiler \ From cb3d193aeb41f6233fb6f6fd5d7b8645de739223 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 3 Oct 2025 17:02:25 +0100 Subject: [PATCH 096/137] zdtm: add sk-unix-restore-fs-share test Add a ZDTM test case where CRIU uses a helper process to restore a non-empty process group with a terminated leader and a Unix domain socket. 
This reproduces a corner case in which mount namespace switching can fail during restore: https://github.com/checkpoint-restore/criu/issues/2687 Signed-off-by: Qiao Ma Signed-off-by: Radostin Stoyanov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/sk-unix-restore-fs-share.c | 196 ++++++++++++++++++ .../zdtm/static/sk-unix-restore-fs-share.desc | 1 + 3 files changed, 198 insertions(+) create mode 100644 test/zdtm/static/sk-unix-restore-fs-share.c create mode 100644 test/zdtm/static/sk-unix-restore-fs-share.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index e73f964be5..6b262c4439 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -382,6 +382,7 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + sk-unix-restore-fs-share \ mnt_ext_file_bind_auto \ TST_DIR = \ diff --git a/test/zdtm/static/sk-unix-restore-fs-share.c b/test/zdtm/static/sk-unix-restore-fs-share.c new file mode 100644 index 0000000000..d4f6dde75d --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test non-empty process group with terminated parent and unix socket"; +const char *test_author = "Qiao Ma "; + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +static int create_and_connect(void) +{ + struct sockaddr_un addr; + int client_fd; + + client_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (client_fd == -1) { + pr_perror("socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", filename) >= (int)sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + close(client_fd); + return -1; + } + + if (connect(client_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("connect"); + close(client_fd); + return -1; + } + + return 0; +} + +static 
int child(int ready_fd) +{ + int listen_fd; + struct sockaddr_un addr; + int ret = EXIT_FAILURE; + + listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd == -1) { + pr_perror("socket"); + return EXIT_FAILURE; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (strlen(filename) >= sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + goto cleanup; + } + strncpy(addr.sun_path, filename, sizeof(addr.sun_path)); + + unlink(filename); /* Ignore error if file doesn't exist */ + + if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("bind"); + goto cleanup; + } + + if (listen(listen_fd, 5) == -1) { + pr_perror("listen"); + goto cleanup; + } + + if (create_and_connect() != 0) { + pr_err("Failed to create and connect\n"); + goto cleanup; + } + + /* Signal parent that socket is ready */ + if (write(ready_fd, "1", 1) != 1) { + pr_perror("write ready_fd"); + goto cleanup; + } + + /* Wait indefinitely */ + pause(); + + ret = EXIT_SUCCESS; +cleanup: + if (listen_fd != -1) + close(listen_fd); + unlink(filename); + + return ret; +} + +static int zombie_leader(int *cpid) +{ + char buf; + pid_t pid; + int pipefd[2]; + + if (pipe(pipefd) == -1) { + pr_perror("pipe"); + return EXIT_FAILURE; + } + + if (setpgid(0, 0) == -1) { + pr_perror("setpgid"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork child"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /* Close read end */ + close(pipefd[0]); + exit(child(pipefd[1])); + } + + /* Close write end in parent */ + close(pipefd[1]); + + /* Wait for child to set up socket */ + if (read(pipefd[0], &buf, 1) != 1) { + pr_err("Failed to receive readiness signal from child\n"); + close(pipefd[0]); + return EXIT_FAILURE; + } + close(pipefd[0]); + + *cpid = pid; + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + int ret = EXIT_FAILURE, status; + pid_t pid; + int *cpid; + + test_init(argc, argv); + + cpid = mmap(NULL, 
sizeof(int), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (cpid == MAP_FAILED) { + pr_perror("mmap"); + return EXIT_FAILURE; + } + *cpid = 0; + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork zombie"); + goto out; + } + + if (pid == 0) + exit(zombie_leader(cpid)); + + if (waitpid(pid, &status, 0) < 0) { + pr_perror("Failed to waitpid zombie"); + goto out; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS) { + pr_err("Unexpected exit code: %d\n", WEXITSTATUS(status)); + goto out; + } + + if (!*cpid) { + pr_err("Don't know grandchild's pid\n"); + goto out; + } + + test_daemon(); + test_waitsig(); + + ret = EXIT_SUCCESS; + pass(); +out: + /* Clean up */ + if (*cpid) + kill(*cpid, SIGKILL); + + munmap(cpid, sizeof(int)); + + return ret; +} diff --git a/test/zdtm/static/sk-unix-restore-fs-share.desc b/test/zdtm/static/sk-unix-restore-fs-share.desc new file mode 100644 index 0000000000..6c4afe5f03 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} From eed7b39840afcb72d696b46a0272ec6dd7e42f03 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 11:00:07 +0900 Subject: [PATCH 097/137] vma: introduce VMA_AREA_UPROBES flag This flag will be used for a "[uprobes]" vma. Signed-off-by: Shashank Balaji --- criu/include/image.h | 7 +++++++ criu/util.c | 1 + 2 files changed, 8 insertions(+) diff --git a/criu/include/image.h b/criu/include/image.h index 934f7d4e97..b5951d3d49 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -74,6 +74,12 @@ * about virtual address space ranges covered by * MADV_GUARD_INSTALL guards. These ones must be always at * the end of the vma_area_list and properly skipped a.e. + * - uprobes + * stands for a "[uprobes]" vma that's automatically mapped by + * the kernel when an active uprobe is hit. 
Contents of this vma + * are not dumped and neither are its madvise bits restored, + * because the kernel is in complete control of this vma. This is + * just used to track the existence of the uprobes vma. */ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -94,6 +100,7 @@ #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) #define VMA_AREA_GUARD (1 << 16) +#define VMA_AREA_UPROBES (1 << 17) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/util.c b/criu/util.c index 58c18e20be..e2f80e4c61 100644 --- a/criu/util.c +++ b/criu/util.c @@ -195,6 +195,7 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); + opt2s(VMA_AREA_UPROBES, "uprobes"); #undef opt2s } From 808e8140bdc084fccde2b22d4eb55baccae496f3 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 10:53:18 +0900 Subject: [PATCH 098/137] criu-coredump: add VMA_AREA_UPROBES flag Signed-off-by: Shashank Balaji --- coredump/criu_coredump/coredump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index c6a758c8ad..9454d8f0bb 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -55,6 +55,7 @@ "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, "VMA_AREA_MEMFD": 1 << 14, + "VMA_AREA_UPROBES": 1 << 17, "VMA_AREA_UNSUPP": 1 << 31 } From 00f3c0515a7cf88ea1152e17192fe74d681a4ac7 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 10:54:28 +0900 Subject: [PATCH 099/137] crit: add VMA_AREA_UPROBES flag Signed-off-by: Shashank Balaji --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 6c4f688896..a35dd3c3fc 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -105,6 +105,7 @@ def 
_custom_conv(field): ('VMA_AREA_AIORING', 1 << 13), ('VMA_AREA_MEMFD', 1 << 14), ('VMA_AREA_SHSTK', 1 << 15), + ('VMA_AREA_UPROBES', 1 << 17), ('VMA_UNSUPP', 1 << 31), ] From 1ac0c086c566d7a2eb0121f7dbbcec60f5a5a6be Mon Sep 17 00:00:00 2001 From: "Mahadasyam, Shashank (SGC)" Date: Mon, 18 Aug 2025 01:03:39 +0900 Subject: [PATCH 100/137] vma: introduce --allow-uprobes option This commit teaches criu to deal with processes which have a "[uprobes]" vma. This vma is mapped by the kernel when execution hits a uprobe location. This is done so as to execute the uprobe'd instruction out-of-line in the special vma. The uprobe'd location is replaced by a software breakpoint instruction, which is int3 on x86. When execution reaches that location, control is transferred over to the kernel, which then executes whatever handler code it has to, for the uprobe, and then executes the replaced instruction out-of-line in the special vma. For more details, refer to this commit: https://github.com/torvalds/linux/commit/d4b3b6384f98f8692ad0209891ccdbc7e78bbefe Reason for adding a new option ------------------------------ A new option is added instead of making the uprobes vma handling transparent to the user, so that when a dump is attempted on a process tree in which a process has the uprobes vma, criu will error, asking the user to use this option. This gives the user a chance to check what uprobes are attached to the processes being dumped, and try to ensure that those uprobes are active on restore as well. Again, the same reason for requiring this option on restore as well. Because if a process is dumped with an active uprobe, and on restore if the uprobe is not active, then if execution reaches the uprobe location, then the process will be sent a SIGTRAP, whose default behaviour will terminate and core dump the process. This is because the code pages are dumped with the software breakpoint instruction replacement at the uprobe'd locations.
On restore, if execution reaches these locations and the kernel sees no associated active uprobes, then it'll send a SIGTRAP. So, using this option on dump and restore is an implicit guarantee on the user's behalf that they'll take care of the active uprobes and that any future SIGTRAPs because of this are not on us! :) Handling uprobes vma on dump ---------------------------- We don't need to store any information about the uprobes vma because it's completely handled by the kernel, transparent to userspace. So, when a uprobes vma is detected, we check if the --allow-uprobes option was specified or not. If so, then the allow_uprobes boolean in the inventory image is set (this is used on restore). The uprobes vma is skipped from being added to the vma list. Handling uprobes vma on restore ------------------------------- If allow_uprobes is set in the inventory image, then check if --allow-uprobes is specified or not. Restoring the vma is not required. Fixes: checkpoint-restore#1961 Signed-off-by: Shashank Balaji --- criu/config.c | 2 ++ criu/cr-dump.c | 4 ++++ criu/crtools.c | 2 ++ criu/image.c | 5 +++++ criu/include/cr_options.h | 1 + criu/include/image.h | 2 ++ criu/include/proc_parse.h | 2 ++ criu/proc_parse.c | 24 +++++++++++++++++++++++- images/inventory.proto | 1 + 9 files changed, 42 insertions(+), 1 deletion(-) diff --git a/criu/config.c b/criu/config.c index 1322a490ab..d7ef3f8e8b 100644 --- a/criu/config.c +++ b/criu/config.c @@ -18,6 +18,7 @@ #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" +#include "image.h" #include "irmap.h" #include "mount.h" #include "mount-v2.h" @@ -703,6 +704,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), BOOL_OPT("unprivileged", &opts.unprivileged), BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), {}, }; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index
10c485cbe9..60b8e793c9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2319,6 +2319,10 @@ int cr_dump_tasks(pid_t pid) goto err; he.has_pre_dump_mode = false; + if (found_uprobes_vma()) { + he.has_allow_uprobes = true; + he.allow_uprobes = true; + } ret = write_img_inventory(&he); if (ret) diff --git a/criu/crtools.c b/criu/crtools.c index 509e73d741..203bded811 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -427,6 +427,8 @@ int main(int argc, char *argv[], char *envp[]) " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" " consult documentation for further details\n" + " --allow-uprobes allow dump/restore with uprobes vma\n" + " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/image.c b/criu/image.c index f3747d6ff5..c4f05e1597 100644 --- a/criu/image.c +++ b/criu/image.c @@ -95,6 +95,11 @@ int check_img_inventory(bool restore) goto out_err; } + if (restore && he->allow_uprobes && !opts.allow_uprobes) { + pr_err("Dumped with --" OPT_ALLOW_UPROBES ". 
Need to set it on restore as well.\n"); + goto out_err; + } + if (restore) { if (!he->has_network_lock_method) { /* diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 4df8056b7b..8c5707b415 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -196,6 +196,7 @@ struct cr_options { char *work_dir; int network_lock_method; int skip_file_rwx_check; + int allow_uprobes; /* * When we scheduler for removal some functionality we first diff --git a/criu/include/image.h b/criu/include/image.h index b5951d3d49..b06dbf7062 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -114,6 +114,8 @@ #define CR_PARENT_LINK "parent" +#define OPT_ALLOW_UPROBES "allow-uprobes" + extern bool ns_per_id; extern bool img_common_magic; diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0bd79bf553..76d3242d2b 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -105,4 +105,6 @@ extern int parse_uptime(uint64_t *upt); extern int parse_timens_offsets(struct timespec *boff, struct timespec *moff); +extern bool found_uprobes_vma(void); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/criu/proc_parse.c b/criu/proc_parse.c index d7eb256626..0d3b5b23f1 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -74,6 +74,8 @@ struct buffer { static struct buffer __buf; static char *buf = __buf.buf; +/* only ever goes from false to true, if at all */ +static bool uprobes_vma_exists = false; /* * This is how AIO ring buffers look like in proc @@ -202,8 +204,11 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + * + * The uprobes vma is also mapped by the kernel with VM_IO, among other flags */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) + if (io_pf && 
!vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED) + && !vma_area_is(vma_area, VMA_AREA_UPROBES)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) @@ -603,6 +608,14 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + } else if (!strcmp(file_path, "[uprobes]")) { + uprobes_vma_exists = true; + if (!opts.allow_uprobes) { + pr_err("PID %d has uprobes vma. Consider using --" OPT_ALLOW_UPROBES ".\n", + pid); + goto err; + } + vma_area->e->status |= VMA_AREA_UPROBES; } else { vma_area->e->status = VMA_AREA_REGULAR; } @@ -739,6 +752,10 @@ static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area */ pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n", vma_area->e->start, vma_area->e->end); + } else if (vma_area->e->status & VMA_AREA_UPROBES) { + pr_debug("Skipping uprobes vma %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, + vma_area->e->end); + return 0; } else if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, vma_area->e->end); @@ -2929,3 +2946,8 @@ int parse_uptime(uint64_t *upt) fclose(f); return 0; } + +bool found_uprobes_vma(void) +{ + return uprobes_vma_exists; +} diff --git a/images/inventory.proto b/images/inventory.proto index 1e18815bb9..feed5b8509 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -33,4 +33,5 @@ message inventory_entry { // This is currently used to delete the correct nftables // network locking rule. 
optional string dump_criu_run_id = 13; + optional bool allow_uprobes = 14; } From 4082da91ea692589a1f566d8e9345e9cd092bdc6 Mon Sep 17 00:00:00 2001 From: "Mahadasyam, Shashank (SGC)" Date: Mon, 18 Aug 2025 01:04:10 +0900 Subject: [PATCH 101/137] docs: add documentation for --allow-uprobes Signed-off-by: Shashank Balaji --- Documentation/criu.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 606935790b..40ede84e25 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -465,6 +465,30 @@ The 'mode' may be one of the following: *skip*::: Don't lock the network. If *--tcp-close* is not used, the network must be locked externally to allow CRIU to dump TCP connections. +*--allow-uprobes*:: + Allow dumping when uprobes vma is present. When used on dump, this option is + required on restore as well. + + A uprobes vma is automatically created by the kernel once a uprobe is + triggered. This mapping is not removed even once the uprobe is deleted. So, + even if a process once had uprobes attached to it, and they're removed by + the time the process is dumped, this option is still required because criu + has no way of knowing whether there are active uprobes or not. + + When using this option on restore, make sure the uprobes (if any) active on + the dumped processes are still active. Otherwise, when execution reaches + a uprobe'd location in any of the restored processes, that process will be + sent a SIGTRAP. + + As an example, say a uprobe is set at function foo in the executable of the + process p_bar. Whenever execution in p_bar reaches function foo, the uprobe + is triggered. If the uprobe has been triggered at least once, then the kernel + will have created the uprobes vma. To dump p_bar, this option is + necessary. After dumping, say the uprobe is deleted. 
Now, on restoring with + this option, once execution reaches function foo, SIGTRAP will be sent to + the restored p_bar. Unless it has a signal handler installed for SIGTRAP, + it will be terminated and core dumped. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. @@ -692,6 +716,10 @@ The 'mode' may be one of the following: *--skip-file-rwx-check*:: Skip checking file permissions (r/w/x for u/g/o) on restore. +*--allow-uprobes*:: + Required when dumped with this option. Refer to this option in the section + on dumping for more details. + *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to From 40efe831a5d7f5b2c25718579deb1989509feb99 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 20 Aug 2025 22:05:03 +0900 Subject: [PATCH 102/137] crtools: remove "consult documentation" Most people know this, don't they? :) Suggested-by: Radostin Stoyanov Signed-off-by: Shashank Balaji --- criu/crtools.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index 203bded811..e207133ac0 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -426,9 +426,7 @@ int main(int argc, char *argv[], char *envp[]) " --network-lock METHOD network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" - " consult documentation for further details\n" " --allow-uprobes allow dump/restore with uprobes vma\n" - " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" From 832a64e85f7ed0721d4ab4198260b9145063ab98 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 22 Aug 2025 12:47:16 +0900 Subject: [PATCH 103/137] zdtm: add a test for --allow-uprobes option Program flow: - Parse the test's own executable to calculate the file offset of the uprobe target function symbol - Enable the uprobe at the target function - Call 
the target function to trigger the uprobe, and hence the uprobes vma creation - C/R - Call the target function again to check that no SIGTRAP is sent, since the uprobe is still active At least v1.7 of libtracefs is required because that's when tracefs_instance_reset was introduced. The uprobes API was introduced in v1.4, and the dynamic events API was introduced in v1.3. Ubuntu Focal doesn't have libtracefs. Jammy has v1.2.5, and Noble has v1.7. Signed-off-by: Shashank Balaji --- contrib/dependencies/apk-packages.sh | 3 + contrib/dependencies/apt-cross-packages.sh | 5 +- contrib/dependencies/apt-packages.sh | 3 + contrib/dependencies/dnf-packages.sh | 5 +- contrib/dependencies/pacman-packages.sh | 3 + test/zdtm/static/Makefile | 9 +- test/zdtm/static/uprobes.c | 295 +++++++++++++++++++++ test/zdtm/static/uprobes.desc | 6 + 8 files changed, 326 insertions(+), 3 deletions(-) create mode 100644 test/zdtm/static/uprobes.c create mode 100644 test/zdtm/static/uprobes.desc diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh index d02704b15c..c47fb9fe07 100755 --- a/contrib/dependencies/apk-packages.sh +++ b/contrib/dependencies/apk-packages.sh @@ -6,6 +6,7 @@ apk add --no-cache \ build-base \ coreutils \ e2fsprogs \ + elfutils-dev \ git \ gnutls-dev \ go \ @@ -20,6 +21,8 @@ apk add --no-cache \ libdrm-dev \ libnet-dev \ libnl3-dev \ + libtraceevent-dev \ + libtracefs-dev \ nftables \ nftables-dev \ perl \ diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh index 588be40d02..30ce6874c8 100755 --- a/contrib/dependencies/apt-cross-packages.sh +++ b/contrib/dependencies/apt-cross-packages.sh @@ -14,6 +14,8 @@ fi libc6-"${DEBIAN_ARCH}"-cross \ libc6-dev-"${DEBIAN_ARCH}"-cross \ libcap-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + libelf-dev:"${DEBIAN_ARCH}" \ libexpat1-dev:"${DEBIAN_ARCH}" \ libgnutls28-dev:"${DEBIAN_ARCH}" \ libnet-dev:"${DEBIAN_ARCH}" \ @@ -23,9 +25,10 @@ fi 
libprotobuf-c-dev:"${DEBIAN_ARCH}" \ libprotobuf-dev:"${DEBIAN_ARCH}" \ libssl-dev:"${DEBIAN_ARCH}" \ + libtraceevent-dev:"${DEBIAN_ARCH}" \ + libtracefs-dev:"${DEBIAN_ARCH}" \ ncurses-dev:"${DEBIAN_ARCH}" \ uuid-dev:"${DEBIAN_ARCH}" \ - libdrm-dev:"${DEBIAN_ARCH}" \ build-essential \ pkg-config \ git \ diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh index c60ba9041c..1fd42d4e68 100755 --- a/contrib/dependencies/apt-packages.sh +++ b/contrib/dependencies/apt-packages.sh @@ -19,6 +19,7 @@ fi libbsd-dev \ libcap-dev \ libdrm-dev \ + libelf-dev \ libgnutls28-dev \ libgnutls30 \ libnet-dev \ @@ -28,6 +29,8 @@ fi libprotobuf-c-dev \ libprotobuf-dev \ libselinux-dev \ + libtraceevent-dev \ + libtracefs-dev \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index efbb659c54..00dc91a2e8 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -3,6 +3,7 @@ dnf install -y \ asciidoc \ binutils \ + elfutils-libelf-devel \ gcc \ git \ glibc-devel \ @@ -18,6 +19,8 @@ dnf install -y \ libnet-devel \ libnl3-devel \ libselinux-devel \ + libtraceevent-devel \ + libtracefs-devel \ libuuid-devel \ make \ nftables \ @@ -27,9 +30,9 @@ dnf install -y \ protobuf-c-devel \ protobuf-compiler \ protobuf-devel \ - python-devel \ python3-importlib-metadata \ python3-protobuf \ python3-pyyaml \ + python-devel \ rubygem-asciidoctor \ xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh index 5fe6995fb9..260797606b 100755 --- a/contrib/dependencies/pacman-packages.sh +++ b/contrib/dependencies/pacman-packages.sh @@ -15,8 +15,11 @@ pacman -Syu --noconfirm \ libbsd \ libcap \ libdrm \ + libelf \ libnet \ libnl \ + libtraceevent \ + libtracefs \ nftables \ pkg-config \ protobuf \ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6b262c4439..ea901a805d 
100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -290,6 +290,7 @@ TST_NOFILE := \ PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') +pkg-config-atleast-version = $(shell sh -c '$(PKG_CONFIG) --atleast-version=$(2) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ @@ -298,7 +299,10 @@ endif ifneq ($(ARCH),arm) ifneq ($(COMPAT_TEST),y) - TST_NOFILE += maps03 + TST_NOFILE += maps03 +ifeq ($(call pkg-config-atleast-version,libtracefs,1.7),y) + TST_NOFILE += uprobes +endif endif endif @@ -727,6 +731,9 @@ sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 cgroupv2_01: LDLIBS += -pthread +uprobes: CFLAGS += $(call pkg-cflags, libtracefs libtraceevent) +uprobes: LDLIBS += $(call pkg-libs, libtracefs libelf) + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c new file mode 100644 index 0000000000..4164375b7a --- /dev/null +++ b/test/zdtm/static/uprobes.c @@ -0,0 +1,295 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test the --allow-uprobes option"; +const char *test_author = "Shashank Balaji "; + +#define UPROBE_GROUP_NAME "zdtm" +#define UPROBE_EVENT_NAME "uprobes_test" +#define UPROBED_FUNCTION uprobe_target + +/* + * A uprobe can be set at the start of a function, but not all instructions + * will trigger the creation of a uprobes vma. 
+ * + * Examples: + * - aarch64: if the function is a single `ret`, then no vma creation + * - x64: if the function is `nop; ret`, then no vma creation + * + * So to guarantee vma creation, create a volatile dummy variable (to prevent + * compiler optimization) and use it (to prevent "unused variable" warning) + */ +void UPROBED_FUNCTION(void) { + volatile int dummy = 0; + dummy += 1; +} +/* Calling via volatile function pointer ensures noinline at callsite */ +typedef void (*func_ptr)(void); +volatile func_ptr uprobe_target_alias = UPROBED_FUNCTION; + +struct uprobe_context { + struct tracefs_instance *instance; + struct tracefs_dynevent *uprobe; +}; + +volatile bool got_sigtrap = false; + +/* + * Returns the file offset of a symbol in the executable of this program + * Returns 0 on failure +*/ +uint64_t calc_sym_offset(const char *sym_name) +{ + GElf_Shdr section_header; + Elf_Scn *section = NULL; + Elf_Data *symtab_data; + uint64_t offset = 0; + char buf[PATH_MAX]; + GElf_Sym symbol; + ssize_t n_bytes; + int n_entries; + Elf *elf; + int fd; + int i; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_err("ELF version of libelf is lower than that of the program\n"); + return 0; + } + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 0; + } + buf[n_bytes] = '\0'; + + fd = open(buf, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open self-executable"); + return 0; + } + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_err("%s\n", elf_errmsg(elf_errno())); + goto out_fd; + } + + /* Look for the symbol table section and its header */ + while ((section = elf_nextscn(elf, section)) != NULL) { + gelf_getshdr(section, &section_header); + if (section_header.sh_type == SHT_SYMTAB) + break; + } + if (!section) { + pr_err("Failed to find symbol table\n"); + goto out_elf; + } + symtab_data = elf_getdata(section, NULL); + n_entries = section_header.sh_size /
section_header.sh_entsize; + + /* Look for a symbol with the required name */ + for (i = 0; i < n_entries; i++) { + gelf_getsym(symtab_data, i, &symbol); + /* Symbol table's sh_link is the index of the string table section header */ + if (!strcmp(sym_name, + elf_strptr(elf, section_header.sh_link, symbol.st_name))) + break; + } + if (i == n_entries) { + pr_err("Failed to find symbol \"%s\"\n", sym_name); + goto out_elf; + } + + /* Get the section the symbol belongs to (mostly .text) */ + section = elf_getscn(elf, symbol.st_shndx); + gelf_getshdr(section, &section_header); + offset = symbol.st_value - section_header.sh_addr + section_header.sh_offset; + +out_elf: + elf_end(elf); +out_fd: + close(fd); + return offset; +} + +/* + * Set and enable a uprobe on the file at the given offset + * Returns struct uprobe_context with members set to NULL on failure +*/ +struct uprobe_context enable_uprobe(const char *file, uint64_t offset) +{ + struct tracefs_instance *trace_instance; + struct tracefs_dynevent *uprobe; + struct uprobe_context context = {}; + + trace_instance = tracefs_instance_create("zdtm_uprobes_test"); + if (!trace_instance) { + pr_perror("Failed to create tracefs instance"); + return context; + } + tracefs_instance_reset(trace_instance); + + uprobe = tracefs_uprobe_alloc(UPROBE_GROUP_NAME, UPROBE_EVENT_NAME, file, offset, NULL); + if (!uprobe) { + pr_perror("Failed to allocate uprobe"); + goto instance_destroy; + } + + if (tracefs_dynevent_create(uprobe)) { + pr_perror("Failed to create uprobe"); + goto uprobe_free; + } + + if (tracefs_event_enable(trace_instance, UPROBE_GROUP_NAME, UPROBE_EVENT_NAME)) { + pr_perror("Failed to enable uprobe"); + goto uprobe_destroy; + } + + context.instance = trace_instance; + context.uprobe = uprobe; + return context; + +uprobe_destroy: + tracefs_dynevent_destroy(uprobe, false); +uprobe_free: + tracefs_dynevent_free(uprobe); +instance_destroy: + tracefs_instance_destroy(trace_instance); + tracefs_instance_free(trace_instance);
+ return context; +} + +void destroy_uprobe(struct uprobe_context context) +{ + tracefs_dynevent_destroy(context.uprobe, true); + tracefs_dynevent_free(context.uprobe); + tracefs_instance_destroy(context.instance); + tracefs_instance_free(context.instance); +} + +/* + * Check for the existence of the "[uprobes]" vma in /proc/self/maps + * Returns -1 on failure, 0 if not found, 1 if found +*/ +int uprobes_vma_exists(void) +{ + FILE *f; + char buf[LINE_MAX]; + int ret = 0; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + pr_perror("Failed to open /proc/self/maps"); + return -1; + } + + while (fgets(buf, sizeof(buf), f)) { + if (strstr(buf, "[uprobes]")) { + ret = 1; + break; + } + } + if (ret == 0 && !feof(f)) { + pr_err("Failed to finish reading /proc/self/maps\n"); + ret = -1; + } + + fclose(f); + return ret; +} + +/* + * SIGTRAP is sent if execution reaches a previously set uprobed location, and + * the corresponding uprobe is not active. We don't want this to happen on restore +*/ +void sigtrap_handler(int signo, siginfo_t *info, void* context) +{ + if (info->si_code == SI_KERNEL) { + got_sigtrap = true; + fail("SIGTRAP on attempting to call uprobed function"); + } +} + +int main(int argc, char **argv) +{ + struct uprobe_context context; + struct sigaction sa; + char buf[PATH_MAX]; + uint64_t offset; + int n_bytes; + int ret = 1; + + test_init(argc, argv); + + offset = calc_sym_offset(__stringify(UPROBED_FUNCTION)); + if (!offset) + return 1; + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 1; + } + buf[n_bytes] = '\0'; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = sigtrap_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGTRAP, &sa, NULL)) { + pr_perror("Failed to set SIGTRAP handler"); + return 1; + } + + context = enable_uprobe(buf, offset); + if (!context.instance) + return 1; + + /* + * Execution must reach the uprobed location at least once + * for the 
kernel to create the uprobes vma + */ + uprobe_target_alias(); + + switch (uprobes_vma_exists()) { + case -1: + goto out_uprobe; + break; + case 0: + pr_err("uprobes vma does not exist\n"); + goto out_uprobe; + break; + case 1: + test_msg("Found uprobes vma\n"); + break; + } + + test_daemon(); + test_waitsig(); + + /* + * Calling the uprobed function after restore should not cause + * a SIGTRAP, since the uprobe is still active + */ + uprobe_target_alias(); + if (!got_sigtrap) { + pass(); + ret = 0; + } + +out_uprobe: + destroy_uprobe(context); + return ret; +} diff --git a/test/zdtm/static/uprobes.desc b/test/zdtm/static/uprobes.desc new file mode 100644 index 0000000000..6eab1f4982 --- /dev/null +++ b/test/zdtm/static/uprobes.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'cgroupns', + 'flags': 'suid nouser', + 'flavor': 'h', + 'opts': '--allow-uprobes' +} From 9889409fc93fe7483b77894073c67f8a515226e2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 10 Sep 2025 10:50:46 +0100 Subject: [PATCH 104/137] cr-service: refactor rpc config parsing When an additional configuration file is specified via RPC, this file is parsed twice: first at an early stage to load options such as --log-file, --work-dir, and --images-dir; and again after all RPC options and configuration files have been evaluated. This allows users to overwrite options specified via RPC by the container runtime (e.g., --tcp-established). However, processing the RPC config file twice leads to silently duplicating the values of repeatable options such as `--action-script`. To address this problem, we adjust the order of options parsing so that the RPC config file is evaluated only once. This change should not introduce any functional changes. Note that this change does not affect the logging functionality, as early log messages are temporarily buffered and only written to the log file once it has been initialized (see commit 1ff2333 "Printout early log messages"). 
Fixes #2727 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 299 +++++++++++++++++++++------------------------- 1 file changed, 138 insertions(+), 161 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index a1089ad5c7..e6aac232e7 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -312,156 +312,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; - /* - * Evaluate an additional configuration file if specified. - * This needs to happen twice, because it is needed early to detect - * things like work_dir, imgs_dir and logfile. The second parsing - * of the optional RPC configuration file happens at the end and - * overwrites all options set via RPC. - */ - if (req->config_file) { - char *tmp_output = opts.output; - char *tmp_work = opts.work_dir; - char *tmp_imgs = opts.imgs_dir; - - opts.output = NULL; - opts.work_dir = NULL; - opts.imgs_dir = NULL; - - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) { - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - goto err; - } - /* If this is non-NULL, the RPC configuration file had a value, use it.*/ - if (opts.output) - output_changed_by_rpc_conf = true; - /* If this is NULL, use the old value if it was set. */ - if (!opts.output && tmp_output) { - opts.output = tmp_output; - tmp_output = NULL; - } - - if (opts.work_dir) - work_changed_by_rpc_conf = true; - if (!opts.work_dir && tmp_work) { - opts.work_dir = tmp_work; - tmp_work = NULL; - } - - if (opts.imgs_dir) - imgs_changed_by_rpc_conf = true; - /* - * As the images directory is a required RPC setting, it is not - * necessary to use the value from other configuration files. - * Either it is set in the RPC configuration file or it is set - * via RPC. 
- */ - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - } - - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. - * The idea is that only the RPC configuration file is able to - * overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else if (req->images_dir_fd != -1) - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - else if (req->images_dir) - strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - else { - pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); - goto err; - } - - if (req->parent_img) - SET_CHAR_OPTS(img_parent, req->parent_img); - - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - /* chdir to work dir */ - if (work_changed_by_rpc_conf) - /* Use the value from the RPC configuration file first. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - /* Use the value set via RPC. 
*/ - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - /* Use the value from one of the other configuration files. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - /* Use the images directory a work directory. */ - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } - - /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. - */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { - xfree(opts.output); - opts.output = NULL; - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); - goto err; - } - - if (req->config_file) { - pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); - } - if (req->has_unprivileged) opts.unprivileged = req->unprivileged; @@ -753,14 +603,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } - - if (req->n_irmap_scan_paths) { - for (i = 0; i < req->n_irmap_scan_paths; i++) { - if (irmap_scan_path_add(req->irmap_scan_paths[i])) - goto err; - } - } - if (req->has_status_fd) { pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); @@ -781,13 +623,148 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_display_stats) opts.display_stats = req->display_stats; - /* Evaluate additional configuration file a second time to overwrite - * all 
RPC settings. */ + /* Evaluate additional configuration file (e.g., runc.conf) to overwrite all RPC settings. */ if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + + opts.output = NULL; + opts.work_dir = NULL; + + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. + */ + xfree(opts.imgs_dir); + opts.imgs_dir = NULL; + + pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + rpc_cfg_file = req->config_file; i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) + if (i) { + xfree(tmp_output); + xfree(tmp_work); goto err; + } + + /* If opts.{output,work_dir} is non-NULL, the RPC configuration file had a value, use it.*/ + /* If opts.{output,work_dir} is NULL, use the old value if it was set. */ + if (opts.output) { + output_changed_by_rpc_conf = true; + } else { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) { + work_changed_by_rpc_conf = true; + } else { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + + xfree(tmp_output); + xfree(tmp_work); + } + + /* + * open images_dir - images_dir_fd is a required RPC parameter + * + * This assumes that if opts.imgs_dir is set we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + } else if (req->images_dir_fd != -1) { + sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + } else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + goto err; + } + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + goto err; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + goto err; + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + goto err; + } + + if (req->n_irmap_scan_paths) { + for (i = 0; i < req->n_irmap_scan_paths; i++) { + if (irmap_scan_path_add(req->irmap_scan_paths[i])) + goto err; + } + } + + /* initiate log file in work dir */ + if (req->log_file && !output_changed_by_rpc_conf) { + /* + * If RPC sets a log file and if there nothing from the + * RPC configuration file, use the RPC value. 
+ */ + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + goto err; + } + + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + /* This is needed later to correctly set the log_level */ + opts.log_level = req->log_level; + log_set_loglevel(req->log_level); + if (log_init(opts.output) == -1) { + pr_perror("Can't initiate log"); + goto err; } if (req->mntns_compat_mode) From 43aa2f8d27f3c60ac04b73293ae9e660d2fde9ed Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 3 Sep 2025 18:29:34 +0100 Subject: [PATCH 105/137] test/others/rpc: parse action-script via config Extend the test for overwriting config options via RPC with repeatable option (--action-script) and verify that the value will not be silently duplicated. Signed-off-by: Radostin Stoyanov --- test/others/rpc/Makefile | 1 + test/others/rpc/action-script.sh | 17 +++++++++++++++++ test/others/rpc/config_file.py | 11 +++++++++++ 3 files changed, 29 insertions(+) create mode 100755 test/others/rpc/action-script.sh diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index 384eb05397..c0e56d5289 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -12,6 +12,7 @@ run: all chmod a+rwx build chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + rm -f build/_marker_* @# Create all log files to be accessible for anybody @# so that they can be displayed by any user. 
for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ diff --git a/test/others/rpc/action-script.sh b/test/others/rpc/action-script.sh new file mode 100755 index 0000000000..991e315de4 --- /dev/null +++ b/test/others/rpc/action-script.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +MARKER_FILE="_marker_${CRTOOLS_SCRIPT_ACTION}" + +if [ -z "$CRTOOLS_SCRIPT_ACTION" ]; then + echo "Error: CRTOOLS_SCRIPT_ACTION is not set." + exit 2 +fi + +if [ ! -f "$MARKER_FILE" ]; then + touch "$MARKER_FILE" +else + echo "Error: Running the same action hook for the second time" + exit 1 +fi + +exit 0 diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 6cffe270d0..f5ec408187 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -13,6 +13,9 @@ log_file = 'config_file_test.log' does_not_exist = 'does-not.exist' +script_path = os.path.dirname(os.path.abspath(__file__)) +action_script_file = os.path.join(script_path, 'action-script.sh') + def setup_config_file(content): # Creating a temporary file which will be used as configuration file. @@ -156,6 +159,7 @@ def test_rpc_with_configuration_file_overwriting_rpc(): # file settings in the default configuration. 
log = does_not_exist content = 'log-file ' + log + '\n' + content += 'action-script ' + action_script_file + '\n' content += 'no-tcp-established\nno-shell-job' path = setup_config_file(content) # Only set the configuration file via RPC; @@ -180,11 +184,18 @@ def test_rpc_with_configuration_file_overwriting_rpc(): cleanup_output(args['dir']) +print("*** Test broken config file ***") test_broken_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC without config file ***") test_rpc_without_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC with config file ***") test_rpc_with_configuration_file() cleanup_output(args['dir']) + +print("*** Test configuration file overwriting RPC ***") test_rpc_with_configuration_file_overwriting_rpc() cleanup_output(args['dir']) From 9249cb78199d5e9783b4e846e51b9a1b96959080 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 3 Sep 2025 21:40:02 +0100 Subject: [PATCH 106/137] test/others/rpc: show logs on error Signed-off-by: Radostin Stoyanov --- test/others/rpc/config_file.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index f5ec408187..c1a8276d86 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -92,29 +92,37 @@ def test_broken_configuration_file(): sys.exit(-1) -def search_in_log_file(log, message): - with open(os.path.join(args['dir'], log)) as f: +def search_in_log_file(log_path, message): + with open(log_path) as f: if message not in f.read(): - print( - 'FAIL: Missing the expected error message (%s) in the log file' - % message) + print('FAIL: Missing the expected error message (%s) in the log file' % message) sys.exit(-1) +def print_log_file(log_path): + print("\n--- Begin log file: %s ---" % log_path) + with open(log_path, 'r') as f: + print(f.read()) + print("--- End log file ---\n") + + def check_results(resp, log): # Check if the 
specified log file exists - if not os.path.isfile(os.path.join(args['dir'], log)): + log_path = os.path.join(args['dir'], log) + if not os.path.isfile(log_path): print('FAIL: Expected log file %s does not exist' % log) sys.exit(-1) # Dump should have failed with: 'The criu itself is within dumped tree' if resp.type != rpc.DUMP: print('FAIL: Unexpected msg type %r' % resp.type) + print_log_file(log_path) sys.exit(-1) if 'The criu itself is within dumped tree' not in resp.cr_errmsg: print('FAIL: Missing the expected error message in RPC response') + print_log_file(log_path) sys.exit(-1) # Look into the log file for the same message - search_in_log_file(log, 'The criu itself is within dumped tree') + search_in_log_file(log_path, 'The criu itself is within dumped tree') def test_rpc_without_configuration_file(): From 5cfe9a6b24d9cebccee9e1066c6a098c97d48008 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Thu, 4 Sep 2025 21:35:37 +0200 Subject: [PATCH 107/137] restorer: shstk: add restorer shadow stack stubs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * shstk_restorer_stack_size() – restorer shadow stack size * shstk_set_restorer_stack() – set restorer shadow stack start Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/include/restore.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/criu/include/restore.h b/criu/include/restore.h index 04d0065051..2c4e4e2679 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -9,6 +9,7 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); struct task_restore_args; struct pstree_item; +struct rst_shstk_info; #ifndef arch_shstk_prepare static inline int arch_shstk_prepare(struct pstree_item *item, @@ -38,4 +39,18 @@ static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *cor #define arch_shstk_trampoline arch_shstk_trampoline #endif +#ifndef 
shstk_restorer_stack_size +static always_inline long shstk_restorer_stack_size(void) +{ + return 0; +} +#endif + +#ifndef shstk_set_restorer_stack +static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + return 0; +} +#endif + #endif From b65864ee92d1e4aac124cd049869b8f7461a0ec8 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Thu, 4 Sep 2025 21:45:19 +0200 Subject: [PATCH 108/137] x86/criu: shstk restorer memory accounting functions * shstk_restorer_stack_size(): PAGE_SIZE * shstk_set_restorer_stack(): set restorer temporary shadow stack start Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 7814c351d1..2b9a303b89 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -73,6 +73,17 @@ int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, int (*func)(void *arg), void *arg); #define arch_shstk_trampoline arch_shstk_trampoline +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + info->tmp_shstk = (unsigned long)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + #ifdef CR_NOGLIBC #include From 1511f8260955de905a833a8f4bea91db1d035378 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 17 Oct 2025 18:53:01 +0200 Subject: [PATCH 109/137] restorer: shstk: add shstk_min_mmap_addr() * default: return whatever is passed in, e.g. to be used as shstk_min_mmap_addr(kdat.mmap_min_addr) * x86: ignore def and return 4G On x86, CET shadow stack is required to be mapped above 4GiB. On the other hand, forcing 4GiB globally would break 32-bit restores. 
Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 6 ++++++ criu/cr-restore.c | 9 +++++---- criu/include/restore.h | 7 +++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 2b9a303b89..f62b8c3e99 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -84,6 +84,12 @@ static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, } #define shstk_set_restorer_stack shstk_set_restorer_stack +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long __maybe_unused def) +{ + return !(info->cet & ARCH_SHSTK_SHSTK) ? def : (4UL << 30); +} +#define shstk_min_mmap_addr shstk_min_mmap_addr + #ifdef CR_NOGLIBC #include diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1c3b364518..9781dbfa03 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2431,16 +2431,15 @@ int cr_restore_tasks(void) return ret; } -static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long vma_len) +static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long min_addr, long vma_len) { struct vma_area *t_vma, *s_vma; - long prev_vma_end = 0; + long prev_vma_end = min_addr; struct vma_area end_vma; VmaEntry end_e; end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; - prev_vma_end = kdat.mmap_min_addr; s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); @@ -3226,7 +3225,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * or inited from scratch). 
*/ - mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, task_args->bootstrap_len); + mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, + shstk_min_mmap_addr(&task_args->shstk, kdat.mmap_min_addr), + task_args->bootstrap_len); if (mem == (void *)-1) { pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; diff --git a/criu/include/restore.h b/criu/include/restore.h index 2c4e4e2679..1890518263 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -53,4 +53,11 @@ static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, } #endif +#ifndef shstk_min_mmap_addr +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long def) +{ + return def; +} +#endif + #endif From 8ee4b57f74bf04f375b09fb9e49001b415e9f304 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 17:25:06 +0200 Subject: [PATCH 110/137] restorer: shstk: allocate restorer shadow stack * reserve space for restorer shadow stack * set tmp_shstk at mem, advance mem by PAGE_SIZE * forget the extra PAGE_SIZE (shstk) for premapped VMAs Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin [ alex: small code cleanups ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/shstk.c | 1 - criu/cr-restore.c | 6 +++++- criu/mem.c | 9 --------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c index b752f114a8..0810efac5e 100644 --- a/criu/arch/x86/shstk.c +++ b/criu/arch/x86/shstk.c @@ -45,7 +45,6 @@ static int shstk_prepare_task(struct vm_area_list *vmas, shstk->vma_start = vma->e->start; shstk->vma_size = size; shstk->premmaped_addr = premmaped_addr; - shstk->tmp_shstk = premmaped_addr + size; break; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9781dbfa03..057ec0e93d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3195,7 +3195,7 @@ static int sigreturn_restore(pid_t pid, struct 
task_restore_args *task_args, uns rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); - task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; + task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size + shstk_restorer_stack_size(); BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); @@ -3466,6 +3466,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * self-vmas are unmaped. */ mem += rst_mem_size; + + shstk_set_restorer_stack(&task_args->shstk, mem); + mem += shstk_restorer_stack_size(); + task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; diff --git a/criu/mem.c b/criu/mem.c index f8c5508428..9e8740c070 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -787,8 +787,6 @@ int prepare_mm_pid(struct pstree_item *i) ri->vmas.rst_priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.rst_priv_size += PAGE_SIZE; - if (vma_area_is(vma, VMA_AREA_SHSTK)) - ri->vmas.rst_priv_size += PAGE_SIZE; } pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); @@ -931,13 +929,6 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void size = vma_entry_len(vma->e); - /* - * map an extra page for shadow stack VMAs, it will be used as a - * temporary shadow stack - */ - if (vma_area_is(vma, VMA_AREA_SHSTK)) - size += PAGE_SIZE; - if (!vma_inherited(vma)) { int flag = 0; /* From 73816229e025dd6a1d26381b6b30f04090e6a698 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 18:02:37 +0200 Subject: [PATCH 111/137] x86/criu: shstk: add shstk_vma_restore() 1. create shadow stack vma during vma_remap cycle 2. copy contents from a premapped non-shstk VMA into it 3. 
unmap premapped non-shstk VMA 4. Mark shstk VMA for remap into the final destination Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Co-Authored-By: Alexander Mikhalitsyn [ alex: debugging, rework together with Andrei and code cleanup ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 37 +++++++++++++++++++++++++++++++ criu/include/restorer.h | 7 ++++++ 2 files changed, 44 insertions(+) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index f62b8c3e99..da4fb80cda 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -163,6 +163,43 @@ static inline int shstk_finalize(void) return ret; } +/* + * Create shadow stack vma and restore its content from premmapped anonymous (non-shstk) vma + */ +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + long shstk, i; + unsigned long *shstk_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); + long ret; + + shstk = sys_map_shadow_stack(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack: %ld\n", shstk); + return -1; + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + wrssq(shstk + i * 8, shstk_data[i]); + + ret = sys_munmap(shstk_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + /* + * From that point premapped vma is (shstk) and we need + * to mremap() it to the final location. Originally premapped + * (shstk_data) has been unmapped already. 
+ */ + vma_premmaped_start(vma_entry) = shstk; + + return 0; +} +#define shstk_vma_restore shstk_vma_restore + /* * Restore contents of the shadow stack and set shadow stack pointer */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 56bea0fcc0..14c0a37680 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -357,4 +357,11 @@ static inline int arch_shstk_restore(struct rst_shstk_info *shstk) #define arch_shstk_restore arch_shstk_restore #endif +#ifndef shstk_vma_restore +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + return -1; +} +#endif + #endif /* __CR_RESTORER_H__ */ From fbb4e833a24334276347c9ef6dc766df7ea01723 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 18:13:37 +0200 Subject: [PATCH 112/137] x86/criu: shstk: restore SHSTK via premap loops * call shstk_vma_restore() for VMA_AREA_SHSTK in vma_remap() * delete map/copy/unmap from shstk_restore() and keep token setup + finalize * before the loop naturally stopped at cet->ssp-8, so a -8 nudge is required here Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin [ alex: small code cleanups ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 26 ++------------------------ criu/pie/restorer.c | 31 +++++++++++++++++-------------- 2 files changed, 19 insertions(+), 38 deletions(-) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index da4fb80cda..d113fd8abb 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -205,28 +205,11 @@ static always_inline int shstk_vma_restore(VmaEntry *vma_entry) */ static always_inline int shstk_restore(struct rst_shstk_info *cet) { - unsigned long *shstk_data = (unsigned long *)cet->premmaped_addr; - unsigned long ssp = cet->vma_start + cet->vma_size - 8; - unsigned long shstk_top = cet->vma_size / 8 - 1; - unsigned long val; - long ret; + unsigned long ssp, val; if (!(cet->cet & 
ARCH_SHSTK_SHSTK)) return 0; - if (shstk_map(cet->vma_start, cet->vma_size)) - return -1; - - /* - * Switch shadow stack from temporary location to the actual task's - * shadow stack VMA - */ - shstk_switch_ssp(ssp); - - /* restore shadow stack contents */ - for (; ssp >= cet->ssp; ssp -= 8, shstk_top--) - wrssq(ssp, shstk_data[shstk_top]); - /* * Add tokens for sigreturn frame and for switch of the shadow stack. * The sigreturn token will be checked by the kernel during @@ -236,6 +219,7 @@ static always_inline int shstk_restore(struct rst_shstk_info *cet) */ /* token for sigreturn frame */ + ssp = cet->ssp - 8; val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; wrssq(ssp, val); @@ -247,12 +231,6 @@ static always_inline int shstk_restore(struct rst_shstk_info *cet) /* reset shadow stack pointer to the proper location */ shstk_switch_ssp(ssp); - ret = sys_munmap(shstk_data, cet->vma_size + PAGE_SIZE); - if (ret < 0) { - pr_err("Failed to unmap premmaped shadow stack\n"); - return ret; - } - return shstk_finalize(); } #define arch_shstk_restore shstk_restore diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 394d3dea08..5c40b0e937 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1112,6 +1112,23 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) pr_info("Remap %lx->%lx len %lx\n", src, dst, len); + /* + * SHSTK VMAs are a bit special, in fact we create shstk vma right in the + * shstk_vma_restore() and populate it with contents from a premapped VMA + * (which in turns is just a normal anonymous VMA!). Then, we munmap() this + * premapped VMA. After, we need to adjust vma_premmaped_start(vma_entry) + * to point to a created shstk vma and treat it as a premmaped one in vma_remap(). 
+ */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) { + if (shstk_vma_restore(vma_entry)) { + pr_err("Unable to prepare shadow stack vma for remap %lx -> %lx\n", src, dst); + return -1; + } + + /* shstk_vma_restore() modifies vma premapped address */ + src = vma_premmaped_start(vma_entry); + } + if (src - dst < len) guard = dst; else if (dst - src < len) @@ -1811,13 +1828,6 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start > vma_entry->shmid) break; - /* - * shadow stack VMAs cannot be remapped, they must be - * recreated with map_shadow_stack system call - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) - continue; - if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -1835,13 +1845,6 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start < vma_entry->shmid) break; - /* - * shadow stack VMAs cannot be remapped, they must be - * recreated with map_shadow_stack system call - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) - continue; - if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } From 24ed5e196d7154e4b4575866e608a562d83da31e Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 19:40:35 +0200 Subject: [PATCH 113/137] zdtm: shstk: add SHSTK_ENABLE test build option * add SHSTK_ENABLE=1 toggle * passes -mshstk to compiler and -z shstk to linker Example: $ make -C test/zdtm/static clean $ make -C test/zdtm/static V=1 SHSTK_ENABLE=1 env00 $ readelf --notes test/zdtm/static/env00 | grep SHSTK Properties: x86 feature: SHSTK Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/Makefile.inc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 3b349ed4d7..465285f085 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -66,6 +66,11 @@ endif export PKG_CONFIG_PATH endif +ifeq ($(SHSTK_ENABLE),1) + CFLAGS += 
-mshstk + LDFLAGS += -Wl,-z,shstk +endif + define pkg-libs $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --libs $(1)) endef From 9805138c1c30d37d1597d1506f352deb048b7c42 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Sat, 18 Oct 2025 06:39:17 +0100 Subject: [PATCH 114/137] pycriu: set licence to LGPLv2.1 We use LGPL-v2.1 license for the libcriu and pycriu as they are intended to be usable by both proprietary and open-source applications. Signed-off-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- lib/pyproject.toml | 2 +- lib/setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pyproject.toml b/lib/pyproject.toml index c9e11551b0..63d9b5f472 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -8,7 +8,7 @@ description = "Python bindings for CRIU" authors = [ {name = "CRIU team", email = "criu@lists.linux.dev"}, ] -license = {text = "GPLv2"} +license = {text = "LGPLv2.1"} dynamic = ["version"] requires-python = ">=3.6" diff --git a/lib/setup.cfg b/lib/setup.cfg index 5d75719ca9..902fed9ee3 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -8,7 +8,7 @@ name = pycriu description = Python bindings for CRIU author = CRIU team author_email = criu@lists.linux.dev -license = GPLv2 +license = LGPLv2.1 version = attr: pycriu.__version__ [options] From 6e28c5f451256b16fb1686a8b6d034f9ca5aada3 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Fri, 17 Oct 2025 06:05:14 +0100 Subject: [PATCH 115/137] pycriu: add missing protobuf dependency pycriu depends on protobuf to function correctly. Currently, it raises an error if protobuf is not installed. Adding protobuf to the dependencies ensures it is available after installing pycriu. 
Signed-off-by: Andrii Herheliuk --- lib/pyproject.toml | 1 + lib/setup.cfg | 2 ++ 2 files changed, 3 insertions(+) diff --git a/lib/pyproject.toml b/lib/pyproject.toml index 63d9b5f472..ea9f88dccc 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -11,6 +11,7 @@ authors = [ license = {text = "LGPLv2.1"} dynamic = ["version"] requires-python = ">=3.6" +dependencies = ["protobuf"] [tool.setuptools] packages = ["pycriu", "pycriu.images"] diff --git a/lib/setup.cfg b/lib/setup.cfg index 902fed9ee3..28c9e49c3f 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -14,3 +14,5 @@ version = attr: pycriu.__version__ [options] packages = find: python_requires = >=3.6 +install_requires = + protobuf From c950083d1c2260badb943bcc932358416acdf273 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Sat, 18 Oct 2025 04:00:08 +0100 Subject: [PATCH 116/137] pycriu: prevent always appending "Unknown" to error messages Regardless of the actual error message, "Unknown" was always appended to the end of the string, resulting in messages like: "DUMP failed: Error(3): No process with such pidUnknown". Fixed by changing standalone if statements to else-if blocks so "Unknown" is only added when no specific error condition matches. 
Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index f3e018095a..5bd7ffecd4 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -181,15 +181,14 @@ def _gen_error_str(self): if self.errno == errno.EBADRQC: s += "Bad options" - if self.typ == rpc.DUMP: - if self.errno == errno.ESRCH: - s += "No process with such pid" + elif self.typ == rpc.DUMP and self.errno == errno.ESRCH: + s += "No process with such pid" - if self.typ == rpc.RESTORE: - if self.errno == errno.EEXIST: - s += "Process with requested pid already exists" + elif self.typ == rpc.RESTORE and self.errno == errno.EEXIST: + s += "Process with requested pid already exists" - s += "Unknown" + else: + s += "Unknown" return s From 9ecbfc04bb3f92b284d059696f186f1f8e9b7d5b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:06:56 +0100 Subject: [PATCH 117/137] ci: consolidate action-script tests This patch consolidates the action-script tests into `test/others/action-script` to ensure all tests are executed consistently and reduce duplication. Since we had two tests that appear to do the same thing, we can remove the one that doesn't use zdtm.py. 
Signed-off-by: Radostin Stoyanov --- Makefile | 1 + test/jenkins/actions.sh | 8 --- test/others/action-script/.gitignore | 2 +- test/others/action-script/Makefile | 2 - test/others/action-script/action-script.sh | 2 - .../action-script}/check_actions.py | 0 test/others/action-script/run.sh | 59 ++----------------- .../{ => others/action-script}/show_action.sh | 3 +- 8 files changed, 9 insertions(+), 68 deletions(-) delete mode 100755 test/jenkins/actions.sh delete mode 100755 test/others/action-script/action-script.sh rename test/{ => others/action-script}/check_actions.py (100%) rename test/{ => others/action-script}/show_action.sh (66%) diff --git a/Makefile b/Makefile index 611bcdd5aa..e6653bd6cc 100644 --- a/Makefile +++ b/Makefile @@ -451,6 +451,7 @@ ruff: test/zdtm.py \ test/inhfd/*.py \ test/others/rpc/config_file.py \ + test/others/action-script/check_actions.py \ lib/pycriu/images/pb2dict.py \ lib/pycriu/images/images.py \ scripts/criu-ns \ diff --git a/test/jenkins/actions.sh b/test/jenkins/actions.sh deleted file mode 100755 index 8019045004..0000000000 --- a/test/jenkins/actions.sh +++ /dev/null @@ -1,8 +0,0 @@ -# Check how crit de/encodes images -set -e -source `dirname $0`/criu-lib.sh -# prep -rm -f actions_called.txt -./test/zdtm.py run -t zdtm/static/env00 --script "$(pwd)/test/show_action.sh" || fail -./test/check_actions.py || fail -exit 0 diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore index c0b6a2490a..ca9a0b5416 100644 --- a/test/others/action-script/.gitignore +++ b/test/others/action-script/.gitignore @@ -1 +1 @@ -img-dir-* +actions_called.txt diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile index f1ce191dbc..594edc0701 100644 --- a/test/others/action-script/Makefile +++ b/test/others/action-script/Makefile @@ -1,5 +1,3 @@ run: - @make -C .. 
loop ./run.sh - .PHONY: run diff --git a/test/others/action-script/action-script.sh b/test/others/action-script/action-script.sh deleted file mode 100755 index aba8292c05..0000000000 --- a/test/others/action-script/action-script.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -touch action-hook-"$CRTOOLS_SCRIPT_ACTION" diff --git a/test/check_actions.py b/test/others/action-script/check_actions.py similarity index 100% rename from test/check_actions.py rename to test/others/action-script/check_actions.py diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh index a82fccf359..f18301502c 100755 --- a/test/others/action-script/run.sh +++ b/test/others/action-script/run.sh @@ -1,60 +1,11 @@ #!/bin/bash -set -ebm +set -e -# shellcheck source=test/others/env.sh -source ../env.sh || exit 1 +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -SELFDIR="$(dirname "$(readlink -f "$0")")" -SCRIPT="$SELFDIR/action-script.sh" -IMGDIR="$SELFDIR/img-dir-$$" +rm -f "${SCRIPT_DIR}"/actions_called.txt +"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 --script "$SCRIPT_DIR/show_action.sh" || exit 1 +"${SCRIPT_DIR}"/check_actions.py || exit 1 -rm -rf "$IMGDIR" -mkdir "$IMGDIR" - -trap "cleanup" QUIT TERM INT HUP EXIT - -# shellcheck disable=SC2317 -# https://github.com/koalaman/shellcheck/issues/2660 -function cleanup() -{ - if [[ -n "$PID" ]]; then - kill -9 "$PID" - fi -} - -PID=$(../loop) -if ! $CRIU dump -v4 -o dump.log -t "$PID" -D "$IMGDIR" --action-script "$SCRIPT"; then - echo "Failed to checkpoint process $PID" - cat dump.log - kill -9 "$PID" - exit 1 -fi - -if ! $CRIU restore -v4 -o restore.log -D "$IMGDIR" -d --pidfile test.pidfile --action-script "$SCRIPT"; then - echo "CRIU restore failed" - echo FAIL - exit 1 -fi - -PID=$(cat "$IMGDIR"/test.pidfile) - -found_missing_file=false -hooks=("pre-dump" "post-dump" "pre-restore" "pre-resume" "post-restore" "post-resume") - -for hook in "${hooks[@]}" -do - if [ ! 
-e "$IMGDIR/action-hook-$hook" ]; then - echo "ERROR: action-hook-$hook does not exist" - found_missing_file=true - fi -done - -if [ "$found_missing_file" = true ]; then - exit 1 -fi - -echo PASS - -rm -rf "$IMGDIR" exit 0 diff --git a/test/show_action.sh b/test/others/action-script/show_action.sh similarity index 66% rename from test/show_action.sh rename to test/others/action-script/show_action.sh index 86468b67ae..afbfc3f27e 100755 --- a/test/show_action.sh +++ b/test/others/action-script/show_action.sh @@ -1,3 +1,4 @@ #!/bin/bash + echo "${CRTOOLS_SCRIPT_ACTION} ${CRTOOLS_IMAGE_DIR} ${CRTOOLS_INIT_PID}" \ - >> "$(dirname $0)/actions_called.txt" + >> "$(dirname "$0")/actions_called.txt" From 8bcb2eb63cfa378affcbb125c11019d8bdd79306 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:11:45 +0100 Subject: [PATCH 118/137] ci: verify call order of action-script hooks The existing test collects all action-script hooks triggered during `h`, `ns`, and `uns` runs with ZDTM into `actions_called.txt`, then verifies that each hook appears at least once. However, the test does not verify that hooks are invoked *exactly once* or in *correct order*. This change updates the test to run ZDTM only with ns flavour as this seems to cover all action-script hooks, and checks that all hooks are called correctly. 
Signed-off-by: Radostin Stoyanov --- test/others/action-script/check_actions.py | 65 +++++++++++++--------- test/others/action-script/run.sh | 2 +- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/test/others/action-script/check_actions.py b/test/others/action-script/check_actions.py index 84d738dbb7..0140d8762d 100755 --- a/test/others/action-script/check_actions.py +++ b/test/others/action-script/check_actions.py @@ -1,41 +1,54 @@ #!/usr/bin/env python3 -import sys import os +import sys + +EXPECTED_ACTIONS = [ + 'pre-dump', + 'network-lock', + 'post-dump', + 'pre-restore', + 'setup-namespaces', + 'post-setup-namespaces', + 'post-restore', + 'network-unlock', + 'pre-resume', + 'post-resume', +] -actions = set(['pre-dump', 'pre-restore', 'post-dump', 'setup-namespaces', \ - 'post-setup-namespaces', 'post-restore', 'post-resume', \ - 'network-lock', 'network-unlock' ]) errors = [] -af = os.path.dirname(os.path.abspath(__file__)) + '/actions_called.txt' +actions_called = [] +actions_called_file = os.path.join(os.path.dirname(__file__), 'actions_called.txt') + +with open(actions_called_file) as f: + for index, line in enumerate(f): + parts = line.strip().split() + parts += ['EMPTY'] * (3 - len(parts)) + action_hook, image_dir, pid = parts -for act in open(af): - act = act.strip().split() - act.append('EMPTY') - act.append('EMPTY') + if action_hook == 'EMPTY': + raise ValueError("Error in test: bogus actions line") - if act[0] == 'EMPTY': - raise Exception("Error in test, bogus actions line") + expected_action = EXPECTED_ACTIONS[index] if index < len(EXPECTED_ACTIONS) else None + if action_hook != expected_action: + raise ValueError(f"Invalid action: {action_hook} != {expected_action}") - if act[1] == 'EMPTY': - errors.append('Action %s misses CRTOOLS_IMAGE_DIR' % act[0]) + if image_dir == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_IMAGE_DIR') - if act[0] in ('post-dump', 'setup-namespaces', 'post-setup-namespaces', \ - 
'post-restore', 'post-resume', 'network-lock', 'network-unlock'): - if act[2] == 'EMPTY': - errors.append('Action %s misses CRTOOLS_INIT_PID' % act[0]) - elif not act[2].isdigit() or int(act[2]) == 0: - errors.append('Action %s PID is not number (%s)' % - (act[0], act[2])) + if action_hook != 'pre-restore': + if pid == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_INIT_PID') + elif not pid.isdigit() or int(pid) == 0: + errors.append(f'Action {action_hook} PID is not a valid number ({pid})') - actions -= set([act[0]]) + actions_called.append(action_hook) -if actions: - errors.append('Not all actions called: %r' % actions) +if actions_called != EXPECTED_ACTIONS: + errors.append(f'Not all actions called: {actions_called!r}') if errors: - for x in errors: - print(x) + print('\n'.join(errors)) sys.exit(1) -print('PASS') +print('Check Actions PASS') diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh index f18301502c..574f6fc863 100755 --- a/test/others/action-script/run.sh +++ b/test/others/action-script/run.sh @@ -5,7 +5,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" rm -f "${SCRIPT_DIR}"/actions_called.txt -"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 --script "$SCRIPT_DIR/show_action.sh" || exit 1 +"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 -f ns --script "$SCRIPT_DIR/show_action.sh" || exit 1 "${SCRIPT_DIR}"/check_actions.py || exit 1 exit 0 From 3ae97599fef995ea873ae6e68565f4e7c5f1f5ea Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 10:51:46 +0100 Subject: [PATCH 119/137] make: don't install external dependencies Don't install external pip dependencies when running `make install`. As we are not really into developing a Python project, we should not install additional packages. CRIU does that nowhere else. 
Signed-off-by: Radostin Stoyanov --- Makefile.install | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Makefile.install b/Makefile.install index 455735f3b1..70c607ec6c 100644 --- a/Makefile.install +++ b/Makefile.install @@ -46,9 +46,13 @@ endif endif # Default flags for pip install: -# --upgrade: Upgrade crit/pycriu packages -# --ignore-installed: Ignore existing packages and reinstall them -PIPFLAGS ?= --upgrade --ignore-installed +# --ignore-installed: Overwrite already installed pycriu/crit packages +# --no-build-isolation: Use current Python environment to build pycriu/crit packages +# --no-deps: Don't install any dependencies +# --no-index: Don't use PyPI index to find packages +# --progress-bar: Cleaner output +# --upgrade: Treat the install as an upgrade when replacing the installed version +PIPFLAGS ?= --ignore-installed --no-build-isolation --no-deps --no-index --progress-bar off --upgrade export SKIP_PIP_INSTALL PIPFLAGS From a123d0eed44193a0eb0de0e8f846e4816bb87962 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 11:43:17 +0100 Subject: [PATCH 120/137] ci: add wheel and setuptools in dnf packages These dependencies are required for `pip install`. 
Signed-off-by: Radostin Stoyanov --- contrib/dependencies/dnf-packages.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 00dc91a2e8..793f267a5c 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -30,9 +30,11 @@ dnf install -y \ protobuf-c-devel \ protobuf-compiler \ protobuf-devel \ + python-devel \ python3-importlib-metadata \ python3-protobuf \ python3-pyyaml \ - python-devel \ + python3-setuptools \ + python3-wheel \ rubygem-asciidoctor \ xmlto From 42a63b12e47dba52a10c0f26ca8feccf04335abf Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:49:00 +0100 Subject: [PATCH 121/137] ci: add which dependency in dnf packages which is used in Makefiles to check for dependencies: Example: export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) Signed-off-by: Radostin Stoyanov --- contrib/dependencies/dnf-packages.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 793f267a5c..60f21db6d9 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -37,4 +37,5 @@ dnf install -y \ python3-setuptools \ python3-wheel \ rubygem-asciidoctor \ + which \ xmlto From 6a700b816a7f4d7b3fc700b96309193b5a3b0745 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Sun, 26 Oct 2025 17:14:03 -0700 Subject: [PATCH 122/137] Use command -v instead of which Unlike "which", which is a separate executable not always installed by default, "command -v" is a shell built-in available at least for bash, dash, and busybox shell. Unlike "which", "command -v" is also easier to grep for, and it is already used in a few places here. Inspired by commit 57251d811. 
Signed-off-by: Kir Kolyshkin --- .github/workflows/lint.yml | 2 +- Makefile | 2 +- contrib/dependencies/dnf-packages.sh | 1 - contrib/docker_cr.sh | 4 ++-- scripts/ci/prepare-for-fedora-rawhide.sh | 3 +-- scripts/nmk/scripts/tools.mk | 4 ++-- 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 862d682458..f7da4f6f6c 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,7 +14,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck + run: sudo dnf -y install git make ruff xz clang-tools-extra codespell git-clang-format ShellCheck - uses: actions/checkout@v4 diff --git a/Makefile b/Makefile index e6653bd6cc..1824ea180c 100644 --- a/Makefile +++ b/Makefile @@ -489,7 +489,7 @@ lint: ruff shellcheck codespell ! git --no-pager grep -E '\s+$$' \*.c \*.h .PHONY: lint ruff shellcheck codespell -codecov: SHELL := $(shell which bash) +codecov: SHELL := $(shell command -v bash) codecov: curl -Os https://uploader.codecov.io/latest/linux/codecov chmod +x codecov diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 60f21db6d9..793f267a5c 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -37,5 +37,4 @@ dnf install -y \ python3-setuptools \ python3-wheel \ rubygem-asciidoctor \ - which \ xmlto diff --git a/contrib/docker_cr.sh b/contrib/docker_cr.sh index 9b43d8ba19..04ef676cd6 100755 --- a/contrib/docker_cr.sh +++ b/contrib/docker_cr.sh @@ -418,7 +418,7 @@ resolve_path() { local p p="${2}" - if which realpath > /dev/null; then + if command -v realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" @@ -427,7 +427,7 @@ resolve_path() { resolve_cmd() { local cpath - cpath=$(which "${2}") + cpath=$(command -v "${2}") resolve_path "${1}" "${cpath}" } 
diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index ff75717c59..b0b45fcc3e 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -15,8 +15,7 @@ dnf install -y \ python-unversioned-command \ redhat-rpm-config \ sudo \ - tar \ - which + tar # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 724204a03c..de5782c137 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null) +FULL_PYTHON := $(shell command -v python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ @@ -36,7 +36,7 @@ CTAGS := ctags export RM HOSTLD LD HOSTCC CC CPP AS AR STRIP OBJCOPY OBJDUMP export NM SH MAKE MKDIR AWK PERL PYTHON SH CSCOPE -export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) +export USE_ASCIIDOCTOR ?= $(shell command -v asciidoctor 2>/dev/null) # # Footer. From 3c06ad7b41f6c998bee4fd4d7b6fb3edeb0a39af Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 7 Oct 2025 16:31:17 +0100 Subject: [PATCH 123/137] libcriu: enable setting of RPC config file Container runtimes that use libcriu (e.g., crun) need to specify a CRIU configuration file that allows overwriting default options set via RPC. This is particularly useful to set options such as `--tcp-established` via `/etc/criu/runc.conf` in Kubernetes. 
Signed-off-by: Radostin Stoyanov --- lib/c/criu.c | 19 +++++++++++++++++++ lib/c/criu.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/lib/c/criu.c b/lib/c/criu.c index c16fe5dcd7..485c8b178e 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -2041,3 +2041,22 @@ void criu_set_empty_ns(int namespaces) { criu_local_set_empty_ns(global_opts, namespaces); } + +int criu_local_set_config_file(criu_opts *opts, const char *path) +{ + char *new; + + new = strdup(path); + if (!new) + return -ENOMEM; + + free(opts->rpc->config_file); + opts->rpc->config_file = new; + + return 0; +} + +int criu_set_config_file(const char *path) +{ + return criu_local_set_config_file(global_opts, path); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index c1c6078698..44446f6645 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -116,6 +116,7 @@ void criu_set_pidfd_store_sk(int sk); int criu_set_network_lock(enum criu_network_lock_method method); int criu_join_ns_add(const char *ns, const char *ns_file, const char *extra_opt); void criu_set_mntns_compat_mode(bool val); +int criu_set_config_file(const char *path); /* * The criu_notify_arg_t na argument is an opaque @@ -281,6 +282,7 @@ void criu_local_set_pidfd_store_sk(criu_opts *opts, int sk); int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method); int criu_local_join_ns_add(criu_opts *opts, const char *ns, const char *ns_file, const char *extra_opt); void criu_local_set_mntns_compat_mode(criu_opts *opts, bool val); +int criu_local_set_config_file(criu_opts *opts, const char *path); void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na)); From b1f7505324d06c7c31dbfa036b938baa31393323 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 9 Oct 2025 11:21:35 +0100 Subject: [PATCH 124/137] test/libcriu: check setting of RPC config file Signed-off-by: Radostin Stoyanov --- test/others/libcriu/.gitignore | 1 + test/others/libcriu/Makefile | 1 + test/others/libcriu/run.sh | 1 + 
test/others/libcriu/test_rpc_config.c | 223 ++++++++++++++++++++++++++ 4 files changed, 226 insertions(+) create mode 100644 test/others/libcriu/test_rpc_config.c diff --git a/test/others/libcriu/.gitignore b/test/others/libcriu/.gitignore index 0f6e52bb4e..30a56999c4 100644 --- a/test/others/libcriu/.gitignore +++ b/test/others/libcriu/.gitignore @@ -8,3 +8,4 @@ test_pre_dump test_feature_check output/ libcriu.so.* +test_rpc_config diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index ae73305331..e0ee5b2aba 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -3,6 +3,7 @@ include ../../../../criu/Makefile.versions TESTS += test_sub TESTS += test_self TESTS += test_notify +TESTS += test_rpc_config TESTS += test_iters TESTS += test_errno TESTS += test_join_ns diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index f7d363aabe..804af9b835 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -55,6 +55,7 @@ run_test() { run_test test_sub run_test test_self run_test test_notify +run_test test_rpc_config if [ "$(uname -m)" = "x86_64" ]; then # Skip this on aarch64 as aarch64 has no dirty page tracking run_test test_iters diff --git a/test/others/libcriu/test_rpc_config.c b/test/others/libcriu/test_rpc_config.c new file mode 100644 index 0000000000..529f136371 --- /dev/null +++ b/test/others/libcriu/test_rpc_config.c @@ -0,0 +1,223 @@ +#include "criu.h" +#include "lib.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RANDOM_NAME_LEN 6 +#define PATH_BUF_SIZE 128 + +static volatile sig_atomic_t stop = 0; +static char base_name[RANDOM_NAME_LEN + 1]; +static char log_file[PATH_BUF_SIZE]; +static char conf_file[PATH_BUF_SIZE]; + +static void handle_signal(int sig) +{ + (void)sig; + stop = 1; +} + +static void generate_random_base_name(void) +{ + const char charset[] = 
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + size_t charset_len; + int i; + + charset_len = sizeof(charset) - 1; + + for (i = 0; i < RANDOM_NAME_LEN; i++) { + base_name[i] = charset[rand() % charset_len]; + } + base_name[i] = '\0'; + + snprintf(log_file, sizeof(log_file), "/tmp/criu-%s.log", base_name); + snprintf(conf_file, sizeof(conf_file), "/tmp/criu-%s.conf", base_name); +} + +static int create_criu_config_file(void) +{ + int fd; + FILE *fp; + + srand(time(NULL)); + generate_random_base_name(); + + fd = open(conf_file, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (fd < 0) { + perror("Failed to create config file"); + return -1; + } + + fp = fdopen(fd, "w"); + if (!fp) { + perror("fdopen failed"); + close(fd); + unlink(conf_file); + return -1; + } + + fprintf(fp, "log-file=%s\n", log_file); + fflush(fp); + fclose(fp); + + return 0; +} + +static int check_log_file(void) +{ + struct stat st; + + if (stat(log_file, &st) < 0) { + perror("Config file does not exist"); + return -1; + } + + if (st.st_size == 0) { + fprintf(stderr, "Config file is empty\n"); + return -1; + } + + unlink(log_file); + return 0; +} + +int main(int argc, char **argv) +{ + int pipe_fd[2]; + pid_t pid; + int ret; + int child_ret; + + int img_fd = open(argv[2], O_DIRECTORY); + if (img_fd < 0) { + perror("Failed to open images directory"); + goto cleanup; + } + + if (create_criu_config_file() < 0) { + printf("Failed to create config file\n"); + return EXIT_FAILURE; + } + + if (pipe(pipe_fd) < 0) { + perror("pipe"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + perror("fork failed"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /** child process **/ + printf(" `- loop: initializing\n"); + + if (setsid() < 0 || signal(SIGUSR1, handle_signal) == SIG_ERR) { + _exit(EXIT_FAILURE); + } + + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + close(pipe_fd[0]); + + child_ret = SUCC_ECODE; + write(pipe_fd[1], &child_ret, sizeof(child_ret)); + 
close(pipe_fd[1]); + + while (!stop) { + sleep(1); + } + + _exit(SUCC_ECODE); + } + + /** parent process **/ + close(pipe_fd[1]); + + ret = -1; + if (read(pipe_fd[0], &ret, sizeof(ret)) != sizeof(ret) || ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto cleanup; + } + + read(pipe_fd[0], &ret, 1); + close(pipe_fd[0]); + + printf("--- Loop process started (pid: %d) ---\n", pid); + + printf("--- Checkpoint ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_images_dir_fd(img_fd); + criu_set_pid(pid); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting dump RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("dump.log"); + + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + printf("criu dump failed\n"); + goto cleanup; + } + + printf(" `- Dump succeeded\n"); + waitpid(pid, NULL, 0); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + + printf("--- Restore loop ---\n"); + criu_init_opts(); + criu_set_images_dir_fd(img_fd); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting restore RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("restore.log"); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + ret = EXIT_FAILURE; + goto cleanup; + } + + printf(" `- Restore returned pid %d\n", pid); + kill(pid, SIGUSR1); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + +cleanup: + if (waitpid(pid, &ret, 0) < 0) { + perror("waitpid failed"); + return EXIT_FAILURE; + } + + printf("Remove RPC config file: %s\n", conf_file); + unlink(conf_file); + return chk_exit(ret, SUCC_ECODE); +} From 236a684d9ccce29806f55d157b5b1408ee6dbec3 Mon Sep 
17 00:00:00 2001 From: Andrii Herheliuk Date: Wed, 22 Oct 2025 21:51:28 +0100 Subject: [PATCH 125/137] lib/pycriu: changing the default behavior to use the system binary Use system-installed CRIU binary instead of a local file Thanks to @avagin for suggesting this solution. Co-authored-by: Andrei Vagin Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 5bd7ffecd4..5973b4b91f 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -103,7 +103,7 @@ def exec_criu(): os.close(2) css[0].send(struct.pack('i', os.getpid())) - os.execv(self.comm, + os.execvp(self.comm, [self.comm, 'swrk', "%d" % css[0].fileno()]) os._exit(1) From a424a391e3fff3d0fbca274cae33b655881f4445 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Thu, 23 Oct 2025 10:50:40 +0100 Subject: [PATCH 126/137] pycriu: better socket error handling [Errno 2] No such file or directory -> Socket file not found. [Errno 111] Connection refused -> Service not running. Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 5973b4b91f..43550c3cac 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -45,7 +45,14 @@ def __init__(self, sk_path): def connect(self, daemon): self.sk = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) - self.sk.connect(self.comm) + try: + self.sk.connect(self.comm) + + except FileNotFoundError: + raise FileNotFoundError("Socket file not found.") + + except ConnectionRefusedError: + raise ConnectionRefusedError("Service not running.") return self.sk From bb064dbe477166e9bd019dd79625ca32c47a462d Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Mon, 27 Oct 2025 21:57:41 +0000 Subject: [PATCH 127/137] pycriu: set default value for sk_name This change allows users to call criu.use_sk() without any parameters to use the default socket name. 
Co-authored-by: Radostin Stoyanov Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 43550c3cac..05a85c58da 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -8,6 +8,7 @@ import pycriu.rpc_pb2 as rpc +CR_DEFAULT_SERVICE_ADDRESS = "./criu_service.socket" class _criu_comm: """ @@ -213,7 +214,7 @@ def __init__(self): self.opts = rpc.criu_opts() self.sk = None - def use_sk(self, sk_name): + def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): """ Access criu using unix socket which that belongs to criu service daemon. """ From ad2506661cd74c7ebc2088eb0afcb0deaecea0d3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 06:42:53 +0100 Subject: [PATCH 128/137] cr-service: refactor images/workdir setup Move the code that opens the images directory, resolves its absolute path via readlink(), selects the work_dir, and chdir()s into it into a new function: setup_images_and_workdir(). This reduces the size of `setup_opts_from_req()`, improves its readability, and allows this functionality to be reused. While at it, change open_image_dir() to take a const char *dir parameter, reflecting that the path is not modified by the function and allowing callers to pass string literals without casts. No functional changes are intended. 
Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 74 +++++++++++++++++++++++++------------------- criu/image.c | 2 +- criu/include/image.h | 2 +- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index e6aac232e7..36ef8d72b5 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -285,13 +285,54 @@ int exec_rpc_query_external_files(char *name, int sk) static char images_dir[PATH_MAX]; +static int setup_images_and_workdir(const char *images_dir_path, + bool work_changed_by_rpc_conf, + CriuOpts *req, + pid_t peer_pid) +{ + char work_dir_path[PATH_MAX]; + + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. + */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + return -1; + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + return -1; + } + + return 0; +} + static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); char images_dir_path[PATH_MAX]; - char work_dir_path[PATH_MAX]; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -701,37 +742,8 @@ static int 
setup_opts_from_req(int sk, CriuOpts *req) if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); + if (setup_images_and_workdir(images_dir_path, work_changed_by_rpc_conf, req, ids.pid)) goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - if (work_changed_by_rpc_conf) - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } if (req->n_irmap_scan_paths) { for (i = 0; i < req->n_irmap_scan_paths; i++) { diff --git a/criu/image.c b/criu/image.c index c4f05e1597..91101c3ebf 100644 --- a/criu/image.c +++ b/criu/image.c @@ -717,7 +717,7 @@ struct cr_img *img_from_fd(int fd) * This is used when opts.stream is enabled for picking the right streamer * socket name. `mode` is ignored when opts.stream is not enabled. 
*/ -int open_image_dir(char *dir, int mode) +int open_image_dir(const char *dir, int mode) { int fd, ret; diff --git a/criu/include/image.h b/criu/include/image.h index b06dbf7062..30e32323d5 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -165,7 +165,7 @@ static inline int img_raw_fd(struct cr_img *img) extern off_t img_raw_size(struct cr_img *img); -extern int open_image_dir(char *dir, int mode); +extern int open_image_dir(const char *dir, int mode); extern void close_image_dir(void); /* * Return -1 -- parent symlink points to invalid target From 8a13b82523955184e002bad55dab570d2a5b2be2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 11:04:35 +0100 Subject: [PATCH 129/137] cr-service: drop images_dir from setproctitle Commit 9089ce8 ("service: use setproctitle") extended cr-service to get the full path of images_dir using readlink(). However, the RPC API was later extended to allow a custom path (folder) to be set instead of passing a file descriptor, which causes readlink() to fail as the path is not a symbolic link. It would be better to drop the code setting the images-dir path as a string in the proctitle. 
Fixes: #2794 Suggested-by: Andrei Vagin Co-authored-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 36ef8d72b5..0808be3e7d 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -283,8 +283,6 @@ int exec_rpc_query_external_files(char *name, int sk) return ret; } -static char images_dir[PATH_MAX]; - static int setup_images_and_workdir(const char *images_dir_path, bool work_changed_by_rpc_conf, CriuOpts *req, @@ -304,12 +302,6 @@ static int setup_images_and_workdir(const char *images_dir_path, return -1; } - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - return -1; - } - if (work_changed_by_rpc_conf) strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); else if (req->has_work_dir_fd) @@ -802,7 +794,7 @@ static int dump_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -845,7 +837,7 @@ static int restore_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("restore --rpc -D %s", images_dir); + __setproctitle("restore --rpc"); if (cr_restore_tasks()) goto exit; @@ -940,7 +932,7 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) if (setup_opts_from_req(sk, req)) goto cout; - __setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("pre-dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -1276,8 +1268,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (setup_opts_from_req(sk, msg->opts)) goto cout; - __setproctitle("cpuinfo %s --rpc -D %s", msg->type == 
CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", - images_dir); + __setproctitle("cpuinfo %s --rpc", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check"); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); From 287160d334aa09ba90b2a5f7c873e1720c251cf3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 06:50:31 +0100 Subject: [PATCH 130/137] cr-service: refactor images_dir path resolution Move the images_dir selection logic from setup_opts_from_req() into a new function: resolve_images_dir_path(). This improves readability and allows the code to be reused. While at it, use snprintf() instead of sprintf() for the /proc path and ensure NULL termination after strncpy(). Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 59 +++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 0808be3e7d..7d17a63e03 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -283,6 +283,41 @@ int exec_rpc_query_external_files(char *name, int sk) return ret; } +static int resolve_images_dir_path(char *images_dir_path, + bool imgs_changed_by_rpc_conf, + const CriuOpts *req, + pid_t peer_pid) +{ + /* + * images_dir_fd is a required RPC parameter with -1 as default value. + * + * This assumes that if opts.imgs_dir is set, we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else if (req->images_dir_fd != -1) { + snprintf(images_dir_path, PATH_MAX, "/proc/%d/fd/%d", peer_pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + return -1; + } + + return 0; +} + static int setup_images_and_workdir(const char *images_dir_path, bool work_changed_by_rpc_conf, CriuOpts *req, @@ -706,30 +741,8 @@ static int setup_opts_from_req(int sk, CriuOpts *req) xfree(tmp_work); } - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. 
The idea is that only the - * RPC configuration file is able to overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) { - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - } else if (req->images_dir_fd != -1) { - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - } else if (req->images_dir) { - strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - } else { - pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + if (resolve_images_dir_path(images_dir_path, imgs_changed_by_rpc_conf, req, ids.pid) < 0) goto err; - } if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); From dfb7e2096e1567c100faf2a46490afd4b4393553 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 28 Oct 2025 18:37:31 +0000 Subject: [PATCH 131/137] cr-service: refactor logging setup Move the logging initialization into a helper function that can be reused. No functional change intended. 
Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 51 ++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 7d17a63e03..b4e8629c99 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -354,6 +354,31 @@ static int setup_images_and_workdir(const char *images_dir_path, return 0; } +static int setup_logging_from_req(CriuOpts *req, bool output_changed_by_rpc_conf) +{ + if (req->log_file && !output_changed_by_rpc_conf) { + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + return -1; + } + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; /* log_init(NULL) writes to stderr */ + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + opts.log_level = req->log_level; + log_set_loglevel(opts.log_level); + if (log_init(opts.output)) { + pr_perror("Can't initiate log"); + return -1; + } + + return 0; +} + static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; @@ -758,36 +783,12 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. 
- */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { - xfree(opts.output); - opts.output = NULL; - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); + if (setup_logging_from_req(req, output_changed_by_rpc_conf)) goto err; - } if (req->mntns_compat_mode) opts.mntns_compat_mode = true; - log_set_loglevel(opts.log_level); if (check_options()) goto err; From 5251ceccdf5690e0adc605535cde53d62d2319bd Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 07:12:03 +0100 Subject: [PATCH 132/137] cr-service: refactor RPC opts parsing for check() The check() functionality is very different from dump, pre-dump, and restore. It is used only to check if the kernel supports required features, and does not need the majority of options set via RPC. In particular, we don't need to open `image_dir` when running `check()` because this functionality doesn't create or process image files. In this case, `image_dir` is used as `work_dir`, only when the latter is not specified and a log file is used. This patch updates the RPC options parser so that it only handles the logging options when check() is used. Logging to a file is required when log_file is explicitly set or no log_to_stderr is used. In such case, we also resolve images_dir and work_dir where the log file will be created. 
Fixes: #2758 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 57 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b4e8629c99..b4718dde2b 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -311,6 +311,12 @@ static int resolve_images_dir_path(char *images_dir_path, strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); images_dir_path[PATH_MAX - 1] = '\0'; } else { + /* + * Since images dir is not required in CHECK mode, we need to + * check for work_dir_fd in setup_images_and_workdir() + */ + if (opts.mode == CR_CHECK) + return 0; pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); return -1; } @@ -323,18 +329,21 @@ static int setup_images_and_workdir(const char *images_dir_path, CriuOpts *req, pid_t peer_pid) { - char work_dir_path[PATH_MAX]; + char work_dir_path[PATH_MAX] = ""; - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - return -1; + /* We don't need to open images dir in CHECK mode. */ + if (opts.mode != CR_CHECK) { + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } } if (work_changed_by_rpc_conf) @@ -343,9 +352,14 @@ static int setup_images_and_workdir(const char *images_dir_path, sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); else if (opts.work_dir) strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else + else if (images_dir_path[0] != '\0') strcpy(work_dir_path, images_dir_path); + if (work_dir_path[0] == '\0') { + pr_err("images-dir or work-dir is required when using log file\n"); + return -1; + } + if (chdir(work_dir_path)) { pr_perror("Can't chdir to work_dir"); return -1; @@ -384,7 +398,7 @@ static int setup_opts_from_req(int sk, CriuOpts *req) struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); - char images_dir_path[PATH_MAX]; + char images_dir_path[PATH_MAX] = ""; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -397,6 +411,23 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; } + /* + * The options relevant in CHECK mode are: log_file, log_to_stderr, and log_level. + * When logging to a file, we also need to resolve images_dir and work_dir. + */ + if (opts.mode == CR_CHECK) { + if (!req) + return 0; /* nothing to do */ + + /* + * A log file is needed only if: + * - log_file is explicitly set, or + * - log_to_stderr is NOT requested (i.e., using DEFAULT_LOG_FILENAME) + */ + if (!req->log_file || (req->has_log_to_stderr && req->log_to_stderr)) + return 0; /* no log file, don't require images_dir or work_dir */ + } + if (fstat(sk, &st)) { pr_perror("Can't get socket stat"); goto err; From bb1e51a696bd91356a157e1f527a36ba1e7733c9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 09:28:28 +0100 Subject: [PATCH 133/137] pycriu: set RPC opts for CHECK This allows users to specify RPC options when using the check() functionality. 
Co-authored-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- lib/pycriu/criu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 05a85c58da..760d2be78a 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -211,7 +211,8 @@ class criu: def __init__(self): self.use_binary('criu') - self.opts = rpc.criu_opts() + # images_dir_fd is required field with default value of -1 + self.opts = rpc.criu_opts(images_dir_fd=-1) self.sk = None def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): @@ -273,6 +274,7 @@ def check(self): """ req = rpc.criu_req() req.type = rpc.CHECK + req.opts.MergeFrom(self.opts) resp = self._send_req_and_recv_resp(req) From bc4aedf0dd05c521bdcb6a6327a1c90b9820c988 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 25 Oct 2025 12:35:19 +0100 Subject: [PATCH 134/137] pycriu: use explicit imports for __init__ __init__.py defines the public API for pycriu. It is important to use explicit imports to avoid leaking every symbol from criu.py into the pycriu namespace. This avoids import-time side effects, prevents name collisions, and circular-import traps. Fixes the following lint error: F403 `from .criu import *` used; unable to detect undefined names Signed-off-by: Radostin Stoyanov --- Makefile | 2 ++ lib/pycriu/__init__.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1824ea180c..05834d6821 100644 --- a/Makefile +++ b/Makefile @@ -452,6 +452,8 @@ ruff: test/inhfd/*.py \ test/others/rpc/config_file.py \ test/others/action-script/check_actions.py \ + lib/pycriu/criu.py \ + lib/pycriu/__init__.py \ lib/pycriu/images/pb2dict.py \ lib/pycriu/images/images.py \ scripts/criu-ns \ diff --git a/lib/pycriu/__init__.py b/lib/pycriu/__init__.py index 2abcf029de..28f1e94249 100644 --- a/lib/pycriu/__init__.py +++ b/lib/pycriu/__init__.py @@ -1,4 +1,15 @@ from . import rpc_pb2 as rpc from . 
import images -from .criu import * -from .version import __version__ \ No newline at end of file +from .criu import criu, CRIUExceptionExternal, CRIUException +from .criu import CR_DEFAULT_SERVICE_ADDRESS +from .version import __version__ + +__all__ = ( + "rpc", + "images", + "criu", + "CRIUExceptionExternal", + "CRIUException", + "CR_DEFAULT_SERVICE_ADDRESS", + "__version__", +) \ No newline at end of file From 68db3d9895c322b8d1d8da315195a89455f5a84a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 19:32:23 +0100 Subject: [PATCH 135/137] test/rpc: update errno check The --mntns-compat-mode option is no longer parsed with CHECK. Use --log-file instead to test the error message. Signed-off-by: Radostin Stoyanov --- test/others/rpc/errno.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index a5a3eb54dc..ea841199f9 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -49,8 +49,8 @@ def check_resp(self, resp, typ, err, errmsg = None): if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) - - if errmsg and errmsg not in resp.cr_errmsg: + + if errmsg and errmsg not in str(resp.cr_errmsg): raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') def no_process(self): @@ -134,20 +134,19 @@ def bad_request(self): self.check_resp(resp, rpc.EMPTY, None) print('Success') - + def child_first_err(self): print('Receive correct first error message') req = self.get_base_req() req.type = rpc.CHECK - - # mntns_compat_mode options is only allowed on restore - req.opts.mntns_compat_mode = True + # Log file must not have subdirectory + req.opts.log_file = "/foo/bar.log" self.send_req(req) resp = self.recv_resp() - self.check_resp(resp, rpc.CHECK, None, "Option --mntns-compat-mode is only valid on restore\n") + self.check_resp(resp, rpc.CHECK, None, "No subdirs are allowed in log_file name") print('Success') 
From 159bf1179e1a1c217218e2edca6052b6f66595bf Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 20 Oct 2025 10:24:49 +0100 Subject: [PATCH 136/137] test/others: add test for check() with libcriu Signed-off-by: Radostin Stoyanov --- test/others/libcriu/Makefile | 1 + test/others/libcriu/run.sh | 1 + test/others/libcriu/test_check.c | 17 +++++++++++++++++ 3 files changed, 19 insertions(+) create mode 100644 test/others/libcriu/test_check.c diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index e0ee5b2aba..927f17c236 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -8,6 +8,7 @@ TESTS += test_iters TESTS += test_errno TESTS += test_join_ns TESTS += test_pre_dump +TESTS += test_check TESTS += test_feature_check all: $(TESTS) diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 804af9b835..6b36d44960 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -63,6 +63,7 @@ if [ "$(uname -m)" = "x86_64" ]; then fi run_test test_errno run_test test_join_ns +run_test test_check if criu check --feature mem_dirty_track > /dev/null; then export CRIU_FEATURE_MEM_TRACK=1 fi diff --git a/test/others/libcriu/test_check.c b/test/others/libcriu/test_check.c new file mode 100644 index 0000000000..4af3b36306 --- /dev/null +++ b/test/others/libcriu/test_check.c @@ -0,0 +1,17 @@ +#include <stdio.h> +#include "criu.h" +#include "lib.h" + +int main(int argc, char **argv) +{ + int ret; + + printf("--- Start check ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + + if (criu_check()) + return -1; + + return 0; +} From db4d493ece075ae4639f9d59df97fd08f161ec77 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 26 Oct 2025 10:00:39 +0000 Subject: [PATCH 137/137] test/others: add tests for check() with pycriu Signed-off-by: Radostin Stoyanov --- Makefile | 1 + test/others/pycriu/.gitignore | 1 + test/others/pycriu/Makefile | 63 ++++++++++++++++++++ test/others/pycriu/read.py | 
1 + test/others/pycriu/test_check.py | 29 +++++++++ test/others/pycriu/test_check_fail.py | 32 ++++++++++ test/others/pycriu/test_check_images_dir.py | 44 ++++++++++++++ test/others/pycriu/test_check_work_dir_fd.py | 44 ++++++++++++++ test/others/rpc/read.py | 0 9 files changed, 215 insertions(+) create mode 100644 test/others/pycriu/.gitignore create mode 100644 test/others/pycriu/Makefile create mode 120000 test/others/pycriu/read.py create mode 100755 test/others/pycriu/test_check.py create mode 100755 test/others/pycriu/test_check_fail.py create mode 100755 test/others/pycriu/test_check_images_dir.py create mode 100755 test/others/pycriu/test_check_work_dir_fd.py mode change 100644 => 100755 test/others/rpc/read.py diff --git a/Makefile b/Makefile index 05834d6821..e268071584 100644 --- a/Makefile +++ b/Makefile @@ -452,6 +452,7 @@ ruff: test/inhfd/*.py \ test/others/rpc/config_file.py \ test/others/action-script/check_actions.py \ + test/others/pycriu/*.py \ lib/pycriu/criu.py \ lib/pycriu/__init__.py \ lib/pycriu/images/pb2dict.py \ diff --git a/test/others/pycriu/.gitignore b/test/others/pycriu/.gitignore new file mode 100644 index 0000000000..567609b123 --- /dev/null +++ b/test/others/pycriu/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/test/others/pycriu/Makefile b/test/others/pycriu/Makefile new file mode 100644 index 0000000000..b6e3b48148 --- /dev/null +++ b/test/others/pycriu/Makefile @@ -0,0 +1,63 @@ +.SHELLFLAGS := -eu -o pipefail -c +.ONESHELL: + +CRIU ?= ../../../criu/criu +BUILD_DIR ?= build +SOCKET_NAME ?= criu_service.socket +PIDFILE_NAME ?= pidfile +SERVICE_LOG ?= service.log +PYTHON ?= python3 + +PIDFILE := $(BUILD_DIR)/$(PIDFILE_NAME) +CRIU_SOCKET := $(BUILD_DIR)/$(SOCKET_NAME) +STATUS_FIFO := $(BUILD_DIR)/startup.status +STATUS_FD := 200 + +run: start + cleanup() { $(MAKE) --no-print-directory stop || true; } + trap cleanup EXIT INT TERM + "$(PYTHON)" test_check.py + "$(PYTHON)" test_check_fail.py + "$(PYTHON)" test_check_images_dir.py + 
"$(PYTHON)" test_check_work_dir_fd.py + +start: + mkdir -p "$(BUILD_DIR)" + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + echo "Service running (PID $$(cat "$(PIDFILE)"))." + exit 0 + fi + if ! command -v "$(CRIU)" >/dev/null 2>&1; then + echo "CRIU not found at $(CRIU)" + exit 1 + fi + mkfifo "$(STATUS_FIFO)" + exec $(STATUS_FD)<>"$(STATUS_FIFO)" + "$(CRIU)" service \ + -v4 \ + -W "$(BUILD_DIR)" \ + --address "$(SOCKET_NAME)" \ + -d \ + --pidfile "$(PIDFILE_NAME)" \ + -o "$(SERVICE_LOG)" \ + --status-fd "$(STATUS_FD)" + "$(PYTHON)" read.py "$(STATUS_FIFO)" + +stop: + if [ ! -s "$(PIDFILE)" ]; then + echo "pidfile missing or empty" + exit 1 + fi + pid=$$(cat "$(PIDFILE)") + if kill -0 "$$pid" 2>/dev/null; then + kill -9 "$$pid" || true + fi + rm -f "$(PIDFILE)" "$(CRIU_SOCKET)" "$(STATUS_FIFO)" + +clean: + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + kill -9 "$$(cat "$(PIDFILE)")" || true + fi + rm -rf "$(BUILD_DIR)" + +.PHONY: start stop clean run \ No newline at end of file diff --git a/test/others/pycriu/read.py b/test/others/pycriu/read.py new file mode 120000 index 0000000000..c2c1e13658 --- /dev/null +++ b/test/others/pycriu/read.py @@ -0,0 +1 @@ +../rpc/read.py \ No newline at end of file diff --git a/test/others/pycriu/test_check.py b/test/others/pycriu/test_check.py new file mode 100755 index 0000000000..9888158db2 --- /dev/null +++ b/test/others/pycriu/test_check.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") 
+ return 1 + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_fail.py b/test/others/pycriu/test_check_fail.py new file mode 100755 index 0000000000..b5634c60b4 --- /dev/null +++ b/test/others/pycriu/test_check_fail.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + # Intentionally set only log_file (no images/work dir) to ensure check() fails + criu.opts.log_file = "check.log" + + try: + criu.check() + except Exception: + print("PASS") + return 0 + + print("FAIL: check() did not fail when log_file is set without images/work dir") + return 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_images_dir.py b/test/others/pycriu/test_check_images_dir.py new file mode 100755 index 0000000000..f479c2a88f --- /dev/null +++ b/test/others/pycriu/test_check_images_dir.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def _log_path(images_dir, log_file): + return log_file if os.path.isabs(log_file) else os.path.join(images_dir, log_file) + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.images_dir = build_dir + criu.opts.log_file = 
"check.log" + criu.opts.log_level = 4 + + try: + criu.check() + except Exception as e: + lp = _log_path(build_dir, criu.opts.log_file) + msg = f"FAIL: {e} ({'see log: ' + lp if os.path.exists(lp) else 'no log found'})" + print(msg) + return 1 + + lp = _log_path(build_dir, criu.opts.log_file) + if not (os.path.isfile(lp) and os.path.getsize(lp) > 0): + print(f"FAIL: log file missing or empty: {lp}") + return 1 + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_work_dir_fd.py b/test/others/pycriu/test_check_work_dir_fd.py new file mode 100755 index 0000000000..e20a83097a --- /dev/null +++ b/test/others/pycriu/test_check_work_dir_fd.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + os.makedirs(build_dir, exist_ok=True) + + # Open a directory FD to use as work_dir_fd (prefer O_PATH if available) + flags = getattr(os, "O_PATH", 0) or os.O_RDONLY + fd = os.open(build_dir, flags) + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.work_dir_fd = fd + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + finally: + try: + os.close(fd) + except Exception: + pass + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/rpc/read.py b/test/others/rpc/read.py old mode 100644 new mode 100755