diff --git a/.cirrus.yml b/.cirrus.yml index a4b53a54b0..72dbb38981 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -13,9 +13,8 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -33,10 +32,9 @@ task: memory: 8G setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel + contrib/dependencies/dnf-packages.sh # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel @@ -65,9 +63,8 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -86,36 +83,11 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-non-root -task: - name: aarch64 build GCC (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - make -C scripts/ci local - -task: - name: aarch64 build CLANG (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - make -C scripts/ci local CLANG=1 - task: name: aarch64 Fedora Rawhide arm_container: @@ -125,6 +97,5 @@ task: script: uname -a build_script: | scripts/ci/prepare-for-fedora-rawhide.sh - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 make -C test/zdtm -j 4 diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/aarch64-test.yaml similarity index 50% rename from 
.github/workflows/actuated-aarch64-test.yaml rename to .github/workflows/aarch64-test.yaml index 8b0a63fc7b..ebbecadb33 100644 --- a/.github/workflows/actuated-aarch64-test.yaml +++ b/.github/workflows/aarch64-test.yaml @@ -1,43 +1,25 @@ -name: Actuated aarch64 test +name: aarch64 test on: [push, pull_request] # Cancel any preceding run on the pull request. concurrency: - group: actuated-test-${{ github.event.pull_request.number || github.ref }} + group: aarch64-test-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} jobs: build: - # Actuated runners are not available in all repositories. - if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected. - # 3GB and 4 CPUs seems to be enough according to the result from 'vmmeter'. - runs-on: actuated-arm64-4cpu-3gb strategy: matrix: + os: [ubuntu-24.04-arm, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] - steps: - # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md - # vmmeter start - - name: Prepare arkade - uses: alexellis/arkade-get@master - with: - crane: latest - print-summary: false - - - name: Install vmmeter - run: | - crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - - - name: Run vmmeter - uses: self-actuated/vmmeter-action@master - # vmmeter end + runs-on: ${{ matrix.os }} + steps: - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} - # Following tests are failing on the actuated VMs: + - name: Run Tests ${{ matrix.target }} on ${{ matrix.os }} + # Following tests are failing on the VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) # diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 73530d79ae..0f5c20f48b 100644 --- 
a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -9,10 +9,11 @@ concurrency: jobs: build: - runs-on: ubuntu-22.04 strategy: matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index 354873909e..bf7d06697c 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -19,7 +19,7 @@ jobs: # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev + run: sudo contrib/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 88e21d3d17..9c9e46c1b2 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -34,7 +34,7 @@ jobs: - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | - sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev + sudo contrib/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev 
python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL uses: github/codeql-action/init@v3 with: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 862d682458..f7da4f6f6c 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,7 +14,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck + run: sudo dnf -y install git make ruff xz clang-tools-extra codespell git-clang-format ShellCheck - uses: actions/checkout@v4 diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml index eb3d8e8141..7a7d8bd309 100644 --- a/.github/workflows/nftables-test.yml +++ b/.github/workflows/nftables-test.yml @@ -15,7 +15,7 @@ jobs: - name: Remove iptables run: sudo apt remove -y iptables - name: Install libnftables-dev - run: sudo scripts/ci/apt-install libnftables-dev + run: sudo contrib/apt-install libnftables-dev - name: chmod 755 /home/runner # CRIU's tests are sometimes running as some random user and need # to be able to access the test files. 
diff --git a/.gitignore b/.gitignore index 854657d1c1..94daa13ea0 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,6 @@ compel/compel compel/compel-host-bin images/*.c images/*.h -images/google/protobuf/*.c -images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest diff --git a/.lgtm.yml b/.lgtm.yml index 0dd49cda41..4beadcc637 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -23,8 +23,3 @@ extraction: - "python3-yaml" - "libnl-route-3-dev" - "gnutls-dev" - configure: - command: - - "ls -laR images/google" - - "ln -s /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto" - - "ls -laR images/google" diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 94841b3f3c..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,35 +0,0 @@ -language: c -os: linux -dist: bionic -services: - - docker -jobs: - include: - - os: linux - arch: ppc64le - env: TR_ARCH=local - dist: bionic - - os: linux - arch: ppc64le - env: TR_ARCH=local CLANG=1 - dist: bionic - - os: linux - arch: s390x - env: TR_ARCH=local - dist: bionic - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local RUN_TESTS=1 - dist: focal - group: edge - virt: vm - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local CLANG=1 RUN_TESTS=1 - group: edge - virt: vm - dist: bionic -script: - - sudo make -C scripts/ci $TR_ARCH -after_success: - - make -C scripts/ci after_success diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 0000000000..e3c5a92d9f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +GEMINI.md \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 37965e5fba..03875639df 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,8 +8,8 @@ Here are some useful hints to get involved. 
* We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; -* Feedback is expected on the GitHub issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); -* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lists.openvz.org/mailman/listinfo/criu). +* Feedback is expected on the GitHub issues page and on the [mailing list](https://lore.kernel.org/criu); +* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lore.kernel.org/criu). Below we describe in more detail recommend practices for CRIU development. * Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); @@ -27,19 +27,43 @@ The repository may contain multiple branches. 
Development happens in the **criu- To clone CRIU repo and switch to the proper branch, run: ``` - git clone https://github.com/checkpoint-restore/criu criu - cd criu - git checkout criu-dev +git clone https://github.com/checkpoint-restore/criu criu +cd criu +git checkout criu-dev ``` -### Compile +### Building from source -First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. +Follow these steps to compile CRIU from source code. -To compile CRIU, run: +#### Installing build dependencies + +First, you need to install the required build dependencies. We provide scripts to simplify this process for several Linux distributions in [contrib/dependencies](contrib/dependencies). For a complete list of dependencies, please refer to the [installation guide](https://criu.org/Installation). + +##### On Ubuntu/Debian-based systems: + +``` +./contrib/dependencies/apt-packages.sh +``` + +##### On Fedora/CentOS-based systems: + +``` +./contrib/dependencies/dnf-packages.sh +``` + +##### Using Nix: ``` - make +nix develop +``` + +#### Compiling CRIU + +Once the dependencies are installed, you can compile CRIU by running the `make` command from the root of the source directory: + +``` +make ``` This should create the `./criu/criu` executable. @@ -63,7 +87,7 @@ The following command can be used to automatically run a code linter for Python text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` - make lint +make lint ``` In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) @@ -73,7 +97,7 @@ results in decreased readability, we may choose to ignore these errors. 
Run the following command to check if your changes are compliant with the clang-format rules: ``` - make indent +make indent ``` This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to @@ -83,7 +107,7 @@ can use `BASE=origin/criu-dev`. The `OPTS` option can be used to pass additional to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. ``` - make indent OPTS=--diff BASE=HEAD~N +make indent OPTS=--diff BASE=HEAD~N ``` Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected @@ -96,7 +120,7 @@ Here are some bad examples of clang-format-ing: ``` @@ -58,8 +59,7 @@ static int register_membarriers(void) } - + if (!all_ok) { - fail("can't register membarrier()s - tried %#x, kernel %#x", - barriers_registered, barriers_supported); @@ -129,16 +153,11 @@ Here are some bad examples of clang-format-ing: CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run ``` - make test +make test ``` The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. -In case you'd rather have someone else run the tests, you can use travis-ci for your -own GitHub fork of CRIU. It will check the compilation for various supported platforms, -as well as run most of the tests from the suite. See https://travis-ci.org/checkpoint-restore/criu -for more details. - ## Describe your changes Describe your problem. Whether your change is a one-line bug fix or @@ -166,21 +185,21 @@ If your change fixes a bug in a specific commit, e.g. you found an issue using the SHA-1 ID, and the one line summary. 
For example: ``` - Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") +Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") ``` The following `git config` settings can be used to add a pretty format for outputting the above style in the `git log` or `git show` commands: ``` - [pretty] - fixes = Fixes: %h (\"%s\") +[pretty] + fixes = Fixes: %h (\"%s\") ``` If your change address an issue listed in GitHub, please use `Fixes:` tag with the number of the issue. For instance: ``` - Fixes: #339 +Fixes: #339 ``` The `Fixes:` tags should be put at the end of the detailed description. @@ -263,7 +282,7 @@ can certify the below: then you just add a line saying ``` - Signed-off-by: Random J Developer +Signed-off-by: Random J Developer ``` using your real name (please, no pseudonyms or anonymous contributions if @@ -275,14 +294,14 @@ commit message. To append such line to a commit you already made, use ``` From: Random J Developer - Subject: [PATCH] component: Short patch description +Subject: [PATCH] component: Short patch description - Long patch description (could be skipped if patch - is trivial enough) +Long patch description (could be skipped if patch +is trivial enough) - Signed-off-by: Random J Developer - --- - Patch body here +Signed-off-by: Random J Developer +--- +Patch body here ``` ## Submit your work upstream @@ -316,8 +335,8 @@ contains the following: revisions should be listed. 
For example: ``` - v3: rebase on the current criu-dev - v2: add commit to foo() and update bar() coding style +v3: rebase on the current criu-dev +v2: add commit to foo() and update bar() coding style ``` If there are only minor updates to the commits in a pull request, it is @@ -335,7 +354,7 @@ Historically, CRIU worked with mailing lists and patches so if you still prefer To create a patch, run ``` - git format-patch --signoff origin/criu-dev +git format-patch --signoff origin/criu-dev ``` You might need to read GIT documentation on how to prepare patches @@ -346,8 +365,8 @@ at all. We recommend to post patches using `git send-email` ``` - git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev +git send-email --cover-letter --no-chain-reply-to --annotate \ + --confirm=always --to=criu@lists.linux.dev criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -359,14 +378,14 @@ If this is your first time using git send-email, you might need to configure it to point it to your SMTP server with something like: ``` - git config --global sendemail.smtpServer stmp.example.net +git config --global sendemail.smtpServer smtp.example.net ``` -If you get tired of typing `--to=criu@openvz.org` all the time, +If you get tired of typing `--to=criu@lists.linux.dev` all the time, you can configure that to be automatically handled as well: ``` - git config sendemail.to criu@openvz.org +git config sendemail.to criu@lists.linux.dev ``` If a developer is sending another version of the patch (e.g. to address @@ -379,7 +398,7 @@ version if needed though). ### Mail patches -The patches should be sent to CRIU development mailing list, `criu AT openvz.org`. Note that you need to be subscribed first in order to post. The list web interface is available at https://openvz.org/mailman/listinfo/criu; you can also use standard mailman aliases to work with it. 
+The patches should be sent to CRIU development mailing list, `criu AT lists.linux.dev`. Note that you need to be subscribed first in order to post. The list web interface is available at https://lore.kernel.org/criu; you can also use standard mailman aliases to work with it. Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). @@ -396,5 +415,3 @@ sometimes a patch may fly around a week before it gets reviewed. Wiki article: [Continuous integration](https://criu.org/Continuous_integration) CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. - -We also recommend you to [enable Travis CI for your repo](https://criu.org/Continuous_integration#Enable_Travis_CI_for_your_repo) to check patches in your git branch, before sending them to the mailing list. diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 68803f3dbc..fe76fc3bc6 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 606935790b..40ede84e25 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -465,6 +465,30 @@ The 'mode' may be one of the following: *skip*::: Don't lock the network. If *--tcp-close* is not used, the network must be locked externally to allow CRIU to dump TCP connections. +*--allow-uprobes*:: + Allow dumping when uprobes vma is present. When used on dump, this option is + required on restore as well. + + A uprobes vma is automatically created by the kernel once a uprobe is + triggered. This mapping is not removed even once the uprobe is deleted. 
So, + even if a process once had uprobes attached to it, and they're removed by + the time the process is dumped, this option is still required because criu + has no way of knowing whether there are active uprobes or not. + + When using this option on restore, make sure the uprobes (if any) active on + the dumped processes are still active. Otherwise, when execution reaches + a uprobe'd location in any of the restored processes, that process will be + sent a SIGTRAP. + + As an example, say a uprobe is set at function foo in the executable of the + process p_bar. Whenever execution in p_bar reaches function foo, the uprobe + is triggered. If the uprobe has been triggered at least once, then the kernel + will have created the uprobes vma. To dump p_bar, this option is + necessary. After dumping, say the uprobe is deleted. Now, on restoring with + this option, once execution reaches function foo, SIGTRAP will be sent to + the restored p_bar. Unless it has a signal handler installed for SIGTRAP, + it will be terminated and core dumped. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. @@ -692,6 +716,10 @@ The 'mode' may be one of the following: *--skip-file-rwx-check*:: Skip checking file permissions (r/w/x for u/g/o) on restore. +*--allow-uprobes*:: + Required when dumped with this option. Refer to this option in the section + on dumping for more details. + *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 0000000000..e56c1de12d --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,136 @@ +# CRIU (Checkpoint/Restore In User-space) + +CRIU is a tool for saving the state of a running application to a set of files +(checkpointing) and restoring it back to a live state. It is primarily used for +live migration of containers, in-place updates, and fast application startup. + +It is implemented as a command-line tool called `criu`. 
The two primary commands +are `dump` and `restore`. + +- `dump`: Saves a process tree and all its related resources (file + descriptors, IPC, sockets, namespaces, etc.) into a collection of image + files. +- `restore`: Restores processes from image files to the same state they were + in before the dump. + +## Quick Start + +To get a feel for `criu`, you can try checkpointing and restoring a simple +process. + +1. **Run a simple process:** + Open a terminal and run a command that will run for a while. Find its PID. + ```bash + sleep 1000 & + [1] 12345 + ``` + +2. **Dump the process:** + As root, use `criu dump` with the process ID (`-t`) and a directory for the + image files (`-D`). + ```bash + sudo criu dump -t 12345 -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will no longer be running. + +3. **Restore the process:** + Use `criu restore` to bring the process back to life from the images. + ```bash + sudo criu restore -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will be running again as if nothing happened. + +# For Developers and Contributors + +This section contains more technical details about CRIU's internals and +development process. + +## Dump Process + +On dump, CRIU uses available kernel interfaces to collect information about +processes. For properties that can only be retrieved from within the process +itself, CRIU injects a binary blob (called a "parasite") into the process's +address space and executes it in the context of one of the process's threads. +This injection is handled by a subproject called **Compel**. + +## Restore Process + +On restore, CRIU reads the image files to reconstruct the processes. The goal is +to restore them to the exact state they were in before the dump. The restore +process is divided into several stages (defined as `CR_STATE_*` in +`./criu/include/restorer.h`). + +The main `criu` process acts as a coordinator. 
It first restores resources with +inter-process dependencies (file descriptors, sockets, shared memory, +namespaces, etc.). It then forks the process tree and sets up namespaces. +Finally, it restores process-specific resources like file descriptors and memory +mappings. + +A key step involves a small, self-contained binary called the "restorer". All +restored processes switch to executing this code, which unmaps the CRIU-specific +memory and restores the application's original memory mappings. On the final +step, the restorer calls `sigreturn` on a prepared signal frame to resume the +process with the state it had at the moment of the dump. + +## Compel + +Compel is a subproject responsible for generating the binary blobs used for the +parasite code (for dumping) and the restorer code (for restoring). It provides a +library for injecting and executing this code within the target process's +address space. It is a separate project because the logic for generating and +injecting Position-Independent Executable (PIE) code is complex and +self-contained. + +## Coding Style + +The C code in the CRIU project follows the +[Linux Kernel Coding Style](https://www.kernel.org/doc/html/latest/process/coding-style.html). +Here are some of the main points: + +- **Indentation**: Use tabs, which are set to 8 characters. +- **Line Length**: The preferred line limit is 80 characters, but it can be + extended to 120 if it improves code readability. +- **Braces**: + - The opening brace for a function goes on a new line. + - The opening brace for a block (like `if`, `for`, `while`, `switch`) goes + on the same line. +- **Spaces**: Use spaces around operators (`+`, `-`, `*`, `/`, `%`, `<`, `>`, + `=`, etc.). +- **Naming**: Use descriptive names for functions and variables. +- **Comments**: Use C-style comments (`/* ... */`). For multi-line comments, + the preferred format is: + ```c + /* + * This is a multi-line + * comment. 
+ */ + ``` + +## Code Layout + +The code is organized into the following directories: + +- `./compel`: The Compel sub-project. +- `./criu`: The main `criu` tool source code. +- `./images`: Protobuf descriptions for the image files. +- `./test`: All tests. +- `./test/zdtm`: The Zero-Downtime Migration (ZDTM) test suite. +- `./test/zdtm.py`: The executor script for ZDTM tests. +- `./scripts`: Helper scripts. +- `./scripts/build`: Docker image files used for CI and cross-compilation + checks. +- `./crit`: A tool to inspect and manipulate CRIU image files. +- `./soccr`: A library for TCP socket checkpoint/restore. + +## Tests + +The main test suite is ZDTM. Here is an example of how to run a single test: + +```bash +sudo ./test/zdtm.py run -t zdtm/static/env00 +``` + +Each ZDTM test has three stages: preparation, C/R, and results checks. During +the test, a process calls `test_daemon()` to signal it is ready for C/R, then +calls `test_waitsig()` to wait for the C/R stage to complete. After being +restored, the test checks that all its resources are still in a valid state. diff --git a/Makefile b/Makefile index 5d8e89ac1b..e268071584 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ ifeq ($(ARCH),arm) endif ifeq ($(ARMV),8) - # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. + # Running 'setarch linux32 uname -m' returns armv8l on aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. 
ARCHCFLAGS += -march=armv7-a @@ -64,6 +64,8 @@ endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 + CC_MBRANCH_PROT := $(shell $(CC) -c -x c /dev/null -mbranch-protection=none -o /dev/null >/dev/null 2>&1 && echo "-mbranch-protection=none") + CFLAGS_PIE := $(CC_MBRANCH_PROT) endif ifeq ($(ARCH),ppc64) @@ -449,6 +451,10 @@ ruff: test/zdtm.py \ test/inhfd/*.py \ test/others/rpc/config_file.py \ + test/others/action-script/check_actions.py \ + test/others/pycriu/*.py \ + lib/pycriu/criu.py \ + lib/pycriu/__init__.py \ lib/pycriu/images/pb2dict.py \ lib/pycriu/images/images.py \ scripts/criu-ns \ @@ -462,7 +468,8 @@ ruff: shellcheck: shellcheck --version shellcheck scripts/*.sh - shellcheck scripts/ci/*.sh scripts/ci/apt-install + shellcheck scripts/ci/*.sh + shellcheck contrib/apt-install contrib/dependencies/*.sh shellcheck -x test/others/crit/*.sh shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh @@ -485,7 +492,7 @@ lint: ruff shellcheck codespell ! git --no-pager grep -E '\s+$$' \*.c \*.h .PHONY: lint ruff shellcheck codespell -codecov: SHELL := $(shell which bash) +codecov: SHELL := $(shell command -v bash) codecov: curl -Os https://uploader.codecov.io/latest/linux/codecov chmod +x codecov diff --git a/Makefile.compel b/Makefile.compel index 764afadc81..a4209edc5d 100644 --- a/Makefile.compel +++ b/Makefile.compel @@ -50,8 +50,8 @@ compel/plugins/%: $(compel-deps) .FORCE # # GNU make 4.x supports targets matching via wide -# match targeting, where GNU make 3.x series (used on -# Travis) is not, so we have to write them here explicitly. +# match targeting, where GNU make 3.x series is not, +# so we have to write them here explicitly. 
compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/Makefile.config b/Makefile.config index 5ab689d411..5cf4b8216d 100644 --- a/Makefile.config +++ b/Makefile.config @@ -9,7 +9,7 @@ ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else - $(info Note: Building without setproctitle() and strlcpy() support.) + $(info Note: Building without setproctitle() support.) $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) endif @@ -84,7 +84,7 @@ endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) -FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ +FEATURES_LIST := TCP_REPAIR PTRACE_PEEKSIGINFO \ SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS diff --git a/Makefile.install b/Makefile.install index 455735f3b1..70c607ec6c 100644 --- a/Makefile.install +++ b/Makefile.install @@ -46,9 +46,13 @@ endif endif # Default flags for pip install: -# --upgrade: Upgrade crit/pycriu packages -# --ignore-installed: Ignore existing packages and reinstall them -PIPFLAGS ?= --upgrade --ignore-installed +# --ignore-installed: Overwrite already installed pycriu/crit packages +# --no-build-isolation: Use current Python environment to build pycriu/crit packages +# --no-deps: Don't install any dependencies +# --no-index: Don't use PyPI index to find packages +# --progress-bar: Cleaner output +# --upgrade: Treat the install as an upgrade when replacing the installed version +PIPFLAGS ?= --ignore-installed --no-build-isolation --no-deps --no-index --progress-bar off --upgrade export SKIP_PIP_INSTALL PIPFLAGS diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index 9152024fd8..a3528500db 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -1,10 
+1,11 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ -#include +#include #include #include +#include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ diff --git a/compel/arch/mips/src/lib/handle-elf.c b/compel/arch/mips/src/lib/handle-elf.c index a605a5a452..e086761c21 100644 --- a/compel/arch/mips/src/lib/handle-elf.c +++ b/compel/arch/mips/src/lib/handle-elf.c @@ -5,18 +5,31 @@ #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { - if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) - return __handle_elf(mem, size); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)mem; + + /* check ELF magic */ + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return -EINVAL; + } + + /* check ELF class and data encoding */ + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF class or data encoding\n"); + return -EINVAL; + } + + if (ehdr->e_ident[EI_ABIVERSION] != 0) { + pr_warn("Unusual ABI version: %d\n", ehdr->e_ident[EI_ABIVERSION]); + } - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; + return __handle_elf(mem, size); } diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index ed97d64dd6..1f61876ffb 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -13,6 +13,15 @@ #define PARASITE_START_AREA_MIN (4096) +#define PARASITE_STACK_SIZE (16 << 10) +/* + * A stack redzone is a small, protected region of memory located immediately + * after a parasite stack. 
It is intended to remain unchanged. While it can be + * implemented as a guard page, we want to avoid the overhead of additional + * remote system calls. + */ +#define PARASITE_STACK_REDZONE 128 + extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index a9bbd64004..22fcf24fad 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -38,8 +38,6 @@ #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif -#define PARASITE_STACK_SIZE (16 << 10) - #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif @@ -1056,6 +1054,16 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(ctl->local_map, ctl->local_map + ctl->pblob.hdr.bsize); p = parasite_size; @@ -1064,7 +1072,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; - ctl->rstack = ctl->remote_map + p; + ctl->rstack = ctl->remote_map + p - PARASITE_STACK_REDZONE; /* * x86-64 ABI requires a 16 bytes aligned stack. 
@@ -1078,7 +1086,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, if (nr_threads > 1) { p += PARASITE_STACK_SIZE; - ctl->r_thread_stack = ctl->remote_map + p; + ctl->r_thread_stack = ctl->remote_map + p - PARASITE_STACK_REDZONE; } ret = arch_fetch_sas(ctl, ctl->rsigframe); diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c index 9b7c9a7f09..184c8ab318 100644 --- a/compel/test/stack/spy.c +++ b/compel/test/stack/spy.c @@ -50,70 +50,6 @@ static void *get_parasite_rstack_start(struct parasite_ctl *ctl) return rstack_start; } -static int page_writable(struct parasite_ctl *ctl, int pid, void *page) -{ - FILE *maps; - size_t maps_line_len = 0; - char *maps_line = NULL; - char victim_maps_path[6 + 11 + 5 + 1]; - int written; - int ret = 0; - - if (((uintptr_t)page & (page_size() - 1)) != 0) { - fprintf(stderr, "Page address not aligned\n"); - ret = -1; - goto done; - } - - written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); - if (written < 0 || written >= sizeof(victim_maps_path)) { - fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); - ret = -1; - goto done; - } - - maps = fopen(victim_maps_path, "r"); - if (maps == NULL) { - perror("Can't open victim's /proc/$pid/maps"); - ret = -1; - goto done; - } - - while (getline(&maps_line, &maps_line_len, maps) != -1) { - unsigned long vmstart, vmend; - char r, w; - - if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { - fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); - ret = -1; - goto free_linebuf; - } - - if (page >= (void *)vmstart && page < (void *)vmend) { - if (w == 'w') { - if (r != 'r') { - fprintf(stderr, "Expecting writable memory to also be readable"); - ret = -1; - goto free_linebuf; - } - ret = 1; - } - break; - } - } - - if (errno) { - perror("Can't read victim's /proc/$pid/maps"); - ret = -1; - } - -free_linebuf: - free(maps_line); - 
fclose(maps); -done: - return ret; -} - static void *read_proc_mem(int pid, void *offset, size_t len) { char victim_mem_path[6 + 11 + 4 + 1]; @@ -153,51 +89,6 @@ static void *read_proc_mem(int pid, void *offset, size_t len) return NULL; } -static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, - size_t *saved_data_size) -{ - size_t page_mask = page_size() - 1; - size_t saved_size = 0; - size_t stack_size_last_page = (uintptr_t)stack & page_mask; - void *next_page = stack; - - if (stack_size_last_page != 0) { - size_t empty_space_last_page = page_size() - stack_size_last_page; - saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); - next_page += page_size() - stack_size_last_page; - } - - while (saved_size < SAVED_DATA_MAX && next_page != NULL) { - switch (page_writable(ctl, pid, next_page)) { - case 1: - saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); - next_page += page_size(); - break; - case 0: - next_page = NULL; - break; - default: - return -1; - } - } - - if (saved_size > 0) { - void *sd; - - sd = read_proc_mem(pid, stack, saved_size); - if (sd == NULL) - return -1; - - *saved_data = sd; - } else { - *saved_data = NULL; - } - - *saved_data_size = saved_size; - - return 0; -} - static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) { if (saved_data != NULL) { @@ -221,7 +112,7 @@ static int do_infection(int pid) struct infect_ctx *ictx; int *arg; void *stack; - size_t saved_data_size; + size_t saved_data_size = PARASITE_STACK_REDZONE; int saved_data_check; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); @@ -257,8 +148,6 @@ static int do_infection(int pid) err_and_ret("Can't register cleanup function with atexit\n"); stack = get_parasite_rstack_start(ctl); - if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) - err_and_ret("Can't save data above stack\n"); if (compel_start_daemon(ctl)) 
err_and_ret("Can't start daemon in victim\n"); diff --git a/scripts/ci/apt-install b/contrib/apt-install similarity index 100% rename from scripts/ci/apt-install rename to contrib/apt-install diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst deleted file mode 100644 index ce45f1b7cf..0000000000 --- a/contrib/debian/dev-packages.lst +++ /dev/null @@ -1,19 +0,0 @@ -# Required packages for development in Debian -build-essential -libprotobuf-dev -libprotobuf-c-dev -protobuf-c-compiler -protobuf-compiler -python3-protobuf -libnet-dev - -# Extra packages, required for testing and building other tools -pkg-config -libnl-3-dev -libbsd0 -libbsd-dev -iproute2 -libcap-dev -libaio-dev -python3-yaml -libnl-route-3-dev diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh new file mode 100755 index 0000000000..c47fb9fe07 --- /dev/null +++ b/contrib/dependencies/apk-packages.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env sh + +apk add --no-cache \ + asciidoctor \ + bash \ + build-base \ + coreutils \ + e2fsprogs \ + elfutils-dev \ + git \ + gnutls-dev \ + go \ + ip6tables \ + iproute2 \ + iptables \ + iptables-legacy \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libcap-utils \ + libdrm-dev \ + libnet-dev \ + libnl3-dev \ + libtraceevent-dev \ + libtracefs-dev \ + nftables \ + nftables-dev \ + perl \ + pkgconfig \ + procps \ + protobuf-c-compiler \ + protobuf-c-dev \ + protobuf-dev \ + py3-importlib-metadata \ + py3-pip \ + py3-protobuf \ + py3-yaml \ + python3 \ + sudo \ + tar \ + util-linux \ + util-linux-dev diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh new file mode 100755 index 0000000000..30ce6874c8 --- /dev/null +++ b/contrib/dependencies/apt-cross-packages.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + crossbuild-essential-"${DEBIAN_ARCH}" \ + iproute2:"${DEBIAN_ARCH}" \ + libaio-dev:"${DEBIAN_ARCH}" \ + libbz2-dev:"${DEBIAN_ARCH}" \ + libc6-"${DEBIAN_ARCH}"-cross \ + libc6-dev-"${DEBIAN_ARCH}"-cross \ + libcap-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + libelf-dev:"${DEBIAN_ARCH}" \ + libexpat1-dev:"${DEBIAN_ARCH}" \ + libgnutls28-dev:"${DEBIAN_ARCH}" \ + libnet-dev:"${DEBIAN_ARCH}" \ + libnftables-dev:"${DEBIAN_ARCH}" \ + libnl-3-dev:"${DEBIAN_ARCH}" \ + libnl-route-3-dev:"${DEBIAN_ARCH}" \ + libprotobuf-c-dev:"${DEBIAN_ARCH}" \ + libprotobuf-dev:"${DEBIAN_ARCH}" \ + libssl-dev:"${DEBIAN_ARCH}" \ + libtraceevent-dev:"${DEBIAN_ARCH}" \ + libtracefs-dev:"${DEBIAN_ARCH}" \ + ncurses-dev:"${DEBIAN_ARCH}" \ + uuid-dev:"${DEBIAN_ARCH}" \ + build-essential \ + pkg-config \ + git \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh new file mode 100755 index 0000000000..1fd42d4e68 --- /dev/null +++ b/contrib/dependencies/apt-packages.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + asciidoctor \ + bash \ + bsdmainutils \ + build-essential \ + gdb \ + git-core \ + iptables \ + kmod \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libdrm-dev \ + libelf-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnet-dev \ + libnl-3-dev \ + libnl-route-3-dev \ + libperl-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + libtraceevent-dev \ + libtracefs-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-importlib-metadata \ + python3-pip \ + python3-protobuf \ + python3-yaml \ + time \ + util-linux \ + uuid-dev diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh new file mode 100755 index 0000000000..793f267a5c --- /dev/null +++ b/contrib/dependencies/dnf-packages.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env sh + +dnf install -y \ + asciidoc \ + binutils \ + elfutils-libelf-devel \ + gcc \ + git \ + glibc-devel \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libbpf-devel \ + libbsd-devel \ + libcap-devel \ + libdrm-devel \ + libnet-devel \ + libnl3-devel \ + libselinux-devel \ + libtraceevent-devel \ + libtracefs-devel \ + libuuid-devel \ + make \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + protobuf-c-devel \ + protobuf-compiler \ + protobuf-devel \ + python-devel \ + python3-importlib-metadata \ + python3-protobuf \ + python3-pyyaml \ + python3-setuptools \ + python3-wheel \ + rubygem-asciidoctor \ + xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh new file mode 100755 index 0000000000..260797606b --- /dev/null +++ b/contrib/dependencies/pacman-packages.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env sh + +pacman -Syu --noconfirm \ + asciidoctor \ + base-devel \ + bash \ + coreutils \ + diffutils \ + git \ + gnutls \ + go \ + iproute2 \ + iptables \ + libaio \ + libbsd \ + libcap \ + libdrm \ + 
libelf \ + libnet \ + libnl \ + libtraceevent \ + libtracefs \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + python-importlib-metadata \ + python-pip \ + python-protobuf \ + python-yaml \ + sudo \ + tar \ + util-linux \ + util-linux-libs diff --git a/contrib/docker_cr.sh b/contrib/docker_cr.sh index 9b43d8ba19..04ef676cd6 100755 --- a/contrib/docker_cr.sh +++ b/contrib/docker_cr.sh @@ -418,7 +418,7 @@ resolve_path() { local p p="${2}" - if which realpath > /dev/null; then + if command -v realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" @@ -427,7 +427,7 @@ resolve_path() { resolve_cmd() { local cpath - cpath=$(which "${2}") + cpath=$(command -v "${2}") resolve_path "${1}" "${cpath}" } diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index c6a758c8ad..3c9cd45aaa 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -55,6 +55,7 @@ "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, "VMA_AREA_MEMFD": 1 << 14, + "VMA_AREA_UPROBES": 1 << 17, "VMA_AREA_UNSUPP": 1 << 31 } @@ -793,7 +794,8 @@ def _get_page(self, pid, page_no): off = 0 # in pages for m in pagemap[1:]: found = False - for i in range(m["nr_pages"]): + num_pages = m.get("nr_pages", m.compat_nr_pages) + for i in range(num_pages): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True break diff --git a/crit/pyproject.toml b/crit/pyproject.toml index 9089f0a394..f0b185eb7a 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "crit" description = "CRiu Image Tool" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/crit/setup.cfg b/crit/setup.cfg index fbc9a51439..37895923f3 100644 --- a/crit/setup.cfg +++ b/crit/setup.cfg @@ -7,7 +7,7 @@ name = crit description = CRiu Image Tool author = CRIU team 
-author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: crit.__version__ diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 6cde03ee38..3ed5c9d635 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,14 +23,57 @@ #include "compel/infect.h" #include "pstree.h" +/* + * cr_user_pac_* are a copy of the corresponding uapi structs + * in arch/arm64/include/uapi/asm/ptrace.h + */ +struct cr_user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; + +struct cr_user_pac_generic_keys { + __uint128_t apgakey; +}; + +/* + * The following HWCAP constants are copied from + * arch/arm64/include/uapi/asm/hwcap.h + */ +#ifndef HWCAP_PACA +#define HWCAP_PACA (1 << 30) +#endif + +#ifndef HWCAP_PACG +#define HWCAP_PACG (1UL << 31) +#endif + +/* + * The following NT_ARM_PAC constants are copied from + * include/uapi/linux/elf.h + */ +#ifndef NT_ARM_PACA_KEYS +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#endif + +#ifndef NT_ARM_PACG_KEYS +#define NT_ARM_PACG_KEYS 0x408 +#endif + +#ifndef NT_ARM_PAC_ENABLED_KEYS +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. 
*/ +#endif + extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e static int save_pac_keys(int pid, CoreEntry *core) { - struct user_pac_address_keys paca; - struct user_pac_generic_keys pacg; + struct cr_user_pac_address_keys paca; + struct cr_user_pac_generic_keys pacg; PacKeys *pac_entry; long pac_enabled_key; struct iovec iov; @@ -224,8 +267,8 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) int arch_ptrace_restore(int pid, struct pstree_item *item) { unsigned long hwcaps = getauxval(AT_HWCAP); - struct user_pac_address_keys upaca; - struct user_pac_generic_keys upacg; + struct cr_user_pac_address_keys upaca; + struct cr_user_pac_generic_keys upacg; PacAddressKeys *paca; PacGenericKeys *pacg; long pac_enabled_keys; diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 64a9c24eb9..2174df4fa1 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ -#include +#include #include #include "asm/types.h" diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c index bb5b7256e2..b87230f40a 100644 --- a/criu/arch/ppc64/cpu.c +++ b/criu/arch/ppc64/cpu.c @@ -64,6 +64,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/s390/cpu.c b/criu/arch/s390/cpu.c index 3f430f4550..e227fad5e1 100644 --- a/criu/arch/s390/cpu.c +++ b/criu/arch/s390/cpu.c @@ -87,6 +87,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c 
index 96cef819e3..e08c838783 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -142,6 +142,29 @@ static void print_core_fp_regs(const char *msg, CoreEntry *core) print_core_ri_cb(core); } +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + /* * Allocate VxrsLow registers */ @@ -294,7 +317,13 @@ int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_stru CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; - fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + /* + * We delay allocating this until now because checkpointing can fail earlier. + * When it fails we need to know if we reached here or not so that the cleanup + * code doesn't restore FPRs that were never saved in the first place. 
+ */ + fpregs = allocate_fp_regs(); + CORE_THREAD_ARCH_INFO(core)->fpregs = fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { @@ -399,36 +428,15 @@ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) return 0; } -/* - * Allocate floating point registers - */ -static UserS390FpregsEntry *allocate_fp_regs(void) -{ - UserS390FpregsEntry *fpregs; - - fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - return NULL; - user_s390_fpregs_entry__init(fpregs); - - fpregs->n_fprs = 16; - fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); - if (!fpregs->fprs) - goto fail_free_fpregs; - return fpregs; - -fail_free_fpregs: - xfree(fpregs); - return NULL; -} - /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { - xfree(fpregs->fprs); - xfree(fpregs); + if (fpregs) { + xfree(fpregs->fprs); + xfree(fpregs); + } } /* @@ -487,15 +495,17 @@ int arch_alloc_thread_info(CoreEntry *core) ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; - ti_s390->fpregs = allocate_fp_regs(); - if (!ti_s390->fpregs) - goto fail_free_gp_regs; + + /* + * Delay allocating space until needed. Checkpointing can fail before that + * and the cleanup code needs to be able to tell if FPRs were saved or not + * before trying to restore the register state. + */ + ti_s390->fpregs = NULL; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; -fail_free_gp_regs: - free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; @@ -678,14 +688,18 @@ static int set_task_regs(pid_t pid, CoreEntry *core) user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); - /* Floating point registers */ + /* + * Floating point registers + * Optional on checkpoint; checkpoint may have failed and we may reach here as part of cleanup + * so there's no guarantee that we saved FPRs for this thread. 
+ */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; - if (!cfpregs) - return -1; - fpregs.prfpreg.fpc = cfpregs->fpc; - memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); - if (set_fp_regs(pid, &fpregs) < 0) - return -1; + if (cfpregs) { + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; + } /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index dfa31569fa..2e1f2de9ad 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -407,6 +407,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 7814c351d1..d113fd8abb 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -73,6 +73,23 @@ int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, int (*func)(void *arg), void *arg); #define arch_shstk_trampoline arch_shstk_trampoline +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + info->tmp_shstk = (unsigned long)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long __maybe_unused def) +{ + return !(info->cet & ARCH_SHSTK_SHSTK) ? 
def : (4UL << 30); +} +#define shstk_min_mmap_addr shstk_min_mmap_addr + #ifdef CR_NOGLIBC #include @@ -147,31 +164,51 @@ static inline int shstk_finalize(void) } /* - * Restore contents of the shadow stack and set shadow stack pointer + * Create shadow stack vma and restore its content from premmapped anonymous (non-shstk) vma */ -static always_inline int shstk_restore(struct rst_shstk_info *cet) +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) { - unsigned long *shstk_data = (unsigned long *)cet->premmaped_addr; - unsigned long ssp = cet->vma_start + cet->vma_size - 8; - unsigned long shstk_top = cet->vma_size / 8 - 1; - unsigned long val; + long shstk, i; + unsigned long *shstk_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); long ret; - if (!(cet->cet & ARCH_SHSTK_SHSTK)) - return 0; - - if (shstk_map(cet->vma_start, cet->vma_size)) + shstk = sys_map_shadow_stack(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack: %ld\n", shstk); return -1; + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + wrssq(shstk + i * 8, shstk_data[i]); + + ret = sys_munmap(shstk_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } /* - * Switch shadow stack from temporary location to the actual task's - * shadow stack VMA + * From that point premapped vma is (shstk) and we need + * to mremap() it to the final location. Originally premapped + * (shstk_data) has been unmapped already. 
*/ - shstk_switch_ssp(ssp); + vma_premmaped_start(vma_entry) = shstk; - /* restore shadow stack contents */ - for (; ssp >= cet->ssp; ssp -= 8, shstk_top--) - wrssq(ssp, shstk_data[shstk_top]); + return 0; +} +#define shstk_vma_restore shstk_vma_restore + +/* + * Restore contents of the shadow stack and set shadow stack pointer + */ +static always_inline int shstk_restore(struct rst_shstk_info *cet) +{ + unsigned long ssp, val; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; /* * Add tokens for sigreturn frame and for switch of the shadow stack. @@ -182,6 +219,7 @@ static always_inline int shstk_restore(struct rst_shstk_info *cet) */ /* token for sigreturn frame */ + ssp = cet->ssp - 8; val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; wrssq(ssp, val); @@ -193,12 +231,6 @@ static always_inline int shstk_restore(struct rst_shstk_info *cet) /* reset shadow stack pointer to the proper location */ shstk_switch_ssp(ssp); - ret = sys_munmap(shstk_data, cet->vma_size + PAGE_SIZE); - if (ret < 0) { - pr_err("Failed to unmap premmaped shadow stack\n"); - return ret; - } - return shstk_finalize(); } #define arch_shstk_restore shstk_restore diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c index b752f114a8..0810efac5e 100644 --- a/criu/arch/x86/shstk.c +++ b/criu/arch/x86/shstk.c @@ -45,7 +45,6 @@ static int shstk_prepare_task(struct vm_area_list *vmas, shstk->vma_start = vma->e->start; shstk->vma_size = size; shstk->premmaped_addr = premmaped_addr; - shstk->tmp_shstk = premmaped_addr + size; break; } diff --git a/criu/config.c b/criu/config.c index 1322a490ab..d7ef3f8e8b 100644 --- a/criu/config.c +++ b/criu/config.c @@ -18,6 +18,7 @@ #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" +#include "image.h" #include "irmap.h" #include "mount.h" #include "mount-v2.h" @@ -703,6 +704,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), BOOL_OPT("unprivileged", 
&opts.unprivileged), BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), {}, }; diff --git a/criu/cr-check.c b/criu/cr-check.c index 0388cbe7fe..7c3dc76dd8 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1392,6 +1392,14 @@ static int check_pagemap_scan(void) return 0; } +static int check_timer_cr_ids(void) +{ + if (!kdat.has_timer_cr_ids) + return -1; + + return 0; +} + /* musl doesn't have a statx wrapper... */ struct staty { __u32 stx_dev_major; @@ -1581,6 +1589,23 @@ static int check_overlayfs_maps(void) return status == 0 ? 0 : -1; } +static int check_breakpoints(void) +{ + if (!kdat.has_breakpoints) { + pr_warn("Hardware breakpoints don't seem to work\n"); + return -1; + } + + return 0; +} + +static int check_pagemap_scan_guard_pages(void) +{ + kerndat_warn_about_madv_guards(); + + return kdat.has_pagemap_scan_guard_pages ? 0 : -1; +} + static int (*chk_feature)(void); /* @@ -1608,6 +1633,7 @@ static int (*chk_feature)(void); return ret; \ } \ } while (0) + int cr_check(void) { struct ns_id *ns; @@ -1703,6 +1729,8 @@ int cr_check(void) ret |= check_ipv6_freebind(); ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); + ret |= check_timer_cr_ids(); + ret |= check_pagemap_scan_guard_pages(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1715,6 +1743,10 @@ int cr_check(void) ret |= check_autofs(); ret |= check_compat_cr(); } + /* + * Category 4 - optional. + */ + check_breakpoints(); pr_msg("%s\n", ret ? 
CHECK_MAYBE : CHECK_GOOD); return ret; @@ -1825,7 +1857,10 @@ static struct feature_list feature_list[] = { { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, { "pagemap_scan", check_pagemap_scan }, + { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, + { "breakpoints", check_breakpoints }, + { "pagemap_scan_guard_pages", check_pagemap_scan_guard_pages }, { NULL, NULL }, }; diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c index c0c21f53ee..feeb9ebb03 100644 --- a/criu/cr-dedup.c +++ b/criu/cr-dedup.c @@ -87,7 +87,8 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) if (ret <= 0) goto exit; - pr_debug("dedup iovec base=%" PRIx64 ", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); + pr_debug("dedup iovec %" PRIx64 " - %" PRIx64 "\n", + pr.pe->vaddr, pr.pe->vaddr + pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 302078caa0..60b8e793c9 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -130,6 +130,23 @@ int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap if (ret < 0) goto err; + /* + * In addition to real process VMAs we should keep an info about + * madvise(MADV_GUARD_INSTALL) pages. While these are not represented + * as a struct vm_area_struct in the kernel, it is convenient to treat + * them as mappings in CRIU and reuse the same VMA images but with only + * VMA_AREA_GUARD flag set. + * + * Also, we don't need to dump them during pre-dump. 
+ */ + if (dump_file) { + ret = collect_madv_guards(pid, vma_area_list); + if (ret < 0) { + pr_err("Collect MADV_GUARD_INSTALL pages (pid: %d) failed with %d\n", pid, ret); + goto err; + } + } + pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); @@ -1396,7 +1413,7 @@ static int dump_zombies(void) item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; - BUG_ON(!list_empty(&item->children)); + BUG_ON(has_children(item)); if (!item->sid) { pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", @@ -2124,6 +2141,8 @@ int cr_dump_tasks(pid_t pid) int pre_dump_ret = 0; int ret = -1; + kerndat_warn_about_madv_guards(); + pr_info("========================================\n"); pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); @@ -2300,6 +2319,10 @@ int cr_dump_tasks(pid_t pid) goto err; he.has_pre_dump_mode = false; + if (found_uprobes_vma()) { + he.has_allow_uprobes = true; + he.allow_uprobes = true; + } ret = write_img_inventory(&he); if (ret) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1f4881dab0..057ec0e93d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1820,6 +1820,7 @@ static int restore_rseq_cs(void) static int catch_tasks(bool root_seized) { struct pstree_item *item; + bool nobp = fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints; for_each_pstree_item(item) { int status, i, ret; @@ -1847,7 +1848,7 @@ static int catch_tasks(bool root_seized) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, nobp); if (ret < 0) return -1; } @@ -2119,7 +2120,7 @@ static int restore_root_task(struct pstree_item *init) * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. 
*/ - ret = network_lock_internal(); + ret = network_lock_internal(/* restore = */ true); if (ret) goto out_kill; } @@ -2131,6 +2132,9 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: + ret = run_plugins(POST_FORKING); + if (ret < 0 && ret != -ENOTSUP) + goto out_kill; ret = restore_wait_inprogress_tasks(); if (ret < 0) @@ -2258,7 +2262,7 @@ static int restore_root_task(struct pstree_item *init) * might actually be a true error code but that would be also * captured in the plugin so no need to print the error here. */ - if (ret < 0) + if (ret < 0 && ret != -ENOTSUP) pr_debug("restore late stage hook for external plugin failed\n"); } @@ -2362,41 +2366,47 @@ int cr_restore_tasks(void) return 1; if (check_img_inventory(/* restore = */ true) < 0) - goto err; - - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) return -1; if (init_stats(RESTORE_STATS)) - goto err; + return -1; if (lsm_check_opts()) - goto err; + return -1; timing_start(TIME_RESTORE); if (cpu_init() < 0) - goto err; + return -1; if (vdso_init_restore()) - goto err; + return -1; if (tty_init_restore()) - goto err; + return -1; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_validate_cpuinfo()) - goto err; + return -1; } if (prepare_task_entries() < 0) - goto err; + return -1; if (prepare_pstree() < 0) - goto err; + return -1; if (fdstore_init()) - goto err; + return -1; + + /* + * For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store + * its socket file descriptor. This allows the main process and the target process to + * communicate with each other through this file descriptor. Therefore, cr_plugin_init + * must be initialized after fdstore_init. 
+ */ + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; if (inherit_fd_move_to_fdstore()) goto err; @@ -2421,23 +2431,23 @@ int cr_restore_tasks(void) return ret; } -static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long vma_len) +static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long min_addr, long vma_len) { struct vma_area *t_vma, *s_vma; - long prev_vma_end = 0; + long prev_vma_end = min_addr; struct vma_area end_vma; VmaEntry end_e; end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; - prev_vma_end = kdat.mmap_min_addr; s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { - if (s_vma->list.next == self_vma_list) { + if ((s_vma->list.next == self_vma_list) || + vma_area_is(vma_next(s_vma), VMA_AREA_GUARD)) { s_vma = &end_vma; continue; } @@ -2450,7 +2460,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he } if (prev_vma_end + vma_len > t_vma->e->start) { - if (t_vma->list.next == tgt_vma_list) { + if ((t_vma->list.next == tgt_vma_list) || + vma_area_is(vma_next(t_vma), VMA_AREA_GUARD)) { t_vma = &end_vma; continue; } @@ -2559,6 +2570,17 @@ static int remap_restorer_blob(void *addr) restorer_setup_c_header_desc(&pbd, true); compel_relocs_apply(addr, addr, &pbd); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. 
+ */ + __builtin___clear_cache(addr, addr + pbd.hdr.bsize); + return 0; } @@ -3173,7 +3195,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); - task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; + task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size + shstk_restorer_stack_size(); BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); @@ -3203,7 +3225,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * or inited from scratch). */ - mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, task_args->bootstrap_len); + mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, + shstk_min_mmap_addr(&task_args->shstk, kdat.mmap_min_addr), + task_args->bootstrap_len); if (mem == (void *)-1) { pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; @@ -3442,6 +3466,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * self-vmas are unmaped. 
*/ mem += rst_mem_size; + + shstk_set_restorer_stack(&task_args->shstk, mem); + mem += shstk_restorer_stack_size(); + task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; diff --git a/criu/cr-service.c b/criu/cr-service.c index b9d11ced22..b4718dde2b 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -283,185 +283,159 @@ int exec_rpc_query_external_files(char *name, int sk) return ret; } -static char images_dir[PATH_MAX]; - -static int setup_opts_from_req(int sk, CriuOpts *req) +static int resolve_images_dir_path(char *images_dir_path, + bool imgs_changed_by_rpc_conf, + const CriuOpts *req, + pid_t peer_pid) { - struct ucred ids; - struct stat st; - socklen_t ids_len = sizeof(struct ucred); - char images_dir_path[PATH_MAX]; - char work_dir_path[PATH_MAX]; - char status_fd[PATH_MAX]; - bool output_changed_by_rpc_conf = false; - bool work_changed_by_rpc_conf = false; - bool imgs_changed_by_rpc_conf = false; - int i; - bool dummy = false; - - if (getsockopt(sk, SOL_SOCKET, SO_PEERCRED, &ids, &ids_len)) { - pr_perror("Can't get socket options"); - goto err; - } - - if (fstat(sk, &st)) { - pr_perror("Can't get socket stat"); - goto err; - } - - BUG_ON(st.st_ino == -1); - service_sk_ino = st.st_ino; - - /* - * Evaluate an additional configuration file if specified. - * This needs to happen twice, because it is needed early to detect - * things like work_dir, imgs_dir and logfile. The second parsing - * of the optional RPC configuration file happens at the end and - * overwrites all options set via RPC. 
- */ - if (req->config_file) { - char *tmp_output = opts.output; - char *tmp_work = opts.work_dir; - char *tmp_imgs = opts.imgs_dir; - - opts.output = NULL; - opts.work_dir = NULL; - opts.imgs_dir = NULL; - - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) { - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - goto err; - } - /* If this is non-NULL, the RPC configuration file had a value, use it.*/ - if (opts.output) - output_changed_by_rpc_conf = true; - /* If this is NULL, use the old value if it was set. */ - if (!opts.output && tmp_output) { - opts.output = tmp_output; - tmp_output = NULL; - } - - if (opts.work_dir) - work_changed_by_rpc_conf = true; - if (!opts.work_dir && tmp_work) { - opts.work_dir = tmp_work; - tmp_work = NULL; - } - - if (opts.imgs_dir) - imgs_changed_by_rpc_conf = true; - /* - * As the images directory is a required RPC setting, it is not - * necessary to use the value from other configuration files. - * Either it is set in the RPC configuration file or it is set - * via RPC. - */ - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - } - /* - * open images_dir - images_dir_fd is a required RPC parameter + * images_dir_fd is a required RPC parameter with -1 as default value. * - * This assumes that if opts.imgs_dir is set we have a value + * This assumes that if opts.imgs_dir is set, we have a value * from the configuration file parser. The test to see that * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. - * The idea is that only the RPC configuration file is able to - * overwrite RPC settings: + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: * * apply_config(global_conf) * * apply_config(user_conf) * * apply_config(environment variable) * * apply_rpc_options() * * apply_config(rpc_conf) */ - if (imgs_changed_by_rpc_conf) + if (imgs_changed_by_rpc_conf) { strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else if (req->images_dir_fd != -1) - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - else if (req->images_dir) + images_dir_path[PATH_MAX - 1] = '\0'; + } else if (req->images_dir_fd != -1) { + snprintf(images_dir_path, PATH_MAX, "/proc/%d/fd/%d", peer_pid, req->images_dir_fd); + } else if (req->images_dir) { strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - else { + images_dir_path[PATH_MAX - 1] = '\0'; + } else { + /* + * Since images dir is not required in CHECK mode, we need to + * check for work_dir_fd in setup_images_and_workdir() + */ + if (opts.mode == CR_CHECK) + return 0; pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); - goto err; + return -1; } - if (req->parent_img) - SET_CHAR_OPTS(img_parent, req->parent_img); + return 0; +} - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - goto err; - } +static int setup_images_and_workdir(const char *images_dir_path, + bool work_changed_by_rpc_conf, + CriuOpts *req, + pid_t peer_pid) +{ + char work_dir_path[PATH_MAX] = ""; - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; + /* We don't need to open images dir in CHECK mode. 
*/ + if (opts.mode != CR_CHECK) { + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. + */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } } - /* chdir to work dir */ if (work_changed_by_rpc_conf) - /* Use the value from the RPC configuration file first. */ strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); else if (req->has_work_dir_fd) - /* Use the value set via RPC. */ - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); + sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); else if (opts.work_dir) - /* Use the value from one of the other configuration files. */ strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - /* Use the images directory a work directory. */ + else if (images_dir_path[0] != '\0') strcpy(work_dir_path, images_dir_path); + if (work_dir_path[0] == '\0') { + pr_err("images-dir or work-dir is required when using log file\n"); + return -1; + } + if (chdir(work_dir_path)) { pr_perror("Can't chdir to work_dir"); - goto err; + return -1; } - /* initiate log file in work dir */ + return 0; +} + +static int setup_logging_from_req(CriuOpts *req, bool output_changed_by_rpc_conf) +{ if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. 
- */ if (strchr(req->log_file, '/')) { pr_perror("No subdirs are allowed in log_file name"); - goto err; + return -1; } - SET_CHAR_OPTS(output, req->log_file); } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { xfree(opts.output); - opts.output = NULL; + opts.output = NULL; /* log_init(NULL) writes to stderr */ } else if (!opts.output) { SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); } - /* This is needed later to correctly set the log_level */ opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { + log_set_loglevel(opts.log_level); + if (log_init(opts.output)) { pr_perror("Can't initiate log"); + return -1; + } + + return 0; +} + +static int setup_opts_from_req(int sk, CriuOpts *req) +{ + struct ucred ids; + struct stat st; + socklen_t ids_len = sizeof(struct ucred); + char images_dir_path[PATH_MAX] = ""; + char status_fd[PATH_MAX]; + bool output_changed_by_rpc_conf = false; + bool work_changed_by_rpc_conf = false; + bool imgs_changed_by_rpc_conf = false; + int i; + bool dummy = false; + + if (getsockopt(sk, SOL_SOCKET, SO_PEERCRED, &ids, &ids_len)) { + pr_perror("Can't get socket options"); goto err; } - if (req->config_file) { - pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + /* + * The options relevant in CHECK mode are: log_file, log_to_stderr, and log_level. + * When logging to a file, we also need to resolve images_dir and work_dir. 
+ */ + if (opts.mode == CR_CHECK) { + if (!req) + return 0; /* nothing to do */ + + /* + * A log file is needed only if: + * - log_file is explicitly set, or + * - log_to_stderr is NOT requested (i.e., using DEFAULT_LOG_FILENAME) + */ + if (!req->log_file || (req->has_log_to_stderr && req->log_to_stderr)) + return 0; /* no log file, don't require images_dir or work_dir */ } + if (fstat(sk, &st)) { + pr_perror("Can't get socket stat"); + goto err; + } + + BUG_ON(st.st_ino == -1); + service_sk_ino = st.st_ino; + if (req->has_unprivileged) opts.unprivileged = req->unprivileged; @@ -753,14 +727,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } - - if (req->n_irmap_scan_paths) { - for (i = 0; i < req->n_irmap_scan_paths; i++) { - if (irmap_scan_path_add(req->irmap_scan_paths[i])) - goto err; - } - } - if (req->has_status_fd) { pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); @@ -781,19 +747,79 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_display_stats) opts.display_stats = req->display_stats; - /* Evaluate additional configuration file a second time to overwrite - * all RPC settings. */ + /* Evaluate additional configuration file (e.g., runc.conf) to overwrite all RPC settings. */ if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + + opts.output = NULL; + opts.work_dir = NULL; + + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. 
+ */ + xfree(opts.imgs_dir); + opts.imgs_dir = NULL; + + pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + rpc_cfg_file = req->config_file; i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) + if (i) { + xfree(tmp_output); + xfree(tmp_work); goto err; + } + + /* If opts.{output,work_dir} is non-NULL, the RPC configuration file had a value, use it.*/ + /* If opts.{output,work_dir} is NULL, use the old value if it was set. */ + if (opts.output) { + output_changed_by_rpc_conf = true; + } else { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) { + work_changed_by_rpc_conf = true; + } else { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + + xfree(tmp_output); + xfree(tmp_work); + } + + if (resolve_images_dir_path(images_dir_path, imgs_changed_by_rpc_conf, req, ids.pid) < 0) + goto err; + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + if (setup_images_and_workdir(images_dir_path, work_changed_by_rpc_conf, req, ids.pid)) + goto err; + + if (req->n_irmap_scan_paths) { + for (i = 0; i < req->n_irmap_scan_paths; i++) { + if (irmap_scan_path_add(req->irmap_scan_paths[i])) + goto err; + } } + /* initiate log file in work dir */ + if (setup_logging_from_req(req, output_changed_by_rpc_conf)) + goto err; + if (req->mntns_compat_mode) opts.mntns_compat_mode = true; - log_set_loglevel(opts.log_level); if (check_options()) goto err; @@ -813,7 +839,7 @@ static int dump_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -856,7 +882,7 @@ static int restore_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("restore --rpc -D %s", images_dir); + __setproctitle("restore --rpc"); if 
(cr_restore_tasks()) goto exit; @@ -895,6 +921,11 @@ static int check(int sk, CriuOpts *req) resp.type = CRIU_REQ_TYPE__CHECK; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -919,6 +950,7 @@ static int check(int sk, CriuOpts *req) resp.success = true; out: + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -927,6 +959,11 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) int pid, status; bool success = false; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -940,7 +977,7 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) if (setup_opts_from_req(sk, req)) goto cout; - __setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("pre-dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -1005,6 +1042,11 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; @@ -1078,6 +1120,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1252,6 +1295,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) bool success = false; int pid, status; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1261,12 +1309,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; - opts.mode = CR_CPUINFO; + opts.mode = (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ? 
CR_CPUINFO_DUMP : CR_CPUINFO_CHECK; if (setup_opts_from_req(sk, msg->opts)) goto cout; - __setproctitle("cpuinfo %s --rpc -D %s", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", - images_dir); + __setproctitle("cpuinfo %s --rpc", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check"); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); @@ -1301,7 +1348,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) out: resp.type = msg->type; resp.success = success; - + set_resp_err(&resp); return send_criu_msg(sk, &resp); } diff --git a/criu/crtools.c b/criu/crtools.c index 6f493850b9..e207133ac0 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -54,19 +54,17 @@ void flush_early_log_to_stderr(void) flush_early_log_buffer(STDERR_FILENO); } -static int image_dir_mode(char *argv[], int optind) +static int image_dir_mode(void) { switch (opts.mode) { case CR_DUMP: /* fallthrough */ + case CR_CPUINFO_DUMP: + /* fallthrough */ case CR_PRE_DUMP: return O_DUMP; case CR_RESTORE: return O_RSTR; - case CR_CPUINFO: - if (!strcmp(argv[optind + 1], "dump")) - return O_DUMP; - /* fallthrough */ default: return -1; } @@ -76,36 +74,55 @@ static int image_dir_mode(char *argv[], int optind) return -1; } -static int parse_criu_mode(char *mode) +struct { + char *cmd; + int mode; +} commands[] = { + { "dump", CR_DUMP }, + { "pre-dump", CR_PRE_DUMP }, + { "restore", CR_RESTORE }, + { "lazy-pages", CR_LAZY_PAGES }, + { "check", CR_CHECK }, + { "page-server", CR_PAGE_SERVER }, + { "service", CR_SERVICE }, + { "swrk", CR_SWRK }, + { "dedup", CR_DEDUP }, + { "exec", CR_EXEC_DEPRECATED }, + { "show", CR_SHOW_DEPRECATED }, +}; + +static int parse_criu_mode(int argc, char **argv, int *optind) { - if (!strcmp(mode, "dump")) - opts.mode = CR_DUMP; - else if (!strcmp(mode, "pre-dump")) - opts.mode = CR_PRE_DUMP; - else if (!strcmp(mode, "restore")) - opts.mode = CR_RESTORE; - else if (!strcmp(mode, "lazy-pages")) - opts.mode = CR_LAZY_PAGES; - else if (!strcmp(mode, 
"check")) - opts.mode = CR_CHECK; - else if (!strcmp(mode, "page-server")) - opts.mode = CR_PAGE_SERVER; - else if (!strcmp(mode, "service")) - opts.mode = CR_SERVICE; - else if (!strcmp(mode, "swrk")) - opts.mode = CR_SWRK; - else if (!strcmp(mode, "dedup")) - opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo")) - opts.mode = CR_CPUINFO; - else if (!strcmp(mode, "exec")) - opts.mode = CR_EXEC_DEPRECATED; - else if (!strcmp(mode, "show")) - opts.mode = CR_SHOW_DEPRECATED; - else - return -1; + char *cmd = argv[*optind]; + bool has_sub_command = (argc - *optind) > 1; + char *subcommand = has_sub_command ? argv[*optind + 1] : NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(cmd, commands[i].cmd)) + continue; + opts.mode = commands[i].mode; + return 0; + } - return 0; + if (!strcmp(cmd, "cpuinfo")) { + if (subcommand == NULL) { + pr_err("cpuinfo requires an action: dump or check\n"); + return -1; + } + if (!strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; + else { + pr_err("unknown cpuinfo sub-command: %s\n", subcommand); + return -1; + } + (*optind)++; + return 0; + } + pr_err("unknown command: %s\n", argv[*optind]); + return -1; } int main(int argc, char *argv[], char *envp[]) @@ -115,6 +132,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; + char *cmd; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -165,10 +183,11 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - if (parse_criu_mode(argv[optind])) { - pr_err("unknown command: %s\n", argv[optind]); + cmd = argv[optind]; + ret = parse_criu_mode(argc, argv, &optind); + if (ret) goto usage; - } + /* * util_init initializes criu_run_id and compel_run_id so that sockets * are generated with an unique name identifying the specific process @@ -223,25 +242,19 @@ 
int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else { - /* No subcommands except for cpuinfo and restore --exec-cmd */ - if (opts.mode != CR_CPUINFO && has_sub_command) { - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); - goto usage; - } else if (opts.mode == CR_CPUINFO && !has_sub_command) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } + } else if (has_sub_command) { + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", cmd); + goto usage; } - if (opts.stream && image_dir_mode(argv, optind) == -1) { - pr_err("--stream cannot be used with the %s command\n", argv[optind]); + if (opts.stream && image_dir_mode() == -1) { + pr_err("--stream cannot be used with the %s command\n", cmd); goto usage; } /* We must not open imgs dir, if service is called */ if (opts.mode != CR_SERVICE) { - ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); + ret = open_image_dir(opts.imgs_dir, image_dir_mode()); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; @@ -286,14 +299,13 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - if (opts.mode == CR_DUMP) { + switch (opts.mode) { + case CR_DUMP: if (!opts.tree_id) goto opt_pid_missing; return cr_dump_tasks(opts.tree_id); - } - - if (opts.mode == CR_PRE_DUMP) { + case CR_PRE_DUMP: if (!opts.tree_id) goto opt_pid_missing; @@ -303,9 +315,7 @@ int main(int argc, char *argv[], char *envp[]) } return cr_pre_dump_tasks(opts.tree_id) != 0; - } - - if (opts.mode == CR_RESTORE) { + case CR_RESTORE: if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -318,46 +328,41 @@ int main(int argc, char *argv[], char *envp[]) } return ret != 0; - } - if (opts.mode == 
CR_LAZY_PAGES) + case CR_LAZY_PAGES: return cr_lazy_pages(opts.daemon_mode) != 0; - if (opts.mode == CR_CHECK) + case CR_CHECK: return cr_check() != 0; - if (opts.mode == CR_PAGE_SERVER) + case CR_PAGE_SERVER: return cr_page_server(opts.daemon_mode, false, -1) != 0; - if (opts.mode == CR_SERVICE) + case CR_SERVICE: return cr_service(opts.daemon_mode); - if (opts.mode == CR_DEDUP) + case CR_DEDUP: return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO) { - if (!argv[optind + 1]) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } - if (!strcmp(argv[optind + 1], "dump")) - return cpuinfo_dump(); - else if (!strcmp(argv[optind + 1], "check")) - return cpuinfo_check(); - } + case CR_CPUINFO_DUMP: + return cpuinfo_dump(); + + case CR_CPUINFO_CHECK: + return cpuinfo_check(); - if (opts.mode == CR_EXEC_DEPRECATED) { + case CR_EXEC_DEPRECATED: pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; - } - if (opts.mode == CR_SHOW_DEPRECATED) { + case CR_SHOW_DEPRECATED: pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; - } - pr_err("unknown command: %s\n", argv[optind]); + case CR_UNSET: + default: + pr_err("unknown command: %s\n", cmd); + } usage: pr_msg("\n" "Usage:\n" @@ -421,7 +426,7 @@ int main(int argc, char *argv[], char *envp[]) " --network-lock METHOD network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" - " consult documentation for further details\n" + " --allow-uprobes allow dump/restore with uprobes vma\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/files.c b/criu/files.c index f16ec32a23..af4b8aeac8 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1329,7 +1329,6 @@ int prepare_fds(struct pstree_item *me) } } - 
BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) diff --git a/criu/image.c b/criu/image.c index f3747d6ff5..91101c3ebf 100644 --- a/criu/image.c +++ b/criu/image.c @@ -95,6 +95,11 @@ int check_img_inventory(bool restore) goto out_err; } + if (restore && he->allow_uprobes && !opts.allow_uprobes) { + pr_err("Dumped with --" OPT_ALLOW_UPROBES ". Need to set it on restore as well.\n"); + goto out_err; + } + if (restore) { if (!he->has_network_lock_method) { /* @@ -712,7 +717,7 @@ struct cr_img *img_from_fd(int fd) * This is used when opts.stream is enabled for picking the right streamer * socket name. `mode` is ignored when opts.stream is not enabled. */ -int open_image_dir(char *dir, int mode) +int open_image_dir(const char *dir, int mode) { int fd, ret; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index ab0bd8fa36..8c5707b415 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -125,7 +125,8 @@ enum criu_mode { CR_SERVICE, CR_SWRK, CR_DEDUP, - CR_CPUINFO, + CR_CPUINFO_DUMP, + CR_CPUINFO_CHECK, CR_EXEC_DEPRECATED, CR_SHOW_DEPRECATED, }; @@ -195,6 +196,7 @@ struct cr_options { char *work_dir; int network_lock_method; int skip_file_rwx_check; + int allow_uprobes; /* * When we scheduler for removal some functionality we first diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 392ea9f534..9fb21a4497 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -60,6 +60,8 @@ enum { CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__POST_FORKING = 12, + CR_PLUGIN_HOOK__MAX }; @@ -78,6 +80,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); enum { 
CR_PLUGIN_STAGE__DUMP, @@ -152,5 +155,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); typedef int(cr_plugin_resume_devices_late_t)(int pid); +typedef int(cr_plugin_post_forking_t)(void); #endif /* __CRIU_PLUGIN_H__ */ diff --git a/criu/include/image.h b/criu/include/image.h index afa7d5e12f..30e32323d5 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -68,6 +68,18 @@ * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. + * - guard + * stands for a fake VMA (not represented in the kernel + * by a struct vm_area_struct). Used to keep an information + * about virtual address space ranges covered by + * MADV_GUARD_INSTALL guards. These ones must be always at + * the end of the vma_area_list and properly skipped a.e. + * - uprobes + * stands for a "[uprobes]" vma that's automatically mapped by + * the kernel when an active uprobe is hit. Contents of this vma + * are not dumped and neither are its madvise bits restored, + * because the kernel is in complete control of this vma. This is + * just used to track the existence of the uprobes vma. 
*/ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -87,6 +99,8 @@ #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) +#define VMA_AREA_GUARD (1 << 16) +#define VMA_AREA_UPROBES (1 << 17) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) @@ -100,6 +114,8 @@ #define CR_PARENT_LINK "parent" +#define OPT_ALLOW_UPROBES "allow-uprobes" + extern bool ns_per_id; extern bool img_common_magic; @@ -149,7 +165,7 @@ static inline int img_raw_fd(struct cr_img *img) extern off_t img_raw_size(struct cr_img *img); -extern int open_image_dir(char *dir, int mode); +extern int open_image_dir(const char *dir, int mode); extern void close_image_dir(void); /* * Return -1 -- parent symlink points to invalid target diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index e03a573419..e4922f401d 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -89,6 +89,10 @@ struct kerndat_s { bool has_pagemap_scan; bool has_shstk; bool has_close_range; + bool has_timer_cr_ids; + bool has_breakpoints; + bool has_madv_guard; + bool has_pagemap_scan_guard_pages; }; extern struct kerndat_s kdat; @@ -111,4 +115,6 @@ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_has_nspid(void); +extern void kerndat_warn_about_madv_guards(void); + #endif /* __CR_KERNDAT_H__ */ diff --git a/criu/include/mem.h b/criu/include/mem.h index 3618c9cc3b..e9ce3518ae 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -31,10 +31,12 @@ extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); +extern int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list); #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) #define PME_FILE (1ULL << 61) +#define 
PME_GUARD_REGION (1ULL << 58) #define PME_SOFT_DIRTY (1ULL << 55) #define PME_PSHIFT_BITS (6) #define PME_STATUS_BITS (3) @@ -49,5 +51,11 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); +struct page_info { + u64 next; + bool softdirty; +}; + +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info); + #endif /* __CR_MEM_H__ */ diff --git a/criu/include/mman.h b/criu/include/mman.h index 8ca71fadf9..43e0b6cc7a 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -4,6 +4,9 @@ #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -13,5 +16,11 @@ #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif #endif /* __CR_MMAN_H__ */ diff --git a/criu/include/net.h b/criu/include/net.h index 5e8a848620..7c5ede21e1 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(void); +extern int network_lock_internal(bool restore); extern struct ns_desc net_ns_desc; diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index 15178c0150..65292b7ab1 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -92,9 +92,9 @@ struct kernel_pipe_buffer { struct page_pipe_buf { int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ - unsigned int pipe_off; /* where this buf is started in a pipe */ - unsigned int pages_in; /* how many pages are there */ unsigned int nr_segs; /* how many 
iov-s are busy */ + unsigned long pipe_off; /* where this buf is started in a pipe */ + unsigned long pages_in; /* how many pages are there */ #define PPB_LAZY (1 << 0) unsigned int flags; struct iovec *iov; /* vaddr:len map */ @@ -149,7 +149,7 @@ struct pipe_read_dest { }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); -extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 36fe670928..0d9b350194 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -69,9 +69,9 @@ extern int check_parent_page_xfer(int fd_type, unsigned long id); */ /* async request/receive of remote pages */ -extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); +extern int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages); -typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); -extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, unsigned long nr_pages, void *); +extern int page_server_start_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9c..4cbc87cc6d 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -44,7 +44,7 @@ struct page_read { /* reads page from current pagemap */ - int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); + int (*read_pages)(struct page_read *, unsigned 
long vaddr, unsigned long nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct page_read *pr); void (*close)(struct page_read *); @@ -52,8 +52,8 @@ struct page_read { int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); - int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); - int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); + int (*io_complete)(struct page_read *, unsigned long vaddr, unsigned long nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h index 0ad4c9bc0b..9046e01edf 100644 --- a/criu/include/pagemap_scan.h +++ b/criu/include/pagemap_scan.h @@ -14,6 +14,7 @@ #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) #define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) /* * struct page_region - Page region with flags diff --git a/criu/include/parasite.h b/criu/include/parasite.h index b33d6710f8..1763577111 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -63,7 +63,7 @@ struct parasite_dump_pages_args { unsigned int add_prot; unsigned int off; unsigned int nr_segs; - unsigned int nr_pages; + unsigned long nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) diff --git a/criu/include/prctl.h b/criu/include/prctl.h index f5f23c9692..2966659dad 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -97,4 +97,11 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 #endif +#ifndef PR_TIMER_CREATE_RESTORE_IDS +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 
+#endif + #endif /* __CR_PRCTL_H__ */ diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0bd79bf553..76d3242d2b 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -105,4 +105,6 @@ extern int parse_uptime(uint64_t *upt); extern int parse_timens_offsets(struct timespec *boff, struct timespec *moff); +extern bool found_uprobes_vma(void); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 1137046d43..b750a919e6 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; +extern bool has_children(struct pstree_item *item); extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) diff --git a/criu/include/restore.h b/criu/include/restore.h index 04d0065051..1890518263 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -9,6 +9,7 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); struct task_restore_args; struct pstree_item; +struct rst_shstk_info; #ifndef arch_shstk_prepare static inline int arch_shstk_prepare(struct pstree_item *item, @@ -38,4 +39,25 @@ static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *cor #define arch_shstk_trampoline arch_shstk_trampoline #endif +#ifndef shstk_restorer_stack_size +static always_inline long shstk_restorer_stack_size(void) +{ + return 0; +} +#endif + +#ifndef shstk_set_restorer_stack +static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + return 0; +} +#endif + +#ifndef shstk_min_mmap_addr +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long def) +{ + return def; +} +#endif + #endif diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 
a4fb7ea794..14c0a37680 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -170,6 +170,7 @@ struct task_restore_args { struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; + bool posix_timer_cr_ids; struct restore_timerfd *timerfd; unsigned int timerfd_n; @@ -356,4 +357,11 @@ static inline int arch_shstk_restore(struct rst_shstk_info *shstk) #define arch_shstk_restore arch_shstk_restore #endif +#ifndef shstk_vma_restore +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + return -1; +} +#endif + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/include/sockets.h b/criu/include/sockets.h index c3e7c879a7..6c81d3edd7 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -25,7 +25,7 @@ struct socket_desc { }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); -extern int dump_socket_opts(int sk, SkOptsEntry *soe); +extern int dump_socket_opts(int sk, int family, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index cb3eba8174..2d689a9a04 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -37,6 +37,6 @@ enum { #define CTL_FLAGS_OPTIONAL 1 #define CTL_FLAGS_HAS 2 #define CTL_FLAGS_READ_EIO_SKIP 4 -#define CTL_FLAGS_IPC_EACCES_SKIP 5 +#define CTL_FLAGS_IPC_EACCES_SKIP 8 #endif /* __CR_SYSCTL_H__ */ diff --git a/criu/kerndat.c b/criu/kerndat.c index 5939005a41..997181ce75 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -31,6 +31,7 @@ #include "kerndat.h" #include "fs-magic.h" #include "mem.h" +#include "mman.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" @@ -86,6 +87,10 @@ static int check_pagemap(void) if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { pr_debug("PAGEMAP_SCAN is supported\n"); kdat.has_pagemap_scan = true; + + args.return_mask |= PAGE_IS_GUARD; + if (ioctl(fd, 
PAGEMAP_SCAN, &args) == 0) + kdat.has_pagemap_scan_guard_pages = true; } else { switch (errno) { case EINVAL: @@ -1720,6 +1725,134 @@ static int kerndat_has_close_range(void) return 0; } +static int kerndat_has_timer_cr_ids(void) +{ + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) == -1) { + if (errno == EINVAL) { + pr_debug("PR_TIMER_CREATE_RESTORE_IDS isn't supported\n"); + return 0; + } + pr_perror("prctl returned unexpected error code"); + return -1; + } + + kdat.has_timer_cr_ids = true; + return 0; +} + +static void breakpoint_func(void) +{ + if (raise(SIGSTOP)) + pr_perror("Unable to kill itself with SIGSTOP"); + exit(1); +} + +/* + * kerndat_breakpoints checks that hardware breakpoints work as they should. + * In some cases, they might not work in virtual machines if the hypervisor + * doesn't virtualize them. For example, they don't work in AMD SEV virtual + * machines if the Debug Virtualization extension isn't supported or isn't + * enabled in SEV_FEATURES. 
+ */ +static int kerndat_breakpoints(void) +{ + int status, ret, exit_code = -1; + pid_t pid; + + pid = fork(); + if (pid == -1) { + pr_perror("fork"); + return -1; + } + if (pid == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + pr_perror("ptrace(PTRACE_TRACEME)"); + exit(1); + } + raise(SIGSTOP); + breakpoint_func(); + exit(1); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for initial stop"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child didn't stop as expected: status=%x\n", status); + goto err; + } + ret = ptrace_set_breakpoint(pid, &breakpoint_func); + if (ret < 0) { + pr_err("Failed to set breakpoint\n"); + goto err; + } + if (ret == 0) { + pr_debug("Hardware breakpoints appear to be disabled\n"); + goto out; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for breakpoint trigger"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) { + pr_warn("Hardware breakpoints don't seem to work (status=%x)\n", status); + goto out; + } + kdat.has_breakpoints = true; +out: + exit_code = 0; +err: + if (kill(pid, SIGKILL)) { + pr_perror("Failed to kill the child process"); + exit_code = -1; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Failed to wait for the child process"); + exit_code = -1; + } + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { + pr_err("The child exited with unexpected code: %x\n", status); + exit_code = -1; + } + return exit_code; +} + +static int kerndat_has_madv_guard(void) +{ + void *map; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap a page for has_madv_guard feature test"); + return -1; + } + + if (madvise(map, PAGE_SIZE, MADV_GUARD_INSTALL)) { + if (errno != EINVAL) { + pr_perror("madvise failed (has_madv_guard check)"); + goto mmap_cleanup; + } + } else { + kdat.has_madv_guard = true; + } + + munmap(map, PAGE_SIZE); + return 0; 
+ +mmap_cleanup: + munmap(map, PAGE_SIZE); + return -1; +} + +void kerndat_warn_about_madv_guards(void) +{ + if (kdat.has_madv_guard && !kdat.has_pagemap_scan_guard_pages) + pr_warn("ioctl(PAGEMAP_SCAN) doesn't support PAGE_IS_GUARD flag. " + "CRIU dump will fail if dumped processes use madvise(MADV_GUARD_INSTALL). " + "Please, consider updating your kernel.\n"); +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1981,6 +2114,17 @@ int kerndat_init(void) pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_timer_cr_ids()) { + pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + } + if (!ret && kerndat_breakpoints()) { + pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_madv_guard()) { + pr_err("kerndat_has_madv_guard has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/log.c b/criu/log.c index 70e267fd65..a02a8df204 100644 --- a/criu/log.c +++ b/criu/log.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -114,6 +115,9 @@ static struct str_and_lock *first_err; int log_keep_err(void) { + if (first_err) + return 0; + first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; diff --git a/criu/lsm.c b/criu/lsm.c index 70b66d42ee..5faf3e5b2f 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -29,7 +29,9 @@ static int apparmor_get_label(pid_t pid, char **profile_name) FILE *f; char *space; - f = fopen_proc(pid, "attr/current"); + f = fopen_proc(pid, "attr/apparmor/current"); + if (!f) + f = fopen_proc(pid, "attr/current"); if (!f) return -1; diff --git a/criu/mem.c b/criu/mem.c index c9578ef441..9e8740c070 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -10,6 +10,7 @@ #include "cr_options.h" #include "servicefd.h" #include "mem.h" 
+#include "mman.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" @@ -114,44 +115,74 @@ static bool should_dump_entire_vma(VmaEntry *vmae) } /* - * should_dump_page returns vaddr if an addressed page has to be dumped. - * Otherwise, it returns an address that has to be inspected next. + * should_dump_page writes vaddr in page_info->next if an addressed page has to be dumped. + * Otherwise, it writes an address that has to be inspected next. */ -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info) { + if (!page_info) + goto err; + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) - return -1; + goto err; if (pmc->regs) { while (1) { - if (pmc->regs_idx == pmc->regs_len) - return pmc->end; + if (pmc->regs_idx == pmc->regs_len) { + page_info->next = pmc->end; + return 0; + } + if (vaddr < pmc->regs[pmc->regs_idx].end) break; pmc->regs_idx++; } - if (vaddr < pmc->regs[pmc->regs_idx].start) - return pmc->regs[pmc->regs_idx].start; - if (softdirty) - *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; - return vaddr; + + if (vaddr < pmc->regs[pmc->regs_idx].start) { + page_info->next = pmc->regs[pmc->regs_idx].start; + return 0; + } + + if (pmc->regs[pmc->regs_idx].categories & PAGE_IS_GUARD) + goto skip_guard_page; + + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + page_info->next = vaddr; + return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + if (pme & PME_GUARD_REGION) + goto skip_guard_page; + /* * Optimisation for private mapping pages, that haven't * yet being COW-ed */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return vaddr + PAGE_SIZE; + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) { + page_info->next = vaddr + PAGE_SIZE; + return 0; + } + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { - 
if (softdirty) - *softdirty = pme & PME_SOFT_DIRTY; - return vaddr; + page_info->softdirty = pme & PME_SOFT_DIRTY; + page_info->next = vaddr; + return 0; } - return vaddr + PAGE_SIZE; + page_info->next = vaddr + PAGE_SIZE; + return 0; } + +err: + pr_err("should_dump_page failed on vma " + "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", + vmae->start, vmae->end, vaddr); + return -1; + +skip_guard_page: + page_info->next = vaddr + PAGE_SIZE; + return 0; } bool page_is_zero(u64 pme) @@ -201,14 +232,15 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct nr_scanned = 0; for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; - bool softdirty = false; - u64 next; + struct page_info page_info = {}; int st; /* If dump_all_pages is true, should_dump_page is called to get pme. */ - next = should_dump_page(pmc, vma->e, vaddr, &softdirty); - if (!dump_all_pages && next != vaddr) { - vaddr = next - PAGE_SIZE; + if (should_dump_page(pmc, vma->e, vaddr, &page_info)) + return -1; + + if (!dump_all_pages && page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } @@ -222,7 +254,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. 
*/ - if (has_parent && page_in_parent(softdirty)) { + if (has_parent && page_in_parent(page_info.softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { @@ -304,7 +336,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; - pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, + pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); @@ -398,6 +430,17 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str if (vma_entry_is(vma->e, VMA_AREA_VVAR)) return 0; + /* + * 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") + * tells us that: + * Under memory pressure, mm can just drop the pages (so that they're + * zero when read back again). + * + * Let's just skip MAP_DROPPABLE mappings pages dump logic. + */ + if (vma->e->flags & MAP_DROPPABLE) + return 0; + /* * To facilitate any combination of pre-dump modes to run after * one another, we need to take extra care as discussed below. 
@@ -556,6 +599,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (vma_area_is(vma_area, VMA_AREA_GUARD)) + continue; + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); if (ret < 0) @@ -741,8 +787,6 @@ int prepare_mm_pid(struct pstree_item *i) ri->vmas.rst_priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.rst_priv_size += PAGE_SIZE; - if (vma_area_is(vma, VMA_AREA_SHSTK)) - ri->vmas.rst_priv_size += PAGE_SIZE; } pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); @@ -818,14 +862,14 @@ static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); - if (&vma->list == &vmas->h) + if ((&vma->list == &vmas->h) || vma_area_is(vma, VMA_AREA_GUARD)) return; } /* ... 
no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); - if (&pvma->list == &pvmas->h) + if ((&pvma->list == &pvmas->h) || vma_area_is(pvma, VMA_AREA_GUARD)) return; } } @@ -885,13 +929,6 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void size = vma_entry_len(vma->e); - /* - * map an extra page for shadow stack VMAs, it will be used as a - * temporary shadow stack - */ - if (vma_area_is(vma, VMA_AREA_SHSTK)) - size += PAGE_SIZE; - if (!vma_inherited(vma)) { int flag = 0; /* @@ -1026,6 +1063,9 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (task_size_check(vpid(t), vma->e)) { ret = -1; break; @@ -1233,6 +1273,9 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!vma_inherited(vma)) continue; @@ -1496,3 +1539,72 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) return prepare_vma_ios(t, ta); } + +int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list) +{ + int pagemap_fd = -1; + struct page_region *regs = NULL; + long regs_len = 0; + int i, ret = -1; + + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = 0, + .end = kdat.task_size, + .walk_end = 0, + .vec_len = 1000, /* this should be enough for most cases */ + .max_pages = 0, + .category_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + }; + + if (!kdat.has_pagemap_scan_guard_pages) { + ret = 0; + goto out; + } + + pagemap_fd = open_proc(pid, "pagemap"); + if (pagemap_fd < 0) + goto out; + + regs = xmalloc(args.vec_len * sizeof(struct page_region)); + if (!regs) + goto out; + args.vec = (long)regs; + + do { + /* start 
from where we finished the last time */ + args.start = args.walk_end; + regs_len = ioctl(pagemap_fd, PAGEMAP_SCAN, &args); + if (regs_len == -1) { + pr_perror("PAGEMAP_SCAN"); + goto out; + } + + for (i = 0; i < regs_len; i++) { + struct vma_area *vma; + + BUG_ON(!(regs[i].categories & PAGE_IS_GUARD)); + + vma = alloc_vma_area(); + if (!vma) + goto out; + + vma->e->start = regs[i].start; + vma->e->end = regs[i].end; + vma->e->status = VMA_AREA_GUARD; + + list_add_tail(&vma->list, &vma_area_list->h); + vma_area_list->nr++; + } + } while (args.walk_end != kdat.task_size); + + ret = 0; + +out: + xfree(regs); + if (pagemap_fd >= 0) + close(pagemap_fd); + return ret; +} diff --git a/criu/mount-v2.c b/criu/mount-v2.c index cdebc83182..1e33ac12a2 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -443,6 +443,7 @@ static int do_bind_mount_v2(struct mount_info *mi) /* Mounts root container mount. */ static int do_mount_root_v2(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); unsigned long flags = MS_BIND; int fd; @@ -477,6 +478,11 @@ static int do_mount_root_v2(struct mount_info *mi) return -1; } + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + mi->mounted = true; return 0; diff --git a/criu/mount.c b/criu/mount.c index 82bbd52d6c..b643a7f26e 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -888,7 +888,11 @@ static int resolve_external_mounts(struct mount_info *info) cut_root = cut_root_for_bind(m->root, match->root); - p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + if (cut_root[0] == '\0') { + p = xstrdup(match->ns_mountpoint + 1); + } else { + p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + } if (!p) return -1; @@ -2690,9 +2694,16 @@ static bool can_mount_now(struct mount_info *mi) static int do_mount_root(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + if 
(restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; + if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + return fetch_rt_stat(mi, service_mountpoint(mi)); } diff --git a/criu/net.c b/criu/net.c index ee46f1c495..e5775a3287 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2144,51 +2144,53 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; SysctlEntry **sysctl = *rsysctl; - size_t n = *pn; + size_t n = *pn, ri; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctlig (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("ipv4: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < n; i++) { - snprintf(path[i], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); - req[i].name = path[i]; - req[i].flags = flags; + for (i = 0, ri = 0; i < n; i++) { + snprintf(path[ri], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[ri].name = path[ri]; + req[ri].flags = flags; switch (sysctl[i]->type) { case SYSCTL_TYPE__CTL_STR: - req[i].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + req[ri].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); /* skip write if have no value */ if (op == CTL_WRITE && !sysctl[i]->sarg) continue; - req[i].arg = sysctl[i]->sarg; + req[ri].arg = sysctl[i]->sarg; break; default: pr_err("ipv4: Unknown sysctl type %d\n", sysctl[i]->type); return -1; } + ri++; } - ret = sysctl_op(req, n, op, CLONE_NEWNET); + ret = sysctl_op(req, ri, op, CLONE_NEWNET); if (ret < 0) { - pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? 
"read" : "write", IPV4_SYSCTL_BASE); + pr_err("ipv4: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); return -1; } if (op == CTL_READ) { bool has_entries = false; + BUG_ON(ri != n); for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { - sysctl[i]->has_iarg = true; - if (!has_entries) - has_entries = true; + has_entries = true; + } else { + sysctl[i]->sarg = NULL; } } @@ -2201,6 +2203,42 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) return 0; } +static int ipv4_sysctls_ping_group_range_map_gid(SysctlEntry *ent, size_t size) +{ + int start, end, ustart, uend, ret; + + if (sscanf(ent->sarg, "%d %d", &start, &end) != 2) { + pr_err("Failed to parse ping_group_range: %s\n", ent->sarg); + return -1; + } + + /* + * The default is "1 0", which means no group + * is allowed to create ICMP Echo sockets. + */ + if (start == 1 && end == 0) { + pr_debug("The ping_group_range is set to default, skipping it.\n"); + ent->sarg = NULL; + return 0; + } + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + ustart = userns_gid(start); + uend = userns_gid(end); + pr_debug("Mapping ping_group_range %d %d to userns -> %d %d\n", + start, end, ustart, uend); + + ret = snprintf(ent->sarg, size, "%d\t%d\n", ustart, uend); + if (ret < 0 || ret >= size) { + pr_err("Failed to map ping_group_range: %d\t%d\n", ustart, uend); + return -1; + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2218,6 +2256,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) SysctlEntry *ipv4_sysctls = NULL; size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; + int ping_group_range_id = -1; NetnsId *ids; struct netns_id *p; @@ -2308,6 +2347,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { netns.ipv4_sysctl[i]->type = 
SYSCTL_TYPE__CTL_STR; netns.ipv4_sysctl[i]->sarg = ping_group_range; + ping_group_range_id = i; } else { /* Need to handle this case when we have more sysctls */ BUG(); @@ -2336,6 +2376,12 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + BUG_ON(ping_group_range_id == -1); + ret = ipv4_sysctls_ping_group_range_map_gid(netns.ipv4_sysctl[ping_group_range_id], + MAX_STR_IPV4_SYSCTL_LEN + 1); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); @@ -3206,12 +3252,12 @@ static inline FILE *redirect_nftables_output(struct nft_ctx *nft) } #endif -static inline int nftables_lock_network_internal(void) +static inline int nftables_lock_network_internal(bool restore) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) cleanup_file FILE *fp = NULL; struct nft_ctx *nft; - int ret = 0; + int ret = 0, exit_code = -1; char table[32]; char buf[128]; @@ -3224,11 +3270,16 @@ static inline int nftables_lock_network_internal(void) fp = redirect_nftables_output(nft); if (!fp) - goto out; + goto err2; snprintf(buf, sizeof(buf), "create table %s", table); - if (NFT_RUN_CMD(nft, buf)) + ret = NFT_RUN_CMD(nft, buf); + if (ret) { + /* The network has been locked on dump. 
*/ + if (restore && errno == EEXIST) + return 0; goto err2; + } snprintf(buf, sizeof(buf), "add chain %s output { type filter hook output priority 0; policy drop; }", table); if (NFT_RUN_CMD(nft, buf)) @@ -3246,17 +3297,16 @@ static inline int nftables_lock_network_internal(void) if (NFT_RUN_CMD(nft, buf)) goto err1; - goto out; - + exit_code = 0; +out: + nft_ctx_free(nft); + return exit_code; err1: snprintf(buf, sizeof(buf), "delete table %s", table); NFT_RUN_CMD(nft, buf); err2: - ret = -1; pr_err("Locking network failed using nftables\n"); -out: - nft_ctx_free(nft); - return ret; + goto out; #else pr_err("CRIU was built without libnftables support\n"); return -1; @@ -3288,7 +3338,7 @@ static int iptables_network_lock_internal(void) return ret; } -int network_lock_internal(void) +int network_lock_internal(bool restore) { int ret = 0, nsret; @@ -3301,7 +3351,7 @@ int network_lock_internal(void) if (opts.network_lock_method == NETWORK_LOCK_IPTABLES) ret = iptables_network_lock_internal(); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) - ret = nftables_lock_network_internal(); + ret = nftables_lock_network_internal(restore); if (restore_ns(nsret, &net_ns_desc)) ret = -1; @@ -3427,7 +3477,7 @@ int network_lock(void) if (run_scripts(ACT_NET_LOCK)) return -1; - return network_lock_internal(); + return network_lock_internal(false); } void network_unlock(void) diff --git a/criu/page-pipe.c b/criu/page-pipe.c index aab6742be7..4601d8f9cd 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -381,7 +381,7 @@ int pipe_read_dest_init(struct pipe_read_dest *prd) return 0; } -int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long int *nr_pages, unsigned int ppb_flags) { struct page_pipe_buf *ppb; @@ -406,7 +406,7 @@ int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned lo } /* 
clamp the request if it passes the end of iovec */ - len = min((unsigned long)iov->iov_base + iov->iov_len - addr, (unsigned long)(*nr_pages) * PAGE_SIZE); + len = min((unsigned long)iov->iov_base + iov->iov_len - addr, *nr_pages * PAGE_SIZE); *nr_pages = len / PAGE_SIZE; skip += ppb->pipe_off * PAGE_SIZE; @@ -446,17 +446,17 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %u pages, %u iovs, flags: %x pipe_off: %x :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lx pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; - pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } pr_debug("* %u holes:\n", pp->free_hole); for (i = 0; i < pp->free_hole; i++) { iov = &pp->holes[i]; - pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 0314963e6d..463d4c506f 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -32,7 +32,7 @@ static int page_server_sk = -1; struct page_server_iov { u32 cmd; - u32 nr_pages; + u64 nr_pages; u64 vaddr; u64 dst_id; }; @@ -178,12 +178,12 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le ssize_t ret, left = len; if (opts.tls) { - pr_debug("Sending %lu bytes / %lu pages\n", len, len / PAGE_SIZE); + pr_debug("Sending %lx bytes\n", len); if (tls_send_data_from_fd(p, len)) return -1; } else { - pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); + pr_debug("Splicing %lx bytes into socket\n", len); while (left > 0) { ret = splice(p, NULL, xfer->sk, NULL, left, SPLICE_F_MOVE); 
@@ -192,7 +192,7 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le return -1; } - pr_debug("\tSpliced: %lu bytes sent\n", (unsigned long)ret); + pr_debug("\tSpliced: %lx bytes sent\n", (unsigned long)ret); left -= ret; } } @@ -288,7 +288,7 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) * read_pagemap_page routine. */ - pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); + pr_debug("Checking %p - %p hole\n", iov->iov_base, iov->iov_base + iov->iov_len); off = (unsigned long)iov->iov_base; end = off + iov->iov_len; while (1) { @@ -300,7 +300,8 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) return -1; } - pr_debug("\tFound %" PRIx64 "/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); + pr_debug("\tFound %" PRIx64 " - %" PRIx64 "\n", + p->pe->vaddr, p->pe->vaddr + pagemap_len(p->pe)); /* * The pagemap entry in parent may happen to be @@ -326,6 +327,7 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag pe.nr_pages = iov->iov_len / PAGE_SIZE; pe.has_flags = true; pe.flags = flags; + pe.has_nr_pages = true; if (flags & PE_PRESENT) { if (opts.auto_dedup && xfer->parent != NULL) { @@ -339,7 +341,8 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag if (xfer->parent != NULL) { ret = check_pagehole_in_parent(xfer->parent, iov); if (ret) { - pr_err("Hole %p/%zu not found in parent\n", iov->iov_base, iov->iov_len); + pr_err("Hole %p - %p not found in parent\n", + iov->iov_base, iov->iov_base + iov->iov_len); return -1; } } @@ -849,7 +852,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\t p %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\t p %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); @@ -885,7 +888,7 @@ int 
page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %lx/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -897,7 +900,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\tp %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\tp %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); @@ -1070,7 +1073,8 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%u\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 " - %" PRIx64 "\n", + pi->vaddr, pi->vaddr + pi->nr_pages * PAGE_SIZE); if (prep_loc_xfer(pi)) return -1; @@ -1135,13 +1139,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) { struct pstree_item *item; struct page_pipe *pp; - unsigned long len; + unsigned long len, nr_pages; int ret; item = pstree_item_by_virt(pi->dst_id); pp = dmpi(item)->mem_pp; - ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &pi->nr_pages, PPB_LAZY); + /* page_pipe_read() uses 'unsigned long *' but pi->nr_pages is u64. + * Use a temporary variable to fix the incompatible pointer type + * on 32-bit platforms (e.g. armv7). */ + nr_pages = pi->nr_pages; + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY); if (ret) return ret; @@ -1150,6 +1158,7 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) * .dst_id all remain intact. 
*/ + pi->nr_pages = nr_pages; if (pi->nr_pages == 0) { pr_debug("no iovs found, zero pages\n"); return -1; @@ -1347,7 +1356,7 @@ static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) { struct page_read pr; - int nr_pages = 0; + unsigned long nr_pages = 0; if (open_page_read(pid, &pr, PR_TASK) <= 0) { pr_err("Failed to open page read for %d\n", pid); @@ -1550,13 +1559,13 @@ struct ps_async_read { static LIST_HEAD(async_reads); -static inline void async_read_set_goal(struct ps_async_read *ar, int nr_pages) +static inline void async_read_set_goal(struct ps_async_read *ar, unsigned long nr_pages) { ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; ar->nr_pages = nr_pages; } -static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages, ps_async_read_complete complete, +static void init_ps_async_read(struct ps_async_read *ar, void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { ar->pages = buf; @@ -1566,7 +1575,7 @@ static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages async_read_set_goal(ar, nr_pages); } -static int page_server_start_async_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv) +static int page_server_start_async_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { struct ps_async_read *ar; @@ -1666,7 +1675,7 @@ int connect_to_page_server_to_recv(int epfd) return epoll_add_rfd(epfd, &ps_rfd); } -int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) +int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages) { struct page_server_iov pi = { .cmd = PS_IOV_GET, @@ -1683,7 +1692,7 @@ int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) return 0; } -static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete complete, void *priv) +static int 
page_server_start_sync_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv) { struct ps_async_read ar; int ret = 1; @@ -1694,7 +1703,7 @@ static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete return ret; } -int page_server_start_read(void *buf, int nr, ps_async_read_complete complete, void *priv, unsigned flags) +int page_server_start_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv, unsigned flags) { if (flags & PR_ASYNC) return page_server_start_async_read(buf, nr, complete, priv); diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index f04a517de3..457c0d6497 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -194,6 +194,9 @@ int pmc_fill(pmc_t *pmc, u64 start, u64 end) }; long ret; + if (kdat.has_pagemap_scan_guard_pages) + args.return_mask |= PAGE_IS_GUARD; + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); if (ret == -1) { pr_perror("PAGEMAP_SCAN"); diff --git a/criu/pagemap.c b/criu/pagemap.c index 85bb922596..6c9c4f7feb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -168,15 +168,15 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) return 0; } -static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) +static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%u vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%" PRIx64 " vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } -static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned long int nr, void *buf, unsigned flags) { struct page_read *ppr = pr->parent; int ret; @@ -195,7 +195,7 @@ static int read_parent_page(struct page_read 
*pr, unsigned long vaddr, int nr, v */ do { - int p_nr; + unsigned long int p_nr; pr_debug("\tpr%lu-%u Read from parent\n", pr->img_id, pr->id); ret = ppr->seek_pagemap(ppr, vaddr); @@ -210,7 +210,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v * read as much as we can. */ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; - pr_info("\tparent has %u pages in\n", p_nr); + pr_info("\tparent has %lu pages in\n", p_nr); if (p_nr > nr) p_nr = nr; @@ -374,7 +374,7 @@ int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, st return 0; } -static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; unsigned long len = nr * PAGE_SIZE; @@ -402,7 +402,7 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int * We cannot use maybe_read_page_local() for streaming images as it uses * pread(), seeking in the file. Instead, we use this custom page reader. 
*/ -static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { unsigned long len = nr * PAGE_SIZE; int fd; @@ -445,7 +445,7 @@ static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vadd return ret; } -static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) +static int read_page_complete(unsigned long img_id, unsigned long vaddr, unsigned long int nr_pages, void *priv) { int ret = 0; struct page_read *pr = priv; @@ -463,7 +463,7 @@ static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_ return ret; } -static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; @@ -474,9 +474,9 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int return ret; } -static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { - pr_info("pr%lu-%u Read %lx %u pages\n", pr->img_id, pr->id, vaddr, nr); + pr_info("pr%lu-%u Read %lx %lu pages\n", pr->img_id, pr->id, vaddr, nr); pagemap_bound_check(pr->pe, vaddr, nr); if (pagemap_in_parent(pr->pe)) { @@ -682,6 +682,9 @@ static void init_compat_pagemap_entry(PagemapEntry *pe) pe->flags |= PE_PARENT; else if (!pe->has_flags) pe->flags = PE_PRESENT; + + if (!pe->has_nr_pages) + pe->nr_pages = pe->compat_nr_pages; } /* diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 6db9d21fee..e19847b377 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -421,7 +421,7 @@ 
struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, ictx->flags |= INFECT_NO_MEMFD; if (fault_injected(FI_PARASITE_CONNECT)) ictx->flags |= INFECT_FAIL_CONNECT; - if (fault_injected(FI_NO_BREAKPOINTS)) + if (fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints) ictx->flags |= INFECT_NO_BREAKPOINTS; if (kdat.compat_cr) ictx->flags |= INFECT_COMPATIBLE; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 1bc03dc2a0..c966e9e62c 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -101,7 +101,7 @@ static int dump_pages(struct parasite_dump_pages_args *args) } if (spliced_bytes != args->nr_pages * PAGE_SIZE) { sys_close(p); - pr_err("Can't splice all pages to pipe (%ld/%d)\n", spliced_bytes, args->nr_pages); + pr_err("Can't splice all pages to pipe (%ld/%ld)\n", spliced_bytes, args->nr_pages); return -1; } diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 348ce6659b..5c40b0e937 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -28,6 +28,7 @@ #include #include #include +#include "mman.h" #include "signal.h" #include "prctl.h" #include "criu-log.h" @@ -1111,6 +1112,23 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) pr_info("Remap %lx->%lx len %lx\n", src, dst, len); + /* + * SHSTK VMAs are a bit special, in fact we create shstk vma right in the + * shstk_vma_restore() and populate it with contents from a premapped VMA + * (which in turn is just a normal anonymous VMA!). Then, we munmap() this + * premapped VMA. After, we need to adjust vma_premmaped_start(vma_entry) + * to point to a created shstk vma and treat it as a premapped one in vma_remap(). 
+ */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) { + if (shstk_vma_restore(vma_entry)) { + pr_err("Unable to prepare shadow stack vma for remap %lx -> %lx\n", src, dst); + return -1; + } + + /* shstk_vma_restore() modifies vma premapped address */ + src = vma_premmaped_start(vma_entry); + } + if (src - dst < len) guard = dst; else if (dst - src < len) @@ -1235,9 +1253,23 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { - int ret, i; + int ret, i, exit_code = -1; kernel_timer_t next_id = 0, timer_id; struct sigevent sev; + bool create_restore_ids = false; + + if (!args->posix_timers_n) + return 0; + + /* prctl returns EINVAL if PR_TIMER_CREATE_RESTORE_IDS isn't supported. */ + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0); + if (ret == 0) { + create_restore_ids = true; + } else if (ret != -EINVAL) { + pr_err("Can't enabled PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + return -1; + } for (i = 0; i < args->posix_timers_n; i++) { sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; @@ -1249,16 +1281,36 @@ static int create_posix_timers(struct task_restore_args *args) #endif sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; + if (create_restore_ids) { + /* + * With enabled PR_TIMER_CREATE_RESTORE_IDS, the + * timer_create syscall creates a new timer with the + * specified ID. 
+ */ + timer_id = args->posix_timers[i].spt.it_id; + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); + if (ret < 0) { + pr_err("Can't create posix timer - %d: %d\n", i, ret); + goto out; + } + if (timer_id != args->posix_timers[i].spt.it_id) { + pr_err("Unexpected timer id %u (expected %lu)\n", + timer_id, args->posix_timers[i].spt.it_id); + goto out; + } + continue; + } + while (1) { ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); - return ret; + goto out; } if (timer_id != next_id) { pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; + goto out; } next_id++; @@ -1268,12 +1320,22 @@ static int create_posix_timers(struct task_restore_args *args) ret = sys_timer_delete(timer_id); if (ret < 0) { pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); - return ret; + goto out; } } } - return 0; + exit_code = 0; +out: + if (create_restore_ids) { + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0); + if (ret != 0) { + pr_err("Can't disable PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + exit_code = -1; + } + } + return exit_code; } static void restore_posix_timers(struct task_restore_args *args) @@ -1621,6 +1683,30 @@ static int restore_membarrier_registrations(int mask) return ret; } +static int restore_madv_guard_regions(struct task_restore_args *args) +{ + int i, ret; + + for (i = 0; i < args->vmas_n; i++) { + VmaEntry *vma_entry = args->vmas + i; + size_t len; + + if (!vma_entry_is(vma_entry, VMA_AREA_GUARD)) + continue; + + len = vma_entry->end - vma_entry->start; + ret = sys_madvise(vma_entry->start, len, MADV_GUARD_INSTALL); + if (ret) { + pr_err("madvise(%" PRIx64 ", %zu, MADV_GUARD_INSTALL) " + "failed with %d\n", + vma_entry->start, len, ret); + return -1; + } + } + + return 0; +} + /* * The main routine to restore task via sigreturn. 
* This one is very special, we never return there @@ -1742,13 +1828,6 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start > vma_entry->shmid) break; - /* - * shadow stack VMAs cannot be remapped, they must be - * recreated with map_shadow_stack system call - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) - continue; - if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -1766,13 +1845,6 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start < vma_entry->shmid) break; - /* - * shadow stack VMAs cannot be remapped, they must be - * recreated with map_shadow_stack system call - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) - continue; - if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -1928,6 +2000,13 @@ __visible long __export_restore_task(struct task_restore_args *args) } } + /* + * Restore madvise(MADV_GUARD_INSTALL) + */ + ret = restore_madv_guard_regions(args); + if (ret) + goto core_restore_end; + /* * Tune up the task fields. */ diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 8daf5c71f3..45fb6a648b 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -98,25 +98,45 @@ static unsigned long elf_gnu_hash(const unsigned char *name) static int has_elf_identity(Ehdr_t *ehdr) { - /* - * See Elf specification for this magic values. 
- */ + /* check ELF magic */ + + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return false; + }; + + /* check ELF class */ #if defined(CONFIG_VDSO_32) - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS32) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #else - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #endif - BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); - - if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { - pr_err("ELF header magic mismatch\n"); + /* check ELF data encoding */ + if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF data encoding: %d\n", ehdr->e_ident[EI_DATA]); return false; - } + }; + /* check ELF version */ + if (ehdr->e_ident[EI_VERSION] != EV_CURRENT) { + pr_err("Unsupported ELF version: %d\n", ehdr->e_ident[EI_VERSION]); + return false; + }; + /* check ELF OSABI */ + if (ehdr->e_ident[EI_OSABI] != ELFOSABI_NONE && + ehdr->e_ident[EI_OSABI] != ELFOSABI_LINUX) { + pr_err("Unsupported OSABI version: %d\n", ehdr->e_ident[EI_OSABI]); + return false; + }; return true; } diff --git a/criu/plugin.c b/criu/plugin.c index 65e79a0692..18da0499d7 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -59,6 +59,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); + 
__assign_hook(POST_FORKING, "cr_plugin_post_forking"); #undef __assign_hook diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 99dc518a5e..0d3b5b23f1 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -74,6 +74,8 @@ struct buffer { static struct buffer __buf; static char *buf = __buf.buf; +/* only ever goes from false to true, if at all */ +static bool uprobes_vma_exists = false; /* * This is how AIO ring buffers look like in proc @@ -144,6 +146,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -160,6 +164,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. 
*/ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) @@ -198,12 +204,29 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + * + * The uprobes vma is also mapped by the kernel with VM_IO, among other flags */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) + if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED) + && !vma_area_is(vma_area, VMA_AREA_UPROBES)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) vma_area->e->has_madv = true; + + /* + * We set MAP_PRIVATE flag on vma_area->e->flags right after parsing + * a first line of VMA entry in /proc/<pid>/smaps file: + * 7fa84fa70000-7fa84fa95000 rw-p 00000000 00:00 0 + * but it's too early and we can't distinguish between MAP_DROPPABLE + * and MAP_PRIVATE mappings yet, as they are both private mappings in nature + * and at this point we haven't yet read "VmFlags:" line in smaps. + * + * Let's detect this situation and drop MAP_PRIVATE flag while keeping + * MAP_DROPPABLE, otherwise restorer's restore_mapping() helper will fail. + */ + if ((vma_area->e->flags & MAP_PRIVATE) && (vma_area->e->flags & MAP_DROPPABLE)) + vma_area->e->flags &= ~MAP_PRIVATE; } static inline int is_anon_shmem_map(dev_t dev) @@ -585,6 +608,14 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + } else if (!strcmp(file_path, "[uprobes]")) { + uprobes_vma_exists = true; + if (!opts.allow_uprobes) { + pr_err("PID %d has uprobes vma. 
Consider using --" OPT_ALLOW_UPROBES ".\n", + pid); + goto err; + } + vma_area->e->status |= VMA_AREA_UPROBES; } else { vma_area->e->status = VMA_AREA_REGULAR; } @@ -721,6 +752,10 @@ static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area */ pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n", vma_area->e->start, vma_area->e->end); + } else if (vma_area->e->status & VMA_AREA_UPROBES) { + pr_debug("Skipping uprobes vma %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, + vma_area->e->end); + return 0; } else if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, vma_area->e->end); @@ -2911,3 +2946,8 @@ int parse_uptime(uint64_t *upt) fclose(f); return 0; } + +bool found_uprobes_vma(void) +{ + return uprobes_vma_exists; +} diff --git a/criu/pstree.c b/criu/pstree.c index 660f1b9d99..cee8b5741a 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -182,7 +182,7 @@ void free_pstree(struct pstree_item *root_item) struct pstree_item *item = root_item, *parent; while (item) { - if (!list_empty(&item->children)) { + if (has_children(item)) { item = list_first_entry(&item->children, struct pstree_item, sibling); continue; } @@ -237,17 +237,21 @@ int init_pstree_helper(struct pstree_item *ret) { BUG_ON(!ret->parent); ret->pid->state = TASK_HELPER; - rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; - if (shared_fdt_prepare(ret) < 0) - return -1; + rsti(ret)->clone_flags = 0; + INIT_LIST_HEAD(&rsti(ret)->fds); task_entries->nr_helpers++; return 0; } +bool has_children(struct pstree_item *item) +{ + return !list_empty(&item->children); +} + /* Deep first search on children */ struct pstree_item *pstree_item_next(struct pstree_item *item) { - if (!list_empty(&item->children)) + if (has_children(item)) return list_first_entry(&item->children, struct pstree_item, sibling); while (item->parent) { diff --git a/criu/seize.c b/criu/seize.c index 
f56357ac7b..d0cf7b36c8 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1008,7 +1008,7 @@ static int collect_task(struct pstree_item *item) if (ret < 0) goto err_close; - if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) { + if ((item->pid->state == TASK_DEAD) && has_children(item)) { pr_err("Zombie with children?! O_o Run, run, run!\n"); goto err_close; } @@ -1060,22 +1060,32 @@ int collect_pstree(void) */ alarm(opts.timeout); - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } - if (opts.freeze_cgroup && cgroup_version()) goto err; pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1); if (opts.freeze_cgroup && !compel_interrupt_only_mode) { + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (freeze_processes()) goto err; } else { if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) goto err; + + /* + * Call PAUSE_DEVICES after prepare_freezer_for_interrupt_only_mode() + * to be able to checkpoint containers in a frozen state. 
+ */ + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; @@ -1136,4 +1146,4 @@ int checkpoint_devices(void) exit_code = 0; err: return exit_code; -} \ No newline at end of file +} diff --git a/criu/shmem.c b/criu/shmem.c index 9e3178352d..bc7aa36695 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,31 +206,34 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) +static int update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; u64 vaddr; if (!is_shmem_tracking_en()) - return; + return 0; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { - bool softdirty = false; - u64 next; + struct page_info page_info = {}; + + if (should_dump_page(pmc, vma, vaddr, &page_info)) + return -1; - next = should_dump_page(pmc, vma, vaddr, &softdirty); - if (next != vaddr) { - vaddr = next - PAGE_SIZE; + if (page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (softdirty) + if (page_info.softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } + + return 0; } int collect_sysv_shmem(unsigned long shmid, unsigned long size) @@ -667,7 +670,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } @@ -684,7 +689,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } diff 
--git a/criu/sk-inet.c b/criu/sk-inet.c index 92f53e5697..422edc6567 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -44,6 +44,11 @@ #define PB_ALEN_INET 1 #define PB_ALEN_INET6 4 +/* Definition for older kernels without MPTCP support (e.g. Ubuntu 20.04) */ +#ifndef IPPROTO_MPTCP +#define IPPROTO_MPTCP 262 +#endif + static LIST_HEAD(inet_ports); struct inet_port { @@ -125,9 +130,13 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); + if (proto == IPPROTO_MPTCP) + pr_err("For Go programs, consider using \"GODEBUG=multipathtcp=0\" to disable MPTCP\n"); return 0; } @@ -572,7 +581,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa if (dump_ip_opts(lfd, family, type, proto, &ipopts)) goto err; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, family, &skopts)) goto err; pr_info("Dumping inet socket at %d\n", p->fd); @@ -915,8 +924,9 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) } if (ie->src_port) { - if (inet_bind(sk, ii)) - goto err; + if (ie->proto != IPPROTO_ICMP && ie->proto != IPPROTO_ICMPV6) + if (inet_bind(sk, ii)) + goto err; } /* diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index a219b69be1..dc2baa1b80 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -165,7 +165,7 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_NETLINK, &skopts)) goto err; fe.type = FD_TYPES__NETLINKSK; diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 1d2e23522a..6530bff580 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -173,7 +173,7 @@ static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) psk.fown = (FownEntry *)&p->fown; psk.opts = 
&skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_PACKET, &skopts)) return -1; psk.protocol = sd->proto; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 70ca16be4a..6145fe7347 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -527,7 +527,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) } } dump: - if (dump_socket_opts(lfd, skopts)) + if (dump_socket_opts(lfd, AF_UNIX, skopts)) goto err; pr_info("Dumping unix socket at %d\n", p->fd); diff --git a/criu/sockets.c b/criu/sockets.c index f9ce999bed..e4adae03cd 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -65,7 +65,7 @@ const char *socket_proto_name(unsigned int proto, char *nm, size_t size) [IPPROTO_IPV6] = __stringify_1(IPPROTO_IPV6), [IPPROTO_RSVP] = __stringify_1(IPPROTO_RSVP), [IPPROTO_GRE] = __stringify_1(IPPROTO_GRE), [IPPROTO_ESP] = __stringify_1(IPPROTO_ESP), [IPPROTO_AH] = __stringify_1(IPPROTO_AH), [IPPROTO_UDPLITE] = __stringify_1(IPPROTO_UDPLITE), - [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), + [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), [IPPROTO_ICMPV6] = __stringify_1(IPPROTO_ICMPV6), }; return __socket_const_name(nm, size, protos, ARRAY_SIZE(protos), proto); } @@ -131,10 +131,12 @@ enum socket_cl_bits { INET_UDP_CL_BIT, INET_UDPLITE_CL_BIT, INET_RAW_CL_BIT, + INET_ICMP_CL_BIT, INET6_TCP_CL_BIT, INET6_UDP_CL_BIT, INET6_UDPLITE_CL_BIT, INET6_RAW_CL_BIT, + INET6_ICMP_CL_BIT, UNIX_CL_BIT, PACKET_CL_BIT, _MAX_CL_BIT, @@ -161,6 +163,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET_RAW_CL_BIT; + if (proto == IPPROTO_ICMP) + return INET_ICMP_CL_BIT; } if (family == AF_INET6) { if (proto == IPPROTO_TCP) @@ -171,6 +175,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET6_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET6_RAW_CL_BIT; + if (proto == IPPROTO_ICMPV6) + return INET6_ICMP_CL_BIT; } 
pr_err("Unknown pair family %d proto %d\n", family, proto); @@ -282,6 +288,12 @@ void preload_socket_modules(void) req.r.i.sdiag_protocol = IPPROTO_RAW; probe_diag(nl, &req, -ENOENT); + req.r.i.sdiag_protocol = IPPROTO_ICMP; + probe_diag(nl, &req, -ENOENT); + + req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + probe_diag(nl, &req, -ENOENT); + close(nl); pr_info("Done probing\n"); } @@ -637,7 +649,7 @@ int do_dump_opt(int sk, int level, int name, void *val, int len) return 0; } -int dump_socket_opts(int sk, SkOptsEntry *soe) +int dump_socket_opts(int sk, int family, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; @@ -676,13 +688,15 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) soe->so_reuseport = val ? true : false; soe->has_so_reuseport = true; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); - soe->has_so_passcred = true; - soe->so_passcred = val ? true : false; + if (family == AF_UNIX || family == AF_NETLINK) { + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + soe->has_so_passcred = true; + soe->so_passcred = val ? true : false; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); - soe->has_so_passsec = true; - soe->so_passsec = val ? true : false; + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + soe->has_so_passsec = true; + soe->so_passsec = val ? 
true : false; + } ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); soe->has_so_dontroute = true; @@ -773,6 +787,10 @@ static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg) case IPPROTO_RAW: type = SOCK_RAW; break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + type = SOCK_DGRAM; + break; default: BUG_ON(1); return -1; @@ -797,7 +815,7 @@ static int collect_err(int err, struct ns_id *ns, void *arg) char family[32], proto[32]; char msg[256]; - snprintf(msg, sizeof(msg), "Sockects collect procedure family %s proto %s", + snprintf(msg, sizeof(msg), "Sockets collect procedure family %s proto %s", socket_family_name(gr->family, family, sizeof(family)), socket_proto_name(gr->protocol, proto, sizeof(proto))); @@ -905,6 +923,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv4 ICMP sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_ICMP; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + /* Collect IPv6 TCP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_TCP; @@ -944,6 +969,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv6 ICMP sockets */ + req.r.i.sdiag_family = AF_INET6; + req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + req.r.p.sdiag_family = AF_PACKET; req.r.p.sdiag_protocol = 0; req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; diff --git a/criu/timer.c b/criu/timer.c index 0413e2a720..856501be6b 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -195,6 +195,7 @@ int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) if (!img) return -1; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; ta->posix_timers_n = 0; while (1) { PosixTimerEntry *pte; @@ -234,6 +235,7 
@@ int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) return prepare_posix_timers_from_fd(pid, ta); ta->posix_timers_n = tte->n_posix; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; for (i = 0; i < ta->posix_timers_n; i++) { t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) diff --git a/criu/uffd.c b/criu/uffd.c index 98c2b7e075..8e12dcd636 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -668,12 +668,11 @@ static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, unsigned */ static int collect_iovs(struct lazy_pages_info *lpi) { + unsigned long start, end, len, nr_pages = 0; + int n_vma = 0, max_iov_len = 0, ret = -1; struct page_read *pr = &lpi->pr; struct lazy_iov *iov; MmEntry *mm; - int nr_pages = 0, n_vma = 0, max_iov_len = 0; - int ret = -1; - unsigned long start, end, len; mm = init_mm_entry(lpi); if (!mm) @@ -728,7 +727,7 @@ static int collect_iovs(struct lazy_pages_info *lpi) return ret; } -static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); +static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, unsigned long nr); static int ud_open(int client, struct lazy_pages_info **_lpi) { @@ -822,7 +821,7 @@ static bool uffd_recoverable_error(int mcopy_rc) return false; } -static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int *nr_pages, long mcopy_rc) +static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsigned long *nr_pages, long mcopy_rc) { if (errno == ENOSPC || errno == ESRCH) { handle_exit(lpi); @@ -844,7 +843,7 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int return 0; } -static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) +static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages) { struct uffdio_copy uffdio_copy; unsigned long len = *nr_pages * page_size(); @@ -865,12 +864,12 @@ static int uffd_copy(struct 
lazy_pages_info *lpi, __u64 address, int *nr_pages) return 0; } -static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) +static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsigned long nr) { struct lazy_pages_info *lpi; - unsigned long addr = 0; - int req_pages, ret; + unsigned long addr = 0, req_pages; struct lazy_iov *req; + int ret; lpi = container_of(pr, struct lazy_pages_info, pr); @@ -920,7 +919,7 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr return drop_iovs(lpi, addr, nr * PAGE_SIZE); } -static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) +static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages) { struct uffdio_zeropage uffdio_zeropage; unsigned long len = page_size() * nr_pages; @@ -946,7 +945,7 @@ static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) * Returns 0 for zero pages, 1 for "real" pages and negative value on * error */ -static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) +static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr) { int ret; @@ -961,7 +960,7 @@ static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) return 0; } -static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) +static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr, unsigned flags) { int ret; @@ -1003,7 +1002,7 @@ static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) static int xfer_pages(struct lazy_pages_info *lpi) { struct lazy_iov *iov; - unsigned int nr_pages; + unsigned long nr_pages; unsigned long len; int err; diff --git a/criu/util.c b/criu/util.c index 58c18e20be..e2f80e4c61 100644 --- a/criu/util.c +++ b/criu/util.c @@ -195,6 +195,7 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); 
opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); + opt2s(VMA_AREA_UPROBES, "uprobes"); #undef opt2s } diff --git a/criu/vdso.c b/criu/vdso.c index d4d3511314..2d9e57c4da 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -145,6 +145,9 @@ static void drop_rt_vdso(struct vm_area_list *vma_area_list, struct vdso_quarter * Also BTW search for rt-vvar to remove it later. */ list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (vma->e->start == addr->orig_vdso) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; pr_debug("vdso: Restore orig vDSO status at %lx\n", (long)vma->e->start); @@ -276,6 +279,9 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list } list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + /* * Defer handling marked vdso until we walked over * all vmas and restore potentially remapped vDSO diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000000..90c914452b --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1744463964, + "narHash": "sha256-LWqduOgLHCFxiTNYi3Uj5Lgz0SR+Xhw3kr/3Xd0GPTM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2631b0b7abcea6e640ce31cd78ea58910d31e650", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 
1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000000..dc2429ffc9 --- /dev/null +++ b/flake.nix @@ -0,0 +1,77 @@ +{ + description = "CRIU development environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + # Dependencies for CRIU + criuDeps = with pkgs; [ + # Compiler and build essentials + gcc + gnumake + pkg-config + + # Protocol Buffers + protobuf + protobufc + python3Packages.protobuf + + # Other required libraries + libuuid + libbsd + iproute2 + nftables + libcap + libnet + libnl + libaio + gnutls + libdrm + + # ZDTM + python3Packages.pyyaml + ]; + + # Multilib support for 32-bit compatibility + # criuDeps32bit = with pkgs; [ + # glibc.dev + # glibc + # gcc-unwrapped + # ]; + + devShell = pkgs.mkShell { + buildInputs = criuDeps; # ++ (if pkgs.stdenv.isx86_64 then criuDeps32bit else []); + + shellHook = '' + echo "CRIU development environment" + echo "==============================" + echo "" + echo "Useful commands:" + echo " make - Build CRIU" + echo " make test - Run tests (requires ZDTM dependencies)" + echo "" + ''; + + # Add proper flags for multilib support + # NIX_CFLAGS_COMPILE = pkgs.lib.optional pkgs.stdenv.isx86_64 "-m32"; + + # Make sure the shell can find headers for multilib + # PKG_CONFIG_PATH = pkgs.lib.makeSearchPath "lib/pkgconfig" criuDeps; + }; + in + { + # Export the development shell + devShells.default = devShell; + + # Build CRIU package as well + packages.default = 
pkgs.criu; + } + ); +} diff --git a/images/Makefile b/images/Makefile index 1e40b8a8f0..2c33152e91 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,6 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -91,12 +90,27 @@ endef makefile-deps := Makefile $(obj)/Makefile +# +# Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. +DESCRIPTOR_DIR := images/google/protobuf +GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf +$(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto + $(call msg-gen, $@) + $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + +cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d + +submrproper: + $(Q) rm -f $(DESCRIPTOR_DIR)/* +.PHONY: submrproper +mrproper: submrproper + # # Generates rules needed to compile protobuf files. 
define gen-proto-rules $(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " PBCC " $$@ - $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< + $$(Q) protoc --proto_path=$(obj)/ --c_out=$(obj)/ $$< ifeq ($(PROTOUFIX),y) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) diff --git a/images/google/protobuf/.gitignore b/images/google/protobuf/.gitignore new file mode 100644 index 0000000000..68359a7869 --- /dev/null +++ b/images/google/protobuf/.gitignore @@ -0,0 +1,2 @@ +*.c +*.h diff --git a/images/google/protobuf/descriptor.proto b/images/google/protobuf/descriptor.proto deleted file mode 120000 index 07a4c9add6..0000000000 --- a/images/google/protobuf/descriptor.proto +++ /dev/null @@ -1 +0,0 @@ -/usr/include/google/protobuf/descriptor.proto \ No newline at end of file diff --git a/images/inventory.proto b/images/inventory.proto index 1e18815bb9..feed5b8509 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -33,4 +33,5 @@ message inventory_entry { // This is currently used to delete the correct nftables // network locking rule. 
optional string dump_criu_run_id = 13; + optional bool allow_uprobes = 14; } diff --git a/images/pagemap.proto b/images/pagemap.proto index e6d341b0f6..f2436a51ac 100644 --- a/images/pagemap.proto +++ b/images/pagemap.proto @@ -10,7 +10,8 @@ message pagemap_head { message pagemap_entry { required uint64 vaddr = 1 [(criu).hex = true]; - required uint32 nr_pages = 2; + required uint32 compat_nr_pages = 2; optional bool in_parent = 3; optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; + optional uint64 nr_pages = 5; } diff --git a/lib/c/criu.c b/lib/c/criu.c index c16fe5dcd7..485c8b178e 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -2041,3 +2041,22 @@ void criu_set_empty_ns(int namespaces) { criu_local_set_empty_ns(global_opts, namespaces); } + +int criu_local_set_config_file(criu_opts *opts, const char *path) +{ + char *new; + + new = strdup(path); + if (!new) + return -ENOMEM; + + free(opts->rpc->config_file); + opts->rpc->config_file = new; + + return 0; +} + +int criu_set_config_file(const char *path) +{ + return criu_local_set_config_file(global_opts, path); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index c1c6078698..44446f6645 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -116,6 +116,7 @@ void criu_set_pidfd_store_sk(int sk); int criu_set_network_lock(enum criu_network_lock_method method); int criu_join_ns_add(const char *ns, const char *ns_file, const char *extra_opt); void criu_set_mntns_compat_mode(bool val); +int criu_set_config_file(const char *path); /* * The criu_notify_arg_t na argument is an opaque @@ -281,6 +282,7 @@ void criu_local_set_pidfd_store_sk(criu_opts *opts, int sk); int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method); int criu_local_join_ns_add(criu_opts *opts, const char *ns, const char *ns_file, const char *extra_opt); void criu_local_set_mntns_compat_mode(criu_opts *opts, bool val); +int criu_local_set_config_file(criu_opts *opts, const char *path); void criu_local_set_notify_cb(criu_opts 
*opts, int (*cb)(char *action, criu_notify_arg_t na)); diff --git a/lib/pycriu/__init__.py b/lib/pycriu/__init__.py index 2abcf029de..28f1e94249 100644 --- a/lib/pycriu/__init__.py +++ b/lib/pycriu/__init__.py @@ -1,4 +1,15 @@ from . import rpc_pb2 as rpc from . import images -from .criu import * -from .version import __version__ \ No newline at end of file +from .criu import criu, CRIUExceptionExternal, CRIUException +from .criu import CR_DEFAULT_SERVICE_ADDRESS +from .version import __version__ + +__all__ = ( + "rpc", + "images", + "criu", + "CRIUExceptionExternal", + "CRIUException", + "CR_DEFAULT_SERVICE_ADDRESS", + "__version__", +) \ No newline at end of file diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index f3e018095a..760d2be78a 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -8,6 +8,7 @@ import pycriu.rpc_pb2 as rpc +CR_DEFAULT_SERVICE_ADDRESS = "./criu_service.socket" class _criu_comm: """ @@ -45,7 +46,14 @@ def __init__(self, sk_path): def connect(self, daemon): self.sk = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) - self.sk.connect(self.comm) + try: + self.sk.connect(self.comm) + + except FileNotFoundError: + raise FileNotFoundError("Socket file not found.") + + except ConnectionRefusedError: + raise ConnectionRefusedError("Service not running.") return self.sk @@ -103,7 +111,7 @@ def exec_criu(): os.close(2) css[0].send(struct.pack('i', os.getpid())) - os.execv(self.comm, + os.execvp(self.comm, [self.comm, 'swrk', "%d" % css[0].fileno()]) os._exit(1) @@ -181,15 +189,14 @@ def _gen_error_str(self): if self.errno == errno.EBADRQC: s += "Bad options" - if self.typ == rpc.DUMP: - if self.errno == errno.ESRCH: - s += "No process with such pid" + elif self.typ == rpc.DUMP and self.errno == errno.ESRCH: + s += "No process with such pid" - if self.typ == rpc.RESTORE: - if self.errno == errno.EEXIST: - s += "Process with requested pid already exists" + elif self.typ == rpc.RESTORE and self.errno == errno.EEXIST: + s += "Process 
with requested pid already exists" - s += "Unknown" + else: + s += "Unknown" return s @@ -204,10 +211,11 @@ class criu: def __init__(self): self.use_binary('criu') - self.opts = rpc.criu_opts() + # images_dir_fd is required field with default value of -1 + self.opts = rpc.criu_opts(images_dir_fd=-1) self.sk = None - def use_sk(self, sk_name): + def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): """ Access criu using unix socket which that belongs to criu service daemon. """ @@ -266,6 +274,7 @@ def check(self): """ req = rpc.criu_req() req.type = rpc.CHECK + req.opts.MergeFrom(self.opts) resp = self._send_req_and_recv_resp(req) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index e3dd95ac0a..a35dd3c3fc 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -83,6 +83,7 @@ def _custom_conv(field): mmap_flags_map = [ ('MAP_SHARED', 0x1), ('MAP_PRIVATE', 0x2), + ('MAP_DROPPABLE', 0x08), ('MAP_ANON', 0x20), ('MAP_GROWSDOWN', 0x0100), ] @@ -104,6 +105,7 @@ def _custom_conv(field): ('VMA_AREA_AIORING', 1 << 13), ('VMA_AREA_MEMFD', 1 << 14), ('VMA_AREA_SHSTK', 1 << 15), + ('VMA_AREA_UPROBES', 1 << 17), ('VMA_UNSUPP', 1 << 31), ] diff --git a/lib/pyproject.toml b/lib/pyproject.toml index 8eb4b7084d..ea9f88dccc 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -6,11 +6,12 @@ build-backend = "setuptools.build_meta" name = "pycriu" description = "Python bindings for CRIU" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] -license = {text = "GPLv2"} +license = {text = "LGPLv2.1"} dynamic = ["version"] requires-python = ">=3.6" +dependencies = ["protobuf"] [tool.setuptools] packages = ["pycriu", "pycriu.images"] diff --git a/lib/setup.cfg b/lib/setup.cfg index 23ee48dd5b..28c9e49c3f 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -7,10 +7,12 @@ name = pycriu description = Python bindings for CRIU author = CRIU team -author_email = 
criu@openvz.org -license = GPLv2 +author_email = criu@lists.linux.dev +license = LGPLv2.1 version = attr: pycriu.__version__ [options] packages = find: python_requires = >=3.6 +install_requires = + protobuf diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index a20d1d1639..870a039cdb 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -25,9 +25,9 @@ else endif criu-amdgpu.pb-c.c: criu-amdgpu.proto - protoc-c --proto_path=. --c_out=. criu-amdgpu.proto + protoc --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 1078eafe6f..b808fbc4f0 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). 
+ +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 96c0861628..69194fbc79 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,11 +28,13 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default + * in this case. The flag, parallel_disabled, is used to control whether the + * optimization is enabled or disabled. 
+ */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + int offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, 
e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; - } + if (!e->device_entries[i]->gpu_id) + continue; - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, 
restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - thread_i++; - } - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1546,8 +1609,8 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) 
close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if (thread_datas) + xfree(thread_datas); return ret; } @@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +{ + return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); +} + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + int offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. 
+ */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); /* the new thread inherits this mask */ + + ret = pthread_create(newthread, NULL, f, arg); + sigprocmask(SIG_SETMASK, &oldmask, NULL); /* restore the caller's mask on success and on failure */ + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + return -1; + } + + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 5b4396a0cc..730f2e0284 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index c890e3ddae..e19f8e7ce9 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 0000000000..c8bf6d1ba3 --- 
/dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,320 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); /* length is taken while the placeholder 'x' is still in place */ + *addr->sun_path = '\0'; /* leading NUL byte: abstract socket address, see unix(7) */ +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); /* reached on success too; NOTE(review): presumably fdstore keeps its own reference - confirm */ + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd 
*restore_cmd) +{ + restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); /* wire order: head, GPU ids, entries, dmabuf fds - same order as recv_parallel_restore_cmd() */ + if (ret) { + goto err; + } + + ret = send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, 
parallel_restore_cmd *restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int 
recv_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Recv GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd, restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); /* all-zero head: fd_write_num == 0 is what check_quit_cmd() tests */ + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + ret = -1; /* fall through to err so sock_fd is closed instead of leaked */ + } + +err: + close(sock_fd); + 
return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 0000000000..d7200c6bd5 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,54 @@ +#ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands.*/ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer object. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. 
+ */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int install_parallel_sock(void); + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + +#endif \ No newline at end of file diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 99e4caf743..9ccb042249 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -93,7 +93,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int fd[2], buf_off; if (pipe(fd) != 0) { - pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); + pr_perror("Couldn't create pipes for reading cuda-checkpoint output"); return -1; } @@ -101,7 +101,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int child_pid = fork(); if (child_pid == -1) { - pr_err("Failed to fork to exec cuda-checkpoint\n"); + pr_perror("Failed to fork to exec cuda-checkpoint"); close(fd[READ]); close(fd[WRITE]); return -1; @@ -166,7 +166,6 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) } if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); - pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); } else if (WIFEXITED(status)) { exit_code = WEXITSTATUS(status); @@ -283,8 +282,8 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse * a compel_interrupt_task() */ if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { - pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in 
strange state\n", - restore_tid); + pr_perror("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state", + restore_tid); return -1; } @@ -295,12 +294,12 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse } if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { - pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid); + pr_perror("Failed to set ptrace options on interrupt for restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { - pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid); + pr_perror("Unable to restore original sigmask to restore tid %d", restore_tid); return -1; } @@ -312,7 +311,7 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) k_rtsigset_t block; if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { - pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid); + pr_perror("Failed to get current sigmask for restore tid %d", restore_tid); return -1; } @@ -320,18 +319,18 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { - pr_err("Failed to block signals on restore tid %d\n", restore_tid); + pr_perror("Failed to block signals on restore tid %d", restore_tid); return -1; } // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { - pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid); + pr_perror("Could not clear ptrace options on restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { - pr_err("Could not resume cuda restore tid %d\n", restore_tid); + pr_perror("Could not resume cuda restore 
tid %d", restore_tid); return -1; } @@ -396,12 +395,9 @@ int cuda_plugin_checkpoint_devices(int pid) status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); - goto interrupt; } -interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? -1 : int_ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index d843793ea2..ed883f3002 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -1,53 +1,14 @@ FROM alpine ARG CC=gcc -RUN apk update && apk add \ - $CC \ - bash \ - build-base \ - coreutils \ - procps \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - nftables \ - nftables-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - py3-pip \ - py3-protobuf \ - python3 \ - sudo \ - libcap-utils \ - libdrm-dev \ - util-linux \ - util-linux-dev - COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date -RUN apk add \ - ip6tables \ - iptables \ - iptables-legacy \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - e2fsprogs \ - py-yaml \ - py3-importlib-metadata \ - asciidoctor +RUN apk add --no-cache "$CC" && /criu/contrib/dependencies/apk-packages.sh + +RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml --break-system-packages - RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 9d11194bb0..261bd2d799 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -5,41 +5,11 @@ ARG CC=gcc # Initialize machine ID RUN systemd-machine-id-setup -RUN pacman -Syu --noconfirm \ - $CC \ - bash \ - make \ - coreutils \ - git \ - gnutls \ - libaio \ - libcap \ - libnet \ - libnl \ - nftables \ - pkgconfig \ - protobuf-c \ - protobuf \ - python-pip \ - python-protobuf \ - which \ - sudo \ - iptables \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - python-yaml \ - asciidoctor \ - python-junit-xml \ - python-importlib-metadata \ - libdrm \ - util-linux-libs \ - diffutils - COPY . 
/criu WORKDIR /criu + +RUN pacman -Syu --noconfirm "$CC" && contrib/dependencies/pacman-packages.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 deleted file mode 100644 index a672123441..0000000000 --- a/scripts/build/Dockerfile.centos8 +++ /dev/null @@ -1,50 +0,0 @@ -FROM registry.centos.org/centos/centos:8 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core -RUN yum config-manager --set-enabled powertools -RUN yum install -y --allowerasing \ - asciidoc \ - coreutils \ - chkconfig \ - diffutils \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libselinux-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-devel \ - python3-PyYAML \ - python3-protobuf \ - python3-pip \ - sudo \ - tar \ - which \ - xmlto - -RUN alternatives --set python /usr/bin/python3 -ENV PYTHON=python3 - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN pip3 install junit_xml - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index 9d3bb0f879..c26a5fd576 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ b/scripts/build/Dockerfile.fedora.tmpl @@ -1,11 +1,10 @@ ARG CC=gcc -COPY scripts/ci/prepare-for-fedora-rawhide.sh /bin/prepare-for-fedora-rawhide.sh -RUN /bin/prepare-for-fedora-rawhide.sh - COPY . 
/criu WORKDIR /criu +RUN dnf install -y "$CC" && scripts/ci/prepare-for-fedora-rawhide.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index 6caf9d0b1b..cd632dddf5 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,30 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-alpine ARG CC=gcc -RUN apk update && apk add \ - bash \ - build-base \ - coreutils \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - python3 \ - sudo \ - maven \ - ip6tables \ - iptables \ - util-linux-dev \ - bash - COPY . /criu WORKDIR /criu +RUN apk add --no-cache maven "$CC" && contrib/dependencies/apk-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 67de916acb..a459e1ec71 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,33 +1,11 @@ -FROM docker.io/library/eclipse-temurin:11-focal +FROM docker.io/library/eclipse-temurin:11-jammy ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index d218e06414..a37f16e495 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -1,32 +1,10 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - uuid-dev \ - python3-minimal - COPY . /criu WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN uname -m && setarch linux32 uname -m && setarch --list RUN make mrproper && date && \ diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 0ae4727d2c..18664f100a 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,34 +1,12 @@ -FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-jammy ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl index e95a433067..8933a6c828 100644 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -1,5 +1,3 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 @@ -12,33 +10,6 @@ COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ apt-get update -y -# Install required packages -RUN apt-get install -y --no-install-recommends \ - build-essential \ - pkg-config \ - git \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libnftables-dev:${DEBIAN_ARCH} \ - libgnutls28-dev:${DEBIAN_ARCH} \ - iproute2:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -55,4 +26,6 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 65ae558334..56104081f0 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -1,30 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ stable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libdrm-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -41,6 +18,8 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu +RUN contrib/dependencies/apt-cross-packages.sh + # amdgpu_plugin with armv7 is not supported RUN make mrproper && date && \ make -j $(nproc) && \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 3d6de10441..498b99be9f 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -1,39 +1,11 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install +COPY . 
/criu +WORKDIR /criu # On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default # We need to install kmod to enable iptables to load these modules for us. -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnftables-dev \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - iproute2 \ - kmod \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-minimal \ - python3-protobuf \ - uuid-dev \ - python3-yaml - -COPY . /criu -WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh RUN git clean -dfx && date && \ # Check single object build diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index 3504b0433c..7edb289b6f 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -1,29 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ unstable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -40,4 +18,6 @@ ENV 
PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 566b4c9160..a666f6c262 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,5 +1,5 @@ FROM ubuntu:24.04 -COPY scripts/ci/apt-install /bin/apt-install +COPY contrib/apt-install /bin/apt-install RUN apt-install gcc-multilib diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 3893152270..a420cea942 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,4 +1,4 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 9dc0190b37..bad8065f23 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos8 archlinux +TARGETS := alpine fedora-rawhide archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME @@ -30,9 +30,9 @@ endif export CONTAINER_TERMINAL +# Here we assume that any CPU architecture besides x86_64 is running in containers +# that may not support running docker with '--privileged'. ifeq ($(UNAME),x86_64) - # On anything besides x86_64 Travis is running unprivileged LXD - # containers which do not support running docker with '--privileged'. 
CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run else CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index aaf443afdc..bc5a746675 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,6 +2,24 @@ set -x -e -o pipefail +# Workaround: Docker 28.x has a known regression that breaks the checkpoint and +# restore (C/R) feature. Let's install previous, or next major version. See +# https://github.com/moby/moby/issues/50750 for details on the bug. +export DEBIAN_FRONTEND=noninteractive +apt remove -y docker-ce docker-ce-cli +../../contrib/apt-install -y ca-certificates curl +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +chmod a+r /etc/apt/keyrings/docker.asc +# shellcheck disable=SC1091 +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list +apt update -y +apt-cache madison docker-ce | awk '{ print $3 }' +verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" +../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" + # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json service docker restart diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh index 7cf704f074..a5b13a1071 100755 --- a/scripts/ci/java-test.sh +++ b/scripts/ci/java-test.sh @@ -2,6 +2,8 @@ cd ../.. || exit 1 +sudo modprobe iptable_filter + failures="" docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . 
diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh index d5646468e8..7e00ab65a8 100755 --- a/scripts/ci/loongarch64-qemu-test.sh +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -4,7 +4,7 @@ set -o nounset set -o errexit set -x -./apt-install \ +../../contrib/apt-install \ apt-transport-https \ ca-certificates \ curl \ @@ -19,7 +19,7 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce +../../contrib/apt-install docker-ce # shellcheck source=/dev/null . /etc/lsb-release diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8ad9cf978..b0b45fcc3e 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -1,44 +1,21 @@ #!/bin/bash set -e -x +contrib/dependencies/dnf-packages.sh dnf install -y \ diffutils \ + e2fsprogs \ findutils \ gawk \ - gcc \ - git \ - gnutls-devel \ gzip \ - iproute \ - iptables \ - nftables \ - nftables-devel \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libbsd-devel \ + kmod \ libselinux-utils \ - make \ procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-PyYAML \ - python3-protobuf \ - python3-junit_xml \ python3-pip \ - python3-importlib-metadata \ python-unversioned-command \ redhat-rpm-config \ sudo \ - tar \ - which \ - e2fsprogs \ - rubygem-asciidoctor \ - libdrm-devel \ - libuuid-devel \ - kmod + tar # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 0c4a089757..05a3b71e8d 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,12 +1,7 @@ #!/bin/bash set -x -e -CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev - libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev - libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time libbsd-dev python3-yaml 
uuid-dev - libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata python3-junit.xml libdrm-dev) +CI_PKGS=() X86_64_PKGS=(gcc-multilib) @@ -16,13 +11,10 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # For Travis only x86_64 seems to be baremetal. Other - # architectures are running in unprivileged LXD containers. - # That seems to block most of CRIU's interfaces. - - # But with the introduction of baremetal aarch64 systems in - # Travis (arch: arm64-graviton2) we can override this using - # an environment variable + # Some tests rely on kernel features that may not be available + # when running in a container. Here we assume that x86_64 systems + # are baremetal, and skip the tests for all other CPU architectures. + # The RUN_TESTS environment variable can override this, e.g., for aarch64. [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi @@ -36,7 +28,7 @@ ci_prep () { # not run anymore with 'sudo -u \#1000' if the UID does not exist. 
adduser -u 1000 --disabled-password --gecos "criutest" criutest || : - # This can fail on aarch64 travis + # This can fail on aarch64 service apport stop || : # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user @@ -60,7 +52,8 @@ ci_prep () { CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "${CI_PKGS[@]}" + contrib/dependencies/apt-packages.sh + contrib/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" } @@ -187,7 +180,7 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then done apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "${IA32_PKGS[@]}" + contrib/apt-install "${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi @@ -262,7 +255,7 @@ if [ -z "$SKIP_EXT_DEV_TEST" ]; then fi make -C test/others/make/ run CC="$CC" -if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then +if [ -n "$CIRCLECI" ]; then # GitHub Actions (and Cirrus CI) does not provide a real TTY and CRIU will fail with: # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index ed5a011787..5f2de32b84 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,42 +6,42 @@ set -e set -x -VAGRANT_VERSION=2.4.1 -FEDORA_VERSION=40 -FEDORA_BOX_VERSION=40.20240414.0 +VAGRANT_VERSION=2.4.7 +FEDORA_VERSION=42 +FEDORA_BOX_VERSION=1.1.0 setup() { - if [ -n "$TRAVIS" ]; then - # Load the kvm modules for vagrant to use qemu - modprobe kvm kvm_intel - fi - # Tar up the git checkout to have vagrant rsync it to the VM - tar cf criu.tar ../../../criu + tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. 
wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ - ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ - openssh-client + ../../contrib/apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ + ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt - vagrant init fedora/${FEDORA_VERSION}-cloud-base --box-version ${FEDORA_BOX_VERSION} + vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} + # The default libvirt Vagrant VM uses 512MB. - # Travis VMs should have around 7.5GB. + # VMs in our CI typically have around 16GB. # Increasing it to 4GB should work. sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' + # Sync /tmp/criu.tar into the VM + # We want to use $HOME without expansion + # shellcheck disable=SC2016 + sed -i Vagrantfile -e 's|^end$| config.vm.provision "file", source: "/tmp/criu.tar", destination: "$HOME/criu.tar"'"\n"'end|g' + vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config - ssh default sudo dnf upgrade -y - ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ - libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel + # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd + + ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && 
cd /vagrant && tar xf criu.tar' + ssh default sudo dnf upgrade -y + ssh default sudo /vagrant/criu/contrib/dependencies/dnf-packages.sh ssh default cat /proc/cmdline } @@ -49,7 +49,7 @@ fedora-no-vdso() { ssh default sudo grubby --update-kernel ALL --args="vdso=0" vagrant reload ssh default cat /proc/cmdline - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going' # This test (pidfd_store_sk) requires pidfd_getfd syscall which is guaranteed in Fedora 33. # It is also skipped from -a because it runs in RPC mode only @@ -57,6 +57,10 @@ fedora-no-vdso() { } fedora-rawhide() { + # Upgrade the kernel to the latest vanilla one + ssh default sudo dnf -y copr enable @kernel-vanilla/stable + ssh default sudo dnf upgrade -y + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously # installed this reboots the VM. @@ -74,12 +78,12 @@ fedora-rawhide() { # In the container it is not possible to change the state of selinux. # Let's just disable it for this test run completely. 
ssh default 'sudo setenforce Permissive' - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' + ssh default 'cd /vagrant/criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } fedora-non-root() { ssh default uname -a - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' # Setting the capability should be the only line needed to run as non-root on Fedora # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index fb5d2ef7ad..727e9689ea 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -35,34 +35,6 @@ int main(void) } endef -define FEATURE_TEST_STRLCPY - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcpy(NULL, NULL, 0); -} -endef - -define FEATURE_TEST_STRLCAT - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcat(NULL, NULL, 0); -} -endef - define FEATURE_TEST_PTRACE_PEEKSIGINFO #include diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh deleted file mode 100755 index 8be49c7871..0000000000 --- a/scripts/install-debian-pkgs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Install required packages for development environment in Debian Distro - -REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} - -help_msg="Install required packages for development environment in Debian Distro -Usage: - scripts/install-debian-pkgs.sh" - -function print_help() -{ - exec echo -e "$help_msg" -} - -function process() -{ - sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" -} - -if [ 
"$1" = "--help" ] || [ "$1" = "-h" ]; then - print_help -else - process -fi diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 724204a03c..de5782c137 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null) +FULL_PYTHON := $(shell command -v python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ @@ -36,7 +36,7 @@ CTAGS := ctags export RM HOSTLD LD HOSTCC CC CPP AS AR STRIP OBJCOPY OBJDUMP export NM SH MAKE MKDIR AWK PERL PYTHON SH CSCOPE -export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) +export USE_ASCIIDOCTOR ?= $(shell command -v asciidoctor 2>/dev/null) # # Footer. diff --git a/test/check_actions.py b/test/check_actions.py deleted file mode 100755 index 84d738dbb7..0000000000 --- a/test/check_actions.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import os - -actions = set(['pre-dump', 'pre-restore', 'post-dump', 'setup-namespaces', \ - 'post-setup-namespaces', 'post-restore', 'post-resume', \ - 'network-lock', 'network-unlock' ]) -errors = [] -af = os.path.dirname(os.path.abspath(__file__)) + '/actions_called.txt' - -for act in open(af): - act = act.strip().split() - act.append('EMPTY') - act.append('EMPTY') - - if act[0] == 'EMPTY': - raise Exception("Error in test, bogus actions line") - - if act[1] == 'EMPTY': - errors.append('Action %s misses CRTOOLS_IMAGE_DIR' % act[0]) - - if act[0] in ('post-dump', 'setup-namespaces', 'post-setup-namespaces', \ - 'post-restore', 'post-resume', 'network-lock', 'network-unlock'): - if act[2] == 'EMPTY': - errors.append('Action %s misses CRTOOLS_INIT_PID' % act[0]) - elif not act[2].isdigit() or int(act[2]) == 0: - errors.append('Action %s PID is not number (%s)' % - (act[0], act[2])) - - actions -= set([act[0]]) - -if actions: - 
errors.append('Not all actions called: %r' % actions) - -if errors: - for x in errors: - print(x) - sys.exit(1) - -print('PASS') diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 27e2b7b155..32c57d929c 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -3,5 +3,5 @@ import ctypes libc = ctypes.CDLL(None) -# libc may not have memfd_create (e.g., centos on travis) +# libc may not have memfd_create (e.g., centos) libc.memfd_create("test".encode('utf8'), 0) diff --git a/test/jenkins/actions.sh b/test/jenkins/actions.sh deleted file mode 100755 index 8019045004..0000000000 --- a/test/jenkins/actions.sh +++ /dev/null @@ -1,8 +0,0 @@ -# Check how crit de/encodes images -set -e -source `dirname $0`/criu-lib.sh -# prep -rm -f actions_called.txt -./test/zdtm.py run -t zdtm/static/env00 --script "$(pwd)/test/show_action.sh" || fail -./test/check_actions.py || fail -exit 0 diff --git a/test/jenkins/criu-lazy-migration.pipeline b/test/jenkins/criu-lazy-migration.pipeline index 2c863f170d..45dc2c7766 100644 --- a/test/jenkins/criu-lazy-migration.pipeline +++ b/test/jenkins/criu-lazy-migration.pipeline @@ -21,7 +21,6 @@ pipeline { stage('Test'){ steps { sh './test/jenkins/run_ct sh -c "mount --make-rprivate / && mount --rbind . /mnt && cd /mnt && ./test/jenkins/criu-lazy-migration.sh"' - junit 'test/report/criu-testreport*.xml' } } } diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore index c0b6a2490a..ca9a0b5416 100644 --- a/test/others/action-script/.gitignore +++ b/test/others/action-script/.gitignore @@ -1 +1 @@ -img-dir-* +actions_called.txt diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile index f1ce191dbc..594edc0701 100644 --- a/test/others/action-script/Makefile +++ b/test/others/action-script/Makefile @@ -1,5 +1,3 @@ run: - @make -C .. 
loop ./run.sh - .PHONY: run diff --git a/test/others/action-script/action-script.sh b/test/others/action-script/action-script.sh deleted file mode 100755 index aba8292c05..0000000000 --- a/test/others/action-script/action-script.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -touch action-hook-"$CRTOOLS_SCRIPT_ACTION" diff --git a/test/others/action-script/check_actions.py b/test/others/action-script/check_actions.py new file mode 100755 index 0000000000..0140d8762d --- /dev/null +++ b/test/others/action-script/check_actions.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import os +import sys + +EXPECTED_ACTIONS = [ + 'pre-dump', + 'network-lock', + 'post-dump', + 'pre-restore', + 'setup-namespaces', + 'post-setup-namespaces', + 'post-restore', + 'network-unlock', + 'pre-resume', + 'post-resume', +] + +errors = [] +actions_called = [] +actions_called_file = os.path.join(os.path.dirname(__file__), 'actions_called.txt') + +with open(actions_called_file) as f: + for index, line in enumerate(f): + parts = line.strip().split() + parts += ['EMPTY'] * (3 - len(parts)) + action_hook, image_dir, pid = parts + + if action_hook == 'EMPTY': + raise ValueError("Error in test: bogus actions line") + + expected_action = EXPECTED_ACTIONS[index] if index < len(EXPECTED_ACTIONS) else None + if action_hook != expected_action: + raise ValueError(f"Invalid action: {action_hook} != {expected_action}") + + if image_dir == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_IMAGE_DIR') + + if action_hook != 'pre-restore': + if pid == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_INIT_PID') + elif not pid.isdigit() or int(pid) == 0: + errors.append(f'Action {action_hook} PID is not a valid number ({pid})') + + actions_called.append(action_hook) + +if actions_called != EXPECTED_ACTIONS: + errors.append(f'Not all actions called: {actions_called!r}') + +if errors: + print('\n'.join(errors)) + sys.exit(1) + +print('Check Actions PASS') diff --git 
a/test/others/action-script/run.sh b/test/others/action-script/run.sh index a82fccf359..574f6fc863 100755 --- a/test/others/action-script/run.sh +++ b/test/others/action-script/run.sh @@ -1,60 +1,11 @@ #!/bin/bash -set -ebm +set -e -# shellcheck source=test/others/env.sh -source ../env.sh || exit 1 +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -SELFDIR="$(dirname "$(readlink -f "$0")")" -SCRIPT="$SELFDIR/action-script.sh" -IMGDIR="$SELFDIR/img-dir-$$" +rm -f "${SCRIPT_DIR}"/actions_called.txt +"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 -f ns --script "$SCRIPT_DIR/show_action.sh" || exit 1 +"${SCRIPT_DIR}"/check_actions.py || exit 1 -rm -rf "$IMGDIR" -mkdir "$IMGDIR" - -trap "cleanup" QUIT TERM INT HUP EXIT - -# shellcheck disable=SC2317 -# https://github.com/koalaman/shellcheck/issues/2660 -function cleanup() -{ - if [[ -n "$PID" ]]; then - kill -9 "$PID" - fi -} - -PID=$(../loop) -if ! $CRIU dump -v4 -o dump.log -t "$PID" -D "$IMGDIR" --action-script "$SCRIPT"; then - echo "Failed to checkpoint process $PID" - cat dump.log - kill -9 "$PID" - exit 1 -fi - -if ! $CRIU restore -v4 -o restore.log -D "$IMGDIR" -d --pidfile test.pidfile --action-script "$SCRIPT"; then - echo "CRIU restore failed" - echo FAIL - exit 1 -fi - -PID=$(cat "$IMGDIR"/test.pidfile) - -found_missing_file=false -hooks=("pre-dump" "post-dump" "pre-restore" "pre-resume" "post-restore" "post-resume") - -for hook in "${hooks[@]}" -do - if [ ! 
-e "$IMGDIR/action-hook-$hook" ]; then - echo "ERROR: action-hook-$hook does not exist" - found_missing_file=true - fi -done - -if [ "$found_missing_file" = true ]; then - exit 1 -fi - -echo PASS - -rm -rf "$IMGDIR" exit 0 diff --git a/test/show_action.sh b/test/others/action-script/show_action.sh similarity index 66% rename from test/show_action.sh rename to test/others/action-script/show_action.sh index 86468b67ae..afbfc3f27e 100755 --- a/test/show_action.sh +++ b/test/others/action-script/show_action.sh @@ -1,3 +1,4 @@ #!/bin/bash + echo "${CRTOOLS_SCRIPT_ACTION} ${CRTOOLS_IMAGE_DIR} ${CRTOOLS_INIT_PID}" \ - >> "$(dirname $0)/actions_called.txt" + >> "$(dirname "$0")/actions_called.txt" diff --git a/test/others/libcriu/.gitignore b/test/others/libcriu/.gitignore index 0f6e52bb4e..30a56999c4 100644 --- a/test/others/libcriu/.gitignore +++ b/test/others/libcriu/.gitignore @@ -8,3 +8,4 @@ test_pre_dump test_feature_check output/ libcriu.so.* +test_rpc_config diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index ae73305331..927f17c236 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -3,10 +3,12 @@ include ../../../../criu/Makefile.versions TESTS += test_sub TESTS += test_self TESTS += test_notify +TESTS += test_rpc_config TESTS += test_iters TESTS += test_errno TESTS += test_join_ns TESTS += test_pre_dump +TESTS += test_check TESTS += test_feature_check all: $(TESTS) diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index f7d363aabe..6b36d44960 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -55,6 +55,7 @@ run_test() { run_test test_sub run_test test_self run_test test_notify +run_test test_rpc_config if [ "$(uname -m)" = "x86_64" ]; then # Skip this on aarch64 as aarch64 has no dirty page tracking run_test test_iters @@ -62,6 +63,7 @@ if [ "$(uname -m)" = "x86_64" ]; then fi run_test test_errno run_test test_join_ns +run_test test_check if criu check --feature 
mem_dirty_track > /dev/null; then export CRIU_FEATURE_MEM_TRACK=1 fi diff --git a/test/others/libcriu/test_check.c b/test/others/libcriu/test_check.c new file mode 100644 index 0000000000..4af3b36306 --- /dev/null +++ b/test/others/libcriu/test_check.c @@ -0,0 +1,17 @@ +#include <stdio.h> +#include "criu.h" +#include "lib.h" + +int main(int argc, char **argv) +{ + int ret; + + printf("--- Start check ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + + if (criu_check()) + return -1; + + return 0; +} diff --git a/test/others/libcriu/test_rpc_config.c b/test/others/libcriu/test_rpc_config.c new file mode 100644 index 0000000000..529f136371 --- /dev/null +++ b/test/others/libcriu/test_rpc_config.c @@ -0,0 +1,223 @@ +#include "criu.h" +#include "lib.h" + +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <time.h> +#include <unistd.h> + +#define RANDOM_NAME_LEN 6 +#define PATH_BUF_SIZE 128 + +static volatile sig_atomic_t stop = 0; +static char base_name[RANDOM_NAME_LEN + 1]; +static char log_file[PATH_BUF_SIZE]; +static char conf_file[PATH_BUF_SIZE]; + +static void handle_signal(int sig) +{ + (void)sig; + stop = 1; +} + +static void generate_random_base_name(void) +{ + const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + size_t charset_len; + int i; + + charset_len = sizeof(charset) - 1; + + for (i = 0; i < RANDOM_NAME_LEN; i++) { + base_name[i] = charset[rand() % charset_len]; + } + base_name[i] = '\0'; + + snprintf(log_file, sizeof(log_file), "/tmp/criu-%s.log", base_name); + snprintf(conf_file, sizeof(conf_file), "/tmp/criu-%s.conf", base_name); +} + +static int create_criu_config_file(void) +{ + int fd; + FILE *fp; + + srand(time(NULL)); + generate_random_base_name(); + + fd = open(conf_file, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (fd < 0) { + perror("Failed to create config file"); + return -1; + } + + fp = fdopen(fd, "w"); + if (!fp) { + perror("fdopen failed"); + close(fd); + 
unlink(conf_file); + return -1; + } + + fprintf(fp, "log-file=%s\n", log_file); + fflush(fp); + fclose(fp); + + return 0; +} + +static int check_log_file(void) /* checks that log_file exists and is non-empty, then removes it */ +{ + struct stat st; + + if (stat(log_file, &st) < 0) { + perror("Log file does not exist"); + return -1; + } + + if (st.st_size == 0) { + fprintf(stderr, "Log file is empty\n"); + return -1; + } + + unlink(log_file); + return 0; +} + +int main(int argc, char **argv) +{ + int pipe_fd[2]; + pid_t pid; + int ret; + int child_ret; + + int img_fd = open(argv[2], O_DIRECTORY); + if (img_fd < 0) { + perror("Failed to open images directory"); + return EXIT_FAILURE; /* pid and conf_file are not set yet, so the cleanup path must not run */ + } + + if (create_criu_config_file() < 0) { + printf("Failed to create config file\n"); + return EXIT_FAILURE; + } + + if (pipe(pipe_fd) < 0) { + perror("pipe"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + perror("fork failed"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /** child process **/ + printf(" `- loop: initializing\n"); + + if (setsid() < 0 || signal(SIGUSR1, handle_signal) == SIG_ERR) { + _exit(EXIT_FAILURE); + } + + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + close(pipe_fd[0]); + + child_ret = SUCC_ECODE; + write(pipe_fd[1], &child_ret, sizeof(child_ret)); + close(pipe_fd[1]); + + while (!stop) { + sleep(1); + } + + _exit(SUCC_ECODE); + } + + /** parent process **/ + close(pipe_fd[1]); + + ret = -1; + if (read(pipe_fd[0], &ret, sizeof(ret)) != sizeof(ret) || ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto cleanup; + } + + read(pipe_fd[0], &ret, 1); + close(pipe_fd[0]); + + printf("--- Loop process started (pid: %d) ---\n", pid); + + printf("--- Checkpoint ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_images_dir_fd(img_fd); + criu_set_pid(pid); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting dump RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + 
criu_set_log_file("dump.log"); + + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + printf("criu dump failed\n"); + goto cleanup; + } + + printf(" `- Dump succeeded\n"); + waitpid(pid, NULL, 0); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + + printf("--- Restore loop ---\n"); + criu_init_opts(); + criu_set_images_dir_fd(img_fd); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting restore RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("restore.log"); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + ret = EXIT_FAILURE; + goto cleanup; + } + + printf(" `- Restore returned pid %d\n", pid); + kill(pid, SIGUSR1); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + +cleanup: + if (waitpid(pid, &ret, 0) < 0) { + perror("waitpid failed"); + return EXIT_FAILURE; + } + + printf("Remove RPC config file: %s\n", conf_file); + unlink(conf_file); + return chk_exit(ret, SUCC_ECODE); +} diff --git a/test/others/pycriu/.gitignore b/test/others/pycriu/.gitignore new file mode 100644 index 0000000000..567609b123 --- /dev/null +++ b/test/others/pycriu/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/test/others/pycriu/Makefile b/test/others/pycriu/Makefile new file mode 100644 index 0000000000..b6e3b48148 --- /dev/null +++ b/test/others/pycriu/Makefile @@ -0,0 +1,63 @@ +.SHELLFLAGS := -eu -o pipefail -c +.ONESHELL: + +CRIU ?= ../../../criu/criu +BUILD_DIR ?= build +SOCKET_NAME ?= criu_service.socket +PIDFILE_NAME ?= pidfile +SERVICE_LOG ?= service.log +PYTHON ?= python3 + +PIDFILE := $(BUILD_DIR)/$(PIDFILE_NAME) +CRIU_SOCKET := $(BUILD_DIR)/$(SOCKET_NAME) +STATUS_FIFO := $(BUILD_DIR)/startup.status +STATUS_FD := 200 + +run: start + cleanup() { $(MAKE) --no-print-directory stop 
|| true; } + trap cleanup EXIT INT TERM + "$(PYTHON)" test_check.py + "$(PYTHON)" test_check_fail.py + "$(PYTHON)" test_check_images_dir.py + "$(PYTHON)" test_check_work_dir_fd.py + +start: + mkdir -p "$(BUILD_DIR)" + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + echo "Service running (PID $$(cat "$(PIDFILE)"))." + exit 0 + fi + if ! command -v "$(CRIU)" >/dev/null 2>&1; then + echo "CRIU not found at $(CRIU)" + exit 1 + fi + mkfifo "$(STATUS_FIFO)" + exec $(STATUS_FD)<>"$(STATUS_FIFO)" + "$(CRIU)" service \ + -v4 \ + -W "$(BUILD_DIR)" \ + --address "$(SOCKET_NAME)" \ + -d \ + --pidfile "$(PIDFILE_NAME)" \ + -o "$(SERVICE_LOG)" \ + --status-fd "$(STATUS_FD)" + "$(PYTHON)" read.py "$(STATUS_FIFO)" + +stop: + if [ ! -s "$(PIDFILE)" ]; then + echo "pidfile missing or empty" + exit 1 + fi + pid=$$(cat "$(PIDFILE)") + if kill -0 "$$pid" 2>/dev/null; then + kill -9 "$$pid" || true + fi + rm -f "$(PIDFILE)" "$(CRIU_SOCKET)" "$(STATUS_FIFO)" + +clean: + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + kill -9 "$$(cat "$(PIDFILE)")" || true + fi + rm -rf "$(BUILD_DIR)" + +.PHONY: start stop clean run \ No newline at end of file diff --git a/test/others/pycriu/read.py b/test/others/pycriu/read.py new file mode 120000 index 0000000000..c2c1e13658 --- /dev/null +++ b/test/others/pycriu/read.py @@ -0,0 +1 @@ +../rpc/read.py \ No newline at end of file diff --git a/test/others/pycriu/test_check.py b/test/others/pycriu/test_check.py new file mode 100755 index 0000000000..9888158db2 --- /dev/null +++ b/test/others/pycriu/test_check.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", 
"criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_fail.py b/test/others/pycriu/test_check_fail.py new file mode 100755 index 0000000000..b5634c60b4 --- /dev/null +++ b/test/others/pycriu/test_check_fail.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + # Intentionally set only log_file (no images/work dir) to ensure check() fails + criu.opts.log_file = "check.log" + + try: + criu.check() + except Exception: + print("PASS") + return 0 + + print("FAIL: check() did not fail when log_file is set without images/work dir") + return 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_images_dir.py b/test/others/pycriu/test_check_images_dir.py new file mode 100755 index 0000000000..f479c2a88f --- /dev/null +++ b/test/others/pycriu/test_check_images_dir.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def _log_path(images_dir, log_file): + return log_file if os.path.isabs(log_file) else os.path.join(images_dir, log_file) + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = 
os.path.join(build_dir, "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.images_dir = build_dir + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + except Exception as e: + lp = _log_path(build_dir, criu.opts.log_file) + msg = f"FAIL: {e} ({'see log: ' + lp if os.path.exists(lp) else 'no log found'})" + print(msg) + return 1 + + lp = _log_path(build_dir, criu.opts.log_file) + if not (os.path.isfile(lp) and os.path.getsize(lp) > 0): + print(f"FAIL: log file missing or empty: {lp}") + return 1 + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_work_dir_fd.py b/test/others/pycriu/test_check_work_dir_fd.py new file mode 100755 index 0000000000..e20a83097a --- /dev/null +++ b/test/others/pycriu/test_check_work_dir_fd.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + os.makedirs(build_dir, exist_ok=True) + + # Open a directory FD to use as work_dir_fd (prefer O_PATH if available) + flags = getattr(os, "O_PATH", 0) or os.O_RDONLY + fd = os.open(build_dir, flags) + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.work_dir_fd = fd + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + finally: + try: + os.close(fd) + except Exception: + pass + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index b2f907abee..c0e56d5289 100644 --- 
a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -12,6 +12,7 @@ run: all chmod a+rwx build chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + rm -f build/_marker_* @# Create all log files to be accessible for anybody @# so that they can be displayed by any user. for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ @@ -47,7 +48,7 @@ rpc_pb2.py: rpc.proto protoc --proto_path=. --python_out=. rpc.proto rpc.pb-c.c: rpc.proto - protoc-c --proto_path=. --c_out=. rpc.proto + protoc --proto_path=. --c_out=. rpc.proto clean: rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu diff --git a/test/others/rpc/action-script.sh b/test/others/rpc/action-script.sh new file mode 100755 index 0000000000..991e315de4 --- /dev/null +++ b/test/others/rpc/action-script.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +MARKER_FILE="_marker_${CRTOOLS_SCRIPT_ACTION}" + +if [ -z "$CRTOOLS_SCRIPT_ACTION" ]; then + echo "Error: CRTOOLS_SCRIPT_ACTION is not set." + exit 2 +fi + +if [ ! -f "$MARKER_FILE" ]; then + touch "$MARKER_FILE" +else + echo "Error: Running the same action hook for the second time" + exit 1 +fi + +exit 0 diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 6cffe270d0..c1a8276d86 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -13,6 +13,9 @@ log_file = 'config_file_test.log' does_not_exist = 'does-not.exist' +script_path = os.path.dirname(os.path.abspath(__file__)) +action_script_file = os.path.join(script_path, 'action-script.sh') + def setup_config_file(content): # Creating a temporary file which will be used as configuration file. 
@@ -89,29 +92,37 @@ def test_broken_configuration_file(): sys.exit(-1) -def search_in_log_file(log, message): - with open(os.path.join(args['dir'], log)) as f: +def search_in_log_file(log_path, message): + with open(log_path) as f: if message not in f.read(): - print( - 'FAIL: Missing the expected error message (%s) in the log file' - % message) + print('FAIL: Missing the expected error message (%s) in the log file' % message) sys.exit(-1) +def print_log_file(log_path): + print("\n--- Begin log file: %s ---" % log_path) + with open(log_path, 'r') as f: + print(f.read()) + print("--- End log file ---\n") + + def check_results(resp, log): # Check if the specified log file exists - if not os.path.isfile(os.path.join(args['dir'], log)): + log_path = os.path.join(args['dir'], log) + if not os.path.isfile(log_path): print('FAIL: Expected log file %s does not exist' % log) sys.exit(-1) # Dump should have failed with: 'The criu itself is within dumped tree' if resp.type != rpc.DUMP: print('FAIL: Unexpected msg type %r' % resp.type) + print_log_file(log_path) sys.exit(-1) if 'The criu itself is within dumped tree' not in resp.cr_errmsg: print('FAIL: Missing the expected error message in RPC response') + print_log_file(log_path) sys.exit(-1) # Look into the log file for the same message - search_in_log_file(log, 'The criu itself is within dumped tree') + search_in_log_file(log_path, 'The criu itself is within dumped tree') def test_rpc_without_configuration_file(): @@ -156,6 +167,7 @@ def test_rpc_with_configuration_file_overwriting_rpc(): # file settings in the default configuration. 
log = does_not_exist content = 'log-file ' + log + '\n' + content += 'action-script ' + action_script_file + '\n' content += 'no-tcp-established\nno-shell-job' path = setup_config_file(content) # Only set the configuration file via RPC; @@ -180,11 +192,18 @@ def test_rpc_with_configuration_file_overwriting_rpc(): cleanup_output(args['dir']) +print("*** Test broken config file ***") test_broken_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC without config file ***") test_rpc_without_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC with config file ***") test_rpc_with_configuration_file() cleanup_output(args['dir']) + +print("*** Test configuration file overwriting RPC ***") test_rpc_with_configuration_file_overwriting_rpc() cleanup_output(args['dir']) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index 4ea6c9d441..ea841199f9 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -40,7 +40,7 @@ def recv_resp(self): resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) return resp - def check_resp(self, resp, typ, err): + def check_resp(self, resp, typ, err, errmsg = None): if resp.type != typ: raise Exception('Unexpected response type ' + str(resp.type)) @@ -50,6 +50,9 @@ def check_resp(self, resp, typ, err): if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) + if errmsg and errmsg not in str(resp.cr_errmsg): + raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') + def no_process(self): print('Try to dump unexisting process') # Get pid of non-existing process. 
@@ -132,11 +135,27 @@ def bad_request(self): print('Success') + def child_first_err(self): + print('Receive correct first error message') + + req = self.get_base_req() + req.type = rpc.CHECK + # Log file must not have subdirectory + req.opts.log_file = "/foo/bar.log" + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.CHECK, None, "No subdirs are allowed in log_file name") + + print('Success') + def run(self): self.no_process() self.process_exists() self.bad_options() self.bad_request() + self.child_first_err() t = test() diff --git a/test/others/rpc/read.py b/test/others/rpc/read.py old mode 100644 new mode 100755 diff --git a/test/others/unix-callback/Makefile b/test/others/unix-callback/Makefile index 25bcf228b3..9840440773 100644 --- a/test/others/unix-callback/Makefile +++ b/test/others/unix-callback/Makefile @@ -4,7 +4,7 @@ run: all ./run.sh unix.pb-c.c: unix.proto - protoc-c --proto_path=. --c_out=. unix.proto + protoc --proto_path=. --c_out=. unix.proto unix-lib.so: unix-lib.c unix.pb-c.c gcc -g -Werror -Wall -shared -nostartfiles unix-lib.c unix.pb-c.c -o unix-lib.so -iquote ../../../criu/include -fPIC diff --git a/test/zdtm.py b/test/zdtm.py index 37ebe63b7b..e21356c30a 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -22,11 +22,11 @@ import tempfile import time import uuid +import site from builtins import input, int, open, range, str, zip import yaml -import pycriu as crpc from zdtm.criu_config import criu_config # File to store content of streamed images @@ -443,6 +443,7 @@ def __init__(self, name, desc, flavor, freezer, rootless): self._bins = [name] self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) + self._bind = desc.get('bind') self.auto_reap = True def __make_action(self, act, env=None, root=None): @@ -513,6 +514,8 @@ def start(self): if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root + if self._bind: + env['ZDTM_BIND'] = self._bind env['ZDTM_DEV'] = 
self.__flavor.devpath env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" @@ -1139,6 +1142,24 @@ def __init__(self, opts): self.__img_streamer_process = None self.__tls = self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] + + global crpc + pycriu_search_path = opts.get('pycriu_search_path') + if pycriu_search_path: + sys.path.insert(0, pycriu_search_path) + + try: + import pycriu as crpc + if pycriu_search_path: + print(f"pycriu loaded from: {crpc.__file__}") + except ImportError: + if not pycriu_search_path: + print("Consider building CRIU or using '--pycriu-search-path' option.") + raise + finally: + if pycriu_search_path: + sys.path.pop(0) + self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] self.__preload_libfault = bool(opts['preload_libfault']) @@ -1590,6 +1611,7 @@ def check(feature): def available(): if not os.access(opts['criu_bin'], os.X_OK): print("CRIU binary not found at %s" % opts['criu_bin']) + print("Consider building CRIU or using '--criu-bin' option.") sys.exit(1) def kill(self): @@ -2056,8 +2078,6 @@ def __init__(self, opts, nr_tests): self.__subs = {} self.__fail = False self.__file_report = None - self.__junit_file = None - self.__junit_test_cases = None self.__failed = [] self.__nr_skip = 0 if self.__max > 1 and self.__total > 1: @@ -2069,22 +2089,14 @@ def __init__(self, opts, nr_tests): if opts['report'] and (opts['keep_going'] or self.__total == 1): global TestSuite, TestCase - from junit_xml import TestCase, TestSuite now = datetime.datetime.now() att = 0 reportname = os.path.join(report_dir, "criu-testreport.tap") - junitreport = os.path.join(report_dir, "criu-testreport.xml") - while os.access(reportname, os.F_OK) or os.access( - junitreport, os.F_OK): + while os.access(reportname, os.F_OK): reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) - junitreport = os.path.join(report_dir, - "criu-testreport" + ".%d.xml" % att) att += 1 - 
self.__junit_file = open(junitreport, 'a') - self.__junit_test_cases = [] - self.__file_report = open(reportname, 'a') print(u"TAP version 13", file=self.__file_report) print(u"# Hardware architecture: " + arch, file=self.__file_report) @@ -2119,10 +2131,6 @@ def skip(self, name, reason): self.__runtest += 1 self.__nr_skip += 1 - if self.__junit_test_cases is not None: - tc = TestCase(name) - tc.add_skipped_info(reason) - self.__junit_test_cases.append(tc) if self.__file_report: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report) @@ -2166,7 +2174,8 @@ def run_test(self, name, desc, flavor): 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint') + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint', + 'pycriu_search_path') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2223,11 +2232,6 @@ def __wait_one(self, flags): # The following wait() is not useful for our domain logic. 
# It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() - tc = None - if self.__junit_test_cases is not None: - tc = TestCase(sub['name'], - elapsed_sec=time.time() - sub['start']) - self.__junit_test_cases.append(tc) if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2238,7 +2242,6 @@ def __wait_one(self, flags): with open(sub['log']) as sublog: output = sublog.read() details = {'output': output} - tc.add_error_info(output=output) print(testline, file=self.__file_report) print("%s" % yaml.safe_dump(details, explicit_start=True, @@ -2284,10 +2287,6 @@ def finish(self): if not opts['fault'] and check_core_files(): self.__fail = True if self.__file_report: - ts = TestSuite(opts['title'], self.__junit_test_cases, - os.getenv("NODE_NAME")) - self.__junit_file.write(TestSuite.to_xml_string([ts])) - self.__junit_file.close() self.__file_report.close() if opts['keep_going']: @@ -2857,6 +2856,9 @@ def get_cli_args(): rp.add_argument("--criu-bin", help="Path to criu binary", default='../criu/criu') + rp.add_argument("--pycriu-search-path", + help=f"Path to search for pycriu module first (e.g., {site.getsitepackages()[0]})", + default=None) rp.add_argument("--crit-bin", help="Path to crit binary", default='../crit/crit') @@ -2947,7 +2949,7 @@ def fork_zdtm(): if opts['debug']: sys.settrace(traceit) - if opts['action'] == 'run': + if opts['action'] == run_tests: criu.available() for tst in test_classes.values(): tst.available() diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index c19888da31..465285f085 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -27,7 +27,7 @@ ifeq ($(ARCH),arm) else ifeq ($(ARMV),7) ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) - # To build aarch32 on armv8 Travis-CI (see criu Makefile) + # To build aarch32 on armv8 (see criu Makefile) ARCHCFLAGS += -march=armv7-a ARMV := 7 endif @@ -66,6 +66,11 @@ endif export PKG_CONFIG_PATH endif +ifeq 
($(SHSTK_ENABLE),1) + CFLAGS += -mshstk + LDFLAGS += -Wl,-z,shstk +endif + define pkg-libs $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --libs $(1)) endef diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 3c0dbdeb80..822e09c928 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -28,8 +28,9 @@ extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path, *dev_path; + char *root, *criu_path, *dev_path, *zdtm_bind; char path[PATH_MAX]; + char bind_path[PATH_MAX]; root = getenv("ZDTM_ROOT"); if (!root) { @@ -52,6 +53,21 @@ static int prepare_mntns(void) return -1; } + zdtm_bind = getenv("ZDTM_BIND"); + if (zdtm_bind) { + /* + * Bindmount the directory to itself. + * e.g.: The mnt_ro_root test makes "/" mount readonly, but we + * still want to write logs to /zdtm/static/ so let's make it + * separate writable bind mount. + */ + snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); + if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { + fprintf(stderr, "Can't bind-mount ZDTM_BIND: %m\n"); + return -1; + } + } + dev_path = getenv("ZDTM_DEV"); if (dev_path) { snprintf(path, sizeof(path), "%s/dev", root); diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6a19cad3c2..ea901a805d 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -37,6 +37,8 @@ TST_NOFILE := \ socket_udp-corked \ socket6_udp \ socket_udp_shutdown \ + socket_icmp \ + socket6_icmp \ sk-freebind \ sk-freebind-false \ socket_udplite \ @@ -150,6 +152,7 @@ TST_NOFILE := \ maps05 \ maps09 \ maps10 \ + maps11 \ mlock_setuid \ xids00 \ groups \ @@ -287,6 +290,7 @@ TST_NOFILE := \ PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') +pkg-config-atleast-version = $(shell sh -c '$(PKG_CONFIG) --atleast-version=$(2) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ @@ -295,7 +299,10 
@@ endif ifneq ($(ARCH),arm) ifneq ($(COMPAT_TEST),y) - TST_NOFILE += maps03 + TST_NOFILE += maps03 +ifeq ($(call pkg-config-atleast-version,libtracefs,1.7),y) + TST_NOFILE += uprobes +endif endif endif @@ -312,6 +319,7 @@ TST_FILE = \ write_read02 \ write_read10 \ maps00 \ + maps12 \ link10 \ file_attr \ deleted_unix_sock \ @@ -378,6 +386,8 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + sk-unix-restore-fs-share \ + mnt_ext_file_bind_auto \ TST_DIR = \ cwd00 \ @@ -423,6 +433,7 @@ TST_DIR = \ mntns_ghost \ mntns_ghost01 \ mntns_ro_root \ + mnt_ro_root \ mntns_link_ghost \ mntns_shared_bind \ mntns_shared_bind02 \ @@ -628,6 +639,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 +socket6_icmp: CFLAGS += -DZDTM_IPV6 sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO sock_tcp_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS @@ -719,6 +731,9 @@ sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 cgroupv2_01: LDLIBS += -pthread +uprobes: CFLAGS += $(call pkg-cflags, libtracefs libtraceevent) +uprobes: LDLIBS += $(call pkg-libs, libtracefs libelf) + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) @@ -733,7 +748,7 @@ criu-rtc.pb-c.c: criu-rtc.proto $(Q)echo $@ >> .gitignore $(Q)echo $(@:%.c=%.h) >> .gitignore $(E) " PBCC " $@ - $(Q)protoc-c --proto_path=. --c_out=. criu-rtc.proto + $(Q)protoc --proto_path=. --c_out=.
criu-rtc.proto criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c $(E) " LD " $@ diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index 31d0d92b2f..3d952ac95d 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -6,6 +6,10 @@ #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -18,6 +22,10 @@ #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) { char *tok; @@ -41,6 +49,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -57,6 +67,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* * Anything else is just ignored. 
diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 29f1372c9a..38244f0205 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -2,11 +2,19 @@ #include "zdtmtst.h" #include "get_smaps_bits.h" +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif -const char *test_doc = "Test shared memory with advises"; +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test private memory with advises"; const char *test_author = "Cyrill Gorcunov "; struct mmap_data { @@ -23,8 +31,14 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) { m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, flags, -1, 0); if (m->start == MAP_FAILED) { - pr_perror("mmap failed"); - return -1; + if (errno == EINVAL) { + test_msg("mmap failed, no kernel support\n"); + *m = (struct mmap_data){}; + return 0; + } else { + pr_perror("mmap failed"); + return -1; + } } if (madvise(m->start, MEM_SIZE, adv)) { @@ -43,12 +57,12 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[5] = {}; + struct mmap_data m[7] = {}; size_t i; test_init(argc, argv); - test_msg("Alloc growsdown\n"); + test_msg("Alloc dontfork\n"); if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) return -1; @@ -64,10 +78,18 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) return -1; - test_msg("Alloc dontfork/random|mergeable\n"); + test_msg("Alloc mergeable\n"); if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, MADV_MERGEABLE)) return -1; + test_msg("Alloc wipeonfork\n"); + if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) + return -1; + + test_msg("Alloc droppable\n"); + if (alloc_anon_mmap(&m[6], MAP_DROPPABLE | MAP_ANONYMOUS, MADV_NORMAL)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 
0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) diff --git a/test/zdtm/static/maps11.c b/test/zdtm/static/maps11.c new file mode 100644 index 0000000000..df309714b0 --- /dev/null +++ b/test/zdtm/static/maps11.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test MAP_DROPPABLE/MADV_WIPEONFORK mappings with 2 processes"; +const char *test_author = "Alexander Mikhalitsyn "; + +bool mem_is_zero(const uint8_t *buffer, size_t length) +{ + size_t i; + + for (i = 0; i < length; i++) + if (buffer[i] != 0) + return false; + + return true; +} + +int main(int argc, char **argv) +{ + uint8_t *p1, *p2; + pid_t pid; + int status; + const char data[] = "MADV_WIPEONFORK vma data"; + bool criu_was_there = false; + struct stat st1, st2; + + test_init(argc, argv); + + p1 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_DROPPABLE | MAP_ANONYMOUS, 0, 0); + if (p1 == MAP_FAILED) { + if (errno == EINVAL) { + skip("mmap failed, no kernel support for MAP_DROPPABLE\n"); + goto skip; + } else { + pr_perror("mmap failed"); + return -1; + } + } + + p2 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (p2 == MAP_FAILED) { + pr_perror("mmap failed"); + return 1; + } + + if (madvise(p2, sizeof(data), MADV_WIPEONFORK)) { + pr_perror("madvise failed"); + return -1; + } + + /* contents of this mapping is supposed to be dropped after C/R */ + memcpy(p1, data, sizeof(data)); + + /* contents of this mapping is supposed to be dropped after fork() */ + memcpy(p2, data, sizeof(data)); + + /* + * Let's spawn a process before C/R so our mappings get inherited + * then, after C/R we need to ensure that CRIU memory premapping + * machinery works properly. 
+ * + * It is important, because we restore MADV_WIPEONFORK on a later + * stages (after vma premapping happens) and we need to ensure that + * CRIU handles everything in a right way. + */ + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + test_waitsig(); + + /* + * Both mappings have VM_WIPEONFORK flag set, + * so we expect to have it null-ified after fork(). + */ + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("1st child: memory check failed\n"); + return 1; + } + + return 0; + } + + /* + * A simple way to detect if C/R happened is to compare st_ino + * fields of stat() on the procfs files of the current task. + * + * Hopefully, this terrible hack is never used in real-world + * applications ;-) Here, we only need this to make test + * to pass with/without --nocr option. + */ + if (stat("/proc/self/status", &st1)) { + pr_perror("stat"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* signal a child process to continue */ + if (kill(pid, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("1st waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("1st process didn't exit cleanly: status=%d", status); + goto err; + } + + if (stat("/proc/self/status", &st2)) { + pr_perror("stat"); + return 1; + } + + /* detect CRIU */ + criu_was_there = st1.st_ino != st2.st_ino; + + /* + * We should mark failure if one of the following happens: + * 1. MAP_DROPPABLE memory is not zero after C/R + * 2. MAP_DROPPABLE memory somehow changed without C/R + * (kernel issue? memory pressure?) + * 3. MADV_WIPEONFORK memory is not preserved + * + * We care about 2nd case only because we would like test + * to pass even with --nocr zdtm.py option. 
+ */ + if ((criu_was_there && !mem_is_zero(p1, sizeof(data))) || + (!criu_was_there && memcmp(p1, data, sizeof(data))) || + memcmp(p2, data, sizeof(data))) { + fail("Data mismatch"); + return 1; + } + + /* contents of these mappings is supposed to be dropped after fork() */ + memcpy(p1, data, sizeof(data)); + memcpy(p2, data, sizeof(data)); + + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("2nd child: memory check failed\n"); + return 1; + } + + return 0; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("2nd waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("2nd process didn't exit cleanly: status=%d", status); + goto err; + } + + pass(); + + return 0; +err: + if (waitpid(-1, NULL, WNOHANG) == 0) { + kill(pid, SIGTERM); + wait(NULL); + } + return 1; + +skip: + test_daemon(); + test_waitsig(); + pass(); + return 0; +} diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c new file mode 100644 index 0000000000..f0d6c23819 --- /dev/null +++ b/test/zdtm/static/maps12.c @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test madvise(MADV_GUARD_INSTALL)"; +const char *test_author = "Alexander Mikhalitsyn "; +/* some parts of code were taken from Linux kernel's kselftest guard-pages.c + written by Lorenzo Stoakes */ + +char *filename; +int fd; +TEST_OPTION(filename, string, "file name", 1); + +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif + +uint8_t *map_base; + +struct { + unsigned int pages_num; + bool filemap; +} vmas[] = { + { 2, false }, + { 2, false }, + { 2, false }, + { 2, true }, + { 2, true }, + { 2, true }, +}; + +struct { + bool guarded; + bool wipeonfork; +} pages[] = { + { false, false }, /* vmas[0] */ + { true, false }, + { 
true, false }, /* vmas[1] */ + { false, false }, + { false, false }, /* vmas[2] */ + { true, true }, + { true, false }, /* vmas[3] */ + { false, false }, + { true, false }, /* vmas[4] */ + { true, false }, + { false, false }, /* vmas[5] */ + { true, false }, +}; + +static volatile sig_atomic_t signal_jump_set; +static sigjmp_buf signal_jmp_buf; + +static void handle_sigsegv(int signo) +{ + if (!signal_jump_set) + return; + + siglongjmp(signal_jmp_buf, 1); +} + +static bool try_write_to_addr(uint8_t *ptr) +{ + bool failed; + + /* Tell signal handler to jump back here on fatal signal. */ + signal_jump_set = true; + /* If a fatal signal arose, we will jump back here and failed is set. */ + failed = sigsetjmp(signal_jmp_buf, 1) != 0; + + if (!failed) + *ptr = 'x'; + + signal_jump_set = false; + return !failed; +} + +static int setup_sigsegv_handler(void) +{ + uint8_t write_me; + + if (signal(SIGSEGV, handle_sigsegv) == SIG_ERR) { + pr_perror("setting SIGSEGV handler failed"); + return 1; + } + + /* ensure that try_write_to_addr() works properly */ + if (!try_write_to_addr(&write_me)) { + pr_err("Failed to write at valid addr. Buggy try_write_to_addr()?\n"); + return 1; + } + + if (try_write_to_addr(NULL)) { + pr_err("Failed to detect an invalid write. Buggy try_write_to_addr()?\n"); + return 1; + } + + return 0; +} + +static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap) +{ + char *map; + + map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, + MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), + filemap ? fd : -1, + filemap ? 
(off_t)((intptr_t)addr_hint - (intptr_t)map_base) : 0); + if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) + return MAP_FAILED; + + return map; +} + +static int __check_guards(const char *when, bool in_child) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + /* + * Skip pages that were never guarded, and also those + * that were, but have MADV_WIPEONFORK which means that + * guards were removed on fork. + */ + if (!pages[i].guarded || (in_child && pages[i].wipeonfork)) + continue; + + if (try_write_to_addr(&map_base[i * PAGE_SIZE])) { + pr_err("successful write to a guarded area %d %s C/R\n", + i, when); + return 1; + } + } + + return 0; +} + +static int check_guards(const char *when) +{ + int status; + pid_t pid; + + /* + * First of all, check that guards are on their places + * in a main test process. + */ + if (__check_guards(when, false)) { + return 1; + } + + /* + * Now, check that guards are on their places + * after fork(). This allows to ensure that + * combo MADV_WIPEONFORK + MADV_GUARD_INSTALL + * is restored properly too. 
+ */ + + pid = test_fork(); + if (pid < 0) { + pr_perror("check_guards: fork failed"); + return 1; + } + + if (pid == 0) { + if (__check_guards(when, true)) { + pr_err("check_guards(\"%s\") failed in child\n", when); + exit(1); + } + + exit(0); + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("check_guards: waitpid"); + return 1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + pr_err("check_guards: process didn't exit cleanly: status=%d\n", status); + return 1; + } + + return 0; +} + +static void gen_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + datagen(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc); + } +} + +static int set_pages_madvs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + if (pages[i].guarded) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_GUARD_INSTALL)) { + pr_perror("MADV_GUARD_INSTALL failed on page %d", i); + return 1; + } + } + + if (pages[i].wipeonfork) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_WIPEONFORK)) { + pr_perror("MADV_WIPEONFORK failed on page %d", i); + return 1; + } + } + } + + return 0; +} + +static int check_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + if (datachk(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc)) { + pr_err("Page %d is corrupted\n", i); + return 1; + } + } + + return 0; +} + +static int prepare_vmas(void) +{ + char *map; + int i, shift; + + shift = 0; + for (i = 0; i < ARRAY_SIZE(vmas); i++) { + map = mmap_pages(&map_base[shift * PAGE_SIZE], + vmas[i].pages_num, vmas[i].filemap); + if (map == MAP_FAILED) { + pr_err("mmap of [%d,%d] pages failed\n", + shift, shift + vmas[i].pages_num); + return 1; + } + + shift += vmas[i].pages_num; + } + + if (shift != ARRAY_SIZE(pages)) { + pr_err("Different number of pages in vmas and pages arrays.\n"); + return 1; + } + + 
return 0; +} + +int main(int argc, char **argv) +{ + unsigned int pages_num = ARRAY_SIZE(pages); + + test_init(argc, argv); + + fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("Unable to create a test file"); + return -1; + } + + if (ftruncate(fd, pages_num * PAGE_SIZE)) { + pr_perror("Unable to ftruncate a test file"); + return -1; + } + + if (setup_sigsegv_handler()) { + pr_err("setup_sigsegv_handler() failed\n"); + return 1; + } + + /* let's find a large enough area in address space */ + map_base = mmap_pages(NULL, pages_num, false); + if (map_base == MAP_FAILED) { + pr_err("mmap of %d pages failed\n", pages_num); + return 1; + } + + /* + * Now we know that we have a free vm address space area + * [map_base, map_base + pages_num * PAGE_SIZE). + * We can use (map_base) as a hint for our further mmaps. + */ + if (prepare_vmas()) { + pr_err("prepare_vmas() failed\n"); + return 1; + } + + /* fill non-guarded pages with data and preserve checksums */ + gen_pages_data(); + + if (set_pages_madvs()) { + pr_err("set_pages_madvs() failed\n"); + return 1; + } + + /* ensure that madvise(MADV_GUARD_INSTALL) works like expected */ + if (check_guards("before")) { + pr_err("check_guards(\"before\") failed\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* ensure that guards are at their places */ + if (check_guards("after")) { + fail("check_guards(\"after\") failed"); + return 1; + } + + /* check that non-guarded pages still contain original data */ + if (check_pages_data()) { + fail("check_pages_data() failed"); + return 1; + } + + pass(); + munmap(map_base, pages_num * PAGE_SIZE); + close(fd); + return 0; +} diff --git a/test/zdtm/static/maps12.desc b/test/zdtm/static/maps12.desc new file mode 100644 index 0000000000..3f7627ff3c --- /dev/null +++ b/test/zdtm/static/maps12.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'feature': 'pagemap_scan_guard_pages'} diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.c 
b/test/zdtm/static/mnt_ext_file_bind_auto.c new file mode 100644 index 0000000000..0c3b9f5fbd --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if external file mount works"; +const char *test_author = "Pavel Tikhomirov "; + +char *filename = "mnt_ext_file_bind_auto_bind_auto.file"; +TEST_OPTION(filename, string, "file name", 1); + +char *source = "mnt_ext_file_bind_auto_bind_auto.source"; + +int create_file(const char *path) +{ + int fd; + + fd = open(path, O_CREAT | O_RDWR, 0644); + if (fd < 0) { + pr_perror("open"); + return -1; + } + + close(fd); + return 0; +} + +int main(int argc, char **argv) +{ + char *zdtm_newns = getenv("ZDTM_NEWNS"); + char *tmp = "/tmp/zdtm_ext_file_bind_auto.tmp"; + char *sourcefile = "/tmp/zdtm_ext_file_bind_auto.file"; + char *root, tmpfile[PATH_MAX], testfile[PATH_MAX]; + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_err("ZDTM_ROOT is not set\n"); + return 1; + } + + if (!zdtm_newns) { + pr_err("ZDTM_NEWNS is not set\n"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare file bindmount in criu root (source for external file bindmount) */ + mkdir(tmp, 0755); + if (mount(source, tmp, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + + snprintf(tmpfile, sizeof(tmpfile), "%s/%s", tmp, filename); + if (create_file(tmpfile)) + return 1; + + if (create_file(sourcefile)) + return 1; + + if (mount(tmpfile, sourcefile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + + umount2(tmp, MNT_DETACH); + + /* Prepare file in test root (mount point for external file bindmount) */ + snprintf(testfile, sizeof(testfile), "%s/%s", root, filename); + if (create_file(testfile)) + return 1; + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + * and will be
inherited into test mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + if (mount(sourcefile, testfile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } +test: + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.desc b/test/zdtm/static/mnt_ext_file_bind_auto.desc new file mode 100644 index 0000000000..825b081274 --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.desc @@ -0,0 +1,4 @@ +{ 'opts': '--external mnt[]', + 'feature': 'mnt_id', + 'flavor': 'ns uns', + 'flags': 'suid'} diff --git a/test/zdtm/static/mnt_ro_root.c b/test/zdtm/static/mnt_ro_root.c new file mode 100644 index 0000000000..2d8370150b --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.c @@ -0,0 +1,32 @@ +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if root mount remains read-only after c/r"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* + * Note: In zdtm.py:check_visible_state() we already check for all + * tests, that all mounts in the test's mount namespace remain the + * same, by comparing mountinfo before and after c/r. So rw/ro mount + * option inconsistency will be detected there and we don't need to + * check it in the test itself. 
+ */ + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ro_root.desc b/test/zdtm/static/mnt_ro_root.desc new file mode 100644 index 0000000000..c9a8e4f186 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.desc @@ -0,0 +1,6 @@ +{ + 'flavor': 'ns uns', + 'flags': 'suid', + 'feature': 'mnt_id', + 'bind': 'zdtm/static', +} diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 0f94c40a79..03b478b7d7 100644 --- a/test/zdtm/static/netns_sub_sysctl.c +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -1,4 +1,6 @@ #include +#include +#include #include "zdtmtst.h" #include "sysctl.h" @@ -20,6 +22,7 @@ typedef struct { int new; char s_old[MAX_STR_SYSCTL_LEN]; char s_new[MAX_STR_SYSCTL_LEN]; + bool set; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" @@ -38,6 +41,11 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { + if (access(p->path, W_OK) != 0) { + test_msg("%s doesn't exist\n", p->path); + continue; + } + p->set = true; if (p->type == SYSCTL_INT) { p->old = (((unsigned)lrand48()) % 1023) + 1; if (sysctl_write_int(p->path, p->old)) { @@ -56,6 +64,8 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { + if (!p->set) + continue; if (p->type == SYSCTL_INT) { if (sysctl_read_int(p->path, &p->new)) ret = 1; diff --git a/test/zdtm/static/netns_sub_sysctl.desc b/test/zdtm/static/netns_sub_sysctl.desc index 5358426683..0c357aefe4 100644 --- a/test/zdtm/static/netns_sub_sysctl.desc +++ b/test/zdtm/static/netns_sub_sysctl.desc @@ -1,4 +1,4 @@ { - 'flavor': 'ns', + 'flavor': 'ns uns', 'flags': 'suid' } diff --git a/test/zdtm/static/sk-unix-restore-fs-share.c b/test/zdtm/static/sk-unix-restore-fs-share.c new file mode 100644 index 0000000000..d4f6dde75d --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"zdtmtst.h" + +const char *test_doc = "Test non-empty process group with terminated parent and unix socket"; +const char *test_author = "Qiao Ma "; + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +static int create_and_connect(void) +{ + struct sockaddr_un addr; + int client_fd; + + client_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (client_fd == -1) { + pr_perror("socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", filename) >= (int)sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + close(client_fd); + return -1; + } + + if (connect(client_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("connect"); + close(client_fd); + return -1; + } + + return 0; +} + +static int child(int ready_fd) +{ + int listen_fd; + struct sockaddr_un addr; + int ret = EXIT_FAILURE; + + listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd == -1) { + pr_perror("socket"); + return EXIT_FAILURE; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (strlen(filename) >= sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + goto cleanup; + } + strncpy(addr.sun_path, filename, sizeof(addr.sun_path)); + + unlink(filename); /* Ignore error if file doesn't exist */ + + if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("bind"); + goto cleanup; + } + + if (listen(listen_fd, 5) == -1) { + pr_perror("listen"); + goto cleanup; + } + + if (create_and_connect() != 0) { + pr_err("Failed to create and connect\n"); + goto cleanup; + } + + /* Signal parent that socket is ready */ + if (write(ready_fd, "1", 1) != 1) { + pr_perror("write ready_fd"); + goto cleanup; + } + + /* Wait indefinitely */ + pause(); + + ret = EXIT_SUCCESS; +cleanup: + if (listen_fd != -1) + close(listen_fd); + unlink(filename); + + return ret; +} + +static int zombie_leader(int *cpid) +{ + char buf; + pid_t pid; + int 
pipefd[2]; + + if (pipe(pipefd) == -1) { + pr_perror("pipe"); + return EXIT_FAILURE; + } + + if (setpgid(0, 0) == -1) { + pr_perror("setpgid"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork child"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /* Close read end */ + close(pipefd[0]); + exit(child(pipefd[1])); + } + + /* Close write end in parent */ + close(pipefd[1]); + + /* Wait for child to set up socket */ + if (read(pipefd[0], &buf, 1) != 1) { + pr_err("Failed to receive readiness signal from child\n"); + close(pipefd[0]); + return EXIT_FAILURE; + } + close(pipefd[0]); + + *cpid = pid; + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + int ret = EXIT_FAILURE, status; + pid_t pid; + int *cpid; + + test_init(argc, argv); + + cpid = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (cpid == MAP_FAILED) { + pr_perror("mmap"); + return EXIT_FAILURE; + } + *cpid = 0; + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork zombie"); + goto out; + } + + if (pid == 0) + exit(zombie_leader(cpid)); + + if (waitpid(pid, &status, 0) < 0) { + pr_perror("Failed to waitpid zombie"); + goto out; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS) { + pr_err("Unexpected exit code: %d\n", WEXITSTATUS(status)); + goto out; + } + + if (!*cpid) { + pr_err("Don't know grandchild's pid\n"); + goto out; + } + + test_daemon(); + test_waitsig(); + + ret = EXIT_SUCCESS; + pass(); +out: + /* Clean up */ + if (*cpid) + kill(*cpid, SIGKILL); + + munmap(cpid, sizeof(int)); + + return ret; +} diff --git a/test/zdtm/static/sk-unix-restore-fs-share.desc b/test/zdtm/static/sk-unix-restore-fs-share.desc new file mode 100644 index 0000000000..6c4afe5f03 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index fcf00ffed8..854aaa5911 100644 --- 
a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -31,7 +31,7 @@ int main(int argc, char **argv) static const int NOPTS = sizeof(vname) / sizeof(*vname); #undef OPT - int sock, ret = 0, val[NOPTS], rval, i; + int sock, usock, sk, ret = 0, val[NOPTS], rval, i; socklen_t len = sizeof(int); test_init(argc, argv); @@ -42,8 +42,15 @@ int main(int argc, char **argv) return 1; } + usock = socket(AF_UNIX, SOCK_STREAM, 0); + if (usock < 0) { + pr_perror("can't create unix socket"); + return 1; + } + for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { pr_perror("can't get %s", vname[i].name); return 1; @@ -51,13 +58,13 @@ int main(int argc, char **argv) val[i]++; - ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); + ret = setsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't re-get %s", vname[i].name); return 1; @@ -78,7 +85,8 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? 
usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't verify %s", vname[i].name); return 1; @@ -93,6 +101,7 @@ int main(int argc, char **argv) pass(); close(sock); + close(usock); return 0; } diff --git a/test/zdtm/static/socket-tcp-closing.c b/test/zdtm/static/socket-tcp-closing.c index 87e1d75337..df291d4464 100644 --- a/test/zdtm/static/socket-tcp-closing.c +++ b/test/zdtm/static/socket-tcp-closing.c @@ -31,10 +31,13 @@ static int port = 8880; int fill_sock_buf(int fd) { + char zdtm[512]; int flags; int size; int ret; + memset(zdtm, 5, sizeof(zdtm)); + flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_perror("Can't get flags"); @@ -47,7 +50,6 @@ int fill_sock_buf(int fd) size = 0; while (1) { - char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) diff --git a/test/zdtm/static/socket6_icmp.c b/test/zdtm/static/socket6_icmp.c new file mode 120000 index 0000000000..24d8fd8067 --- /dev/null +++ b/test/zdtm/static/socket6_icmp.c @@ -0,0 +1 @@ +socket_icmp.c \ No newline at end of file diff --git a/test/zdtm/static/socket_icmp.c b/test/zdtm/static/socket_icmp.c new file mode 100644 index 0000000000..f72e348bf4 --- /dev/null +++ b/test/zdtm/static/socket_icmp.c @@ -0,0 +1,128 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for ICMP socket\n"; +const char *test_author = "समीर सिंह Sameer Singh \n"; + +/* Description: + * Send a ping to localhost using ICMP socket + */ + +#include +#include +#include +#include +#if defined(ZDTM_IPV6) +#include +#else +#include +#endif +#include +#include +#include + +#include "sysctl.h" + +#define PACKET_SIZE 64 +#define RECV_TIMEOUT 1 + +static int echo_id = 1234; + +#if defined(ZDTM_IPV6) +#define TEST_ICMP_ECHOREPLY ICMP6_ECHOREPLY +#else +#define TEST_ICMP_ECHOREPLY ICMP_ECHOREPLY +#endif +int main(int argc, char **argv) +{ + int ret, sock, seq = 0; + char packet[PACKET_SIZE], recv_packet[PACKET_SIZE]; + + 
struct timeval tv; +#if defined(ZDTM_IPV6) + struct sockaddr_in6 addr, recv_addr; +#else + struct icmphdr icmp_header, *icmp_reply; +#endif + struct sockaddr_in addr, recv_addr; + socklen_t addr_len; + + // Allow GIDs 0-58468 to open an unprivileged ICMP socket + if (sysctl_write_str("/proc/sys/net/ipv4/ping_group_range", "0 58468")) + return -1; + + test_init(argc, argv); + +#if defined(ZDTM_IPV6) + sock = socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6); +#else + sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP); +#endif + if (sock < 0) { + pr_perror("Can't create socket"); + return 1; + } + + tv.tv_sec = RECV_TIMEOUT; + tv.tv_usec = 0; + if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) { + pr_perror("Can't set socket option"); + return 1; + } + + memset(&addr, 0, sizeof(addr)); + memset(&icmp_header, 0, sizeof(icmp_header)); +#if defined(ZDTM_IPV6) + addr.sin6_family = AF_INET6; + inet_pton(AF_INET6, "::1", &addr.sin6_addr); + + icmp_header.icmp6_type = ICMP6_ECHO_REQUEST; + icmp_header.icmp6_code = 0; + icmp_header.icmp6_id = echo_id; + icmp_header.icmp6_seq = seq; +#else + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + icmp_header.type = ICMP_ECHO; + icmp_header.code = 0; + icmp_header.un.echo.id = echo_id; + icmp_header.un.echo.sequence = seq; +#endif + + memcpy(packet, &icmp_header, sizeof(icmp_header)); + memset(packet + sizeof(icmp_header), 0xa5, + PACKET_SIZE - sizeof(icmp_header)); + + test_daemon(); + test_waitsig(); + + ret = sendto(sock, packet, PACKET_SIZE, 0, + (struct sockaddr *)&addr, sizeof(addr)); + + if (ret < 0) { + fail("Can't send"); + return 1; + } + + addr_len = sizeof(recv_addr); + + ret = recvfrom(sock, recv_packet, sizeof(recv_packet), 0, + (struct sockaddr *)&recv_addr, &addr_len); + + if (ret < 0) { + fail("Can't recv"); + return 1; + } + + icmp_reply = (struct icmphdr *)recv_packet; + + if (icmp_reply->type != ICMP_ECHOREPLY) { + fail("Got no ICMP_ECHO_REPLY"); + return 1; + } + + 
close(sock); + + pass(); + return 0; +} diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c new file mode 100644 index 0000000000..4164375b7a --- /dev/null +++ b/test/zdtm/static/uprobes.c @@ -0,0 +1,295 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test the --allow-uprobes option"; +const char *test_author = "Shashank Balaji "; + +#define UPROBE_GROUP_NAME "zdtm" +#define UPROBE_EVENT_NAME "uprobes_test" +#define UPROBED_FUNCTION uprobe_target + +/* + * A uprobe can be set at the start of a function, but not all instructions + * will trigger the creation of a uprobes vma. + * + * Examples: + * - aarch64: if the function is a single `ret`, then no vma creation + * - x64: if the function is `nop; ret`, then no vma creation + * + * So to guarantee vma creation, create a volatile dummy variable (to prevent + * compiler optimization) and use it (to prevent "unused variable" warning) + */ +void UPROBED_FUNCTION(void) { + volatile int dummy = 0; + dummy += 1; +} +/* Calling via volatile function pointer ensures noinline at callsite */ +typedef void (*func_ptr)(void); +volatile func_ptr uprobe_target_alias = UPROBED_FUNCTION; + +struct uprobe_context { + struct tracefs_instance *instance; + struct tracefs_dynevent *uprobe; +}; + +volatile bool got_sigtrap = false; + +/* + * Returns the file offset of a symbol in the executable of this program + * Returns 0 on failure +*/ +uint64_t calc_sym_offset(const char *sym_name) +{ + GElf_Shdr section_header; + Elf_Scn *section = NULL; + Elf_Data *symtab_data; + uint64_t offset = 0; + char buf[PATH_MAX]; + GElf_Sym symbol; + ssize_t n_bytes; + int n_entries; + Elf *elf; + int fd; + int i; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_err("ELF version of libelf is lower than that of the program\n"); + return 0; + } + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + 
pr_perror("Failed to readlink /proc/self/exe"); + return 0; + } + buf[n_bytes] = '\0'; + + fd = open(buf, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open self-executable"); + return 0; + } + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_err("%s\n", elf_errmsg(elf_errno())); + goto out_fd; + } + + /* Look for the symbol table section and its header */ + while ((section = elf_nextscn(elf, section)) != NULL) { + gelf_getshdr(section, §ion_header); + if (section_header.sh_type == SHT_SYMTAB) + break; + } + if (!section) { + pr_err("Failed to find symbol table\n"); + goto out_elf; + } + symtab_data = elf_getdata(section, NULL); + n_entries = section_header.sh_size / section_header.sh_entsize; + + /* Look for a symbol with the required name */ + for (i = 0; i < n_entries; i++) { + gelf_getsym(symtab_data, i, &symbol); + /* Symbol table's sh_link is the index of the string table section header */ + if (!strcmp(sym_name, + elf_strptr(elf, section_header.sh_link, symbol.st_name))) + break; + } + if (i == n_entries) { + pr_err("Failed to find symbol \"%s\"\n", sym_name); + goto out_elf; + } + + /* Get the section the symbol belongs to (mostly .text) */ + section = elf_getscn(elf, symbol.st_shndx); + gelf_getshdr(section, §ion_header); + offset = symbol.st_value - section_header.sh_addr + section_header.sh_offset; + +out_elf: + elf_end(elf); +out_fd: + close(fd); + return offset; +} + +/* + * Set and enable a uprobe on the file at the given offset + * Returns struct uprobe_context with members set to NULL on failure +*/ +struct uprobe_context enable_uprobe(const char *file, uint64_t offset) +{ + struct tracefs_instance *trace_instance; + struct tracefs_dynevent *uprobe; + struct uprobe_context context = {}; + + trace_instance = tracefs_instance_create("zdtm_uprobes_test"); + if (!trace_instance) { + pr_perror("Failed to create tracefs instance"); + return context; + } + tracefs_instance_reset(trace_instance); + + uprobe = 
tracefs_uprobe_alloc(UPROBE_GROUP_NAME, UPROBE_EVENT_NAME, file, offset, NULL); + if (!uprobe) { + pr_perror("Failed to allocate uprobe"); + goto instance_destroy; + } + + if (tracefs_dynevent_create(uprobe)) { + pr_perror("Failed to create uprobe"); + goto uprobe_free; + } + + if (tracefs_event_enable(trace_instance, UPROBE_GROUP_NAME, UPROBE_EVENT_NAME)) { + pr_perror("Failed to enable uprobe"); + goto uprobe_destroy; + } + + context.instance = trace_instance; + context.uprobe = uprobe; + return context; + +uprobe_destroy: + tracefs_dynevent_destroy(uprobe, false); +uprobe_free: + tracefs_dynevent_free(uprobe); +instance_destroy: + tracefs_instance_destroy(trace_instance); + tracefs_instance_free(trace_instance); + return context; +} + +void destroy_uprobe(struct uprobe_context context) +{ + tracefs_dynevent_destroy(context.uprobe, true); + tracefs_dynevent_free(context.uprobe); + tracefs_instance_destroy(context.instance); + tracefs_instance_free(context.instance); +} + +/* + * Check for the existence of the "[uprobes]" vma in /proc/self/maps + * Returns -1 on failure, 0 if not found, 1 if found +*/ +int uprobes_vma_exists(void) +{ + FILE *f; + char buf[LINE_MAX]; + int ret = 0; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + pr_perror("Failed to open /proc/self/maps"); + return -1; + } + + while (fgets(buf, sizeof(buf), f)) { + if (strstr(buf, "[uprobes]")) { + ret = 1; + break; + } + } + if (ret == 0 && !feof(f)) { + pr_err("Failed to finish reading /proc/self/maps\n"); + ret = -1; + } + + fclose(f); + return ret; +} + +/* + * SIGTRAP is sent if execution reaches a previously set uprobed location, and + * the corresponding uprobe is not active. 
We don't want this to happen on restore +*/ +void sigtrap_handler(int signo, siginfo_t *info, void* context) +{ + if (info->si_code == SI_KERNEL) { + got_sigtrap = true; + fail("SIGTRAP on attempting to call uprobed function"); + } +} + +int main(int argc, char **argv) +{ + struct uprobe_context context; + struct sigaction sa; + char buf[PATH_MAX]; + uint64_t offset; + int n_bytes; + int ret = 1; + + test_init(argc, argv); + + offset = calc_sym_offset(__stringify(UPROBED_FUNCTION)); + if (!offset) + return 1; + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 1; + } + buf[n_bytes] = '\0'; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = sigtrap_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGTRAP, &sa, NULL)) { + pr_perror("Failed to set SIGTRAP handler"); + return 1; + } + + context = enable_uprobe(buf, offset); + if (!context.instance) + return 1; + + /* + * Execution must reach the uprobed location at least once + * for the kernel to create the uprobes vma + */ + uprobe_target_alias(); + + switch (uprobes_vma_exists()) { + case -1: + goto out_uprobe; + break; + case 0: + pr_err("uprobes vma does not exist\n"); + goto out_uprobe; + break; + case 1: + test_msg("Found uprobes vma\n"); + break; + } + + test_daemon(); + test_waitsig(); + + /* + * Calling the uprobed function after restore should not cause + * a SIGTRAP, since the uprobe is still active + */ + uprobe_target_alias(); + if (!got_sigtrap) { + pass(); + ret = 0; + } + +out_uprobe: + destroy_uprobe(context); + return ret; +} diff --git a/test/zdtm/static/uprobes.desc b/test/zdtm/static/uprobes.desc new file mode 100644 index 0000000000..6eab1f4982 --- /dev/null +++ b/test/zdtm/static/uprobes.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'cgroupns', + 'flags': 'suid nouser', + 'flavor': 'h', + 'opts': '--allow-uprobes' +}