diff --git a/.codechecker.json b/.codechecker.json new file mode 100644 index 00000000000..6d7ef70943e --- /dev/null +++ b/.codechecker.json @@ -0,0 +1,6 @@ +{ + "analyze": [ + "--disable=misc-header-include-cycle", + "--disable=clang-diagnostic-unused-parameter" + ] +} diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 721e2c63221..0b898836157 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -32,6 +32,7 @@ d6ab15362548b8fe270bd14d5153b8d94e1b15c0 b12cf444edea15da6274975e1b2ca6a7fce2a090 364c27f5d18ab9dd31825e67a93efabecad06823 d8b4de9076531dd13bdffa20cc10c72290a52356 +bdf06bca7534fbc0c4fc3cee3408a51a22615226 # ocp-indent d018d26d6acd4707a23288b327b49e44f732725e diff --git a/.github/workflows/1.249-lcm.yml b/.github/workflows/1.249-lcm.yml index 39132476bd9..8ba69e28ec2 100644 --- a/.github/workflows/1.249-lcm.yml +++ b/.github/workflows/1.249-lcm.yml @@ -1,5 +1,7 @@ name: Build and test (1.249-lcm, scheduled) +permissions: {} + on: schedule: # run every Monday, this refreshes the cache @@ -8,7 +10,9 @@ on: jobs: python-test: name: Python tests - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 + permissions: + contents: read strategy: fail-fast: false matrix: @@ -24,7 +28,7 @@ jobs: ocaml-test: name: Ocaml tests - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout code diff --git a/.github/workflows/codechecker.yml b/.github/workflows/codechecker.yml new file mode 100644 index 00000000000..bb3a9fa4304 --- /dev/null +++ b/.github/workflows/codechecker.yml @@ -0,0 +1,83 @@ +name: Run CodeChecker static analyzer on XAPI's C stubs +permissions: {} + +on: + push: + pull_request: + branches: + - master + - 'feature/**' + - '*-lcm' + +concurrency: # On new push, cancel old workflows from the same PR, branch or tag: + group: ${{ github.workflow }}-${{github.event_name}}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + staticanalyzer: + name: Static analyzer for OCaml C stubs + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + env: + XAPI_VERSION: "v0.0.0-${{ github.sha }}" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup XenAPI environment + uses: ./.github/workflows/setup-xapi-environment + with: + xapi_version: ${{ env.XAPI_VERSION }} + ocaml_version: "4.14.2" + + - name: Install dune-compiledb to generate compile_commands.json + run: | + opam pin add -y ezjsonm https://github.com/mirage/ezjsonm/releases/download/v1.3.0/ezjsonm-1.3.0.tbz + opam pin add -y dune-compiledb https://github.com/edwintorok/dune-compiledb/releases/download/0.6.0/dune-compiledb-0.6.0.tbz + + - name: Trim dune cache + run: opam exec -- dune cache trim --size=2GiB + + - name: Generate compile_commands.json + run: opam exec -- make compile_commands.json + + - name: Upload compile commands json + uses: actions/upload-artifact@v4 + with: + path: ${{ github.workspace }}/compile_commands.json + + - uses: whisperity/codechecker-analysis-action@v1 + id: codechecker + with: + ctu: true + logfile: ${{ github.workspace }}/compile_commands.json + analyze-output: "codechecker_results" + + - name: Upload CodeChecker report + uses: actions/upload-artifact@v4 + with: + name: codechecker_results + path: "${{ steps.codechecker.outputs.result-html-dir }}" + + # cppcheck even for other analyzers apparently, this is + # codechecker's output + - name: convert to SARIF + shell: bash + run: report-converter "codechecker_results" --type cppcheck --output 
codechecker.sarif --export sarif + + - name: Upload CodeChecker SARIF report + uses: actions/upload-artifact@v4 + with: + name: codechecker_sarif + path: codechecker.sarif + + # TODO: reenable after fixing + # https://github.blog/changelog/2025-07-21-code-scanning-will-stop-combining-multiple-sarif-runs-uploaded-in-the-same-sarif-file/ + # + #- name: Upload SARIF report + # uses: github/codeql-action/upload-sarif@v3 + # with: + # sarif_file: codechecker.sarif diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 096fe18227b..08d381eeaae 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,5 +1,7 @@ name: Generate and upload docs +permissions: {} + on: push: branches: master @@ -8,6 +10,8 @@ jobs: ocaml: name: Docs runs-on: ubuntu-22.04 + permissions: + contents: read env: XAPI_VERSION: "v0.0.0-${{ github.sha }}" STORAGE_DOCDIR: .gh-pages-xapi-storage @@ -27,15 +31,23 @@ jobs: - name: Update Ubuntu repositories run: sudo apt-get update + # We set DUNE_CACHE_STORAGE_MODE, it is required for dune cache to work inside opam for now, + # otherwise it gets EXDEV and considers it a cache miss - name: Use ocaml - uses: ocaml/setup-ocaml@v2 + uses: ocaml/setup-ocaml@v3 with: ocaml-compiler: ${{ steps.dotenv.outputs.ocaml_version_full }} opam-repositories: | xs-opam: ${{ steps.dotenv.outputs.repository }} + dune-cache: true + opam-pin: false + cache-prefix: v3-${{ steps.system-info.outputs.name }}-${{ steps.system-info.outputs.release }} + env: + DUNE_CACHE_STORAGE_MODE: copy - name: Install dependencies - run: opam pin list --short | xargs opam install --deps-only -v + shell: bash + run: opam install . --deps-only -v - name: Generate xapi-storage docs run: | diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 3c2d7148f90..1b9947ccec7 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -1,5 +1,7 @@ name: Check format +permissions: {} + on: pull_request: branches: @@ -12,6 +14,8 @@ jobs: ocaml-format: name: Ocaml files runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Checkout code diff --git a/.github/workflows/generate-and-build-sdks.yml b/.github/workflows/generate-and-build-sdks.yml index 53a9b8452cb..1c9ec9dd7f4 100644 --- a/.github/workflows/generate-and-build-sdks.yml +++ b/.github/workflows/generate-and-build-sdks.yml @@ -1,5 +1,7 @@ name: Generate and Build SDKs +permissions: {} + on: workflow_call: inputs: @@ -11,6 +13,9 @@ jobs: generate-sdk-sources: name: Generate SDK sources runs-on: ubuntu-22.04 + permissions: + contents: read + steps: - name: Checkout code uses: actions/checkout@v4 @@ -19,22 +24,12 @@ jobs: uses: ./.github/workflows/setup-xapi-environment with: xapi_version: ${{ inputs.xapi_version }} + ocaml_version: "4.14.2" - name: Generate SDKs shell: bash run: opam exec -- make sdk - # sdk-ci runs some Go unit tests. 
- # This setting ensures that SDK date time - # tests are run on a machine that - # isn't using UTC - - name: Set Timezone to Tokyo for datetime tests - run: | - sudo timedatectl set-timezone Asia/Tokyo - - - name: Run CI for SDKs - uses: ./.github/workflows/sdk-ci - - name: Store C SDK source uses: actions/upload-artifact@v4 with: @@ -55,7 +50,13 @@ jobs: name: SDK_Source_PowerShell path: _build/install/default/share/powershell/* - - name: Store Go SDK Artifacts + - name: Store Java SDK source + uses: actions/upload-artifact@v4 + with: + name: SDK_Source_Java + path: _build/install/default/share/java/* + + - name: Store Go SDK source uses: actions/upload-artifact@v4 with: name: SDK_Artifacts_Go @@ -64,11 +65,16 @@ jobs: !_build/install/default/share/go/dune !_build/install/default/share/go/**/*_test.go - - name: Store Java SDK source - uses: actions/upload-artifact@v4 - with: - name: SDK_Source_Java - path: _build/install/default/share/java/* + # sdk-ci runs some Go unit tests. + # This setting ensures that SDK date time + # tests are run on a machine that + # isn't using UTC + - name: Set Timezone to Tokyo for datetime tests + run: | + sudo timedatectl set-timezone Asia/Tokyo + + - name: Run CI for SDKs + uses: ./.github/workflows/sdk-ci - name: Trim dune cache run: opam exec -- dune cache trim --size=2GiB @@ -77,6 +83,9 @@ jobs: name: Build C SDK runs-on: ubuntu-latest needs: generate-sdk-sources + permissions: + contents: read + steps: - name: Install dependencies run: sudo apt-get install libxml2-dev @@ -103,6 +112,9 @@ jobs: name: Build Java SDK runs-on: ubuntu-latest needs: generate-sdk-sources + permissions: + contents: read + steps: - name: Install dependencies run: sudo apt-get install maven @@ -120,9 +132,9 @@ jobs: distribution: 'temurin' # Java Tests are run at compile time. - # This setting ensures that SDK date time + # This setting ensures that SDK date time # tests are run on a machine that - # isn't using UTC + # isn't using UTC - name: Set Timezone to Tokyo for datetime tests run: | sudo timedatectl set-timezone Asia/Tokyo @@ -144,6 +156,9 @@ jobs: name: Build C# SDK runs-on: windows-2022 needs: generate-sdk-sources + permissions: + contents: read + steps: - name: Strip 'v' prefix from xapi version shell: pwsh @@ -158,7 +173,7 @@ jobs: # All tests builds and pipelines should # work on other timezones. 
This setting ensures that # SDK date time tests are run on a machine that - # isn't using UTC + # isn't using UTC - name: Set Timezone to Tokyo for datetime tests shell: pwsh run: Set-TimeZone -Id "Tokyo Standard Time" @@ -174,7 +189,7 @@ jobs: - name: Build C# SDK shell: pwsh run: | - dotnet build source/src ` + dotnet build source/src/XenServer.csproj ` --disable-build-servers ` --configuration Release ` -p:Version=${{ env.XAPI_VERSION_NUMBER }}-prerelease-unsigned ` @@ -186,85 +201,16 @@ jobs: name: SDK_Binaries_CSharp path: source/src/bin/Release/XenServer.NET.${{ env.XAPI_VERSION_NUMBER }}-prerelease-unsigned.nupkg - build-powershell-5x-sdk: - name: Build PowerShell 5.x SDK (.NET Framework 4.5) - needs: build-csharp-sdk - # PowerShell SDK for PowerShell 5.x needs to run on windows-2019 because - # windows-2022 doesn't contain .NET Framework 4.x dev tools - runs-on: windows-2019 - steps: - - name: Strip 'v' prefix from xapi version - shell: pwsh - run: echo "XAPI_VERSION_NUMBER=$("${{ inputs.xapi_version }}".TrimStart('v'))" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - - - name: Retrieve PowerShell SDK source - uses: actions/download-artifact@v4 - with: - name: SDK_Source_PowerShell - path: source/ - - - name: Retrieve C# SDK binaries - uses: actions/download-artifact@v4 - with: - name: SDK_Binaries_CSharp - path: csharp/ - - # Following needed for restoring packages - # when calling dotnet add package - - name: Set up dotnet CLI (.NET 6.0 and 8.0) - uses: actions/setup-dotnet@v4 - with: - dotnet-version: | - 6 - 8 - - - name: Setup project and dotnet CLI - shell: pwsh - run: | - dotnet nuget add source --name local ${{ github.workspace }}\csharp - dotnet add source/src package XenServer.NET --version ${{ env.XAPI_VERSION_NUMBER }}-prerelease-unsigned - - - name: Build PowerShell SDK (.NET Framework 4.5) - shell: pwsh - run: | - dotnet build source/src/XenServerPowerShell.csproj ` - --disable-build-servers ` - --configuration Release ` - -p:Version=${{ env.XAPI_VERSION_NUMBER }}-prerelease-unsigned ` - -p:TargetFramework=net45 ` - --verbosity=normal` - - - name: Update SDK and PS versions in "XenServerPSModule.psd1" - shell: pwsh - run: | - (Get-Content "source\XenServerPSModule.psd1") -replace "@SDK_VERSION@","${{ env.XAPI_VERSION_NUMBER }}" | Set-Content -Path "source\XenServerPSModule.psd1" - (Get-Content "source\XenServerPSModule.psd1") -replace "@PS_VERSION@","5.0" | Set-Content -Path "source\XenServerPSModule.psd1" - - - name: Move binaries to destination folder - shell: pwsh - run: | - New-Item -Path "." 
-Name "output" -ItemType "directory" - Copy-Item -Verbose "source\README_51.md" -Destination "output" -Force - Copy-Item -Verbose "source\LICENSE" -Destination "output" -Force - Copy-Item -Path "source\src\bin\Release\net45\*" -Include "*.dll" "output\" - Get-ChildItem -Path "source" |` - Where-Object { $_.Extension -eq ".ps1" -or $_.Extension -eq ".ps1xml" -or $_.Extension -eq ".psd1" -or $_.Extension -eq ".txt" } |` - ForEach-Object -Process { Copy-Item -Verbose $_.FullName -Destination "output" } - - - name: Store PowerShell SDK (.NET Framework 4.5) - uses: actions/upload-artifact@v4 - with: - name: SDK_Binaries_XenServerPowerShell_NET45 - path: output/**/* - build-powershell-7x-sdk: name: Build PowerShell 7.x SDK strategy: fail-fast: false matrix: - dotnet: ["6", "8"] + dotnet: ["8"] needs: build-csharp-sdk runs-on: windows-2022 + permissions: + contents: read steps: - name: Strip 'v' prefix from xapi version diff --git a/.github/workflows/hugo.yml b/.github/workflows/hugo.yml index 9b831b12ae7..6a0116389fd 100644 --- a/.github/workflows/hugo.yml +++ b/.github/workflows/hugo.yml @@ -1,5 +1,7 @@ name: Generate and upload Hugo docs +permissions: {} + on: push: branches: master @@ -8,6 +10,9 @@ jobs: ocaml: name: Docs runs-on: ubuntu-22.04 + permissions: + contents: read + steps: - name: Checkout code diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f8dcee80945..a22f85dc72f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,7 @@ name: Build and test +permissions: {} + on: # When only Hugo docs change, this workflow is not required: push: @@ -19,12 +21,30 @@ concurrency: # On new push, cancel old workflows from the same PR, branch or tag jobs: ocaml-tests: name: Run OCaml tests - runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + runs-on: ["ubuntu-22.04"] + ocaml-version: ["4.14.2"] + experimental: [false] + include: + - runs-on: "ubuntu-22.04-arm" + ocaml-version: "4.14.2" + experimental: true + - runs-on: "ubuntu-22.04" + ocaml-version: "5.3.0" + experimental: true + + continue-on-error: ${{ matrix.experimental }} + runs-on: ${{ matrix.runs-on }} + permissions: + contents: read env: # Ensure you also update test-sdk-builds # when changing this value, to keep builds # consistent XAPI_VERSION: "v0.0.0" + steps: - name: Checkout code uses: actions/checkout@v4 @@ -33,6 +53,7 @@ jobs: uses: ./.github/workflows/setup-xapi-environment with: xapi_version: ${{ env.XAPI_VERSION }} + ocaml_version: ${{ matrix.ocaml-version }} - name: Build run: opam exec -- make diff --git a/.github/workflows/other.yml b/.github/workflows/other.yml index 7cac6522c2c..0a94353560c 100644 --- a/.github/workflows/other.yml +++ b/.github/workflows/other.yml @@ -1,5 +1,7 @@ name: Build and test (other) +permissions: {} + on: # When only Hugo docs change, this workflow is not required: push: @@ -20,6 +22,10 @@ jobs: python-test: name: Python tests runs-on: ubuntu-22.04 + permissions: + contents: read + pull-requests: write # allow commenting on the PR + strategy: fail-fast: false matrix: @@ -29,7 +35,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 # To check which files changed: origin/master..HEAD - - uses: LizardByte/setup-python-action@master + - uses: actions/setup-python@v5 with: python-version: ${{matrix.python-version}} @@ -45,7 +51,7 @@ jobs: - uses: pre-commit/action@v3.0.1 name: Run pre-commit checks (no spaces at end of lines, etc) with: - extra_args: --all-files --verbose --hook-stage commit + extra_args: --all-files --verbose 
--hook-stage pre-commit env: SKIP: no-commit-to-branch @@ -56,6 +62,7 @@ jobs: files: .git/coverage${{matrix.python-version}}.xml flag-name: python${{matrix.python-version}} parallel: true + fail-on-error: false - uses: dciborow/action-pylint@0.1.0 with: @@ -89,12 +96,14 @@ jobs: - name: Finish the parallel coverage upload to Coveralls uses: coverallsapp/github-action@v2 with: + fail-on-error: false parallel-finished: true - continue-on-error: true # Do not fail CI if this step fails deprecation-test: name: Deprecation tests runs-on: ubuntu-22.04 + permissions: + contents: read steps: - name: Checkout code @@ -109,6 +118,8 @@ jobs: test-sdk-builds: name: Test SDK builds uses: ./.github/workflows/generate-and-build-sdks.yml + permissions: + contents: read with: # Ensure you also update ocaml-tests # when changing this value, to keep builds diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5c3f1cd5502..5dc14425102 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,5 +1,7 @@ name: Create release from tag +permissions: {} + on: push: tags: @@ -9,6 +11,8 @@ jobs: build-python: name: Build and upload Python artifacts runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Checkout code @@ -36,10 +40,15 @@ jobs: build-sdks: name: Build and upload SDK artifacts uses: ./.github/workflows/generate-and-build-sdks.yml + permissions: + contents: read with: xapi_version: ${{ github.ref_name }} release: + permissions: + contents: write # allow creating a release + name: "Create and package release" runs-on: ubuntu-latest needs: [build-python, build-sdks] @@ -74,16 +83,10 @@ jobs: name: SDK_Binaries_CSharp path: dist/ - - name: Retrieve PowerShell 5.x SDK distribution artifacts - uses: actions/download-artifact@v4 - with: - name: SDK_Binaries_XenServerPowerShell_NET45 - path: sdk_powershell_5x/ - - name: Retrieve PowerShell 7.x SDK distribution artifacts uses: actions/download-artifact@v4 with: - name: SDK_Binaries_XenServerPowerShell_NET6 + name: SDK_Binaries_XenServerPowerShell_NET8 path: sdk_powershell_7x/ - name: Package C SDK artifacts for deployment @@ -95,10 +98,6 @@ jobs: rm -rf libxenserver/usr/local/lib/ tar -zcvf libxenserver-prerelease.src.tar.gz -C ./libxenserver/usr/local . 
- - name: Zip PowerShell 5.x SDK artifacts for deployment - shell: bash - run: zip PowerShell-SDK-5.x-prerelease-unsigned.zip ./sdk_powershell_5x -r - - name: Zip PowerShell 7.x SDK artifacts for deployment shell: bash run: zip PowerShell-SDK-7.x-prerelease-unsigned.zip ./sdk_powershell_7x -r @@ -111,7 +110,6 @@ jobs: shell: bash run: | gh release create ${{ github.ref_name }} --repo ${{ github.repository }} --generate-notes dist/* \ - PowerShell-SDK-5.x-prerelease-unsigned.zip \ PowerShell-SDK-7.x-prerelease-unsigned.zip \ Go-SDK-prerelease-unsigned.zip \ libxenserver-prerelease.tar.gz libxenserver-prerelease.src.tar.gz @@ -124,6 +122,7 @@ jobs: needs: release environment: pypi permissions: + contents: read id-token: write steps: - name: Retrieve python distribution artifacts diff --git a/.github/workflows/setup-xapi-environment/action.yml b/.github/workflows/setup-xapi-environment/action.yml index 8381e31117b..aba9e881219 100644 --- a/.github/workflows/setup-xapi-environment/action.yml +++ b/.github/workflows/setup-xapi-environment/action.yml @@ -5,6 +5,9 @@ inputs: xapi_version: description: "XenAPI version, pass to configure as --xapi_version=" required: true + ocaml_version: + description: "OCaml compiler version" + required: true runs: using: "composite" steps: @@ -18,7 +21,7 @@ runs: shell: bash run: | mkdir -p /opt/xensource/sm - wget -O /opt/xensource/sm/XE_SR_ERRORCODES.xml https://raw.githubusercontent.com/xapi-project/sm/master/drivers/XE_SR_ERRORCODES.xml + wget -O /opt/xensource/sm/XE_SR_ERRORCODES.xml https://raw.githubusercontent.com/xapi-project/sm/master/libs/sm/core/XE_SR_ERRORCODES.xml - name: Load environment file id: dotenv @@ -52,7 +55,7 @@ runs: - name: Use ocaml uses: ocaml/setup-ocaml@v3 with: - ocaml-compiler: ${{ steps.dotenv.outputs.ocaml_version_full }} + ocaml-compiler: ${{ inputs.ocaml_version }} opam-repositories: | xs-opam: ${{ steps.dotenv.outputs.repository }} dune-cache: true diff --git a/.github/workflows/shellcheck.yaml b/.github/workflows/shellcheck.yaml index b078eaba549..f685b35d9f4 100644 --- a/.github/workflows/shellcheck.yaml +++ b/.github/workflows/shellcheck.yaml @@ -1,5 +1,7 @@ name: ShellCheck +permissions: {} + on: pull_request: merge_group: @@ -16,8 +18,11 @@ jobs: runs-on: ubuntu-latest permissions: + actions: read + contents: read + pull-requests: write # allow commenting on the PR security-events: write - + steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.gitignore b/.gitignore index 2c90d7261d3..93ad844074b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ _coverage/ *.install *.swp compile_flags.txt +_opam # tests xapi-db.xml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e8fb2f37e0e..008a4e13fb7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ ## For manually executing the pre-push hook: # pre-commit run -av --hook-stage pre-push # -default_stages: [commit, push] +default_stages: [pre-commit, pre-push] default_language_version: python: python3.11 repos: @@ -68,7 +68,7 @@ repos: entry: env PYTHONDEVMODE=yes sh -c 'coverage run && coverage xml && coverage html && coverage report && diff-cover --ignore-whitespace --compare-branch=origin/master - --show-uncovered --html-report .git/coverage-diff.html + --show-uncovered --format html:.git/coverage-diff.html --fail-under 50 .git/coverage3.11.xml' require_serial: true pass_filenames: false @@ -108,7 +108,7 @@ repos: hooks: - id: pylint files: python3/ - stages: [push] + stages: [pre-push] name: check that 
changes to python3 tree pass pylint entry: diff-quality --violations=pylint --ignore-whitespace --compare-branch=origin/master @@ -134,7 +134,7 @@ repos: entry: python3 pytype_reporter.py pass_filenames: false types: [python] - stages: [push] + stages: [pre-push] verbose: true # This hook runs locally only when Python files change: language: python diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000000..34b62707ea4 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,162 @@ +# Issues + +We welcome reports of technical issues with the components of the xen-api +toolstack. Please make sure that the description of the issue is as detailed as +possible to help anyone investigating it: + +1) Mention how it was detected, if and how it could be reproduced + +1) What's the desired behaviour? In what cases would it be useful? + +1) Include error messages, related logs if appropriate + +# Pull Requests + +To contribute changes to xen-api, please fork the repository on +GitHub, and then submit a pull request. + +It is required to add a `Signed-off-by:` as a +[Developers Certificate of Origin](http://developercertificate.org). +It certifies the patch's origin and is licensed under an +appropriate open-source licence to include it in Xapi: +https://git-scm.com/docs/git-commit#Documentation/git-commit.txt---signoff + +The following points are intended to describe what makes a contribution "good" - +easier to review, integrate, and maintain. Please follow them in your work. + +## Commit subjects and PR titles + +Commit subjects should preferrably start with the name of the component the +commit is most related to, and describe what the commit achieves. If your +commit only touches the `ocaml/xenopsd` directory, it should look like this, +for example: + +``` +xenopsd: Fix a deadlock during VM suspend +``` + +Similar principle applies to Pull Request titles. If there is only a single +commit in the PR, Github will automatically copy its subject and description to +the PR's title and body. If there are several commits in the PR, describe what +the PR achieves and the components it most directly impacts. + +If the commit subject includes some tracking identifier (such as `CP-1234`, for +example) referring to internal systems, please make sure to include all of the +essential information in the public descriptions - describe the symptoms of the +issue, how it was detected, investigated, how it could be reproduced, what are +the trade-offs and so on as appropriate. + +## Split into commits + +Following from the rules described above, if what the commit achieves is +difficult to fit into its subject, it is probably better to split it into +several commits, if possible. Note that every commit should build (`make` +should work and the CI should pass) independently, without requiring future +commits. This means some modifications can't really be split into several +commits (datamodel changes, in particular, require modifications to several +components at the same time), but makes it easier to revert part of the Pull +Request if some issues are detected in integration testing at a later point. + +## Good Commit Messages + +Commit messages (and the body of a Pull Request) should be as helpful and +descriptive as possible. If applicable, please include a description of current +behaviour, your changes, and the new behaviour. Justify the reasoning behind +your changes - are they sufficient on their own, or preparing for more changes? 
+Link any appropriate documentation, issues, or commits (avoiding internal and +publicly inaccessible sources) + +## CI + +Please make sure your Pull Request passes the Github CI. It will verify that +your code has been properly formatted (can be done locally with `make format`), +builds (`make` and `make check`), and passes the unit tests (`make test`). +The CI will run in the branches of your fork, so you can verify it passes +there before opening a Pull Request. + +## Testing + +Describe what kind of testing your contribution underwent. If the testing was +manual, please describe the commands or external clients that were used. If the +tests were automated, include at least a cursory description/name of the tests, +when they were regressed, if possible. + +Please note that any contribution to the code of the project will likely +require at least some testing to be done. Depending on how central the +component touched in your PR is to the system, the more things could only be +detected in real-world usecases through integration testing. + +If a commit has been determined to break integration testing at a later stage, +please note that the first and safest measure will almost always be reverting +the faulty commit. Making sure critical tests are passing remains a priority +over waiting for some commit to be reworked or refactored (which can be worked +on after a revert has been done). Though we are striving to make more tests +public (with failure then being visible to all), as long as some critical tests +remain private, this will also apply to such tests (with maintainers flagging +the breakage preferrably describing at least the gist of the test). + +If you are still waiting on some testing to be done, please mark the PR as a +"draft" and make the reasoning clear. + +If wider testing is needed (e.g. the change itself is believed to be correct +but may expose latent bugs in other components), lightweight feature flags can +also be used. E.g. an entry in `xapi_globs.ml` and `xapi.conf`, where the +feature/change is defaulted to `off`, to be turned on at a future time +(when e.g. more related PRs land, or it has passed some wider testing). + +If your contribution doesn't intend to have any functional changes, please make +that clear as well. + +## Feature work + +If your contribution adds some new feature or reworks some major aspect of the +system (as opposed to one-off fixes), it can be benefitial to first describe +the plan of your work in a design proposal. Architectural issues are better +spotted early on, and taking a big-picture view can often lead to new insights. + +An example of a design proposal is here: + +https://github.com/xapi-project/xen-api/pull/6387 + +If submitting a design first is not possible, include documentation alongside +with your PR describing the work, like it was done in the last three commits +here: + +https://github.com/xapi-project/xen-api/pull/6457 + +Note that the design will often serve as documentation as well - so take care +updating it after the implementation is done to better reflect reality. + +## Review process and merge + +It can often be useful to address review suggestions with a "fixup" commit +(created manually or with the help of `git commit --fixup=HASH`). This way it +is clear what the original code was and what your fix touches. Once the +fixup commit has been reviewed and the PR approved, please squash the fixup +commits with `git rebase --autosquash` before merging. 
Otherwise the commits in +the Pull Request should stay as independent commits - we do not require +squashing all the commits into a single one on merge. + +If the commit fixes a bug in an earlier, already merged PR then it might be +useful to mention that in the commit, if known. + +This can be done by adding this to your GIT configuration: + +``` +[pretty] + fixes = Fixes: %h (\"%s\") +``` + +And then running: + +``` +# git log -1 --pretty=fixes +Fixes: 1c581c074 ("xenopsd: Fix a deadlock during VM suspend") +``` + +This will print the commit title and hash in a nice format, which can then be +added to the footer of the commit message (alongside the sign-off). + +This is useful information to have if any of these commits get backported to +another release in the future, so that we also backport the bugfixes, not just +the buggy commits. diff --git a/Makefile b/Makefile index 7f7386bf6b1..a1d5a628f33 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ JOBS = $(shell getconf _NPROCESSORS_ONLN) PROFILE=release OPTMANDIR ?= $(OPTDIR)/man/man1/ -.PHONY: build clean test doc python format install uninstall coverage +.PHONY: build clean test doc python format install uninstall coverage analyze # if we have XAPI_VERSION set then set it in dune-project so we use that version number instead of the one obtained from git # this is typically used when we're not building from a git repo @@ -147,33 +147,34 @@ install-extra: DESTDIR=$(DESTDIR) SBINDIR=$(SBINDIR) QEMU_WRAPPER_DIR=$(QEMU_WRAPPER_DIR) XENOPSD_LIBEXECDIR=$(XENOPSD_LIBEXECDIR) ETCDIR=$(ETCDIR) ./ocaml/xenopsd/scripts/make-custom-xenopsd.conf # common flags and packages for 'dune install' and 'dune uninstall' -DUNE_IU_PACKAGES1=-j $(JOBS) --destdir=$(DESTDIR) --prefix=$(PREFIX) --libdir=$(LIBDIR) --mandir=$(MANDIR) +DUNE_IU_COMMON=-j $(JOBS) --destdir=$(DESTDIR) --libdir=$(LIBDIR) --mandir=$(MANDIR) +DUNE_IU_PACKAGES1=$(DUNE_IU_COMMON) --prefix=$(PREFIX) DUNE_IU_PACKAGES1+=--libexecdir=$(XENOPSD_LIBEXECDIR) --datadir=$(SDKDIR) DUNE_IU_PACKAGES1+=xapi-client xapi-schema xapi-consts xapi-cli-protocol xapi-datamodel xapi-types DUNE_IU_PACKAGES1+=xen-api-client xen-api-client-lwt rrdd-plugin rrd-transport DUNE_IU_PACKAGES1+=gzip http-lib pciutil sexpr stunnel uuid xml-light2 zstd xapi-compression safe-resources DUNE_IU_PACKAGES1+=message-switch message-switch-cli message-switch-core message-switch-lwt -DUNE_IU_PACKAGES1+=message-switch-unix xapi-idl forkexec xapi-forkexecd xapi-storage xapi-storage-script xapi-storage-cli +DUNE_IU_PACKAGES1+=message-switch-unix xapi-idl xapi-forkexecd xapi-storage xapi-storage-script xapi-storage-cli DUNE_IU_PACKAGES1+=xapi-nbd varstored-guard xapi-log xapi-open-uri xapi-tracing xapi-tracing-export xapi-expiry-alerts cohttp-posix -DUNE_IU_PACKAGES1+=xapi-rrd xapi-inventory clock xapi-sdk -DUNE_IU_PACKAGES1+=xapi-stdext-date xapi-stdext-encodings xapi-stdext-pervasives xapi-stdext-std xapi-stdext-threads xapi-stdext-unix xapi-stdext-zerocheck xapi-tools +DUNE_IU_PACKAGES1+=xapi-rrd xapi-inventory clock xapi-sdk tgroup +DUNE_IU_PACKAGES1+=xapi-stdext-encodings xapi-stdext-pervasives xapi-stdext-std xapi-stdext-threads xapi-stdext-unix xapi-stdext-zerocheck xapi-tools install-dune1: # dune can install libraries and several other files into the right locations dune install $(DUNE_IU_PACKAGES1) -DUNE_IU_PACKAGES2=-j $(JOBS) --destdir=$(DESTDIR) --prefix=$(OPTDIR) --libdir=$(LIBDIR) --mandir=$(MANDIR) --libexecdir=$(OPTDIR)/libexec --datadir=$(DOCDIR) xapi xe +DUNE_IU_PACKAGES2=$(DUNE_IU_COMMON) --prefix=$(OPTDIR) 
--libexecdir=$(OPTDIR)/libexec --datadir=$(DOCDIR) xapi xe install-dune2: dune install $(DUNE_IU_PACKAGES2) -DUNE_IU_PACKAGES3=-j $(JOBS) --destdir=$(DESTDIR) --prefix=$(OPTDIR) --libdir=$(LIBDIR) --mandir=$(MANDIR) --libexecdir=$(OPTDIR)/libexec --bindir=$(OPTDIR)/debug --datadir=$(OPTDIR)/debug xapi-debug +DUNE_IU_PACKAGES3=$(DUNE_IU_COMMON) --prefix=$(OPTDIR) --libexecdir=$(OPTDIR)/libexec --bindir=$(OPTDIR)/debug --datadir=$(OPTDIR)/debug xapi-debug install-dune3: dune install $(DUNE_IU_PACKAGES3) -DUNE_IU_PACKAGES4=-j $(JOBS) --destdir=$(DESTDIR) --prefix=$(PREFIX) --libdir=$(LIBDIR) --libexecdir=/usr/libexec --mandir=$(MANDIR) vhd-tool +DUNE_IU_PACKAGES4=$(DUNE_IU_COMMON) --prefix=$(PREFIX) --libexecdir=/usr/libexec vhd-tool forkexec qcow-stream-tool install-dune4: dune install $(DUNE_IU_PACKAGES4) @@ -186,7 +187,7 @@ install: chmod +x $(DESTDIR)$(DOCDIR)/doc-convert.sh # backward compat with existing specfile, to be removed after it is updated find $(DESTDIR) -name '*.cmxs' -delete - for pkg in xapi-debug xapi xe xapi-tools xapi-sdk vhd-tool; do for f in CHANGELOG LICENSE README.markdown; do rm $(DESTDIR)$(OPTDIR)/doc/$$pkg/$$f $(DESTDIR)$(PREFIX)/doc/$$pkg/$$f -f; done; for f in META dune-package opam; do rm $(DESTDIR)$(LIBDIR)/$$pkg/$$f -f; done; done; + for pkg in xapi-debug xapi xe xapi-tools xapi-sdk vhd-tool qcow-stream-tool; do for f in CHANGELOG LICENSE README.markdown; do rm $(DESTDIR)$(OPTDIR)/doc/$$pkg/$$f $(DESTDIR)$(PREFIX)/doc/$$pkg/$$f -f; done; for f in META dune-package opam; do rm $(DESTDIR)$(LIBDIR)/$$pkg/$$f -f; done; done; uninstall: @@ -196,6 +197,17 @@ uninstall: dune uninstall $(DUNE_IU_PACKAGES3) dune uninstall $(DUNE_IU_PACKAGES4) +# An approximation, we actually depend on all dune files recursively +# Also fixup the directory paths to remove _build +# (we must refer to paths that exist in the repository for static analysis results) +compile_commands.json: Makefile dune + mkdir -p _build/ + dune rules | dune-compiledb -o _build/ + sed -e 's/"directory".*/"directory": ".",/' <_build/$@ >$@ + +analyze: compile_commands.json Makefile .codechecker.json + CodeChecker check --config .codechecker.json -l compile_commands.json + compile_flags.txt: Makefile (ocamlc -config-var ocamlc_cflags;\ ocamlc -config-var ocamlc_cppflags;\ diff --git a/README.markdown b/README.markdown index 37174144a3e..9f795d85506 100644 --- a/README.markdown +++ b/README.markdown @@ -11,7 +11,7 @@ Xen API is written mostly in [OCaml](http://caml.inria.fr/ocaml/) 4.07. Xapi is the main component produced by the Linux Foundation's -[Xapi Project](http://xenproject.org/developers/teams/xapi.html). +[Xapi Project](https://xenproject.org/projects/xapi/). Build and Install ----------------- @@ -32,7 +32,7 @@ To build xen-api from source, we recommend using [opam](https://opam.ocaml.org/d - Run that line, e.g.: ```bash - export OCAML_VERSION_FULL="4.14.1" + export OCAML_VERSION_FULL="4.14.2" ``` 4) Setup opam with your environment (i.e. switch). @@ -99,18 +99,29 @@ git push origin --tags Contributions ------------- -To contribute patches to xen-api, please fork the repository on -Github, and then submit a pull request. If for some reason you can't -use Github to submit a pull request, then you may send your patch for -review to the [xen-api@lists.xenproject.org mailing list](http://www.xenproject.org/help/mailing-list.html), with a link to a -public git repository for review. 
We much prefer Github pull requests, -however, and submitting a patch to the mailing list will take much -more time for review. +To contribute changes to xen-api, please fork the repository on +GitHub, and then submit a pull request. -Maintainers +It is required to add a `Signed-off-by:` as a +[Developers Certificate of Origin](http://developercertificate.org). +It certifies the patch's origin and is licensed under an +appropriate open-source licence to include it in Xapi: +https://git-scm.com/docs/git-commit#Documentation/git-commit.txt---signoff + +For more detailed guidelines on what makes a good contribution, see +[CONTRIBUTING](./CONTRIBUTING.md). + +Discussions ----------- -Maintainers can be contacted via this mailing list: `xen-api@lists.xenproject.org` +Discussions can be started at +https://github.com/xapi-project/xen-api/discussions + +Issues +------ + +Issues can be raised at +https://github.com/xapi-project/xen-api/issues Licensing --------- diff --git a/doc/assets/css/misc.css b/doc/assets/css/misc.css index beb5a28e43a..dad61421838 100644 --- a/doc/assets/css/misc.css +++ b/doc/assets/css/misc.css @@ -47,10 +47,6 @@ } -.table-striped > tbody > tr:nth-child(odd) { - background-color: #f9f9f9; -} - .btn { display: inline-block; padding: 6px 12px; diff --git a/doc/assets/css/xenapi.css b/doc/assets/css/xenapi.css index d75b1b6d089..4ab6ff3ea16 100644 --- a/doc/assets/css/xenapi.css +++ b/doc/assets/css/xenapi.css @@ -42,6 +42,16 @@ th { text-align: left; .field, .field2 { margin: 0em 0; padding: .5em .7em .7em; + /** + * doc/layouts/partials/content.html generates tables with alternating + * field and field2 for the rows of the XenAPI Class Reference tables. + * Their background colours are hard-coded to bright colours here, but the + * colors are not adjusted for dark mode. We cannot use the theme colours + * in this case. Thus we have to hard-code the colours for now. Ergo, also + * hard-code the text colour to ensure that it has contrast in dark mode too. + * Only shades of grey are used, so the text colour is hard-coded to black. 
+ */ + color: black; background-color: #dddddd; cursor: pointer; font-size: 15px; @@ -113,3 +123,7 @@ th { text-align: left; margin: 0; vertical-align: middle; } + +div[id$='_details'] { + cursor: default; +} diff --git a/doc/assets/js/parse.js b/doc/assets/js/parse.js new file mode 100644 index 00000000000..9460aab1bf7 --- /dev/null +++ b/doc/assets/js/parse.js @@ -0,0 +1,146 @@ + +class Type {}; + +class Builtin extends Type { + constructor(name) { + super(); + this.name = name; + } + + static ofString(s) { + const concrete = ['string', 'bool', 'int', 'float', 'void', 'datetime']; + if (!concrete.includes(s)) + return null; + + return new Builtin(s); + } +}; + +class Enum extends Type { + constructor(name) { + super(); + this.name = name; + } +}; + +class Ctor extends Type { + constructor(params, name) { + super(); + this.params = params; + this.name = name; + } +}; + +function lex(str) { + if (str.indexOf('$') >= 0) + throw new Error('Not allowed to contain $'); + + let ts = str.replaceAll('(', ' ( '); + ts = ts.replaceAll(')', ' ) '); + ts = ts.split(' '); + ts = ts.filter(x => x !== ''); + ts.push('$'); + return ts; +} + +class Lexer { + constructor(tokens) { + this.tokens = tokens; + this.pos = 0; + } + + shift() { + if (this.pos >= this.tokens.length - 1) + return '$'; + + return this.tokens[this.pos++]; + } + + peek() { + const prev = this.pos; + let t = this.shift(); + this.pos = prev; + return t; + } + + expect(ts) { + if (!Array.isArray(ts)) + ts = [ts]; + + let l = this.shift(); + for (const t of ts) + if (l == t) return; + + throw new Error(`Expected ${t}, got ${l}`); + } +}; + +function lbp(t) { + switch (t) { + case '(': + case ')': + case '->': + case '\u2192': + return 0; + case '$': + return -1; + } + + return 1; +} + +function nud(l, t) { + switch (t) { + case 'enum': + return new Enum(l.shift()); + + case '(': + let left = parseType(l, 0); + l.expect(['->', '\u2192']); + let right = parseType(l, 0); + l.expect(')'); + l.expect('map'); + return new Ctor([left, right], 'map'); + } + + let bty = Builtin.ofString(t); + if (bty != null) + return bty; + + const fmt = /^[a-zA-Z_]+$/; + if (fmt.test(t)) + return new Ctor([], t); + + throw new Error(`No null denotation for ${t}`); +} + +function led(l, left, t) { + const known = ['set', 'ref', 'option', 'record']; + if (!known.includes(t)) + throw new Error(`Invalid type constructor: ${t}`); + + return new Ctor([left], t); +} + +function parseType(l, rbp) { + let left = nud(l, l.shift()); + + while (lbp(l.peek()) > rbp) + left = led(l, left, l.shift()); + + return left; +} + +function parseSingleType(input) { + try { + let lexer = new Lexer(lex(input)); + let ty = parseType(lexer, 0); + if (lexer.peek() != '$') + throw new Error('Did not consume entire input'); + return ty; + } catch (e) { + } + + return null; +} + diff --git a/doc/content/design/add-qcow-tool-for-vdi-import-export.md b/doc/content/design/add-qcow-tool-for-vdi-import-export.md new file mode 100644 index 00000000000..127369e3db5 --- /dev/null +++ b/doc/content/design/add-qcow-tool-for-vdi-import-export.md @@ -0,0 +1,121 @@ +--- +title: Add qcow tool to allow VDI import/export +layout: default +design_doc: true +revision: 1 +status: proposed +--- + +# Introduction + +At XCP-ng, we are working on overcoming the 2TiB limitation for VM disks while +preserving essential features such as snapshots, copy-on-write capabilities, and +live migration. + +To achieve this, we are introducing Qcow2 support in SMAPI and the blktap driver. 
+With the alpha release, we can: + - Create a VDI + - Snapshot it + - Export and import it to/from XVA + - Perform full backups + +However, we currently cannot export a VDI to a Qcow2 file, nor import one. + +The purpose of this design proposal is to outline a solution for implementing VDI +import/export in Qcow2 format. + +# Design Proposal + +The import and export of VHD-based VDIs currently rely on *vhd-tool*, which is +responsible for streaming data between a VDI and a file. It supports both Raw and +VHD formats, but not Qcow2. + +There is an existing tool called [qcow-tool](https://opam.ocaml.org/packages/qcow-tool/) +originally packaged by MirageOS. It is no longer actively maintained, but it can +produce Qcow files readable by QEMU. + +Currently, *qcow-tool* does not support streaming, but we propose to add this +capability. This means replicating the approach used in *vhd-tool*, where data is +pushed to a socket. + +We have contacted the original developer, David Scott, and there are no objections +to us maintaining the tool if needed. + +Therefore, the most appropriate way to enable Qcow2 import/export in XAPI is to +add streaming support to `qcow-tool`. + +# XenAPI changes + +## The workflow + +- The export and import of VDIs are handled by the XAPI HTTP server: + - `GET /export_raw_vdi` + - `PUT /import_raw_vdi` +- The corresponding handlers are `Export_raw_vdi.handler` and + `Import_raw_vdi.handler`. +- Since the format is checked in the handler, we need to add support for `Qcow2`, + as currently only `Raw`, `Tar`, and `Vhd` are supported. +- This requires adding a new type in the `Importexport.Format` module and a new + content type: `application/x-qemu-disk`. + See [mime-types format](https://www.digipres.org/formats/mime-types/#application/x-qemu-disk). +- This allows the format to be properly decoded. Currently, all formats use a + wrapper called `Vhd_tool_wrapper`, which sets up parameters for `vhd-tool`. + We need to add a new wrapper for the Qcow2 format, which will instead use + `qcow-tool`, a tool that we will package (see the section below). +- The new wrapper will be responsible for setting up parameters (source, + destination, etc.). Since it only manages Qcow2 files, we don’t need to pass + additional format information. +- The format (`qcow2`) will be specified in the URI. For example: + - `/import_raw_vdi?session_id=&task_id=&vdi=&format=qcow2` + +## Adding and modifying qcow-tool + +- We need to package [qcow-tool](https://opam.ocaml.org/packages/qcow-tool). +- This new tool will be called from `ocaml/xapi/qcow_tool_wrapper.ml`, as + described in the previous section. + +- To export a VDI to a Qcow2 file, we need to add functionality similar to + `Vhd_tool_wrapper.send`, which calls `vhd-tool stream`. + - It writes data from the source to a destination. Unlike `vhd-tool`, which + supports multiple destinations, we will only support Qcow2 files. + - Here is a typicall call to `vhd-tool stream` +```sh +/bin/vhd-tool stream \ + --source-protocol none \ + --source-format hybrid \ + --source /dev/sm/backend/ff1b27b1-3c35-972e-76ec-a56fe9f25e36/87711319-2b05-41a3-8ee0-3b63a2fc7035:/dev/VG_XenStorage-ff1b27b1-3c35-972e-76ec-a56fe9f25e36/VHD-87711319-2b05-41a3-8ee0-3b63a2fc7035 \ + --destination-protocol none \ + --destination-format vhd \ + --destination-fd 2585f988-7374-8131-5b66-77bbc239cbb2 \ + --tar-filename-prefix \ + --progress \ + --machine \ + --direct \ + --path /dev/mapper:. 
+``` + +- To import a VDI from a Qcow2 file, we need to implement functionality similar + to `Vhd_tool_wrapper.receive`, which calls `vhd-tool serve`. + - This is the reverse of the export process. As with export, we will only + support a single type of import: from a Qcow2 file. + - Here is a typical call to `vhd-tool serve` +```sh +/bin/vhd-tool serve \ + --source-format raw \ + --source-protocol none \ + --source-fd 3451d7ed-9078-8b01-95bf-293d3bc53e7a \ + --tar-filename-prefix \ + --destination file:///dev/sm/backend/f939be89-5b9f-c7c7-e1e8-30c419ee5de6/4868ac1d-8321-4826-b058-952d37a29b82 \ + --destination-format raw \ + --progress \ + --machine \ + --direct \ + --destination-size 180405760 \ + --prezeroed +``` + +- We don't need to propose different protocol and different format. As we will +not support different formats we just to handle data copy from socket into file +and from file to socket. Sockets and files will be managed into the +`qcow_tool_wrapper`. The `forkhelpers.ml` manages the list of file descriptors +and we will mimic what the vhd tool wrapper does to link a UUID to socket. diff --git a/doc/content/design/numa.md b/doc/content/design/numa.md new file mode 100644 index 00000000000..fa1917b3c57 --- /dev/null +++ b/doc/content/design/numa.md @@ -0,0 +1,142 @@ +--- +title: NUMA +layout: default +design_doc: true +revision: 1 +status: proposed +--- + +# NUMA + +NUMA stands for Non-Uniform Memory Access and describes that RAM access +for CPUs in a large system is not equally fast for all of them. CPUs +are grouped into so-called nodes and each node has fast access to RAM +that is considered local to its node and slower access to other RAM. +Conceptually, a node is a container that bundles some CPUs and RAM and +there is an associated cost when accessing RAM in a different node. In +the context of CPU virtualisation assigning vCPUs to NUMA nodes is an +optimisation strategy to reduce memory latency. This document describes +a design to make NUMA-related assignments for Xen domains (hence, VMs) +visible to the user. Below we refer to these assignments and +optimisations collectively as NUMA for simplicity. + +NUMA is more generally discussed as +[NUMA Feature](../toolstack/features/NUMA/index.md). + + +## NUMA Properties + +Xen 4.20 implements NUMA optimisation. We want to expose the following +NUMA-related properties of VMs to API clients, and in particualar +XenCenter. Each one is represented by a new field in XAPI's `VM_metrics` +data model: + +* RO `VM_metrics.numa_optimised`: boolean: if the VM is + optimised for NUMA +* RO `VM_metrics.numa_nodes`: integer: number of NUMA nodes of the host + the VM is using +* MRO `VM_metrics.numa_node_memory`: int -> int map; mapping a NUMA node + (int) to an amount of memory (bytes) in that node. + +Required NUMA support is only available in Xen 4.20. Some parts of the +code will have to be managed by patches. + +## XAPI High-Level Implementation + +As far as Xapi clients are concerned, we implement new fields in the +`VM_metrics` class of the data model and surface the values in the CLI +via `records.ml`; we could decide to make `numa_optimised` visible by +default in `xe vm-list`. + +Introducing new fields requires defaults; these would be: + +* `numa_optimised`: false +* `numa_nodes`: 0 +* `numa_node_memory`: [] + +The data model ensures that the values are visible to API clients. 
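+
+For illustration, once these fields land an API client could read them through
+the usual `VM_metrics` accessors. The sketch below is indicative only: the
+field names are the proposed ones above, and the host URL and credentials are
+placeholders.
+
+```python
+import XenAPI
+
+# Connect to a host and log in (placeholder host and credentials).
+session = XenAPI.Session("https://host.example.com")
+session.xenapi.login_with_password("root", "password", "1.0", "numa-example")
+try:
+    vm = session.xenapi.VM.get_by_name_label("my-vm")[0]
+    metrics = session.xenapi.VM.get_metrics(vm)
+    record = session.xenapi.VM_metrics.get_record(metrics)
+    # Proposed fields; they hold the defaults (false / 0 / {}) until Xenopsd
+    # starts reporting NUMA information for the domain.
+    print(record["numa_optimised"])
+    print(record["numa_nodes"])
+    print(record["numa_node_memory"])  # NUMA node -> bytes of memory in it
+finally:
+    session.xenapi.session.logout()
+```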
+ +## XAPI Low-Level Implementation + +NUMA properties are observed by Xenopsd and Xapi learns about them as +part of the `Client.VM.stat` call implemented by Xenopsd. Xapi makes +these calls frequently and we will update the Xapi VM fields related to +NUMA simply as part of processing the result of such a call in Xapi. + +For this to work, we extend the return type of `VM.stat` in + +* `xenops_types.ml`, type `Vm.state` + +with three fields: + +* `numa_optimised: bool` +* `numa_nodes: int` +* `numa_node_memory: (int, int64) list` + +matching the semantics from above. + +## Xenopsd Implementation + +Xenopsd implements the `VM.stat` return value in + +* `Xenops_server_sen.get_state` + +where the three fields would be set. Xenopsds relies on bindings to Xen to +observe NUMA-related properties of a domain. + +Given that NUMA related functionality is only available for Xen 4.20, we +probably will have to maintain a patch in xapi.spec for compatibility +with earlier Xen versions. + +The (existing) C bindings and changes come in two forms: new functions +and an extension of a type used by and existing function. + +```ocaml + external domain_get_numa_info_node_pages_size : handle -> int -> int + = "stub_xc_domain_get_numa_info_node_pages_size" +``` + +Thia function reports the number of NUMA nodes used by a Xen domain +(supplied as an argument) + +```ocaml + type domain_numainfo_node_pages = { + tot_pages_per_node : int64 array; + } + external domain_get_numa_info_node_pages : + handle -> int -> int -> domain_numainfo_node_pages + = "stub_xc_domain_get_numa_info_node_pages" +``` + +This function receives as arguments a domain ID and the number of nodes +this domain is using (acquired using `domain_get_numa_info_node_pages`) + +The number of NUMA nodes of the host (not domain) is reported by +`Xenctrl.physinfo` which returns a value of type `physinfo`. + +```diff + index b4579862ff..491bd3fc73 100644 + --- a/tools/ocaml/libs/xc/xenctrl.ml + +++ b/tools/ocaml/libs/xc/xenctrl.ml + @@ -155,6 +155,7 @@ type physinfo = + capabilities : physinfo_cap_flag list; + max_nr_cpus : int; + arch_capabilities : arch_physinfo_cap_flags; + + nr_nodes : int; + } +``` + +We are not reporting `nr_nodes` directly but use it to determine the +value of `numa_optimised` for a domain/VM: + + numa_optimised = + (VM.numa_nodes = 1) + or (VM.numa_nodes < physinfo.Xenctrl.nr_nodes) + +### Details + +The three new fields that become part of type `VM.state` are updated as +part of `get_state()` using the primitives above. + + + diff --git a/doc/content/design/sm-supported-image-formats.md b/doc/content/design/sm-supported-image-formats.md new file mode 100644 index 00000000000..3d860c2833f --- /dev/null +++ b/doc/content/design/sm-supported-image-formats.md @@ -0,0 +1,166 @@ +--- +title: Add supported image formats in sm-list +layout: default +design_doc: true +revision: 3 +status: proposed +--- + +# Introduction + +At XCP-ng, we are enhancing support for QCOW2 images in SMAPI. The primary +motivation for this change is to overcome the 2TB size limitation imposed +by the VHD format. By adding support for QCOW2, a Storage Repository (SR) will +be able to host disks in VHD and/or QCOW2 formats, depending on the SR type. +In the future, additional formats—such as VHDx—could also be supported. + +We need a mechanism to expose to end users which image formats are supported +by a given SR. The proposal is to extend the SM API object with a new field +that clients (such as XenCenter, XenOrchestra, etc.) 
can use to determine the +available formats. + +# Design Proposal + +To expose the available image formats to clients (e.g., XenCenter, XenOrchestra, etc.), +we propose adding a new field called `supported_image_formats` to the Storage Manager +(SM) module. This field will be included in the output of the `SM.get_all_records` call. + +- With this new information, listing all parameters of the SM object will return: + +```bash +# xe sm-list params=all +``` + +Output of the command will look like (notice that CLI uses hyphens): + +``` +uuid ( RO) : c6ae9a43-fff6-e482-42a9-8c3f8c533e36 +name-label ( RO) : Local EXT3 VHD +name-description ( RO) : SR plugin representing disks as VHD files stored on a local EXT3 filesystem, created inside an LVM volume +type ( RO) : ext +vendor ( RO) : Citrix Systems Inc +copyright ( RO) : (C) 2008 Citrix Systems Inc +required-api-version ( RO) : 1.0 +capabilities ( RO) [DEPRECATED] : SR_PROBE; SR_SUPPORTS_LOCAL_CACHING; SR_UPDATE; THIN_PROVISIONING; VDI_ACTIVATE; VDI_ATTACH; VDI_CLONE; VDI_CONFIG_CBT; VDI_CREATE; VDI_DEACTIVATE; VDI_DELETE; VDI_DETACH; VDI_GENERATE_CONFIG; VDI_MIRROR; VDI_READ_CACHING; VDI_RESET_ON_BOOT; VDI_RESIZE; VDI_SNAPSHOT; VDI_UPDATE +features (MRO) : SR_PROBE: 1; SR_SUPPORTS_LOCAL_CACHING: 1; SR_UPDATE: 1; THIN_PROVISIONING: 1; VDI_ACTIVATE: 1; VDI_ATTACH: 1; VDI_CLONE: 1; VDI_CONFIG_CBT: 1; VDI_CREATE: 1; VDI_DEACTIVATE: 1; VDI_DELETE: 1; VDI_DETACH: 1; VDI_GENERATE_CONFIG: 1; VDI_MIRROR: 1; VDI_READ_CACHING: 1; VDI_RESET_ON_BOOT: 2; VDI_RESIZE: 1; VDI_SNAPSHOT: 1; VDI_UPDATE: 1 +configuration ( RO) : device: local device path (required) (e.g. /dev/sda3) +driver-filename ( RO) : /opt/xensource/sm/EXTSR +required-cluster-stack ( RO) : +supported-image-formats ( RO) : vhd, raw, qcow2 +``` + +## Implementation details + +The `supported_image_formats` field will be populated by retrieving information +from the SMAPI drivers. Specifically, each driver will update its `DRIVER_INFO` +dictionary with a new key, `supported_image_formats`, which will contain a list +of strings representing the supported image formats +(for example: `["vhd", "raw", "qcow2"]`). Although the formats are listed as a +list of strings, they are treated as a set-specifying the same format multiple +times has no effect. + +### Driver behavior without `supported_image_formats` + +If a driver does not provide this information (as is currently the case with +existing drivers), the default value will be an empty list. This signifies +that the driver determines which format to use when creating VDI. During a migration, +the destination driver will choose the format of the VDI if none is explicitly +specified. This ensures backward compatibility with both current and future drivers. + +### Specifying image formats for VDIs creation + +If the supported image format is exposed to the client, then, when creating new VDI, +user can specify the desired format via the `sm_config` parameter `image-format=qcow2` (or +any format that is supported). If no format is specified, the driver will use its +preferred default format. If the specified format is not supported, an error will be +generated indicating that the SR does not support it. 
Here is how it can be achieved +using the XE CLI: + +```bash +# xe vdi-create \ + sr-uuid=cbe2851e-9f9b-f310-9bca-254c1cf3edd8 \ + name-label="A new VDI" \ + virtual-size=10240 \ + sm-config:image-format=vhd +``` + +### Specifying image formats for VDIs migration + +When migrating a VDI, an API client may need to specify the desired image format if +the destination SR supports multiple storage formats. + +#### VDI pool migrate + +To support this, a new parameter, `dest_img_format`, is introduced to +`VDI.pool_migrate`. This field accepts a string specifying the desired format (e.g., *qcow2*), +ensuring that the VDI is migrated in the correct format. The new signature of +`VDI.pool_migrate` will be +`VDI ref pool_migrate (session ref, VDI ref, SR ref, string, (string -> string) map)`. + +If the specified format is not supported or cannot be used (e.g., due to size limitations), +an error will be generated. Validation will be performed as early as possible to prevent +disruptions during migration. These checks can be performed by examining the XAPI database +to determine whether the SR provided as the destination has a corresponding SM object with +the expected format. If this is not the case, a `format not found` error will be returned. +If no format is specified by the client, the destination driver will determine the appropriate +format. + +```bash +# xe vdi-pool-migrate \ + uuid= \ + sr-uuid= \ + dest-img-format=qcow2 +``` + +#### VM migration to remote host + +A VDI migration can also occur during a VM migration. In this case, we need to +be able to specify the expected destination format as well. Unlike `VDI.pool_migrate`, +which applies to a single VDI, VM migration may involve multiple VDIs. +The current signature of `VM.migrate_send` is `(session ref, VM ref, (string -> string) map, +bool, (VDI ref -> SR ref) map, (VIF ref -> network ref) map, (string -> string) map, +(VGPU ref -> GPU_group ref) map)`. Thus there is already a parameter that maps each source +VDI to its destination SR. We propose to add a new parameter that allows specifying the +desired destination format for a given source VDI: `(VDI ref -> string)`. It is +similar to the VDI-to-SR mapping. We will update the XE cli to support this new format. +It would be `image_format:=`: + +```bash +# xe vm-migrate \ + host-uuid= \ + remote-master= \ + remote-password= \ + remote-username= \ + vdi:= \ + vdi:= \ + image-format:=vhd \ + image-format:=qcow2 \ + uuid= +``` +The destination image format would be a string such as *vhd*, *qcow2*, or another +supported format. It is optional to specify a format. If omitted, the driver +managing the destination SR will determine the appropriate format. +As with VDI pool migration, if this parameter is not supported by the SM driver, +a `format not found` error will be returned. The validation must happen before +sending a creation message to the SM driver, ideally at the same time as checking +whether all VDIs can be migrated. + +To be able to check the format, we will need to modify `VM.assert_can_migrate` and +add the mapping from VDI references to their image formats, as is done in `VM.migrate_send`. + +# Impact + +It should have no impact on existing storage repositories that do not provide any information +about the supported image format. + +This change impacts the SM data model, and as such, the XAPI database version will +be incremented. It also impacts the API. + +- **Data Model:** + - A new field (`supported_image_formats`) is added to the SM records. 
+ - A new parameter is added to `VM.migrate_send`: `(VDI ref -> string) map` + - A new parameter is added to `VM.assert_can_migrate`: `(VDI ref -> string) map` + - A new parameter is added to `VDI.pool_migrate`: `string` +- **Client Awareness:** Clients like the `xe` CLI will now be able to query and display the supported image formats for a given SR. +- **Database Versioning:** The XAPI database version will be updated to reflect this change. + diff --git a/doc/content/design/snapshot-revert.md b/doc/content/design/snapshot-revert.md index 4618e1ee9ce..e0144039535 100644 --- a/doc/content/design/snapshot-revert.md +++ b/doc/content/design/snapshot-revert.md @@ -1,62 +1,100 @@ --- -title: Improving snapshot revert behaviour +title: Better VM revert layout: default design_doc: true -revision: 1 +revision: 2 status: confirmed --- -Currently there is a XenAPI `VM.revert` which reverts a "VM" to the state it -was in when a VM-level snapshot was taken. There is no `VDI.revert` so -`VM.revert` uses `VDI.clone` to change the state of the disks. +## Overview -The use of `VDI.clone` has the side-effect of changing VDI refs and uuids. -This causes the following problems: +XenAPI allows users to roll back the state of a VM to a previous state, which is +stored in a snapshot, using the call `VM.revert`. Because there is no +`VDI.revert` call, `VM.revert` uses `VDI.clone` on the snapshot to duplicate +the contents of that disk and then use the new clone as the storage for the VM. -- It is difficult for clients - such as [Apache CloudStack](http://cloudstack.apache.org) to keep track - of the disks it is actively managing -- VDI snapshot metadata (`VDI.snapshot_of` et al) has to be carefully - fixed up since all the old refs are now dangling +Because `VDI.clone` creates new VDI refs and uuids, some problematic +behaviours arise: -We will fix these problems by: +- Clients such as + [Apache CloudStack](http://cloudstack.apache.org) need to include complex + logic to keep track of the disks they are actively managing +- Because the snapshot is cloned and the original VDI is deleted, + references to the original VDI, such as `VDI.snapshot_of`, become invalid. This means + that the database has to be combed through to change these references. + Because the database doesn't support transactions, this operation is not atomic + and can produce inconsistent database states. -1. adding a `VDI.revert` to the SMAPIv2 and calling this from `VM.revert` -2. defining a new SMAPIv1 operation `vdi_revert` and a corresponding capability - `VDI_REVERT` -3. the Xapi implementation of `VDI.revert` will first try the `vdi_revert`, - and fall back to `VDI.clone` if that fails -4. implement `vdi_revert` for common storage types, including File and LVM-based - SRs. +Additionally, some filesystems support snapshots natively; for these, doing the clone +procedure is much costlier than allowing the filesystem to do the revert. -XenAPI changes -------------- +We will fix these problems by: -We will add the function `VDI.revert` with arguments: +- introducing the new feature `VDI_REVERT` in the SM interface (`xapi_smint`).
This + allows backends to advertise that they support the new functionality +- defining a new storage operation `VDI.revert` in storage_interface, which is + gated by the feature `VDI_REVERT` +- proxying the storage operation to SMAPIv3 and SMAPIv1 backends accordingly +- adding `VDI.revert` to xapi_vdi which will call the storage operation if the + backend advertises it, and fall back to the previous method that uses + `VDI.clone` if it doesn't advertise it, or issues are detected at runtime + that prevent it +- changing the Xapi implementation of `VM.revert` to use `VDI.revert` +- implementing `vdi_revert` for common storage types, including File and LVM-based + SRs +- adding unit and quick tests to xapi to test that `VM.revert` does not regress + +## Current VM.revert behaviour + +The code that reverts the state of storage is located in +[update_vifs_vbds_vgpus_and_vusbs](https://github.com/xapi-project/xen-api/blob/bc0ba4e9dc8dc4b85b7cbdbf3e0ba5915b4ad76d/ocaml/xapi/xapi_vm_snapshot.ml#L211). +The steps it performs are: +1. destroys the VM's VBDs (both disks and CDs) +2. destroys the VM's VDIs (disks only) referenced by the snapshot's VDIs through + `snapshot_of`, as well as the suspend VDI. +3. clones the snapshot's VDIs (disks and CDs); if one clone fails, none remain. +4. searches the database for all `snapshot_of` references to the deleted VDIs + and replaces them with references to the newly cloned snapshots. +5. clones the snapshot's resume VDI +6. creates copies of all the cloned VBDs and associates them with the cloned VDIs +7. assigns the new resume VDI to the VM + +## XenAPI design + +### API + +The function `VDI.revert` will be added, with arguments: - in: `snapshot: Ref(VDI)`: the snapshot to which we want to revert - in: `driver_params: Map(String,String)`: optional extra parameters -- out: `Ref(VDI)` the new VDI +- out: `Ref(VDI)` reference to the new VDI with the reverted contents + +The function will extract the reference of the VDI whose contents need to be +replaced (the snapshot's `snapshot_of` field), then it will call the +storage function `VDI.revert` to have its contents replaced with the +snapshot's. The VDI object will not be modified, and the reference returned is +the VDI's original reference. +If anything impedes the successful completion of an in-place revert, such as the SM +backend not advertising the feature `VDI_REVERT`, not implementing the +feature, or the `snapshot_of` reference being invalid, an exception will be +raised. -The function will look up the VDI which this is a `snapshot_of`, and change -the VDI to have the same contents as the snapshot. The snapshot will not be -modified.
If the implementation is able to revert in-place, then the reference -returned will be the VDI this is a `snapshot_of`; otherwise it is a reference -to a fresh VDI (created by the `VDI.clone` fallback path) +### Xapi Storage -References: +The function `VDI.revert` is added, with the following arguments: -- @johnelse's [pull request](https://github.com/xapi-project/xen-api/pull/1963) - which implements this +- in: `dbg`: the task identifier, useful for tracing +- in: `sr`: SR where the new VDI must be created +- in: `snapshot_info`: metadata of the snapshot, the contents of which must be + made available in the VDI indicated by the `snapshot_of` field -SMAPIv1 changes ---------------- +#### SMAPIv1 -We will define the function `vdi_revert` with arguments: +The function `vdi_revert` is defined with the following arguments: - in: `sr_uuid`: the UUID of the SR containing both the VDI and the snapshot -- in: `vdi_uuid`: the UUID of the snapshot whose contents should be duplicated -- in: `target_uuid`: the UUID of the target whose contents should be replaced +- in: `vdi_uuid`: the UUID of the snapshot whose contents must be duplicated +- in: `target_uuid`: the UUID of the target whose contents must be replaced The function will replace the contents of the `target_uuid` VDI with the contents of the `vdi_uuid` VDI without changing the identify of the target @@ -64,22 +102,27 @@ contents of the `vdi_uuid` VDI without changing the identify of the target The `vdi_uuid` is preserved by this operation. The operation is obvoiusly idempotent. -Xapi changes ------------- +#### SMAPIv3 -Xapi will +In an analogous way to SMAPIv1, the function `Volume.revert` is defined with the +following arguments: -- use `VDI.revert` in the `VM.revert` code-path -- expose a new `xe vdi-revert` CLI command -- implement the `VDI.revert` by calling the SMAPIv1 function and falling back - to `VDI.clone` if a `Not_implemented` exception is thrown +- in: `dbg`: the task identifier, useful for tracing +- in: `sr`: the UUID of the SR containing both the VDI and the snapshot +- in: `snapshot`: the UUID of the snapshot whose contents must be duplicated +- in: `vdi`: the UUID of the VDI whose contents must be replaced -References: +### Xapi -- @johnelse's [pull request](https://github.com/xapi-project/xen-api/pull/1963) +- add the capability `VDI_REVERT` so backends can advertise it +- use `VDI.revert` in the `VM.revert` after the VDIs have been destroyed, and + before the snapshot's VDIs have been cloned. 
If any of the reverts fail + because a `Not_implemented` exception is thrown, or the `snapshot_of` + contains an invalid reference, add the affected VDIs to the list to be cloned + and recovered, using the existing method +- expose a new `xe vdi-revert` CLI command -SM changes ---------- +## SM changes We will modify @@ -92,8 +135,7 @@ We will modify snapshot/clone machinery - LVHDoISCSISR.py and LVHDoHBASR.py to advertise the `VDI_REVERT` capability -Prototype code -============== +# Prototype code from the previous proposal Prototype code exists here: diff --git a/doc/content/lib/_index.md b/doc/content/lib/_index.md new file mode 100644 index 00000000000..a0592427b0b --- /dev/null +++ b/doc/content/lib/_index.md @@ -0,0 +1,5 @@ +--- +title: Libraries +hidden: true +--- +{{% children description=true %}} \ No newline at end of file diff --git a/doc/content/lib/xenctrl/_index.md b/doc/content/lib/xenctrl/_index.md new file mode 100644 index 00000000000..d38c927b83f --- /dev/null +++ b/doc/content/lib/xenctrl/_index.md @@ -0,0 +1,5 @@ +--- +title: libxenctrl +description: Xen Control library for controlling the Xen hypervisor +--- +{{% children description=true %}} \ No newline at end of file diff --git a/doc/content/lib/xenctrl/xc_domain_claim_pages.md b/doc/content/lib/xenctrl/xc_domain_claim_pages.md new file mode 100644 index 00000000000..7f72f01342c --- /dev/null +++ b/doc/content/lib/xenctrl/xc_domain_claim_pages.md @@ -0,0 +1,157 @@ +--- +title: xc_domain_claim_pages() +description: Stake a claim for further memory for a domain, and release it too. +--- + +## Purpose + +The purpose of `xc_domain_claim_pages()` is to attempt to +stake a claim on an amount of memory for a given domain which guarantees that +memory allocations for the claimed amount will be successful. + +The domain can still attempt to allocate beyond the claim, but those are not +guaranteed to be successful and will fail if the domain's memory reaches its +`max_mem` value. + +Each domain can only have one claim, and the domid is the key of the claim. +By killing the domain, the claim is also released. + +Depending on the given size argument, the remaining stake of the domain +can be set initially, updated to the given amount, or reset to no claim (0). + +## Management of claims + +- The stake is centrally managed by the Xen hypervisor using a + [Hypercall](https://wiki.xenproject.org/wiki/Hypercall). +- Claims are not reflected in the amount of free memory reported by Xen. + +## Reporting of claims + +- `xl claims` reports the outstanding claims of the domains: + > [!info] Sample output of `xl claims`: + > ```js + > Name ID Mem VCPUs State Time(s) Claimed + > Domain-0 0 2656 8 r----- 957418.2 0 + > ``` +- `xl info` reports the host-wide outstanding claims: + > [!info] Sample output from `xl info | grep outstanding`: + > ```js + > outstanding_claims : 0 + > ``` + +## Tracking of claims + +Xen only tracks: +- the outstanding claims of each domain and +- the outstanding host-wide claims. + +Claiming zero pages effectively cancels the domain's outstanding claim +and is always successful. + +> [!info] +> - Allocations for outstanding claims are expected to always be successful. +> - But this reduces the amount of outstanding claims of the domain. +> - Freeing memory of the domain increases the domain's claim again: +> - But, when a domain consumes its claim, it is reset. +> - When the claim is reset, freed memory is no longer moved to the outstanding claims!
+> - It would have to get a new claim on memory to have spare memory again. + +> [!warning] The domain's `max_mem` value is used to deny memory allocation +> If an allocation would cause the domain to exceed its `max_mem` +> value, it will always fail. + + +## Implementation + +Function signature of the libXenCtrl function to call the Xen hypercall: + +```c +long xc_memory_op(libxc_handle, XENMEM_claim_pages, struct xen_memory_reservation *) +``` + +`struct xen_memory_reservation` is populated as follows for this hypercall: + +```c +struct xen_memory_reservation reservation = { + .nr_extents = nr_pages, /* number of pages to claim */ + .extent_order = 0, /* an order 0 means: 4k pages, only 0 is allowed */ + .mem_flags = 0, /* no flags, only 0 is allowed (at the moment) */ + .domid = domid /* numerical domain ID of the domain */ +}; +``` + +### Concurrency + +Xen protects the consistency of the stake of the domain +using the domain's `page_alloc_lock` and the global `heap_lock` of Xen. +These spin-locks prevent any "time-of-check-time-of-use" races. +As the hypercall needs to take those spin-locks, it cannot be preempted. + +### Return value + +The call returns 0 if the hypercall successfully claimed the requested amount +of memory, else it returns non-zero. + +## Current users + +### libxl and the xl CLI + +If the `struct xc_dom_image` passed by `libxl` to the +[libxenguest](https://github.com/xen-project/xen/tree/master/tools/libs/guest) +functions +[meminit_hvm()](https://github.com/xen-project/xen/blob/de0254b9/tools/libs/guest/xg_dom_x86.c#L1348-L1649) +and +[meminit_pv()](https://github.com/xen-project/xen/blob/de0254b9/tools/libs/guest/xg_dom_x86.c#L1183-L1333) +has its `claim_enabled` field set, they will, +before allocating the domain's system memory using the allocation function +[xc_populate_physmap()](https://github.com/xen-project/xen/blob/de0254b9/xen/common/memory.c#L159-L314) (which calls the hypercall to allocate and populate +the domain's main system memory), attempt to claim the to-be-allocated +memory using a call to `xc_domain_claim_pages()`. +In case this fails, they do not attempt to continue and return the error code +of `xc_domain_claim_pages()`. + +Both functions also (unconditionally) reset the claim upon return. + +Thus, the `xl` CLI uses this functionality (unless disabled in `xl.conf`) +to make building the domains fail early rather than running out of memory inside +the `meminit_hvm` and `meminit_pv` calls: +in that case, they immediately return an error. + +This means that in case the claim fails, `xl` avoids: +- The effort of allocating the memory, thereby not blocking it for other domains. +- The effort of potentially needing to scrub the memory after the build failure. + +### xenguest + +While [xenguest](../../../xenopsd/walkthroughs/VM.build/xenguest) calls the +[libxenguest](https://github.com/xen-project/xen/tree/master/tools/libs/guest) +functions +[meminit_hvm()](https://github.com/xen-project/xen/blob/de0254b9/tools/libs/guest/xg_dom_x86.c#L1348-L1649) +and +[meminit_pv()](https://github.com/xen-project/xen/blob/de0254b9/tools/libs/guest/xg_dom_x86.c#L1183-L1333) +like `libxl` does, it does not set +[struct xc_dom_image.claim_enabled](https://github.com/xen-project/xen/blob/de0254b9/tools/include/xenguest.h#L186), +so it does not enable the first call to `xc_domain_claim_pages()` +which would claim the amount of memory that these functions will +attempt to allocate and populate for the domain.
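+
+If a toolstack component did call this function directly, a minimal sketch could
+look like the following (an illustration only, not existing toolstack code; the
+helper name `stake_claim_example` is hypothetical and error handling is brief):
+
+```c
+/* Hypothetical sketch: stake a claim for a domain before populating its
+ * memory, then cancel the claim again by claiming zero pages. */
+#include <stdio.h>
+#include <xenctrl.h>
+
+int stake_claim_example(uint32_t domid, unsigned long nr_pages)
+{
+    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
+    if (!xch)
+        return -1;
+
+    /* Returns 0 if Xen can guarantee nr_pages 4k pages for this domain. */
+    int rc = xc_domain_claim_pages(xch, domid, nr_pages);
+    if (rc)
+        fprintf(stderr, "claim of %lu pages for dom%u failed\n", nr_pages, domid);
+
+    /* ... allocate and populate the domain's system memory here ... */
+
+    /* Claiming zero pages cancels the domain's outstanding claim. */
+    xc_domain_claim_pages(xch, domid, 0);
+    xc_interface_close(xch);
+    return rc;
+}
+```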
+ +#### Future design ideas for improved NUMA support + +For improved support for [NUMA](../../../toolstack/features/NUMA/), `xenopsd` +may want to call an updated version of this function for the domain after assigning +a NUMA node to a new domain, so that it has a stake on that NUMA node's memory before +`xenguest` allocates memory for the domain. + +Further, as PV drivers `unmap` and `free` memory for grant tables to Xen and +then re-allocate memory for those grant tables, `xenopsd` may want to try to +stake a very small claim for the domain on the NUMA node of the domain so that +Xen can increase this claim when the PV drivers `free` this memory and re-use +the resulting claimed amount for allocating the grant tables. This would ensure +that the grant tables are then allocated on the local NUMA node of the domain, +avoiding remote memory accesses when accessing the grant tables from inside +the domain. + +Note: In case the corresponding backend process in Dom0 is running on another +NUMA node, it would access the domain's grant tables from a remote NUMA node, +but this would enable a future improvement for Dom0, where it could prefer to +run the corresponding backend process on the same or a neighbouring NUMA node. diff --git a/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md b/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md new file mode 100644 index 00000000000..03b28e6b213 --- /dev/null +++ b/doc/content/lib/xenctrl/xc_domain_node_setaffinity.md @@ -0,0 +1,146 @@ +--- +title: xc_domain_node_setaffinity() +description: Set a Xen domain's NUMA node affinity for memory allocations +mermaid: + force: true +--- + +`xc_domain_node_setaffinity()` controls the NUMA node affinity of a domain, +but it only updates the Xen hypervisor domain's `d->node_affinity` mask. +This mask is read by the Xen memory allocator as the 2nd preference for the +NUMA node to allocate memory from for this domain. + +> [!info] Preferences of the Xen memory allocator: +> 1. A NUMA node passed to the allocator directly takes precedence, if present. +> 2. Then, if the allocation is for a domain, its `node_affinity` mask is tried. +> 3. Finally, it falls back to spreading the pages over all remaining NUMA nodes. + +As this call has no practical effect on the Xen scheduler, vCPU affinities +need to be set separately anyway. + +The domain's `auto_node_affinity` flag is enabled by default by Xen. This means +that when setting vCPU affinities, Xen updates the `d->node_affinity` mask +to consist of the NUMA nodes to which its vCPUs have affinity. + +See [xc_vcpu_setaffinity()](xc_vcpu_setaffinity) for more information +on how `d->auto_node_affinity` is used to set the NUMA node affinity. + +Thus, so far, there is no obvious need to call `xc_domain_node_setaffinity()` +when building a domain. + +Setting the NUMA node affinity using this call can be useful, +for example, when there might not be enough memory on the +preferred NUMA node, but there are other NUMA nodes that have +enough free memory to be used for the system memory of the domain. + +In terms of future NUMA design, it might be even more favourable to +have a strategy in `xenguest` where in such cases, the superpages +of the preferred node are used first and a fallback to neighbouring +NUMA nodes only happens to the extent necessary. + +Likely, the future allocation strategy should be passed to `xenguest` +using Xenstore like the other platform parameters for the VM.
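+
+As a minimal illustration of the call itself (an assumption-based sketch, not
+existing toolstack code; the helper name `prefer_numa_node` is hypothetical), a
+caller could restrict a domain's future memory allocations to a single NUMA node
+like this:
+
+```c
+/* Hypothetical sketch: prefer one NUMA node for a domain's memory allocations.
+ * This updates d->node_affinity and disables auto_node_affinity in Xen. */
+#include <stdlib.h>
+#include <xenctrl.h>
+
+int prefer_numa_node(uint32_t domid, unsigned int node)
+{
+    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
+    if (!xch)
+        return -1;
+
+    xc_nodemap_t nodemap = xc_nodemap_alloc(xch);  /* zero-initialised bitmap */
+    if (!nodemap) {
+        xc_interface_close(xch);
+        return -1;
+    }
+
+    nodemap[node / 8] |= 1 << (node % 8);          /* mark the preferred node */
+    int rc = xc_domain_node_setaffinity(xch, domid, nodemap);
+
+    free(nodemap);
+    xc_interface_close(xch);
+    return rc;
+}
+```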
+ +## Walk-through of xc_domain_node_setaffinity() + +```mermaid +classDiagram +class `xc_domain_node_setaffinity()` { + +xch: xc_interface #42; + +domid: uint32_t + +nodemap: xc_nodemap_t + 0(on success) + -EINVAL(if a node in the nodemask is not online) +} +click `xc_domain_node_setaffinity()` href " +https://github.com/xen-project/xen/blob/master/tools/libs/ctrl/xc_domain.c#L122-L158" + +`xc_domain_node_setaffinity()` --> `Xen hypercall: do_domctl()` +`xc_domain_node_setaffinity()` <-- `Xen hypercall: do_domctl()` +class `Xen hypercall: do_domctl()` { + Calls domain_set_node_affinity#40;#41; and returns its return value + Passes: domain (struct domain *, looked up using the domid) + Passes: new_affinity (nodemask, converted from xc_nodemap_t) +} +click `Xen hypercall: do_domctl()` href " +https://github.com/xen-project/xen/blob/master/xen/common/domctl.c#L516-L525" + +`Xen hypercall: do_domctl()` --> `domain_set_node_affinity()` +`Xen hypercall: do_domctl()` <-- `domain_set_node_affinity()` +class `domain_set_node_affinity()` { + domain: struct domain + new_affinity: nodemask + 0(on success, the domain's node_affinity is updated) + -EINVAL(if a node in the nodemask is not online) +} +click `domain_set_node_affinity()` href " +https://github.com/xen-project/xen/blob/master/xen/common/domain.c#L943-L970" +``` + +### domain_set_node_affinity() + +This function implements the functionality of `xc_domain_node_setaffinity` +to set the NUMA affinity of a domain as described above. +If the new_affinity does not intersect the `node_online_map`, +it returns `-EINVAL`. Otherwise, the call succeeds and it returns `0`. + +When the `new_affinity` is a specific set of NUMA nodes, it updates the NUMA +`node_affinity` of the domain to these nodes and disables `d->auto_node_affinity` +for this domain. With `d->auto_node_affinity` disabled, +[xc_vcpu_setaffinity()](xc_vcpu_setaffinity) no longer updates the NUMA affinity +of this domain. + +If `new_affinity` has all bits set, it re-enables the `d->auto_node_affinity` +for this domain and calls +[domain_update_node_aff()](https://github.com/xen-project/xen/blob/e16acd80/xen/common/sched/core.c#L1809-L1876) +to re-set the domain's `node_affinity` mask to the NUMA nodes of the current +hard and soft affinity of the domain's online vCPUs. + +### Flowchart in relation to xc_set_vcpu_affinity() + +The effect of `domain_set_node_affinity()` can be seen more clearly on this +flowchart which shows how `xc_set_vcpu_affinity()` is currently used to set +the NUMA affinity of a new domain, but also shows how `domain_set_node_affinity()` +relates to it: + +{{% include "xc_vcpu_setaffinity-xenopsd-notes.md" %}} +{{% include "xc_vcpu_setaffinity-xenopsd.md" %}} + +`xc_domain_node_setaffinity` can be used to set the domain's `node_affinity` +(which is normally set by `xc_set_vcpu_affinity`) to different NUMA nodes. + +#### No effect on the Xen scheduler + +Currently, the node affinity does not affect the Xen scheduler: +In case `d->node_affinity` would be set before vCPU creation, the initial pCPU +of the new vCPU is the first pCPU of the first NUMA node in the domain's +`node_affinity`. This is further changed when one or more `cpupools` are set up. +As this is only the initial pCPU of the vCPU, this alone does not change the +scheduling of the Xen Credit scheduler as it reschedules the vCPUs to other pCPUs.
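+
+For completeness, the current mask can be read back with the companion call
+`xc_domain_node_getaffinity()`, which also appears in the flowchart included
+above. A small hypothetical sketch (not existing toolstack code):
+
+```c
+/* Hypothetical sketch: read back and print a domain's NUMA node affinity. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <xenctrl.h>
+
+int print_node_affinity(xc_interface *xch, uint32_t domid)
+{
+    int max_nodes = xc_get_max_nodes(xch);
+    xc_nodemap_t nodemap = xc_nodemap_alloc(xch);
+    if (max_nodes <= 0 || !nodemap) {
+        free(nodemap);
+        return -1;
+    }
+
+    int rc = xc_domain_node_getaffinity(xch, domid, nodemap);
+    if (!rc) {
+        printf("dom%u node_affinity:", domid);
+        for (int node = 0; node < max_nodes; node++)
+            if (nodemap[node / 8] & (1 << (node % 8)))
+                printf(" %d", node);
+        printf("\n");
+    }
+
+    free(nodemap);
+    return rc;
+}
+```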
+ +## Notes on future design improvements + +### It may be possible to call it before vCPUs are created + +When done early, before vCPU creation, some domain-related data structures +could be allocated using the domain's `d->node_affinity` NUMA node mask. + +With further changes in Xen and `xenopsd`, Xen could allocate the vCPU structs +on the affine NUMA nodes of the domain. + +For this, `xenopsd` would have to call `xc_domain_node_setaffinity()` +before vCPU creation, after having decided the domain's NUMA placement, +preferably including claiming the required memory for the domain to ensure +that the domain will be populated from the same NUMA node(s). + +This call cannot influence the past: The `xenopsd` +[VM_create](../../xenopsd/walkthroughs/VM.start.md#2-create-a-xen-domain) +micro-op calls `Xenctrl.domain_create`. It currently creates +the domain's data structures before `numa_placement` is done. + +Improving `Xenctrl.domain_create` to pass a NUMA node +for allocating the Hypervisor's data structures (e.g. vCPU) +of the domain would require changes +to the Xen hypervisor and the `xenopsd` +[xenopsd VM_create](../../xenopsd/walkthroughs/VM.start.md#2-create-a-xen-domain) +micro-op. diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md new file mode 100644 index 00000000000..48ebf1185dd --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-simplified.md @@ -0,0 +1,30 @@ +--- +title: Simplified flowchart of xc_vcpu_setaffinity() +description: See lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md for an extended version +hidden: true +--- +```mermaid +flowchart TD +subgraph libxenctrl + xc_vcpu_setaffinity("xc_vcpu_setaffinity()")--hypercall-->xen +end +subgraph xen[Xen Hypervisor] +direction LR +vcpu_set_affinity("vcpu_set_affinity()
set the vCPU affinity") + -->check_auto_node{"Is the domain's
auto_node_affinity
enabled?"} + --"yes
(default)"--> + auto_node_affinity("Set the
domain's
node_affinity + mask as well
(used for further
NUMA memory
allocation)") + +click xc_vcpu_setaffinity +"https://github.com/xen-project/xen/blob/7cf16387/tools/libs/ctrl/xc_domain.c#L199-L250" _blank +click vcpu_set_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1353-L1393" _blank +click domain_update_node_aff +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1809-L1876" _blank +click check_auto_node +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1840-L1870" _blank +click auto_node_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1867-L1869" _blank +end +``` diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd-notes.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd-notes.md new file mode 100644 index 00000000000..a6e7a8be5be --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd-notes.md @@ -0,0 +1,13 @@ +--- +title: Notes for the flowchart on the use of setaffinity for VM.start +hidden: true +--- +In the flowchart, two code paths are set in bold: +- Show the path when `Host.numa_affinity_policy` is the default (off) in `xenopsd`. +- Show the default path of `xc_vcpu_setaffinity(XEN_VCPUAFFINITY_SOFT)` in Xen, + when the Domain's `auto_node_affinity` flag is enabled (default) to show + how it changes to the vCPU affinity update the domain's `node_affinity` + in this default case as well. + +[xenguest](../../xenopsd/walkthroughs/VM.build/xenguest/) uses the Xenstore +to read the static domain configuration that it needs reads to build the domain. diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md new file mode 100644 index 00000000000..f1fddecfbca --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity-xenopsd.md @@ -0,0 +1,176 @@ +--- +title: Flowchart of the use of xc_vcpu_setaffinity() by xenopsd +description: Shows how xenopsd uses xc_vcpu_setaffinity() to set NUMA affinity +hidden: true +--- +```mermaid +flowchart TD + +subgraph VM.create["xenopsd VM.create"] + + %% Is xe vCPU-params:mask= set? If yes, write to Xenstore: + + is_xe_vCPUparams_mask_set?{" + + Is + xe vCPU-params:mask= + set? Example: 1,2,3 + (Is used to enable vCPU
hard-affinity) + + "} --"yes"--> set_hard_affinity("Write hard-affinity to XenStore: + platform/vcpu/#domid/affinity + (xenguest will read this and other configuration data + from Xenstore)") + +end + +subgraph VM.build["xenopsd VM.build"] + + %% Labels of the decision nodes + + is_Host.numa_affinity_policy_set?{ + Is

Host.numa_affinity_policy

set?} + has_hard_affinity?{ + Is hard-affinity configured in

platform/vcpu/#domid/affinity?} + + %% Connections from VM.create: + set_hard_affinity --> is_Host.numa_affinity_policy_set? + is_xe_vCPUparams_mask_set? == "no"==> is_Host.numa_affinity_policy_set? + + %% The Subgraph itself: + + %% Check Host.numa_affinity_policy + + is_Host.numa_affinity_policy_set? + + %% If Host.numa_affinity_policy is "best_effort": + + -- Host.numa_affinity_policy is

best_effort --> + + %% If has_hard_affinity is set, skip numa_placement: + + has_hard_affinity? + --"yes"-->exec_xenguest + + %% If has_hard_affinity is not set, run numa_placement: + + has_hard_affinity? + --"no"-->numa_placement-->exec_xenguest + + %% If Host.numa_affinity_policy is off (default, for now), + %% skip NUMA placement: + + is_Host.numa_affinity_policy_set? + =="default: disabled"==> + exec_xenguest +end + +%% xenguest subgraph + +subgraph xenguest + + exec_xenguest + + ==> stub_xc_hvm_build("stub_xc_hvm_build()") + + ==> configure_vcpus("configure_vcpus()") + + %% Decision + ==> set_hard_affinity?{" + Is platform/
vcpu/#domid/affinity
+ set?"} + +end + +%% do_domctl Hypercalls + +numa_placement + --Set the NUMA placement using soft-affinity--> + XEN_VCPUAFFINITY_SOFT("xc_vcpu_setaffinity(SOFT)") + ==> do_domctl + +set_hard_affinity? + --yes--> + XEN_VCPUAFFINITY_HARD("xc_vcpu_setaffinity(HARD)") + --> do_domctl + +xc_domain_node_setaffinity("xc_domain_node_setaffinity() + and + xc_domain_node_getaffinity()") + <--> do_domctl + +%% Xen subgraph + +subgraph xen[Xen Hypervisor] + + subgraph domain_update_node_affinity["domain_update_node_affinity()"] + domain_update_node_aff("domain_update_node_aff()") + ==> check_auto_node{"Is domain's
auto_node_affinity
enabled?"} + =="yes (default)"==>set_node_affinity_from_vcpu_affinities(" + Calculate the domain's node_affinity mask from vCPU affinity + (used for further NUMA memory allocation for the domain)") + end + + do_domctl{"do_domctl()
op->cmd=?"} + ==XEN_DOMCTL_setvcpuaffinity==> + vcpu_set_affinity("vcpu_set_affinity()
set the vCPU affinity") + ==>domain_update_node_aff + do_domctl + --XEN_DOMCTL_setnodeaffinity (not used currently) + -->is_new_affinity_all_nodes? + + subgraph domain_set_node_affinity["domain_set_node_affinity()"] + + is_new_affinity_all_nodes?{new_affinity
is #34;all#34;?} + + --is #34;all#34; + + --> enable_auto_node_affinity("auto_node_affinity=1") + --> domain_update_node_aff + + is_new_affinity_all_nodes? + + --not #34;all#34; + + --> disable_auto_node_affinity("auto_node_affinity=0") + --> domain_update_node_aff + end + +%% setting and getting the struct domain's node_affinity: + +disable_auto_node_affinity + --node_affinity=new_affinity--> + domain_node_affinity + +set_node_affinity_from_vcpu_affinities + ==> domain_node_affinity@{ shape: bow-rect,label: "domain: node_affinity" } + --XEN_DOMCTL_getnodeaffinity--> do_domctl + +end +click is_Host.numa_affinity_policy_set? +"https://github.com/xapi-project/xen-api/blob/90ef043c1f3a3bc20f1c5d3ccaaf6affadc07983/ocaml/xenopsd/xc/domain.ml#L951-L962" +click numa_placement +"https://github.com/xapi-project/xen-api/blob/90ef043c/ocaml/xenopsd/xc/domain.ml#L862-L897" +click stub_xc_hvm_build +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L2329-L2436" _blank +click get_flags +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1164-L1288" _blank +click do_domctl +"https://github.com/xen-project/xen/blob/7cf163879/xen/common/domctl.c#L282-L894" _blank +click domain_set_node_affinity +"https://github.com/xen-project/xen/blob/7cf163879/xen/common/domain.c#L943-L970" _blank +click configure_vcpus +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1297-L1348" _blank +click set_hard_affinity? +"https://github.com/xenserver/xen.pg/blob/65c0438b/patches/xenguest.patch#L1305-L1326" _blank +click xc_vcpu_setaffinity +"https://github.com/xen-project/xen/blob/7cf16387/tools/libs/ctrl/xc_domain.c#L199-L250" _blank +click vcpu_set_affinity +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1353-L1393" _blank +click domain_update_node_aff +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1809-L1876" _blank +click check_auto_node +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1840-L1870" _blank +click set_node_affinity_from_vcpu_affinities +"https://github.com/xen-project/xen/blob/7cf16387/xen/common/sched/core.c#L1867-L1869" _blank +``` diff --git a/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md b/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md new file mode 100644 index 00000000000..8586492d9cc --- /dev/null +++ b/doc/content/lib/xenctrl/xc_vcpu_setaffinity.md @@ -0,0 +1,92 @@ +--- +title: xc_vcpu_setaffinity() +description: Set a Xen vCPU's pCPU affinity and the domain's NUMA node affinity +mermaid: + force: true +--- +## Introduction + +In the Xen hypervisor, each vCPU has: + +- A _soft affinity_, This is the list of pCPUs where a vCPU prefers to run: + + This can be used in cases to make vCPUs prefer to run on a set on pCPUs, + for example the pCPUs of a NUMA node, but in case those are already busy, + the Credit schedule can still ignore the soft-affinity. + A typical use case for this are NUMA machines, where the soft affinity + for the vCPUs of a domain should be set equal to the pCPUs of the NUMA node where the domain's memory shall be placed. + + See the description of the [NUMA feature](../../../toolstack/features/NUMA/) + for more details. + +- A _hard affinity_, also known as pinning. 
+ This is the list of pCPUs where a vCPU is allowed to run + + Hard affinity is currently not used for NUMA placement, but can be configured + manually for a given domain, either using `xe VCPUs-params:mask=` or the API. + + For example, the vCPU’s pinning can be configured using a template with: + ```py + xe template-param-set uuid= vCPUs-params:mask=1,2,3 + ``` + + There are also host-level `guest_VCPUs_params` which are used by + `host-cpu-tune` to exclusively pin Dom0 and guests (i.e. that their + pCPUs never overlap). Note: This isn't currently supported by the + NUMA code: It could result that the NUMA placement picks a node that + has reduced capacity or unavailable due to the host mask that + `host-cpu-tune` has set. + +## Purpose + +The libxenctrl library call `xc_set_vcpu_affinity()` +controls the pCPU affinity of the given vCPU. + +[xenguest](../../../xenopsd/walkthroughs/VM.build/xenguest/#walkthrough-of-the-xenguest-build-mode) +uses it when building domains if +[xenopsd](../../xenopsd/walkthroughs/VM.build/Domain.build) +added vCPU affinity information to the XenStore platform data path +`platform/vcpu/#domid/affinity` of the domain. + +### Updating the NUMA node affinity of a domain + +Besides that, `xc_set_vcpu_affinity()` can also modify the NUMA node +affinity of the Xen domain if the vCPU: + +When Xen creates a domain, it enables the domain's `d->auto_node_affinity` +feature flag. + +When it is enabled, setting the vCPU affinity also updates the NUMA node +affinity which is used for memory allocations for the domain: + +### Simplified flowchart + +{{% include "xc_vcpu_setaffinity-simplified.md" %}} + +## Current use by xenopsd and xenguest + +When `Host.numa_affinity_policy` is set to +[best_effort](../../../toolstack/features/NUMA/#xapi-datamodel-design), +[xenopsd](../../../xenopsd/walkthroughs/VM.build) attempts NUMA node placement +when building new VMs and instructs +[xenguest](../../../xenopsd/walkthroughs/VM.build/xenguest/#walkthrough-of-the-xenguest-build-mode) +to set the vCPU affinity of the domain. + +With the domain's `auto_node_affinity` flag enabled by default in Xen, +this automatically also sets the `d->node_affinity` mask of the domain. + +This then causes the Xen memory allocator to prefer the NUMA nodes in the +`d->node_affinity` NUMA node mask when allocating memory. + +That is, (for completeness) unless Xen's allocation function +`alloc_heap_pages()` receives a specific NUMA node in its `memflags` +argument when called. + +See [xc_domain_node_setaffinity()](xc_domain_node_setaffinity) for more +information about another way to set the `node_affinity` NUMA node mask +of Xen domains and more depth on how it is used in Xen. + +### Flowchart of its current use for NUMA affinity + +{{% include "xc_vcpu_setaffinity-xenopsd-notes.md" %}} +{{% include "xc_vcpu_setaffinity-xenopsd.md" %}} diff --git a/doc/content/python/_index.md b/doc/content/python/_index.md index 773f02ce38c..523c2018718 100644 --- a/doc/content/python/_index.md +++ b/doc/content/python/_index.md @@ -52,7 +52,7 @@ in the [pre-commit] configuration file [.pre-commit-config.yaml]. 
entry: sh -c 'coverage run && coverage xml && coverage html && coverage report && diff-cover --ignore-whitespace --compare-branch=origin/master - --show-uncovered --html-report .git/coverage-diff.html + --show-uncovered --format html:.git/coverage-diff.html --fail-under 50 .git/coverage3.11.xml' require_serial: true pass_filenames: false diff --git a/doc/content/squeezed/architecture/index.md b/doc/content/squeezed/architecture/index.md index fb86fd69989..2f7135fe926 100644 --- a/doc/content/squeezed/architecture/index.md +++ b/doc/content/squeezed/architecture/index.md @@ -1,8 +1,9 @@ +++ -title = "Architecture" +title = "Squeezed Architecture" +linkTitle = "Architecture" +++ -Squeezed is responsible for managing the memory on a single host. Squeezed +Squeezed is the XAPI Toolstack’s host memory ballooning daemon. It "balances" memory between VMs according to a policy written to Xenstore. The following diagram shows the internals of Squeezed: diff --git a/doc/content/toolstack/features/SSH/index.md b/doc/content/toolstack/features/SSH/index.md new file mode 100644 index 00000000000..a0a7c937706 --- /dev/null +++ b/doc/content/toolstack/features/SSH/index.md @@ -0,0 +1,249 @@ +# SSH Management + +SSH Management enables programmatic control of SSH access to XenServer hosts. This feature +allows administrators to enable/disable SSH services, configure timeout settings, and implement +automatic SSH management based on XAPI health status. + +## Architecture Overview + +The SSH Management feature is built around three core components: + +1. **SSH Service Control**: Direct enable/disable operations for SSH on individual hosts or entire pools +2. **Timeout Management**: Configurable timeouts for both SSH sessions and service duration limits +3. **Auto Mode**: Intelligent SSH management that automatically adjusts based on XAPI health status + +![SSH Status Transition](ssh-status-trans.png) + +## SSH Service Control + +### API Design + +#### Host APIs + +- `host.enable_ssh`: Enables SSH access on the specified host +- `host.disable_ssh`: Disables SSH access on the specified host +- `host.set_ssh_enabled_timeout`: Configures SSH service timeout duration (0-172800 seconds, maximum 2 days) +- `host.set_console_idle_timeout`: Sets idle timeout for SSH/VNC console sessions +- `host.set_ssh_auto_mode`: Controls SSH auto mode behavior (when true, SSH is normally disabled but enabled during XAPI downtime) + +#### Pool APIs + +- `pool.enable_ssh`: Enables SSH access across all hosts in the pool +- `pool.disable_ssh`: Disables SSH access across all hosts in the pool +- `pool.set_ssh_enabled_timeout`: Sets SSH service timeout for all pool hosts +- `pool.set_console_idle_timeout`: Configures console idle timeout for all pool hosts +- `pool.set_ssh_auto_mode`: Applies SSH auto mode configuration to all pool hosts + +### Implementation Details + +The enable/disable operations work by directly managing systemd services. The code starts and enables the sshd systemd service to enable SSH access, or stops and disables it to disable SSH access: + +```ocaml +Xapi_systemctl.start "sshd" +Xapi_systemctl.enable "sshd" + +Xapi_systemctl.stop "sshd" +Xapi_systemctl.disable "sshd" +``` + +#### SSH Timeout Management + +The timeout management uses the scheduler system to automatically disable SSH after a specified period. The function removes any existing disable job from the queue and creates a new one-shot job that will execute the SSH disable operation when the timeout expires. 
If XAPI restarts during this period, it will schedule a new job to disable SSH with the remaining time: + +```ocaml +let schedule_disable_ssh_job ~__context ~self ~timeout ~auto_mode = + Xapi_stdext_threads_scheduler.Scheduler.remove_from_queue + !Xapi_globs.job_for_disable_ssh ; + Xapi_stdext_threads_scheduler.Scheduler.add_to_queue + !Xapi_globs.job_for_disable_ssh + Xapi_stdext_threads_scheduler.Scheduler.OneShot (Int64.to_float timeout) + (fun () -> + disable_ssh_internal ~__context ~self + ) +``` + +#### Console Idle Timeout + +The console idle timeout is configured by writing to a profile script that sets the TMOUT environment variable. The function generates appropriate content based on the timeout value and atomically writes it to the profile script file: + +```ocaml +let set_console_idle_timeout ~__context ~self ~value = + let content = match value with + | 0L -> "# Console timeout is disabled\n" + | timeout -> Printf.sprintf "# Console timeout configuration\nexport TMOUT=%Ld\n" timeout + in + Unixext.atomic_write_to_file !Xapi_globs.console_timeout_profile_path 0o0644 + (fun fd -> Unix.write fd (Bytes.of_string content) 0 (String.length content)) +``` + +#### SSH Auto Mode + +The SSH auto mode is configured by managing the monitoring service. The function updates the database with the auto mode setting and then enables or disables the SSH monitoring daemon accordingly. When auto mode is enabled, it starts the monitoring service and enables the SSH service (the SSH service is always enabled so that, if both XAPI and the monitor service are down, the user can still regain SSH access by rebooting the host); when disabled, it stops and disables the monitoring service: + +```ocaml +let set_ssh_auto_mode ~__context ~self ~value = + Db.Host.set_ssh_auto_mode ~__context ~self ~value ; + if value then ( + Xapi_systemctl.enable ~wait_until_success:false !Xapi_globs.ssh_service ; + Xapi_systemctl.enable ~wait_until_success:false !Xapi_globs.ssh_monitor_service ; + Xapi_systemctl.start ~wait_until_success:false !Xapi_globs.ssh_monitor_service + ) else ( + Xapi_systemctl.stop ~wait_until_success:false !Xapi_globs.ssh_monitor_service ; + Xapi_systemctl.disable ~wait_until_success:false !Xapi_globs.ssh_monitor_service + ) +``` + +### CLI Commands + +```bash +# Enable/disable SSH on hosts +xe host-enable-ssh host= +xe host-disable-ssh host-uuid= + +# Configure timeouts on individual hosts +xe host-param-set uuid= ssh-enabled-timeout=3600 +xe host-param-set uuid= console-idle-timeout=300 +xe host-param-set uuid= ssh-auto-mode=true + +# Query host SSH parameters +xe host-param-get uuid= param-name=ssh-enabled +xe host-param-get uuid= param-name=ssh-expiry +xe host-param-get uuid= param-name=ssh-enabled-timeout +xe host-param-get uuid= param-name=console-idle-timeout +xe host-param-get uuid= param-name=ssh-auto-mode + +# Enable/disable SSH across pool +xe pool-enable-ssh +xe pool-disable-ssh + +# Configure timeouts across pool +xe pool-param-set uuid= ssh-enabled-timeout=3600 +xe pool-param-set uuid= console-idle-timeout=300 +xe pool-param-set uuid= ssh-auto-mode=true + +# Query pool SSH parameters +xe pool-param-get uuid= param-name=ssh-enabled +xe pool-param-get uuid= param-name=ssh-expiry +xe pool-param-get uuid= param-name=ssh-enabled-timeout +xe pool-param-get uuid= param-name=console-idle-timeout +xe pool-param-get uuid= param-name=ssh-auto-mode +``` + +## Auto Mode + +### Overview + +The auto mode feature intelligently manages SSH access based on XAPI health status: +- SSH is automatically enabled when XAPI
becomes unhealthy +- SSH is automatically disabled when XAPI is healthy and running normally + +When the user enables the SSH service with `enable_ssh` API, SSH auto mode will be turned off. +| SSH service | auto mode | +|-------------|-----------| +| enabled | off | + +If SSH auto mode is enabled and XAPI becomes unresponsive, the system will automatically enable the SSH service to allow access. +| auto mode | xapi healthy | SSH service | +|-----------|--------------|-------------| +| on | yes | disable | +| on | no | enable | +| off | NA | NA | + +When SSH is temporarily enabled using the ssh-enabled-timeout setting and enable-ssh command, the system preserves the original SSH auto-mode state in cache. During the timeout period, SSH auto-mode is suspended (set to off) to allow SSH access. Once the timeout expires, the system restores the cached auto-mode state - if auto-mode was originally enabled, it will be reactivated and automatically stop the SSH service again +| auto mode before set enable timeout | SSH service before set enable timeout | auto mode during the limited time period | auto mode after enable timeout | +|-----------------------------------|--------------------------------------|----------------------------------------|-------------------------------| +| on | off | off | on | + +### Service Architecture + +#### Monitoring Daemon + +The monitoring daemon (`/opt/xensource/libexec/xapi-state-monitor`) operates continuously: + +1. Monitors current SSH service status +2. When auto mode is enabled: + - If XAPI is healthy and SSH is active → Stop SSH + - If XAPI is unhealthy and SSH is inactive → Start SSH +3. Implements retry logic with up to 3 attempts for failed operations +4. Pauses for 60 seconds between health check cycles + +### Health Check Integration + +The system leverages the existing `xapi-health-check` script for health monitoring: +- Returns 0 when XAPI is healthy +- Returns 1 when XAPI is unhealthy +- Triggers unhealthy status after 20 consecutive failures + +### Configuration + +#### Default Behavior + +- **XenServer 8**: `ssh_auto_mode=false` (SSH is enabled by default) +- **XenServer 9**: `ssh_auto_mode=true` (SSH is disabled by default) + +#### Configuration Files + +In XS8, the ssh_auto_mode default value will be overridden by the configuration file as below, while in XS9, there is no configuration file, so auto-mode will remain enabled by default. + +```bash +# XS8: /etc/xapi.conf.d/ssh-auto-mode.conf +ssh_auto_mode=false +``` + +## Pool Operations + +### Pool Join + +When a host joins a pool, the following sequence occurs: +1. The host inherits SSH configuration from the pool coordinator +2. SSH settings are applied before metadata updates +3. The xapi-ssh-monitor service is started if auto mode is enabled + +### Pool Eject + +When a host is ejected from a pool: +1. The host resets to its default configuration (e.g., in XS8 SSH enabled, no timeout) +2. Default SSH configuration is applied before the host becomes a coordinator + +## XAPI Restart Handling + +During XAPI startup, the system performs several key operations to handle different restart scenarios: + +#### SSH Status Synchronization +The database is updated to reflect the actual SSH service state, ensuring consistency between the database and the running system. 
+ +#### Short XAPI Downtime Recovery +When `ssh_enabled_timeout > 0` and `ssh_expiry > current_time`, indicating that XAPI restarted during a temporary SSH disable period: +- The system reschedules the disable SSH job with the remaining time +- This ensures that the original timeout period is maintained even after XAPI restart + +#### Extended XAPI Downtime Handling +When an ssh_enabled_timeout is configured, `ssh_expiry < current_time`, and the SSH service is currently active, indicating that XAPI was down for an extended period that exceeded the timeout duration: +- SSH is automatically disabled +- SSH auto mode is enabled to ensure continuous SSH availability + +This scenario typically occurs when XAPI is not active at the moment the SSH timeout expires, requiring the system to disable SSH and enable auto mode so that SSH remains continuously available. + +## Error Handling + +### Retry Logic + +The system implements robust retry mechanisms: +- SSH disable operations are retried up to 3 times +- 5-second intervals are maintained between retry attempts + +## Integration Points + +### xsconsole Integration + +The xsconsole interface has been updated to use XAPI APIs rather than direct systemd commands, to stay consistent with the XAPI database status: +- Enable/Disable operations: Calls `host.enable_ssh`/`host.disable_ssh` +- Auto mode configuration: Calls `host.set_ssh_auto_mode` + +### Answerfile Support + +The following configuration can be used in the answerfile. When ssh-mode is configured to on, auto-mode will be disabled and SSH will be enabled; when configured to off, auto-mode will be disabled and SSH will be disabled as well; when configured to auto, auto-mode will be enabled and SSH will be disabled by auto-mode once XAPI is up: + +```xml +on|off|auto +``` \ No newline at end of file diff --git a/doc/content/toolstack/features/SSH/ssh-status-trans.png b/doc/content/toolstack/features/SSH/ssh-status-trans.png new file mode 100644 index 00000000000..40cf16255a7 Binary files /dev/null and b/doc/content/toolstack/features/SSH/ssh-status-trans.png differ diff --git a/doc/content/toolstack/features/Tracing/index.md b/doc/content/toolstack/features/Tracing/index.md new file mode 100644 index 00000000000..4c90c570699 --- /dev/null +++ b/doc/content/toolstack/features/Tracing/index.md @@ -0,0 +1,140 @@ ++++ +title = "Tracing" ++++ + +Tracing is a powerful tool for observing system behavior across multiple components, making it especially +useful for debugging and performance analysis in complex environments. + +By integrating OpenTelemetry (a standard that unifies OpenTracing and OpenCensus) and the Zipkin v2 protocol, +XAPI enables efficient tracking and visualization of operations across internal and external systems. +This facilitates detailed analysis and improves collaboration between teams. + +Tracing is commonly used in high-level applications such as web services. As a result, less widely-used or +non-web-oriented languages may lack dedicated libraries for distributed tracing (an OCaml implementation +has been developed specifically for XenAPI). + +# How tracing works in XAPI + +## Spans and Trace Context + +- A *span* is the core unit of a trace, representing a single operation with a defined start and end time. + Spans can contain sub-spans that represent child tasks. This helps identify bottlenecks or areas that + can be parallelized. + - A span can contain several contextual elements such as *tags* (key-value pairs), + *events* (time-based data), and *errors*.
+- The *TraceContext* HTTP standard defines how trace IDs and span contexts are propagated across systems, + enabling full traceability of operations. + +This data enables the creation of relationships between tasks and supports visualizations such as +architecture diagrams or execution flows. These help in identifying root causes of issues and bottlenecks, +and also assist newcomers in onboarding to the project. + +## Configuration + +- To enable tracing, you need to create an *Observer* object in XAPI. This can be done using the *xe* CLI: + ```sh + xe observer-create \ + name-label= \ + enabled=true \ + components=xapi,xenopsd \ + ``` +- By default, if you don't specify `enabled=true`, the observer will be disabled. +- To add an HTTP endpoint, make sure the server is up and running, then run: + ```sh + xe observer-param-set uuid= endpoints=bugtool,http://:9411/api/v2/spans + ``` + If you specify an invalid or unreachable HTTP endpoint, the configuration will fail. +- **components**: Specify which internal components (e.g., *xapi*, *xenopsd*) should be traced. + Additional components are expected to be supported in future releases. An experimental *smapi* component + is also available and requires additional configuration (explained below). + +- **endpoints**: The observer can collect traces locally in */var/log/dt* or forward them to external + visualization tools such as [Jaeger](https://www.jaegertracing.io/). Currently, only HTTP/S endpoints + are supported, and they require additional configuration steps (see next section). + +- To disable tracing you just need to set *enabled* to false: + ```sh + xe observer-param-set uuid= enabled=false + ``` + +### Enabling smapi component + +- *smapi* component is currently considered experimental and is filtered by default. To enable it, you must + explicitly configure the following in **xapi.conf**: + ```ini + observer-experimental-components="" + ``` + This tells XAPI that no components are considered experimental, thereby allowing *smapi* to be traced. + A modification to **xapi.conf** requires a restart of the XAPI toolstack. + +### Enabling HTTP/S endpoints + +- By default HTTP and HTTPS endpoints are disabled. To enable them, add the following lines to **xapi.conf**: + ```ini + observer-endpoint-http-enabled=true + observer-endpoint-https-enabled=true + ``` + As with enabling *smapi* component, modifying **xapi.conf** requires a restart of the XAPI toolstack. + *Note*: HTTPS endpoint support is available but not tested and may not work. + +### Sending local trace to endpoint + +By default, traces are generated locally in the `/var/log/dt` directory. You can copy or forward +these traces to another location or endpoint using the `xs-trace` tool. For example, if you have +a *Jaeger* server running locally, you can copy a trace to an endpoint by running: + +```sh +xs-trace cp /var/log/dt/ http://127.0.0.1:9411/api/v2/spans +``` + +You will then be able to visualize the traces in Jaeger. + +The `xs-trace` tool also supports trace files in `.ndjson` and compressed `.zst` formats, so +you can copy or forward these files directly as well. + +### Tagging Trace Sessions for Easier Search + +#### Specific attributes +To make trace logs easier to locate and analyze, it can be helpful to add custom attributes around the +execution of specific commands. For example: + +```sh +# xe observer-param-set uuid= attributes:custom.random=1234 +# xe vm-start ... 
+# xe observer-param-clear uuid= param-name=attributes param-key=custom.random +``` + +This technique adds a temporary attribute, *custom.random=1234*, which will appear in the generated trace +spans, making it easier to search for specific activity in trace visualisation tools. It may also be possible +to achieve similar tagging using baggage parameters directly in individual *xe* commands, but this approach +is currently undocumented. + +#### Baggage + +*Baggage*, contextual information that resides alongside the context, is supported. This means you can run +the following command: + +```sh +BAGGAGE="mybaggage=apples" xe vm-list +``` + +You will be able to search for tags `mybaggage=apples`. + +#### Traceparent + +Another way to assist in trace searching is to use the `TRACEPARENT` HTTP header. It is an HTTP header field that +identifies the incoming request. It has a [specific format](https://www.w3.org/TR/trace-context/#traceparent-header) +and it is supported by **XAPI**. Once generated you can run command as: + +```sh +TRACEPARENT="00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01" xe vm-list +``` + +And you will be able to look for trace *4bf92f3577b34da6a3ce929d0e0e4736*. + +### Links + +- [Opentelemetry](https://opentelemetry.io/) +- [Trace Context](https://www.w3.org/TR/trace-context/) +- [Baggage](https://opentelemetry.io/docs/concepts/signals/baggage/) +- [Ocaml opentelemetry module](https://ocaml.org/p/opentelemetry/latest) diff --git a/doc/content/toolstack/features/events/index.md b/doc/content/toolstack/features/events/index.md index 3d76d4db927..98bdf17e6ae 100644 --- a/doc/content/toolstack/features/events/index.md +++ b/doc/content/toolstack/features/events/index.md @@ -72,9 +72,9 @@ while True: events = session.xenapi.event.next() # block until a xapi event on a xapi DB object is available for event in events: print "received event op=%s class=%s ref=%s" % (event['operation'], event['class'], event['ref']) - if event['class'] == 'vm' and event['operatoin'] == 'mod': + if event['class'] == 'vm' and event['operation'] == 'mod': vm = event['snapshot'] - print "xapi-event on vm: vm_uuid=%s, power_state=%s, current_operation=%s" % (vm['uuid'],vm['name_label'],vm['power_state'],vm['current_operations'].values()) + print "xapi-event on vm: vm_uuid=%s, vm_name_label=%s, power_state=%s, current_operation=%s" % (vm['uuid'],vm['name_label'],vm['power_state'],vm['current_operations'].values()) except XenAPI.Failure, e: if len(e.details) > 0 and e.details[0] == 'EVENTS_LOST': session.xenapi.event.unregister(["VM","pool"]) diff --git a/doc/content/xapi/alarms/index.md b/doc/content/xapi/alarms/index.md new file mode 100644 index 00000000000..da4c9e542ca --- /dev/null +++ b/doc/content/xapi/alarms/index.md @@ -0,0 +1,218 @@ ++++ +title = "How to set up alarms" +linkTitle = "Alarms" ++++ + +# Introduction + +In XAPI, alarms are triggered by a Python daemon located at `/opt/xensource/bin/perfmon`. +The daemon is managed as a systemd service and can be configured by setting parameters in `/etc/sysconfig/perfmon`. + +It listens on an internal Unix socket to receive commands. Otherwise, it runs in a loop, periodically requesting metrics from XAPI. It can then be configured to generate events based on these metrics. It can monitor various types of XAPI objects, including `VMs`, `SRs`, and `Hosts`. The configuration for each object is defined by writing an XML string into the object's `other-config` key. + +The metrics used by `perfmon` are collected by the `xcp-rrdd` daemon. 
The `xcp-rrdd` daemon is a component of XAPI responsible for collecting metrics and storing them as Round-Robin Databases (RRDs). + +A XAPI plugin also exists, providing the functions `refresh` and `debug_mem`, which send commands through the Unix socket. The `refresh` function is used when an `other-config` key is added or updated; it triggers the daemon to reread the monitored objects so that new alerts are taken into account. The `debug_mem` function logs the objects currently being monitored into `/var/log/user.log` as a dictionary. + +# Monitoring and alarms + +## Overview + +- To get the metrics, `perfmon` requests XAPI by calling: `http://localhost/rrd_updates?session_id=&start=1759912021&host=true&sr_uuid=all&cf=AVERAGE&interval=60` +- Different consolidation functions can be used like **AVERAGE**, **MIN**, **MAX** or **LAST**. See the details in the next sections for specific objects and how to set it. +- Once retrieve, `perfmon` will check all its triggers and generate alarms if needed. + +## Specific XAPI objects +### VMs + +- To set an alarm on a VM, you need to write an XML string into the `other-config` key of the object. For example, to trigger an alarm when the CPU usage is higher than 50%, run: +```sh +xe vm-param-set uuid= other-config:perfmon=' ' +``` + +- Then, you can either wait until the new configuration is read by the `perfmon` daemon or force a refresh by running: +```sh +xe host-call-plugin host-uuid= plugin=perfmon fn=refresh +``` + +- Now, if you generate some load inside the VM and the CPU usage goes above 50%, the `perfmon` daemon will create a message (a XAPI object) with the name **ALARM**. This message will include a _priority_, a _timestamp_, an _obj-uuid_ and a _body_. To list all messages that are alarms, run: +```sh +xe message-list name=ALARM +``` + +- You will see, for example: +```sh +uuid ( RO) : dadd7cbc-cb4e-5a56-eb0b-0bb31c102c94 + name ( RO): ALARM + priority ( RO): 3 + class ( RO): VM + obj-uuid ( RO): ea9efde2-d0f2-34bb-74cb-78c303f65d89 + timestamp ( RO): 20251007T11:30:26Z + body ( RO): value: 0.986414 +config: + + + + + + + +``` +- where the _body_ contains all the relevant information: the value that triggered the alarm and the configuration of your alarm. + +- When configuring you alarm, your XML string can: + - have multiple `` nodes + - use the following values for child nodes: + * **name**: what to call the variable (no default) + * **alarm_priority**: the priority of the messages generated (default '3') + * **alarm_trigger_level**: level of value that triggers an alarm (no default) + * **alarm_trigger_sense**:'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high') + * **alarm_trigger_period**: num seconds of 'bad' values before an alarm is sent (default '60') + * **alarm_auto_inhibit_period**: num seconds this alarm disabled after an alarm is sent (default '3600') + * **consolidation_fn**: how to combine variables from rrd_updates into one value (default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage', 'get_percent_log_fs_usage' for 'log_fs_usage','get_percent_mem_usage' for 'mem_usage', & 'sum' for everything else) + * **rrd_regex** matches the names of variables from (xe vm-data-sources-list uuid=$vmuuid) used to compute value (only has defaults for "cpu_usage", "network_usage", and "disk_usage") + +- Notice that `alarm_priority` will be the priority of the generated `message`, 0 being low priority. 
+ +### SRs + +- To set an alarm on an SR object, as with VMs, you need to write an XML string into the `other-config` key of the SR. For example, you can run: +```sh +xe sr-param-set uuid= other-config:perfmon='' +``` +- When configuring you alarm, the XML string supports the same child elements as for VMs + +### Hosts + +- As with VMs ans SRs, alarms can be configured by writing an XML string into an `other-config` key. For example, you can run: +```sh +xe host-param-set uuid= other-config:perfmon=\ + '' +``` + +- The XML string can include multiple nodes allowed +- The full list of supported child nodes is: + * **name**: what to call the variable (no default) + * **alarm_priority**: the priority of the messages generated (default '3') + * **alarm_trigger_level**: level of value that triggers an alarm (no default) + * **alarm_trigger_sense**: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high') + * **alarm_trigger_period**: num seconds of 'bad' values before an alarm is sent (default '60') + * **alarm_auto_inhibit_period**:num seconds this alarm disabled after an alarm is sent (default '3600') + * **consolidation_fn**: how to combine variables from rrd_updates into one value (default is 'average' for 'cpu_usage' & 'sum' for everything else) + * **rrd_regex** matches the names of variables from (xe host-data-source-list uuid=) used to compute value (only has defaults for "cpu_usage", "network_usage", "memory_free_kib" and "sr_io_throughput_total_xxxxxxxx") where that last one ends with the first eight characters of the SR UUID) + +- As a special case for SR throughput, it is also possible to configure a Host by writing XML into the `other-config` key of an SR connected to it. For example: +```sh +xe sr-param-set uuid=$sruuid other-config:perfmon=\ + '' +``` +- This only works for that specific variable name, and `rrd_regex` must not be specified. +- Configuration done directly on the host (variable-name, sr_io_throughput_total_xxxxxxxx) takes priority. + +## Which metrics are available? + +- Accepted name for metrics are: + - **cpu_usage**: matches RRD metrics with the pattern `cpu[0-9]+` + - **network_usage**: matches RRD metrics with the pattern `vif_[0-9]+_[rt]x` + - **disk_usage**: match RRD metrics with the pattern `vbd_(xvd|hd)[a-z]+_(read|write)` + - **fs_usage**, **log_fs_usage**, **mem_usage** and **memory_internal_free** do not match anything by default. +- By using `rrd_regex`, you can add your own expressions. To get a list of available metrics with their descriptions, you can call the `get_data_sources` method for [VM](https://xapi-project.github.io/new-docs/xen-api/classes/vm/), for [SR](https://xapi-project.github.io/new-docs/xen-api/classes/sr/) and also for [Host](https://xapi-project.github.io/new-docs/xen-api/classes/host/). +- A python script is provided at the end to get data sources. Using the script we can, for example, see: +```sh +# ./get_data_sources.py --vm 5a445deb-0a8e-c6fe-24c8-09a0508bbe21 + +List of data sources related to VM 5a445deb-0a8e-c6fe-24c8-09a0508bbe21 +cpu0 | CPU0 usage +cpu_usage | Domain CPU usage +memory | Memory currently allocated to VM +memory_internal_free | Memory used as reported by the guest agent +memory_target | Target of VM balloon driver +... +vbd_xvda_io_throughput_read | Data read from the VDI, in MiB/s +... 
+``` +- You can then set up an alarm when the data read from a VDI exceeds a certain level by doing: +``` +xe vm-param-set uuid=5a445deb-0a8e-c6fe-24c8-09a0508bbe21 \ + other-config:perfmon=' \ + \ + \ + \ + ' +``` +- Here is the script that allows you to get data sources: +```python +#!/usr/bin/env python3 + +import argparse +import sys +import XenAPI + + +def pretty_print(data_sources): + if not data_sources: + print("No data sources.") + return + + # Compute alignment for something nice + max_label_len = max(len(data["name_label"]) for data in data_sources) + + for data in data_sources: + label = data["name_label"] + desc = data["name_description"] + print(f"{label:<{max_label_len}} | {desc}") + + +def list_vm_data(session, uuid): + vm_ref = session.xenapi.VM.get_by_uuid(uuid) + data_sources = session.xenapi.VM.get_data_sources(vm_ref) + print(f"\nList of data sources related to VM {uuid}") + pretty_print(data_sources) + + +def list_host_data(session, uuid): + host_ref = session.xenapi.host.get_by_uuid(uuid) + data_sources = session.xenapi.host.get_data_sources(host_ref) + print(f"\nList of data sources related to Host {uuid}") + pretty_print(data_sources) + + +def list_sr_data(session, uuid): + sr_ref = session.xenapi.SR.get_by_uuid(uuid) + data_sources = session.xenapi.SR.get_data_sources(sr_ref) + print(f"\nList of data sources related to SR {uuid}") + pretty_print(data_sources) + + +def main(): + parser = argparse.ArgumentParser( + description="List data sources related to VM, host or SR" + ) + parser.add_argument("--vm", help="VM UUID") + parser.add_argument("--host", help="Host UUID") + parser.add_argument("--sr", help="SR UUID") + + args = parser.parse_args() + + # Connect to local XAPI: no identification required to access local socket + session = XenAPI.xapi_local() + + try: + session.xenapi.login_with_password("", "") + if args.vm: + list_vm_data(session, args.vm) + if args.host: + list_host_data(session, args.host) + if args.sr: + list_sr_data(session, args.sr) + except XenAPI.Failure as e: + print(f"XenAPI call failed: {e.details}") + sys.exit(1) + finally: + session.xenapi.session.logout() + + +if __name__ == "__main__": + main() +``` + diff --git a/doc/content/xapi/guides/howtos/add-function.md b/doc/content/xapi/guides/howtos/add-function.md index 8aeedfb27fb..cbde59a991e 100644 --- a/doc/content/xapi/guides/howtos/add-function.md +++ b/doc/content/xapi/guides/howtos/add-function.md @@ -172,8 +172,8 @@ the Host module: let price_of ~__context ~host ~item = info "Host.price_of for item %s" item; let local_fn = Local.Host.price_of ~host ~item in - do_op_on ~local_fn ~__context ~host - (fun session_id rpc -> Client.Host.price_of ~rpc ~session_id ~host ~item) + let remote_fn = Client.Host.price_of ~host ~item in + do_op_on ~local_fn ~__context ~host ~remote_fn After the ~__context parameter, the parameters of this new function should match the parameters we specified for the message. In this case, that is the diff --git a/doc/content/xapi/internals/certificates.md b/doc/content/xapi/internals/certificates.md new file mode 100644 index 00000000000..c63a2499d65 --- /dev/null +++ b/doc/content/xapi/internals/certificates.md @@ -0,0 +1,111 @@ + ++++ +title = "Certificates and PEM Files" ++++ + +Xapi uses certificates for secure communication within a pool and with +external clients. These certificates are using the PEM file format and +reside in the Dom0 file system. This documents explains the purpose of +these files. 
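+
+As a quick orientation, you can inspect the certificate a host currently presents
+to API clients without touching any of these files. A minimal sketch using only
+Python's standard library (assuming the host's API HTTPS endpoint is reachable on
+the default port 443; the hostname is a placeholder):
+
+```python
+import ssl
+
+# Fetch the PEM-encoded certificate served by xapi's HTTPS API endpoint
+# (on a host this is backed by /etc/xensource/xapi-ssl.pem, see below).
+pem = ssl.get_server_certificate(("xenserver.example.com", 443))
+print(pem)
+```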
+ +##  Design Documents + +* [Pool Certificates](../../design/pool-certificates.md) +* [User Certificates](../../design/user-certificates.md) + +## Paths + +Below are paths used by Xapi for certificates; additional certficates +may be installed but they are not fundamental for Xapi's operation. + +``` +/etc/xensource/xapi-ssl.pem +/etc/xensource/xapi-pool-tls.pem +/etc/stunnel/certs-pool/1c111a1f-412e-47c0-9003-60789b839bc3.pem +/etc/stunnel/certs-pool/960abfff-6017-4d97-bd56-0a8f1a43e51a.pem +/etc/stunnel/xapi-stunnel-ca-bundle.pem +/etc/stunnel/certs/ +/etc/stunnel/xapi-pool-ca-bundle.pem +``` + + +## Fundamental Certificates + +Certificates that identify a host. These certificates are comprised of +both a private and a public key. The public key may be distributed to +other hosts. + +### xapi-ssl.pem + +This certificate identifies a host for extra-pool clients. + +This is the certificate used by the API HTTPS server that clients like +XenCenter or CVAD connect to. On installation of XenServer it is auto +generated but can be updated by a user using the API. This is the most +important certificate for a user to establish an HTTPS connection to a +pool or host to be used as an API. + +* /etc/xensource/xapi-ssl.pem +* contains private and public key for this host +* `Host.get_server_certificate` API call +* referenced by /etc/stunnel/xapi.conf +* `xe host-server-certificate-install` XE command to replace the + certificate. +* See below for xapi-stunnel-ca-bundle for additional certificates that + can be added to a pool in support of a user-supplied host certificate. +* `xe host-reset-server-certificate` creates a new self-signed certificate. + + +### `xapi-pool-tls.pem` + +This certificate identifies a host inside a pool. It is auto generated +and used for all intra-pool HTTPS connections. It needs to be +distributed inside a pool to establish trust. The distribution of the +public part of the certificate is performed by the API and must not be +done manually. + +* /etc/xensource/xapi-pool-tls.pem +* contains private and public key for this host +* referenced by /etc/stunnel/xapi.conf +* This certificate can be re-generated using the API or XE +* `Host.refresh_server_certificate` +* `xe host-refresh-server-certificate` + +## Certificate Bundles + +Certifiacte bundles are used by stunnel. They are a collection of public +keys from hosts and certificates provided by a user. Knowing a host's +public key facilitates stunnel connecting to the host. + +Bundles by themselves are a technicality as they organise a set of +certificates in a single file but don't add new certificates. + +### `xapi-pool-ca-bundle.pem` and `certs-pool/*.pem` + +Collection of public keys from xapi-pool-tls.pem across the +pool. The public keys are collected in the certs-pool directory: each is +named after the UUID of its host and the bundle is constructed from +them. + +* bundle of public keys from hosts' `xapi-pool-tls.pem` +* constructed from PEM files in `certs-pool/` +* `/opt/xensource/bin/update-ca-bundle.sh` generates the bundle from PEM + files + +### `xapi-stunnel-ca-bundle.pem` and `certs/*.pem` + +User-supplied certificates; they are not essential for the operation of +a pool from Xapi's perspective. They make stunnel aware of certificates +used by clients when using HTTPS for API calls. + +* in a plain pool installation, these are empty; PEMs supplied by a user + are stored here and bundled into the `xapi-stunnerl-ca-bundle.pem`. 
+* bundle of public keys supploed by a user +* constructed from PEM files in `certs/` +* `/opt/xensource/bin/update-ca-bundle.sh` generates the bundle from PEM files +* Updated by a user using `xe pool-install-ca-certificate` +* `Pool.install_ca_certificate` +* `Pool.uninstall_ca_certificate` +* `xe pool-certificate-sync` explicitly distribute these certificates in + the pool. +* User-provided certificates can be used to let xapi connect to WLB. diff --git a/doc/content/xapi/storage/_index.md b/doc/content/xapi/storage/_index.md index ded2fd95c00..925d13296e4 100644 --- a/doc/content/xapi/storage/_index.md +++ b/doc/content/xapi/storage/_index.md @@ -3,112 +3,111 @@ title = "XAPI's Storage Layers" linkTitle = "Storage" +++ + {{% notice info %}} The links in this page point to the source files of xapi -[v1.127.0](https://github.com/xapi-project/xen-api/tree/v1.127.0), and xcp-idl -[v1.62.0](https://github.com/xapi-project/xcp-idl/tree/v1.62.0), not to the -latest source code. - -In the beginning of 2023, significant changes have been made in the layering. -In particular, the wrapper code from `storage_impl.ml` has been pushed down the -stack, below the mux, such that it only covers the SMAPIv1 backend and not -SMAPIv3. Also, all of the code (from xcp-idl etc) is now present in this repo -(xen-api). +[v25.11.0](https://github.com/xapi-project/xen-api/tree/v25.11.0). {{% /notice %}} + Xapi directly communicates only with the SMAPIv2 layer. There are no plugins directly implementing the SMAPIv2 interface, but the plugins in other layers are accessed through it: -{{}} +```mermaid graph TD A[xapi] --> B[SMAPIv2 interface] -B --> C[SMAPIv2 <-> SMAPIv1 translation: storage_access.ml] +B --> C[SMAPIv2 <-> SMAPIv1 state machine: storage_smapiv1_wrapper.ml] +C --> G[SMAPIv2 <-> SMAPIv1 translation: storage_smapiv1.ml] B --> D[SMAPIv2 <-> SMAPIv3 translation: xapi-storage-script] -C --> E[SMAPIv1 plugins] +G --> E[SMAPIv1 plugins] D --> F[SMAPIv3 plugins] -{{< /mermaid >}} +``` ## SMAPIv1 -These are the files related to SMAPIv1 in `xen-api/ocaml/xapi/`: +These are the files related to SMAPIv1 in [`/ocaml/xapi/`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi): -- [sm.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/sm.ml): +- [`sm.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/sm.ml): OCaml "bindings" for the SMAPIv1 Python "drivers" (SM) -- [sm_exec.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/sm_exec.ml): - support for implementing the above "bindings". The - parameters are converted to XML-RPC, passed to the relevant python - script ("driver"), and then the standard output of the program is - parsed as an XML-RPC response (we use - `xen-api-libs-transitional/http-svr/xMLRPC.ml` for parsing XML-RPC). +- [`sm_exec.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/sm_exec.ml): + support for implementing the above "bindings". + The parameters are converted to XML-RPC, passed to the relevant python script ("driver"), + and then the standard output of the program is parsed as an XML-RPC response (we use + [`ocaml/libs/http-lib/xMLRPC.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/libs/http-lib/xMLRPC.ml) + for parsing XML-RPC). 
When adding new functionality, we can modify `type call` to add parameters, but when we don't add any common ones, we should just pass the new parameters in the args record. -- `smint.ml`: Contains types, exceptions, ... for the SMAPIv1 OCaml - interface +- [`smint.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/smint.ml): + Contains types, exceptions, ... for the SMAPIv1 OCaml interface. +- [`storage_smapiv1_wrapper.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_smapiv1_wrapper.ml): + The [`Wrapper`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_smapiv1_wrapper.ml#L360) + module wraps a SMAPIv2 server (`Server_impl`) and takes care of + locking and datapaths (in case of multiple connections (=datapaths) + from VMs to the same VDI, using a state machine for SMAPIv1 operations. + It will use the superstate computed by the + [`vdi_automaton.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi-idl/storage/vdi_automaton.ml) + in xapi-idl) to compute the required actions to reach the desired state from the current one. + It also implements some functionality, like the `DP` module, that is not implemented in lower layers. +- [`storage_smapiv1.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_smapiv1.ml): + a SMAPIv2 server that translates SMAPIv2 calls to SMAPIv1 ones, by calling + [`ocaml/xapi/sm.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/sm.ml). + It calls passes the XML-RPC requests as the first command-line argument to the + corresponding Python script, which returns an XML-RPC response on standard + output. ## SMAPIv2 These are the files related to SMAPIv2, which need to be modified to implement new calls: -- [xcp-idl/storage/storage\_interface.ml](https://github.com/xapi-project/xcp-idl/blob/v1.62.0/storage/storage_interface.ml): +- [`ocaml/xapi-idl/storage/storage_interface.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi-idl/storage/storage_interface.ml): Contains the SMAPIv2 interface -- [xcp-idl/storage/storage\_skeleton.ml](https://github.com/xapi-project/xcp-idl/blob/v1.62.0/storage/storage_skeleton.ml): +- [`ocaml/xapi-idl/storage/storage_skeleton.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi-idl/storage/storage_skeleton.ml): A stub SMAPIv2 storage server implementation that matches the SMAPIv2 storage server interface (this is verified by - [storage\_skeleton\_test.ml](https://github.com/xapi-project/xcp-idl/blob/v1.62.0/storage/storage_skeleton_test.ml)), + [`storage_skeleton_test.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi-idl/storage/storage_skeleton_test.ml)), each of its function just raise a `Storage_interface.Unimplemented` error. This skeleton is used to automatically fill the unimplemented methods of the below storage servers to satisfy the interface. -- [xen-api/ocaml/xapi/storage\_access.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_access.ml): - [module SMAPIv1](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_access.ml#L104): - a SMAPIv2 server that does SMAPIv2 -> SMAPIv1 translation. 
- It passes the XML-RPC requests as the first command-line argument to the - corresponding Python script, which returns an XML-RPC response on standard - output. -- [xen-api/ocaml/xapi/storage\_impl.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_impl.ml): - The - [Wrapper](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_impl.ml#L302) - module wraps a SMAPIv2 server (Server\_impl) and takes care of - locking and datapaths (in case of multiple connections (=datapaths) - from VMs to the same VDI, it will use the superstate computed by the - [Vdi_automaton](https://github.com/xapi-project/xcp-idl/blob/v1.62.0/storage/vdi_automaton.ml) - in xcp-idl). It also implements some functionality, like the `DP` - module, that is not implemented in lower layers. -- [xen-api/ocaml/xapi/storage\_mux.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_mux.ml): +- [`ocaml/xapi/storage_mux.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_mux.ml): A SMAPIv2 server, which multiplexes between other servers. A different SMAPIv2 server can be registered for each SR. Then it forwards the calls for each SR to the "storage plugin" registered for that SR. +- [`ocaml/xapi/storage_smapiv1_wrapper.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_smapiv1_wrapper.ml): + Implements a state machine to compute SMAPIv1 actions needed to reach the desired state, see [SMAPIv1](#smapiv1). +- [`ocaml/xapi/storage_smapiv1.ml`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_smapiv1.ml): + Translates the SMAPIv2 calls to SMAPIv1, see [SMAPIv1](#smapiv1). ### How SMAPIv2 works: We use [message-switch] under the hood for RPC communication between -[xcp-idl](https://github.com/xapi-project/xcp-idl) components. The +[xapi-idl](https://github.com/xapi-project/xen-api/tree/v25.11.0/ocaml/xapi-idl) components. The main `Storage_mux.Server` (basically `Storage_impl.Wrapper(Mux)`) is [registered to -listen](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_access.ml#L1279) +listen](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_access.ml#L500) on the "`org.xen.xapi.storage`" queue [during xapi's -startup](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/xapi.ml#L801), +startup](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/xapi.ml#L1080), and this is the main entry point for incoming SMAPIv2 function calls. `Storage_mux` does not really multiplex between different plugins right now: [earlier during xapi's -startup](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/xapi.ml#L799), +startup](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/xapi.ml#L1076), the same SMAPIv1 storage server module [is -registered](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_access.ml#L934) +registered](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_access.ml#L934) on the various "`org.xen.xapi.storage.`" queues for each supported SR type. (This will change with SMAPIv3, which is accessed via a SMAPIv2 plugin outside of xapi that translates between SMAPIv2 and SMAPIv3.) 
Then, in -[Storage\_access.create\_sr](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_access.ml#L1531), +[Storage\_access.create\_sr](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_access.ml#L802), which is called -[during SR.create](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/xapi_sr.ml#L326), +[during SR.create](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/xapi_sr.ml#L391), and also -[during PBD.plug](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/xapi_pbd.ml#L121), +[during PBD.plug](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/xapi_pbd.ml#L175), the relevant "`org.xen.xapi.storage.`" queue needed for that PBD is [registered with Storage_mux in -Storage\_access.bind](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_access.ml#L1107) +Storage\_access.bind](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_access.ml#L267) for the SR of that PBD.\ So basically what happens is that xapi registers itself as a SMAPIv2 server, and forwards incoming function calls to itself through @@ -118,10 +117,10 @@ translation. #### Registration of the various storage servers -{{}} +```mermaid sequenceDiagram participant q as message-switch -participant v1 as Storage_access.SMAPIv1 +participant v1 as Storage_smapiv1.SMAPIv1 participant svr as Storage_mux.Server Note over q, svr: xapi startup, "Starting SMAPIv1 proxies" @@ -134,11 +133,11 @@ q ->> svr:org.xen.xapi.storage Note over q, svr: SR.create, PBD.plug svr ->> q:org.xapi.storage.sr_type_2 -{{< /mermaid >}} +``` #### What happens when a SMAPIv2 "function" is called -{{}} +```mermaid graph TD call[SMAPIv2 call] --VDI.attach2--> org.xen.xapi.storage @@ -148,24 +147,24 @@ org.xen.xapi.storage org.xen.xapi.storage.SR_type_x end -org.xen.xapi.storage --VDI.attach2--> Storage_impl.Wrapper +org.xen.xapi.storage --VDI.attach2--> Storage_smapiv1_wrapper.Wrapper subgraph xapi subgraph Storage_mux.server -Storage_impl.Wrapper --> Storage_mux.mux +Storage_smapiv1_wrapper.Wrapper --> Storage_mux.mux end -Storage_access.SMAPIv1 +Storage_smapiv1.SMAPIv1 end Storage_mux.mux --VDI.attach2--> org.xen.xapi.storage.SR_type_x -org.xen.xapi.storage.SR_type_x --VDI.attach2--> Storage_access.SMAPIv1 +org.xen.xapi.storage.SR_type_x --VDI.attach2--> Storage_smapiv1.SMAPIv1 subgraph SMAPIv1 driver_x[SMAPIv1 driver for SR_type_x] end -Storage_access.SMAPIv1 --vdi_attach--> driver_x -{{< /mermaid >}} +Storage_smapiv1.SMAPIv1 --vdi_attach--> driver_x +``` ### Interface Changes, Backward Compatibility, & SXM @@ -182,27 +181,23 @@ translation. However, the former has large portions of code in its intermediate layers, in addition to the basic SMAPIv2 <-> SMAPIv1 translation in `storage_access.ml`. 
-These are the three files in xapi that implement the SMAPIv2 storage interface, +These are the two files in xapi that implement the SMAPIv2 storage interface, from higher to lower level: -- [xen-api/ocaml/xapi/storage\_impl.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_impl.ml): -- [xen-api/ocaml/xapi/storage\_mux.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_mux.ml): -- [xen-api/ocaml/xapi/storage\_access.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_access.ml): +- [xen-api/ocaml/xapi/storage\_mux.ml](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_mux.ml): +- [xen-api/ocaml/xapi/storage\_access.ml](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_access.ml): Functionality implemented by higher layers is not implemented by the layers below it. -#### Extra functionality in `storage_impl.ml` - -In addition to its usual functions, `Storage_impl.Wrapper` also implements the -`UPDATES` and `TASK` SMAPIv2 APIs, without calling the wrapped module. +#### Extra functionality in `storage_task.ml` -These are backed by the `Updates`, `Task_server`, and `Scheduler` modules from +`storage_smapiv1_wrapper.ml` also implements the `UPDATES` and `TASK` SMAPIv2 APIs. These are backed by the `Updates`, `Task_server`, and `Scheduler` modules from xcp-idl, instantiated in xapi's `Storage_task` module. Migration code in `Storage_mux` will interact with these to update task progress. There is also an event loop in xapi that keeps calling `UPDATES.get` to keep the tasks in xapi's database in sync with the storage manager's tasks. -`Storage_impl.Wrapper` also implements the legacy `VDI.attach` call by simply +`Storage_smapiv1_wrapper.ml` also implements the legacy `VDI.attach` call by simply calling the newer `VDI.attach2` call in the same module. In general, this is a good place to implement a compatibility layer for deprecated functionality removed from other layers, because this is the first module that intercepts a @@ -210,39 +205,37 @@ SMAPIv2 call. #### Extra functionality in `storage_mux.ml` -`Storage_mux` implements storage motion (SXM): it implements the `DATA` and -`DATA.MIRROR` modules. Migration code will use the `Storage_task` module to run -the operations and update the task's progress. +`Storage_mux` redirects all storage motion (SXM) code to `storage_migrate.ml`, +and the multiplexed will be managed by `storage_migrate.ml`. The main implementation +resides in the `DATA` and `DATA.MIRROR` modules. Migration code will use +the `Storage_task` module to run the operations and update the task's progress. It also implements the `Policy` module from the SMAPIv2 interface. ## SMAPIv3 -[SMAPIv3](https://xapi-project.github.io/xapi-storage/) has a slightly -different interface from SMAPIv2.The -[xapi-storage-script](https://github.com/xapi-project/xapi-storage-script) -daemon is a SMAPIv2 plugin separate from xapi that is doing the SMAPIv2 -↔ SMAPIv3 translation. It keeps the plugins registered with xcp-idl -(their message-switch queues) up to date as their files appear or -disappear from the relevant directory. +[SMAPIv3](https://xapi-project.github.io/xapi-storage/) has a slightly different interface from SMAPIv2. 
+The +[`xapi-storage-script`](https://github.com/xapi-project/xen-api/tree/v25.11.0/ocaml/xapi-storage-script) +daemon is a SMAPIv2 plugin separate from xapi that is doing the SMAPIv2 ↔ SMAPIv3 translation. +It keeps the plugins registered with xapi-idl (their message-switch queues) +up to date as their files appear or disappear from the relevant directory. ### SMAPIv3 Interface The SMAPIv3 interface is defined using an OCaml-based IDL from the -[ocaml-rpc](https://github.com/mirage/ocaml-rpc) library, and is in this -repo: +[`ocaml-rpc`](https://github.com/mirage/ocaml-rpc) library, and is located at +[`xen-api/ocaml/xapi-storage`](https://github.com/xapi-project/xen-api/tree/v25.11.0/ocaml/xapi-storage) From this interface we generate -- OCaml RPC client bindings used in - [xapi-storage-script](https://github.com/xapi-project/xapi-storage-script) -- The [SMAPIv3 API - reference](https://xapi-project.github.io/xapi-storage) +- OCaml RPC client bindings used in `xapi-storage-script` +- The + [SMAPIv3 API reference](https://xapi-project.github.io/xapi-storage) - Python bindings, used by the SM scripts that implement the SMAPIv3 interface. - - These bindings are built by running "`make`" in the root - [xapi-storage](https://github.com/xapi-project/xapi-storage), - and appear in the` _build/default/python/xapi/storage/api/v5` + - These bindings are built by running `make` at the root level, + and appear in the` _build/default/ocaml/xapi-storage/python/xapi/storage/api/v5/` directory. - On a XenServer host, they are stored in the `/usr/lib/python3.6/site-packages/xapi/storage/api/v5/` @@ -265,24 +258,24 @@ stored in subdirectories of the `/usr/libexec/xapi-storage-script/volume/` and `/usr/libexec/xapi-storage-script/datapath/` directories, respectively. When it finds a new datapath plugin, it adds the plugin to a lookup table and -uses it the next time that datapath is required. When it finds a new volume -plugin, it binds a new [message-switch] queue named after the plugin's -subdirectory to a new server instance that uses these volume scripts. +uses it the next time that datapath is required. +When it finds a new volume plugin, it binds a new +[`message-switch`](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi-storage-script/main.ml#L2023) +queue named after the plugin's subdirectory to a new server instance that uses these volume scripts. To invoke a SMAPIv3 method, it executes a program named -`.` in the plugin's directory, for -example +`.` in the plugin's directory, +for example `/usr/libexec/xapi-storage-script/volume/org.xen.xapi.storage.gfs2/SR.ls`. The inputs to each script can be passed as command-line arguments and -are type-checked using the generated Python bindings, and so are the -outputs. The URIs of the SRs that xapi-storage-script knows about are -stored in the `/var/run/nonpersistent/xapi-storage-script/state.db` -file, these URIs can be used on the command line when an sr argument is -expected.` ` +are type-checked using the generated Python bindings, and so are the outputs. +The URIs of the SRs that xapi-storage-script knows about are stored in the + `/var/run/nonpersistent/xapi-storage-script/state.db` file, +these URIs can be used on the command line when an sr argument is expected. 
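+
+For ad-hoc debugging it can be useful to invoke one of these scripts directly, in the
+same way `xapi-storage-script` does. The sketch below is illustrative only: the argument
+names, their order and the SR URI are assumptions, so check the generated Python bindings
+for the exact calling convention of each method.
+
+```python
+import subprocess
+
+# One volume-plugin script, named <Interface>.<method> as described above.
+SCRIPT = "/usr/libexec/xapi-storage-script/volume/org.xen.xapi.storage.gfs2/SR.ls"
+
+# Assumed positional arguments: a debug string and an SR URI taken from
+# /var/run/nonpersistent/xapi-storage-script/state.db.
+result = subprocess.run(
+    [SCRIPT, "debug-string", "sr-uri-from-state.db"],
+    capture_output=True,
+    text=True,
+    check=True,
+)
+
+# The script prints its (type-checked) result on standard output.
+print(result.stdout)
+```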
#### Registration of the various SMAPIv3 plugins -{{}} +```mermaid sequenceDiagram participant q as message-switch participant v1 as (Storage_access.SMAPIv1) @@ -306,11 +299,11 @@ q ->> svr:org.xen.xapi.storage Note over q, svr: SR.create, PBD.plug svr ->> q:org.xapi.storage.sr_type_4 -{{< /mermaid >}} +``` #### What happens when a SMAPIv3 "function" is called -{{}} +```mermaid graph TD call[SMAPIv2 call] --VDI.attach2--> org.xen.xapi.storage @@ -348,28 +341,28 @@ end end org.xen.xapi.storage.SR_type_x --VDI.attach2-->xapi-storage-script -{{< /mermaid >}} +``` ## Error reporting In our SMAPIv1 OCaml "bindings" in xapi -([xen-api/ocaml/xapi/sm\_exec.ml](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/sm_exec.ml)), +([xen-api/ocaml/xapi/sm\_exec.ml](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/sm_exec.ml)), [when we inspect the error codes returned from a call to -SM](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/sm_exec.ml#L199), +SM](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/sm_exec.ml#L421), we translate some of the SMAPIv1/SM error codes to XenAPI errors, and for others, we just [construct an error code](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/sm_exec.ml#L214) of the form `SR_BACKEND_FAILURE_`. The file -[xcp-idl/storage/storage\_interface.ml](https://github.com/xapi-project/xcp-idl/blob/v1.62.0/storage/storage_interface.ml#L362) +[xcp-idl/storage/storage\_interface.ml](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi-idl/storage/storage_interface.ml#L343) defines a number of SMAPIv2 errors, ultimately all errors from the various SMAPIv2 storage servers in xapi will be returned as one of these. Most of the errors aren't converted into a specific exception in `Storage_interface`, but are simply wrapped with `Storage_interface.Backend_error`. The -[Storage\_access.transform\_storage\_exn](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/storage_access.ml#L29) +[Storage\_utils.transform\_storage\_exn](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/storage_utils.ml#L133) function is used by the client code in xapi to translate the SMAPIv2 errors into XenAPI errors again, this unwraps the errors wrapped with `Storage_interface.Backend_error`. @@ -379,7 +372,7 @@ errors into XenAPI errors again, this unwraps the errors wrapped with In the message forwarding layer, first we check the validity of VDI operations using `mark_vdi` and `mark_sr`. These first check that the operation is valid operations, -using [Xapi\_vdi.check\_operation\_error](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/xapi_vdi.ml#L57), +using [Xapi\_vdi.check\_operation\_error](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/xapi_vdi.ml#L65), for `mark_vdi`, which also inspects the current operations of the VDI, and then, if the operation is valid, it is added to the VDI's current operations, and update\_allowed\_operations is called. Then we forward @@ -390,7 +383,7 @@ VDI's SR. 
For the VDI operations, we check at two different places whether the SR is attached: first, at the Xapi level, [in -Xapi\_vdi.check\_operation\_error](https://github.com/xapi-project/xen-api/blob/v1.127.0/ocaml/xapi/xapi_vdi.ml#L98), +Xapi\_vdi.check\_operation\_error](https://github.com/xapi-project/xen-api/blob/v25.11.0/ocaml/xapi/xapi_vdi.ml#L133), for the resize operation, and then, at the SMAPIv1 level, in `Sm.assert_pbd_is_plugged`. `Sm.assert_pbd_is_plugged` performs the same checks, plus it checks that the PBD is attached to the localhost, diff --git a/doc/content/xapi/storage/sxm.md b/doc/content/xapi/storage/sxm/index.md similarity index 76% rename from doc/content/xapi/storage/sxm.md rename to doc/content/xapi/storage/sxm/index.md index 6c44e432d22..4a8a68ced52 100644 --- a/doc/content/xapi/storage/sxm.md +++ b/doc/content/xapi/storage/sxm/index.md @@ -2,9 +2,443 @@ Title: Storage migration --- +- [Overview](#overview) +- [SXM Multiplexing](#sxm-multiplexing) + - [Motivation](#motivation) + - [But we have storage\_mux.ml](#but-we-have-storage_muxml) + - [Thought experiments on an alternative design](#thought-experiments-on-an-alternative-design) + - [Design](#design) +- [SMAPIv1 migration](#smapiv1-migration) + - [Preparation](#preparation) + - [Establishing mirror](#establishing-mirror) + - [Mirror](#mirror) + - [Snapshot](#snapshot) + - [Copy and compose](#copy-and-compose) + - [Finish](#finish) +- [SMAPIv3 migration](#smapiv3-migration) + - [Preparation](#preparation-1) + - [Establishing mirror](#establishing-mirror-1) + - [Limitations](#limitations) + - [Finish](#finish-1) +- [Error Handling](#error-handling) + - [Preparation (SMAPIv1 and SMAPIv3)](#preparation-smapiv1-and-smapiv3) + - [Snapshot and mirror failure (SMAPIv1)](#snapshot-and-mirror-failure-smapiv1) + - [Mirror failure (SMAPIv3)](#mirror-failure-smapiv3) + - [Copy failure (SMAPIv1)](#copy-failure-smapiv1) +- [SMAPIv1 Migration implementation detail](#smapiv1-migration-implementation-detail) + - [Receiving SXM](#receiving-sxm) + - [Xapi code](#xapi-code) + - [Storage code](#storage-code) + - [Copying a VDI](#copying-a-vdi) + - [Mirroring a VDI](#mirroring-a-vdi) + - [Code walkthrough](#code-walkthrough) + - [DATA.copy](#datacopy) + - [DATA.copy\_into](#datacopy_into) + - [DATA.MIRROR.start](#datamirrorstart) + + ## Overview -{{}} +The core idea of storage migration is surprisingly simple: We have VDIs attached to a VM, +and we wish to migrate these VDIs from one SR to another. This necessarily requires +us to copy the data stored in these VDIs over to the new SR, which can be a long-running +process if there are gigabytes or even terabytes of them. We wish to minimise the +down time of this process to allow the VM to keep running as much as possible. + +At a very high level, the SXM process generally only consists of two stages: preparation +and mirroring. The preparation is about getting the receiving host ready for the +mirroring operation, while the mirroring itself can be further divided into two +more operations: 1. sending new writes to both sides; 2.copying existing data from +source to destination. The exact detail of how to set up a mirror differs significantly +between SMAPIv1 and SMAPIv3, but both of them will have to perform the above two +operations. +Once the mirroring is established, it is a matter of checking the status of the +mirroring and carry on with the follwoing VM migration. + +The reality is more complex than what we had hoped for. 
For example, in SMAPIv1, the mirror establishment is quite an involved process and is itself
+divided into several stages, which will be discussed in more detail later on.
+
+
+## SXM Multiplexing
+
+This section is about the design idea behind the additional layer of multiplexing specifically
+for Storage Xen Motion (SXM) from SRs using SMAPIv3. It is recommended that you have read the
+[introduction doc](_index.md) for the storage layer first to understand how storage
+multiplexing is done between SMAPIv2 and SMAPI{v1, v3} before reading this.
+
+
+### Motivation
+
+The existing SXM code was designed to work only with SMAPIv1 SRs, and therefore
+does not take into account the dramatic difference in the way SXM is done between
+SMAPIv1 and SMAPIv3. The exact difference will be covered later on in this doc; for this section
+it is sufficient to assume that they have two different ways of doing migration. Therefore,
+we need different code paths for migration from SMAPIv1 and SMAPIv3.
+
+#### But we have storage_mux.ml
+
+Indeed, storage_mux.ml is responsible for multiplexing and forwarding requests to
+the correct storage backend, based on the SR type that the caller specifies. And
+in fact, for inbound SXM to SMAPIv3 (i.e. migrating into a SMAPIv3 SR, GFS2 for example),
+storage_mux does the heavy lifting of multiplexing between different storage
+backends. Every time a `Remote.` call is invoked, it goes through the SMAPIv2
+layer to the remote host and gets multiplexed on the destination host, based on
+whether we are migrating into a SMAPIv1 or SMAPIv3 SR (see the diagram below).
+Inbound SXM is implemented by implementing the existing SMAPIv2 -> SMAPIv3 calls
+(see `import_activate` for example) which may not have been implemented before.
+
+![mux for inbound](sxm_mux_inbound.svg)
+
+While this works fine for inbound SXM, it does not work for outbound SXM. A typical SXM
+involves four combinations of source SR type (v1/v3) and destination SR
+type (v1/v3), and any of the four combinations is possible. We have already covered the
+destination multiplexing (v1/v3) by utilising storage_mux, and at this point we
+have run out of multiplexers for multiplexing on the source. In other words, we
+can only multiplex once for each SMAPIv2 call; we can use that chance for
+either the source or the destination, and we have already used it for the latter.
+
+
+#### Thought experiments on an alternative design
+
+To make it even more concrete, let us consider an example: the mirroring logic in
+SXM differs based on the source SR type of the SXM call. You might imagine
+defining a function like `MIRROR.start v3_sr v1_sr` that will be multiplexed
+by storage_mux based on the source SR type, and forwarded to storage_smapiv3_migrate,
+or even just xapi-storage-script, which is indeed quite possible.
+At this point we have already done the multiplexing, but we still wish to
+multiplex operations on destination SRs; for example, we might want to attach a
+VDI belonging to a SMAPIv1 SR on the remote host. But as we have already done the
+multiplexing and are now inside xapi-storage-script, we have lost any chance of doing
+any further multiplexing.
+
+### Design
+
+The idea of this new design is to introduce an additional multiplexing layer that
+is specific to multiplexing calls based on the source SR type, sketched below.
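+
+The real implementation is OCaml (`storage_migrate.ml` dispatching into
+`storage_smapi{v1,v3}_migrate.ml`), but the shape of the idea can be sketched in a few
+lines of toy Python; none of the names below exist in the code base:
+
+```python
+# Toy sketch of the extra multiplexing layer: pick the outbound code path
+# from the *source* SR type, while either path can still reach the
+# destination through ordinary SMAPIv2 ("Remote") calls, which storage_mux
+# resolves on the destination host as before.
+
+def smapiv1_mirror_start(src_sr, dest_sr, vdi):
+    print(f"SMAPIv1 path: snapshot, mirror and copy {vdi} from {src_sr} to {dest_sr}")
+
+def smapiv3_mirror_start(src_sr, dest_sr, vdi):
+    print(f"SMAPIv3 path: backend-driven mirror of {vdi} from {src_sr} to {dest_sr}")
+
+def mirror_start(src_sr_type, src_sr, dest_sr, vdi):
+    if src_sr_type == "smapiv1":
+        smapiv1_mirror_start(src_sr, dest_sr, vdi)
+    else:
+        smapiv3_mirror_start(src_sr, dest_sr, vdi)
+
+mirror_start("smapiv3", "sr-src", "sr-dest", "vdi-1")
+```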
For example, in +the diagram below the `send_start src_sr dest_sr` will take both the src SR and the +destination SR as parameters, and suppose the mirroring logic is different for different +types of source SRs (i.e. SMAPIv1 or SMAPIv3), the storage migration code will +necessarily choose the right code path based on the source SR type. And this is +exactly what is done in this additional multiplexing layer. The respective logic +for doing {v1,v3}-specifi mirroring, for example, will stay in storage_smapi{v1,v3}_migrate.ml + +![mux for outbound](sxm_mux_outbound.svg) + +Note that later on storage_smapi{v1,v3}_migrate.ml will still have the flexibility +to call remote SMAPIv2 functions, such as `Remote.VDI.attach dest_sr vdi`, and +it will be handled just as before. + +## SMAPIv1 migration + +This section is about migration from SMAPIv1 SRs to SMAPIv1 or SMAPIv3 SRs, since +the migration is driven by the source host, it is usally the source host that +determines most of the logic during a storage migration. + +First we take a look at an overview diagram of what happens during SMAPIv1 SXM: +the diagram is labelled with S1, S2 ... which indicates different stages of the migration. +We will talk about each stage in more detail below. + +![overview-v1](sxm-overview-v1.svg) + +### Preparation + +Before we can start our migration process, there are a number of preparations +needed to prepare for the following mirror. For SMAPIv1 this involves: + +1. Create a new VDI (called leaf) that will be used as the receiving VDI for all the new writes +2. Create a dummy snapshot of the VDI above to make sure it is a differencing disk and can be composed later on +3. Create a VDI (called parent) that will be used to receive the existing content of the disk (the snapshot) + +Note that the leaf VDI needs to be attached and activated on the destination host (to a non-exsiting `mirror_vm`) +since it will later on accept writes to mirror what is written on the source host. + +The parent VDI may be created in two different ways: 1. If there is a "similar VDI", +clone it on the destination host and use it as the parent VDI; 2. If there is no +such VDI, create a new blank VDI. The similarity here is defined by the distances +between different VDIs in the VHD tree, which is exploiting the internal representation +of the storage layer, hence we will not go into too much detail about this here. + +Once these preparations are done, a `mirror_receive_result` data structure is then +passed back to the source host that will contain all the necessary information about +these new VDIs, etc. + +### Establishing mirror + +At a high level, mirror establishment for SMAPIv1 works as follows: + +1. Take a snapshot of a VDI that is attached to VM1. This gives us an immutable +copy of the current state of the VDI, with all the data up until the point we took +the snapshot. This is illustrated in the diagram as a VDI and its snapshot connecting +to a shared parent, which stores the shared content for the snapshot and the writable +VDI from which we took the snapshot (snapshot) +2. Mirror the writable VDI to the server hosts: this means that all writes that goes to the +client VDI will also be written to the mirrored VDI on the remote host (mirror) +3. Copy the immutable snapshot from our local host to the remote (copy) +4. Compose the mirror and the snapshot to form a single VDI +5. 
Destroy the snapshot on the local host (cleanup)
+
+#### Mirror
+
+The mirroring process for SMAPIv1 is rather unconventional, so it is worth
+documenting how it works. Instead of a conventional client-server architecture,
+where the source client connects to the destination server directly through the
+NBD protocol in tapdisk, the connection is established in xapi and then passed
+on to tapdisk. It was done in this rather unusual way mainly due to authentication
+issues. Because it is xapi that creates the connection, tapdisk does not need
+to be concerned about authentication of the connection, thus simplifying the storage
+component. This is reasonable, as the storage component should focus on handling
+storage requests rather than worrying about network security.
+
+The diagram below illustrates this process. First, xapi on the source host
+initiates an https request to the remote xapi. This request contains the necessary
+information about the VDI to be mirrored, the SR that contains it, etc. This
+information is then passed on to the https handler on the destination host (called
+`nbd_handler`), which processes it. Now the unusual step is that
+both the source and the destination xapi pass this connection on to tapdisk,
+by sending the fd representing the socket connection to the tapdisk process. On
+the source this is the nbd client process of tapdisk, and on the destination
+it is the nbd server process of tapdisk. After this step, we can consider
+a client-server connection established between the two tapdisks, as if the
+tapdisk on the source host had made a request to the tapdisk on the
+destination host and initiated the connection. On the diagram, this is indicated
+by the dashed lines between the tapdisk processes. Logically, we can view this as
+xapi creating the connection and then passing it down into tapdisk.
+
+![mirror](sxm-mirror-v1.svg)
+
+#### Snapshot
+
+The next step is to create a snapshot of the VDI. This is easily done as a
+`VDI.snapshot` operation. If the VDI was in VHD format, then internally this
+creates two children: one for the snapshot, which only contains the metadata
+information and tends to be small, and one for the writable VDI where all the
+new writes will go. The shared base copy contains the shared blocks.
+
+![snapshot](sxm-snapshot-v1.svg)
+
+#### Copy and compose
+
+Once the snapshot is created, we can then copy it from the source
+to the destination. This step is done by `sparse_dd` using the nbd protocol, and it
+is also the step that takes the most time to complete.
+
+`sparse_dd` is a process forked by xapi that does the copying of the disk blocks.
+`sparse_dd` supports a number of protocols, including nbd. In this case, `sparse_dd`
+will initiate an https put request to the destination host, with a url of the form
+`
/services/SM/nbdproxy//`. This https request then +gets handled by the https handler on the destination host B, which will then spawn +a handler thread. This handler will find the +"generic" nbd server[^2] of either tapdisk or qemu-dp, depending on the destination +SR type, and then start proxying data between the https connection socket and the +socket connected to the nbd server. + +[^2]: The server is generic because it does not accept fd passing, and I call those +"special" nbd server/fd receiver. + +![sxm new copy](sxm-new-copy-v1.svg) + +Once copying is done, the snapshot and mirrored VDI can be then composed into a +single VDI. + +#### Finish + +At this point the VDI is synchronised to the new host! Mirror is still working at this point +though because that will not be destroyed until the VM itself has been migrated +as well. Some cleanups are done at this point, such as deleting the snapshot +that is taken on the source, destroying the mirror datapath, etc. + +The end results look like the following. Note that VM2 is in dashed line as it +is not yet created yet. The next steps would be to migrate the VM1 itself to the +destination as well, but this is part of the VM migration process and will not +be covered here. + +![final](sxm-final-v1.svg) + + +## SMAPIv3 migration + +This section covers the mechanism of migrations *from* SRs using SMAPIv3 (to +SMAPIv1 or SMAPIv3). Although the core ideas are the same, SMAPIv3 has a rather +different mechanism for mirroring: 1. it does not require xapi to take snapshot +of the VDI anymore, since the mirror itself will take care of replicating the +existing data to the destination; 2. there is no fd passing for connection establishment anymore, and instead proxies are used for connection setup. + +### Preparation + +The preparation work for SMAPIv3 is greatly simplified by the fact that the mirror +at the storage layer will copy the existing data in the VDI to the destination. +This means that snapshot of the source VDI is not required anymore. So we are left +with only one thing: + +1. Create a VDI used for mirroring the data of the source VDI + +For this reason, the implementation logic for SMAPIv3 preparation is also shorter, +as the complexity is now handled by the storage layer, which is where it is supposed +to be handled. + +### Establishing mirror + +The other significant difference is that the storage backend for SMAPIv3 `qemu-dp` +SRs no longer accepts fds, so xapi needs to proxy the data between two nbd client +and nbd server. + +SMAPIv3 provides the `Data.mirror uri domain remote` which needs three parameters: +`uri` for accessing the local disk, `doamin` for the domain slice on which mirroring +should happen, and most importantly for this design, a `remote` url which represents +the remote nbd server to which the blocks of data can be sent to. + +This function itself, when called by xapi and forwarded to the storage layer's qemu-dp +nbd client, will initiate a nbd connection to the nbd server pointed to by `remote`. +This works fine when the storage migration happens entirely within a local host, +where qemu-dp's nbd client and nbd server can communicate over unix domain sockets. +However, it does not work for inter-host migrations as qemu-dp's nbd server is not +exposed publicly over the network (just as tapdisk's nbd server). Therefore a proxying +service on the source host is needed for forwarding the nbd connection from the +source host to the destination host. 
And it would be the responsiblity of +xapi to manage this proxy service. + +The following diagram illustrates the mirroring process of a single VDI: + +![sxm mirror](sxm-mirror-v3.svg) + +The first step for xapi is then to set up a nbd proxy thread that will be listening +on a local unix domain socket with path `/var/run/nbdproxy/export/` where +domain is the `domain` parameter mentioned above in `Data.mirror`. The nbd proxy +thread will accept nbd connections (or rather any connections, it does not +speak/care about nbd protocol at all) and sends an https put request +to the remote xapi. The proxy itself will then forward the data exactly as it is +to the remote side through the https connection. + +Once the proxy is set up, xapi will call `Data.mirror`, which +will be forwarded to the xapi-storage-script and is further forwarded to the qemu-dp. +This call contains, among other parameters, the destination NBD server url (`remote`) +to be connected. In this case the destination nbd server is exactly the domain +socket to which the proxy thread is listening. Therefore the `remote` parameter +will be of the form `nbd+unix:///?socket=` where the export is provided +by the destination nbd server that represents the VDI prepared on the destination +host, and the socket will be the path of the unix domain socket where the proxy +thread (which we just created) is listening at. + +When this connection is set up, the proxy process will talk to the remote xapi via +https requests, and on the remote side, an https handler will proxy this request to +the appropriate nbd server of either tapdisk or qemu-dp, using exactly the same +[import proxy](#copy-and-compose) as mentioned before. + +Note that this proxying service is tightly integrated with outbound SXM of SMAPIv3 +SRs. This is to make it simple to focus on the migration itself. + +Although there is no need to explicitly copy the VDI anymore, we still need to +transfer the data and wait for it finish. For this we use `Data.stat` call provided +by the storage backend to query the status of the mirror, and wait for it to finish +as needed. + +#### Limitations + +This way of establishing the connection simplifies the implementation of the migration +for SMAPIv3, but it also has limitations: + +One proxy per live VDI migration is needed, which can potentially consume lots of resources in dom0, and we should measure the impact of this before we switch to using more resource-efficient ways such as wire guard that allows establishing a single connection between multiple hosts. + + +### Finish + +As there is no need to copy a VDI, there is also no need to compose or delete the +snapshot. The cleanup procedure would therefore just involve destroy the datapath +that was used for receiving writes for the mirrored VDI. + +## Error Handling + +Storage migration is a long-running process, and is prone to failures in each +step. Hence it is important specifying what errors could be raised at each step +and their significance. This is beneficial both for the user and for triaging. + +There are two general cleanup functions in SXM: `MIRROR.receive_cancel` and +`MIRROR.stop`. The former is for cleaning up whatever has been created by `MIRROR.receive_start` +on the destination host (such as VDIs for receiving mirrored data). The latter is +a more comprehensive function that attempts to "undo" all the side effects that +was done during the SXM, and also calls `receive_cancel` as part of its operations. 
+ +Currently error handling was done by building up a list of cleanup functions in +the `on_fail` list ref as the function executes. For example, if the `receive_start` +has been completed successfully, add `receive_cancel` to the list of cleanup functions. +And whenever an exception is encountered, just execute whatever has been added +to the `on_fail` list ref. This is convenient, but does entangle all the error +handling logic with the core SXM logic itself, making the code rather than hard +to understand and maintain. + +The idea to fix this is to introduce explicit "stages" during the SXM and define +explicitly what error handling should be done if it fails at a certain stage. This +helps separate the error handling logic into the `with` part of a `try with` block, +which is where they are supposed to be. Since we need to accommodate the existing +SMAPIv1 migration (which has more stages than SMAPIv3), the following stages are +introduced: preparation (v1,v3), snapshot(v1), mirror(v1, v3), copy(v1). Note that +each stage also roughly corresponds to a helper function that is called within `Storage_migrate.start`, +which is the wrapper function that initiates storage migration. And each helper +functions themselves would also have error handling logic within themselves as +needed (e.g. see `Storage_smapiv1_migrate.receive_start`) to deal with exceptions +that happen within each helper functions. + +### Preparation (SMAPIv1 and SMAPIv3) + +The preparation stage generally corresponds to what is done in `receive_start`, and +this function itself will handle exceptions when there are partial failures within +the function itself, such as an exception after the receiving VDI is created. +It will use the old-style `on_fail` function but only with a limited scope. + +There is nothing to be done at a higher level (i.e within `MIRROR.start` which +calls `receive_start`) if preparation has failed. + +### Snapshot and mirror failure (SMAPIv1) + +For SMAPIv1, the mirror is done in a bit cumbersome way. The end goal is to establish +connections between two tapdisk processes on the source and destination hosts. +To achieve this goal, xapi will do two main jobs: 1. create a connection between two +hosts and pass the connection to tapdisk; 2. create a snapshot as a starting point +of the mirroring process. + +Therefore handling of failures at these two stages are similar: clean up what was +done in the preparation stage by calling `receive_cancel`, and that is almost it. +Again, we will leave whatever is needed for partial failure handling within those +functions themselves and only clean up at a stage-level in `storage_migrate.ml` + +Note that `receive_cancel` is a multiplexed function for SMAPIv1 and SMAPIv3, which +means different clean up logic will be executed depending on what type of SR we +are migrating from. + +### Mirror failure (SMAPIv3) + +The `Data.stat` call in SMAPIv3 returns a data structure that includes the current +progress of the mirror job, whether it has completed syncing the existing data and +whether the mirorr has failed. Similar to how it is done in SMAPIv1, we wait for +the sync to complete once we issue the `Data.mirror` call, by repeatedly polling +the status of the mirror using the `Data.stat` call. 
During this process, the status +of the mirror is also checked, and if a failure is detected, a `Migration_mirror_failure` +will be raised and then handled by the code in `storage_migrate.ml` by calling +`Storage_smapiv3_migrate.receive_cancel2`, which will clean up the mirror datapath +and destroy the mirror VDI, similar to what is done in SMAPIv1. + + +### Copy failure (SMAPIv1) + +The final step of storage migration for SMAPIv1 is to copy the snapshot from the +source to the destination. At this stage, most of the side effectful work has been +done, so we do need to call `MIRROR.stop` to clean things up if we experience a +failure during copying. + + +## SMAPIv1 Migration implementation detail + +{{% notice info %}} +The following doc refers to a [version](https://github.com/xapi-project/xen-api/blob/v24.37.0/ocaml/xapi/storage_migrate.ml) +of xapi before 24.37, after which point this code structure has undergone +many changes as part of adding support for SMAPIv3 SXM. Therefore the following +tutorial might be less relevant in terms of implementation detail, although +the general principles should remain the same. +{{% /notice %}} + +```mermaid sequenceDiagram participant local_tapdisk as local tapdisk participant local_smapiv2 as local SMAPIv2 @@ -129,7 +563,7 @@ opt post_detach_hook end Note over xapi: memory image migration by xenopsd Note over xapi: destroy the VM record -{{< /mermaid >}} +``` ### Receiving SXM @@ -162,7 +596,7 @@ the receiving end of storage motion: This is how xapi coordinates storage migration. We'll do it as a code walkthrough through the two layers: xapi and storage-in-xapi (SMAPIv2). -## Xapi code +### Xapi code The entry point is in [xapi_vm_migration.ml](https://github.com/xapi-project/xen-api/blob/f75d51e7a3eff89d952330ec1a739df85a2895e2/ocaml/xapi/xapi_vm_migrate.ml#L786) @@ -1056,7 +1490,7 @@ We also try to remove the VM record from the destination if we managed to send i Finally we check for mirror failure in the task - this is set by the events thread watching for events from the storage layer, in [storage_access.ml](https://github.com/xapi-project/xen-api/blob/f75d51e7a3eff89d952330ec1a739df85a2895e2/ocaml/xapi/storage_access.ml#L1169-L1207) -## Storage code +### Storage code The part of the code that is conceptually in the storage layer, but physically in xapi, is located in [storage_migrate.ml](https://github.com/xapi-project/xen-api/blob/f75d51e7a3eff89d952330ec1a739df85a2895e2/ocaml/xapi/storage_migrate.ml). There are logically a few separate parts to this file: @@ -1069,7 +1503,7 @@ The part of the code that is conceptually in the storage layer, but physically i Let's start by considering the way the storage APIs are intended to be used. -### Copying a VDI +#### Copying a VDI `DATA.copy` takes several parameters: @@ -1119,7 +1553,7 @@ The implementation uses the `url` parameter to make SMAPIv2 calls to the destina The implementation tries to minimize the amount of data copied by looking for related VDIs on the destination SR. See below for more details. -### Mirroring a VDI +#### Mirroring a VDI `DATA.MIRROR.start` takes a similar set of parameters to that of copy: @@ -1156,11 +1590,11 @@ Note that state is a list since the initial phase of the operation requires both Additionally the mirror can be cancelled using the `MIRROR.stop` API call.
-### Code walkthrough +#### Code walkthrough let's go through the implementation of `copy`: -#### DATA.copy +##### DATA.copy ```ocaml let copy ~task ~dbg ~sr ~vdi ~dp ~url ~dest = @@ -1296,7 +1730,7 @@ Finally we snapshot the remote VDI to ensure we've got a VDI of type 'snapshot' The exception handler does nothing - so we leak remote VDIs if the exception happens after we've done our cloning :-( -#### DATA.copy_into +##### DATA.copy_into Let's now look at the data-copying part. This is common code shared between `VDI.copy`, `VDI.copy_into` and `MIRROR.start` and hence has some duplication of the calls made above. @@ -1467,7 +1901,7 @@ The last thing we do is to set the local and remote content_id. The local set_co Here we perform the list of cleanup operations. Theoretically. It seems we don't ever actually set this to anything, so this is dead code. -#### DATA.MIRROR.start +##### DATA.MIRROR.start ```ocaml let start' ~task ~dbg ~sr ~vdi ~dp ~url ~dest = @@ -1765,3 +2199,4 @@ let pre_deactivate_hook ~dbg ~dp ~sr ~vdi = s.failed <- true ) ``` + diff --git a/doc/content/xapi/storage/sxm/sxm-final-v1.svg b/doc/content/xapi/storage/sxm/sxm-final-v1.svg new file mode 100644 index 00000000000..7cdb2d540a3 --- /dev/null +++ b/doc/content/xapi/storage/sxm/sxm-final-v1.svg @@ -0,0 +1,4 @@ + + + +
VM1
Host1
VDI
Host2
VDI
VM2
SR1
Mirror
SR2
\ No newline at end of file diff --git a/doc/content/xapi/storage/sxm/sxm-mirror-v1.svg b/doc/content/xapi/storage/sxm/sxm-mirror-v1.svg new file mode 100644 index 00000000000..4b6f61131c5 --- /dev/null +++ b/doc/content/xapi/storage/sxm/sxm-mirror-v1.svg @@ -0,0 +1,4 @@ + + + +
xapi
xapi
VDI
VDI
xapi
xapi
tapdisk
tapdisk
Host A
Host A
Host B
Host B
http connection
http connection
pass client socket of the http connection
via SCM_RIGHTS
pass client socket o...
tapdisk
tapdisk
http handler
http handler
pass server socket of the http connection
pass server socket o...
VDI
VDI
mirror
mirror
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/content/xapi/storage/sxm/sxm-mirror-v3.svg b/doc/content/xapi/storage/sxm/sxm-mirror-v3.svg new file mode 100644 index 00000000000..8ed03406acc --- /dev/null +++ b/doc/content/xapi/storage/sxm/sxm-mirror-v3.svg @@ -0,0 +1,4 @@ + + + +
xapi
xapi
Source Host A
Destination Host B
tapdisk
qemu-dp
generic nbd server
generic nbd server
xapi-storage-script
Data.mirror 
qemu-dp 
nbd client
Data.mirror 
nbd exporting proxy
http handler
http request
nbd import proxy
Legend
belongs/spawns
talks to
\ No newline at end of file diff --git a/doc/content/xapi/storage/sxm/sxm-new-copy-v1.svg b/doc/content/xapi/storage/sxm/sxm-new-copy-v1.svg new file mode 100644 index 00000000000..891913850d3 --- /dev/null +++ b/doc/content/xapi/storage/sxm/sxm-new-copy-v1.svg @@ -0,0 +1,4 @@ + + + +
xapi
xapi
Host A
Host B
tapdisk
http connection
qemu-dp
generic nbd server
generic nbd server
proxy
sparse_dd
http handler
\ No newline at end of file diff --git a/doc/content/xapi/storage/sxm/sxm-overview-v1.svg b/doc/content/xapi/storage/sxm/sxm-overview-v1.svg new file mode 100644 index 00000000000..b6002382db2 --- /dev/null +++ b/doc/content/xapi/storage/sxm/sxm-overview-v1.svg @@ -0,0 +1,4 @@ + + + +
VM1
Host1
VDI
VDI snapshot
Host2
VDI
VDI snapshot
VM2
SR1
SR2
S2:Mirror
S1:Snapshot
S3: Copy
S4: Compose
\ No newline at end of file diff --git a/doc/content/xapi/storage/sxm/sxm-snapshot-v1.svg b/doc/content/xapi/storage/sxm/sxm-snapshot-v1.svg new file mode 100644 index 00000000000..5fe0f398c17 --- /dev/null +++ b/doc/content/xapi/storage/sxm/sxm-snapshot-v1.svg @@ -0,0 +1,4 @@ + + + +
VDI
VDI snapshot
base
\ No newline at end of file diff --git a/doc/content/xapi/storage/sxm/sxm_mux_inbound.svg b/doc/content/xapi/storage/sxm/sxm_mux_inbound.svg new file mode 100644 index 00000000000..c38bc36ae5f --- /dev/null +++ b/doc/content/xapi/storage/sxm/sxm_mux_inbound.svg @@ -0,0 +1,4 @@ + + + +
Storage_migrate.start
Host A
Host B
v1
v3
storage_mux server
Remote.VDI.attach
storage_mux server
rpc
\ No newline at end of file diff --git a/doc/content/xapi/storage/sxm/sxm_mux_outbound.svg b/doc/content/xapi/storage/sxm/sxm_mux_outbound.svg new file mode 100644 index 00000000000..915cc7550e3 --- /dev/null +++ b/doc/content/xapi/storage/sxm/sxm_mux_outbound.svg @@ -0,0 +1,4 @@ + + + +
Storage_migrate.start
Host A
Host B
storage_mux server
Remote.VDI.attach
vdi dst_sr
MIRROR.send_start
src_sr dst_sr
storage_smapiv1_migrate.
send_start
storage_smapiv3_migrate.
send_start
RPC to host B
....
new multiplexing layer
mux based on src_sr
\ No newline at end of file diff --git a/doc/content/xcp-networkd/host-network-device-ordering-on-networkd.md b/doc/content/xcp-networkd/host-network-device-ordering-on-networkd.md new file mode 100644 index 00000000000..e142932d18a --- /dev/null +++ b/doc/content/xcp-networkd/host-network-device-ordering-on-networkd.md @@ -0,0 +1,342 @@ +--- +title: Host Network Device Ordering on Networkd +description: How does the host network device ordering work on networkd. +--- + +Purpose +------- + +One of the Toolstack's functions is to maintain a pool of hosts. A pool can be +constructed by joining a host into an existing pool. One challenge in this +process is determining which pool-wide network a network device on the joining +host should connect to. + +At first glance, this could be resolved by specifying a mapping between an +individual network device and a pool-wide network. However, this approach +would be burdensome for administrators when managing many hosts. It would be +more efficient if the Toolstack could determine this automatically. + +To achieve this, the Toolstack components on two hosts need to independently +work out consistent identifications for the host network devices and connect +the network devices with the same identification to the same pool-wide network. +The identifications on a host can be considered as an order, with each network +device assigned a unique position in the order as its identification. Network +devices with the same position will connect to the same network. + + +The assumption +-------------- + +Why can the Toolstack components on two hosts independently work out an expected +order without any communication? This is possible only under the assumption that +the hosts have identical hardware, firmware, software, and the way +network devices are plugged into them. For example, an administrator will always +plug the network devices into the same PCI slot position on multiple hosts if +they want these network devices to connect to the same network. + +The ordering is considered consistent if the positions of such network devices +(plugged into the same PCI slot position) in the generated orders are the same. + + +The biosdevname +--------------- +Particularly, when the assumption above holds, a consistent initial order can be +worked out on multiple hosts independently with the help of `biosdevname`. The +"all_ethN" policy of the `biosdevname` utility can generate a device order based +on whether the device is embedded or not, PCI cards in ascending slot order, and +ports in ascending PCI bus/device/function order breadth-first. Since the hosts +are identical, the orders generated by the `biosdevname` are consistent across +the hosts. + +An example of `biosdevname`'s output is as the following. The initial order can +be derived from the `BIOS device` field. + +``` +# biosdevname --policy all_ethN -d -x +BIOS device: eth0 +Kernel name: enp5s0 +Permanent MAC: 00:02:C9:ED:FD:F0 +Assigned MAC : 00:02:C9:ED:FD:F0 +Bus Info: 0000:05:00.0 +... + +BIOS device: eth1 +Kernel name: enp5s1 +Permanent MAC: 00:02:C9:ED:FD:F1 +Assigned MAC : 00:02:C9:ED:FD:F1 +Bus Info: 0000:05:01.0 +... +``` + +However, the `BIOS device` of a particular network device may change with the +addition or removal of devices. For example: + +``` +# biosdevname --policy all_ethN -d -x +BIOS device: eth0 +Kernel name: enp4s0 +Permanent MAC: EC:F4:BB:E6:D7:BB +Assigned MAC : EC:F4:BB:E6:D7:BB +Bus Info: 0000:04:00.0 +... 
+ +BIOS device: eth1 +Kernel name: enp5s0 +Permanent MAC: 00:02:C9:ED:FD:F0 +Assigned MAC : 00:02:C9:ED:FD:F0 +Bus Info: 0000:05:00.0 +... + +BIOS device: eth2 +Kernel name: enp5s1 +Permanent MAC: 00:02:C9:ED:FD:F1 +Assigned MAC : 00:02:C9:ED:FD:F1 +Bus Info: 0000:05:01.0 +... +``` + +Therefore, the order derived from these values is used solely for determining +the initial order and the order of newly added devices. + +Principles +----------- +* Initially, the order is aligned with PCI slots. This is to make the connection +between cabling and order predictable: The network devices in identical PCI +slots have the same position. The rationale is that PCI slots are more +predictable than MAC addresses and correspond to physical locations. + +* Once a previous order has been established, the ordering should be maintained +as stable as possible despite changes to MAC addresses or PCI addresses. The +rationale is that the assumption is less likely to hold as long as the hosts are +experiencing updates and maintenance. Therefore, maintaining a stable order is +the best choice for automatic ordering. + +Notation +-------- + +``` +mac:pci:position +!mac:pci:position +``` + +A network device is characterised by + +* MAC address, which is unique. +* PCI slot, which is not unique and multiple network devices can share a PCI +slot. PCI addresses correspond to hardware PCI slots and thus are physically +observable. +* position, the position assigned to this network device by xcp-networkd. At any +given time, no position is assigned twice but the sequence of positions may have +holes. +* The `!mac:pci:position` notation indicates that this position was previously +used but is currently free because the device it was assigned to was removed. + +On a Linux system, MAC and PCI addresses have specific formats. However, for +simplicity, symbolic names are used here: MAC addresses use lowercase letters, +PCI addresses use uppercase letters, and positions use numbers. + +Scenarios +--------- + +### The initial order + +As mentioned above, `biosdevname` can be used to generate consistent orders +for the network devices on multiple hosts. + +``` +current input: a:A b:D c:C +initial order: a:A:0 c:C:1 b:D:2 +``` + +This only works if the assumption of identical hardware, firmware, software, and +network device placement holds. It is considered that the assumption will +hold for the majority of use cases. + +Otherwise, the order can be generated from a user's configuration. The user can +specify the order explicitly for individual hosts. However, administrators would +prefer to avoid this as much as possible when managing many hosts. + +``` +user spec: a::0 c::1 b::2 +current input: a:A b:D c:C +initial order: a:A:0 c:C:1 b:D:2 +``` + +### Keep the order as stable as possible + +Once an initial order is created on an individual host, it should be kept as +stable as possible across host boot-ups and at runtime. For example, unless +there are hardware changes, the position of a network device in the initial +order should remain the same regardless of how many times the host is rebooted. + +To achieve this, the initial order should be saved persistently on the host's +local storage so it can be referenced in subsequent orderings. When performing +another ordering after the initial order has been saved, the position of a +currently unordered network device should be determined by finding its position +in the last saved order.
The MAC address of the network device is a reliable +attribute for this purpose, as it is considered unique for each network device +globally. + +Therefore, the network devices in the saved order should have their MAC +addresses saved together, effectively mapping each position to a MAC address. +When performing an ordering, the stable position can be found by searching the +last saved order using the MAC address. + +``` +last order: a:A:0 c:C:1 b:D:2 +current input: a:A b:D c:C +new order: a:A:0 c:C:1 b:D:2 +``` + +Name labels of the network devices are not considered reliable enough to +identify particular devices. For example, if the name labels are determined by +the PCI address via systemd, and a firmware update changes the PCI addresses of +the network devices, the name labels will also change. + +The PCI addresses are not considered reliable either. They may change due to +firmware updates/setting changes or even plugging/unplugging other devices. + +``` +last order: a:A:0 c:C:1 b:D:2 +current input: a:A b:B c:E +new order: a:A:0 c:E:1 b:B:2 +``` + +### Replacement + +However, what happens when the MAC address of an unordered network device cannot +be found in the last saved order? There are two possible scenarios: + +1. It's a newly added network device since the last ordering. +2. It's a new device that replaces an existing network device. + +Replacement is a supported scenario, as an administrator might replace a broken +network device with a new one. + +This can be recognized by comparing the PCI address where the network device is +located. Therefore, the PCI address of each network device should also be saved +in the order. In this case, searching the PCI address in the order results in +one of the following: + +1. Not found: This means the PCI address was not occupied during the last +ordering, indicating a newly added network device. +2. Found with a MAC address, but another device with this MAC address is still +present in the system: This suggests that the PCI address of an existing +network device (with the same MAC address) has changed since the last ordering. +This may be caused by a device move or by something else, such as a firmware update. In +this case, the current unordered network device is considered newly added. + +``` +last order: a:A:0 c:C:1 b:D:2 +current input: a:A b:B c:C d:D +new order: a:A:0 c:C:1 b:B:2 d:D:3 +``` + +3. Found with a MAC address, and no current devices have this MAC address: This +indicates that a new network device has replaced the old one in the same PCI +slot. +The replacing network device should be assigned the same position as the +replaced one. + +``` +last order: a:A:0 c:C:1 b:D:2 +current input: a:A c:C d:D +new order: a:A:0 c:C:1 d:D:2 +``` + +### Removed devices + +A network device can be removed or unplugged since the last ordering. Its +position, MAC address, and PCI address are saved for future reference, and its +position will be reserved. This means there may be a gap in the order: a +position that was previously assigned to a network device is now vacant because +the device has been removed. + +``` +last order: a:A:0 c:C:1 d:D:2 +current input: a:A d:D +new order: a:A:0 !c:C:1 d:D:2 +``` + +### Newly added devices + +As long as `the assumption` holds, newly added devices since the last ordering +can be assigned positions consistently across multiple hosts. Newly added +devices will not be assigned the positions reserved for removed devices.
+ +``` +last order: a:A:0 !c:C:1 d:D:2 +current input: a:A d:D e:E +new order: a:A:0 !c:C:1 d:D:2 e:E:3 +``` + +### Removed and then added back + +It is a supported scenario for a removed device to be plugged back in, +regardless of whether it is in the same PCI slot or not. This can be recognized +by searching for the device in the saved removed devices using its MAC address. +The reserved position will be reassigned to the device when it is added back. + +``` +last order: a:A:0 !c:C:1 d:D:2 +current input: a:A c:F d:D e:E +new order: a:A:0 c:F:1 d:D:2 e:E:3 +``` + +### Multinic functions + +The multinic function is a special kind of network device. When this type of +physical device is plugged into a PCI slot, multiple network devices are +reported at a single PCI address. Additionally, the number of reported network +devices may change due to driver updates. + +``` +current input: a:A b:A c:A d:A +initial order: a:A:0 b:A:1 c:A:2 d:A:3 +``` + +As long as `the assumption` holds, the initial order of these devices can be +generated automatically and kept stable by using MAC addresses to identify +individual devices. However, `biosdevname` cannot reliably generate an order for +all devices reported at one PCI address. For devices located at the same PCI +address, their MAC addresses are used to generate the initial order. + +``` +last order: a:A:0 b:A:1 c:A:2 d:A:3 m:M:4 n:N:5 +current input: a:A b:A c:A d:A e:A f:A m:M n:N +new order: a:A:0 b:A:1 c:A:2 d:A:3 m:M:4 n:N:5 e:A:6 f:A:7 +``` + +For example, suppose `biosdevname` generates an order for a multinic function +and other non-multinic devices. Within this order, the N devices of the +multinic function with MAC addresses mac[1], ..., mac[N] are assigned positions +pos[1], ..., pos[N] correspondingly. `biosdevname` cannot ensure that the device +with mac[1] is always assigned position pos[1]. Instead, it ensures that the +entire set of positions pos[1], ..., pos[N] remains stable for the devices of +the multinic function. Therefore, to ensure the order follows the MAC address +order, the devices of the multinic function need to be sorted by their MAC +addresses within the set of positions. + +``` +last order: a:A:0 b:A:1 c:A:2 d:A:3 m:M:4 +current input: e:A f:A g:A h:A m:M +new order: e:A:0 f:A:1 g:A:2 h:A:3 m:M:4 +``` + +Rare cases that can not be handled automatically +------------------------------------------------ + +In summary, to keep the order stable, the auto-generated order needs to be saved +for the next ordering. When performing an automatic ordering for the current +network devices, either the MAC address or the PCI address is used to recognize +the device that was assigned the same position in the last ordering. If neither +the MAC address nor the PCI address can be used to find a position from the last +ordering, the device is considered newly added and is assigned a new position. + +However, following this sorting logic, the ordering result may not always be as +expected. In practice, this can be caused by various rare cases, such as +switching an existing network device to connect to another network, performing +firmware updates, changing firmware settings, or plugging/unplugging network +devices. It is not worth complicating the entire function for these rare cases. +Instead, the initial user's configuration can be used to handle these rare +scenarios. 
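+
+For illustration, the matching rules above can be sketched in OCaml. This is a
+minimal, hypothetical sketch only: the type and function names are invented for
+this example, it orders a single device at a time, and it omits the
+multinic-function handling described earlier; the real xcp-networkd
+implementation differs.
+
+```ml
+(* Hypothetical types: not the actual xcp-networkd data structures. *)
+type dev = {mac: string; pci: string}
+
+type entry = {e_mac: string; e_pci: string; position: int}
+
+(* Assign a position to one currently unordered device [d], given the
+   saved [last] order and the full list of [current] devices. *)
+let assign ~last ~current d =
+  (* A fresh position never reuses a position from the saved order,
+     including positions reserved for removed devices. *)
+  let fresh () = 1 + List.fold_left (fun m e -> max m e.position) (-1) last in
+  match List.find_opt (fun e -> e.e_mac = d.mac) last with
+  | Some e ->
+      (* Same MAC as before: keep its position, even if the PCI address
+         changed or the device was removed and plugged back in. *)
+      e.position
+  | None -> (
+    match List.find_opt (fun e -> e.e_pci = d.pci) last with
+    | Some e when not (List.exists (fun c -> c.mac = e.e_mac) current) ->
+        (* The slot was occupied before and the old MAC is gone:
+           treat [d] as a replacement and reuse the position. *)
+        e.position
+    | _ ->
+        (* Unknown MAC and PCI slot (or the old device merely moved):
+           treat [d] as newly added. *)
+        fresh ()
+  )
+```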
diff --git a/doc/content/xen-api/topics/vm-lifecycle.md b/doc/content/xen-api/topics/vm-lifecycle.md index 7390dc61e80..44727bdf3f0 100644 --- a/doc/content/xen-api/topics/vm-lifecycle.md +++ b/doc/content/xen-api/topics/vm-lifecycle.md @@ -2,7 +2,7 @@ title = "VM Lifecycle" +++ -The following figure shows the states that a VM can be in and the +The following figure shows the states that a VM can be in and the API calls that can be used to move the VM between these states. ```mermaid diff --git a/doc/content/xenopsd/architecture/_index.md b/doc/content/xenopsd/architecture/_index.md index 0f4d5eccea5..8211e838684 100644 --- a/doc/content/xenopsd/architecture/_index.md +++ b/doc/content/xenopsd/architecture/_index.md @@ -1,5 +1,6 @@ +++ -title = "Architecture" +title = "Xenopsd Architecture" +linkTitle = "Architecture" +++ Xenopsd instances run on a host and manage VMs on behalf of clients. This diff --git a/doc/content/xenopsd/design/_index.md b/doc/content/xenopsd/design/_index.md index a55a9b124b7..2047d068ad5 100644 --- a/doc/content/xenopsd/design/_index.md +++ b/doc/content/xenopsd/design/_index.md @@ -1,3 +1,6 @@ +++ title = "Design" -+++ \ No newline at end of file ++++ + +Design documents for `xenopsd`: +{{% children %}} diff --git a/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md b/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md new file mode 100644 index 00000000000..ba4274e243a --- /dev/null +++ b/doc/content/xenopsd/walkthroughs/VM.build/Domain.build.md @@ -0,0 +1,146 @@ +--- +title: Domain.build +description: + "Prepare the build of a VM: Wait for scrubbing, do NUMA placement, run xenguest." +--- + +## Overview + +```mermaid +flowchart LR +subgraph xenopsd VM_build[ + xenopsd thread pool with two VM_build micro#8209;ops: + During parallel VM_start, Many threads run this in parallel! 
+] +direction LR +build_domain_exn[ + VM.build_domain_exn + from thread pool Thread #1 +] --> Domain.build +Domain.build --> build_pre +build_pre --> wait_xen_free_mem +build_pre -->|if NUMA/Best_effort| numa_placement +Domain.build --> xenguest[Invoke xenguest] +click Domain.build "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1111-L1210" _blank +click build_domain_exn "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2222-L2225" _blank +click wait_xen_free_mem "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L236-L272" _blank +click numa_placement "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L862-L897" _blank +click build_pre "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L899-L964" _blank +click xenguest "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1139-L1146" _blank + +build_domain_exn2[ + VM.build_domain_exn + from thread pool Thread #2] --> Domain.build2[Domain.build] +Domain.build2 --> build_pre2[build_pre] +build_pre2 --> wait_xen_free_mem2[wait_xen_free_mem] +build_pre2 -->|if NUMA/Best_effort| numa_placement2[numa_placement] +Domain.build2 --> xenguest2[Invoke xenguest] +click Domain.build2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1111-L1210" _blank +click build_domain_exn2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2222-L2225" _blank +click wait_xen_free_mem2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L236-L272" _blank +click numa_placement2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L862-L897" _blank +click build_pre2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L899-L964" _blank +click xenguest2 "https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1139-L1146" _blank +end +``` + +[`VM.build_domain_exn`](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2024-L2248) +[calls](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2222-L2225) +[`Domain.build`](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1111-L1210) +to call: +- `build_pre` to prepare the build of a VM: + - If the `xe` config `numa_placement` is set to `Best_effort`, invoke the NUMA placement algorithm. + - Run `xenguest` +- `xenguest` to invoke the [xenguest](xenguest) program to setup the domain's system memory. + +## build_pre: Prepare building the VM + +[Domain.build](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1111-L1210) +[calls](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1137) +[build_pre](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L899-L964) +(which is also used for VM restore) to: + +1. 
[Call](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L902-L911) + [wait_xen_free_mem](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L236-L272) + to wait (if necessary), for the Xen memory scrubber to catch up reclaiming memory. + It + 1. calls `Xenctrl.physinfo` which returns: + - `hostinfo.free_pages` - the free and already scrubbed pages (available) + - `host.scrub_pages` - the not yet scrubbed pages (not yet available) + 2. repeats this until a timeout as long as `free_pages` is *lower* + than the *required* pages + - unless if `scrub_pages` is 0 (no scrubbing left to do) + + Note: `free_pages` is system-wide memory, not memory specific to a NUMA node. + Because this is not NUMA-aware, in case of temporary node-specific memory shortage, + this check is not sufficient to prevent the VM from being spread over all NUMA nodes. + It is planned to resolve this issue by claiming NUMA node memory during NUMA placement. + +2. Call the hypercall to set the timer mode +3. Call the hypercall to set the number of vCPUs +4. Call the `numa_placement` function + as described in the [NUMA feature description](/toolstack/features/NUMA) + when the `xe` configuration option `numa_placement` is set to `Best_effort` + (except when the VM has a hard CPU affinity). + + ```ml + match !Xenops_server.numa_placement with + | Any -> + () + | Best_effort -> + log_reraise (Printf.sprintf "NUMA placement") (fun () -> + if has_hard_affinity then + D.debug "VM has hard affinity set, skipping NUMA optimization" + else + numa_placement domid ~vcpus + ~memory:(Int64.mul memory.xen_max_mib 1048576L) + ) + ``` + +## NUMA placement + +`build_pre` passes the `domid`, the number of `vCPUs` and `xen_max_mib` to the +[numa_placement](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L862-L897) +function to run the algorithm to find the best NUMA placement. + +When it returns a NUMA node to use, it calls the Xen hypercalls +to set the vCPU affinity to this NUMA node: + +```ml + let vm = NUMARequest.make ~memory ~vcpus in + let nodea = + match !numa_resources with + | None -> + Array.of_list nodes + | Some a -> + Array.map2 NUMAResource.min_memory (Array.of_list nodes) a + in + numa_resources := Some nodea ; + Softaffinity.plan ~vm host nodea +``` + +By using the default `auto_node_affinity` feature of Xen, +setting the vCPU affinity causes the Xen hypervisor to activate +NUMA node affinity for memory allocations to be aligned with +the vCPU affinity of the domain. + +Summary: This passes the information to the hypervisor that memory +allocation for this domain should preferably be done from this NUMA node. + +## Invoke the xenguest program + +With the preparation in `build_pre` completed, `Domain.build` +[calls](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/domain.ml#L1127-L1155) +the `xenguest` function to invoke the [xenguest](xenguest) program to build the domain. 
+ +## Notes on future design improvements + +The Xen domain feature flag +[domain->auto_node_affinity](https://wiki.xenproject.org/wiki/NUMA_node_affinity_in_the_Xen_hypervisor) +can be disabled by calling +[xc_domain_node_setaffinity()](../../references/xc_domain_node_setaffinity.md) +to set a specific NUMA node affinity in special cases: + +This can be used, for example, when there might not be enough memory on the preferred +NUMA node, and there are other NUMA nodes (in the same CPU package) to use +([reference](../../../lib/xenctrl/xc_domain_node_setaffinity.md)). diff --git a/doc/content/xenopsd/walkthroughs/VM.build/VM_build-chart.md b/doc/content/xenopsd/walkthroughs/VM.build/VM_build-chart.md new file mode 100644 index 00000000000..eec1f05fc0e --- /dev/null +++ b/doc/content/xenopsd/walkthroughs/VM.build/VM_build-chart.md @@ -0,0 +1,27 @@ +--- +hidden: true +title: VM_build micro-op flowchart +description: For inclusion in _index.md and VM_build.md +weight: 10 +--- + +```mermaid +flowchart +subgraph xenopsd VM_build[xenopsd: VM_build micro#8209;op] +direction LR +VM_build --> VM.build +VM.build --> VM.build_domain +VM.build_domain --> VM.build_domain_exn +VM.build_domain_exn --> Domain.build +click VM_build " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/lib/xenops_server.ml#L2255-L2271" _blank +click VM.build " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/xc/xenops_server_xen.ml#L2290-L2291" _blank +click VM.build_domain " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/xc/xenops_server_xen.ml#L2250-L2288" _blank +click VM.build_domain_exn " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/xc/xenops_server_xen.ml#L2024-L2248" _blank +click Domain.build " +https://github.com/xapi-project/xen-api/blob/83555067/ocaml/xenopsd/xc/domain.ml#L1111-L1210" _blank +end +``` diff --git a/doc/content/xenopsd/walkthroughs/VM.build/VM_build.md b/doc/content/xenopsd/walkthroughs/VM.build/VM_build.md new file mode 100644 index 00000000000..f83cccf5353 --- /dev/null +++ b/doc/content/xenopsd/walkthroughs/VM.build/VM_build.md @@ -0,0 +1,46 @@ +--- +title: VM_build micro-op +linkTitle: VM_build μ-op +description: Overview of the VM_build μ-op (runs after the VM_create μ-op created the domain). +weight: 10 +mermaid: + force: true +--- + +## Overview + +On Xen, `Xenctrl.domain_create` creates an empty domain and +returns the domain ID (`domid`) of the new domain to `xenopsd`. + +In the `build` phase, the `xenguest` program is called to create +the system memory layout of the domain, set vCPU affinity and a +lot more. 
+ +The [VM_build](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L2255-L2271) +micro-op collects the VM build parameters and calls +[VM.build](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2290-L2291), +which calls +[VM.build_domain](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2250-L2288), +which calls +[VM.build_domain_exn](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2024-L2248) +which calls [Domain.build](Domain.build): + +{{% include "VM_build-chart.md" %}} + +The function +[VM.build_domain_exn](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2024) +must: + +1. Run pygrub (or eliloader) to extract the kernel and initrd, if necessary +2. [Call](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L2222-L2225) + [Domain.build](Domain.build) to + - optionally run NUMA placement and + - invoke [xenguest](VM.build/xenguest) to set up the domain memory. + + See the walk-through of the [Domain.build](Domain.build) function + for more details on this phase. +3. Apply the `cpuid` configuration +4. Store the current domain configuration on disk -- it's important to know + the difference between the configuration you started with and the configuration + you would use after a reboot because some properties (such as maximum memory + and vCPUs) as fixed on create. diff --git a/doc/content/xenopsd/walkthroughs/VM.build/_index.md b/doc/content/xenopsd/walkthroughs/VM.build/_index.md new file mode 100644 index 00000000000..63770bf6bdc --- /dev/null +++ b/doc/content/xenopsd/walkthroughs/VM.build/_index.md @@ -0,0 +1,12 @@ +--- +title: Building a VM +description: After VM_create, VM_build builds the core of the domain (vCPUs, memory) +weight: 20 +mermaid: + force: true +--- +{{% include "VM_build-chart.md" %}} + +Walk-through documents for the `VM_build` phase: + +{{% children description=true %}} diff --git a/doc/content/xenopsd/walkthroughs/VM.build/xenguest.md b/doc/content/xenopsd/walkthroughs/VM.build/xenguest.md new file mode 100644 index 00000000000..70908d556fb --- /dev/null +++ b/doc/content/xenopsd/walkthroughs/VM.build/xenguest.md @@ -0,0 +1,185 @@ +--- +title: xenguest +description: + "Perform building VMs: Allocate and populate the domain's system memory." +--- +As part of starting a new domain in VM_build, `xenopsd` calls `xenguest`. 
+When multiple domain build threads run in parallel, +also multiple instances of `xenguest` also run in parallel: + +```mermaid +flowchart +subgraph xenopsd VM_build[xenopsd VM_build micro#8209;ops] +direction LR +xenopsd1[Domain.build - Thread #1] --> xenguest1[xenguest #1] +xenopsd2[Domain.build - Thread #2] --> xenguest2[xenguest #2] +xenguest1 --> libxenguest +xenguest2 --> libxenguest2[libxenguest] +click xenopsd1 "../Domain.build/index.html" +click xenopsd2 "../Domain.build/index.html" +click xenguest1 "https://github.com/xenserver/xen.pg/blob/XS-8/patches/xenguest.patch" _blank +click xenguest2 "https://github.com/xenserver/xen.pg/blob/XS-8/patches/xenguest.patch" _blank +click libxenguest "https://github.com/xen-project/xen/tree/master/tools/libs/guest" _blank +click libxenguest2 "https://github.com/xen-project/xen/tree/master/tools/libs/guest" _blank +libxenguest --> Xen[Xen
Hypervisor] +libxenguest2 --> Xen +end +``` + +## About xenguest + +`xenguest` is called by the xenopsd [Domain.build](Domain.build) function +to perform the build phase for new VMs, which is part of the `xenopsd` +[VM.start operation](VM.start). + +[xenguest](https://github.com/xenserver/xen.pg/blob/XS-8/patches/xenguest.patch) +was created as a separate program due to issues with +`libxenguest`: + +- It wasn't threadsafe: fixed, but it still uses a per-call global struct +- It had an incompatible licence, but is now licensed under the LGPL. + +Those issues were addressed, but we still shell out to `xenguest`, which is currently +carried in the patch queue for the Xen hypervisor packages, but could become +an individual package once planned changes to the Xen hypercalls are stabilised. + +Over time, `xenguest` has evolved to build more of the initial domain state. + +## Interface to xenguest + +```mermaid +flowchart +subgraph xenopsd VM_build[xenopsd VM_build micro#8209;op] +direction TB +mode +domid +memmax +Xenstore +end +mode[--mode hvm_build] --> xenguest +domid --> xenguest +memmax --> xenguest +Xenstore[Xenstore platform data] --> xenguest +``` + +`xenopsd` must pass this information to `xenguest` to build a VM: + +- The domain type to build for (HVM, PVH or PV). + - It is passed using the command line option `--mode hvm_build`. +- The `domid` of the created empty domain, +- The amount of system memory of the domain, +- A number of other parameters that are domain-specific. + +`xenopsd` uses the Xenstore to provide platform data: + +- the vCPU affinity +- the vCPU credit2 weight/cap parameters +- whether the NX bit is exposed +- whether the viridian CPUID leaf is exposed +- whether the system has PAE or not +- whether the system has ACPI or not +- whether the system has nested HVM or not +- whether the system has an HPET or not + +When called to build a domain, `xenguest` reads those and builds the VM accordingly. + +## Walkthrough of the xenguest build mode + +```mermaid +flowchart +subgraph xenguest[xenguest #8209;#8209;mode hvm_build domid] +direction LR +stub_xc_hvm_build[stub_xc_hvm_build#40;#41;] --> get_flags[ + get_flags#40;#41; <#8209; Xenstore platform data +] +stub_xc_hvm_build --> configure_vcpus[ + configure_vcpus#40;#41; #8209;> Xen hypercall +] +stub_xc_hvm_build --> setup_mem[ + setup_mem#40;#41; #8209;> Xen hypercalls to setup domain memory +] +end +``` + +Based on the given domain type, the `xenguest` program calls a dedicated +function for the build process of that domain type. + +These are: + +- `stub_xc_hvm_build()` for HVM, +- `stub_xc_pvh_build()` for PVH, and +- `stub_xc_pv_build()` for PV domains. + +These domain build functions call these functions: + +1. `get_flags()` to get the platform data from the Xenstore +2. `configure_vcpus()` which uses the platform data from the Xenstore to configure vCPU affinity and the credit scheduler parameters vCPU weight and vCPU cap (max % pCPU time for throttling) +3. The `setup_mem` function for the given VM type. + +## The function hvm_build_setup_mem() + +For HVM domains, `hvm_build_setup_mem()` is responsible for deriving the memory +layout of the new domain, allocating the required memory and populating it for the +new domain. It must: + +1. Derive the `e820` memory layout of the system memory of the domain + including memory holes depending on PCI passthrough and vGPU flags. +2. Load the BIOS/UEFI firmware images +3. Store the final MMIO hole parameters in the Xenstore +4.
Call the `libxenguest` function `xc_dom_boot_mem_init()` (see below) +5. Call `construct_cpuid_policy()` to apply the CPUID `featureset` policy + +## The function xc_dom_boot_mem_init() + +```mermaid +flowchart LR +subgraph xenguest +hvm_build_setup_mem[hvm_build_setup_mem#40;#41;] +end +subgraph libxenguest +hvm_build_setup_mem --> xc_dom_boot_mem_init[xc_dom_boot_mem_init#40;#41;] +xc_dom_boot_mem_init -->|vmemranges| meminit_hvm[meninit_hvm#40;#41;] +click xc_dom_boot_mem_init "https://github.com/xen-project/xen/blob/39c45c/tools/libs/guest/xg_dom_boot.c#L110-L126" _blank +click meminit_hvm "https://github.com/xen-project/xen/blob/39c45c/tools/libs/guest/xg_dom_x86.c#L1348-L1648" _blank +end +``` + +`hvm_build_setup_mem()` calls +[xc_dom_boot_mem_init()](https://github.com/xen-project/xen/blob/39c45c/tools/libs/guest/xg_dom_boot.c#L110-L126) +to allocate and populate the domain's system memory. + +It calls +[meminit_hvm()](https://github.com/xen-project/xen/blob/39c45c/tools/libs/guest/xg_dom_x86.c#L1348-L1648) +to loop over the `vmemranges` of the domain for mapping the system RAM +of the guest from the Xen hypervisor heap. Its goals are: + +- Attempt to allocate 1GB superpages when possible +- Fall back to 2MB pages when 1GB allocation failed +- Fall back to 4k pages when both failed + +It uses the hypercall +[XENMEM_populate_physmap](https://github.com/xen-project/xen/blob/39c45c/xen/common/memory.c#L1408-L1477) +to perform memory allocation and to map the allocated memory +to the system RAM ranges of the domain. + +https://github.com/xen-project/xen/blob/39c45c/xen/common/memory.c#L1022-L1071 + +`XENMEM_populate_physmap`: + +1. Uses + [construct_memop_from_reservation](https://github.com/xen-project/xen/blob/39c45c/xen/common/memory.c#L1022-L1071) + to convert the arguments for allocating a page from + [struct xen_memory_reservation](https://github.com/xen-project/xen/blob/master/xen/include/public/memory.h#L46-L80) + to `struct memop_args`. +2. Sets flags and calls functions according to the arguments +3. Allocates the requested page at the most suitable place + - depending on passed flags, allocate on a specific NUMA node + - else, if the domain has node affinity, on the affine nodes + - also in the most suitable memory zone within the NUMA node +4. Falls back to less desirable places if this fails + - or fail for "exact" allocation requests +5. When no pages of the requested size are free, + it splits larger superpages into pages of the requested size. + +For more details on the VM build step involving `xenguest` and Xen side see: +https://wiki.xenproject.org/wiki/Walkthrough:_VM_build_using_xenguest diff --git a/doc/content/xenopsd/walkthroughs/VM.migrate.md b/doc/content/xenopsd/walkthroughs/VM.migrate.md index 080ebdb8edc..8982c4690da 100644 --- a/doc/content/xenopsd/walkthroughs/VM.migrate.md +++ b/doc/content/xenopsd/walkthroughs/VM.migrate.md @@ -1,37 +1,44 @@ --- title: 'Walkthrough: Migrating a VM' +linktitle: 'Migrating a VM' +description: Walkthrough of migrating a VM from one host to another. +weight: 50 +mermaid: + force: true --- +At the end of this walkthrough, a sequence diagram of the overall process is included. -A XenAPI client wishes to migrate a VM from one host to another within -the same pool. 
+## Invocation -The client will issue a command to migrate the VM and it will be dispatched +The command to migrate the VM is dispatched by the autogenerated `dispatch_call` function from **xapi/server.ml**. For more information about the generated functions you can have a look to [XAPI IDL model](https://github.com/xapi-project/xen-api/tree/master/ocaml/idl/ocaml_backend). -The command will trigger the operation +The command triggers the operation [VM_migrate](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/lib/xenops_server.ml#L2572) -that has low level operations performed by the backend. These atomics operations -that we will describe in the documentation are: - -- VM.restore -- VM.rename -- VBD.set_active -- VBD.plug -- VIF.set_active -- VGPU.set_active -- VM.create_device_model -- PCI.plug -- VM.set_domain_action_request - -The command have serveral parameters such as: should it be ran asynchronously, -should it be forwared to another host, how arguments should be marshalled and -so on. A new thread is created by [xapi/server_helpers.ml](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xapi/server_helpers.ml#L55) -to handle the command asynchronously. At this point the helper also check if +that uses many low level atomics operations. These are: + +- [VM.restore](#VM-restore) +- [VM.rename](#VM-rename) +- [VBD.set_active](#restoring-devices) +- [VBD.plug](#restoring-devices) +- [VIF.set_active](#restoring-devices) +- [VGPU.set_active](#restoring-devices) +- [VM.create_device_model](#creating-the-device-model) +- [PCI.plug](#pci-plug) + +The migrate command has several parameters such as: + +- Should it be started asynchronously, +- Should it be forwarded to another host, +- How arguments should be marshalled, and so on. + +A new thread is created by [xapi/server_helpers.ml](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xapi/server_helpers.ml#L55) +to handle the command asynchronously. The helper thread checks if the command should be passed to the [message forwarding](https://github.com/xapi-project/xen-api/blob/master/ocaml/xapi/message_forwarding.ml) -layer in order to be executed on another host (the destination) or locally if -we are already at the right place. +layer in order to be executed on another host (the destination) or locally (if +it is already at the destination host). It will finally reach [xapi/api_server.ml](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xapi/api_server.ml#L242) that will take the action of posted a command to the message broker [message switch](https://github.com/xapi-project/xen-api/tree/master/ocaml/message-switch). @@ -40,77 +47,77 @@ XAPI daemons. In the case of the migration this message sends by **XAPI** will b consumed by the [xenopsd](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd) daemon that will do the job of migrating the VM. -# The migration of the VM +## Overview The migration is an asynchronous task and a thread is created to handle this task. -The tasks's reference is returned to the client, which can then check +The task reference is returned to the client, which can then check its status until completion. 
-As we see in the introduction the [xenopsd](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd) -daemon will pop the operation +As shown in the introduction, [xenopsd](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd) +fetches the [VM_migrate](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/lib/xenops_server.ml#L2572) -from the message broker. +operation from the message broker. -Only one backend is know available that interacts with libxc, libxenguest -and xenstore. It is the [xc backend](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd/xc). +All tasks specific to [libxenctrl](../../lib/xenctrl), +[xenguest](VM.build/xenguest) and [Xenstore](https://wiki.xenproject.org/wiki/XenStore) +are handled by the xenopsd +[xc backend](https://github.com/xapi-project/xen-api/tree/master/ocaml/xenopsd/xc). The entities that need to be migrated are: *VDI*, *VIF*, *VGPU* and *PCI* components. -During the migration process the destination domain will be built with the same -uuid than the original VM but the last part of the UUID will be +During the migration process, the destination domain will be built with the same +UUID as the original VM, except that the last part of the UUID will be `XXXXXXXX-XXXX-XXXX-XXXX-000000000001`. The original domain will be removed using `XXXXXXXX-XXXX-XXXX-XXXX-000000000000`. -There are some points called *hooks* at which `xenopsd` can execute some script. -Before starting a migration a command is send to the original domain to execute -a pre migrate script if it exists. +## Preparing VM migration -Before starting the migration a command is sent to Qemu using the Qemu Machine Protocol (QMP) +At specific places, `xenopsd` can execute *hooks* to run scripts. +In case a pre-migrate script is in place, a command to run this script +is sent to the original domain. + +Likewise, a command is sent to Qemu using the Qemu Machine Protocol (QMP) to check that the domain can be suspended (see [xenopsd/xc/device_common.ml](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/device_common.ml)). -After checking with Qemu that the VM is suspendable we can start the migration. +After checking with Qemu that the VM is can be suspended, the migration can begin. ## Importing metadata As for *hooks*, commands to source domain are sent using [stunnel](https://github.com/xapi-project/xen-api/tree/master/ocaml/libs/stunnel) a daemon which is used as a wrapper to manage SSL encryption communication between two hosts on the same -pool. To import metada an XML RPC command is sent to the original domain. +pool. To import the metadata, an XML RPC command is sent to the original domain. -Once imported it will give us a reference id and will allow to build the new domain +Once imported, it will give us a reference id and will allow building the new domain on the destination using the temporary VM uuid `XXXXXXXX-XXXX-XXXX-XXXX-000000000001` where `XXX...` is the reference id of the original VM. -## Setting memory +## Memory setup -One of the first thing to do is to setup the memory. The backend will check that there -is no ballooning operation in progress. At this point the migration can fail if a -ballooning operation is in progress and takes too much time. +One of the first steps the setup of the VM's memory: The backend checks that there +is no ballooning operation in progress. 
If one is in progress, the migration could fail. -Once memory checked the daemon will get the state of the VM (running, halted, ...) and -information about the VM are retrieve by the backend like the maximum memory the domain -can consume but also information about quotas for example. -Information are retrieve by the backend from xenstore. +Once memory has been checked, the daemon will get the state of the VM (running, halted, ...) and +the backend retrieves the domain's platform data (memory, vCPUs, etc.) from the Xenstore. Once this is complete, we can restore VIF and create the domain. -The synchronisation of the memory is the first point of synchronisation and everythin +The synchronisation of the memory is the first point of synchronisation and everything is ready for VM migration. -## VM Migration +## Destination VM setup After receiving memory we can set up the destination domain. If we have a vGPU we need to kick -off its migration process. We will need to wait the acknowledge that indicates that the entry -for the GPU has been well initialized. before starting the main VM migration. +off its migration process. We will need to wait for the acknowledgement that the +GPU entry has been successfully initialized before starting the main VM migration. -Their is a mechanism of handshake for synchronizing between the source and the -destination. Using the handshake protocol the receiver inform the sender of the -request that everything has been setup and ready to save/restore. +The receiver informs the sender using a handshake protocol +that everything is set up and ready for save/restore. -### VM restore +## Destination VM restore VM restore is a low level atomic operation [VM.restore](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L2684). This operation is represented by a function call to [backend](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/domain.ml#L1540). It uses **Xenguest**, a low-level utility from XAPI toolstack, to interact with the Xen hypervisor -and libxc for sending a request of migration to the **emu-manager**. +and `libxc` for sending a migration request to the **emu-manager**. After sending the request results coming from **emu-manager** are collected by the main thread. It blocks until results are received. @@ -120,16 +127,14 @@ transitions for the devices and handling the message passing for the VM as it's moved between hosts. This includes making sure that the state of the VM's virtual devices, like disks or network interfaces, is correctly moved over. -### VM renaming +## Destination VM rename -Once all operations are done we can rename the VM on the target from its temporary -name to its real UUID. This operation is another low level atomic one +Once all operations are done, `xenopsd` renames the target VM from its temporary -name to its real UUID. This operation is a low-level atomic [VM.rename](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L1667) -that will take care of updating the xenstore on the destination. - -The next step is the restauration of devices and unpause the domain. +which takes care of updating the Xenstore on the destination host.
-### Restoring remaining devices +## Restoring devices Restoring devices starts by activating VBD using the low level atomic operation [VBD.set_active](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L3674). It is an update of Xenstore. VBDs that are read-write must @@ -140,39 +145,51 @@ is called. VDI are attached and activate. Next devices are VIFs that are set as active [VIF.set_active](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L4296) and plug [VIF.plug](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L4394). If there are VGPUs we will set them as active now using the atomic [VGPU.set_active](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L3490). -We are almost done. The next step is to create the device model - -#### create device model +### Creating the device model -Create device model is done by using the atomic operation [VM.create_device_model](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L2375). This -will configure **qemu-dm** and started. This allow to manage PCI devices. +[create_device_model](https://github.com/xapi-project/xen-api/blob/ec3b62ee/ocaml/xenopsd/xc/xenops_server_xen.ml#L2293-L2349) +configures **qemu-dm** and starts it. This allows managing PCI devices. -#### PCI plug +### PCI plug [PCI.plug](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L3399) -is executed by the backend. It plugs a PCI device and advertise it to QEMU if this option is set. It is -the case for NVIDIA SR-IOV vGPUS. +is executed by the backend. It plugs a PCI device and advertises it to QEMU if this option is set. This is +the case for NVIDIA SR-IOV vGPUs. + +## Unpause + +The libxenctrl call +[xc_domain_unpause()](https://github.com/xen-project/xen/blob/414dde3/tools/libs/ctrl/xc_domain.c#L76) +unpauses the domain, and it starts running. + +## Cleanup + +1. [VM_set_domain_action_request](https://github.com/xapi-project/xen-api/blob/ec3b62ee/ocaml/xenopsd/lib/xenops_server.ml#L3004) + marks the domain as alive: In case `xenopsd` restarts, it no longer reboots the VM. + See the chapter on [marking domains as alive](VM.start#11-mark-the-domain-as-alive) + for more information. + +2. If a post-migrate script is in place, it is executed by the + [Xenops_hooks.VM_post_migrate](https://github.com/xapi-project/xen-api/blob/ec3b62ee/ocaml/xenopsd/lib/xenops_server.ml#L3005-L3009) + hook. + +3. The final step is a handshake to seal the success of the migration, +and the old VM can now be cleaned up. -At this point devices have been restored. The new domain is considered survivable. We can -unpause the domain and performs last actions +[Synchronisation point 4](https://github.com/xapi-project/xen-api/blob/ec3b62ee/ocaml/xenopsd/lib/xenops_server.ml#L3014) +has been reached, and the migration is complete.
-### Unpause and done
+## Live migration flowchart
 
-Unpause is done by managing the state of the domain using bindings to [xenctrl](https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=tools/libs/ctrl/xc_domain.c;h=f2d9d14b4d9f24553fa766c5dcb289f88d684bb0;hb=HEAD#l76).
-Once hypervisor has unpaused the domain some actions can be requested using [VM.set_domain_action_request](https://github.com/xapi-project/xen-api/blob/7ac88b90e762065c5ebb94a8ea61c61bdbf62c5c/ocaml/xenopsd/xc/xenops_server_xen.ml#L3172).
-It is a path in xenstore. By default no action is done but a reboot can be for example
-initiated.
+This flowchart gives a visual representation of the VM migration workflow:
 
-Previously we spoke about some points called *hooks* at which `xenopsd` can execute some script. There
-is also a hook to run a post migrate script. After the execution of the script if there is one
-the migration is almost done. The last step is a handskake to seal the success of the migration
-and the old VM can now be cleaned.
+{{% include live-migration %}}
 
-# Links
+## References
 
-Some links are old but even if many changes occured they are relevant for a global understanding
-of the XAPI toolstack.
+These pages may help in gaining a better understanding of the XAPI toolstack:
 
-- [XAPI architecture](https://xapi-project.github.io/xapi/architecture.html)
-- [XAPI dispatcher](https://wiki.xenproject.org/wiki/XAPI_Dispatch)
-- [Xenopsd architecture](https://xapi-project.github.io/xenopsd/architecture.html)
+- See the [XAPI architecture](../../xapi/_index) for the overall architecture of Xapi
+- See the [XAPI dispatcher](https://wiki.xenproject.org/wiki/XAPI_Dispatch) for service dispatch and message forwarding
+- See the [Xenopsd architecture](../architecture/_index) for the overall architecture of Xenopsd
+- See [How Xen suspend and resume works](https://mirage.io/docs/xen-suspend) for a detailed description of very similar operations.
diff --git a/doc/content/xenopsd/walkthroughs/VM.start.md b/doc/content/xenopsd/walkthroughs/VM.start.md
index 7e24b6d66ba..b043a5d9bf0 100644
--- a/doc/content/xenopsd/walkthroughs/VM.start.md
+++ b/doc/content/xenopsd/walkthroughs/VM.start.md
@@ -1,5 +1,8 @@
 ---
 title: 'Walkthrough: Starting a VM'
+linktitle: 'Starting a VM'
+description: Complete walkthrough of starting a VM, from receiving the request to unpause.
+weight: 10
 ---
 
 A Xenopsd client wishes to start a VM. They must first tell Xenopsd the VM
@@ -30,7 +33,7 @@ users:
 - the XenAPI has many clients which are updated on long release cycles. The
   main property needed is backwards compatibility, so that new release of xapi
-  remain compatible with these older clients. Quite often we will chose to
+  remain compatible with these older clients. Quite often, we will choose to
   "grandfather in" some poorly designed interface simply because we wish to
   avoid imposing churn on 3rd parties.
 - the Xenopsd API clients are all open-source and are part of the xapi-project.
@@ -89,7 +92,7 @@ exist for:
 From here we shall assume the use of the "Xen via libxc, libxenguest and xenstore"
 (a.k.a. "Xenopsd classic") backend.
 
-The backend [VM.add](https://github.com/xapi-project/xenopsd/blob/2a476c132c0b5732f9b224316b851a1b4d57520b/xc/xenops_server_xen.ml#L719)
+The backend [VM.add](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/xc/xenops_server_xen.ml#L1603-L1659)
 function checks whether the VM we have to manage already exists -- and if it does
 then it ensures the Xenstore configuration is intact.
 This Xenstore configuration is important because at any time a client can query
 the state of a VM with
@@ -132,17 +135,15 @@ When the Task has completed successfully, then calls to *.stat will show:
 - a valid start time
 - valid "targets" for memory and vCPU
 
-Note: before a Task completes, calls to *.stat will show partial updates e.g.
-the power state may be Paused but none of the disks may have become plugged.
+Note: before a Task completes, calls to *.stat will show partial updates. For example,
+the power state may be Paused, but no disk may have been plugged.
 UI clients must choose
 whether they are happy displaying this in-between state or whether they wish to
 hide it and pretend the whole operation has happened
-transactionally. If a particular client wishes to perform side-effects in
-response to Xenopsd state changes -- for example to clean up an external resource
-when a VIF becomes unplugged -- then it must be very careful to avoid responding
-to these in-between states. Generally it is safest to passively report these
-values without driving things directly from them. Think of them as status lights
-on the front panel of a PC: fine to look at but it's not a good idea to wire
-them up to actuators which actually do things.
+transactionally. In particular, when a client wishes to perform side-effects in
+response to `xenopsd` state changes (for example, to clean up an external resource
+when a VIF becomes unplugged), it must be very careful to avoid responding
+to these in-between states. Generally, it is safest to passively report these
+values without driving things directly from them.
 
 Note: the Xenopsd implementation guarantees that, if it is restarted at any point
 during the start operation, on restart the VM state shall be "fixed" by either
@@ -163,7 +164,7 @@ via the function
 
 It is the responsibility of the client to call
 [TASK.destroy](https://github.com/xapi-project/xcp-idl/blob/2e5c3dd79c63e3711227892271a6bece98eb0fa1/xen/xenops_interface.ml#L406)
-when the Task is nolonger needed. Xenopsd won't destroy the task because it contains
+when the Task is no longer needed. Xenopsd won't destroy the task because it contains
 the success/failure result of the operation which is needed by the client.
 
 What happens when a Xenopsd receives a VM.start request?
 
@@ -196,24 +197,43 @@ takes care of:
 
 Once a thread from the worker pool becomes free, it will execute the "do it now"
 function. In the example above this is `perform op t` where `op` is
 `VM_start vm` and `t` is the Task. The function
-[perform](https://github.com/xapi-project/xenopsd/blob/524d57b3c70/lib/xenops_server.ml#L1198)
+[perform_exn](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L2533)
 has fragments like this:
 
 ```ocaml
-  | VM_start id ->
-      debug "VM.start %s" id;
-      perform_atomics (atomics_of_operation op) t;
-      VM_DB.signal id
+  | VM_start (id, force) -> (
+      debug "VM.start %s (force=%b)" id force ;
+      let power = (B.VM.get_state (VM_DB.read_exn id)).Vm.power_state in
+      match power with
+      | Running ->
+          info "VM %s is already running" id
+      | _ ->
+          perform_atomics (atomics_of_operation op) t ;
+          (* ^^ atomics_of_operation decomposes the operation into micro-ops,
+             as described below *)
+          VM_DB.signal id
+  )
 ```
 
 Each "operation" (e.g. `VM_start vm`) is decomposed into "micro-ops" by the function
-[atomics_of_operation](https://github.com/xapi-project/xenopsd/blob/524d57b3c70/lib/xenops_server.ml#L739)
+[atomics_of_operation](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L1583)
 where the micro-ops are small building-block actions common to the higher-level
 operations. Each operation corresponds to a list of "micro-ops", where there is
 no if/then/else. Some of the "micro-ops" may be a no-op depending on the VM
 configuration (for example a PV domain may not need a qemu). In the case of
-`VM_start vm` this decomposes into the sequence:
+[`VM_start vm`](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L1584)
+the `Xenopsd` server starts by calling the [functions that
+decompose](https://github.com/xapi-project/xen-api/blob/master/ocaml/xenopsd/lib/xenops_server.ml#L1612-L1714)
+the operation into the `VM_hook_script`, `VM_create` and `VM_build` micro-ops:
+```ml
+  dequarantine_ops vgpus
+  ; [
+      VM_hook_script
+        (id, Xenops_hooks.VM_pre_start, Xenops_hooks.reason__none)
+    ; VM_create (id, None, None, no_sharept)
+    ; VM_build (id, force)
+    ]
+```
+This is the complete sequence of micro-ops:
 
 ## 1. run the "VM_pre_start" scripts
 
@@ -225,8 +245,8 @@ module and looks for scripts in the hardcoded path `/etc/xapi.d`.
 
 ## 2. create a Xen domain
 
 The `VM_create` micro-op calls the `VM.create` function in the backend.
-In the classic Xenopsd backend the
-[VM.create_exn](https://github.com/xapi-project/xenopsd/blob/b33bab13080cea91e2fd59d5088622cd68152339/xc/xenops_server_xen.ml#L633)
+In the classic Xenopsd backend, the
+[VM.create_exn](https://github.com/xapi-project/xen-api/blob/bae7526faeb2a02a2fe5b71410083983f4695963/ocaml/xenopsd/xc/xenops_server_xen.ml#L1421-L1586)
 function must
 
 1. check if we're creating a domain for a fresh VM or resuming an existing one:
@@ -237,7 +257,13 @@ function must
    because domain create often fails in low-memory conditions. This means the
    "reservation" is associated with our "session" with squeezed; if Xenopsd
    crashes and restarts the reservation will be freed automatically.
-3. create the Domain via the libxc hypercall
+3. create the Domain via the libxc hypercall `Xenctrl.domain_create`
+4. [call](
+   https://github.com/xapi-project/xen-api/blob/bae7526faeb2a02a2fe5b71410083983f4695963/ocaml/xenopsd/xc/xenops_server_xen.ml#L1547)
+   [generate_create_info()](
+   https://github.com/xapi-project/xen-api/blob/bae7526faeb2a02a2fe5b71410083983f4695963/ocaml/xenopsd/xc/xenops_server_xen.ml#L1302-L1419)
+   for storing the platform data (vCPUs, etc.) in the domain's Xenstore tree.
+   `xenguest` then uses this in the `build` phase (see below) to build the domain.
 4. "transfer" the squeezed reservation to the domain such that squeezed will
    free the memory if the domain is destroyed later
 5. compute and set an initial balloon target depending on the amount of memory
@@ -253,38 +279,10 @@ function must
 
 ## 3. build the domain
 
-On a Xen system a domain is created empty, and memory is actually allocated
-from the host in the "build" phase via functions in *libxenguest*. The
-[VM.build_domain_exn](https://github.com/xapi-project/xenopsd/blob/b33bab13080cea91e2fd59d5088622cd68152339/xc/xenops_server_xen.ml#L994)
-function must
-
-1. run pygrub (or eliloader) to extract the kernel and initrd, if necessary
-2. 
invoke the *xenguest* binary to interact with libxenguest. -3. apply the `cpuid` configuration -4. store the current domain configuration on disk -- it's important to know - the difference between the configuration you started with and the configuration - you would use after a reboot because some properties (such as maximum memory - and vCPUs) as fixed on create. - -The xenguest binary was originally -a separate binary for two reasons: (i) the libxenguest functions weren't -threadsafe since they used lots of global variables; and (ii) the libxenguest -functions used to have a different, incompatible license, which prevent us -linking. Both these problems have been resolved but we still shell out to -the xenguest binary. - -The xenguest binary has also evolved to configure more of the initial domain -state. It also [reads Xenstore](https://github.com/xapi-project/ocaml-xen-lowlevel-libs/blob/master/xenguest-4.4/xenguest_stubs.c#L42) -and configures - -- the vCPU affinity -- the vCPU credit2 weight/cap parameters -- whether the NX bit is exposed -- whether the viridian CPUID leaf is exposed -- whether the system has PAE or not -- whether the system has ACPI or not -- whether the system has nested HVM or not -- whether the system has an HPET or not +The `build` phase waits, if necessary, for the Xen memory scrubber to catch +up reclaiming memory, runs NUMA placement, sets vCPU affinity and invokes +the `xenguest` to build the system memory layout of the domain. +See the [walk-through of the VM_build μ-op](VM.build) for details. ## 4. mark each VBD as "active" @@ -304,7 +302,7 @@ calls bracket plug/unplug. If the "active" flag was set before the unplug attempt then as soon as the frontend/backend connection is removed clients would see the VBD as completely dissociated from the VM -- this would be misleading because Xenopsd will not have had time to use the storage API to release locks -on the disks. By doing all the cleanup before setting "active" to false, clients +on the disks. By cleaning up before setting "active" to false, clients can be assured that the disks are now free to be reassigned. ## 5. handle non-persistent disks @@ -370,7 +368,7 @@ to be the order the nodes were created so this means that (i) xenstored must continue to store directories as ordered lists rather than maps (which would be more efficient); and (ii) Xenopsd must make sure to plug the vifs in the same order. Note that relying on ethX device numbering has always been a -bad idea but is still common. I bet if you change this lots of tests will +bad idea but is still common. I bet if you change this, many tests will suddenly start to fail! The function diff --git a/doc/content/xenopsd/walkthroughs/_index.md b/doc/content/xenopsd/walkthroughs/_index.md index d54568dcbbf..6fe3f551f29 100644 --- a/doc/content/xenopsd/walkthroughs/_index.md +++ b/doc/content/xenopsd/walkthroughs/_index.md @@ -6,8 +6,10 @@ linkTitle = "Walk-throughs" Let's trace through interesting operations to see how the whole system works. 
-- [Starting a VM](VM.start.md) -- [Migrating a VM](VM.migrate.md) +{{% children depth=2 description=true %}} + +Inspiration for other walk-throughs: + - Shutting down a VM and waiting for it to happen - A VM wants to reboot itself - A disk is hotplugged diff --git a/doc/content/xenopsd/walkthroughs/live-migration.md b/doc/content/xenopsd/walkthroughs/live-migration.md index f0af797f85e..b93a4afbaa8 100644 --- a/doc/content/xenopsd/walkthroughs/live-migration.md +++ b/doc/content/xenopsd/walkthroughs/live-migration.md @@ -1,9 +1,13 @@ +++ title = "Live Migration Sequence Diagram" linkTitle = "Live Migration" +description = "Sequence diagram of the process of Live Migration." +# Note: This page is included by VM.migrate.md to provide a complete overview +# of the most important parts of live migration. Do not add text as that would +# break the mermaid diagram inclusion. +++ -{{}} +```mermaid sequenceDiagram autonumber participant tx as sender @@ -43,5 +47,4 @@ deactivate rx1 tx->>tx: VM_shutdown
VM_remove deactivate tx - -{{< /mermaid >}} +``` diff --git a/doc/hugo.toml b/doc/hugo.toml index 7b2dff698b4..a35112db945 100644 --- a/doc/hugo.toml +++ b/doc/hugo.toml @@ -29,6 +29,7 @@ home = [ "HTML", "RSS", "PRINT"] section = [ "HTML", "RSS", "PRINT"] [params] +editURL = 'https://github.com/xapi-project/xen-api/edit/master/doc/content/${FilePath}' # Enable the theme variant selector, default to auto: themeVariant = [ "auto", @@ -45,5 +46,31 @@ themeVariant = [ ] # auto switches between "red" and "zen-dark" depending on the browser/OS dark mode: themeVariantAuto = ["red", "zen-dark"] +# Consistency: Use the font of the Hugo Relearn theme also for Mermaid diagrams: +# securityLevel=loose is the default of Relearn, it allows HTML links in diagrams: +mermaidInitialize = '{ "fontFamily": "Roboto Flex", "securityLevel": "loose" }' alwaysopen = false collapsibleMenu = true + + [params.imageEffects] + + # + # Enable a soft shadow around the images that make the images appear to + # stand out ever so slightly like paper on a desk, giving them a smooth look: + # + shadow = true + + # + # The CSS-based photographer's lightbox makes the image border flash + # on mouse-over and darkens the rest of the page when clicking on images. + # + # It is better to disable it as it serves no proper function for the + # toolstack docs and causes a border around the image to appear/disappear + # in a flash when entering/leaving the image. Disabling it turns the sudden + # appearance and disappearance of the flashy border off. + # + # Initially, this was based on the Featherlight jQuery plugin, which would + # have enlarged the images, but the CSS-only solution appears inadequate + # for a proper lightbox as it does not zoom the image: + # + lightbox = false diff --git a/doc/layouts/partials/content.html b/doc/layouts/partials/content.html index ebba286db1e..007446b478c 100644 --- a/doc/layouts/partials/content.html +++ b/doc/layouts/partials/content.html @@ -8,6 +8,39 @@ {{ $c := .Page.Params.class }} {{ with index (where $.Site.Data.xenapi "name" $c) 0 }} + + {{ $style := resources.Get "css/xenapi.css" }} +{{ $parser := resources.Get "js/parse.js" }} + {{ with .lifecycle }}
@@ -64,11 +114,11 @@

Enums

{{ range $i, $x := .enums }}
-
{{ $x.name }}
+
{{ $x.name }}