From 60c63bb8bc4a2481310dea0d8b28da6d25d055b4 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Tue, 16 Jul 2024 10:47:57 -0400 Subject: [PATCH 1/3] bootstrap: pivot into node image before bootstrapping As per https://github.com/openshift/enhancements/pull/1637, we're trying to get rid of all OpenShift-versioned components from the bootimages. This means that there will no longer be `oc`, `kubelet`, or `crio` binaries, for example, which bootstrapping obviously relies on. Instead, now we change things up so that early on when booting the bootstrap node, we pull down the node image, unencapsulate it (this just means convert it back to an OSTree commit), then mount its `/usr` over the running system's `/usr`, and import new `/etc` content. This is done by isolating to a different systemd target to only bring up the minimum number of services to do the pivot and then carry on with bootstrapping. This does not incur additional reboots and should be compatible with AI/ABI/SNO. But it is, of course, a huge conceptual shift in how bootstrapping works. With this, we would now always be sure that we're using the same binaries as the target version as part of bootstrapping, which should alleviate some issues such as AI late-binding (see e.g. https://issues.redhat.com/browse/MGMT-16705). The big exception, of course, is the kernel. Relatedly, note we do persist `/usr/lib/modules` from the booted system so that loading kernel modules still works. To be conservative, the new logic only kicks in when using bootimages which do not have `oc`. This will allow us to ratchet this in more easily. Down the line, we should be able to replace some of this with `bootc apply-live` once that's available (and also works in a live environment). (See https://github.com/containers/bootc/issues/76.) For full context, see the linked enhancement and discussions there. 
--- .../node-image-overlay-generator | 9 ++ .../systemd/system/node-image-finish.service | 13 +++ .../systemd/system/node-image-overlay.service | 9 ++ .../systemd/system/node-image-overlay.target | 9 ++ .../systemd/system/node-image-pull.service | 14 +++ .../files/usr/local/bin/node-image-overlay.sh | 18 ++++ .../usr/local/bin/node-image-pull.sh.template | 87 +++++++++++++++++++ pkg/asset/ignition/bootstrap/common.go | 2 +- 8 files changed, 160 insertions(+), 1 deletion(-) create mode 100755 data/data/bootstrap/files/etc/systemd/system-generators/node-image-overlay-generator create mode 100644 data/data/bootstrap/files/etc/systemd/system/node-image-finish.service create mode 100644 data/data/bootstrap/files/etc/systemd/system/node-image-overlay.service create mode 100644 data/data/bootstrap/files/etc/systemd/system/node-image-overlay.target create mode 100644 data/data/bootstrap/files/etc/systemd/system/node-image-pull.service create mode 100755 data/data/bootstrap/files/usr/local/bin/node-image-overlay.sh create mode 100755 data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template diff --git a/data/data/bootstrap/files/etc/systemd/system-generators/node-image-overlay-generator b/data/data/bootstrap/files/etc/systemd/system-generators/node-image-overlay-generator new file mode 100755 index 00000000000..44ed434a691 --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system-generators/node-image-overlay-generator @@ -0,0 +1,9 @@ +#!/bin/bash +set -euo pipefail + +UNIT_DIR="${1:-/tmp}" + +if ! 
rpm -q openshift-clients &>/dev/null; then + ln -sf "/etc/systemd/system/node-image-overlay.target" \ + "${UNIT_DIR}/default.target" +fi diff --git a/data/data/bootstrap/files/etc/systemd/system/node-image-finish.service b/data/data/bootstrap/files/etc/systemd/system/node-image-finish.service new file mode 100644 index 00000000000..33d6960f824 --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system/node-image-finish.service @@ -0,0 +1,13 @@ +# This is a separate unit because in the assisted-installer flow, we only want +# `node-image-overlay.service`, not the isolating back to `multi-user.target`. + +[Unit] +Description=Node Image Finish +Requires=node-image-overlay.service +After=node-image-overlay.service + +[Service] +Type=oneshot +# and now, back to our regularly scheduled programming... +ExecStart=/usr/bin/echo "Node image overlay complete; switching back to multi-user.target" +ExecStart=/usr/bin/systemctl --no-block isolate multi-user.target diff --git a/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.service b/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.service new file mode 100644 index 00000000000..7e1ea029ee7 --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.service @@ -0,0 +1,9 @@ +[Unit] +Description=Node Image Overlay +Requires=node-image-pull.service +After=node-image-pull.service + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/node-image-overlay.sh +RemainAfterExit=yes diff --git a/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.target b/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.target new file mode 100644 index 00000000000..ef52f78ed93 --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system/node-image-overlay.target @@ -0,0 +1,9 @@ +[Unit] +Description=Node Image Overlay Target +Requires=basic.target + +# for easier debugging +Requires=sshd.service getty.target systemd-user-sessions.service + 
+Requires=node-image-overlay.service +Requires=node-image-finish.service diff --git a/data/data/bootstrap/files/etc/systemd/system/node-image-pull.service b/data/data/bootstrap/files/etc/systemd/system/node-image-pull.service new file mode 100644 index 00000000000..8ac36d75b8c --- /dev/null +++ b/data/data/bootstrap/files/etc/systemd/system/node-image-pull.service @@ -0,0 +1,14 @@ +[Unit] +Description=Node Image Pull +Requires=network.target NetworkManager.service +After=network.target + +[Service] +Type=oneshot +# we need to call ostree container (i.e. rpm-ostree), which has install_exec_t, +# but by default, we'll run as unconfined_service_t, which is not allowed that +# transition. Relabel the script itself. +ExecStartPre=chcon --reference=/usr/bin/ostree /usr/local/bin/node-image-pull.sh +ExecStart=/usr/local/bin/node-image-pull.sh +MountFlags=slave +RemainAfterExit=yes diff --git a/data/data/bootstrap/files/usr/local/bin/node-image-overlay.sh b/data/data/bootstrap/files/usr/local/bin/node-image-overlay.sh new file mode 100755 index 00000000000..8db8548515d --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/node-image-overlay.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -euo pipefail + +ostree_checkout=/ostree/repo/tmp/node-image +if [ ! 
-d "${ostree_checkout}" ]; then + ostree_checkout=/var/ostree-container/checkout +fi + +echo "Overlaying node image content" + +# keep /usr/lib/modules from the booted deployment for kernel modules +mount -o bind,ro "/usr/lib/modules" "${ostree_checkout}/usr/lib/modules" +mount -o rbind,ro "${ostree_checkout}/usr" /usr +rsync -a "${ostree_checkout}/usr/etc/" /etc + +# reload the new policy +echo "Reloading SELinux policy" +semodule -R diff --git a/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template b/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template new file mode 100755 index 00000000000..bf03d11d7e4 --- /dev/null +++ b/data/data/bootstrap/files/usr/local/bin/node-image-pull.sh.template @@ -0,0 +1,87 @@ +#!/bin/bash +set -euo pipefail + +# shellcheck source=release-image.sh.template +. /usr/local/bin/release-image.sh + +# yuck... this is a good argument for renaming the node image to just `node` in both OCP and OKD +coreos_img=rhel-coreos +{{ if .IsOKD }} +coreos_img=stream-coreos +{{ end }} +until COREOS_IMAGE=$(image_for ${coreos_img}); do + echo 'Failed to query release image; retrying...' 
+ sleep 10 +done + +# need to use rpm-ostree here since `bootc status` doesn't work in the live ISO currently +# https://github.com/containers/bootc/issues/1043 +booted_version=$(rpm-ostree status --json | jq -r .deployments[0].version) + +echo "Currently on CoreOS version $booted_version" +echo "Target node image is $COREOS_IMAGE" + +# try to do this in the system repo so we get hardlinks and the checkout is +# read-only, but fallback to using /var if we're in the live environment since +# that's truly read-only +ostree_repo=/ostree/repo +ostree_checkout="${ostree_repo}/tmp/node-image" +hardlink='-H' +if grep -q coreos.liveiso= /proc/cmdline; then + ostree_repo=/var/ostree-container/repo + ostree_checkout=/var/ostree-container/checkout + mkdir -p "${ostree_repo}" + echo "In live ISO; creating temporary repo to pull node image" + ostree init --mode=bare --repo="${ostree_repo}" + # if there are layers, import all the content in the system repo for + # layer-level deduping + if [ -d /ostree/repo/refs/heads/ostree/container ]; then + echo "Importing base content from system repo for deduplication" + ostree pull-local --repo="${ostree_repo}" /ostree/repo + fi + # but we won't be able to force hardlinks cross-device + hardlink='' +else + # (remember, we're MountFlags=slave) + mount -o rw,remount /sysroot +fi + +# Use ostree stack to pull the container here. This gives us efficient +# downloading with layers we already have, and also handles SELinux. +echo "Pulling ${COREOS_IMAGE}" +while ! ostree container image pull --authfile "/root/.docker/config.json" \ + "${ostree_repo}" ostree-unverified-image:docker://"${COREOS_IMAGE}"; do + echo 'Failed to fetch release image; retrying...' 
+ sleep 10 +done + +# ideally, `ostree container image pull` would support `--write-ref` or a +# command to escape a pullspec, but for now it's pretty easy to tell which ref +# it is since it's the only docker one +ref=$(ostree refs --repo "${ostree_repo}" | grep ^ostree/container/image/docker) +if [ $(echo "$ref" | wc -l) != 1 ]; then + echo "Expected single docker ref, found:" + echo "$ref" + exit 1 +fi +ostree refs --repo "${ostree_repo}" "$ref" --create coreos/node-image + +# massive hack to make ostree admin config-diff work in live ISO where /etc +# is actually on a separate mount and not the deployment root proper... should +# enhance libostree for this (remember, we're MountFlags=slave) +if grep -q coreos.liveiso= /proc/cmdline; then + mount -o bind,ro /etc /ostree/deploy/*/deploy/*/etc +fi + +# get all state files in /etc; this is a cheap way to get "3-way /etc merge" semantics +etc_keep=$(ostree admin config-diff | cut -f5 -d' ' | sed -e 's,^,/usr/etc/,') + +# check out the commit +echo "Checking out node image content" +ostree checkout --repo "${ostree_repo}" ${hardlink} coreos/node-image "${ostree_checkout}" --skip-list=<(cat <<< "$etc_keep") + +# in the assisted-installer case, nuke the temporary repo to save RAM +if grep -q coreos.liveiso= /proc/cmdline; then + echo "Deleting temporary repo" + rm -rf "${ostree_repo}" +fi diff --git a/pkg/asset/ignition/bootstrap/common.go b/pkg/asset/ignition/bootstrap/common.go index 97db5b7373e..13182760d85 100644 --- a/pkg/asset/ignition/bootstrap/common.go +++ b/pkg/asset/ignition/bootstrap/common.go @@ -438,7 +438,7 @@ func AddStorageFiles(config *igntypes.Config, base string, uri string, templateD var mode int appendToFile := false - if parentDir == "bin" || parentDir == "dispatcher.d" { + if parentDir == "bin" || parentDir == "dispatcher.d" || parentDir == "system-generators" { mode = 0555 } else if filename == "motd" || filename == "containers.conf" { mode = 0644 From 1ba8f3e419e86dd12e761ae9c87fd43ed84d15e5 
Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Wed, 17 Jul 2024 11:48:01 -0400 Subject: [PATCH 2/3] bootstrap/common: use switch to satisfy golint golint was complaining about: ``` pkg/asset/ignition/bootstrap/common.go:406:2: ifElseChain: rewrite if-else to switch statement (gocritic) if parentDir == "bin" || parentDir == "dispatcher.d" || parentDir == "system-generators" { ^ ``` --- pkg/asset/ignition/bootstrap/common.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/asset/ignition/bootstrap/common.go b/pkg/asset/ignition/bootstrap/common.go index 13182760d85..fa18d2e4fac 100644 --- a/pkg/asset/ignition/bootstrap/common.go +++ b/pkg/asset/ignition/bootstrap/common.go @@ -438,16 +438,17 @@ func AddStorageFiles(config *igntypes.Config, base string, uri string, templateD var mode int appendToFile := false - if parentDir == "bin" || parentDir == "dispatcher.d" || parentDir == "system-generators" { + switch { + case parentDir == "bin", parentDir == "dispatcher.d", parentDir == "system-generators": mode = 0555 - } else if filename == "motd" || filename == "containers.conf" { + case filename == "motd", filename == "containers.conf": mode = 0644 appendToFile = true - } else if filename == "registries.conf" { + case filename == "registries.conf": // Having the mode be private breaks rpm-ostree, xref // https://github.com/openshift/installer/pull/6789 mode = 0644 - } else { + default: mode = 0600 } ign := ignition.FileFromBytes(strings.TrimSuffix(base, ".template"), "root", mode, data) From ef7700eb038cb0dc98fd066e19b309c38b38b813 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Sun, 12 Jan 2025 20:50:25 -0500 Subject: [PATCH 3/3] bootstrap: drop OKD pivot code This is obsoleted now by `node-image-overlay.target` and its associated units which do a similar thing but apply to both OKD and OCP. 
--- .../usr/local/bin/bootstrap-pivot.sh.template | 119 ------------------ .../units/crio-configure.service.template | 3 - .../common/units/kubelet.service.template | 6 - .../release-image-pivot.service.template | 12 -- 4 files changed, 140 deletions(-) delete mode 100644 data/data/bootstrap/files/usr/local/bin/bootstrap-pivot.sh.template delete mode 100644 data/data/bootstrap/systemd/common/units/release-image-pivot.service.template diff --git a/data/data/bootstrap/files/usr/local/bin/bootstrap-pivot.sh.template b/data/data/bootstrap/files/usr/local/bin/bootstrap-pivot.sh.template deleted file mode 100644 index 92efabe7d07..00000000000 --- a/data/data/bootstrap/files/usr/local/bin/bootstrap-pivot.sh.template +++ /dev/null @@ -1,119 +0,0 @@ -{{if .IsOKD -}} -#!/usr/bin/env bash -set -euo pipefail - -# Exit early if pivot is attempted on SCOS Live ISO -{{if .IsSCOS -}} -source /etc/os-release -if [[ ! $(touch /usr/.test) ]] && [[ ${ID} =~ ^(centos)$ ]]; then - touch /opt/openshift/.pivot-done - exit 0 -fi -{{end -}} - -# Rebase to OKD's OSTree container image. -# This is required in OKD as the node is first provisioned with plain Fedora CoreOS. - -# shellcheck disable=SC1091 -. /usr/local/bin/bootstrap-service-record.sh -. /usr/local/bin/release-image.sh - -# Pivot bootstrap node to OKD's OSTree image -if [ ! -f /opt/openshift/.pivot-done ]; then -{{if .IsFCOS -}} - MACHINE_OS_IMAGE=$(image_for fedora-coreos) -{{else if .IsSCOS -}} - MACHINE_OS_IMAGE=$(image_for stream-coreos) -{{end -}} - echo "Pulling ${MACHINE_OS_IMAGE}..." - while true - do - record_service_stage_start "pull-okd-os-image" - if podman pull --quiet "${MACHINE_OS_IMAGE}" - then - record_service_stage_success - break - else - record_service_stage_failure - echo "Pull failed. Retrying ${MACHINE_OS_IMAGE}..." 
- fi - done - - record_service_stage_start "rebase-to-okd-os-image" -{{if .IsFCOS -}} - mnt="$(podman image mount "${MACHINE_OS_IMAGE}")" - - # The bootstrap host during SNO installation and the rendezvous host of Agent-based Installer both boot into a Live - # ISO which cannot be rebased. Until rpm-ostree supports this live rebase [0], the following workaround will mount the - # proper OKD/FCOS Machine OS image over the existing mount at /usr and copy new config files to /etc. - # [0] https://github.com/coreos/rpm-ostree/issues/4547 - if grep -q coreos.liveiso= /proc/cmdline; then - mount -t tmpfs -o size=50% none /var/mnt/ - rsync -aHAXx "$mnt/" /var/mnt/ - mount -t overlay overlay -o lowerdir=/usr:/var/mnt/usr /usr - rsync -rlt --ignore-existing /var/mnt/etc/ /etc/ - - # Agent-based Installer will launch a ephemeral control plane at the rendezvous host which will create and publish - # Ignition configs for the other master nodes. These Ignition configs must match what the in-cluster control plane - # would generate else machine config operator will fail [0]. Because the rendezvous host is booted with a FCOS Live - # ISO without any OKD/FCOS related changes, we have to copy the manifests from OKD Machine OS manually to the - # bootstrap manifests folder of the rendezvous host. - # [0] https://access.redhat.com/solutions/4970731 - mkdir -p /var/opt/openshift/manifests - cp -av /var/mnt/manifests/*.* /var/opt/openshift/manifests/ - - # Load new systemd unit files and configuration such as crio.service after mounting the content of OKD/FCOS Machine - # OS over /usr and copying new files to /etc - systemctl daemon-reload - - # Apply presets from OKD Machine OS - systemctl preset-all - - # On OKD/FCOS prior to commit e859a66 [0] systemd-resolved is used by default and NetworkManager's DNS handling is - # disabled. 
In this case, CoreDNS fails to listen to 127.0.0.53:53 when Agent-based Installer boots its the - # rendezvous host with a Fedora CoreOS bootimage because by default FCOS' systemd-resolved already listens to this - # port. OKD/FCOS disables resolved's stub listener [1] but the resolved must be restarted for this setting to take - # effect. - # On OKD/FCOS since commit e859a66 [0] systemd-resolved is disabled by default and NetworkManager's DNS handling is - # used. However, the bootimage is vanilla FCOS and thus uses systemd-resolved by default. The latter has to be - # disabled after rebasing to OKD Machine OS and NetworkManager as well as the service to fix /etc/resolv.conf have - # to be started. - # [0] https://github.com/openshift/okd-machine-os/commit/e859a6643330596a8a282aeb4bf853763a2d219e - # [1] https://github.com/openshift/okd-machine-os/blob/28dec35d60ea07069366b22ebdcb296d429b15e9/overlay.d/99okd/etc/systemd/resolved.conf.d/okd-no-dns-stub.conf - if [ -e /etc/systemd/resolved.conf.d/okd-no-dns-stub.conf ]; then - systemctl restart systemd-resolved.service - else - systemctl disable --now systemd-resolved.service - fi - - if systemctl list-unit-files -q fix-resolvconf.service >/dev/null; then - systemctl stop NetworkManager.service - systemctl start fix-resolvconf.service - systemctl start NetworkManager.service - nmcli general reload dns-full - fi - - # Workaround for SELinux denials when launching crio.service from overlayfs - setenforce Permissive - - # crio.service is not part of FCOS but of OKD Machine OS. It will loaded after systemctl daemon-reload above but has - # to be started manually - systemctl start crio.service - - # No reboot necessary because setup will reboot the system automatically - else - pushd "${mnt}/bootstrap" - # shellcheck disable=SC1091 - . 
./pre-pivot.sh - popd - fi - record_service_stage_success -{{else if .IsSCOS -}} - chmod 0644 /etc/containers/registries.conf - rpm-ostree rebase --experimental "ostree-unverified-registry:${MACHINE_OS_IMAGE}" - touch /opt/openshift/.pivot-done - record_service_stage_success - systemctl reboot -{{end -}} -fi -{{end -}} diff --git a/data/data/bootstrap/systemd/common/units/crio-configure.service.template b/data/data/bootstrap/systemd/common/units/crio-configure.service.template index 1ae4af22cf3..9113d6356aa 100644 --- a/data/data/bootstrap/systemd/common/units/crio-configure.service.template +++ b/data/data/bootstrap/systemd/common/units/crio-configure.service.template @@ -2,9 +2,6 @@ Description=Configure CRI-O to use the pause image After=release-image.service Requires=release-image.service -{{if .IsOKD -}} -Requires=release-image-pivot.service -{{end -}} Before=crio.service [Service] diff --git a/data/data/bootstrap/systemd/common/units/kubelet.service.template b/data/data/bootstrap/systemd/common/units/kubelet.service.template index 8058ccbecf6..092d4c8e6e1 100644 --- a/data/data/bootstrap/systemd/common/units/kubelet.service.template +++ b/data/data/bootstrap/systemd/common/units/kubelet.service.template @@ -1,13 +1,7 @@ [Unit] Description=Kubernetes Kubelet Wants=rpc-statd.service crio.service release-image.service -{{if .IsOKD -}} -Wants=release-image-pivot.service -{{end -}} After=crio.service release-image.service -{{if .IsOKD -}} -After=release-image-pivot.service -{{end -}} [Service] Type=notify diff --git a/data/data/bootstrap/systemd/common/units/release-image-pivot.service.template b/data/data/bootstrap/systemd/common/units/release-image-pivot.service.template deleted file mode 100644 index ebe833baf04..00000000000 --- a/data/data/bootstrap/systemd/common/units/release-image-pivot.service.template +++ /dev/null @@ -1,12 +0,0 @@ -{{if .IsOKD -}} -[Unit] -Description=Pivot bootstrap to the OpenShift Release Image -Wants=release-image.service 
-After=release-image.service -Before=bootkube.service kubelet.service dnsmasq.service - -[Service] -Type=oneshot -ExecStart=/usr/local/bin/bootstrap-pivot.sh -RemainAfterExit=true -{{end -}}