From aae14d2a0467dbea6c92ed5a30b77f853cd74bb4 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 31 Aug 2022 11:02:56 -0700 Subject: [PATCH] lib/resourcemerge/core: Reconcile seccompProfile in ensurePodSecurityContext This recently started breaking 4.10-to-4.11-to-4.12 updates [1] with the first failing run [2] like: $ curl -s https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.12-upgrade-from-stable-4.11-from-stable-4.10-e2e-aws-sdn-upgrade/1562409104817786880/artifacts/e2e-aws-sdn-upgrade/gather-extra/artifacts/clusterversion.json | jq -r '.items[].status.conditions[] | .lastTransitionTime + " " + .type + "=" + .status + " " + .reason + ": " + .message' 2022-08-24T12:12:13Z RetrievedUpdates=False NoChannel: The update channel has not been configured. 2022-08-24T12:12:13Z ReleaseAccepted=True PayloadLoaded: Payload loaded version="4.12.0-0.ci-2022-08-24-061508" image="registry.build03.ci.openshift.org/ci-op-2v4w1wbk/release@sha256:37a2aa2a46ed42eedf73cf0ee1ec0f9a37b822823518172c01ef6ad2ba3ffa12" architecture="amd64" 2022-08-24T12:31:04Z Available=True : Done applying 4.11.1 2022-08-24T14:07:56Z Failing=True WorkloadNotProgressing: deployment openshift-operator-lifecycle-manager/package-server-manager has a replica failure FailedCreate: pods "package-server-manager-fbb7cb8b9-5zmwg" is forbidden: violates PodSecurity "restricted:v1.24": seccompProfile (pod or container "package-server-manager" must set securityContext.seccompProfile.type to "RuntimeDefault" or "Localhost") 2022-08-24T13:40:48Z Progressing=True WorkloadNotProgressing: Unable to apply 4.12.0-0.ci-2022-08-24-061508: the workload openshift-operator-lifecycle-manager/package-server-manager cannot roll out 2022-08-24T12:36:55Z ImplicitlyEnabledCapabilities=False AsExpected: Capabilities match configured spec while the previous run [3] had: $ curl -s https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.12-upgrade-from-stable-4.11-from-stable-4.10-e2e-aws-sdn-upgrade/1561684150702837760/artifacts/e2e-aws-sdn-upgrade/gather-extra/artifacts/clusterversion.json | jq -r '.items[].status.conditions[] | .lastTransitionTime + " " + .type + "=" + .status + " " + .reason + ": " + .message' 2022-08-22T12:12:30Z RetrievedUpdates=False NoChannel: The update channel has not been configured. 2022-08-22T12:12:30Z ReleaseAccepted=True PayloadLoaded: Payload loaded version="4.12.0-0.ci-2022-08-22-055706" image="registry.build03.ci.openshift.org/ci-op-5wrvyzw8/release@sha256:42cb26b6290927a1857dfc246336739cebd297c51d9af43f586ceaff9073e825" architecture="amd64" 2022-08-22T12:36:17Z Available=True : Done applying 4.12.0-0.ci-2022-08-22-055706 2022-08-22T14:09:42Z Failing=False : 2022-08-22T14:44:58Z Progressing=False : Cluster version is 4.12.0-0.ci-2022-08-22-055706 2022-08-22T12:40:12Z ImplicitlyEnabledCapabilities=False AsExpected: Capabilities match configured spec Jian Zhang also noticed the breakage in QE [4]. The breaking change was an OLM namespace touch: $ REF_A=https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.12-upgrade-from-stable-4.11-from-stable-4.10-e2e-aws-sdn-upgrade/1561684150702837760/artifacts/release/artifacts/release-images-latest $ REF_B=https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.12-upgrade-from-stable-4.11-from-stable-4.10-e2e-aws-sdn-upgrade/1562409104817786880/artifacts/release/artifacts/release-images-latest $ JQ='[.spec.tags[] | .name + " " + .annotations["io.openshift.build.source-location"] + "/commit/" + .annotations["io.openshift.build.commit.id"]] | sort[]' $ diff -U0 <(curl -s "${REF_A}" | jq -r "${JQ}") <(curl -s "${REF_B}" | jq -r "${JQ}") | grep olm -operator-lifecycle-manager https://github.com/openshift/operator-framework-olm/commit/fd429109b22e8346f75bfcc239088e6980e72c47 -operator-registry https://github.com/openshift/operator-framework-olm/commit/fd429109b22e8346f75bfcc239088e6980e72c47 +operator-lifecycle-manager https://github.com/openshift/operator-framework-olm/commit/f8c466aeea67c2e826575d83243b37018268cd02 +operator-registry https://github.com/openshift/operator-framework-olm/commit/f8c466aeea67c2e826575d83243b37018268cd02 $ git clone https://github.com/openshift/operator-framework-olm $ cd operator-framework-olm $ git --no-pager log --oneline fd429109b22e8..f8c466aeea67 | grep namespace be84d80d (psa) restrict olm namespace + remove labels from openshift-operators ns $ git --no-pager show --oneline be84d80d -- manifests be84d80d (psa) restrict olm namespace + remove labels from openshift-operators ns diff --git a/manifests/0000_50_olm_00-namespace.yaml b/manifests/0000_50_olm_00-namespace.yaml index 8fffa527..168e8867 100644 --- a/manifests/0000_50_olm_00-namespace.yaml +++ b/manifests/0000_50_olm_00-namespace.yaml @@ -3,6 +3,8 @@ kind: Namespace metadata: name: openshift-operator-lifecycle-manager labels: + pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce-version: "v1.24" openshift.io/scc: "anyuid" openshift.io/cluster-monitoring: "true" annotations: @@ -16,7 +18,7 @@ kind: Namespace metadata: name: openshift-operators labels: - pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/enforce: privileged pod-security.kubernetes.io/enforce-version: "v1.24" openshift.io/scc: "anyuid" annotations: which went out with [5], and will not be backported to 4.11. The relevant seccompProfile change landed in 4.11 (bot not in 4.10), which is why born-in-4.11 clusters are not impacted by the change: $ git --no-pager grep seccompProfile: origin/release-4.11 -- manifests origin/release-4.11:manifests/0000_50_olm_00-clusterserviceversions.crd.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_00-clusterserviceversions.crd.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_00-clusterserviceversions.crd.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_00-clusterserviceversions.crd.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_06-psm-operator.deployment.ibm-cloud-managed.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_06-psm-operator.deployment.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_07-collect-profiles.cronjob.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_07-olm-operator.deployment.ibm-cloud-managed.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_07-olm-operator.deployment.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_08-catalog-operator.deployment.ibm-cloud-managed.yaml: seccompProfile: origin/release-4.11:manifests/0000_50_olm_08-catalog-operator.deployment.yaml: seccompProfile: $ git --no-pager grep seccompProfile: origin/release-4.10 -- manifests origin/release-4.10:manifests/0000_50_olm_00-clusterserviceversions.crd.yaml: seccompProfile: origin/release-4.10:manifests/0000_50_olm_00-clusterserviceversions.crd.yaml: seccompProfile: origin/release-4.10:manifests/0000_50_olm_00-clusterserviceversions.crd.yaml: seccompProfile: origin/release-4.10:manifests/0000_50_olm_00-clusterserviceversions.crd.yaml: seccompProfile: The property itself is from Kubernetes 1.19: $ git clone https://github.com/kubernetes/api $ cd api $ git blame origin/release-1.19 -- core/v1/types.go | grep '^type .* struct\|SeccompProfile.*seccompProfile' | grep -B1 SeccompProfile 20b270679a (Paulo Gomes 2020-06-24 21:37:49 +0100 3306) SeccompProfile *SeccompProfile `json:"seccompProfile,omitempty" protobuf:"bytes,10,opt,name=seccompProfile"` 20b270679a (Paulo Gomes 2020-06-24 21:37:49 +0100 5979) SeccompProfile *SeccompProfile `json:"seccompProfile,omitempty" protobuf:"bytes,11,opt,name=seccompProfile"` $ git blame origin/release-1.18 -- core/v1/types.go | grep 'SeccompProfile.*seccompProfile' ...no hits... So the fact that the CVO was not reconciling it would technically be a bug since OpenShift 4.6, which shipped Kubernetes 1.19. But auditing the most recent named release in each minor branch: $ oc adm release extract --to 4.6 quay.io/openshift-release-dev/ocp-release:4.6.60-x86_64 $ oc adm release extract --to 4.7 quay.io/openshift-release-dev/ocp-release:4.7.56-x86_64 $ oc adm release extract --to 4.8 quay.io/openshift-release-dev/ocp-release:4.8.48-x86_64 $ oc adm release extract --to 4.9 quay.io/openshift-release-dev/ocp-release:4.9.47-x86_64 $ oc adm release extract --to 4.10 quay.io/openshift-release-dev/ocp-release:4.10.30-x86_64 $ grep -lr seccompProfile: 4.* 4.10/0000_50_cluster-monitoring-operator_00_0alertmanager-custom-resource-definition.yaml 4.10/0000_50_cluster-monitoring-operator_00_0prometheus-custom-resource-definition.yaml 4.10/0000_50_cluster-monitoring-operator_00_0thanosruler-custom-resource-definition.yaml 4.10/0000_50_olm_00-clusterserviceversions.crd.yaml 4.7/0000_50_olm_00-clusterserviceversions.crd.yaml 4.8/0000_50_olm_00-clusterserviceversions.crd.yaml 4.9/0000_50_olm_00-clusterserviceversions.crd.yaml And we have sound CRD reconciliation. So at the moment, the only manifests asking for seccompProfile where we'd be worried about a lack of CVO reconciliation are 4.11 and later. In this commit, I'm adding seccompProfile reconciliation. There are quite a few other properties within (Pod)SecurityContext that the CVO is still ignoring, and I'm unclear on why we have been using per-property merging instead of DeepEqual. But I'm going to punt on a broader move to DeepEqual for now, to make the narrow fix that should unstick 4.10-to-4.11-to-4.12 updates. [1]: https://testgrid.k8s.io/redhat-openshift-ocp-release-4.12-informing#periodic-ci-openshift-release-master-ci-4.12-upgrade-from-stable-4.11-from-stable-4.10-e2e-aws-sdn-upgrade [2]: https://prow.ci.openshift.org/view/gs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.12-upgrade-from-stable-4.11-from-stable-4.10-e2e-aws-sdn-upgrade/1562409104817786880 [3]: https://prow.ci.openshift.org/view/gs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.12-upgrade-from-stable-4.11-from-stable-4.10-e2e-aws-sdn-upgrade/1561684150702837760 [4]: https://issues.redhat.com/browse/OCPBUGS-575 [5]: https://github.com/openshift/operator-framework-olm/pull/367 --- lib/resourcemerge/core.go | 26 ++++++++++++++++++++++++++ lib/resourcemerge/core_test.go | 6 ++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/lib/resourcemerge/core.go b/lib/resourcemerge/core.go index da4c50f7c..bd24ee462 100644 --- a/lib/resourcemerge/core.go +++ b/lib/resourcemerge/core.go @@ -488,6 +488,7 @@ func ensureSecurityContext(modified *bool, existing *corev1.SecurityContext, req setBoolPtr(modified, &existing.RunAsNonRoot, required.RunAsNonRoot) setBoolPtr(modified, &existing.ReadOnlyRootFilesystem, required.ReadOnlyRootFilesystem) setBoolPtr(modified, &existing.AllowPrivilegeEscalation, required.AllowPrivilegeEscalation) + ensureSeccompProfilePtr(modified, &existing.SeccompProfile, required.SeccompProfile) } func ensureCapabilitiesPtr(modified *bool, existing **corev1.Capabilities, required *corev1.Capabilities) { @@ -537,6 +538,30 @@ func ensureCapabilities(modified *bool, existing *corev1.Capabilities, required } } +func ensureSeccompProfilePtr(modified *bool, existing **corev1.SeccompProfile, required *corev1.SeccompProfile) { + if *existing == nil && required == nil { + return + } + + // Check if we can simply set to required. This can be done if existing is not set or it is set + // but required is not set. + if *existing == nil || (required == nil && *existing != nil) { + *modified = true + *existing = required + return + } + ensureSeccompProfile(modified, *existing, *required) +} + +func ensureSeccompProfile(modified *bool, existing *corev1.SeccompProfile, required corev1.SeccompProfile) { + if equality.Semantic.DeepEqual(existing, required) { + return + } + + *modified = true + *existing = required +} + func setStringSlice(modified *bool, existing *[]string, required []string) { if !reflect.DeepEqual(required, *existing) { *existing = required @@ -640,6 +665,7 @@ func ensurePodSecurityContext(modified *bool, existing *corev1.PodSecurityContex setInt64Ptr(modified, &existing.RunAsUser, required.RunAsUser) setInt64Ptr(modified, &existing.RunAsGroup, required.RunAsGroup) setBoolPtr(modified, &existing.RunAsNonRoot, required.RunAsNonRoot) + ensureSeccompProfilePtr(modified, &existing.SeccompProfile, required.SeccompProfile) // any SupplementalGroups we specify, we require. for _, required := range required.SupplementalGroups { diff --git a/lib/resourcemerge/core_test.go b/lib/resourcemerge/core_test.go index 82382c706..641ef144e 100644 --- a/lib/resourcemerge/core_test.go +++ b/lib/resourcemerge/core_test.go @@ -69,12 +69,14 @@ func TestEnsurePodSpec(t *testing.T) { RunAsGroup: int64Ptr(int64(1234))}}, input: corev1.PodSpec{ SecurityContext: &corev1.PodSecurityContext{RunAsNonRoot: boolPtr(false), - RunAsGroup: int64Ptr(int64(5))}}, + RunAsGroup: int64Ptr(int64(5)), + SeccompProfile: &corev1.SeccompProfile{Type: corev1.SeccompProfileTypeRuntimeDefault}}}, expectedModified: true, expected: corev1.PodSpec{ SecurityContext: &corev1.PodSecurityContext{RunAsNonRoot: boolPtr(false), - RunAsGroup: int64Ptr(int64(5))}}, + RunAsGroup: int64Ptr(int64(5)), + SeccompProfile: &corev1.SeccompProfile{Type: corev1.SeccompProfileTypeRuntimeDefault}}}, }, { name: "container SecurityContext none",