From 2cabf1899c7a470216295eb1aabcf069d2084dca Mon Sep 17 00:00:00 2001 From: Sonu Kumar Singh Date: Fri, 18 Jul 2025 12:24:30 +0530 Subject: [PATCH 1/4] Add taint for critical components not ready after update After a successful in-place update, the controller now adds the 'critical components not ready' taint to the node. This prevents pods from being scheduled until critical component pods are ready, improving node readiness handling. --- pkg/controller/deployment_inplace.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/controller/deployment_inplace.go b/pkg/controller/deployment_inplace.go index 12188844a..373c66ceb 100644 --- a/pkg/controller/deployment_inplace.go +++ b/pkg/controller/deployment_inplace.go @@ -193,6 +193,13 @@ func (dc *controller) syncMachineSets(ctx context.Context, oldMachineSets []*v1a // uncordon the node since the inplace update is successful. node.Spec.Unschedulable = false + // add the critical components not ready taint to the node this is to ensure that + // the pods are not scheduled on the node until the critical components pods are ready. + node.Spec.Taints = append(node.Spec.Taints, v1.Taint{ + Key: machineutils.TaintNodeCriticalComponentsNotReady, + Effect: v1.TaintEffectNoSchedule, + }) + _, err = dc.targetCoreClient.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) if err != nil { return fmt.Errorf("failed to remove inplace labels/annotations and uncordon node %s: %w", node.Name, err) From 77c6ca203352b71b175c7a6111635d949b183cf9 Mon Sep 17 00:00:00 2001 From: Sonu Kumar Singh Date: Fri, 18 Jul 2025 13:04:48 +0530 Subject: [PATCH 2/4] Fix spelling --- pkg/util/provider/machinecontroller/controller.go | 2 +- pkg/util/provider/machinecontroller/machine_util.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/util/provider/machinecontroller/controller.go b/pkg/util/provider/machinecontroller/controller.go index ab63d84b4..2727ecb4c 100644 --- a/pkg/util/provider/machinecontroller/controller.go +++ b/pkg/util/provider/machinecontroller/controller.go @@ -44,7 +44,7 @@ import ( ) const ( - // MCMFinalizerName is the finalizer used to tag dependecies before deletion + // MCMFinalizerName is the finalizer used to tag dependencies before deletion // of the object. This finalizer is carried over from the MCM MCMFinalizerName = "machine.sapcloud.io/machine-controller-manager" // MCFinalizerName is the finalizer created for the external diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 29125113a..3f554e03d 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -954,8 +954,8 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph // if the label update successful or failed, then skip the timeout check if node != nil && metav1.HasLabel(node.ObjectMeta, v1alpha1.LabelKeyNodeUpdateResult) { if node.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateSuccessful && clone.Status.CurrentStatus.Phase != v1alpha1.MachineInPlaceUpdateSuccessful { - description = fmt.Sprintf("Machine %s successfully updated dependecies", machine.Name) - klog.V(2).Infof("%s with backing node %q and providerID %q sucessfully update the dependecies", description, getNodeName(machine), getProviderID(machine)) + description = fmt.Sprintf("Machine %s successfully updated dependencies", machine.Name) + klog.V(2).Infof("%s with backing node %q and providerID %q sucessfully update the dependencies", description, getNodeName(machine), getProviderID(machine)) clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ Phase: v1alpha1.MachineInPlaceUpdateSuccessful, LastUpdateTime: metav1.Now(), @@ -968,8 +968,8 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph } cloneDirty = true } else if node.Labels[v1alpha1.LabelKeyNodeUpdateResult] == v1alpha1.LabelValueNodeUpdateFailed && clone.Status.CurrentStatus.Phase != v1alpha1.MachineInPlaceUpdateFailed { - description = fmt.Sprintf("Machine %s failed to update dependecies: %s", machine.Name, node.Annotations[v1alpha1.AnnotationKeyMachineUpdateFailedReason]) - klog.V(2).Infof("%s with backing node %q and providerID %q failed to update dependecies", description, getNodeName(machine), getProviderID(machine)) + description = fmt.Sprintf("Machine %s failed to update dependencies: %s", machine.Name, node.Annotations[v1alpha1.AnnotationKeyMachineUpdateFailedReason]) + klog.V(2).Infof("%s with backing node %q and providerID %q failed to update dependencies", description, getNodeName(machine), getProviderID(machine)) clone.Status.CurrentStatus = v1alpha1.CurrentStatus{ Phase: v1alpha1.MachineInPlaceUpdateFailed, LastUpdateTime: metav1.Now(), From 329f0301581f1889cf9f2d435d9353c1b021c997 Mon Sep 17 00:00:00 2001 From: Sonu Kumar Singh Date: Fri, 18 Jul 2025 13:12:19 +0530 Subject: [PATCH 3/4] Remove PreferNoSchedule taint after inplace update After a successful inplace update, the PreferNoSchedule taint is now removed from the node if present. This ensures that nodes are properly untainted and available for scheduling as expected. --- pkg/controller/deployment_inplace.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/controller/deployment_inplace.go b/pkg/controller/deployment_inplace.go index 373c66ceb..93054a501 100644 --- a/pkg/controller/deployment_inplace.go +++ b/pkg/controller/deployment_inplace.go @@ -8,6 +8,7 @@ import ( "context" "fmt" "maps" + "slices" "sort" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" @@ -69,7 +70,7 @@ func (dc *controller) rolloutInPlace(ctx context.Context, d *v1alpha1.MachineDep oldMachineSets, &v1.Taint{ Key: PreferNoScheduleKey, Value: "True", - Effect: "PreferNoSchedule", + Effect: v1.TaintEffectPreferNoSchedule, }, ) if err != nil { @@ -193,6 +194,11 @@ func (dc *controller) syncMachineSets(ctx context.Context, oldMachineSets []*v1a // uncordon the node since the inplace update is successful. node.Spec.Unschedulable = false + // remove the PreferNoSchedule taint if it exists which was added during the inplace update. + node.Spec.Taints = slices.DeleteFunc(node.Spec.Taints, func(t v1.Taint) bool { + return t.Key == PreferNoScheduleKey && t.Value == "True" && t.Effect == v1.TaintEffectPreferNoSchedule + }) + // add the critical components not ready taint to the node this is to ensure that // the pods are not scheduled on the node until the critical components pods are ready. node.Spec.Taints = append(node.Spec.Taints, v1.Taint{ From 8bf1c550b605dd1ec4068c433c3f9e4d573a3603 Mon Sep 17 00:00:00 2001 From: Sonu Kumar Singh Date: Fri, 18 Jul 2025 15:30:30 +0530 Subject: [PATCH 4/4] Address Review --- pkg/controller/deployment_inplace.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/controller/deployment_inplace.go b/pkg/controller/deployment_inplace.go index 93054a501..914f707df 100644 --- a/pkg/controller/deployment_inplace.go +++ b/pkg/controller/deployment_inplace.go @@ -199,8 +199,8 @@ func (dc *controller) syncMachineSets(ctx context.Context, oldMachineSets []*v1a return t.Key == PreferNoScheduleKey && t.Value == "True" && t.Effect == v1.TaintEffectPreferNoSchedule }) - // add the critical components not ready taint to the node this is to ensure that - // the pods are not scheduled on the node until the critical components pods are ready. + // add the critical components not ready taint to the node. This is to ensure that + // workload pods are not scheduled on the node until the critical components pods are ready. node.Spec.Taints = append(node.Spec.Taints, v1.Taint{ Key: machineutils.TaintNodeCriticalComponentsNotReady, Effect: v1.TaintEffectNoSchedule,