diff --git a/pkg/controller/deployment_machineset_util.go b/pkg/controller/deployment_machineset_util.go index f157ff56d..ee2250451 100644 --- a/pkg/controller/deployment_machineset_util.go +++ b/pkg/controller/deployment_machineset_util.go @@ -126,13 +126,11 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al } } - // Update the FailedMachines field only if we see new failures - // Clear FailedMachines if ready replicas equals total replicas, - // which means the machineset doesn't have any machine objects which are in any failed state - // #nosec G115 -- number of machines will not exceed MaxInt32 + // Update the FailedMachines field when we see new failures + // Clear FailedMachines if there are no failed machines. if len(failedMachines) > 0 { newStatus.FailedMachines = &failedMachines - } else if int32(readyReplicasCount) == is.Status.Replicas { + } else { newStatus.FailedMachines = nil } diff --git a/pkg/controller/metrics.go b/pkg/controller/metrics.go index f78e5f120..c46cffac2 100644 --- a/pkg/controller/metrics.go +++ b/pkg/controller/metrics.go @@ -119,6 +119,11 @@ func updateMachineSetStatusRelatedMetric(machineSet *v1alpha1.MachineSet, msMeta } func updateMachineSetStatusFailedMachinesMetric(machineSet *v1alpha1.MachineSet, msMeta metav1.ObjectMeta) { + metrics.MachineSetStatusFailedMachines.DeletePartialMatch(prometheus.Labels{ + "name": msMeta.Name, + "namespace": msMeta.Namespace, + }) + if machineSet.Status.FailedMachines != nil { for _, failedMachine := range *machineSet.Status.FailedMachines { metrics.MachineSetStatusFailedMachines.With(prometheus.Labels{