diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index ab4d540a2f..9f7e1e3ab4 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -262,6 +262,7 @@ func (optr *Operator) InitializeFromPayload(restConfig *rest.Config, burstRestCo Duration: time.Second * 10, Factor: 1.3, Steps: 3, + Cap: time.Second * 15, }, optr.exclude, optr.eventRecorder, diff --git a/pkg/cvo/internal/operatorstatus.go b/pkg/cvo/internal/operatorstatus.go index 5c76a5eacb..4c0b76a7be 100644 --- a/pkg/cvo/internal/operatorstatus.go +++ b/pkg/cvo/internal/operatorstatus.go @@ -136,13 +136,7 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp actual, err := client.Get(ctx, expected.Name) if err != nil { - return &payload.UpdateError{ - Nested: err, - UpdateEffect: payload.UpdateEffectNone, - Reason: "ClusterOperatorNotAvailable", - Message: fmt.Sprintf("Cluster operator %s has not yet reported success", expected.Name), - Name: expected.Name, - } + return err } // undone is a sorted slice of transition messages for incomplete operands. 
diff --git a/pkg/cvo/sync_worker.go b/pkg/cvo/sync_worker.go
index 6c2d98fe0e..169186fdd3 100644
--- a/pkg/cvo/sync_worker.go
+++ b/pkg/cvo/sync_worker.go
@@ -678,7 +678,7 @@ func (w *SyncWorker) apply(ctx context.Context, payloadUpdate *payload.Update, w
 	var tasks []*payload.Task
 	backoff := w.backoff
 	if backoff.Steps > 1 && work.State == payload.InitializingPayload {
-		backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second}
+		backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second, Cap: 15 * time.Second}
 	}
 	for i := range payloadUpdate.Manifests {
 		tasks = append(tasks, &payload.Task{
diff --git a/pkg/payload/task.go b/pkg/payload/task.go
index 91bc3110a2..e0a22691b4 100644
--- a/pkg/payload/task.go
+++ b/pkg/payload/task.go
@@ -100,46 +100,43 @@ func (st *Task) String() string {
 func (st *Task) Run(ctx context.Context, version string, builder ResourceBuilder, state State) error {
 	var lastErr error
 	backoff := st.Backoff
-	maxDuration := 15 * time.Second // TODO: fold back into Backoff in 1.13
-	for {
-		// attempt the apply, waiting as long as necessary
-		err := builder.Apply(ctx, st.Manifest, state)
+	err := wait.ExponentialBackoffWithContext(ctx, backoff, func() (done bool, err error) {
+		err = builder.Apply(ctx, st.Manifest, state)
 		if err == nil {
-			return nil
+			return true, nil
 		}
 
 		lastErr = err
 		utilruntime.HandleError(errors.Wrapf(err, "error running apply for %s", st))
 		metricPayloadErrors.WithLabelValues(version).Inc()
-
-		// TODO: this code will become easier in Kube 1.13 because Backoff now supports max
-		d := time.Duration(float64(backoff.Duration) * backoff.Factor)
-		if d > maxDuration {
-			d = maxDuration
-		}
-		d = wait.Jitter(d, backoff.Jitter)
-
-		// sleep or wait for cancellation
-		select {
-		case <-time.After(d):
-			continue
-		case <-ctx.Done():
-			if uerr, ok := lastErr.(*UpdateError); ok {
-				uerr.Task = st.Copy()
-				return uerr
-			}
-			reason, cause := reasonForPayloadSyncError(lastErr)
-			if len(cause) > 0 {
-				cause = ": " + cause
-			}
-			return &UpdateError{
-				Nested: lastErr,
-				Reason: reason,
-				Message: fmt.Sprintf("Could not update %s%s", st, cause),
-
-				Task: st.Copy(),
-			}
-		}
+		// Inspect the error from this attempt (not a previous one) so that an
+		// UpdateError aborts the backoff immediately instead of one retry late.
+		if updateErr, ok := err.(*UpdateError); ok {
+			updateErr.Task = st.Copy()
+			return false, updateErr // failing fast for UpdateError
+		}
+		return false, nil
+	})
+	// A successful apply wins, even when earlier attempts recorded an error.
+	if err == nil {
+		return nil
+	}
+	// Prefer the last apply error over the generic timeout/cancellation error.
+	if lastErr != nil {
+		err = lastErr
+	}
+	if _, ok := err.(*UpdateError); ok {
+		return err
+	}
+	reason, cause := reasonForPayloadSyncError(err)
+	if len(cause) > 0 {
+		cause = ": " + cause
+	}
+	return &UpdateError{
+		Nested:  err,
+		Reason:  reason,
+		Message: fmt.Sprintf("Could not update %s%s", st, cause),
+		Task:    st.Copy(),
 	}
 }
 
@@ -177,7 +174,7 @@ func (e *UpdateError) Cause() error {
 	return e.Nested
 }
 
-// reasonForUpdateError provides a succint explanation of a known error type for use in a human readable
+// reasonForPayloadSyncError provides a succinct explanation of a known error type for use in a human readable
 // message during update. Since all objects in the image should be successfully applied, messages
 // should direct the reader (likely a cluster administrator) to a possible cause in their own config.
 func reasonForPayloadSyncError(err error) (string, string) {