pkg/payload/task: Fail fast for UpdateError
Things like API-server connection errors and patch conflicts deserve
some retries before we bubble them up into ClusterVersion conditions.
But when we are able to retrieve in-cluster objects and determine that
they are not happy, we should exit more quickly so we can complain
about the resource state and start in on the next sync cycle.  For
example, see the recent e02d148 (pkg/cvo/internal/operatorstatus:
Replace wait-for with single-shot "is it alive now?", 2021-05-12, #560)
and the older cc9292a (lib/resourcebuilder: Replace wait-for with
single-shot "is it alive now?", 2020-07-07, #400).  This commit uses
the presence of an UpdateError as a marker for "fail fast; no need to
retry".

The install-time backoff is from fee2d06 (sync: Completely
parallelize the initial payload, 2019-03-11, #136).  I'm not sure if
it really wants the same cap as reconcile and update modes, but I've
left them the same for now, and future commits to pivot the backoff
settings can focus on motivating those pivots.
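
For reference, Cap clamps the delays that Step() computes once the
exponential growth would exceed it. A hypothetical standalone snippet with
the install-time settings (output assumes zero Jitter):

  package main

  import (
      "fmt"
      "time"

      "k8s.io/apimachinery/pkg/util/wait"
  )

  func main() {
      backoff := wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second, Cap: 15 * time.Second}
      for i := 0; i < 6; i++ {
          // 1s, 2s, 4s, 8s, then 15s from there on out: Cap clamps the growth.
          fmt.Println(backoff.Step())
      }
  }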

I'd tried dropping:

  backoff := st.Backoff

and passing st.Backoff directly to ExponentialBackoffWithContext, but
it turns out that Step() [1]:

  ... mutates the provided Backoff to update its Steps and Duration.

Luckily, Backoff has no pointer properties, so storing as a local
variable is sufficient to give us a fresh copy for the local
mutations.

[1]: https://pkg.go.dev/k8s.io/apimachinery/pkg/util/wait#Backoff.Step
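
A standalone illustration of the value-copy point (assuming the same
k8s.io/apimachinery/pkg/util/wait package):

  package main

  import (
      "fmt"
      "time"

      "k8s.io/apimachinery/pkg/util/wait"
  )

  func main() {
      original := wait.Backoff{Steps: 3, Factor: 2, Duration: time.Second}

      backoff := original // plain assignment copies every field; none are pointers
      backoff.Step()      // mutates the copy's Steps and Duration...

      fmt.Println(original.Steps, original.Duration) // 3 1s ...but the original is untouched
      fmt.Println(backoff.Steps, backoff.Duration)   // 2 2s
  }
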
wking committed May 25, 2021
commit 6c7cd992c64b7f2d421898d203528660cee9853c
pkg/cvo/cvo.go (1 addition, 0 deletions)
@@ -262,6 +262,7 @@ func (optr *Operator) InitializeFromPayload(restConfig *rest.Config, burstRestCo
             Duration: time.Second * 10,
             Factor:   1.3,
             Steps:    3,
+            Cap:      time.Second * 15,
         },
         optr.exclude,
         optr.eventRecorder,
pkg/cvo/internal/operatorstatus.go (1 addition, 7 deletions)
@@ -136,13 +136,7 @@ func checkOperatorHealth(ctx context.Context, client ClusterOperatorsGetter, exp
 
     actual, err := client.Get(ctx, expected.Name)
     if err != nil {
-        return &payload.UpdateError{
-            Nested:       err,
-            UpdateEffect: payload.UpdateEffectNone,
-            Reason:       "ClusterOperatorNotAvailable",
-            Message:      fmt.Sprintf("Cluster operator %s has not yet reported success", expected.Name),
-            Name:         expected.Name,
-        }
+        return err
     }
 
     // undone is a sorted slice of transition messages for incomplete operands.
pkg/cvo/sync_worker.go (1 addition, 1 deletion)
@@ -678,7 +678,7 @@ func (w *SyncWorker) apply(ctx context.Context, payloadUpdate *payload.Update, w
     var tasks []*payload.Task
     backoff := w.backoff
     if backoff.Steps > 1 && work.State == payload.InitializingPayload {
-        backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second}
+        backoff = wait.Backoff{Steps: 4, Factor: 2, Duration: time.Second, Cap: 15 * time.Second}
     }
     for i := range payloadUpdate.Manifests {
         tasks = append(tasks, &payload.Task{
pkg/payload/task.go (28 additions, 35 deletions)
@@ -100,46 +100,39 @@ func (st *Task) String() string {
 func (st *Task) Run(ctx context.Context, version string, builder ResourceBuilder, state State) error {
     var lastErr error
     backoff := st.Backoff
-    maxDuration := 15 * time.Second // TODO: fold back into Backoff in 1.13
-    for {
-        // attempt the apply, waiting as long as necessary
-        err := builder.Apply(ctx, st.Manifest, state)
+    err := wait.ExponentialBackoffWithContext(ctx, backoff, func() (done bool, err error) {
+        err = builder.Apply(ctx, st.Manifest, state)
         if err == nil {
-            return nil
+            return true, nil
         }
+        if updateErr, ok := lastErr.(*UpdateError); ok {
+            updateErr.Task = st.Copy()
+            return false, updateErr // failing fast for UpdateError
+        }
 
         lastErr = err
         utilruntime.HandleError(errors.Wrapf(err, "error running apply for %s", st))
         metricPayloadErrors.WithLabelValues(version).Inc()
-
-        // TODO: this code will become easier in Kube 1.13 because Backoff now supports max
-        d := time.Duration(float64(backoff.Duration) * backoff.Factor)
-        if d > maxDuration {
-            d = maxDuration
-        }
-        d = wait.Jitter(d, backoff.Jitter)
-
-        // sleep or wait for cancellation
-        select {
-        case <-time.After(d):
-            continue
-        case <-ctx.Done():
-            if uerr, ok := lastErr.(*UpdateError); ok {
-                uerr.Task = st.Copy()
-                return uerr
-            }
-            reason, cause := reasonForPayloadSyncError(lastErr)
-            if len(cause) > 0 {
-                cause = ": " + cause
-            }
-            return &UpdateError{
-                Nested:  lastErr,
-                Reason:  reason,
-                Message: fmt.Sprintf("Could not update %s%s", st, cause),
-
-                Task: st.Copy(),
-            }
-        }
-    }
+        return false, nil
+    })
+    if lastErr != nil {
+        err = lastErr
+    }
+    if err == nil {
+        return nil
+    }
+    if _, ok := err.(*UpdateError); ok {
+        return err
+    }
+    reason, cause := reasonForPayloadSyncError(err)
+    if len(cause) > 0 {
+        cause = ": " + cause
+    }
+    return &UpdateError{
+        Nested:  err,
+        Reason:  reason,
+        Message: fmt.Sprintf("Could not update %s%s", st, cause),
+        Task:    st.Copy(),
+    }
 }
 

@@ -177,7 +170,7 @@ func (e *UpdateError) Cause() error {
     return e.Nested
 }
 
-// reasonForUpdateError provides a succint explanation of a known error type for use in a human readable
+// reasonForPayloadSyncError provides a succinct explanation of a known error type for use in a human readable
 // message during update. Since all objects in the image should be successfully applied, messages
 // should direct the reader (likely a cluster administrator) to a possible cause in their own config.
 func reasonForPayloadSyncError(err error) (string, string) {
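
To make the wrapper's control flow above concrete: with the apimachinery of
that era (condition functions did not yet take a context argument), a
non-nil error from the condition stops ExponentialBackoffWithContext
immediately, while running out of steps on (false, nil) yields
wait.ErrWaitTimeout, which is why Run prefers lastErr when it has one. A
hypothetical standalone sketch:

  package main

  import (
      "context"
      "fmt"
      "time"

      "k8s.io/apimachinery/pkg/util/wait"
  )

  func main() {
      backoff := wait.Backoff{Steps: 3, Factor: 2, Duration: 10 * time.Millisecond}

      // A non-nil error from the condition ends the loop at once: the
      // fail-fast path that Run uses for UpdateError.
      err := wait.ExponentialBackoffWithContext(context.Background(), backoff, func() (bool, error) {
          return false, fmt.Errorf("unhappy cluster operator")
      })
      fmt.Println(err) // "unhappy cluster operator", after a single attempt

      // Steps exhausted on (false, nil) yields ErrWaitTimeout, so Run
      // swaps in the more informative lastErr before wrapping it.
      err = wait.ExponentialBackoffWithContext(context.Background(), backoff, func() (bool, error) {
          return false, nil
      })
      fmt.Println(err == wait.ErrWaitTimeout) // true
  }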