From cc1921d186252bd1bd02398481be32eb08312b83 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Mon, 3 Aug 2020 13:11:41 -0700
Subject: [PATCH 1/4] pkg/start: Release leader lease on graceful shutdown

So the incoming cluster-version operator doesn't need to wait for the outgoing operator's lease to expire, which can take a while [1]:

  I0802 10:06:01.056591 1 leaderelection.go:243] attempting to acquire leader lease openshift-cluster-version/version...
  ...
  I0802 10:07:42.632719 1 leaderelection.go:253] successfully acquired lease openshift-cluster-version/version

and time out the:

  Cluster did not acknowledge request to upgrade in a reasonable time

testcase [2].

Using ReleaseOnCancel has been the plan since 2b81f47626 (cvo: Release our leader lease when we are gracefully terminated, 2019-01-16, #87). I'm not clear on why it (sometimes?) doesn't work today.

The discrepancy between the "exit after 2s no matter what" comment and the 5s After dates back to dbedb7accb (cvo: When the CVO restarts, perform one final sync to write status, 2019-04-27, #179), which bumped the After from 2s to 5s, but forgot to bump the comment. I'm removing that code here in favor of the two-minute timeout from b30aa0ecdb (pkg/cvo/metrics: Graceful server shutdown, 2020-04-15, #349). We still exit immediately on a second TERM, for folks who get impatient waiting for the graceful timeout.

Decouple shutdownContext from the context passed into Options.run, to allow TestIntegrationCVO_gracefulStepDown to request a graceful shutdown. And remove Context.Start(), inlining the logic in Options.run so we can count and reap the goroutines it used to launch. This also allows us to be more targeted with the context for each goroutine:

* Informers are now launched before the lease controller, so they're up and running by the time we acquire the lease. They remain running until the main operator CVO.Run() exits, after which we shut them down. Having informers running before we have a lease is somewhat expensive in terms of API traffic, but we should rarely have two CVO pods competing for leadership since we transitioned to the Recreate Deployment strategy in 078686d73b (install/0000_00_cluster-version-operator_03_deployment: Set 'strategy: Recreate', 2019-03-20, #140) and 5d8a5275a3 (install/0000_00_cluster-version-operator_03_deployment: Fix Recreate strategy, 2019-04-03, #155). I don't see a way to block on their internal goroutine's completion, but maybe informers will grow an API for that in the future.

* The metrics server also continues to run until CVO.Run() exits, where previously we began gracefully shutting it down at the same time we started shutting down CVO.Run(). This ensures we are around and publishing any last-minute CVO.Run() changes.

* Leader election also continues to run until CVO.Run() exits. We don't want to release the lease while we're still controlling things.

* CVO.Run() and AutoUpdate.Run() both stop immediately when the passed-in context is canceled or we call runCancel internally (because of a TERM, error from a goroutine, or loss of leadership). These are the only two goroutines that are actually writing to the API servers, so we want to shut them down as quickly as possible.

Drop an unnecessary runCancel() from the "shutting down" branch of the error collector. I'd added it in b30aa0ecdb, but you can only ever get into the "shutting down" branch if runCancel has already been called.
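A minimal, self-contained sketch of that collection flow, assuming a placeholder worker and a one-second trigger standing in for the real controllers and the TERM handler (the actual wiring is in the diff below):

package main

import (
	"context"
	"fmt"
	"time"
)

// asyncResult mirrors the named-result struct the patch introduces, so the
// collector can report which goroutine it reaped.
type asyncResult struct {
	name  string
	error error
}

func main() {
	// runContext governs the goroutines that write to the API servers; it is
	// canceled first, on TERM, on an internal error, or on loss of leadership.
	runContext, runCancel := context.WithCancel(context.Background())
	defer runCancel()
	// shutdownContext outlives runContext so late work can still wind down
	// gracefully after the controllers stop.
	shutdownContext, shutdownCancel := context.WithCancel(context.Background())
	defer shutdownCancel()
	_ = shutdownContext // the real code hands this to the metrics server

	resultChannel := make(chan asyncResult, 1)
	resultChannelCount := 1
	go func() { // placeholder for CVO.Run, AutoUpdate.Run, RunMetrics, ...
		<-runContext.Done()
		resultChannel <- asyncResult{name: "main operator"}
	}()

	go func() { // placeholder for the TERM handler
		time.Sleep(time.Second)
		runCancel()
	}()

	var shutdownTimer *time.Timer // scoped outside the loop
	for resultChannelCount > 0 {
		if shutdownTimer == nil { // running
			select {
			case <-runContext.Done():
				shutdownTimer = time.NewTimer(2 * time.Minute)
			case result := <-resultChannel:
				resultChannelCount--
				fmt.Printf("collected %s goroutine: %v\n", result.name, result.error)
				if result.error != nil {
					runCancel() // an error anywhere begins the shutdown
				}
			}
		} else { // shutting down
			select {
			case <-shutdownTimer.C:
				shutdownCancel() // give up on any stragglers
			case result := <-resultChannel:
				resultChannelCount--
				fmt.Printf("collected %s goroutine: %v\n", result.name, result.error)
			}
		}
	}
}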
And fix the scoping for the shutdownTimer variable so we don't clear it on each for-loop iteration (oops :p, bug from b30aa0ecdb). Add some logging to the error collector, so it's easier to see where we are in the collection process from the operator logs. Also start logging collected goroutines by name, so we can figure out which may still be outstanding. Set terminationGracePeriodSeconds 130 to extend the default 30s [3], to give the container the full two-minute graceful timeout window before the kubelet steps in with a KILL. Push the Background() initialization all the way up to the command-line handler, to make it more obvious that the context is scoped to the whole 'start' invocation. [1]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/25365/pull-ci-openshift-origin-master-e2e-gcp-upgrade/1289853267223777280/artifacts/e2e-gcp-upgrade/pods/openshift-cluster-version_cluster-version-operator-5b6ff896c6-57ppb_cluster-version-operator.log [2]: https://bugzilla.redhat.com/show_bug.cgi?id=1843505#c7 [3]: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#podspec-v1-core squash! pkg/start: Release leader lease on graceful shutdown --- bootstrap/bootstrap-pod.yaml | 1 + cmd/start.go | 5 +- ...luster-version-operator_03_deployment.yaml | 1 + pkg/autoupdate/autoupdate.go | 7 +- pkg/cvo/cvo.go | 7 +- pkg/start/start.go | 191 +++++++++--------- pkg/start/start_integration_test.go | 23 ++- 7 files changed, 125 insertions(+), 110 deletions(-) diff --git a/bootstrap/bootstrap-pod.yaml b/bootstrap/bootstrap-pod.yaml index bc9374628..ca42facd1 100644 --- a/bootstrap/bootstrap-pod.yaml +++ b/bootstrap/bootstrap-pod.yaml @@ -37,6 +37,7 @@ spec: fieldRef: fieldPath: spec.nodeName hostNetwork: true + terminationGracePeriodSeconds: 130 volumes: - name: kubeconfig hostPath: diff --git a/cmd/start.go b/cmd/start.go index 578fc8ac4..4eb5bfa50 100644 --- a/cmd/start.go +++ b/cmd/start.go @@ -1,6 +1,8 @@ package main import ( + "context" + "github.com/spf13/cobra" "k8s.io/klog" @@ -18,9 +20,10 @@ func init() { // To help debugging, immediately log version klog.Infof("%s", version.String) - if err := opts.Run(); err != nil { + if err := opts.Run(context.Background()); err != nil { klog.Fatalf("error: %v", err) } + klog.Info("Graceful shutdown complete.") }, } diff --git a/install/0000_00_cluster-version-operator_03_deployment.yaml b/install/0000_00_cluster-version-operator_03_deployment.yaml index 3b3ae8d1b..0973d81cf 100644 --- a/install/0000_00_cluster-version-operator_03_deployment.yaml +++ b/install/0000_00_cluster-version-operator_03_deployment.yaml @@ -57,6 +57,7 @@ spec: nodeSelector: node-role.kubernetes.io/master: "" priorityClassName: "system-cluster-critical" + terminationGracePeriodSeconds: 130 tolerations: - key: "node-role.kubernetes.io/master" operator: Exists diff --git a/pkg/autoupdate/autoupdate.go b/pkg/autoupdate/autoupdate.go index 20c6af731..ce6ecfe5a 100644 --- a/pkg/autoupdate/autoupdate.go +++ b/pkg/autoupdate/autoupdate.go @@ -84,7 +84,7 @@ func New( } // Run runs the autoupdate controller. -func (ctrl *Controller) Run(ctx context.Context, workers int) { +func (ctrl *Controller) Run(ctx context.Context, workers int) error { defer utilruntime.HandleCrash() defer ctrl.queue.ShutDown() @@ -92,15 +92,16 @@ func (ctrl *Controller) Run(ctx context.Context, workers int) { defer klog.Info("Shutting down AutoUpdateController") if !cache.WaitForCacheSync(ctx.Done(), ctrl.cacheSynced...) 
{ - klog.Info("Caches never synchronized") - return + return fmt.Errorf("caches never synchronized: %w", ctx.Err()) } for i := 0; i < workers; i++ { + // FIXME: actually wait until these complete if the Context is canceled. go wait.UntilWithContext(ctx, ctrl.worker, time.Second) } <-ctx.Done() + return nil } func (ctrl *Controller) eventHandler() cache.ResourceEventHandler { diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index b23d0e821..2a9bc7c4a 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -290,7 +290,7 @@ func loadConfigMapVerifierDataFromUpdate(update *payload.Update, clientBuilder s } // Run runs the cluster version operator until stopCh is completed. Workers is ignored for now. -func (optr *Operator) Run(ctx context.Context, workers int) { +func (optr *Operator) Run(ctx context.Context, workers int) error { defer utilruntime.HandleCrash() defer optr.queue.ShutDown() stopCh := ctx.Done() @@ -300,8 +300,7 @@ func (optr *Operator) Run(ctx context.Context, workers int) { defer klog.Info("Shutting down ClusterVersionOperator") if !cache.WaitForCacheSync(stopCh, optr.cacheSynced...) { - klog.Info("Caches never synchronized") - return + return fmt.Errorf("caches never synchronized: %w", ctx.Err()) } // trigger the first cluster version reconcile always @@ -330,6 +329,8 @@ func (optr *Operator) Run(ctx context.Context, workers int) { // stop the queue, then wait for the worker to exit optr.queue.ShutDown() <-workerStopCh + + return nil } func (optr *Operator) queueKey() string { diff --git a/pkg/start/start.go b/pkg/start/start.go index bf10fd51d..68aca64c9 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -10,7 +10,6 @@ import ( "math/rand" "os" "os/signal" - "sync" "syscall" "time" @@ -77,6 +76,11 @@ type Options struct { ResyncInterval time.Duration } +type asyncResult struct { + name string + error error +} + func defaultEnv(name, defaultValue string) string { env, ok := os.LookupEnv(name) if !ok { @@ -101,7 +105,7 @@ func NewOptions() *Options { } } -func (o *Options) Run() error { +func (o *Options) Run(ctx context.Context) error { if o.NodeName == "" { return fmt.Errorf("node-name is required") } @@ -137,29 +141,6 @@ func (o *Options) Run() error { return err } - // TODO: Kube 1.14 will contain a ReleaseOnCancel boolean on - // LeaderElectionConfig that allows us to have the lock code - // release the lease when this context is cancelled. At that - // time we can remove our changes to OnStartedLeading. - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - ch := make(chan os.Signal, 1) - defer func() { signal.Stop(ch) }() - signal.Notify(ch, os.Interrupt, syscall.SIGTERM) - go func() { - sig := <-ch - klog.Infof("Shutting down due to %s", sig) - cancel() - - // exit after 2s no matter what - select { - case <-time.After(5 * time.Second): - klog.Fatalf("Exiting") - case <-ch: - klog.Fatalf("Received shutdown signal twice, exiting") - } - }() - o.run(ctx, controllerCtx, lock) return nil } @@ -186,13 +167,32 @@ func (o *Options) makeTLSConfig() (*tls.Config, error) { }), nil } +// run launches a number of goroutines to handle manifest application, +// metrics serving, etc. It continues operating until ctx.Done(), +// and then attempts a clean shutdown limited by an internal context +// with a two-minute cap. It returns after it successfully collects all +// launched goroutines. 
func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) { - runContext, runCancel := context.WithCancel(ctx) + runContext, runCancel := context.WithCancel(ctx) // so we can cancel internally on errors or TERM defer runCancel() - shutdownContext, shutdownCancel := context.WithCancel(ctx) + shutdownContext, shutdownCancel := context.WithCancel(context.Background()) // extends beyond ctx defer shutdownCancel() - errorChannel := make(chan error, 1) - errorChannelCount := 0 + postMainContext, postMainCancel := context.WithCancel(context.Background()) // extends beyond ctx + defer postMainCancel() + + ch := make(chan os.Signal, 1) + defer func() { signal.Stop(ch) }() + signal.Notify(ch, os.Interrupt, syscall.SIGTERM) + go func() { + sig := <-ch + klog.Infof("Shutting down due to %s", sig) + runCancel() + sig = <-ch + klog.Fatalf("Received shutdown signal twice, exiting: %s", sig) + }() + + resultChannel := make(chan asyncResult, 1) + resultChannelCount := 0 if o.ListenAddr != "" { var tlsConfig *tls.Config if o.ServingCertFile != "" || o.ServingKeyFile != "" { @@ -202,85 +202,92 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc klog.Fatalf("Failed to create TLS config: %v", err) } } - errorChannelCount++ + resultChannelCount++ go func() { - errorChannel <- cvo.RunMetrics(runContext, shutdownContext, o.ListenAddr, tlsConfig) + err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr, tlsConfig) + resultChannel <- asyncResult{name: "metrics server", error: err} }() } - exit := make(chan struct{}) - exitClose := sync.Once{} - - // TODO: when we switch to graceful lock shutdown, this can be - // moved back inside RunOrDie - // TODO: properly wire ctx here - go leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{ - Lock: lock, - LeaseDuration: leaseDuration, - RenewDeadline: renewDeadline, - RetryPeriod: retryPeriod, - Callbacks: leaderelection.LeaderCallbacks{ - OnStartedLeading: func(localCtx context.Context) { - controllerCtx.Start(runContext) - select { - case <-runContext.Done(): - // WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel - // and client-go ContextCancelable, which allows us to block new API requests before - // we step down. However, the CVO isn't that sensitive to races and can tolerate - // brief overlap. - klog.Infof("Stepping down as leader") - // give the controllers some time to shut down - time.Sleep(100 * time.Millisecond) - // if we still hold the leader lease, clear the owner identity (other lease watchers - // still have to wait for expiration) like the new ReleaseOnCancel code will do. - if err := lock.Update(localCtx, resourcelock.LeaderElectionRecord{}); err == nil { - // if we successfully clear the owner identity, we can safely delete the record - if err := lock.Client.ConfigMaps(lock.ConfigMapMeta.Namespace).Delete(localCtx, lock.ConfigMapMeta.Name, metav1.DeleteOptions{}); err != nil { - klog.Warningf("Unable to step down cleanly: %v", err) - } + informersDone := postMainContext.Done() + // FIXME: would be nice if there was a way to collect these. 
+ controllerCtx.CVInformerFactory.Start(informersDone) + controllerCtx.OpenshiftConfigInformerFactory.Start(informersDone) + controllerCtx.OpenshiftConfigManagedInformerFactory.Start(informersDone) + controllerCtx.InformerFactory.Start(informersDone) + + resultChannelCount++ + go func() { + leaderelection.RunOrDie(postMainContext, leaderelection.LeaderElectionConfig{ + Lock: lock, + ReleaseOnCancel: true, + LeaseDuration: leaseDuration, + RenewDeadline: renewDeadline, + RetryPeriod: retryPeriod, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: func(_ context.Context) { // no need for this passed-through postMainContext, because goroutines we launch inside will use runContext + resultChannelCount++ + go func() { + err := controllerCtx.CVO.Run(runContext, 2) + resultChannel <- asyncResult{name: "main operator", error: err} + }() + + if controllerCtx.AutoUpdate != nil { + resultChannelCount++ + go func() { + err := controllerCtx.AutoUpdate.Run(runContext, 2) + resultChannel <- asyncResult{name: "auto-update controller", error: err} + }() } - klog.Infof("Finished shutdown") - exitClose.Do(func() { close(exit) }) - case <-localCtx.Done(): - // we will exit in OnStoppedLeading - } - }, - OnStoppedLeading: func() { - klog.Warning("leaderelection lost") - exitClose.Do(func() { close(exit) }) + }, + OnStoppedLeading: func() { + klog.Info("Stopped leading; shutting down.") + runCancel() + }, }, - }, - }) + }) + resultChannel <- asyncResult{name: "leader controller", error: nil} + }() - for errorChannelCount > 0 { - var shutdownTimer *time.Timer + var shutdownTimer *time.Timer + for resultChannelCount > 0 { + klog.Infof("Waiting on %d outstanding goroutines.", resultChannelCount) if shutdownTimer == nil { // running select { case <-runContext.Done(): + klog.Info("Run context completed; beginning two-minute graceful shutdown period.") shutdownTimer = time.NewTimer(2 * time.Minute) - case err := <-errorChannel: - errorChannelCount-- - if err != nil { - klog.Error(err) + case result := <-resultChannel: + resultChannelCount-- + if result.error == nil { + klog.Infof("Collected %s goroutine.", result.name) + } else { + klog.Errorf("Collected %s goroutine: %v", result.name, result.error) runCancel() // this will cause shutdownTimer initialization in the next loop } + if result.name == "main operator" { + postMainCancel() + } } } else { // shutting down select { case <-shutdownTimer.C: // never triggers after the channel is stopped, although it would not matter much if it did because subsequent cancel calls do nothing. shutdownCancel() shutdownTimer.Stop() - case err := <-errorChannel: - errorChannelCount-- - if err != nil { - klog.Error(err) - runCancel() + case result := <-resultChannel: + resultChannelCount-- + if result.error == nil { + klog.Infof("Collected %s goroutine.", result.name) + } else { + klog.Errorf("Collected %s goroutine: %v", result.name, result.error) + } + if result.name == "main operator" { + postMainCancel() } } } } - - <-exit + klog.Info("Finished collecting operator goroutines.") } // createResourceLock initializes the lock. @@ -440,17 +447,3 @@ func (o *Options) NewControllerContext(cb *ClientBuilder) *Context { } return ctx } - -// Start launches the controllers in the provided context and any supporting -// infrastructure. When ch is closed the controllers will be shut down. 
-func (c *Context) Start(ctx context.Context) { - ch := ctx.Done() - go c.CVO.Run(ctx, 2) - if c.AutoUpdate != nil { - go c.AutoUpdate.Run(ctx, 2) - } - c.CVInformerFactory.Start(ch) - c.OpenshiftConfigInformerFactory.Start(ch) - c.OpenshiftConfigManagedInformerFactory.Start(ch) - c.InformerFactory.Start(ch) -} diff --git a/pkg/start/start_integration_test.go b/pkg/start/start_integration_test.go index 153027f02..c3d0a8d93 100644 --- a/pkg/start/start_integration_test.go +++ b/pkg/start/start_integration_test.go @@ -267,9 +267,14 @@ func TestIntegrationCVO_initializeAndUpgrade(t *testing.T) { worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", record.NewFakeRecorder(100)) controllers.CVO.SetSyncWorkerForTesting(worker) + lock, err := createResourceLock(cb, options.Namespace, options.Name) + if err != nil { + t.Fatal(err) + } + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - controllers.Start(ctx) + go options.run(ctx, controllers, lock) t.Logf("wait until we observe the cluster version become available") lastCV, err := waitForUpdateAvailable(ctx, t, client, ns, false, "0.0.1") @@ -424,9 +429,14 @@ func TestIntegrationCVO_initializeAndHandleError(t *testing.T) { worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Duration: time.Second, Factor: 1.2}, "", record.NewFakeRecorder(100)) controllers.CVO.SetSyncWorkerForTesting(worker) + lock, err := createResourceLock(cb, options.Namespace, options.Name) + if err != nil { + t.Fatal(err) + } + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - controllers.Start(ctx) + go options.run(ctx, controllers, lock) t.Logf("wait until we observe the cluster version become available") lastCV, err := waitForUpdateAvailable(ctx, t, client, ns, false, "0.0.1") @@ -534,7 +544,7 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { worker := cvo.NewSyncWorker(&mapPayloadRetriever{}, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", record.NewFakeRecorder(100)) controllers.CVO.SetSyncWorkerForTesting(worker) - lock, err := createResourceLock(cb, ns, ns) + lock, err := createResourceLock(cb, options.Namespace, options.Name) if err != nil { t.Fatal(err) } @@ -721,9 +731,14 @@ metadata: worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Steps: 3}, "", record.NewFakeRecorder(100)) controllers.CVO.SetSyncWorkerForTesting(worker) + lock, err := createResourceLock(cb, options.Namespace, options.Name) + if err != nil { + t.Fatal(err) + } + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - controllers.Start(ctx) + go options.run(ctx, controllers, lock) t.Logf("wait until we observe the cluster version become available") lastCV, err := waitForUpdateAvailable(ctx, t, client, ns, false, "0.0.1") From dd09c3fbe06c5f3c8cb55d0a61d5b86baded6e21 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 5 Aug 2020 14:54:47 -0700 Subject: [PATCH 2/4] pkg/start/start_integration_test: Do not assume "deleted" for ConfigMap lock release From the godocs: $ grep -A5 '// HolderIdentity' vendor/k8s.io/client-go/tools/leaderelection/resourcelock/interface.go // HolderIdentity is the ID that owns the lease. If empty, no one owns this lease and // all callers may acquire. 
Versions of this library prior to Kubernetes 1.14 will not // attempt to acquire leases with empty identities and will wait for the full lease // interval to expire before attempting to reacquire. This value is set to empty when // a client voluntarily steps down. HolderIdentity string `json:"holderIdentity"` The previous assumption that the release would involve ConfigMap deletion was born with the test in 2b81f47626 (cvo: Release our leader lease when we are gracefully terminated, 2019-01-16, #87). --- pkg/start/start_integration_test.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/start/start_integration_test.go b/pkg/start/start_integration_test.go index c3d0a8d93..1faaf3dca 100644 --- a/pkg/start/start_integration_test.go +++ b/pkg/start/start_integration_test.go @@ -560,7 +560,7 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { // wait until the lock record exists err = wait.PollImmediate(200*time.Millisecond, 60*time.Second, func() (bool, error) { - _, err := kc.CoreV1().ConfigMaps(ns).Get(ctx, ns, metav1.GetOptions{}) + _, _, err := lock.Get(ctx) if err != nil { if errors.IsNotFound(err) { return false, nil } @@ -582,26 +582,26 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { t.Fatalf("no leader election events found in\n%#v", events.Items) } - t.Logf("after the context is closed, the lock record should be deleted quickly") + t.Logf("after the context is closed, the lock should be released quickly") cancel() startTime := time.Now() var endTime time.Time // the lock should be deleted immediately err = wait.PollImmediate(100*time.Millisecond, 10*time.Second, func() (bool, error) { - _, err := kc.CoreV1().ConfigMaps(ns).Get(ctx, ns, metav1.GetOptions{}) - if errors.IsNotFound(err) { - endTime = time.Now() - return true, nil - } + electionRecord, _, err := lock.Get(ctx) if err != nil { + if errors.IsNotFound(err) { + return false, nil + } return false, err } - return false, nil + endTime = time.Now() + return electionRecord.HolderIdentity == "", nil }) if err != nil { t.Fatal(err) } - t.Logf("lock deleted in %s", endTime.Sub(startTime)) + t.Logf("lock released in %s", endTime.Sub(startTime)) select { case <-time.After(time.Second): From 9c42a921c166fe77de43fad6c517bb6e79aca90d Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 6 Aug 2020 11:51:36 -0700 Subject: [PATCH 3/4] pkg/start: Fill in deferred HandleCrash Clayton wants these in each goroutine we launch [1]. Obviously there's no way to reach inside the informer Start()s and add it there. I'm also adding this to the FIXME comment for rerolling the auto-update worker goroutines; we'll get those straightened out in future work. [1]: https://github.com/openshift/cluster-version-operator/pull/424/#discussion_r466585901 --- pkg/autoupdate/autoupdate.go | 3 +-- pkg/cvo/cvo.go | 1 - pkg/start/start.go | 6 ++++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/autoupdate/autoupdate.go b/pkg/autoupdate/autoupdate.go index ce6ecfe5a..f0da98511 100644 --- a/pkg/autoupdate/autoupdate.go +++ b/pkg/autoupdate/autoupdate.go @@ -85,7 +85,6 @@ func New( // Run runs the autoupdate controller. 
func (ctrl *Controller) Run(ctx context.Context, workers int) error { - defer utilruntime.HandleCrash() defer ctrl.queue.ShutDown() klog.Info("Starting AutoUpdateController") @@ -96,7 +95,7 @@ func (ctrl *Controller) Run(ctx context.Context, workers int) error { } for i := 0; i < workers; i++ { - // FIXME: actually wait until these complete if the Context is canceled. + // FIXME: actually wait until these complete if the Context is canceled. And possibly add utilruntime.HandleCrash. go wait.UntilWithContext(ctx, ctrl.worker, time.Second) } diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 2a9bc7c4a..e8629c1ef 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -291,7 +291,6 @@ func loadConfigMapVerifierDataFromUpdate(update *payload.Update, clientBuilder s // Run runs the cluster version operator until stopCh is completed. Workers is ignored for now. func (optr *Operator) Run(ctx context.Context, workers int) error { - defer utilruntime.HandleCrash() defer optr.queue.ShutDown() stopCh := ctx.Done() workerStopCh := make(chan struct{}) diff --git a/pkg/start/start.go b/pkg/start/start.go index 68aca64c9..43a72040b 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -16,6 +16,7 @@ import ( "github.com/google/uuid" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" @@ -184,6 +185,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc defer func() { signal.Stop(ch) }() signal.Notify(ch, os.Interrupt, syscall.SIGTERM) go func() { + defer utilruntime.HandleCrash() sig := <-ch klog.Infof("Shutting down due to %s", sig) runCancel() @@ -204,6 +206,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc } resultChannelCount++ go func() { + defer utilruntime.HandleCrash() err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr, tlsConfig) resultChannel <- asyncResult{name: "metrics server", error: err} }() @@ -218,6 +221,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc resultChannelCount++ go func() { + defer utilruntime.HandleCrash() leaderelection.RunOrDie(postMainContext, leaderelection.LeaderElectionConfig{ Lock: lock, ReleaseOnCancel: true, @@ -228,6 +232,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc OnStartedLeading: func(_ context.Context) { // no need for this passed-through postMainContext, because goroutines we launch inside will use runContext resultChannelCount++ go func() { + defer utilruntime.HandleCrash() err := controllerCtx.CVO.Run(runContext, 2) resultChannel <- asyncResult{name: "main operator", error: err} }() @@ -235,6 +240,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc if controllerCtx.AutoUpdate != nil { resultChannelCount++ go func() { + defer utilruntime.HandleCrash() err := controllerCtx.AutoUpdate.Run(runContext, 2) resultChannel <- asyncResult{name: "auto-update controller", error: err} }() From 22f35538e7866fd15b5d5810c9c922e5b055ad63 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 25 Aug 2020 12:42:59 -0700 Subject: [PATCH 4/4] cmd/start: Include the version in the outgoing log line Lala wanted the version included in the outgoing log line [1]. 
I'm not sure why you'd be wondering which version of the CVO code was running for that particular line, and not for other lines in the log, but including the version there is easy enough. While we're thinking about logging the CVO version, also remove the useless %s formatting from the opening log line, because we don't need to manipulate version.String at all. [1]: https://github.com/openshift/cluster-version-operator/pull/424#discussion_r476658319 --- cmd/start.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/start.go b/cmd/start.go index 4eb5bfa50..32488e13d 100644 --- a/cmd/start.go +++ b/cmd/start.go @@ -18,12 +18,12 @@ func init() { Long: "", Run: func(cmd *cobra.Command, args []string) { // To help debugging, immediately log version - klog.Infof("%s", version.String) + klog.Info(version.String) if err := opts.Run(context.Background()); err != nil { klog.Fatalf("error: %v", err) } - klog.Info("Graceful shutdown complete.") + klog.Infof("Graceful shutdown complete for %s.", version.String) }, }
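Series footnote: the voluntary step-down that patches 1 and 2 lean on clears HolderIdentity rather than (necessarily) deleting the ConfigMap, so an observer can poll the lock record for an empty identity, as the updated integration test does. A minimal sketch of such a check, assuming the context-aware resourcelock.Interface.Get(ctx) vendored by this repository (the helper name and timeouts are illustrative):

package main

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/leaderelection/resourcelock"
)

// waitForLeaseRelease polls the leader-election lock until its HolderIdentity
// is cleared.  ReleaseOnCancel empties the identity on a voluntary step-down;
// it does not necessarily delete the underlying lock object.
func waitForLeaseRelease(ctx context.Context, lock resourcelock.Interface) error {
	return wait.PollImmediate(100*time.Millisecond, 10*time.Second, func() (bool, error) {
		record, _, err := lock.Get(ctx)
		if err != nil {
			if errors.IsNotFound(err) {
				return false, nil // no record yet; keep polling
			}
			return false, err
		}
		return record.HolderIdentity == "", nil
	})
}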