From 7ae934acfd692926c4affec2c4923e713f3be345 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 11 Feb 2021 14:29:28 -0800 Subject: [PATCH] pkg/start: Fix shutdown deadlock when die before getting a leader lock a9e075a79d (pkg/cvo/cvo: Guard Operator.Run goroutine handling from early cancels, 2021-01-28, #508) made us more robust to situations where we are canceled after acquiring the leader lock but before we got into Operator.Run's UntilWithContext. However, there was still a bug from cc1921d186 (pkg/start: Release leader lease on graceful shutdown, 2020-08-03, #424) where we had not acquired the leader lock [1]. postMainContext is used for metrics, informers, and the leader election loop. We used to only call postMainCancel after reaping the main goroutine, and obviously that will only work if we've launched the main goroutine. This commit adds a new launchedMain to track that. If launchedMain is true, we get the old handling. If launchedMain is still false when runContext.Done, we now call postMainCancel without waiting to reap a non-existent main goroutine. There's also a new postMainCancel when the shutdown timer expires. I don't expect us to ever need that, but it protects us from future bugs like this one. I've added launchedMain without guarding it behind a lock, and it is touched by both the main Options.run goroutine and the leader-election callback. So there's a racy chance of: 1. Options.run goroutine: runContext canceled, so runContext.Done() matches 2. Leader-election goroutine: Leader lock acquired 3. Options.run goroutine: !launchedMain, so we call postMainCancel() 4. Leader-election goroutine: launchedMain set true 5. Leader-election goroutine: launches the main goroutine via CVO.Run(runContext, ...) I'm trusting Operator.Run to respect runContext there and not do anything significant, so the fact that we are already tearing down all the post-main stuff won't cause problems. Previous fixes like a9e075a79d will help with that. But there could still be bugs in Operator.Run. A lock around launchedMain that avoided calling Operator.Run when runContext was already done would protect against that, but seems like overkill in an already complicated goroutine tangle. Without the lock, we just have to field and fix any future Operator.Run runContext issues as we find them. [1]: https://bugzilla.redhat.com/show_bug.cgi?id=1927944 --- pkg/start/start.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/start/start.go b/pkg/start/start.go index d9e3494dc..d53187cf6 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -184,6 +184,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc defer shutdownCancel() postMainContext, postMainCancel := context.WithCancel(context.Background()) // extends beyond ctx defer postMainCancel() + launchedMain := false ch := make(chan os.Signal, 1) defer func() { signal.Stop(ch) }() @@ -234,6 +235,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc RetryPeriod: retryPeriod, Callbacks: leaderelection.LeaderCallbacks{ OnStartedLeading: func(_ context.Context) { // no need for this passed-through postMainContext, because goroutines we launch inside will use runContext + launchedMain = true resultChannelCount++ go func() { defer utilruntime.HandleCrash() @@ -267,6 +269,9 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc case <-runContext.Done(): klog.Info("Run context completed; beginning two-minute graceful shutdown period.") shutdownTimer = time.NewTimer(2 * time.Minute) + if !launchedMain { // no need to give post-main extra time if main never ran + postMainCancel() + } case result := <-resultChannel: resultChannelCount-- if result.error == nil { @@ -282,6 +287,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc } else { // shutting down select { case <-shutdownTimer.C: // never triggers after the channel is stopped, although it would not matter much if it did because subsequent cancel calls do nothing. + postMainCancel() shutdownCancel() shutdownTimer.Stop() case result := <-resultChannel: