Bug 1843505: pkg/start: Release leader lease on graceful shutdown #424
Changes from all commits: cc1921d, dd09c3f, 9c42a92, 22f3553
```diff
@@ -10,13 +10,13 @@ import (
 	"math/rand"
 	"os"
 	"os/signal"
-	"sync"
 	"syscall"
 	"time"

 	"github.com/google/uuid"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
@@ -77,6 +77,11 @@ type Options struct {
 	ResyncInterval time.Duration
 }

+type asyncResult struct {
+	name  string
+	error error
+}
+
 func defaultEnv(name, defaultValue string) string {
 	env, ok := os.LookupEnv(name)
 	if !ok {
@@ -101,7 +106,7 @@ func NewOptions() *Options {
 	}
 }

-func (o *Options) Run() error {
+func (o *Options) Run(ctx context.Context) error {
 	if o.NodeName == "" {
 		return fmt.Errorf("node-name is required")
 	}
@@ -137,29 +142,6 @@ func (o *Options) Run() error {
 		return err
 	}

-	// TODO: Kube 1.14 will contain a ReleaseOnCancel boolean on
-	// LeaderElectionConfig that allows us to have the lock code
-	// release the lease when this context is cancelled. At that
-	// time we can remove our changes to OnStartedLeading.
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	ch := make(chan os.Signal, 1)
-	defer func() { signal.Stop(ch) }()
-	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
-	go func() {
-		sig := <-ch
-		klog.Infof("Shutting down due to %s", sig)
-		cancel()
-
-		// exit after 2s no matter what
-		select {
-		case <-time.After(5 * time.Second):
-			klog.Fatalf("Exiting")
-		case <-ch:
-			klog.Fatalf("Received shutdown signal twice, exiting")
-		}
-	}()
-
 	o.run(ctx, controllerCtx, lock)
 	return nil
 }
@@ -186,13 +168,33 @@ func (o *Options) makeTLSConfig() (*tls.Config, error) {
 	}), nil
 }

+// run launches a number of goroutines to handle manifest application,
+// metrics serving, etc. It continues operating until ctx.Done(),
+// and then attempts a clean shutdown limited by an internal context
+// with a two-minute cap. It returns after it successfully collects all
+// launched goroutines.
 func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) {
-	runContext, runCancel := context.WithCancel(ctx)
+	runContext, runCancel := context.WithCancel(ctx) // so we can cancel internally on errors or TERM
 	defer runCancel()
-	shutdownContext, shutdownCancel := context.WithCancel(ctx)
+	shutdownContext, shutdownCancel := context.WithCancel(context.Background()) // extends beyond ctx
 	defer shutdownCancel()
-	errorChannel := make(chan error, 1)
-	errorChannelCount := 0
+	postMainContext, postMainCancel := context.WithCancel(context.Background()) // extends beyond ctx
+	defer postMainCancel()
+
+	ch := make(chan os.Signal, 1)
+	defer func() { signal.Stop(ch) }()
+	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
+	go func() {
+		defer utilruntime.HandleCrash()
+		sig := <-ch
+		klog.Infof("Shutting down due to %s", sig)
+		runCancel()
+		sig = <-ch
+		klog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
+	}()
+
+	resultChannel := make(chan asyncResult, 1)
+	resultChannelCount := 0
 	if o.ListenAddr != "" {
 		var tlsConfig *tls.Config
 		if o.ServingCertFile != "" || o.ServingKeyFile != "" {
```
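The inline comments above sketch the lifetime story: runContext is cancelled internally on errors or TERM, while shutdownContext and postMainContext are derived from context.Background() so shutdown work can outlive the caller's ctx. Below is a minimal, self-contained sketch of that split-context plus signal-handler pattern; doWork and cleanup are placeholder stand-ins, not functions from this PR.

```go
package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

// doWork stands in for the controllers: it simply runs until the run
// context is cancelled. cleanup stands in for the final API calls that
// need a context which is still alive after the run context is gone.
func doWork(ctx context.Context) { <-ctx.Done() }

func cleanup(ctx context.Context) { _ = ctx }

func main() {
	// runContext governs normal operation; cancelling it begins shutdown.
	runContext, runCancel := context.WithCancel(context.Background())
	defer runCancel()
	// shutdownContext is derived from Background rather than runContext,
	// so graceful-shutdown work is not cut off the moment runContext ends.
	shutdownContext, shutdownCancel := context.WithCancel(context.Background())
	defer shutdownCancel()

	// First signal cancels the run context; a second signal aborts hard.
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
	go func() {
		sig := <-ch
		fmt.Printf("Shutting down due to %s\n", sig)
		runCancel()
		sig = <-ch
		fmt.Printf("Received shutdown signal twice, exiting: %s\n", sig)
		os.Exit(1)
	}()

	doWork(runContext)

	// Bound the shutdown phase with a timer, then run cleanup against the
	// longer-lived context.
	timer := time.AfterFunc(2*time.Minute, shutdownCancel)
	defer timer.Stop()
	cleanup(shutdownContext)
}
```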
```diff
@@ -202,85 +204,96 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) {
 				klog.Fatalf("Failed to create TLS config: %v", err)
 			}
 		}
-		errorChannelCount++
+		resultChannelCount++
 		go func() {
-			errorChannel <- cvo.RunMetrics(runContext, shutdownContext, o.ListenAddr, tlsConfig)
+			defer utilruntime.HandleCrash()
+			err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr, tlsConfig)
+			resultChannel <- asyncResult{name: "metrics server", error: err}
 		}()
 	}

-	exit := make(chan struct{})
-	exitClose := sync.Once{}
-
-	// TODO: when we switch to graceful lock shutdown, this can be
-	// moved back inside RunOrDie
-	// TODO: properly wire ctx here
-	go leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
-		Lock:          lock,
-		LeaseDuration: leaseDuration,
-		RenewDeadline: renewDeadline,
-		RetryPeriod:   retryPeriod,
-		Callbacks: leaderelection.LeaderCallbacks{
-			OnStartedLeading: func(localCtx context.Context) {
-				controllerCtx.Start(runContext)
-				select {
-				case <-runContext.Done():
-					// WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel
-					// and client-go ContextCancelable, which allows us to block new API requests before
-					// we step down. However, the CVO isn't that sensitive to races and can tolerate
-					// brief overlap.
-					klog.Infof("Stepping down as leader")
-					// give the controllers some time to shut down
-					time.Sleep(100 * time.Millisecond)
-					// if we still hold the leader lease, clear the owner identity (other lease watchers
-					// still have to wait for expiration) like the new ReleaseOnCancel code will do.
-					if err := lock.Update(localCtx, resourcelock.LeaderElectionRecord{}); err == nil {
-						// if we successfully clear the owner identity, we can safely delete the record
-						if err := lock.Client.ConfigMaps(lock.ConfigMapMeta.Namespace).Delete(localCtx, lock.ConfigMapMeta.Name, metav1.DeleteOptions{}); err != nil {
-							klog.Warningf("Unable to step down cleanly: %v", err)
-						}
-					}
-					klog.Infof("Finished shutdown")
-					exitClose.Do(func() { close(exit) })
-				case <-localCtx.Done():
-					// we will exit in OnStoppedLeading
-				}
-			},
-			OnStoppedLeading: func() {
-				klog.Warning("leaderelection lost")
-				exitClose.Do(func() { close(exit) })
-			},
-		},
-	})
+	informersDone := postMainContext.Done()
+	// FIXME: would be nice if there was a way to collect these.
+	controllerCtx.CVInformerFactory.Start(informersDone)
+	controllerCtx.OpenshiftConfigInformerFactory.Start(informersDone)
+	controllerCtx.OpenshiftConfigManagedInformerFactory.Start(informersDone)
+	controllerCtx.InformerFactory.Start(informersDone)
+
+	resultChannelCount++
+	go func() {
+		defer utilruntime.HandleCrash()
+		leaderelection.RunOrDie(postMainContext, leaderelection.LeaderElectionConfig{
+			Lock:            lock,
+			ReleaseOnCancel: true,
+			LeaseDuration:   leaseDuration,
+			RenewDeadline:   renewDeadline,
+			RetryPeriod:     retryPeriod,
+			Callbacks: leaderelection.LeaderCallbacks{
+				OnStartedLeading: func(_ context.Context) { // no need for this passed-through postMainContext, because goroutines we launch inside will use runContext
+					resultChannelCount++
+					go func() {
+						defer utilruntime.HandleCrash()
+						err := controllerCtx.CVO.Run(runContext, 2)
+						resultChannel <- asyncResult{name: "main operator", error: err}
+					}()
+
+					if controllerCtx.AutoUpdate != nil {
+						resultChannelCount++
+						go func() {
+							defer utilruntime.HandleCrash()
+							err := controllerCtx.AutoUpdate.Run(runContext, 2)
+							resultChannel <- asyncResult{name: "auto-update controller", error: err}
+						}()
+					}
+				},
+				OnStoppedLeading: func() {
+					klog.Info("Stopped leading; shutting down.")
+					runCancel()
```
Review thread on the new OnStoppedLeading handler:

Contributor: I think you still need to exit, don't you? How confident are you that this truly resets everything?

Author (Member): If it doesn't, CI should turn it up, and we'll fix those bugs ;)

Contributor: This doesn't sound like what I described on Slack. If we lost the lease, we exit immediately, no graceful step down. When we have lost our lease we should not be running.
```diff
+				},
+			},
+		})
+		resultChannel <- asyncResult{name: "leader controller", error: nil}
+	}()

-	for errorChannelCount > 0 {
-		var shutdownTimer *time.Timer
+	var shutdownTimer *time.Timer
+	for resultChannelCount > 0 {
+		klog.Infof("Waiting on %d outstanding goroutines.", resultChannelCount)
 		if shutdownTimer == nil { // running
 			select {
 			case <-runContext.Done():
+				klog.Info("Run context completed; beginning two-minute graceful shutdown period.")
 				shutdownTimer = time.NewTimer(2 * time.Minute)
-			case err := <-errorChannel:
-				errorChannelCount--
-				if err != nil {
-					klog.Error(err)
+			case result := <-resultChannel:
+				resultChannelCount--
+				if result.error == nil {
+					klog.Infof("Collected %s goroutine.", result.name)
+				} else {
+					klog.Errorf("Collected %s goroutine: %v", result.name, result.error)
+					runCancel() // this will cause shutdownTimer initialization in the next loop
+				}
+				if result.name == "main operator" {
+					postMainCancel()
 				}
 			}
 		} else { // shutting down
 			select {
 			case <-shutdownTimer.C: // never triggers after the channel is stopped, although it would not matter much if it did because subsequent cancel calls do nothing.
 				shutdownCancel()
 				shutdownTimer.Stop()
-			case err := <-errorChannel:
-				errorChannelCount--
-				if err != nil {
-					klog.Error(err)
-					runCancel()
+			case result := <-resultChannel:
+				resultChannelCount--
+				if result.error == nil {
+					klog.Infof("Collected %s goroutine.", result.name)
+				} else {
+					klog.Errorf("Collected %s goroutine: %v", result.name, result.error)
+				}
+				if result.name == "main operator" {
+					postMainCancel()
 				}
 			}
 		}
 	}

-	<-exit
+	klog.Info("Finished collecting operator goroutines.")
 }

 // createResourceLock initializes the lock.
```
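The hunk above swaps the hand-rolled lease step-down (clearing and then deleting the ConfigMap record) for client-go's ReleaseOnCancel, and the review thread turns on what OnStoppedLeading should do once leadership is gone. The sketch below shows roughly how that client-go wiring fits together; it assumes an already-constructed resourcelock.Interface and uses illustrative timing values, so it is not the CVO's configuration.

```go
package leaderexample

import (
	"context"
	"log"
	"time"

	"k8s.io/client-go/tools/leaderelection"
	"k8s.io/client-go/tools/leaderelection/resourcelock"
)

// RunWithLease blocks until ctx is cancelled or leadership is lost. With
// ReleaseOnCancel set, cancelling ctx makes the elector clear the lease
// record on its way out, so the next candidate does not have to wait out
// a full LeaseDuration before taking over.
func RunWithLease(ctx context.Context, lock resourcelock.Interface, work func(context.Context)) {
	leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
		Lock:            lock,
		ReleaseOnCancel: true, // release the lease during a graceful shutdown
		LeaseDuration:   60 * time.Second,
		RenewDeadline:   30 * time.Second,
		RetryPeriod:     10 * time.Second,
		Callbacks: leaderelection.LeaderCallbacks{
			OnStartedLeading: func(leaderCtx context.Context) {
				// Only run the controllers while we hold the lease.
				work(leaderCtx)
			},
			OnStoppedLeading: func() {
				// Reached after a graceful release and also after losing the
				// lease to another candidate; the review thread above is about
				// whether this should trigger a graceful shutdown or an
				// immediate exit.
				log.Print("stopped leading")
			},
		},
	})
}
```

With ReleaseOnCancel, cancelling the context passed to RunOrDie clears the lease record during step-down, which is what lets a replacement process take over without waiting out the full lease duration.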
@@ -440,17 +453,3 @@ func (o *Options) NewControllerContext(cb *ClientBuilder) *Context { | |
| } | ||
| return ctx | ||
| } | ||
|
|
||
| // Start launches the controllers in the provided context and any supporting | ||
| // infrastructure. When ch is closed the controllers will be shut down. | ||
| func (c *Context) Start(ctx context.Context) { | ||
| ch := ctx.Done() | ||
| go c.CVO.Run(ctx, 2) | ||
| if c.AutoUpdate != nil { | ||
| go c.AutoUpdate.Run(ctx, 2) | ||
| } | ||
| c.CVInformerFactory.Start(ch) | ||
| c.OpenshiftConfigInformerFactory.Start(ch) | ||
| c.OpenshiftConfigManagedInformerFactory.Start(ch) | ||
| c.InformerFactory.Start(ch) | ||
| } | ||
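The new run() doc comment promises that the function "returns after it successfully collects all launched goroutines", and the loop in the second hunk implements that with named asyncResult values and a two-minute shutdown timer. Here is a stripped-down, runnable sketch of the same collection pattern; the worker names and the injected failure are made up for illustration and are not part of the PR.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

type asyncResult struct {
	name  string
	error error
}

func main() {
	runContext, runCancel := context.WithCancel(context.Background())
	defer runCancel()

	resultChannel := make(chan asyncResult, 1)
	resultChannelCount := 0

	// Launch a few named workers; each reports exactly one result.
	for _, name := range []string{"metrics server", "main operator"} {
		resultChannelCount++
		go func(name string) {
			<-runContext.Done() // pretend to work until the run context ends
			var err error
			if name == "main operator" {
				err = errors.New("example failure") // fabricated, for illustration
			}
			resultChannel <- asyncResult{name: name, error: err}
		}(name)
	}

	// Simulate a TERM signal after one second.
	time.AfterFunc(time.Second, runCancel)

	// Collect every worker, starting a bounded shutdown clock once the
	// run context has been cancelled.
	var shutdownTimer *time.Timer
	for resultChannelCount > 0 {
		if shutdownTimer == nil { // running
			select {
			case <-runContext.Done():
				shutdownTimer = time.NewTimer(2 * time.Minute)
			case result := <-resultChannel:
				resultChannelCount--
				if result.error != nil {
					runCancel() // the first failure begins the shutdown
				}
				fmt.Printf("collected %s\n", result.name)
			}
		} else { // shutting down
			select {
			case <-shutdownTimer.C:
				fmt.Println("shutdown deadline hit; abandoning stragglers")
				return
			case result := <-resultChannel:
				resultChannelCount--
				fmt.Printf("collected %s\n", result.name)
			}
		}
	}
	fmt.Println("finished collecting goroutines")
}
```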