Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add clean leader lease release to controller
Previously we did not cleanly release the leader lease on controller
shutdown, which resulted in slow leader elections as the existing lease
was forced to time out when a pod got rescheduled.

This adds a signal handler and a context, and sets the leader-election
settings so that when the controller receives a shutdown signal, it
releases its leader lease and terminates, allowing the new leader to
take over more quickly.
  • Loading branch information
jkyros committed Jun 17, 2022
commit 49857fd3971547abe08c1623ba5bdb459bce472d
25 changes: 18 additions & 7 deletions cmd/machine-config-controller/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"flag"
"fmt"
"os"

"github.com/golang/glog"
"github.com/openshift/machine-config-operator/cmd/common"
Expand All @@ -18,6 +19,7 @@ import (
"github.com/openshift/machine-config-operator/pkg/version"
"github.com/spf13/cobra"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"k8s.io/client-go/tools/leaderelection"
)

Expand Down Expand Up @@ -48,6 +50,10 @@ func runStartCmd(cmd *cobra.Command, args []string) {
flag.Set("logtostderr", "true")
flag.Parse()

// This is the 'main' context that we thread through the controller context and
// the leader election. Cancelling it means "stop everything, we are shutting down".
runContext, runCancel := context.WithCancel(context.Background())

// To help debugging, immediately log version
glog.Infof("Version: %+v (%s)", version.Raw, version.Hash)

Expand All @@ -56,6 +62,8 @@ func runStartCmd(cmd *cobra.Command, args []string) {
ctrlcommon.WriteTerminationError(fmt.Errorf("creating clients: %w", err))
}
run := func(ctx context.Context) {
go common.SignalHandler(runCancel)

ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), componentName)

// Start the metrics handler
Expand All @@ -82,20 +90,23 @@ func runStartCmd(cmd *cobra.Command, args []string) {
}
go draincontroller.Run(5, ctrlctx.Stop)

select {}
// wait here in this function until the context gets cancelled (which tells us we are being shut down)
<-ctx.Done()
}

leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig())

leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
Lock: common.CreateResourceLock(cb, startOpts.resourceLockNamespace, componentName),
LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
RetryPeriod: leaderElectionCfg.RetryPeriod.Duration,
leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
Lock: common.CreateResourceLock(cb, startOpts.resourceLockNamespace, componentName),
ReleaseOnCancel: true,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I correct in understanding that this release is essentially the crux of the PR? Previously, we took the leader election (lock?) and never released it until the timeout happened?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are correct.

LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
RetryPeriod: leaderElectionCfg.RetryPeriod.Duration,
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: run,
OnStoppedLeading: func() {
glog.Fatalf("leaderelection lost")
glog.Infof("Stopped leading. Terminating.")
os.Exit(0)
},
},
})
Expand Down