Skip to content

Commit 4aa21da

Browse files
Merge pull request #3185 from jkyros/bugs/controller-slow-leaderelection
Bug 1817075: MCC & MCO don't free leader leases during shut down -> 10 minutes of leader election timeouts
2 parents e595ae9 + 61a9005 commit 4aa21da

File tree

4 files changed

+62
-16
lines changed

4 files changed

+62
-16
lines changed

cmd/common/helpers.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package common
22

33
import (
44
"os"
5+
"os/signal"
6+
"syscall"
57

68
"github.com/golang/glog"
79
"github.com/openshift/machine-config-operator/internal/clients"
@@ -69,3 +71,23 @@ func GetLeaderElectionConfig(restcfg *rest.Config) configv1.LeaderElection {
6971

7072
return defaultLeaderElection
7173
}
74+
75+
// SignalHandler catches SIGINT/SIGTERM signals and makes sure the passed context gets cancelled when those signals happen. This allows us to use a
76+
// context to shut down our operations cleanly when we are signalled to shutdown.
77+
func SignalHandler(runCancel context.CancelFunc) {
78+
79+
// make a signal handling channel for os signals
80+
ch := make(chan os.Signal, 1)
81+
// stop listening for signals when we leave this function
82+
defer func() { signal.Stop(ch) }()
83+
// catch SIGINT and SIGTERM
84+
signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
85+
sig := <-ch
86+
glog.Infof("Shutting down due to: %s", sig)
87+
// if we're shutting down, cancel the context so everything else will stop
88+
runCancel()
89+
glog.Infof("Context cancelled")
90+
sig = <-ch
91+
glog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
92+
93+
}

cmd/machine-config-controller/start.go

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"flag"
66
"fmt"
7+
"os"
78

89
"github.com/golang/glog"
910
"github.com/openshift/machine-config-operator/cmd/common"
@@ -48,6 +49,10 @@ func runStartCmd(cmd *cobra.Command, args []string) {
4849
flag.Set("logtostderr", "true")
4950
flag.Parse()
5051

52+
// This is 'main' context that we thread through the controller context and
53+
// the leader elections. Cancelling this is "stop everything, we are shutting down".
54+
runContext, runCancel := context.WithCancel(context.Background())
55+
5156
// To help debugging, immediately log version
5257
glog.Infof("Version: %+v (%s)", version.Raw, version.Hash)
5358

@@ -56,6 +61,8 @@ func runStartCmd(cmd *cobra.Command, args []string) {
5661
ctrlcommon.WriteTerminationError(fmt.Errorf("creating clients: %w", err))
5762
}
5863
run := func(ctx context.Context) {
64+
go common.SignalHandler(runCancel)
65+
5966
ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), componentName)
6067

6168
// Start the metrics handler
@@ -82,20 +89,23 @@ func runStartCmd(cmd *cobra.Command, args []string) {
8289
}
8390
go draincontroller.Run(5, ctrlctx.Stop)
8491

85-
select {}
92+
// wait here in this function until the context gets cancelled (which tells us whe were being shut down)
93+
<-ctx.Done()
8694
}
8795

8896
leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig())
8997

90-
leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
91-
Lock: common.CreateResourceLock(cb, startOpts.resourceLockNamespace, componentName),
92-
LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
93-
RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
94-
RetryPeriod: leaderElectionCfg.RetryPeriod.Duration,
98+
leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
99+
Lock: common.CreateResourceLock(cb, startOpts.resourceLockNamespace, componentName),
100+
ReleaseOnCancel: true,
101+
LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
102+
RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
103+
RetryPeriod: leaderElectionCfg.RetryPeriod.Duration,
95104
Callbacks: leaderelection.LeaderCallbacks{
96105
OnStartedLeading: run,
97106
OnStoppedLeading: func() {
98-
glog.Fatalf("leaderelection lost")
107+
glog.Infof("Stopped leading. Terminating.")
108+
os.Exit(0)
99109
},
100110
},
101111
})

cmd/machine-config-operator/start.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ func runStartCmd(cmd *cobra.Command, args []string) {
3939
flag.Set("logtostderr", "true")
4040
flag.Parse()
4141

42+
// This is 'main' context that we thread through the controller context and
43+
// the leader elections. Cancelling this is "stop everything, we are shutting down".
44+
runContext, runCancel := context.WithCancel(context.Background())
45+
4246
// To help debugging, immediately log version
4347
glog.Infof("Version: %s (Raw: %s, Hash: %s)", os.Getenv("RELEASE_VERSION"), version.Raw, version.Hash)
4448

@@ -51,6 +55,8 @@ func runStartCmd(cmd *cobra.Command, args []string) {
5155
glog.Fatalf("error creating clients: %v", err)
5256
}
5357
run := func(ctx context.Context) {
58+
go common.SignalHandler(runCancel)
59+
5460
ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), ctrlcommon.MCONamespace)
5561
controller := operator.New(
5662
ctrlcommon.MCONamespace, componentName,
@@ -91,20 +97,23 @@ func runStartCmd(cmd *cobra.Command, args []string) {
9197

9298
go controller.Run(2, ctrlctx.Stop)
9399

94-
select {}
100+
// wait here in this function until the context gets cancelled (which tells us whe were being shut down)
101+
<-ctx.Done()
95102
}
96103

97104
leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig())
98105

99-
leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
100-
Lock: common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),
101-
LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
102-
RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
103-
RetryPeriod: leaderElectionCfg.RetryPeriod.Duration,
106+
leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
107+
Lock: common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),
108+
ReleaseOnCancel: true,
109+
LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
110+
RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
111+
RetryPeriod: leaderElectionCfg.RetryPeriod.Duration,
104112
Callbacks: leaderelection.LeaderCallbacks{
105113
OnStartedLeading: run,
106114
OnStoppedLeading: func() {
107-
glog.Fatalf("leaderelection lost")
115+
glog.Info("Stopped leading. Terminating.")
116+
os.Exit(0)
108117
},
109118
},
110119
})

pkg/controller/common/metrics.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,12 @@ func StartMetricsListener(addr string, stopCh <-chan struct{}) {
6262
}
6363
}()
6464
<-stopCh
65-
if err := s.Shutdown(context.Background()); err != http.ErrServerClosed {
66-
glog.Errorf("error stopping metrics listener: %v", err)
65+
if err := s.Shutdown(context.Background()); err != nil {
66+
if err != http.ErrServerClosed {
67+
glog.Errorf("error stopping metrics listener: %v", err)
68+
}
69+
} else {
70+
glog.Infof("Metrics listener successfully stopped")
6771
}
72+
6873
}

0 commit comments

Comments
 (0)