Skip to content

Commit 4509154

Browse files
committed
Gracefully shut down operator and release lease
This:
- adds main/lease contexts to the operator
- sets up a counter and channels to track goroutine completion
- sets up a signal handler to catch when the operator is being terminated, so we can cancel our contexts
- gracefully shuts down the operator upon receipt of a SIGINT/SIGTERM

The reason this does not use sync.WaitGroup is that sync.WaitGroup has no awareness of 'what' it is waiting for, just 'how many', so the channels are more useful.

Cribbed from what the CVO did here: openshift/cluster-version-operator#424
1 parent 1ac1bce commit 4509154

File tree

1 file changed

+77
-11
lines changed

1 file changed

+77
-11
lines changed

cmd/machine-config-operator/start.go

Lines changed: 77 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"flag"
66
"os"
7+
"time"
78

89
"github.com/golang/glog"
910
"github.com/openshift/machine-config-operator/cmd/common"
@@ -12,6 +13,7 @@ import (
1213
"github.com/openshift/machine-config-operator/pkg/operator"
1314
"github.com/openshift/machine-config-operator/pkg/version"
1415
"github.com/spf13/cobra"
16+
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
1517
"k8s.io/client-go/tools/leaderelection"
1618
)
1719

@@ -35,10 +37,24 @@ func init() {
3537
startCmd.PersistentFlags().StringVar(&startOpts.imagesFile, "images-json", "", "images.json file for MCO.")
3638
}
3739

40+
// asyncResult is the message a goroutine sends on the result channel when it
// finishes, so the collector can tell which goroutine ended and whether it
// ended with an error.
type asyncResult struct {
41+
// name is a human-readable label for the goroutine, used in log messages.
name string
42+
// error is the error the goroutine finished with; nil is logged as a clean exit.
error error
43+
}
44+
3845
func runStartCmd(cmd *cobra.Command, args []string) {
3946
flag.Set("logtostderr", "true")
4047
flag.Parse()
4148

49+
// This is the context that signals whether the operator should be running and doing work
50+
runContext, runCancel := context.WithCancel(context.Background())
51+
// This is the context that signals whether we should release our leader lease
52+
leaderContext, leaderCancel := context.WithCancel(context.Background())
53+
54+
// So we can collect status of our goroutines
55+
resultChannel := make(chan asyncResult, 1)
56+
resultChannelCount := 0
57+
4258
// To help debugging, immediately log version
4359
glog.Infof("Version: %s (Raw: %s, Hash: %s)", os.Getenv("RELEASE_VERSION"), version.Raw, version.Hash)
4460

@@ -50,8 +66,11 @@ func runStartCmd(cmd *cobra.Command, args []string) {
5066
if err != nil {
5167
glog.Fatalf("error creating clients: %v", err)
5268
}
53-
run := func(ctx context.Context) {
54-
ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), ctrlcommon.MCONamespace)
69+
run := func(_ context.Context) {
70+
71+
go common.SignalHandler(runCancel)
72+
73+
ctrlctx := ctrlcommon.CreateControllerContext(cb, runContext.Done(), ctrlcommon.MCONamespace)
5574
controller := operator.New(
5675
ctrlcommon.MCONamespace, componentName,
5776
startOpts.imagesFile,
@@ -89,22 +108,69 @@ func runStartCmd(cmd *cobra.Command, args []string) {
89108
ctrlctx.KubeMAOSharedInformer.Start(ctrlctx.Stop)
90109
close(ctrlctx.InformersStarted)
91110

92-
go controller.Run(2, ctrlctx.Stop)
111+
resultChannelCount++
112+
go func() {
113+
defer utilruntime.HandleCrash()
114+
controller.Run(runContext, 2)
115+
resultChannel <- asyncResult{name: "main operator", error: err}
116+
}()
117+
118+
// TODO(jkyros): This might be overkill for the operator, since it only has one goroutine
119+
var shutdownTimer *time.Timer
120+
for resultChannelCount > 0 {
121+
glog.Infof("Waiting on %d outstanding goroutines.", resultChannelCount)
122+
if shutdownTimer == nil { // running
123+
select {
124+
case <-runContext.Done():
125+
glog.Info("Run context completed; beginning two-minute graceful shutdown period.")
126+
shutdownTimer = time.NewTimer(2 * time.Minute)
93127

94-
select {}
128+
case result := <-resultChannel:
129+
// TODO(jkyros): one of our goroutines puked early, this means we shut down everything.
130+
resultChannelCount--
131+
if result.error == nil {
132+
glog.Infof("Collected %s goroutine.", result.name)
133+
} else {
134+
glog.Errorf("Collected %s goroutine: %v", result.name, result.error)
135+
runCancel() // this will cause shutdownTimer initialization in the next loop
136+
}
137+
}
138+
} else { // shutting down
139+
select {
140+
case <-shutdownTimer.C: // never fires again once the timer is stopped, although it would not matter much if it did because subsequent cancel calls do nothing.
141+
leaderCancel()
142+
shutdownTimer.Stop()
143+
case result := <-resultChannel:
144+
resultChannelCount--
145+
if result.error == nil {
146+
glog.Infof("Collected %s goroutine.", result.name)
147+
} else {
148+
glog.Errorf("Collected %s goroutine: %v", result.name, result.error)
149+
}
150+
if resultChannelCount == 0 {
151+
glog.Info("That was the last one, cancelling the leader lease.")
152+
leaderCancel()
153+
}
154+
}
155+
}
156+
}
157+
glog.Info("Finished collecting operator goroutines.")
95158
}
96159

97-
leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig())
160+
// TODO(jkyros): should this be a different "pre-run" context here?
161+
leaderElectionCfg := common.GetLeaderElectionConfig(runContext, cb.GetBuilderConfig())
98162

99-
leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
100-
Lock: common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),
101-
LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
102-
RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
103-
RetryPeriod: leaderElectionCfg.RetryPeriod.Duration,
163+
leaderelection.RunOrDie(leaderContext, leaderelection.LeaderElectionConfig{
164+
Lock: common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),
165+
ReleaseOnCancel: true,
166+
LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
167+
RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
168+
RetryPeriod: leaderElectionCfg.RetryPeriod.Duration,
104169
Callbacks: leaderelection.LeaderCallbacks{
105170
OnStartedLeading: run,
106171
OnStoppedLeading: func() {
107-
glog.Fatalf("leaderelection lost")
172+
glog.Infof("Stopped leading. Terminating.")
173+
os.Exit(0)
108174
},
109175
},
110176
})

0 commit comments

Comments
 (0)