44 "context"
55 "flag"
66 "os"
7+ "time"
78
89 "github.com/golang/glog"
910 "github.com/openshift/machine-config-operator/cmd/common"
@@ -12,6 +13,7 @@ import (
1213 "github.com/openshift/machine-config-operator/pkg/operator"
1314 "github.com/openshift/machine-config-operator/pkg/version"
1415 "github.com/spf13/cobra"
16+ utilruntime "k8s.io/apimachinery/pkg/util/runtime"
1517 "k8s.io/client-go/tools/leaderelection"
1618)
1719
@@ -35,10 +37,24 @@ func init() {
 	startCmd.PersistentFlags().StringVar(&startOpts.imagesFile, "images-json", "", "images.json file for MCO.")
 }
 
+type asyncResult struct {
+	name  string
+	error error
+}
+
 func runStartCmd(cmd *cobra.Command, args []string) {
 	flag.Set("logtostderr", "true")
 	flag.Parse()
 
+	// This is the context that signals whether the operator should be running and doing work
+	runContext, runCancel := context.WithCancel(context.Background())
+	// This is the context that signals whether we should release our leader lease
+	leaderContext, leaderCancel := context.WithCancel(context.Background())
+
+	// So we can collect status of our goroutines
+	resultChannel := make(chan asyncResult, 1)
+	resultChannelCount := 0
+
 	// To help debugging, immediately log version
 	glog.Infof("Version: %s (Raw: %s, Hash: %s)", os.Getenv("RELEASE_VERSION"), version.Raw, version.Hash)
 
@@ -50,8 +66,11 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 	if err != nil {
 		glog.Fatalf("error creating clients: %v", err)
 	}
-	run := func(ctx context.Context) {
-		ctrlctx := ctrlcommon.CreateControllerContext(cb, ctx.Done(), ctrlcommon.MCONamespace)
+	run := func(_ context.Context) {
+
+		go common.SignalHandler(runCancel)
+
+		ctrlctx := ctrlcommon.CreateControllerContext(cb, runContext.Done(), ctrlcommon.MCONamespace)
 		controller := operator.New(
 			ctrlcommon.MCONamespace, componentName,
 			startOpts.imagesFile,
@@ -89,22 +108,69 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 		ctrlctx.KubeMAOSharedInformer.Start(ctrlctx.Stop)
 		close(ctrlctx.InformersStarted)
 
-		go controller.Run(2, ctrlctx.Stop)
+		resultChannelCount++
+		go func() {
+			defer utilruntime.HandleCrash()
+			controller.Run(runContext, 2)
+			resultChannel <- asyncResult{name: "main operator", error: err}
+		}()
+
+		// TODO(jkyros): This might be overkill for the operator; it only has one goroutine.
+		var shutdownTimer *time.Timer
+		for resultChannelCount > 0 {
+			glog.Infof("Waiting on %d outstanding goroutines.", resultChannelCount)
+			if shutdownTimer == nil { // running
+				select {
+				case <-runContext.Done():
+					glog.Info("Run context completed; beginning two-minute graceful shutdown period.")
+					shutdownTimer = time.NewTimer(2 * time.Minute)
 
-		select {}
+				case result := <-resultChannel:
+					// TODO(jkyros): one of our goroutines puked early, so we shut everything down.
+					resultChannelCount--
+					if result.error == nil {
+						glog.Infof("Collected %s goroutine.", result.name)
+					} else {
+						glog.Errorf("Collected %s goroutine: %v", result.name, result.error)
+						runCancel() // this will cause the shutdownTimer to be initialized on the next loop iteration
+					}
+				}
+			} else { // shutting down
+				select {
+				case <-shutdownTimer.C: // never fires after the timer is stopped, although it would not matter much if it did, because subsequent cancel calls do nothing
+					leaderCancel()
+					shutdownTimer.Stop()
+				case result := <-resultChannel:
+					resultChannelCount--
+					if result.error == nil {
+						glog.Infof("Collected %s goroutine.", result.name)
+					} else {
+						glog.Errorf("Collected %s goroutine: %v", result.name, result.error)
+					}
+					if resultChannelCount == 0 {
+						glog.Info("That was the last one, cancelling the leader lease.")
+						leaderCancel()
+					}
+				}
+			}
+		}
+		glog.Info("Finished collecting operator goroutines.")
 	}
 
-	leaderElectionCfg := common.GetLeaderElectionConfig(cb.GetBuilderConfig())
+	// TODO(jkyros): should this be a different "pre-run" context here?
+	leaderElectionCfg := common.GetLeaderElectionConfig(runContext, cb.GetBuilderConfig())
 
-	leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
-		Lock:          common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),
-		LeaseDuration: leaderElectionCfg.LeaseDuration.Duration,
-		RenewDeadline: leaderElectionCfg.RenewDeadline.Duration,
-		RetryPeriod:   leaderElectionCfg.RetryPeriod.Duration,
+	leaderelection.RunOrDie(leaderContext, leaderelection.LeaderElectionConfig{
+		Lock:            common.CreateResourceLock(cb, ctrlcommon.MCONamespace, componentName),
+		ReleaseOnCancel: true,
+		LeaseDuration:   leaderElectionCfg.LeaseDuration.Duration,
+		RenewDeadline:   leaderElectionCfg.RenewDeadline.Duration,
+		RetryPeriod:     leaderElectionCfg.RetryPeriod.Duration,
 		Callbacks: leaderelection.LeaderCallbacks{
 			OnStartedLeading: run,
 			OnStoppedLeading: func() {
-				glog.Fatalf("leaderelection lost")
+				glog.Infof("Stopped leading. Terminating.")
+				os.Exit(0)
 			},
 		},
 	})
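
For reference, below is a minimal, self-contained sketch of the shutdown sequencing this change introduces: cancelling the run context asks the worker goroutines to stop, a grace timer bounds how long we wait for them, and the leader context is only cancelled once the workers are collected or the grace period expires (in the real code that cancellation, together with ReleaseOnCancel: true, is what releases the leader lease). It uses only the standard library; the fake worker, the 100 ms "signal", and the 2-second grace period are illustrative stand-ins, not the operator's real values.

// Sketch only: names, durations, and the worker are hypothetical stand-ins.
package main

import (
	"context"
	"log"
	"time"
)

type asyncResult struct {
	name  string
	error error
}

func main() {
	// runCtx tells workers to stop doing work; leaderCtx stands in for the
	// context whose cancellation would release the leader lease.
	runCtx, runCancel := context.WithCancel(context.Background())
	leaderCtx, leaderCancel := context.WithCancel(context.Background())

	results := make(chan asyncResult, 1)
	outstanding := 0

	// Hypothetical worker: runs until the run context is cancelled.
	outstanding++
	go func() {
		<-runCtx.Done()
		results <- asyncResult{name: "worker", error: nil}
	}()

	// Stand-in for a signal handler: request shutdown shortly after start.
	time.AfterFunc(100*time.Millisecond, runCancel)

	var shutdownTimer *time.Timer
	for outstanding > 0 {
		if shutdownTimer == nil { // running
			select {
			case <-runCtx.Done():
				shutdownTimer = time.NewTimer(2 * time.Second) // grace period
			case r := <-results:
				outstanding--
				log.Printf("collected %s early: %v", r.name, r.error)
				runCancel() // a worker exiting early tears everything down
			}
		} else { // shutting down
			select {
			case <-shutdownTimer.C:
				leaderCancel() // grace period expired; give up the lease anyway
			case r := <-results:
				outstanding--
				log.Printf("collected %s: %v", r.name, r.error)
				if outstanding == 0 {
					leaderCancel() // last worker done; safe to release the lease
				}
			}
		}
	}
	leaderCancel() // idempotent; covers the early-exit path
	<-leaderCtx.Done()
	log.Print("lease released, exiting")
}

Keeping the lease-release cancel separate from the run cancel is the point of the two contexts: the operator gets to finish (or time-box) its teardown before the lock can pass to another replica.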