
Commit dbedb7a

cvo: When the CVO restarts, perform one final sync to write status
When we upgrade, the CVO causes itself to restart by updating its own deployment. The CVO is signalled with SIGTERM and then releases the leader lease. However, there is no guarantee that the latest status of the CVO has been flushed to the cluster version object, which can mean the "verified: true" flag that the sync worker calculates when it retrieves the payload never gets written. The new CVO pod loads from the payload and so doesn't have the verified flag.

While in the future we may want to completely decouple verification from payload retrieval (a background worker that verifies available updates as well as checking historical records), for now we need to ensure the loaded state is persisted to the ClusterVersion. Since there may be useful human-readable information about the payload that a failed new CVO pod might not get a chance to write, alter the CVO sync loop to perform one final status sync during shutdown, and increase the amount of time we wait before hard shutdown to 5s to give that sync more room to happen.
1 parent 51fef0b commit dbedb7a

File tree

pkg/cvo/cvo.go
pkg/start/start.go

2 files changed: +18 -4 lines

pkg/cvo/cvo.go

Lines changed: 17 additions & 3 deletions
@@ -226,8 +226,11 @@ func (optr *Operator) InitializeFromPayload(restConfig *rest.Config, burstRestCo
 // Run runs the cluster version operator until stopCh is completed. Workers is ignored for now.
 func (optr *Operator) Run(ctx context.Context, workers int) {
 	defer utilruntime.HandleCrash()
-	defer optr.queue.ShutDown()
+	// TODO: when Kube 77170 is fixed we can remove the use of the once here
+	var shutdownOnce sync.Once
+	defer shutdownOnce.Do(func() { optr.queue.ShutDown() })
 	stopCh := ctx.Done()
+	workerStopCh := make(chan struct{})
 
 	glog.Infof("Starting ClusterVersionOperator with minimum reconcile period %s", optr.minimumUpdateCheckInterval)
 	defer glog.Info("Shutting down ClusterVersionOperator")
@@ -243,11 +246,22 @@ func (optr *Operator) Run(ctx context.Context, workers int) {
 	// start the config sync loop, and have it notify the queue when new status is detected
 	go runThrottledStatusNotifier(stopCh, optr.statusInterval, 2, optr.configSync.StatusCh(), func() { optr.queue.Add(optr.queueKey()) })
 	go optr.configSync.Start(ctx, 16)
-
-	go wait.Until(func() { optr.worker(optr.queue, optr.sync) }, time.Second, stopCh)
 	go wait.Until(func() { optr.worker(optr.availableUpdatesQueue, optr.availableUpdatesSync) }, time.Second, stopCh)
+	go wait.Until(func() {
+		defer close(workerStopCh)
+
+		// run the worker, then when the queue is closed sync one final time to flush any pending status
+		optr.worker(optr.queue, optr.sync)
+		if err := optr.sync(optr.queueKey()); err != nil {
+			utilruntime.HandleError(fmt.Errorf("unable to perform final sync: %v", err))
+		}
+	}, time.Second, stopCh)
 
 	<-stopCh
+
+	// stop the queue, then wait for the worker to exit
+	shutdownOnce.Do(func() { optr.queue.ShutDown() })
+	<-workerStopCh
 }
 
 func (optr *Operator) queueKey() string {
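As a rough illustration of the pattern this diff adds to Run (not the CVO code itself), the sketch below uses a plain channel as a stand-in for the operator's workqueue: a sync.Once guards shutdown, the worker drains the queue, runs one final sync to flush pending status, and then signals that it has exited. The processKey and finalSync names are illustrative placeholders.

// Minimal sketch of the shutdown pattern introduced above, assuming a plain
// channel in place of the operator's workqueue. processKey and finalSync are
// illustrative placeholders, not CVO functions.
package main

import (
	"fmt"
	"sync"
	"time"
)

func processKey(key string) { fmt.Println("sync", key) }
func finalSync()            { fmt.Println("final status sync before exit") }

func main() {
	stopCh := make(chan struct{})       // closed when the process is told to stop (e.g. SIGTERM)
	workerStopCh := make(chan struct{}) // closed when the worker has fully exited

	queue := make(chan string, 16)
	var shutdownOnce sync.Once
	shutdown := func() { shutdownOnce.Do(func() { close(queue) }) }
	defer shutdown() // mirrors `defer shutdownOnce.Do(func() { optr.queue.ShutDown() })`

	go func() {
		defer close(workerStopCh)
		// Drain the queue until it is shut down ...
		for key := range queue {
			processKey(key)
		}
		// ... then sync one final time so in-memory state (e.g. the payload's
		// "verified" flag) is flushed before the process exits.
		finalSync()
	}()

	queue <- "cluster-version"
	time.AfterFunc(50*time.Millisecond, func() { close(stopCh) }) // simulate receiving SIGTERM

	<-stopCh
	shutdown()     // stop the queue, then ...
	<-workerStopCh // ... wait for the worker (and its final sync) to finish
}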

pkg/start/start.go

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ func (o *Options) Run() error {
 
 	// exit after 2s no matter what
 	select {
-	case <-time.After(2 * time.Second):
+	case <-time.After(5 * time.Second):
 		glog.Fatalf("Exiting")
 	case <-ch:
 		glog.Fatalf("Received shutdown signal twice, exiting")
