From 55ff6032a3920905bec670e3201707cf7fdf8e5c Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 15 Apr 2020 08:17:02 -0700 Subject: [PATCH 1/9] pkg/start: Drop the internal EnableMetrics We've had it since 2b81f47626 (cvo: Release our leader lease when we are gracefully terminated, 2019-01-16, #87), but it's redundant vs. "ListenAddr is not an empty string". I'm also switching to: o.ListenAddr != "" instead of: len(o.ListenAddr) > 0 because it seems slightly easier to understand, but obviously either will work. Cherry-picked from 07e5809d54 (#349), around conflicts due to the lack of TLS metrics in 4.5. --- pkg/start/start.go | 6 ++---- pkg/start/start_integration_test.go | 4 ---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pkg/start/start.go b/pkg/start/start.go index c56436f23b..f794806409 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -67,7 +67,6 @@ type Options struct { Name string Namespace string PayloadOverride string - EnableMetrics bool ResyncInterval time.Duration } @@ -91,7 +90,6 @@ func NewOptions() *Options { Name: defaultEnv("CVO_NAME", defaultComponentName), PayloadOverride: os.Getenv("PAYLOAD_OVERRIDE"), ResyncInterval: minResyncPeriod, - EnableMetrics: true, Exclude: os.Getenv("EXCLUDE_MANIFESTS"), } } @@ -155,7 +153,7 @@ func (o *Options) Run() error { func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) { // listen on metrics - if len(o.ListenAddr) > 0 { + if o.ListenAddr != "" { mux := http.NewServeMux() mux.Handle("/metrics", promhttp.Handler()) go func() { @@ -345,7 +343,7 @@ func (o *Options) NewControllerContext(cb *ClientBuilder) *Context { sharedInformers.Config().V1().Proxies(), cb.ClientOrDie(o.Namespace), cb.KubeClientOrDie(o.Namespace, useProtobuf), - o.EnableMetrics, + o.ListenAddr != "", o.Exclude, ), } diff --git a/pkg/start/start_integration_test.go b/pkg/start/start_integration_test.go index 446fba9d4d..e7cc2cad85 100644 --- a/pkg/start/start_integration_test.go +++ b/pkg/start/start_integration_test.go @@ -238,7 +238,6 @@ func TestIntegrationCVO_initializeAndUpgrade(t *testing.T) { options.NodeName = "test-node" options.ReleaseImage = payloadImage1 options.PayloadOverride = filepath.Join(dir, "ignored") - options.EnableMetrics = false controllers := options.NewControllerContext(cb) worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Steps: 3}, "") @@ -390,7 +389,6 @@ func TestIntegrationCVO_initializeAndHandleError(t *testing.T) { options.NodeName = "test-node" options.ReleaseImage = payloadImage1 options.PayloadOverride = filepath.Join(dir, "ignored") - options.EnableMetrics = false options.ResyncInterval = 3 * time.Second controllers := options.NewControllerContext(cb) @@ -497,7 +495,6 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { options.Name = ns options.ListenAddr = "" options.NodeName = "test-node" - options.EnableMetrics = false controllers := options.NewControllerContext(cb) worker := cvo.NewSyncWorker(&mapPayloadRetriever{}, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Steps: 3}, "") @@ -667,7 +664,6 @@ metadata: options.NodeName = "test-node" options.ReleaseImage = payloadImage1 options.PayloadOverride = payloadDir - options.EnableMetrics = false controllers := options.NewControllerContext(cb) if err := controllers.CVO.InitializeFromPayload(cb.RestConfig(defaultQPS), cb.RestConfig(highQPS)); err != nil { t.Fatal(err) From d257c32ac7bdc58b7a33ff6099d9cd1872f5410e Mon Sep 
17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 15 Apr 2020 10:04:45 -0700 Subject: [PATCH 2/9] pkg/cvo/metrics: Graceful server shutdown Somewhat like the example in [1]. This pushes the server management down into a new RunMetrics method, which we then run in its own goroutine. This is initial groundwork; I expect we will port more of our child goroutines to this framework in follow-up work. Cherry-picked from b30aa0ecdb (#349), around conflicts due to the lack of TLS metrics in 4.5. [1]: https://golang.org/pkg/net/http/#Server.Shutdown --- pkg/cvo/metrics.go | 66 ++++++++++++++++++++++++++++++++++++++++++++++ pkg/start/start.go | 46 +++++++++++++++++++++++++------- 2 files changed, 102 insertions(+), 10 deletions(-) diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index db03328694..3c2be69760 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -1,14 +1,19 @@ package cvo import ( + "context" + "net" + "net/http" "time" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/cache" + "k8s.io/klog" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/cluster-version-operator/lib/resourcemerge" @@ -86,6 +91,67 @@ version for 'cluster', or empty for 'initial'. } } +// RunMetrics launches an server bound to listenAddress serving +// Prometheus metrics at /metrics over HTTP. Continues serving until +// runContext.Done() and then attempts a clean shutdown limited by +// shutdownContext.Done(). Assumes runContext.Done() occurs before or +// simultaneously with shutdownContext.Done(). 
+func RunMetrics(runContext context.Context, shutdownContext context.Context, listenAddress string) error { + handler := http.NewServeMux() + handler.Handle("/metrics", promhttp.Handler()) + server := &http.Server{ + Handler: handler, + } + + errorChannel := make(chan error, 1) + errorChannelCount := 1 + go func() { + tcpListener, err := net.Listen("tcp", listenAddress) + if err != nil { + errorChannel <- err + return + } + + klog.Infof("Metrics port listening for HTTP on %v", listenAddress) + + errorChannel <- server.Serve(tcpListener) + }() + + shutdown := false + var loopError error + for errorChannelCount > 0 { + if shutdown { + err := <-errorChannel + errorChannelCount-- + if err != nil && err != http.ErrServerClosed { + if loopError == nil { + loopError = err + } else if err != nil { // log the error we are discarding + klog.Errorf("Failed to gracefully shut down metrics server: %s", err) + } + } + } else { + select { + case <-runContext.Done(): // clean shutdown + case err := <-errorChannel: // crashed before a shutdown was requested + errorChannelCount-- + if err != nil && err != http.ErrServerClosed { + loopError = err + } + } + shutdown = true + shutdownError := server.Shutdown(shutdownContext) + if loopError == nil { + loopError = shutdownError + } else if shutdownError != nil { // log the error we are discarding + klog.Errorf("Failed to gracefully shut down metrics server: %s", shutdownError) + } + } + } + + return loopError +} + type conditionKey struct { Name string Type string diff --git a/pkg/start/start.go b/pkg/start/start.go index f794806409..76fb4b5080 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -6,7 +6,6 @@ import ( "context" "fmt" "math/rand" - "net/http" "os" "os/signal" "sync" @@ -14,7 +13,6 @@ import ( "time" "github.com/google/uuid" - "github.com/prometheus/client_golang/prometheus/promhttp" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/informers" @@ -152,14 +150,14 @@ func (o *Options) Run() error { } func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) { - // listen on metrics + runContext, runCancel := context.WithCancel(ctx) + shutdownContext, shutdownCancel := context.WithCancel(ctx) + errorChannel := make(chan error, 1) + errorChannelCount := 0 if o.ListenAddr != "" { - mux := http.NewServeMux() - mux.Handle("/metrics", promhttp.Handler()) + errorChannelCount++ go func() { - if err := http.ListenAndServe(o.ListenAddr, mux); err != nil { - klog.Fatalf("Unable to start metrics server: %v", err) - } + errorChannel <- cvo.RunMetrics(runContext, shutdownContext, o.ListenAddr) }() } @@ -176,9 +174,9 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc RetryPeriod: retryPeriod, Callbacks: leaderelection.LeaderCallbacks{ OnStartedLeading: func(localCtx context.Context) { - controllerCtx.Start(ctx) + controllerCtx.Start(runContext) select { - case <-ctx.Done(): + case <-runContext.Done(): // WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel // and client-go ContextCancelable, which allows us to block new API requests before // we step down. 
However, the CVO isn't that sensitive to races and can tolerate @@ -207,6 +205,34 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc }, }) + for errorChannelCount > 0 { + var shutdownTimer *time.Timer + if shutdownTimer == nil { // running + select { + case <-runContext.Done(): + shutdownTimer = time.NewTimer(2 * time.Minute) + case err := <-errorChannel: + errorChannelCount-- + if err != nil { + klog.Error(err) + runCancel() // this will cause shutdownTimer initialization in the next loop + } + } + } else { // shutting down + select { + case <-shutdownTimer.C: // never triggers after the channel is stopped, although it would not matter much if it did because subsequent cancel calls do nothing. + shutdownCancel() + shutdownTimer.Stop() + case err := <-errorChannel: + errorChannelCount-- + if err != nil { + klog.Error(err) + runCancel() + } + } + } + } + <-exit } From f8774c0f4670eca0dbbb1f6e977f3eb83a4456c1 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 15 Apr 2020 11:24:35 -0700 Subject: [PATCH 3/9] pkg/start: Register metrics directly Pulling this up out of cvo.New() while working to decouple metrics handling from the core CVO goroutine. --- pkg/cvo/cvo.go | 6 ------ pkg/cvo/metrics.go | 4 +++- pkg/start/start.go | 11 +++++++++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index d5bc3d4ce0..c38b9c5e05 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -169,7 +169,6 @@ func New( proxyInformer configinformersv1.ProxyInformer, client clientset.Interface, kubeClient kubernetes.Interface, - enableMetrics bool, exclude string, ) *Operator { eventBroadcaster := record.NewBroadcaster() @@ -214,11 +213,6 @@ func New( // make sure this is initialized after all the listers are initialized optr.upgradeableChecks = optr.defaultUpgradeableChecks() - if enableMetrics { - if err := optr.registerMetrics(coInformer.Informer()); err != nil { - panic(err) - } - } return optr } diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 3c2be69760..34f92203fa 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -20,7 +20,9 @@ import ( "github.com/openshift/cluster-version-operator/pkg/internal" ) -func (optr *Operator) registerMetrics(coInformer cache.SharedInformer) error { +// RegisterMetrics initializes metrics and registers them with the +// Prometheus implementation. 
+func (optr *Operator) RegisterMetrics(coInformer cache.SharedInformer) error { m := newOperatorMetrics(optr) coInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ UpdateFunc: m.clusterOperatorChanged, diff --git a/pkg/start/start.go b/pkg/start/start.go index 76fb4b5080..a598396c14 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -151,7 +151,9 @@ func (o *Options) Run() error { func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) { runContext, runCancel := context.WithCancel(ctx) + defer runCancel() shutdownContext, shutdownCancel := context.WithCancel(ctx) + defer shutdownCancel() errorChannel := make(chan error, 1) errorChannelCount := 0 if o.ListenAddr != "" { @@ -351,6 +353,7 @@ func (o *Options) NewControllerContext(cb *ClientBuilder) *Context { sharedInformers := externalversions.NewSharedInformerFactory(client, resyncPeriod(o.ResyncInterval)()) + coInformer := sharedInformers.Config().V1().ClusterOperators() ctx := &Context{ CVInformerFactory: cvInformer, OpenshiftConfigInformerFactory: openshiftConfigInformer, @@ -364,12 +367,11 @@ func (o *Options) NewControllerContext(cb *ClientBuilder) *Context { o.PayloadOverride, resyncPeriod(o.ResyncInterval)(), cvInformer.Config().V1().ClusterVersions(), - sharedInformers.Config().V1().ClusterOperators(), + coInformer, openshiftConfigInformer.Core().V1().ConfigMaps(), sharedInformers.Config().V1().Proxies(), cb.ClientOrDie(o.Namespace), cb.KubeClientOrDie(o.Namespace, useProtobuf), - o.ListenAddr != "", o.Exclude, ), } @@ -382,6 +384,11 @@ func (o *Options) NewControllerContext(cb *ClientBuilder) *Context { cb.KubeClientOrDie(o.Namespace), ) } + if o.ListenAddr != "" { + if err := ctx.CVO.RegisterMetrics(coInformer.Informer()); err != nil { + panic(err) + } + } return ctx } From d8ca1343ff05215736ddaf299a4dc3b3d10f3b61 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 21 Apr 2020 13:53:13 -0700 Subject: [PATCH 4/9] pkg/cvo/egress: Pull HTTPS/Proxy egress into separate file These are not just for available updates, they're also for downloading signatures. Placing them in a separate file makes it easier to focus on the code that is specific to available updates. --- pkg/cvo/availableupdates.go | 53 -------------------------------- pkg/cvo/egress.go | 61 +++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 53 deletions(-) create mode 100644 pkg/cvo/egress.go diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 66dd6d6bb8..72aaa8ea5a 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -2,7 +2,6 @@ package cvo import ( "crypto/tls" - "crypto/x509" "fmt" "net/url" "runtime" @@ -11,7 +10,6 @@ import ( "github.com/blang/semver" "github.com/google/uuid" "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog" @@ -197,54 +195,3 @@ func calculateAvailableUpdatesStatus(clusterID string, proxyURL *url.URL, tlsCon LastTransitionTime: metav1.Now(), } } - -// getHTTPSProxyURL returns a url.URL object for the configured -// https proxy only. It can be nil if does not exist or there is an error. 
-func (optr *Operator) getHTTPSProxyURL() (*url.URL, string, error) { - proxy, err := optr.proxyLister.Get("cluster") - - if errors.IsNotFound(err) { - return nil, "", nil - } - if err != nil { - return nil, "", err - } - - if &proxy.Spec != nil { - if proxy.Spec.HTTPSProxy != "" { - proxyURL, err := url.Parse(proxy.Spec.HTTPSProxy) - if err != nil { - return nil, "", err - } - return proxyURL, proxy.Spec.TrustedCA.Name, nil - } - } - return nil, "", nil -} - -func (optr *Operator) getTLSConfig(cmNameRef string) (*tls.Config, error) { - cm, err := optr.cmConfigLister.Get(cmNameRef) - - if err != nil { - return nil, err - } - - certPool, _ := x509.SystemCertPool() - if certPool == nil { - certPool = x509.NewCertPool() - } - - if cm.Data["ca-bundle.crt"] != "" { - if ok := certPool.AppendCertsFromPEM([]byte(cm.Data["ca-bundle.crt"])); !ok { - return nil, fmt.Errorf("unable to add ca-bundle.crt certificates") - } - } else { - return nil, nil - } - - config := &tls.Config{ - RootCAs: certPool, - } - - return config, nil -} diff --git a/pkg/cvo/egress.go b/pkg/cvo/egress.go new file mode 100644 index 0000000000..75cfa607c1 --- /dev/null +++ b/pkg/cvo/egress.go @@ -0,0 +1,61 @@ +package cvo + +import ( + "crypto/tls" + "crypto/x509" + "fmt" + "net/url" + + "k8s.io/apimachinery/pkg/api/errors" +) + +// getHTTPSProxyURL returns a url.URL object for the configured +// https proxy only. It can be nil if does not exist or there is an error. +func (optr *Operator) getHTTPSProxyURL() (*url.URL, string, error) { + proxy, err := optr.proxyLister.Get("cluster") + + if errors.IsNotFound(err) { + return nil, "", nil + } + if err != nil { + return nil, "", err + } + + if &proxy.Spec != nil { + if proxy.Spec.HTTPSProxy != "" { + proxyURL, err := url.Parse(proxy.Spec.HTTPSProxy) + if err != nil { + return nil, "", err + } + return proxyURL, proxy.Spec.TrustedCA.Name, nil + } + } + return nil, "", nil +} + +func (optr *Operator) getTLSConfig(cmNameRef string) (*tls.Config, error) { + cm, err := optr.cmConfigLister.Get(cmNameRef) + + if err != nil { + return nil, err + } + + certPool, _ := x509.SystemCertPool() + if certPool == nil { + certPool = x509.NewCertPool() + } + + if cm.Data["ca-bundle.crt"] != "" { + if ok := certPool.AppendCertsFromPEM([]byte(cm.Data["ca-bundle.crt"])); !ok { + return nil, fmt.Errorf("unable to add ca-bundle.crt certificates") + } + } else { + return nil, nil + } + + config := &tls.Config{ + RootCAs: certPool, + } + + return config, nil +} From 905b3058edc32f5813b670e9b4356ae6c5f55c3f Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 3 Aug 2020 13:11:41 -0700 Subject: [PATCH 5/9] pkg/start: Release leader lease on graceful shutdown So the incoming cluster-version operator doesn't need to wait for the outgoing operator's lease to expire, which can take a while [1]: I0802 10:06:01.056591 1 leaderelection.go:243] attempting to acquire leader lease openshift-cluster-version/version... ... I0802 10:07:42.632719 1 leaderelection.go:253] successfully acquired lease openshift-cluster-version/version and time out the: Cluster did not acknowledge request to upgrade in a reasonable time testcase [2]. Using ReleaseOnCancel has been the plan since 2b81f47626 (cvo: Release our leader lease when we are gracefully terminated, 2019-01-16, #87). I'm not clear on why it (sometimes?) doesn't work today. 
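For orientation, the client-go shape this series moves to, as a rough sketch (illustrative only, not part of the diff; runLeaderElection is an invented helper, and leaseDuration, renewDeadline, and retryPeriod refer to the values defined in pkg/start/start.go):

    package start // sketch only

    import (
        "context"

        "k8s.io/client-go/tools/leaderelection"
        "k8s.io/client-go/tools/leaderelection/resourcelock"
    )

    // runLeaderElection is a hypothetical helper; the real code inlines this
    // wiring in Options.run.
    func runLeaderElection(ctx context.Context, lock *resourcelock.ConfigMapLock, onLead func(context.Context), onLoss func()) {
        leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
            Lock:            lock,
            ReleaseOnCancel: true, // clear HolderIdentity on the way out
            LeaseDuration:   leaseDuration,
            RenewDeadline:   renewDeadline,
            RetryPeriod:     retryPeriod,
            Callbacks: leaderelection.LeaderCallbacks{
                OnStartedLeading: onLead, // launch CVO.Run and AutoUpdate.Run here
                OnStoppedLeading: onLoss, // lost or released the lease; unwind
            },
        })
    }

With ReleaseOnCancel, client-go clears the lease's HolderIdentity when ctx is canceled while we hold the lease, so the incoming operator can acquire it immediately instead of waiting out LeaseDuration.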
The discrepancy between the "exit after 2s no matter what" comment and the 5s After dates back to dbedb7accb (cvo: When the CVO restarts, perform one final sync to write status, 2019-04-27, #179), which bumped the After from 2s to 5s, but forgot to bump the comment. I'm removing that code here in favor of the two-minute timeout from b30aa0ecdb (pkg/cvo/metrics: Graceful server shutdown, 2020-04-15, #349). We still exit immediately on a second TERM, for folks who get impatient waiting for the graceful timeout.

Decouple shutdownContext from the context passed into Options.run, to allow TestIntegrationCVO_gracefulStepDown to request a graceful shutdown. And remove Context.Start(), inlining the logic in Options.run so we can count and reap the goroutines it used to launch. This also allows us to be more targeted with the context for each goroutine:

* Informers are now launched before the lease controller, so they're up and running by the time we acquire the lease. They remain running until the main operator CVO.Run() exits, after which we shut them down. Having informers running before we have a lease is somewhat expensive in terms of API traffic, but we should rarely have two CVO pods competing for leadership since we transitioned to the Recreate Deployment strategy in 078686d73b (install/0000_00_cluster-version-operator_03_deployment: Set 'strategy: Recreate', 2019-03-20, #140) and 5d8a5275a3 (install/0000_00_cluster-version-operator_03_deployment: Fix Recreate strategy, 2019-04-03, #155). I don't see a way to block on their internal goroutine's completion, but maybe informers will grow an API for that in the future.

* The metrics server also continues to run until CVO.Run() exits, where previously we began gracefully shutting it down at the same time we started shutting down CVO.Run(). This ensures we are around and publishing any last-minute CVO.Run() changes.

* Leader election also continues to run until CVO.Run() exits. We don't want to release the lease while we're still controlling things.

* CVO.Run() and AutoUpdate.Run() both stop immediately when the passed-in context is canceled or we call runCancel internally (because of a TERM, error from a goroutine, or loss of leadership). These are the only two goroutines that are actually writing to the API servers, so we want to shut them down as quickly as possible.

Drop an unnecessary runCancel() from the "shutting down" branch of the error collector. I'd added it in b30aa0ecdb, but you can only ever get into the "shutting down" branch if runCancel has already been called. And fix the scoping for the shutdownTimer variable so we don't clear it on each for-loop iteration (oops :p, bug from b30aa0ecdb).

Add some logging to the error collector, so it's easier to see where we are in the collection process from the operator logs. Also start logging collected goroutines by name, so we can figure out which may still be outstanding.

Set terminationGracePeriodSeconds 130 to extend the default 30s [3], to give the container the full two-minute graceful timeout window before the kubelet steps in with a KILL.

Push the Background() initialization all the way up to the command-line handler, to make it more obvious that the context is scoped to the whole 'start' invocation.
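The collection loop those notes describe, reduced to a self-contained sketch (the authoritative version is the pkg/start/start.go hunk below; collectGoroutines and its signature are invented here for illustration, and asyncResult mirrors the type this patch adds):

    package start // sketch only

    import (
        "context"
        "time"

        "k8s.io/klog"
    )

    type asyncResult struct {
        name  string
        error error
    }

    // collectGoroutines reaps previously launched goroutines, each of which
    // sends one asyncResult on results when it exits.  runCancel starts a
    // shutdown; shutdownCancel abandons the graceful window.
    func collectGoroutines(runContext context.Context, runCancel, shutdownCancel context.CancelFunc, results <-chan asyncResult, outstanding int) {
        var shutdownTimer *time.Timer // declared outside the loop so it is not reset every iteration
        for outstanding > 0 {
            if shutdownTimer == nil { // still running
                select {
                case <-runContext.Done():
                    shutdownTimer = time.NewTimer(2 * time.Minute) // graceful-shutdown window
                case result := <-results:
                    outstanding--
                    if result.error != nil {
                        klog.Errorf("Collected %s goroutine: %v", result.name, result.error)
                        runCancel() // one failure brings the rest down
                    } else {
                        klog.Infof("Collected %s goroutine.", result.name)
                    }
                }
            } else { // shutting down
                select {
                case <-shutdownTimer.C:
                    shutdownCancel() // give up on a graceful exit
                case result := <-results:
                    outstanding--
                    klog.Infof("Collected %s goroutine.", result.name)
                }
            }
        }
    }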
[1]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/25365/pull-ci-openshift-origin-master-e2e-gcp-upgrade/1289853267223777280/artifacts/e2e-gcp-upgrade/pods/openshift-cluster-version_cluster-version-operator-5b6ff896c6-57ppb_cluster-version-operator.log [2]: https://bugzilla.redhat.com/show_bug.cgi?id=1843505#c7 [3]: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#podspec-v1-core Cherry picked from cc1921d186 (#424), around conflicts due to the lack of TLS metrics and the Context pivots in 4.5. --- bootstrap/bootstrap-pod.yaml | 1 + cmd/start.go | 5 +- ...luster-version-operator_03_deployment.yaml | 1 + pkg/autoupdate/autoupdate.go | 7 +- pkg/cvo/cvo.go | 7 +- pkg/start/start.go | 187 +++++++++--------- pkg/start/start_integration_test.go | 23 ++- 7 files changed, 123 insertions(+), 108 deletions(-) diff --git a/bootstrap/bootstrap-pod.yaml b/bootstrap/bootstrap-pod.yaml index bc93746282..ca42facd11 100644 --- a/bootstrap/bootstrap-pod.yaml +++ b/bootstrap/bootstrap-pod.yaml @@ -37,6 +37,7 @@ spec: fieldRef: fieldPath: spec.nodeName hostNetwork: true + terminationGracePeriodSeconds: 130 volumes: - name: kubeconfig hostPath: diff --git a/cmd/start.go b/cmd/start.go index 20fae2301a..8f4cb7a51f 100644 --- a/cmd/start.go +++ b/cmd/start.go @@ -1,6 +1,8 @@ package main import ( + "context" + "github.com/spf13/cobra" "k8s.io/klog" @@ -18,9 +20,10 @@ func init() { // To help debugging, immediately log version klog.Infof("%s", version.String) - if err := opts.Run(); err != nil { + if err := opts.Run(context.Background()); err != nil { klog.Fatalf("error: %v", err) } + klog.Info("Graceful shutdown complete.") }, } diff --git a/install/0000_00_cluster-version-operator_03_deployment.yaml b/install/0000_00_cluster-version-operator_03_deployment.yaml index bc0ba32404..76ae8a980d 100644 --- a/install/0000_00_cluster-version-operator_03_deployment.yaml +++ b/install/0000_00_cluster-version-operator_03_deployment.yaml @@ -52,6 +52,7 @@ spec: nodeSelector: node-role.kubernetes.io/master: "" priorityClassName: "system-cluster-critical" + terminationGracePeriodSeconds: 130 tolerations: - key: "node-role.kubernetes.io/master" operator: Exists diff --git a/pkg/autoupdate/autoupdate.go b/pkg/autoupdate/autoupdate.go index 2b69378021..19c900210b 100644 --- a/pkg/autoupdate/autoupdate.go +++ b/pkg/autoupdate/autoupdate.go @@ -87,7 +87,7 @@ func New( } // Run runs the autoupdate controller. -func (ctrl *Controller) Run(workers int, stopCh <-chan struct{}) { +func (ctrl *Controller) Run(workers int, stopCh <-chan struct{}) error { defer utilruntime.HandleCrash() defer ctrl.queue.ShutDown() @@ -95,15 +95,16 @@ func (ctrl *Controller) Run(workers int, stopCh <-chan struct{}) { defer klog.Info("Shutting down AutoUpdateController") if !cache.WaitForCacheSync(stopCh, ctrl.cacheSynced...) { - klog.Info("Caches never synchronized") - return + return fmt.Errorf("caches never synchronized") } for i := 0; i < workers; i++ { + // FIXME: actually wait until these complete if the Context is canceled. go wait.Until(ctrl.worker, time.Second, stopCh) } <-stopCh + return nil } func (ctrl *Controller) eventHandler() cache.ResourceEventHandler { diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index c38b9c5e05..6733601d75 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -315,7 +315,7 @@ func loadConfigMapVerifierDataFromUpdate(update *payload.Update, clientBuilder v } // Run runs the cluster version operator until stopCh is completed. Workers is ignored for now. 
-func (optr *Operator) Run(ctx context.Context, workers int) { +func (optr *Operator) Run(ctx context.Context, workers int) error { defer utilruntime.HandleCrash() defer optr.queue.ShutDown() stopCh := ctx.Done() @@ -325,8 +325,7 @@ func (optr *Operator) Run(ctx context.Context, workers int) { defer klog.Info("Shutting down ClusterVersionOperator") if !cache.WaitForCacheSync(stopCh, optr.cacheSynced...) { - klog.Info("Caches never synchronized") - return + return fmt.Errorf("caches never synchronized: %w", ctx.Err()) } // trigger the first cluster version reconcile always @@ -355,6 +354,8 @@ func (optr *Operator) Run(ctx context.Context, workers int) { // stop the queue, then wait for the worker to exit optr.queue.ShutDown() <-workerStopCh + + return nil } func (optr *Operator) queueKey() string { diff --git a/pkg/start/start.go b/pkg/start/start.go index a598396c14..8d29374396 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -8,7 +8,6 @@ import ( "math/rand" "os" "os/signal" - "sync" "syscall" "time" @@ -68,6 +67,11 @@ type Options struct { ResyncInterval time.Duration } +type asyncResult struct { + name string + error error +} + func defaultEnv(name, defaultValue string) string { env, ok := os.LookupEnv(name) if !ok { @@ -92,7 +96,7 @@ func NewOptions() *Options { } } -func (o *Options) Run() error { +func (o *Options) Run(ctx context.Context) error { if o.NodeName == "" { return fmt.Errorf("node-name is required") } @@ -122,120 +126,122 @@ func (o *Options) Run() error { return err } - // TODO: Kube 1.14 will contain a ReleaseOnCancel boolean on - // LeaderElectionConfig that allows us to have the lock code - // release the lease when this context is cancelled. At that - // time we can remove our changes to OnStartedLeading. - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() + o.run(ctx, controllerCtx, lock) + return nil +} + +// run launches a number of goroutines to handle manifest application, +// metrics serving, etc. It continues operating until ctx.Done(), +// and then attempts a clean shutdown limited by an internal context +// with a two-minute cap. It returns after it successfully collects all +// launched goroutines. 
+func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) { + runContext, runCancel := context.WithCancel(ctx) // so we can cancel internally on errors or TERM + defer runCancel() + shutdownContext, shutdownCancel := context.WithCancel(context.Background()) // extends beyond ctx + defer shutdownCancel() + postMainContext, postMainCancel := context.WithCancel(context.Background()) // extends beyond ctx + defer postMainCancel() + ch := make(chan os.Signal, 1) defer func() { signal.Stop(ch) }() signal.Notify(ch, os.Interrupt, syscall.SIGTERM) go func() { sig := <-ch klog.Infof("Shutting down due to %s", sig) - cancel() - - // exit after 2s no matter what - select { - case <-time.After(5 * time.Second): - klog.Fatalf("Exiting") - case <-ch: - klog.Fatalf("Received shutdown signal twice, exiting") - } + runCancel() + sig = <-ch + klog.Fatalf("Received shutdown signal twice, exiting: %s", sig) }() - o.run(ctx, controllerCtx, lock) - return nil -} - -func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourcelock.ConfigMapLock) { - runContext, runCancel := context.WithCancel(ctx) - defer runCancel() - shutdownContext, shutdownCancel := context.WithCancel(ctx) - defer shutdownCancel() - errorChannel := make(chan error, 1) - errorChannelCount := 0 + resultChannel := make(chan asyncResult, 1) + resultChannelCount := 0 if o.ListenAddr != "" { - errorChannelCount++ + resultChannelCount++ go func() { - errorChannel <- cvo.RunMetrics(runContext, shutdownContext, o.ListenAddr) + err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr) + resultChannel <- asyncResult{name: "metrics server", error: err} }() } - exit := make(chan struct{}) - exitClose := sync.Once{} - - // TODO: when we switch to graceful lock shutdown, this can be - // moved back inside RunOrDie - // TODO: properly wire ctx here - go leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{ - Lock: lock, - LeaseDuration: leaseDuration, - RenewDeadline: renewDeadline, - RetryPeriod: retryPeriod, - Callbacks: leaderelection.LeaderCallbacks{ - OnStartedLeading: func(localCtx context.Context) { - controllerCtx.Start(runContext) - select { - case <-runContext.Done(): - // WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel - // and client-go ContextCancelable, which allows us to block new API requests before - // we step down. However, the CVO isn't that sensitive to races and can tolerate - // brief overlap. - klog.Infof("Stepping down as leader") - // give the controllers some time to shut down - time.Sleep(100 * time.Millisecond) - // if we still hold the leader lease, clear the owner identity (other lease watchers - // still have to wait for expiration) like the new ReleaseOnCancel code will do. - if err := lock.Update(resourcelock.LeaderElectionRecord{}); err == nil { - // if we successfully clear the owner identity, we can safely delete the record - if err := lock.Client.ConfigMaps(lock.ConfigMapMeta.Namespace).Delete(lock.ConfigMapMeta.Name, nil); err != nil { - klog.Warningf("Unable to step down cleanly: %v", err) - } + informersDone := postMainContext.Done() + // FIXME: would be nice if there was a way to collect these. 
+ controllerCtx.CVInformerFactory.Start(informersDone) + controllerCtx.OpenshiftConfigInformerFactory.Start(informersDone) + controllerCtx.InformerFactory.Start(informersDone) + + resultChannelCount++ + go func() { + leaderelection.RunOrDie(postMainContext, leaderelection.LeaderElectionConfig{ + Lock: lock, + ReleaseOnCancel: true, + LeaseDuration: leaseDuration, + RenewDeadline: renewDeadline, + RetryPeriod: retryPeriod, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: func(_ context.Context) { // no need for this passed-through postMainContext, because goroutines we launch inside will use runContext + resultChannelCount++ + go func() { + err := controllerCtx.CVO.Run(runContext, 2) + resultChannel <- asyncResult{name: "main operator", error: err} + }() + + if controllerCtx.AutoUpdate != nil { + resultChannelCount++ + go func() { + err := controllerCtx.AutoUpdate.Run(2, runContext.Done()) + resultChannel <- asyncResult{name: "auto-update controller", error: err} + }() } - klog.Infof("Finished shutdown") - exitClose.Do(func() { close(exit) }) - case <-localCtx.Done(): - // we will exit in OnStoppedLeading - } - }, - OnStoppedLeading: func() { - klog.Warning("leaderelection lost") - exitClose.Do(func() { close(exit) }) + }, + OnStoppedLeading: func() { + klog.Info("Stopped leading; shutting down.") + runCancel() + }, }, - }, - }) + }) + resultChannel <- asyncResult{name: "leader controller", error: nil} + }() - for errorChannelCount > 0 { - var shutdownTimer *time.Timer + var shutdownTimer *time.Timer + for resultChannelCount > 0 { + klog.Infof("Waiting on %d outstanding goroutines.", resultChannelCount) if shutdownTimer == nil { // running select { case <-runContext.Done(): + klog.Info("Run context completed; beginning two-minute graceful shutdown period.") shutdownTimer = time.NewTimer(2 * time.Minute) - case err := <-errorChannel: - errorChannelCount-- - if err != nil { - klog.Error(err) + case result := <-resultChannel: + resultChannelCount-- + if result.error == nil { + klog.Infof("Collected %s goroutine.", result.name) + } else { + klog.Errorf("Collected %s goroutine: %v", result.name, result.error) runCancel() // this will cause shutdownTimer initialization in the next loop } + if result.name == "main operator" { + postMainCancel() + } } } else { // shutting down select { case <-shutdownTimer.C: // never triggers after the channel is stopped, although it would not matter much if it did because subsequent cancel calls do nothing. shutdownCancel() shutdownTimer.Stop() - case err := <-errorChannel: - errorChannelCount-- - if err != nil { - klog.Error(err) - runCancel() + case result := <-resultChannel: + resultChannelCount-- + if result.error == nil { + klog.Infof("Collected %s goroutine.", result.name) + } else { + klog.Errorf("Collected %s goroutine: %v", result.name, result.error) + } + if result.name == "main operator" { + postMainCancel() } } } } - - <-exit + klog.Info("Finished collecting operator goroutines.") } // createResourceLock initializes the lock. @@ -391,16 +397,3 @@ func (o *Options) NewControllerContext(cb *ClientBuilder) *Context { } return ctx } - -// Start launches the controllers in the provided context and any supporting -// infrastructure. When ch is closed the controllers will be shut down. 
-func (c *Context) Start(ctx context.Context) { - ch := ctx.Done() - go c.CVO.Run(ctx, 2) - if c.AutoUpdate != nil { - go c.AutoUpdate.Run(2, ch) - } - c.CVInformerFactory.Start(ch) - c.OpenshiftConfigInformerFactory.Start(ch) - c.InformerFactory.Start(ch) -} diff --git a/pkg/start/start_integration_test.go b/pkg/start/start_integration_test.go index e7cc2cad85..830b0c5ec8 100644 --- a/pkg/start/start_integration_test.go +++ b/pkg/start/start_integration_test.go @@ -243,9 +243,14 @@ func TestIntegrationCVO_initializeAndUpgrade(t *testing.T) { worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Steps: 3}, "") controllers.CVO.SetSyncWorkerForTesting(worker) + lock, err := createResourceLock(cb, options.Namespace, options.Name) + if err != nil { + t.Fatal(err) + } + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - controllers.Start(ctx) + go options.run(ctx, controllers, lock) t.Logf("wait until we observe the cluster version become available") lastCV, err := waitForUpdateAvailable(t, client, ns, false, "0.0.1") @@ -395,9 +400,14 @@ func TestIntegrationCVO_initializeAndHandleError(t *testing.T) { worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Duration: time.Second, Factor: 1.2}, "") controllers.CVO.SetSyncWorkerForTesting(worker) + lock, err := createResourceLock(cb, options.Namespace, options.Name) + if err != nil { + t.Fatal(err) + } + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - controllers.Start(ctx) + go options.run(ctx, controllers, lock) t.Logf("wait until we observe the cluster version become available") lastCV, err := waitForUpdateAvailable(t, client, ns, false, "0.0.1") @@ -500,7 +510,7 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { worker := cvo.NewSyncWorker(&mapPayloadRetriever{}, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Steps: 3}, "") controllers.CVO.SetSyncWorkerForTesting(worker) - lock, err := createResourceLock(cb, ns, ns) + lock, err := createResourceLock(cb, options.Namespace, options.Name) if err != nil { t.Fatal(err) } @@ -672,9 +682,14 @@ metadata: worker := cvo.NewSyncWorker(retriever, cvo.NewResourceBuilder(cfg, cfg, nil), 5*time.Second, wait.Backoff{Steps: 3}, "") controllers.CVO.SetSyncWorkerForTesting(worker) + lock, err := createResourceLock(cb, options.Namespace, options.Name) + if err != nil { + t.Fatal(err) + } + ctx, cancel := context.WithCancel(context.Background()) defer cancel() - controllers.Start(ctx) + go options.run(ctx, controllers, lock) t.Logf("wait until we observe the cluster version become available") lastCV, err := waitForUpdateAvailable(t, client, ns, false, "0.0.1") From c8af6398088d8e6d7259664d066b7ffdc57d4676 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 5 Aug 2020 14:54:47 -0700 Subject: [PATCH 6/9] pkg/start/start_integration_test: Do not assume "deleted" for ConfigMap lock release From the godocs: $ grep -A5 '// HolderIdentity' vendor/k8s.io/client-go/tools/leaderelection/resourcelock/interface.go // HolderIdentity is the ID that owns the lease. If empty, no one owns this lease and // all callers may acquire. Versions of this library prior to Kubernetes 1.14 will not // attempt to acquire leases with empty identities and will wait for the full lease // interval to expire before attempting to reacquire. This value is set to empty when // a client voluntarily steps down. 
HolderIdentity string `json:"holderIdentity"` The previous assumption that the release would involve ConfigMap deletion was born with the test in 2b81f47626 (cvo: Release our leader lease when we are gracefully terminated, 2019-01-16, #87). Cherry picked from dd09c3fbe0 (#424), around conflicts due to the lack of Context pivots in 4.5. --- pkg/start/start_integration_test.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/start/start_integration_test.go b/pkg/start/start_integration_test.go index 830b0c5ec8..de8e1d7ca3 100644 --- a/pkg/start/start_integration_test.go +++ b/pkg/start/start_integration_test.go @@ -526,7 +526,7 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { // wait until the lock record exists err = wait.PollImmediate(200*time.Millisecond, 60*time.Second, func() (bool, error) { - _, err := kc.CoreV1().ConfigMaps(ns).Get(ns, metav1.GetOptions{}) + _, _, err := lock.Get() if err != nil { if errors.IsNotFound(err) { return false, nil @@ -548,26 +548,26 @@ func TestIntegrationCVO_gracefulStepDown(t *testing.T) { t.Fatalf("no leader election events found in\n%#v", events.Items) } - t.Logf("after the context is closed, the lock record should be deleted quickly") + t.Logf("after the context is closed, the lock should be released quickly") cancel() startTime := time.Now() var endTime time.Time // the lock should be deleted immediately err = wait.PollImmediate(100*time.Millisecond, 10*time.Second, func() (bool, error) { - _, err := kc.CoreV1().ConfigMaps(ns).Get(ns, metav1.GetOptions{}) - if errors.IsNotFound(err) { - endTime = time.Now() - return true, nil - } + electionRecord, _, err := lock.Get() if err != nil { + if errors.IsNotFound(err) { + return false, nil + } return false, err } - return false, nil + endTime = time.Now() + return electionRecord.HolderIdentity == "", nil }) if err != nil { t.Fatal(err) } - t.Logf("lock deleted in %s", endTime.Sub(startTime)) + t.Logf("lock released in %s", endTime.Sub(startTime)) select { case <-time.After(time.Second): From c8f99b203e57904dc09fd24c9ef5b9c4b6899285 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 6 Aug 2020 11:51:36 -0700 Subject: [PATCH 7/9] pkg/start: Fill in deferred HandleCrash Clayton wants these in each goroutine we launch [1]. Obviously there's no way to reach inside the informer Start()s and add it there. I'm also adding this to the FIXME comment for rerolling the auto-update worker goroutines; we'll get those straigtened out in future work. Cherry picked from 9c42a921c1 (#424), around conflicts due to the lack of Context pivots in 4.5. [1]: https://github.com/openshift/cluster-version-operator/pull/424/#discussion_r466585901 --- pkg/autoupdate/autoupdate.go | 2 +- pkg/cvo/cvo.go | 1 - pkg/start/start.go | 6 ++++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pkg/autoupdate/autoupdate.go b/pkg/autoupdate/autoupdate.go index 19c900210b..6e74634b35 100644 --- a/pkg/autoupdate/autoupdate.go +++ b/pkg/autoupdate/autoupdate.go @@ -99,7 +99,7 @@ func (ctrl *Controller) Run(workers int, stopCh <-chan struct{}) error { } for i := 0; i < workers; i++ { - // FIXME: actually wait until these complete if the Context is canceled. + // FIXME: actually wait until these complete if the Context is canceled. And possibly add utilruntime.HandleCrash. 
go wait.Until(ctrl.worker, time.Second, stopCh) } diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 6733601d75..4bafcf6da7 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -316,7 +316,6 @@ func loadConfigMapVerifierDataFromUpdate(update *payload.Update, clientBuilder v // Run runs the cluster version operator until stopCh is completed. Workers is ignored for now. func (optr *Operator) Run(ctx context.Context, workers int) error { - defer utilruntime.HandleCrash() defer optr.queue.ShutDown() stopCh := ctx.Done() workerStopCh := make(chan struct{}) diff --git a/pkg/start/start.go b/pkg/start/start.go index 8d29374396..97befa3b32 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -14,6 +14,7 @@ import ( "github.com/google/uuid" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" @@ -147,6 +148,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc defer func() { signal.Stop(ch) }() signal.Notify(ch, os.Interrupt, syscall.SIGTERM) go func() { + defer utilruntime.HandleCrash() sig := <-ch klog.Infof("Shutting down due to %s", sig) runCancel() @@ -159,6 +161,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc if o.ListenAddr != "" { resultChannelCount++ go func() { + defer utilruntime.HandleCrash() err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr) resultChannel <- asyncResult{name: "metrics server", error: err} }() @@ -172,6 +175,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc resultChannelCount++ go func() { + defer utilruntime.HandleCrash() leaderelection.RunOrDie(postMainContext, leaderelection.LeaderElectionConfig{ Lock: lock, ReleaseOnCancel: true, @@ -182,6 +186,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc OnStartedLeading: func(_ context.Context) { // no need for this passed-through postMainContext, because goroutines we launch inside will use runContext resultChannelCount++ go func() { + defer utilruntime.HandleCrash() err := controllerCtx.CVO.Run(runContext, 2) resultChannel <- asyncResult{name: "main operator", error: err} }() @@ -189,6 +194,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc if controllerCtx.AutoUpdate != nil { resultChannelCount++ go func() { + defer utilruntime.HandleCrash() err := controllerCtx.AutoUpdate.Run(2, runContext.Done()) resultChannel <- asyncResult{name: "auto-update controller", error: err} }() From a42bfb7405533a824d60a1357cc919f0242d8a1e Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 25 Aug 2020 12:42:59 -0700 Subject: [PATCH 8/9] cmd/start: Include the version in the outgoing log line Lala wanted the version included in the outgoing log line [1]. I'm not sure why you'd be wondering which version of the CVO code was running for that particular line, and not for other lines in the log, but including the version there is easy enough. While we're thinking about logging the CVO version, also remove the useless %s formatting from the opening log line, because we don't need to manipulate version.String at all. 
[1]: https://github.com/openshift/cluster-version-operator/pull/424#discussion_r476658319 --- cmd/start.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/start.go b/cmd/start.go index 8f4cb7a51f..f38462c1e7 100644 --- a/cmd/start.go +++ b/cmd/start.go @@ -18,12 +18,12 @@ func init() { Long: "", Run: func(cmd *cobra.Command, args []string) { // To help debugging, immediately log version - klog.Infof("%s", version.String) + klog.Info(version.String) if err := opts.Run(context.Background()); err != nil { klog.Fatalf("error: %v", err) } - klog.Info("Graceful shutdown complete.") + klog.Infof("Graceful shutdown complete for %s.", version.String) }, } From 65bcffde2593875b3642ff0d6d966ef7d367b370 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 26 Aug 2020 21:26:24 -0700 Subject: [PATCH 9/9] *: Wash through 'go fmt' Now that we have CI that cares about this (hooray!). Generated with: $ go fmt ./... using: $ go version go version go1.14.4 linux/amd64 --- lib/resourceread/apiext_test.go | 6 +++--- pkg/autoupdate/autoupdate.go | 2 +- pkg/cvo/cvo_test.go | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/resourceread/apiext_test.go b/lib/resourceread/apiext_test.go index 912faf89a6..17ee98ea3f 100644 --- a/lib/resourceread/apiext_test.go +++ b/lib/resourceread/apiext_test.go @@ -13,7 +13,7 @@ func TestReadCustomResourceDefinitionOrDie(t *testing.T) { args args }{ { - name:"v1", + name: "v1", args: args{ objBytes: []byte(` apiVersion: apiextensions.k8s.io/v1 @@ -42,7 +42,7 @@ spec: }, }, { - name:"v1beta1", + name: "v1beta1", args: args{ objBytes: []byte(` apiVersion: apiextensions.k8s.io/v1beta1 @@ -82,4 +82,4 @@ spec: _ = ReadCustomResourceDefinitionOrDie(tt.args.objBytes) }) } -} \ No newline at end of file +} diff --git a/pkg/autoupdate/autoupdate.go b/pkg/autoupdate/autoupdate.go index 6e74634b35..fe836c50ff 100644 --- a/pkg/autoupdate/autoupdate.go +++ b/pkg/autoupdate/autoupdate.go @@ -7,7 +7,6 @@ import ( "github.com/blang/semver" - "k8s.io/klog" v1 "github.com/openshift/api/config/v1" clientset "github.com/openshift/client-go/config/clientset/versioned" "github.com/openshift/client-go/config/clientset/versioned/scheme" @@ -23,6 +22,7 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/workqueue" + "k8s.io/klog" ) const ( diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go index 97ceeb852d..7fdb3964e2 100644 --- a/pkg/cvo/cvo_test.go +++ b/pkg/cvo/cvo_test.go @@ -27,8 +27,8 @@ import ( "k8s.io/apimachinery/pkg/util/diff" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/discovery" - "k8s.io/client-go/rest" kfake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/rest" ktesting "k8s.io/client-go/testing" "k8s.io/client-go/util/workqueue" "k8s.io/klog"
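The import churn in this wash is what go fmt does mechanically: within each contiguous import block it sorts specs by import path (the local alias, e.g. kfake, is not the sort key). That is why kfake "k8s.io/client-go/kubernetes/fake" now sorts ahead of "k8s.io/client-go/rest", and why "k8s.io/klog" lands at the end of its block. A small illustrative block, post-gofmt:

    import (
        "k8s.io/client-go/discovery"
        kfake "k8s.io/client-go/kubernetes/fake" // sorted by the path, not the kfake alias
        "k8s.io/client-go/rest"
        "k8s.io/klog"
    )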