Support --options on upgrade tests to abort in progress

smarterclayton · smarterclayton · commit a53efd5e2788 · 2019-05-02T20:43:15.000-04:00
To better stress test upgrades, add disruption elements for aborting
an upgrade part of the way through as well as rebooting random masters.

--options=abort-at=PERCENT causes the upgrade to stop and roll back
to the previous version once PERCENT of operators have been upgraded.
A value of 100 aborts after the upgrade is complete, while 'random'
aborts at a randomly chosen percent.

--options=disrupt-reboot=POLICY causes random periodic reboots of
masters during upgrades. If set to 'graceful' the reboot allows a clean
shutdown. If set to 'force' the machines exit immediately (to simulate
power loss).
diff --git a/cmd/openshift-tests/openshift-tests.go b/cmd/openshift-tests/openshift-tests.go
@@ -116,7 +116,6 @@ func newRunCommand() *cobra.Command {
 				if err := initProvider(opt.Provider); err != nil {
 					return err
 				}
-				os.Setenv("TEST_PROVIDER", opt.Provider)
 				e2e.AfterReadingAllFlags(exutil.TestContext)
 				return opt.Run(args)
 			})
@@ -142,9 +141,18 @@ func newRunUpgradeCommand() *cobra.Command {
 		If you specify the --dry-run argument, the actions the suite will take will be printed to the
 		output.
 
+		Supported options:
+
+		* abort-at=NUMBER - Set to a number between 0 and 100 to control the percent of operators
+		at which to stop the current upgrade and roll back to the current version.
+		* disrupt-reboot=POLICY - During upgrades, periodically reboot master nodes. If set to 'graceful'
+		the reboot will allow the node to shut down services in an orderly fashion. If set to 'force' the
+		machine will terminate immediately without clean shutdown.
+
 		`) + testginkgo.SuitesString(opt.Suites, "\n\nAvailable upgrade suites:\n\n"),
 
-		SilenceUsage: true,
+		SilenceUsage:  true,
+		SilenceErrors: true,
 		RunE: func(cmd *cobra.Command, args []string) error {
 			return mirrorToFile(opt, func() error {
 				if len(upgradeOpt.ToImage) == 0 {
@@ -156,7 +164,11 @@ func newRunUpgradeCommand() *cobra.Command {
 						if suite.Name == args[0] {
 							upgradeOpt.Suite = suite.Name
 							upgradeOpt.JUnitDir = opt.JUnitDir
-							os.Setenv("TEST_UPGRADE", upgradeOpt.ToEnv())
+							value := upgradeOpt.ToEnv()
+							if err := initUpgrade(value); err != nil {
+								return err
+							}
+							opt.SuiteOptions = value
 							break
 						}
 					}
@@ -165,7 +177,6 @@ func newRunUpgradeCommand() *cobra.Command {
 				if err := initProvider(opt.Provider); err != nil {
 					return err
 				}
-				os.Setenv("TEST_PROVIDER", opt.Provider)
 				e2e.AfterReadingAllFlags(exutil.TestContext)
 				return opt.Run(args)
 			})
@@ -198,7 +209,7 @@ func newRunTestCommand() *cobra.Command {
 			if err := initProvider(os.Getenv("TEST_PROVIDER")); err != nil {
 				return err
 			}
-			if err := initUpgrade(os.Getenv("TEST_UPGRADE")); err != nil {
+			if err := initUpgrade(os.Getenv("TEST_SUITE_OPTIONS")); err != nil {
 				return err
 			}
 			e2e.AfterReadingAllFlags(exutil.TestContext)
@@ -236,6 +247,7 @@ func mirrorToFile(opt *testginkgo.Options, fn func() error) error {
 
 func bindOptions(opt *testginkgo.Options, flags *pflag.FlagSet) {
 	flags.BoolVar(&opt.DryRun, "dry-run", opt.DryRun, "Print the tests to run without executing them.")
+	flags.BoolVar(&opt.PrintCommands, "print-commands", opt.PrintCommands, "Print the sub-commands that would be executed instead.")
 	flags.StringVar(&opt.JUnitDir, "junit-dir", opt.JUnitDir, "The directory to write test reports to.")
 	flags.StringVar(&opt.Provider, "provider", opt.Provider, "The cluster infrastructure provider. Will automatically default to the correct value.")
 	flags.StringVarP(&opt.TestFile, "file", "f", opt.TestFile, "Create a suite from the newline-delimited test names in this file.")
diff --git a/cmd/openshift-tests/upgrade.go b/cmd/openshift-tests/upgrade.go
@@ -25,7 +25,23 @@ var upgradeSuites = []*ginkgo.TestSuite{
 		`),
 		Matches: func(name string) bool { return strings.Contains(name, "[Feature:ClusterUpgrade]") },
 
-		Init:        func() error { return filterUpgrade(upgrade.AllTests(), func(name string) bool { return true }) },
+		Init: func(opt map[string]string) error {
+			for k, v := range opt {
+				switch k {
+				case "abort-at":
+					if err := upgrade.SetUpgradeAbortAt(v); err != nil {
+						return err
+					}
+				case "disrupt-reboot":
+					if err := upgrade.SetUpgradeDisruptReboot(v); err != nil {
+						return err
+					}
+				default:
+					return fmt.Errorf("unrecognized upgrade option: %s", k)
+				}
+			}
+			return filterUpgrade(upgrade.AllTests(), func(name string) bool { return true })
+		},
 		TestTimeout: 120 * time.Minute,
 	},
 }
@@ -34,6 +50,27 @@ type UpgradeOptions struct {
 	Suite    string
 	ToImage  string
 	JUnitDir string
+
+	TestOptions []string
+}
+
+func (o *UpgradeOptions) OptionsMap() (map[string]string, error) {
+	options := make(map[string]string)
+	for _, option := range o.TestOptions {
+		parts := strings.SplitN(option, "=", 2)
+		if len(parts) != 2 {
+			return nil, fmt.Errorf("test option %q is not valid, must be KEY=VALUE", option)
+		}
+		if len(parts[0]) == 0 {
+			return nil, fmt.Errorf("test option %q is not valid, must be KEY=VALUE", option)
+		}
+		_, exists := options[parts[0]]
+		if exists {
+			return nil, fmt.Errorf("option %q declared twice", parts[0])
+		}
+		options[parts[0]] = parts[1]
+	}
+	return options, nil
 }
 
 func (o *UpgradeOptions) ToEnv() string {
@@ -57,8 +94,12 @@ func initUpgrade(value string) error {
 			exutil.TestContext.UpgradeTarget = ""
 			exutil.TestContext.UpgradeImage = opt.ToImage
 			exutil.TestContext.ReportDir = opt.JUnitDir
+			o, err := opt.OptionsMap()
+			if err != nil {
+				return err
+			}
 			if suite.Init != nil {
-				return suite.Init()
+				return suite.Init(o)
 			}
 			return nil
 		}
@@ -79,4 +120,5 @@ func filterUpgrade(tests []upgrades.Test, match func(string) bool) error {
 
 func bindUpgradeOptions(opt *UpgradeOptions, flags *pflag.FlagSet) {
 	flags.StringVar(&opt.ToImage, "to-image", opt.ToImage, "Specify the image to test an upgrade to.")
+	flags.StringSliceVar(&opt.TestOptions, "options", opt.TestOptions, "A set of KEY=VALUE options to control the test. See the help text.")
 }
diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go
@@ -30,12 +30,21 @@ type Options struct {
 
 	IncludeSuccessOutput bool
 
-	Provider string
+	Provider     string
+	SuiteOptions string
 
 	Suites []*TestSuite
 
-	DryRun      bool
-	Out, ErrOut io.Writer
+	DryRun        bool
+	PrintCommands bool
+	Out, ErrOut   io.Writer
+}
+
+func (opt *Options) AsEnv() []string {
+	var args []string
+	args = append(args, fmt.Sprintf("TEST_PROVIDER=%s", opt.Provider))
+	args = append(args, fmt.Sprintf("TEST_SUITE_OPTIONS=%s", opt.SuiteOptions))
+	return args
 }
 
 func (opt *Options) Run(args []string) error {
@@ -104,6 +113,11 @@ func (opt *Options) Run(args []string) error {
 		return fmt.Errorf("suite %q does not contain any tests", suite.Name)
 	}
 
+	if opt.PrintCommands {
+		status := newTestStatus(opt.Out, true, len(tests), time.Minute, &monitor.Monitor{}, opt.AsEnv())
+		newParallelTestQueue(tests).Execute(context.Background(), 1, status.OutputCommand)
+		return nil
+	}
 	if opt.DryRun {
 		for _, test := range sortedTests(tests) {
 			fmt.Fprintf(opt.Out, "%q\n", test.name)
@@ -164,7 +178,7 @@ func (opt *Options) Run(args []string) error {
 	if len(tests) == 1 {
 		includeSuccess = true
 	}
-	status := newTestStatus(opt.Out, includeSuccess, len(tests), timeout, m)
+	status := newTestStatus(opt.Out, includeSuccess, len(tests), timeout, m, opt.AsEnv())
 
 	smoke, normal := splitTests(tests, func(t *testCase) bool {
 		return strings.Contains(t.name, "[Smoke]")
@@ -255,7 +269,7 @@ func (opt *Options) Run(args []string) error {
 		}
 
 		q := newParallelTestQueue(retries)
-		status := newTestStatus(ioutil.Discard, opt.IncludeSuccessOutput, len(retries), timeout, m)
+		status := newTestStatus(ioutil.Discard, opt.IncludeSuccessOutput, len(retries), timeout, m, opt.AsEnv())
 		q.Execute(ctx, parallelism, status.Run)
 		var flaky []string
 		var repeatFailures []*testCase
diff --git a/pkg/test/ginkgo/status.go b/pkg/test/ginkgo/status.go
@@ -1,12 +1,14 @@
 package ginkgo
 
 import (
+	"bytes"
 	"context"
 	"fmt"
 	"io"
 	"os"
 	"os/exec"
 	"sort"
+	"strings"
 	"sync"
 	"syscall"
 	"time"
@@ -18,6 +20,7 @@ type testStatus struct {
 	out     io.Writer
 	timeout time.Duration
 	monitor monitor.Interface
+	env     []string
 
 	includeSuccessfulOutput bool
 
@@ -27,12 +30,13 @@ type testStatus struct {
 	total    int
 }
 
-func newTestStatus(out io.Writer, includeSuccessfulOutput bool, total int, timeout time.Duration, m monitor.Interface) *testStatus {
+func newTestStatus(out io.Writer, includeSuccessfulOutput bool, total int, timeout time.Duration, m monitor.Interface, testEnv []string) *testStatus {
 	return &testStatus{
 		out:     out,
 		total:   total,
 		timeout: timeout,
 		monitor: m,
+		env:     testEnv,
 
 		includeSuccessfulOutput: includeSuccessfulOutput,
 	}
@@ -53,6 +57,17 @@ func (s *testStatus) Fprintf(format string) {
 	fmt.Fprintf(s.out, format, s.failures, s.index, s.total)
 }
 
+// OutputCommand prints to stdout what would have been executed.
+func (s *testStatus) OutputCommand(ctx context.Context, test *testCase) {
+	buf := &bytes.Buffer{}
+	for _, env := range s.env {
+		parts := strings.SplitN(env, "=", 2)
+		fmt.Fprintf(buf, "%s=%q ", parts[0], parts[1])
+	}
+	fmt.Fprintf(buf, "%s %s %q", os.Args[0], "run-test", test.name)
+	fmt.Fprintln(s.out, buf.String())
+}
+
 func (s *testStatus) Run(ctx context.Context, test *testCase) {
 	defer func() {
 		switch {
@@ -94,6 +109,7 @@ func (s *testStatus) Run(ctx context.Context, test *testCase) {
 
 	test.start = time.Now()
 	c := exec.Command(os.Args[0], "run-test", test.name)
+	c.Env = append(os.Environ(), s.env...)
 	s.Fprintf(fmt.Sprintf("started: (%s) %q\n\n", "%d/%d/%d", test.name))
 	out, err := runWithTimeout(ctx, c, s.timeout)
 	test.end = time.Now()
diff --git a/pkg/test/ginkgo/test.go b/pkg/test/ginkgo/test.go
@@ -63,7 +63,7 @@ type TestSuite struct {
 
 	// Init should be run once before a test in this suite is run. Not called by
 	// methods in this package.
-	Init func() error
+	Init func(map[string]string) error
 
 	Parallelism int
 	// The number of flakes that may occur before this test is marked as a failure.
diff --git a/test/e2e/upgrade/monitor.go b/test/e2e/upgrade/monitor.go
diff --git a/test/e2e/upgrade/upgrade.go b/test/e2e/upgrade/upgrade.go