diff --git a/test/extended/dr/backup_restore.go b/test/extended/dr/backup_restore.go
new file mode 100644
index 000000000000..f5f318f4a8f0
--- /dev/null
+++ b/test/extended/dr/backup_restore.go
@@ -0,0 +1,289 @@
+package dr
+
+import (
+ "context"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "strings"
+ "time"
+
+ g "github.com/onsi/ginkgo"
+ o "github.com/onsi/gomega"
+
+ corev1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/util/wait"
+ "k8s.io/client-go/kubernetes"
+ "k8s.io/kubernetes/test/e2e/framework"
+
+ exutil "github.com/openshift/origin/test/extended/util"
+ "github.com/openshift/origin/test/extended/util/disruption"
+)
+
+var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() {
+ defer g.GinkgoRecover()
+
+ f := framework.NewDefaultFramework("backup-restore")
+ f.SkipNamespaceCreation = true
+ f.SkipPrivilegedPSPBinding = true
+
+ oc := exutil.NewCLIWithoutNamespace("backup-restore")
+
+ // Validate the documented backup and restore procedure as closely as possible:
+ //
+ // backup: https://docs.openshift.com/container-platform/4.6/backup_and_restore/backing-up-etcd.html
+ // restore: https://docs.openshift.com/container-platform/4.6/backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.html
+ //
+ // Comments like 'Backup 2' and 'Restore 1a' indicate where a test step
+ // corresponds to a step in the documentation.
+ //
+ // Backing up and recovering on the same node is tested by quorum_restore.go.
+ g.It("[Feature:EtcdRecovery] Cluster should recover from a backup taken on one node and recovered on another", func() {
+ masters := masterNodes(oc)
+ // Need one node to backup from and another to restore to
+ o.Expect(len(masters)).To(o.BeNumerically(">=", 2))
+
+ // Pick one node to backup on
+ backupNode := masters[0]
+ framework.Logf("Selecting node %q as the backup host", backupNode.Name)
+
+ // Recovery 1
+ // Pick a different node to recover on
+ recoveryNode := masters[1]
+ framework.Logf("Selecting node %q as the recovery host", recoveryNode.Name)
+
+ // Recovery 2
+ g.By("Verifying that all masters are reachable via ssh")
+ for _, master := range masters {
+ checkSSH(master)
+ }
+
+ disruptionFunc := func() {
+ // Backup 4
+ //
+ // The backup has to be taken after the upgrade tests have done
+ // their pre-disruption setup to ensure that the api state that is
+ // restored includes those changes.
+ g.By(fmt.Sprintf("Running the backup script on node %q", backupNode.Name))
+ sudoExecOnNodeOrFail(backupNode, "rm -rf /home/core/backup && /usr/local/bin/cluster-backup.sh /home/core/backup && chown -R core /home/core/backup")
+
+ // Recovery 3
+ // Copy the backup data from the backup node to the test host and
+ // from there to the recovery node.
+ //
+ // Another solution could be enabling the recovery node to connect
+ // directly to the backup node. It seemed simpler to use the test host
+ // as an intermediary rather than enabling agent forwarding or copying
+ // the private ssh key to the recovery node.
+
+ g.By("Creating a local temporary directory")
+ tempDir, err := ioutil.TempDir("", "e2e-backup-restore")
+ o.Expect(err).NotTo(o.HaveOccurred())
+
+ // Define the ssh configuration necessary to invoke scp, which does
+ // not appear to be supported by the golang ssh client.
+ commonOpts := "-o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=30"
+ authOpt := fmt.Sprintf("-i %s", os.Getenv("KUBE_SSH_KEY_PATH"))
+ bastionHost := os.Getenv("KUBE_SSH_BASTION")
+ proxyOpt := ""
+ if len(bastionHost) > 0 {
+ framework.Logf("Bastion host %s will be used to proxy scp to cluster nodes", bastionHost)
+ // The bastion host is expected to be of the form address:port
+ hostParts := strings.Split(bastionHost, ":")
+ o.Expect(len(hostParts)).To(o.Equal(2))
+ address := hostParts[0]
+ port := hostParts[1]
+ // A proxy command is required for a bastion host
+ proxyOpt = fmt.Sprintf("-o ProxyCommand='ssh -A -W %%h:%%p %s %s -p %s core@%s'", commonOpts, authOpt, port, address)
+ }
+
+ g.By(fmt.Sprintf("Copying the backup directory from backup node %q to the test host", backupNode.Name))
+ backupNodeAddress := addressForNode(backupNode)
+ o.Expect(backupNodeAddress).NotTo(o.BeEmpty())
+ copyFromBackupNodeCmd := fmt.Sprintf(`scp -v %s %s %s -r core@%s:backup %s`, commonOpts, authOpt, proxyOpt, backupNodeAddress, tempDir)
+ runCommandAndRetry(copyFromBackupNodeCmd)
+
+ g.By(fmt.Sprintf("Cleaning the backup path on recovery node %q", recoveryNode.Name))
+ sudoExecOnNodeOrFail(recoveryNode, "rm -rf /home/core/backup")
+
+ g.By(fmt.Sprintf("Copying the backup directory from the test host to recovery node %q", recoveryNode.Name))
+ recoveryNodeAddress := addressForNode(recoveryNode)
+ o.Expect(recoveryNodeAddress).NotTo(o.BeEmpty())
+ copyToRecoveryNodeCmd := fmt.Sprintf(`scp %s %s %s -r %s/backup core@%s:`, commonOpts, authOpt, proxyOpt, tempDir, recoveryNodeAddress)
+ runCommandAndRetry(copyToRecoveryNodeCmd)
+
+ // Stop etcd static pods on non-recovery masters.
+ for _, master := range masters {
+ // The restore script will stop static pods on the recovery node
+ if master.Name == recoveryNode.Name {
+ continue
+ }
+ // Recovery 4b
+ g.By(fmt.Sprintf("Stopping etcd static pod on node %q", master.Name))
+ manifest := "/etc/kubernetes/manifests/etcd-pod.yaml"
+ // Move only if present to ensure idempotent behavior during debugging.
+ sudoExecOnNodeOrFail(master, fmt.Sprintf("test -f %s && mv -f %s /tmp || true", manifest, manifest))
+
+ // Recovery 4c
+ g.By(fmt.Sprintf("Waiting for etcd to exit on node %q", master.Name))
+ // Look for 'etcd ' (with trailing space) to be missing to
+ // differentiate from pods like etcd-operator.
+ sudoExecOnNodeOrFail(master, "crictl ps | grep 'etcd ' | wc -l | grep -q 0")
+
+ // Recovery 4f
+ g.By(fmt.Sprintf("Moving etcd data directory on node %q", master.Name))
+ // Move only if present to ensure idempotent behavior during debugging.
+ sudoExecOnNodeOrFail(master, "test -d /var/lib/etcd && (rm -rf /tmp/etcd && mv /var/lib/etcd/ /tmp) || true")
+ }
+
+ // Recovery 4d
+ // Trigger stop of kube-apiserver static pods on non-recovery
+ // masters, without waiting, to minimize the test time required for
+ // graceful termination to complete.
+ for _, master := range masters {
+ // The restore script will stop static pods on the recovery node
+ if master.Name == recoveryNode.Name {
+ continue
+ }
+ g.By(fmt.Sprintf("Stopping kube-apiserver static pod on node %q", master.Name))
+ manifest := "/etc/kubernetes/manifests/kube-apiserver-pod.yaml"
+ // Move only if present to ensure idempotent behavior during debugging.
+ sudoExecOnNodeOrFail(master, fmt.Sprintf("test -f %s && mv -f %s /tmp || true", manifest, manifest))
+ }
+
+ // Recovery 4e
+ // Wait for kube-apiserver pods to exit
+ for _, master := range masters {
+ // The restore script will stop static pods on the recovery node
+ if master.Name == recoveryNode.Name {
+ continue
+ }
+ g.By(fmt.Sprintf("Waiting for kube-apiserver to exit on node %q", master.Name))
+ // Look for 'kube-apiserver ' (with trailing space) to be missing
+ // to differentiate from pods like kube-apiserver-operator.
+ sudoExecOnNodeOrFail(master, "crictl ps | grep 'kube-apiserver ' | wc -l | grep -q 0")
+ }
+
+ // Recovery 7
+ restoreFromBackup(recoveryNode)
+
+ // Recovery 8
+ for _, master := range masters {
+ restartKubelet(master)
+ }
+
+ // Recovery 9a, 9b
+ waitForAPIServer(oc.AdminKubeClient(), recoveryNode)
+
+ // Recovery 10,11,12
+ forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1())
+
+ // Recovery 13
+ waitForReadyEtcdPods(oc.AdminKubeClient(), len(masters))
+
+ waitForOperatorsToSettle()
+ }
+
+ disruption.Run(f, "Backup from one node and recover on another", "restore_different_node",
+ disruption.TestData{},
+ disruptionTests,
+ disruptionFunc,
+ )
+ })
+})
+
+// addressForNode looks for an ssh-accessible ip address for a node in case the
+// node name doesn't resolve in the test environment. An empty string will be
+// returned if an address could not be determined.
+func addressForNode(node *corev1.Node) string {
+ for _, a := range node.Status.Addresses {
+ if a.Type == corev1.NodeExternalIP && a.Address != "" {
+ return a.Address
+ }
+ }
+ // No external IPs were found, let's try to use internal as plan B
+ for _, a := range node.Status.Addresses {
+ if a.Type == corev1.NodeInternalIP && a.Address != "" {
+ return a.Address
+ }
+ }
+ return ""
+}
+
+// What follows are helper functions corresponding to steps in the recovery
+// procedure. They are defined in a granular fashion to allow reuse by the
+// quorum restore test. The quorum restore test needs to interleave the
+// standard commands with commands related to master recreation.
+
+// Recovery 7
+func restoreFromBackup(node *corev1.Node) {
+ g.By(fmt.Sprintf("Running restore script on recovery node %q", node.Name))
+ sudoExecOnNodeOrFail(node, "/usr/local/bin/cluster-restore.sh /home/core/backup")
+}
+
+// Recovery 8
+func restartKubelet(node *corev1.Node) {
+ g.By(fmt.Sprintf("Restarting the kubelet service on node %q", node.Name))
+ sudoExecOnNodeOrFail(node, "systemctl restart kubelet.service")
+}
+
+// Recovery 9a
+func waitForEtcdContainer(node *corev1.Node) {
+ g.By(fmt.Sprintf("Verifying that the etcd container is running on recovery node %q", node.Name))
+ // Look for 'etcd ' (with trailing space) to differentiate from pods
+ // like etcd-operator.
+ sudoExecOnNodeOrFail(node, "crictl ps | grep -q 'etcd '")
+}
+
+// Recovery 9b
+func waitForEtcdPod(node *corev1.Node) {
+ // The success of this check also ensures that the kube apiserver on
+ // the recovery node is accepting connections.
+ g.By(fmt.Sprintf("Verifying that the etcd pod is running on recovery node %q", node.Name))
+ // Look for a single running etcd pod
+ runningEtcdPodCmd := "oc get pods -n openshift-etcd -l k8s-app=etcd --no-headers=true | grep Running | wc -l | grep -q 1"
+ // The kubeconfig on the node is only readable by root and usage requires sudo.
+ nodeKubeConfig := "/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig"
+ sudoExecOnNodeOrFail(node, fmt.Sprintf("KUBECONFIG=%s %s", nodeKubeConfig, runningEtcdPodCmd))
+}
+
+func waitForAPIServerAvailability(client kubernetes.Interface) {
+ g.By("Waiting for API server to become available")
+ err := wait.PollImmediate(10*time.Second, 30*time.Minute, func() (done bool, err error) {
+ _, err = client.CoreV1().Namespaces().Get(context.Background(), "default", metav1.GetOptions{})
+ if err != nil {
+ framework.Logf("Observed an error waiting for apiserver availability outside the cluster: %v", err)
+ }
+ return err == nil, nil
+ })
+ o.Expect(err).NotTo(o.HaveOccurred())
+}
+
+// waitForAPIServer waits for the etcd container and pod running on the
+// recovery node and then waits for the apiserver to be accessible outside
+// the cluster.
+func waitForAPIServer(client kubernetes.Interface, node *corev1.Node) {
+ // Recovery 9a
+ waitForEtcdContainer(node)
+
+ // Recovery 9b
+ waitForEtcdPod(node)
+
+ // Even with the apiserver available on the recovery node, it may
+ // take additional time for the api to become available externally
+ // to the cluster.
+ waitForAPIServerAvailability(client)
+}
+
+// Recovery 13
+func waitForReadyEtcdPods(client kubernetes.Interface, masterCount int) {
+ g.By("Waiting for all etcd pods to become ready")
+ waitForPodsTolerateClientTimeout(
+ client.CoreV1().Pods("openshift-etcd"),
+ exutil.ParseLabelsOrDie("k8s-app=etcd"),
+ exutil.CheckPodIsReady,
+ masterCount,
+ 40*time.Minute,
+ )
+}
diff --git a/test/extended/dr/common.go b/test/extended/dr/common.go
index 7d68379aa381..69e1b1745252 100644
--- a/test/extended/dr/common.go
+++ b/test/extended/dr/common.go
@@ -11,8 +11,10 @@ import (
 corev1 "k8s.io/api/core/v1"
 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime/schema"
 "k8s.io/apimachinery/pkg/util/wait"
 "k8s.io/client-go/dynamic"
+ "k8s.io/kubernetes/test/e2e/framework"
 e2e "k8s.io/kubernetes/test/e2e/framework"
 e2elog "k8s.io/kubernetes/test/e2e/framework/log"
 e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
@@ -20,12 +22,14 @@ import (
 "github.com/openshift/origin/test/e2e/upgrade"
 exutil "github.com/openshift/origin/test/extended/util"
 
+ g "github.com/onsi/ginkgo"
 o "github.com/onsi/gomega"
 "github.com/stretchr/objx"
 )
 
 const (
- operatorWait = 15 * time.Minute
+ operatorWait = 15 * time.Minute
+ defaultSSHTimeout = 5 * time.Minute
 )
 
 func runCommandAndRetry(command string) string {
@@ -87,7 +91,17 @@ func waitForMastersToUpdate(oc *exutil.CLI, mcps dynamic.NamespaceableResourceIn
 o.Expect(err).NotTo(o.HaveOccurred())
 }
 
-func waitForOperatorsToSettle(coc dynamic.NamespaceableResourceInterface) {
+func waitForOperatorsToSettle() {
+ g.By("Waiting for operators to settle before performing post-disruption testing")
+ config, err := framework.LoadConfig()
+ o.Expect(err).NotTo(o.HaveOccurred())
+ dynamicClient := dynamic.NewForConfigOrDie(config)
+ coc := dynamicClient.Resource(schema.GroupVersionResource{
+ Group: "config.openshift.io",
+ Version: "v1",
+ Resource: "clusteroperators",
+ })
+
 var lastErr error
 // gate on all clusteroperators being ready
 available := make(map[string]struct{})
@@ -196,50 +210,6 @@ func restartSDNPods(oc *exutil.CLI) {
 o.Expect(err).NotTo(o.HaveOccurred())
 }
 
-func restartOpenshiftAPIPods(oc *exutil.CLI) {
- e2elog.Logf("Restarting Openshift API server")
-
- pods, err :=
oc.AdminKubeClient().CoreV1().Pods("openshift-apiserver").List(context.Background(), metav1.ListOptions{}) - o.Expect(err).NotTo(o.HaveOccurred()) - - for _, pod := range pods.Items { - e2elog.Logf("Deleting pod %s", pod.Name) - err := oc.AdminKubeClient().CoreV1().Pods("openshift-apiserver").Delete(context.Background(), pod.Name, metav1.DeleteOptions{}) - o.Expect(err).NotTo(o.HaveOccurred()) - } - - err = wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { - apiServerDS, err := oc.AdminKubeClient().AppsV1().DaemonSets("openshift-apiserver").Get(context.Background(), "apiserver", metav1.GetOptions{}) - if err != nil { - return false, nil - } - return apiServerDS.Status.NumberReady == apiServerDS.Status.NumberAvailable, nil - }) - o.Expect(err).NotTo(o.HaveOccurred()) -} - -func restartMCDPods(oc *exutil.CLI) { - e2elog.Logf("Restarting MCD pods") - - pods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-machine-config-operator").List(context.Background(), metav1.ListOptions{}) - o.Expect(err).NotTo(o.HaveOccurred()) - - for _, pod := range pods.Items { - e2elog.Logf("Deleting pod %s", pod.Name) - err := oc.AdminKubeClient().CoreV1().Pods("openshift-machine-config-operator").Delete(context.Background(), pod.Name, metav1.DeleteOptions{}) - o.Expect(err).NotTo(o.HaveOccurred()) - } - - err = wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { - mcDS, err := oc.AdminKubeClient().AppsV1().DaemonSets("openshift-machine-config-operator").Get(context.Background(), "machine-config-daemon", metav1.GetOptions{}) - if err != nil { - return false, nil - } - return mcDS.Status.NumberReady == mcDS.Status.NumberAvailable, nil - }) - o.Expect(err).NotTo(o.HaveOccurred()) -} - func objects(from *objx.Value) []objx.Map { var values []objx.Map switch { @@ -285,21 +255,48 @@ func countReady(items []corev1.Node) int { func fetchFileContents(node *corev1.Node, path string) string { e2elog.Logf("Fetching %s file contents from %s", path, node.Name) - out, err := ssh(fmt.Sprintf("cat %q", path), node) - o.Expect(err).NotTo(o.HaveOccurred()) - if len(out.Stderr) > 0 { - e2elog.Logf("file content stderr:\n%s", out.Stderr) - } + out := execOnNodeWithOutputOrFail(node, fmt.Sprintf("cat %q", path)) return out.Stdout } -func expectSSH(cmd string, node *corev1.Node) { - e2elog.Logf("cmd[%s]: %s", node.Name, cmd) - out, err := e2essh.IssueSSHCommandWithResult(cmd, e2e.TestContext.Provider, node) - o.Expect(err).NotTo(o.HaveOccurred()) - if len(out.Stderr) > 0 { - e2elog.Logf("command %q stderr:\n%s", cmd, out.Stderr) - } +// execOnNodeWithOutputOrFail executes a command via ssh against a +// node in a poll loop to ensure reliable execution in a disrupted +// environment. The calling test will be failed if the command cannot +// be executed successfully before the provided timeout. +func execOnNodeWithOutputOrFail(node *corev1.Node, cmd string) *e2essh.Result { + var out *e2essh.Result + var err error + waitErr := wait.PollImmediate(5*time.Second, defaultSSHTimeout, func() (bool, error) { + out, err = e2essh.IssueSSHCommandWithResult(cmd, e2e.TestContext.Provider, node) + // IssueSSHCommandWithResult logs output + if err != nil { + e2elog.Logf("Failed to exec cmd [%s] on node %s: %v", cmd, node.Name, err) + } + return err == nil, nil + }) + o.Expect(waitErr).NotTo(o.HaveOccurred()) + return out +} + +// execOnNodeOrFail executes a command via ssh against a node in a +// poll loop until success or timeout. The output is ignored. 
The +// calling test will be failed if the command cannot be executed +// successfully before the timeout. +func execOnNodeOrFail(node *corev1.Node, cmd string) { + _ = execOnNodeWithOutputOrFail(node, cmd) +} + +// sudoExecOnNodeOrFail executes a command under sudo with execOnNodeOrFail. +func sudoExecOnNodeOrFail(node *corev1.Node, cmd string) { + sudoCmd := fmt.Sprintf(`sudo -i /bin/bash -cx "%s"`, cmd) + execOnNodeOrFail(node, sudoCmd) +} + +// checkSSH repeatedly attempts to establish an ssh connection to a +// node and fails the calling test if unable to establish the +// connection before the default timeout. +func checkSSH(node *corev1.Node) { + _ = execOnNodeWithOutputOrFail(node, "true") } func ssh(cmd string, node *corev1.Node) (*e2essh.Result, error) { diff --git a/test/extended/dr/force_redeploy.go b/test/extended/dr/force_redeploy.go new file mode 100644 index 000000000000..e5db7fa023c8 --- /dev/null +++ b/test/extended/dr/force_redeploy.go @@ -0,0 +1,182 @@ +package dr + +import ( + "context" + "fmt" + "time" + + g "github.com/onsi/ginkgo" + o "github.com/onsi/gomega" + "github.com/pborman/uuid" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/kubernetes/test/e2e/framework" + + operatorv1 "github.com/openshift/api/operator/v1" + operatorv1client "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1" +) + +// Enables forcible redeployment of etcd, kube-apiserver, +// kube-controller-manager, kube-scheduler operands. This is a +// necessary part of restoring a cluster from backup. + +const ( + redeployWaitInterval = 5 * time.Second + redeployWaitTimeout = 2 * time.Minute +) + +// operatorConfigClient supports patching and retrieving the status of +// an operator's 'cluster' config resource to support triggering +// redeployment and watching for a successful rollout. +type operatorConfigClient struct { + name string + patch func(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions) error + getStatus func(ctx context.Context, name string, opts metav1.GetOptions) (*operatorv1.StaticPodOperatorStatus, error) +} + +func (c *operatorConfigClient) String() string { + return c.name +} + +// forceOperandRedeployment forces the redeployment the etcd, +// kube-apiserver, kube-controller-manager and kube-scheduler operands +// (in that order). Only when an operand has been successfully rolled +// out will redeployment of the subsequent operand be attempted. 
+func forceOperandRedeployment(client operatorv1client.OperatorV1Interface) { + clients := []*operatorConfigClient{ + { + name: "etcd", + patch: func(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions) error { + _, err := client.Etcds().Patch(ctx, name, pt, data, opts) + return err + }, + getStatus: func(ctx context.Context, name string, opts metav1.GetOptions) (*operatorv1.StaticPodOperatorStatus, error) { + obj, err := client.Etcds().Get(ctx, name, opts) + if err != nil { + return nil, err + } + return &obj.Status.StaticPodOperatorStatus, nil + }, + }, + { + name: "kube-apiserver", + patch: func(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions) error { + _, err := client.KubeAPIServers().Patch(ctx, name, pt, data, opts) + return err + }, + getStatus: func(ctx context.Context, name string, opts metav1.GetOptions) (*operatorv1.StaticPodOperatorStatus, error) { + obj, err := client.KubeAPIServers().Get(ctx, name, opts) + if err != nil { + return nil, err + } + return &obj.Status.StaticPodOperatorStatus, nil + }, + }, + { + name: "kube-controller-manager", + patch: func(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions) error { + _, err := client.KubeControllerManagers().Patch(ctx, name, pt, data, opts) + return err + }, + getStatus: func(ctx context.Context, name string, opts metav1.GetOptions) (*operatorv1.StaticPodOperatorStatus, error) { + obj, err := client.KubeControllerManagers().Get(ctx, name, opts) + if err != nil { + return nil, err + } + return &obj.Status.StaticPodOperatorStatus, nil + }, + }, + { + name: "kube-scheduler", + patch: func(ctx context.Context, name string, pt types.PatchType, data []byte, opts metav1.PatchOptions) error { + _, err := client.KubeSchedulers().Patch(ctx, name, pt, data, opts) + return err + }, + getStatus: func(ctx context.Context, name string, opts metav1.GetOptions) (*operatorv1.StaticPodOperatorStatus, error) { + obj, err := client.KubeSchedulers().Get(ctx, name, opts) + if err != nil { + return nil, err + } + return &obj.Status.StaticPodOperatorStatus, nil + }, + }, + } + for _, client := range clients { + forceRedeployOperand(client) + } +} + +// forceRedeployOperand initiates redeployment of an operand and waits for a +// successful rollout. +func forceRedeployOperand(client *operatorConfigClient) { + // Retrieve the LatestAvailableRevision before rolling out to know + // what revision not to look for in the subsequent check for + // rollout success. + g.By(fmt.Sprintf("Finding LatestAvailableRevision for %s", client)) + var latestAvailableRevision int32 + err := wait.PollImmediate(redeployWaitInterval, redeployWaitTimeout, func() (done bool, err error) { + status, err := client.getStatus(context.Background(), "cluster", metav1.GetOptions{}) + if err != nil { + framework.Logf("Error retrieving %s operator status: %v", client, err) + } else { + latestAvailableRevision = status.LatestAvailableRevision + } + return err == nil, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + framework.Logf("LatestAvailableRevision for %s is %d", client, latestAvailableRevision) + + // Ensure a unique forceRedeploymentReason for each test run to + // ensure rollout is always triggered even if running repeatedly + // against the same cluster (as when debugging). 
+ reason := fmt.Sprintf("e2e-cluster-restore-%s", uuid.NewUUID()) + + g.By(fmt.Sprintf("Forcing redeployment of %s", client)) + data := fmt.Sprintf(`{"spec": {"forceRedeploymentReason": "%s"}}`, reason) + err = wait.PollImmediate(redeployWaitInterval, redeployWaitTimeout, func() (done bool, err error) { + err = client.patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{}) + if err != nil { + framework.Logf("Error patching %s operator status to set redeploy reason: %v", client, err) + } + return err == nil, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + + g.By(fmt.Sprintf("Waiting for %s to be updated on all nodes to a revision greater than %d", client, latestAvailableRevision)) + waitForRollout(client, latestAvailableRevision) + framework.Logf("Rollout complete for %s", client) +} + +// waitForRollout waits for an operator status to indicate that all nodes are +// at a revision greater than that provided. +func waitForRollout(client *operatorConfigClient, previousRevision int32) { + // Need to wait as long as 15 minutes for rollout of kube apiserver + err := wait.PollImmediate(redeployWaitInterval, 15*time.Minute, func() (done bool, err error) { + status, err := client.getStatus(context.Background(), "cluster", metav1.GetOptions{}) + if err != nil { + framework.Logf("Error retrieving %s operator status: %v", client, err) + return false, nil + } + rolloutComplete := false + for _, condition := range status.Conditions { + if condition.Type == "NodeInstallerProgressing" { + rolloutComplete = condition.Reason == "AllNodesAtLatestRevision" && condition.Status == operatorv1.ConditionFalse + break + } + } + if !rolloutComplete { + return false, nil + } + // Prevent timing issues by ensuring that the revision of all nodes is + // greater than the revision observed before rollout was initiated. 
+ for _, nodeStatus := range status.NodeStatuses { + if nodeStatus.CurrentRevision == previousRevision { + return false, nil + } + } + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) +} diff --git a/test/extended/dr/machine_recover.go b/test/extended/dr/machine_recover.go index a2811cb6ca00..943b37f60927 100644 --- a/test/extended/dr/machine_recover.go +++ b/test/extended/dr/machine_recover.go @@ -29,6 +29,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/kubernetes/test/e2e/framework" + e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" "k8s.io/kubernetes/test/e2e/upgrades" exutil "github.com/openshift/origin/test/extended/util" @@ -45,6 +46,7 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:DisasterRecovery][Disruptive oc := exutil.NewCLIWithoutNamespace("machine-recovery") g.It("[Feature:NodeRecovery] Cluster should survive master and worker failure and recover with machine health checks", func() { + e2eskipper.Skipf("Skipped until the fix for https://bugzilla.redhat.com/show_bug.cgi?id=1905709 is backported to 4.6") framework.Logf("Verify SSH is available before restart") masters, workers := clusterNodes(oc) @@ -52,10 +54,10 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:DisasterRecovery][Disruptive o.Expect(len(workers)).To(o.BeNumerically(">=", 2)) replacedMaster := masters[rand.Intn(len(masters))] - expectSSH("true", replacedMaster) + checkSSH(replacedMaster) replacedWorker := workers[rand.Intn(len(workers))] - expectSSH("true", replacedWorker) + checkSSH(replacedWorker) disruption.Run(f, "Machine Shutdown and Restore", "machine_failure", disruption.TestData{}, diff --git a/test/extended/dr/quorum_restore.go b/test/extended/dr/quorum_restore.go index 43db457eab6a..5be53f551467 100644 --- a/test/extended/dr/quorum_restore.go +++ b/test/extended/dr/quorum_restore.go @@ -37,6 +37,15 @@ const ( machineAnnotationName = "machine.openshift.io/machine" ) +var disruptionTests []upgrades.Test = []upgrades.Test{ + &upgrades.ServiceUpgradeTest{}, + &upgrades.SecretUpgradeTest{}, + &apps.ReplicaSetUpgradeTest{}, + &apps.StatefulSetUpgradeTest{}, + &apps.DeploymentUpgradeTest{}, + &apps.DaemonSetUpgradeTest{}, +} + var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { defer g.GinkgoRecover() @@ -47,6 +56,8 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { oc := exutil.NewCLIWithoutNamespace("disaster-recovery") g.It("[Feature:EtcdRecovery] Cluster should restore itself after quorum loss", func() { + e2eskipper.Skipf("Test is disabled until https://github.com/openshift/origin/pull/25774 is backported to 4.6") + config, err := framework.LoadConfig() o.Expect(err).NotTo(o.HaveOccurred()) dynamicClient := dynamic.NewForConfigOrDie(config) @@ -60,11 +71,6 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { Version: "v1", Resource: "machineconfigpools", }) - coc := dynamicClient.Resource(schema.GroupVersionResource{ - Group: "config.openshift.io", - Version: "v1", - Resource: "clusteroperators", - }) // test for machines as a proxy for "can we recover a master" machines, err := dynamicClient.Resource(schema.GroupVersionResource{ @@ -79,14 +85,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { disruption.Run(f, "Quorum Loss and Restore", "quorum_restore", disruption.TestData{}, - []upgrades.Test{ - &upgrades.ServiceUpgradeTest{}, - &upgrades.SecretUpgradeTest{}, - &apps.ReplicaSetUpgradeTest{}, - 
&apps.StatefulSetUpgradeTest{}, - &apps.DeploymentUpgradeTest{}, - &apps.DaemonSetUpgradeTest{}, - }, + disruptionTests, func() { framework.Logf("Verify SSH is available before restart") @@ -94,7 +93,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { o.Expect(len(masters)).To(o.BeNumerically(">=", 1)) survivingNode := masters[rand.Intn(len(masters))] survivingNodeName := survivingNode.Name - expectSSH("true", survivingNode) + checkSSH(survivingNode) err = scaleEtcdQuorum(oc.AdminKubeClient(), 0) o.Expect(err).NotTo(o.HaveOccurred()) @@ -129,6 +128,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { err = wait.Poll(5*time.Second, 30*time.Minute, func() (done bool, err error) { _, err = pollClient.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) if err != nil { + framework.Logf("Error seen checking for unresponsive control plane: %v", err) failures++ } else { failures = 0 @@ -137,7 +137,9 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { // there is a small chance the cluster restores the default replica size during // this loop process, so keep forcing quorum guard to be zero, without failing on // errors - scaleEtcdQuorum(pollClient, 0) + if err := scaleEtcdQuorum(pollClient, 0); err != nil { + framework.Logf("Scaling etcd quorum failed: %v", err) + } // wait to see the control plane go down for good to avoid a transient failure return failures > 4, nil @@ -145,9 +147,11 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { } framework.Logf("Perform etcd backup on remaining machine %s (machine %s)", survivingNodeName, survivingMachineName) - expectSSH("sudo -i /bin/bash -cx 'rm -rf /home/core/backup; /usr/local/bin/cluster-backup.sh ~core/backup'", survivingNode) + // Need to supply --force to the backup script to avoid failing on the api check for progressing operators. 
+ execOnNodeOrFail(survivingNode, "sudo -i /bin/bash -cx 'rm -rf /home/core/backup; /usr/local/bin/cluster-backup.sh --force ~core/backup'") + framework.Logf("Restore etcd and control-plane on remaining node %s (machine %s)", survivingNodeName, survivingMachineName) - expectSSH("sudo -i /bin/bash -cx '/usr/local/bin/cluster-restore.sh /home/core/backup'", survivingNode) + execOnNodeOrFail(survivingNode, "sudo -i /bin/bash -cx '/usr/local/bin/cluster-restore.sh /home/core/backup'") framework.Logf("Wait for API server to come up") time.Sleep(30 * time.Second) @@ -188,7 +192,8 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { return false, nil } if err != nil { - return false, err + framework.Logf("Error seen when re-creating machines: %v", err) + return false, nil } return true, nil }) @@ -196,12 +201,13 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { } framework.Logf("Waiting for machines to be created") - err = wait.Poll(30*time.Second, 10*time.Minute, func() (done bool, err error) { + err = wait.Poll(30*time.Second, 20*time.Minute, func() (done bool, err error) { mastersList, err := ms.List(context.Background(), metav1.ListOptions{ LabelSelector: "machine.openshift.io/cluster-api-machine-role=master", }) if err != nil { - return false, err + framework.Logf("Failed to check that machines are created: %v", err) + return false, nil } if mastersList.Items == nil { return false, nil @@ -220,6 +226,7 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{LabelSelector: "node-role.kubernetes.io/master="}) if err != nil { // scale up to 2nd etcd will make this error inevitable + framework.Logf("Error seen attempting to list master nodes: %v", err) return false, nil } ready := countReady(nodes.Items) @@ -240,15 +247,8 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { _, err = oc.AdminOperatorClient().OperatorV1().KubeAPIServers().Patch(context.Background(), "cluster", types.MergePatchType, []byte(`{"spec": {"forceRedeploymentReason": "recover-kube-apiserver"}}`), metav1.PatchOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) - framework.Logf("Wait for etcd pods to become available") - _, err = waitForPodsTolerateClientTimeout( - oc.AdminKubeClient().CoreV1().Pods("openshift-etcd"), - exutil.ParseLabelsOrDie("k8s-app=etcd"), - exutil.CheckPodIsReady, - expectedNumberOfMasters, - 40*time.Minute, - ) - o.Expect(err).NotTo(o.HaveOccurred()) + // Recovery 13 + waitForReadyEtcdPods(oc.AdminKubeClient(), expectedNumberOfMasters) scaleEtcdQuorum(pollClient, expectedNumberOfMasters) @@ -256,27 +256,26 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { // SDN won't switch to Degraded mode when service is down after disaster recovery // restartSDNPods(oc) waitForMastersToUpdate(oc, mcps) - waitForOperatorsToSettle(coc) + waitForOperatorsToSettle() }) }, ) }) -func waitForPodsTolerateClientTimeout(c corev1client.PodInterface, label labels.Selector, predicate func(corev1.Pod) bool, count int, timeout time.Duration) ([]string, error) { - var podNames []string - err := wait.Poll(1*time.Second, timeout, func() (bool, error) { +func waitForPodsTolerateClientTimeout(c corev1client.PodInterface, label labels.Selector, predicate func(corev1.Pod) bool, count int, timeout time.Duration) { + err := wait.Poll(10*time.Second, timeout, func() (bool, error) { p, e := 
exutil.GetPodNamesByFilter(c, label, predicate) if e != nil { + framework.Logf("Saw an error waiting for etcd pods to become available: %v", e) // TODO tolerate transient etcd timeout only and fail other errors return false, nil } if len(p) != count { return false, nil } - podNames = p return true, nil }) - return podNames, err + o.Expect(err).NotTo(o.HaveOccurred()) } func scaleEtcdQuorum(client kubernetes.Interface, replicas int) error { diff --git a/test/extended/dr/restore_from_snapshot.go b/test/extended/dr/restore_from_snapshot.go deleted file mode 100644 index 3cf289e21785..000000000000 --- a/test/extended/dr/restore_from_snapshot.go +++ /dev/null @@ -1,135 +0,0 @@ -package dr - -import ( - "context" - "fmt" - "os" - "strings" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/dynamic" - e2e "k8s.io/kubernetes/test/e2e/framework" - - exutil "github.com/openshift/origin/test/extended/util" - - g "github.com/onsi/ginkgo" - o "github.com/onsi/gomega" -) - -const ( - rollBackMachineConfig = "99-rollback-test" -) - -var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { - f := e2e.NewDefaultFramework("disaster-recovery") - f.SkipNamespaceCreation = true - f.SkipPrivilegedPSPBinding = true - - oc := exutil.NewCLIWithoutNamespace("disaster-recovery") - - g.It("[dr-etcd-snapshot] Cluster should restore itself from etcd snapshot", func() { - config, err := e2e.LoadConfig() - o.Expect(err).NotTo(o.HaveOccurred()) - dynamicClient := dynamic.NewForConfigOrDie(config) - mcps := dynamicClient.Resource(schema.GroupVersionResource{ - Group: "machineconfiguration.openshift.io", - Version: "v1", - Resource: "machineconfigpools", - }) - mc := dynamicClient.Resource(schema.GroupVersionResource{ - Group: "machineconfiguration.openshift.io", - Version: "v1", - Resource: "machineconfigs", - }) - coc := dynamicClient.Resource(schema.GroupVersionResource{ - Group: "config.openshift.io", - Version: "v1", - Resource: "clusteroperators", - }) - - setMachineConfig("rollback-A.yaml", oc, mcps) - - masters := masterNodes(oc) - masterNames := sets.NewString() - for _, node := range masters { - masterNames.Insert(node.Name) - } - - e2e.Logf("masters: %v", masters) - o.Expect(masters).NotTo(o.BeEmpty()) - firstMaster := masters[0] - e2e.Logf("first master: %v", firstMaster) - - e2e.Logf("Make etcd backup on first master") - expectSSH("sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-backup.sh /root/assets/backup", firstMaster) - expectSSH("sudo -i /bin/bash -c '/bin/tar -xzf /root/assets/backup/snapshot* -C /root/assets/backup snapshot.db'", firstMaster) - expectSSH("sudo -i install -o core -g core /root/assets/backup/snapshot.db /tmp/snapshot.db", firstMaster) - - setMachineConfig("rollback-B.yaml", oc, mcps) - - masterHosts := strings.Join(masterNames.List(), " ") - restoreScriptPath := exutil.FixturePath("testdata", "disaster-recovery", "restore-etcd.sh") - cmd := fmt.Sprintf("env BASTION_HOST= MASTERHOSTS='%s' KUBE_SSH_KEY_PATH='%s' /bin/bash -x %s ", masterHosts, os.Getenv("KUBE_SSH_KEY_PATH"), restoreScriptPath) - runCommandAndRetry(cmd) - - time.Sleep(30 * time.Second) - waitForAPIServer(oc) - // restartSDNPods(oc) - restartOpenshiftAPIPods(oc) - restartMCDPods(oc) - waitForMastersToUpdate(oc, mcps) - waitForOperatorsToSettle(coc) - - 
rollBackInMC := getRollbackContentsInMachineConfig(oc, mc, rollBackMachineConfig) - o.Expect(rollBackInMC).To(o.BeEquivalentTo("data:,A")) - - for _, master := range masters { - rollBackFile := fetchFileContents(master, "/etc/rollback-test") - o.Expect(rollBackFile).To(o.BeEquivalentTo("A")) - } - }) -}) - -func setMachineConfig(rollbackFileName string, oc *exutil.CLI, mcps dynamic.NamespaceableResourceInterface) { - e2e.Logf("Update MachineConfig using %s file on masters", rollbackFileName) - machineConfigTemplate := exutil.FixturePath("testdata", "disaster-recovery", rollbackFileName) - err := oc.Run("apply").Args("-f", machineConfigTemplate).Execute() - o.Expect(err).NotTo(o.HaveOccurred()) - - waitForMastersToUpdate(oc, mcps) -} - -func getRollbackContentsInMachineConfig(oc *exutil.CLI, mcs dynamic.NamespaceableResourceInterface, mcName string) string { - e2e.Logf("Reading contents of rollback MachineConfig") - pool, err := mcs.Get(context.Background(), mcName, metav1.GetOptions{}) - o.Expect(err).NotTo(o.HaveOccurred()) - - files, found, err := unstructured.NestedSlice(pool.Object, "spec", "config", "storage", "files") - o.Expect(err).NotTo(o.HaveOccurred()) - o.Expect(found).To(o.BeTrue()) - o.Expect(files).NotTo(o.BeEmpty()) - - file := files[0].(map[string]interface{}) - actual, found, err := unstructured.NestedString(file, "contents", "source") - o.Expect(err).NotTo(o.HaveOccurred()) - o.Expect(found).To(o.BeTrue()) - - return actual -} - -func waitForAPIServer(oc *exutil.CLI) { - e2e.Logf("Waiting for API server to restore") - err := wait.Poll(10*time.Second, 5*time.Minute, func() (done bool, err error) { - _, err = oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) - if err != nil { - return false, nil - } - return true, nil - }) - o.Expect(err).NotTo(o.HaveOccurred()) -} diff --git a/test/extended/testdata/bindata.go b/test/extended/testdata/bindata.go index 7f90e8586313..0de5680720bd 100644 --- a/test/extended/testdata/bindata.go +++ b/test/extended/testdata/bindata.go @@ -335,19 +335,6 @@ // test/extended/testdata/deployments/tag-images-deployment.yaml // test/extended/testdata/deployments/test-deployment-broken.yaml // test/extended/testdata/deployments/test-deployment-test.yaml -// test/extended/testdata/disaster-recovery/restore-etcd.sh -// test/extended/testdata/disaster-recovery/rollback-A.yaml -// test/extended/testdata/disaster-recovery/rollback-B.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml -// test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config -// test/extended/testdata/disaster-recovery/update_route_53.py // test/extended/testdata/forcepull-test.json // test/extended/testdata/gssapi/config/kubeconfig // test/extended/testdata/gssapi/config/oauth_config.json @@ -48495,525 +48482,6 @@ func testExtendedTestdataDeploymentsTestDeploymentTestYaml() (*asset, error) { return a, nil } -var _testExtendedTestdataDisasterRecoveryRestoreEtcdSh = []byte(`#!/bin/bash -set -euo pipefail - -if [ 
-z "${BASTION_HOST}" ]; then exit 1; fi -if [ -z "${MASTERHOSTS}" ]; then exit 1; fi -if [ -z "${KUBE_SSH_KEY_PATH}" ]; then exit 1; fi - -MASTERS=(${MASTERHOSTS}) -FIRST_MASTER="${MASTERS[0]}" - -function retry() { - local ATTEMPTS="${1}" - local rc=0 - shift - for i in $(seq 0 $((ATTEMPTS-1))); do - echo "--> ${@}" - set +e - "${@}" - rc="$?" - set -e - echo "--> exit code: $rc" - test "${rc}" = 0 && break - sleep 10 - done - return "${rc}" -} - -function bastion_ssh() { - retry 60 \ - ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=30 -o StrictHostKeyChecking=no \ - -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=30 -W %h:%p core@${BASTION_HOST} 2>/dev/null" \ - $@ -} - -echo "Distribute snapshot across all masters" -for master in "${MASTERS[@]}" -do - scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" ${KUBE_SSH_KEY_PATH} "core@${master}":/home/core/.ssh/id_rsa - bastion_ssh "core@${master}" "sudo -i chmod 0600 /home/core/.ssh/id_rsa" - bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/snapshot.db core@${master}:/tmp/snapshot.db" -done - -echo "Collect etcd names" -for master in "${MASTERS[@]}" -do - bastion_ssh "core@${master}" 'echo "etcd-member-$(hostname -f)" > /tmp/etcd_name && source /run/etcd/environment && echo "https://${ETCD_DNS_NAME}:2380" > /tmp/etcd_uri' - bastion_ssh "core@${FIRST_MASTER}" "mkdir -p /tmp/etcd/${master} && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_name /tmp/etcd/${master}/etcd_name && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_uri /tmp/etcd/${master}/etcd_uri" - bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_name" - bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_uri" -done - -echo "Assemble etcd connection string" -bastion_ssh "core@${FIRST_MASTER}" 'rm -rf /tmp/etcd/connstring && mapfile -t MASTERS < <(ls /tmp/etcd) && echo ${MASTERS[@]} && for master in "${MASTERS[@]}"; do echo -n "$(cat /tmp/etcd/${master}/etcd_name)=$(cat /tmp/etcd/${master}/etcd_uri)," >> /tmp/etcd/connstring; done && sed -i '"'$ s/.$//'"' /tmp/etcd/connstring' - -echo "Restore etcd cluster from snapshot" -for master in "${MASTERS[@]}" -do - echo "Running /usr/local/bin/etcd-snapshot-restore.sh on ${master}" - bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/etcd/connstring core@${master}:/tmp/etcd_connstring" - bastion_ssh "core@${master}" 'sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-restore.sh /tmp/snapshot.db $(cat /tmp/etcd_connstring)' -done -`) - -func testExtendedTestdataDisasterRecoveryRestoreEtcdShBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoveryRestoreEtcdSh, nil -} - -func testExtendedTestdataDisasterRecoveryRestoreEtcdSh() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoveryRestoreEtcdShBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/restore-etcd.sh", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoveryRollbackAYaml = []byte(`apiVersion: machineconfiguration.openshift.io/v1 -kind: MachineConfig -metadata: - labels: - machineconfiguration.openshift.io/role: master - name: 99-rollback-test -spec: - config: - ignition: - version: 2.2.0 - 
storage: - files: - - contents: - source: data:,A - filesystem: root - mode: 420 - path: /etc/rollback-test -`) - -func testExtendedTestdataDisasterRecoveryRollbackAYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoveryRollbackAYaml, nil -} - -func testExtendedTestdataDisasterRecoveryRollbackAYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoveryRollbackAYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/rollback-A.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoveryRollbackBYaml = []byte(`apiVersion: machineconfiguration.openshift.io/v1 -kind: MachineConfig -metadata: - labels: - machineconfiguration.openshift.io/role: master - name: 99-rollback-test -spec: - config: - ignition: - version: 2.2.0 - storage: - files: - - contents: - source: data:,B - filesystem: root - mode: 420 - path: /etc/rollback-test -`) - -func testExtendedTestdataDisasterRecoveryRollbackBYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoveryRollbackBYaml, nil -} - -func testExtendedTestdataDisasterRecoveryRollbackBYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoveryRollbackBYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/rollback-B.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: ssh-bastion -rules: -- apiGroups: - - "machineconfiguration.openshift.io" - resources: - - "machineconfigs" - verbs: - - get -- apiGroups: - - "" - resources: - - "nodes" - verbs: - - list - - get -`) - -func testExtendedTestdataDisasterRecoverySshBastionClusterroleYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionClusterroleYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - annotations: - openshift.io/description: Allows ssh-pod to read nodes and machineconfigs - name: ssh-bastion -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: ssh-bastion -subjects: -- apiGroup: rbac.authorization.k8s.io - kind: User - name: system:serviceaccount:ssh-bastion:ssh-bastion -`) - -func testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: 
"test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml = []byte(`apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - run: ssh-bastion - name: ssh-bastion - namespace: ssh-bastion -spec: - replicas: 1 - selector: - matchLabels: - run: ssh-bastion - template: - metadata: - labels: - run: ssh-bastion - spec: - serviceAccountName: "ssh-bastion" - containers: - - image: quay.io/eparis/ssh:latest - imagePullPolicy: Always - name: ssh-bastion - ports: - - containerPort: 22 - name: ssh - protocol: TCP - volumeMounts: - - name: ssh-host-keys - mountPath: "/etc/ssh/" - readOnly: true - securityContext: - privileged: true - volumes: - - name: ssh-host-keys - secret: - secretName: ssh-host-keys - items: - - key: ssh_host_rsa_key - path: ssh_host_rsa_key - mode: 256 - - key: ssh_host_ecdsa_key - path: ssh_host_ecdsa_key - mode: 256 - - key: ssh_host_ed25519_key - path: ssh_host_ed25519_key - mode: 256 - - key: sshd_config - path: sshd_config - restartPolicy: Always -`) - -func testExtendedTestdataDisasterRecoverySshBastionDeploymentYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionDeploymentYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml = []byte(`apiVersion: v1 -kind: Namespace -metadata: - name: ssh-bastion - labels: - openshift.io/run-level: "0" - -`) - -func testExtendedTestdataDisasterRecoverySshBastionNamespaceYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionNamespaceYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoverySshBastionRoleYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: ssh-bastion - namespace: ssh-bastion -rules: -- apiGroups: - - security.openshift.io - resources: - - securitycontextconstraints - verbs: - - use - resourceNames: - - privileged -`) - -func testExtendedTestdataDisasterRecoverySshBastionRoleYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionRoleYaml, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionRoleYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionRoleYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var 
_testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml = []byte(`apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - annotations: - openshift.io/description: Allows ssh-pod to run as root - name: ssh-bastion - namespace: ssh-bastion -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: ssh-bastion -subjects: -- apiGroup: rbac.authorization.k8s.io - kind: User - name: system:serviceaccount:ssh-bastion:ssh-bastion -`) - -func testExtendedTestdataDisasterRecoverySshBastionRolebindingYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionRolebindingYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoverySshBastionServiceYaml = []byte(`apiVersion: v1 -kind: Service -metadata: - labels: - run: ssh-bastion - name: ssh-bastion - namespace: ssh-bastion -spec: - externalTrafficPolicy: Local - ports: - - name: ssh - port: 22 - protocol: TCP - targetPort: ssh - selector: - run: ssh-bastion - type: LoadBalancer -`) - -func testExtendedTestdataDisasterRecoverySshBastionServiceYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionServiceYaml, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionServiceYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionServiceYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml = []byte(`apiVersion: v1 -kind: ServiceAccount -metadata: - name: ssh-bastion - namespace: ssh-bastion -`) - -func testExtendedTestdataDisasterRecoverySshBastionServiceaccountYamlBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionServiceaccountYamlBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoverySshBastionSshd_config = []byte(`HostKey /etc/ssh/ssh_host_rsa_key -HostKey /etc/ssh/ssh_host_ecdsa_key -HostKey /etc/ssh/ssh_host_ed25519_key -SyslogFacility AUTHPRIV -PermitRootLogin no -AuthorizedKeysFile /home/core/.ssh/authorized_keys -PasswordAuthentication no -ChallengeResponseAuthentication no -GSSAPIAuthentication yes -GSSAPICleanupCredentials no -UsePAM yes -X11Forwarding yes -PrintMotd no -AcceptEnv LANG LC_CTYPE LC_NUMERIC LC_TIME LC_COLLATE LC_MONETARY LC_MESSAGES -AcceptEnv LC_PAPER LC_NAME LC_ADDRESS LC_TELEPHONE LC_MEASUREMENT -AcceptEnv LC_IDENTIFICATION LC_ALL LANGUAGE -AcceptEnv XMODIFIERS -Subsystem sftp /usr/libexec/openssh/sftp-server -`) - -func 
testExtendedTestdataDisasterRecoverySshBastionSshd_configBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoverySshBastionSshd_config, nil -} - -func testExtendedTestdataDisasterRecoverySshBastionSshd_config() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoverySshBastionSshd_configBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - -var _testExtendedTestdataDisasterRecoveryUpdate_route_53Py = []byte(`import boto3 -import os -import sys -from time import sleep - -if len(sys.argv) < 4: - print("Usage: ./update_route_53.py ") - sys.exit(1) - -attempts = 10 -pause = 10 - -domain = sys.argv[1] -record = sys.argv[2] -ip = sys.argv[3] -print("record: %s" % record) -print("ip: %s" % ip) - -client = boto3.client('route53') -r = client.list_hosted_zones_by_name(DNSName=domain, MaxItems="1") -zone_id = r['HostedZones'][0]['Id'].split('/')[-1] - -response = client.change_resource_record_sets( - HostedZoneId=zone_id, - ChangeBatch= { - 'Comment': 'add %s -> %s' % (record, ip), - 'Changes': [ - { - 'Action': 'UPSERT', - 'ResourceRecordSet': { - 'Name': record, - 'Type': 'A', - 'TTL': 60, - 'ResourceRecords': [{'Value': ip}] - } - }] -}) -for i in range(attempts): - print('response: %s' % response) - changeID = response['ChangeInfo']['Id'] - if response['ChangeInfo']['Status'] == "INSYNC": - print('insync found, response: %s' % response) - break - print('waiting for response to complete') - sleep(pause) - response = client.get_change(Id=changeID) -`) - -func testExtendedTestdataDisasterRecoveryUpdate_route_53PyBytes() ([]byte, error) { - return _testExtendedTestdataDisasterRecoveryUpdate_route_53Py, nil -} - -func testExtendedTestdataDisasterRecoveryUpdate_route_53Py() (*asset, error) { - bytes, err := testExtendedTestdataDisasterRecoveryUpdate_route_53PyBytes() - if err != nil { - return nil, err - } - - info := bindataFileInfo{name: "test/extended/testdata/disaster-recovery/update_route_53.py", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} - a := &asset{bytes: bytes, info: info} - return a, nil -} - var _testExtendedTestdataForcepullTestJson = []byte(`{ "kind": "List", "apiVersion": "v1", @@ -61812,19 +61280,6 @@ var _bindata = map[string]func() (*asset, error){ "test/extended/testdata/deployments/tag-images-deployment.yaml": testExtendedTestdataDeploymentsTagImagesDeploymentYaml, "test/extended/testdata/deployments/test-deployment-broken.yaml": testExtendedTestdataDeploymentsTestDeploymentBrokenYaml, "test/extended/testdata/deployments/test-deployment-test.yaml": testExtendedTestdataDeploymentsTestDeploymentTestYaml, - "test/extended/testdata/disaster-recovery/restore-etcd.sh": testExtendedTestdataDisasterRecoveryRestoreEtcdSh, - "test/extended/testdata/disaster-recovery/rollback-A.yaml": testExtendedTestdataDisasterRecoveryRollbackAYaml, - "test/extended/testdata/disaster-recovery/rollback-B.yaml": testExtendedTestdataDisasterRecoveryRollbackBYaml, - "test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml": testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml, - "test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml": testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml, - "test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml": 
-	"test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml": testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml,
-	"test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml": testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml,
-	"test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml": testExtendedTestdataDisasterRecoverySshBastionRoleYaml,
-	"test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml": testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml,
-	"test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml": testExtendedTestdataDisasterRecoverySshBastionServiceYaml,
-	"test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml": testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml,
-	"test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config": testExtendedTestdataDisasterRecoverySshBastionSshd_config,
-	"test/extended/testdata/disaster-recovery/update_route_53.py": testExtendedTestdataDisasterRecoveryUpdate_route_53Py,
 	"test/extended/testdata/forcepull-test.json": testExtendedTestdataForcepullTestJson,
 	"test/extended/testdata/gssapi/config/kubeconfig": testExtendedTestdataGssapiConfigKubeconfig,
 	"test/extended/testdata/gssapi/config/oauth_config.json": testExtendedTestdataGssapiConfigOauth_configJson,
@@ -62552,23 +62007,6 @@ var _bintree = &bintree{nil, map[string]*bintree{
 				"test-deployment-broken.yaml": {testExtendedTestdataDeploymentsTestDeploymentBrokenYaml, map[string]*bintree{}},
 				"test-deployment-test.yaml": {testExtendedTestdataDeploymentsTestDeploymentTestYaml, map[string]*bintree{}},
 			}},
-			"disaster-recovery": {nil, map[string]*bintree{
-				"restore-etcd.sh": {testExtendedTestdataDisasterRecoveryRestoreEtcdSh, map[string]*bintree{}},
-				"rollback-A.yaml": {testExtendedTestdataDisasterRecoveryRollbackAYaml, map[string]*bintree{}},
-				"rollback-B.yaml": {testExtendedTestdataDisasterRecoveryRollbackBYaml, map[string]*bintree{}},
-				"ssh-bastion": {nil, map[string]*bintree{
-					"clusterrole.yaml": {testExtendedTestdataDisasterRecoverySshBastionClusterroleYaml, map[string]*bintree{}},
-					"clusterrolebinding.yaml": {testExtendedTestdataDisasterRecoverySshBastionClusterrolebindingYaml, map[string]*bintree{}},
-					"deployment.yaml": {testExtendedTestdataDisasterRecoverySshBastionDeploymentYaml, map[string]*bintree{}},
-					"namespace.yaml": {testExtendedTestdataDisasterRecoverySshBastionNamespaceYaml, map[string]*bintree{}},
-					"role.yaml": {testExtendedTestdataDisasterRecoverySshBastionRoleYaml, map[string]*bintree{}},
-					"rolebinding.yaml": {testExtendedTestdataDisasterRecoverySshBastionRolebindingYaml, map[string]*bintree{}},
-					"service.yaml": {testExtendedTestdataDisasterRecoverySshBastionServiceYaml, map[string]*bintree{}},
-					"serviceaccount.yaml": {testExtendedTestdataDisasterRecoverySshBastionServiceaccountYaml, map[string]*bintree{}},
-					"sshd_config": {testExtendedTestdataDisasterRecoverySshBastionSshd_config, map[string]*bintree{}},
-				}},
-				"update_route_53.py": {testExtendedTestdataDisasterRecoveryUpdate_route_53Py, map[string]*bintree{}},
-			}},
 			"forcepull-test.json": {testExtendedTestdataForcepullTestJson, map[string]*bintree{}},
 			"gssapi": {nil, map[string]*bintree{
 				"config": {nil, map[string]*bintree{
diff --git a/test/extended/testdata/disaster-recovery/restore-etcd.sh b/test/extended/testdata/disaster-recovery/restore-etcd.sh
deleted file mode 100755
index e709848c560f..000000000000
--- a/test/extended/testdata/disaster-recovery/restore-etcd.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-if [ -z "${BASTION_HOST}" ]; then exit 1; fi
-if [ -z "${MASTERHOSTS}" ]; then exit 1; fi
-if [ -z "${KUBE_SSH_KEY_PATH}" ]; then exit 1; fi
-
-MASTERS=(${MASTERHOSTS})
-FIRST_MASTER="${MASTERS[0]}"
-
-function retry() {
-  local ATTEMPTS="${1}"
-  local rc=0
-  shift
-  for i in $(seq 0 $((ATTEMPTS-1))); do
-    echo "--> ${@}"
-    set +e
-    "${@}"
-    rc="$?"
-    set -e
-    echo "--> exit code: $rc"
-    test "${rc}" = 0 && break
-    sleep 10
-  done
-  return "${rc}"
-}
-
-function bastion_ssh() {
-  retry 60 \
-    ssh -o LogLevel=error -o ConnectionAttempts=100 -o ConnectTimeout=30 -o StrictHostKeyChecking=no \
-    -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o LogLevel=error -o ServerAliveInterval=30 -o ConnectionAttempts=100 -o ConnectTimeout=30 -W %h:%p core@${BASTION_HOST} 2>/dev/null" \
-    $@
-}
-
-echo "Distribute snapshot across all masters"
-for master in "${MASTERS[@]}"
-do
-  scp -o StrictHostKeyChecking=no -o ProxyCommand="ssh -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" ${KUBE_SSH_KEY_PATH} "core@${master}":/home/core/.ssh/id_rsa
-  bastion_ssh "core@${master}" "sudo -i chmod 0600 /home/core/.ssh/id_rsa"
-  bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/snapshot.db core@${master}:/tmp/snapshot.db"
-done
-
-echo "Collect etcd names"
-for master in "${MASTERS[@]}"
-do
-  bastion_ssh "core@${master}" 'echo "etcd-member-$(hostname -f)" > /tmp/etcd_name && source /run/etcd/environment && echo "https://${ETCD_DNS_NAME}:2380" > /tmp/etcd_uri'
-  bastion_ssh "core@${FIRST_MASTER}" "mkdir -p /tmp/etcd/${master} && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_name /tmp/etcd/${master}/etcd_name && scp -o StrictHostKeyChecking=no core@${master}:/tmp/etcd_uri /tmp/etcd/${master}/etcd_uri"
-  bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_name"
-  bastion_ssh "core@${FIRST_MASTER}" "cat /tmp/etcd/${master}/etcd_uri"
-done
-
-echo "Assemble etcd connection string"
-bastion_ssh "core@${FIRST_MASTER}" 'rm -rf /tmp/etcd/connstring && mapfile -t MASTERS < <(ls /tmp/etcd) && echo ${MASTERS[@]} && for master in "${MASTERS[@]}"; do echo -n "$(cat /tmp/etcd/${master}/etcd_name)=$(cat /tmp/etcd/${master}/etcd_uri)," >> /tmp/etcd/connstring; done && sed -i '"'$ s/.$//'"' /tmp/etcd/connstring'
-
-echo "Restore etcd cluster from snapshot"
-for master in "${MASTERS[@]}"
-do
-  echo "Running /usr/local/bin/etcd-snapshot-restore.sh on ${master}"
-  bastion_ssh "core@${FIRST_MASTER}" "scp -o StrictHostKeyChecking=no /tmp/etcd/connstring core@${master}:/tmp/etcd_connstring"
-  bastion_ssh "core@${master}" 'sudo -i /bin/bash -x /usr/local/bin/etcd-snapshot-restore.sh /tmp/snapshot.db $(cat /tmp/etcd_connstring)'
-done
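For reference, the removed restore-etcd.sh helper was driven entirely by environment variables and expected the etcd snapshot to already exist at /tmp/snapshot.db on the first master. A minimal invocation sketch, with placeholder hostnames and key path (not values from the test suite):

```bash
# Illustrative only: addresses and paths below are placeholders.
export BASTION_HOST=bastion.example.com            # address of the ssh-bastion LoadBalancer
export MASTERHOSTS="master-0 master-1 master-2"    # space-separated master hostnames
export KUBE_SSH_KEY_PATH=$HOME/.ssh/id_rsa         # key used to reach masters through the bastion
# The script copies /tmp/snapshot.db from the first master to the others and
# runs /usr/local/bin/etcd-snapshot-restore.sh on each one with the assembled
# etcd connection string.
./test/extended/testdata/disaster-recovery/restore-etcd.sh
```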
diff --git a/test/extended/testdata/disaster-recovery/rollback-A.yaml b/test/extended/testdata/disaster-recovery/rollback-A.yaml
deleted file mode 100644
index 9634a704cf9e..000000000000
--- a/test/extended/testdata/disaster-recovery/rollback-A.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-apiVersion: machineconfiguration.openshift.io/v1
-kind: MachineConfig
-metadata:
-  labels:
-    machineconfiguration.openshift.io/role: master
-  name: 99-rollback-test
-spec:
-  config:
-    ignition:
-      version: 2.2.0
-    storage:
-      files:
-      - contents:
-          source: data:,A
-        filesystem: root
-        mode: 420
-        path: /etc/rollback-test
diff --git a/test/extended/testdata/disaster-recovery/rollback-B.yaml b/test/extended/testdata/disaster-recovery/rollback-B.yaml
deleted file mode 100644
index f731af036ab7..000000000000
--- a/test/extended/testdata/disaster-recovery/rollback-B.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-apiVersion: machineconfiguration.openshift.io/v1
-kind: MachineConfig
-metadata:
-  labels:
-    machineconfiguration.openshift.io/role: master
-  name: 99-rollback-test
-spec:
-  config:
-    ignition:
-      version: 2.2.0
-    storage:
-      files:
-      - contents:
-          source: data:,B
-        filesystem: root
-        mode: 420
-        path: /etc/rollback-test
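The two removed MachineConfigs differ only in the data written to /etc/rollback-test on master nodes ("A" versus "B"). A hedged sketch of how they could be exercised by hand; the oc commands and the wait step are illustrative, not taken from the removed test code:

```bash
# Apply revision A, let the machine-config operator roll the master pool,
# confirm the file contents on a master, then switch to revision B.
oc apply -f test/extended/testdata/disaster-recovery/rollback-A.yaml
oc wait machineconfigpool/master --for=condition=Updated --timeout=30m
oc debug node/<some-master-node> -- chroot /host cat /etc/rollback-test   # expect "A"
oc apply -f test/extended/testdata/disaster-recovery/rollback-B.yaml
```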
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml
deleted file mode 100644
index f7ce7f35641c..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrole.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: ssh-bastion
-rules:
-- apiGroups:
-  - "machineconfiguration.openshift.io"
-  resources:
-  - "machineconfigs"
-  verbs:
-  - get
-- apiGroups:
-  - ""
-  resources:
-  - "nodes"
-  verbs:
-  - list
-  - get
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml
deleted file mode 100644
index cdad0df9e50f..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/clusterrolebinding.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  annotations:
-    openshift.io/description: Allows ssh-pod to read nodes and machineconfigs
-  name: ssh-bastion
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: ssh-bastion
-subjects:
-- apiGroup: rbac.authorization.k8s.io
-  kind: User
-  name: system:serviceaccount:ssh-bastion:ssh-bastion
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml
deleted file mode 100644
index 7e707120aa14..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/deployment.yaml
+++ /dev/null
@@ -1,49 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  labels:
-    run: ssh-bastion
-  name: ssh-bastion
-  namespace: ssh-bastion
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      run: ssh-bastion
-  template:
-    metadata:
-      labels:
-        run: ssh-bastion
-    spec:
-      serviceAccountName: "ssh-bastion"
-      containers:
-      - image: quay.io/eparis/ssh:latest
-        imagePullPolicy: Always
-        name: ssh-bastion
-        ports:
-        - containerPort: 22
-          name: ssh
-          protocol: TCP
-        volumeMounts:
-        - name: ssh-host-keys
-          mountPath: "/etc/ssh/"
-          readOnly: true
-        securityContext:
-          privileged: true
-      volumes:
-      - name: ssh-host-keys
-        secret:
-          secretName: ssh-host-keys
-          items:
-          - key: ssh_host_rsa_key
-            path: ssh_host_rsa_key
-            mode: 256
-          - key: ssh_host_ecdsa_key
-            path: ssh_host_ecdsa_key
-            mode: 256
-          - key: ssh_host_ed25519_key
-            path: ssh_host_ed25519_key
-            mode: 256
-          - key: sshd_config
-            path: sshd_config
-      restartPolicy: Always
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml
deleted file mode 100644
index 41fe6775c02c..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/namespace.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: ssh-bastion
-  labels:
-    openshift.io/run-level: "0"
-
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml
deleted file mode 100644
index 825d93b554ac..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/role.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
-  name: ssh-bastion
-  namespace: ssh-bastion
-rules:
-- apiGroups:
-  - security.openshift.io
-  resources:
-  - securitycontextconstraints
-  verbs:
-  - use
-  resourceNames:
-  - privileged
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml
deleted file mode 100644
index ba2e2f2b4bdb..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/rolebinding.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
-  annotations:
-    openshift.io/description: Allows ssh-pod to run as root
-  name: ssh-bastion
-  namespace: ssh-bastion
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: Role
-  name: ssh-bastion
-subjects:
-- apiGroup: rbac.authorization.k8s.io
-  kind: User
-  name: system:serviceaccount:ssh-bastion:ssh-bastion
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml
deleted file mode 100644
index 63fb71775799..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/service.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  labels:
-    run: ssh-bastion
-  name: ssh-bastion
-  namespace: ssh-bastion
-spec:
-  externalTrafficPolicy: Local
-  ports:
-  - name: ssh
-    port: 22
-    protocol: TCP
-    targetPort: ssh
-  selector:
-    run: ssh-bastion
-  type: LoadBalancer
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml b/test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml
deleted file mode 100644
index 729a2330c7e3..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/serviceaccount.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: ssh-bastion
-  namespace: ssh-bastion
diff --git a/test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config b/test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config
deleted file mode 100644
index 1f1b17167049..000000000000
--- a/test/extended/testdata/disaster-recovery/ssh-bastion/sshd_config
+++ /dev/null
@@ -1,18 +0,0 @@
-HostKey /etc/ssh/ssh_host_rsa_key
-HostKey /etc/ssh/ssh_host_ecdsa_key
-HostKey /etc/ssh/ssh_host_ed25519_key
-SyslogFacility AUTHPRIV
-PermitRootLogin no
-AuthorizedKeysFile /home/core/.ssh/authorized_keys
-PasswordAuthentication no
-ChallengeResponseAuthentication no
-GSSAPIAuthentication yes
-GSSAPICleanupCredentials no
-UsePAM yes
-X11Forwarding yes
-PrintMotd no
-AcceptEnv LANG LC_CTYPE LC_NUMERIC LC_TIME LC_COLLATE LC_MONETARY LC_MESSAGES
-AcceptEnv LC_PAPER LC_NAME LC_ADDRESS LC_TELEPHONE LC_MEASUREMENT
-AcceptEnv LC_IDENTIFICATION LC_ALL LANGUAGE
-AcceptEnv XMODIFIERS
-Subsystem sftp /usr/libexec/openssh/sftp-server
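Taken together, the removed ssh-bastion manifests stood up a privileged sshd pod behind a LoadBalancer service, with its host keys and sshd_config mounted from a secret named ssh-host-keys (see the deployment above). A rough bring-up sketch under those assumptions; the ssh-keygen steps are illustrative and not part of the removed testdata:

```bash
# Illustrative sketch: generate host keys, create the secret the deployment
# expects, then apply the manifests from this directory.
DIR=test/extended/testdata/disaster-recovery/ssh-bastion
oc apply -f $DIR/namespace.yaml
ssh-keygen -q -t rsa     -N '' -f ssh_host_rsa_key
ssh-keygen -q -t ecdsa   -N '' -f ssh_host_ecdsa_key
ssh-keygen -q -t ed25519 -N '' -f ssh_host_ed25519_key
oc -n ssh-bastion create secret generic ssh-host-keys \
  --from-file=ssh_host_rsa_key --from-file=ssh_host_ecdsa_key \
  --from-file=ssh_host_ed25519_key --from-file=sshd_config=$DIR/sshd_config
oc apply -f $DIR/
```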
diff --git a/test/extended/testdata/disaster-recovery/update_route_53.py b/test/extended/testdata/disaster-recovery/update_route_53.py
deleted file mode 100644
index a549aee25c5a..000000000000
--- a/test/extended/testdata/disaster-recovery/update_route_53.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import boto3
-import os
-import sys
-from time import sleep
-
-if len(sys.argv) < 4:
-    print("Usage: ./update_route_53.py ")
-    sys.exit(1)
-
-attempts = 10
-pause = 10
-
-domain = sys.argv[1]
-record = sys.argv[2]
-ip = sys.argv[3]
-print("record: %s" % record)
-print("ip: %s" % ip)
-
-client = boto3.client('route53')
-r = client.list_hosted_zones_by_name(DNSName=domain, MaxItems="1")
-zone_id = r['HostedZones'][0]['Id'].split('/')[-1]
-
-response = client.change_resource_record_sets(
-    HostedZoneId=zone_id,
-    ChangeBatch= {
-        'Comment': 'add %s -> %s' % (record, ip),
-        'Changes': [
-            {
-                'Action': 'UPSERT',
-                'ResourceRecordSet': {
-                    'Name': record,
-                    'Type': 'A',
-                    'TTL': 60,
-                    'ResourceRecords': [{'Value': ip}]
-                }
-            }]
-})
-for i in range(attempts):
-    print('response: %s' % response)
-    changeID = response['ChangeInfo']['Id']
-    if response['ChangeInfo']['Status'] == "INSYNC":
-        print('insync found, response: %s' % response)
-        break
-    print('waiting for response to complete')
-    sleep(pause)
-    response = client.get_change(Id=changeID)
diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go
index 652293d51469..f7602eb8cf8c 100644
--- a/test/extended/util/annotate/generated/zz_generated.annotations.go
+++ b/test/extended/util/annotate/generated/zz_generated.annotations.go
@@ -1697,9 +1697,9 @@ var annotations = map[string]string{
 	"[Top Level] [sig-etcd] etcd leader changes are not excessive": "leader changes are not excessive [Suite:openshift/conformance/parallel]",
 
-	"[Top Level] [sig-etcd][Feature:DisasterRecovery][Disruptive] [Feature:EtcdRecovery] Cluster should restore itself after quorum loss": "[Feature:EtcdRecovery] Cluster should restore itself after quorum loss [Serial]",
+	"[Top Level] [sig-etcd][Feature:DisasterRecovery][Disruptive] [Feature:EtcdRecovery] Cluster should recover from a backup taken on one node and recovered on another": "[Feature:EtcdRecovery] Cluster should recover from a backup taken on one node and recovered on another [Serial]",
 
-	"[Top Level] [sig-etcd][Feature:DisasterRecovery][Disruptive] [dr-etcd-snapshot] Cluster should restore itself from etcd snapshot": "[dr-etcd-snapshot] Cluster should restore itself from etcd snapshot [Serial]",
+	"[Top Level] [sig-etcd][Feature:DisasterRecovery][Disruptive] [Feature:EtcdRecovery] Cluster should restore itself after quorum loss": "[Feature:EtcdRecovery] Cluster should restore itself after quorum loss [Serial]",
 
 	"[Top Level] [sig-imageregistry][Feature:ImageAppend] Image append should create images by appending them": "should create images by appending them [Suite:openshift/conformance/parallel]",