From dd346ea10257247f3ff81accfc72fd843292992a Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 15 Feb 2021 16:08:52 -0800 Subject: [PATCH 1/2] dr: Remove unused fns getPullSecret and GetImagePullSpecFromRelease --- test/extended/dr/quorum_restore.go | 43 ------------------------------ 1 file changed, 43 deletions(-) diff --git a/test/extended/dr/quorum_restore.go b/test/extended/dr/quorum_restore.go index 7b26434d6402..de640076db06 100644 --- a/test/extended/dr/quorum_restore.go +++ b/test/extended/dr/quorum_restore.go @@ -3,7 +3,6 @@ package dr import ( "context" "fmt" - "io/ioutil" "math/rand" "strings" "time" @@ -302,48 +301,6 @@ func scaleEtcdQuorum(client kubernetes.Interface, replicas int) error { return nil } -func getPullSecret(oc *exutil.CLI) string { - framework.Logf("Saving image pull secret") - //TODO: copy of test/extended/operators/images.go, move this to a common func - imagePullSecret, err := oc.KubeFramework().ClientSet.CoreV1().Secrets("openshift-config").Get(context.Background(), "pull-secret", metav1.GetOptions{}) - o.Expect(err).NotTo(o.HaveOccurred()) - if err != nil { - framework.Failf("unable to get pull secret for cluster: %v", err) - } - - // cache file to local temp location - imagePullFile, err := ioutil.TempFile("", "image-pull-secret") - if err != nil { - framework.Failf("unable to create a temporary file: %v", err) - } - - // write the content - imagePullSecretBytes := imagePullSecret.Data[".dockerconfigjson"] - if _, err := imagePullFile.Write(imagePullSecretBytes); err != nil { - framework.Failf("unable to write pull secret to temp file: %v", err) - } - if err := imagePullFile.Close(); err != nil { - framework.Failf("unable to close file: %v", err) - } - framework.Logf("Image pull secret: %s", imagePullFile.Name()) - return imagePullFile.Name() -} - -func getImagePullSpecFromRelease(oc *exutil.CLI, imagePullSecretPath, imageName string) string { - var image string - err := wait.PollImmediate(5*time.Second, 2*time.Minute, func() (bool, error) { - location, err := oc.Run("adm", "release", "info").Args("--image-for", imageName, "--registry-config", imagePullSecretPath).Output() - if err != nil { - framework.Logf("Unable to find release info, retrying: %v", err) - return false, nil - } - image = location - return true, nil - }) - o.Expect(err).NotTo(o.HaveOccurred()) - return image -} - func getMachineNameByNodeName(oc *exutil.CLI, name string) string { masterNode, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), name, metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) From 2f82ad20b1c9498685ad5ef5cb5868ede9e79824 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Thu, 18 Mar 2021 07:28:18 -0700 Subject: [PATCH 2/2] dr: Get quorum restore passing - ensure the qr test follows the doc'd post-restore procedure - ensure reliable execution --- cmd/openshift-tests/e2e.go | 3 +- test/extended/dr/backup_restore.go | 19 ++++ test/extended/dr/force_redeploy.go | 41 ++++++--- test/extended/dr/quorum_restore.go | 134 ++++++++++++++++------------- 4 files changed, 121 insertions(+), 76 deletions(-) diff --git a/cmd/openshift-tests/e2e.go b/cmd/openshift-tests/e2e.go index 13fc16f0239f..b60a0cfda18c 100644 --- a/cmd/openshift-tests/e2e.go +++ b/cmd/openshift-tests/e2e.go @@ -106,7 +106,8 @@ var staticSuites = testSuites{ return strings.Contains(name, "[Feature:EtcdRecovery]") || strings.Contains(name, "[Feature:NodeRecovery]") || isStandardEarlyTest(name) }, - TestTimeout: 60 * time.Minute, + // Duration of the quorum restore test exceeds 60 
minutes. + TestTimeout: 90 * time.Minute, SyntheticEventTests: ginkgo.JUnitForEventsFunc(synthetictests.SystemEventInvariants), }, PreSuite: suiteWithProviderPreSuite, diff --git a/test/extended/dr/backup_restore.go b/test/extended/dr/backup_restore.go index f5f318f4a8f0..693b6bbbcf3d 100644 --- a/test/extended/dr/backup_restore.go +++ b/test/extended/dr/backup_restore.go @@ -13,8 +13,10 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/kubernetes/test/e2e/framework" exutil "github.com/openshift/origin/test/extended/util" @@ -287,3 +289,20 @@ func waitForReadyEtcdPods(client kubernetes.Interface, masterCount int) { 40*time.Minute, ) } + +func waitForPodsTolerateClientTimeout(c corev1client.PodInterface, label labels.Selector, predicate func(corev1.Pod) bool, count int, timeout time.Duration) { + err := wait.Poll(10*time.Second, timeout, func() (bool, error) { + p, e := exutil.GetPodNamesByFilter(c, label, predicate) + if e != nil { + framework.Logf("Saw an error waiting for etcd pods to become available: %v", e) + // TODO tolerate transient etcd timeout only and fail other errors + return false, nil + } + if len(p) != count { + framework.Logf("Only %d of %d expected pods are ready", len(p), count) + return false, nil + } + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) +} diff --git a/test/extended/dr/force_redeploy.go b/test/extended/dr/force_redeploy.go index e5db7fa023c8..61e1dc500a91 100644 --- a/test/extended/dr/force_redeploy.go +++ b/test/extended/dr/force_redeploy.go @@ -103,31 +103,48 @@ func forceOperandRedeployment(client operatorv1client.OperatorV1Interface) { }, }, } + + // Initiate redeployment of all operands without waiting so that + // redeployment is performed in parallel. + latestRevisions := []int32{} for _, client := range clients { + // Retrieve the LatestAvailableRevision before rolling out to know + // what revision not to look for in the subsequent check for + // rollout success. + latestRevision := latestAvailableRevision(client) + latestRevisions = append(latestRevisions, latestRevision) + forceRedeployOperand(client) } + + // Wait for redeployment of operands + for i, client := range clients { + latestRevision := latestRevisions[i] + g.By(fmt.Sprintf("Waiting for %s to be updated on all nodes to a revision greater than %d", client, latestRevision)) + waitForRollout(client, latestRevision) + framework.Logf("Rollout complete for %s", client) + } } -// forceRedeployOperand initiates redeployment of an operand and waits for a -// successful rollout. -func forceRedeployOperand(client *operatorConfigClient) { - // Retrieve the LatestAvailableRevision before rolling out to know - // what revision not to look for in the subsequent check for - // rollout success. 
+func latestAvailableRevision(client *operatorConfigClient) int32 { g.By(fmt.Sprintf("Finding LatestAvailableRevision for %s", client)) - var latestAvailableRevision int32 + var revision int32 err := wait.PollImmediate(redeployWaitInterval, redeployWaitTimeout, func() (done bool, err error) { status, err := client.getStatus(context.Background(), "cluster", metav1.GetOptions{}) if err != nil { framework.Logf("Error retrieving %s operator status: %v", client, err) } else { - latestAvailableRevision = status.LatestAvailableRevision + revision = status.LatestAvailableRevision } return err == nil, nil }) o.Expect(err).NotTo(o.HaveOccurred()) - framework.Logf("LatestAvailableRevision for %s is %d", client, latestAvailableRevision) + framework.Logf("LatestAvailableRevision for %s is %d", client, revision) + return revision +} +// forceRedeployOperand initiates forced redeployment of an operand. +func forceRedeployOperand(client *operatorConfigClient) { // Ensure a unique forceRedeploymentReason for each test run to // ensure rollout is always triggered even if running repeatedly // against the same cluster (as when debugging). @@ -135,7 +152,7 @@ func forceRedeployOperand(client *operatorConfigClient) { g.By(fmt.Sprintf("Forcing redeployment of %s", client)) data := fmt.Sprintf(`{"spec": {"forceRedeploymentReason": "%s"}}`, reason) - err = wait.PollImmediate(redeployWaitInterval, redeployWaitTimeout, func() (done bool, err error) { + err := wait.PollImmediate(redeployWaitInterval, redeployWaitTimeout, func() (done bool, err error) { err = client.patch(context.Background(), "cluster", types.MergePatchType, []byte(data), metav1.PatchOptions{}) if err != nil { framework.Logf("Error patching %s operator status to set redeploy reason: %v", client, err) @@ -143,10 +160,6 @@ func forceRedeployOperand(client *operatorConfigClient) { return err == nil, nil }) o.Expect(err).NotTo(o.HaveOccurred()) - - g.By(fmt.Sprintf("Waiting for %s to be updated on all nodes to a revision greater than %d", client, latestAvailableRevision)) - waitForRollout(client, latestAvailableRevision) - framework.Logf("Rollout complete for %s", client) } // waitForRollout waits for an operator status to indicate that all nodes are diff --git a/test/extended/dr/quorum_restore.go b/test/extended/dr/quorum_restore.go index de640076db06..3467d158c214 100644 --- a/test/extended/dr/quorum_restore.go +++ b/test/extended/dr/quorum_restore.go @@ -10,17 +10,13 @@ import ( g "github.com/onsi/ginkgo" o "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" - corev1client "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/client-go/rest" "k8s.io/kubernetes/test/e2e/framework" e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" @@ -55,9 +51,9 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { oc := exutil.NewCLIWithoutNamespace("disaster-recovery") + // Validate backing up and restoring to the same node on a cluster + // that has lost quorum after the backup was taken. 
g.It("[Feature:EtcdRecovery] Cluster should restore itself after quorum loss", func() { - e2eskipper.Skipf("Test is disabled pending a fix https://github.com/openshift/origin/pull/25774") - config, err := framework.LoadConfig() o.Expect(err).NotTo(o.HaveOccurred()) dynamicClient := dynamic.NewForConfigOrDie(config) @@ -103,6 +99,12 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { survivingMachine, err := ms.Get(context.Background(), survivingMachineName, metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) + // The backup script only supports taking a backup of a cluster + // that still has quorum, so the backup must be performed before + // quorum-destroying machine deletion. + framework.Logf("Perform etcd backup on node %s (machine %s) while quorum still exists", survivingNodeName, survivingMachineName) + execOnNodeOrFail(survivingNode, "sudo -i /bin/bash -cx 'rm -rf /home/core/backup && /usr/local/bin/cluster-backup.sh /home/core/backup'") + framework.Logf("Destroy %d masters", len(masters)-1) var masterMachines []string for _, node := range masters { @@ -117,6 +119,13 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { err = ms.Delete(context.Background(), masterMachine, metav1.DeleteOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) } + + // All API calls for the remainder of the test should be performed in + // a polling loop to insure against transient failures. API calls can + // only be assumed to succeed without polling against a healthy + // cluster, and only at the successful exit of this function will + // that once again be the case. + pollConfig := rest.CopyConfig(config) pollConfig.Timeout = 5 * time.Second pollClient, err := kubernetes.NewForConfig(pollConfig) @@ -146,24 +155,51 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { }) } - framework.Logf("Perform etcd backup on remaining machine %s (machine %s)", survivingNodeName, survivingMachineName) - // Need to supply --force to the backup script to avoid failing on the api check for progressing operators. - execOnNodeOrFail(survivingNode, "sudo -i /bin/bash -cx 'rm -rf /home/core/backup; /usr/local/bin/cluster-backup.sh --force ~core/backup'") + // Recovery 7 + restoreFromBackup(survivingNode) - framework.Logf("Restore etcd and control-plane on remaining node %s (machine %s)", survivingNodeName, survivingMachineName) - execOnNodeOrFail(survivingNode, "sudo -i /bin/bash -cx '/usr/local/bin/cluster-restore.sh /home/core/backup'") + // Recovery 8 + restartKubelet(survivingNode) - framework.Logf("Wait for API server to come up") - time.Sleep(30 * time.Second) - err = wait.Poll(30*time.Second, 30*time.Minute, func() (done bool, err error) { - nodes, err := pollClient.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{Limit: 2}) - if err != nil || nodes.Items == nil { - framework.Logf("return false - err %v nodes.Items %v", err, nodes.Items) - return false, nil + // Recovery 9a, 9b + waitForAPIServer(oc.AdminKubeClient(), survivingNode) + + // Restoring brings back machines and nodes deleted since the + // backup was taken. Those machines and nodes need to be removed + // before they can be created again. + // + // TODO(marun) Ensure the mechanics of node replacement around + // disaster recovery are documented. 
+ for _, master := range masterMachines { + if master == survivingMachineName { + continue } - return true, nil - }) - o.Expect(err).NotTo(o.HaveOccurred()) + err := wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) { + framework.Logf("Initiating deletion of machine removed after the backup was taken: %s", master) + err := ms.Delete(context.Background(), master, metav1.DeleteOptions{}) + if err != nil && !errors.IsNotFound(err) { + framework.Logf("Error seen when attempting to remove restored machine %s: %v", master, err) + return false, nil + } + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + } + for _, node := range masters { + if node.Name == survivingNodeName { + continue + } + err := wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) { + framework.Logf("Initiating deletion of node removed after the backup was taken: %s", node.Name) + err := oc.AdminKubeClient().CoreV1().Nodes().Delete(context.Background(), node.Name, metav1.DeleteOptions{}) + if err != nil && !errors.IsNotFound(err) { + framework.Logf("Error seen when attempting to remove restored node %s: %v", node.Name, err) + return false, nil + } + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) + } if expectedNumberOfMasters == 1 { framework.Logf("Cannot create new masters, you must manually create masters and update their DNS entries according to the docs") @@ -239,22 +275,23 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { o.Expect(err).NotTo(o.HaveOccurred()) } - framework.Logf("Force new revision of etcd-pod") - _, err = oc.AdminOperatorClient().OperatorV1().Etcds().Patch(context.Background(), "cluster", types.MergePatchType, []byte(`{"spec": {"forceRedeploymentReason": "recover-etcd"}}`), metav1.PatchOptions{}) - o.Expect(err).NotTo(o.HaveOccurred()) - - framework.Logf("Force new revision of kube-apiserver") - _, err = oc.AdminOperatorClient().OperatorV1().KubeAPIServers().Patch(context.Background(), "cluster", types.MergePatchType, []byte(`{"spec": {"forceRedeploymentReason": "recover-kube-apiserver"}}`), metav1.PatchOptions{}) - o.Expect(err).NotTo(o.HaveOccurred()) + // Recovery 10,11,12 + forceOperandRedeployment(oc.AdminOperatorClient().OperatorV1()) // Recovery 13 waitForReadyEtcdPods(oc.AdminKubeClient(), expectedNumberOfMasters) - scaleEtcdQuorum(pollClient, expectedNumberOfMasters) + // Scale quorum guard in a polling loop to ensure tolerance for disruption + err = wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) { + err := scaleEtcdQuorum(pollClient, int32(expectedNumberOfMasters)) + if err != nil { + framework.Logf("Saw an error attempting to scale etcd quorum guard: %v", err) + return false, nil + } + return true, nil + }) + o.Expect(err).NotTo(o.HaveOccurred()) - // Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1707006# - // SDN won't switch to Degraded mode when service is down after disaster recovery - // restartSDNPods(oc) waitForMastersToUpdate(oc, mcps) waitForOperatorsToSettle() }) @@ -262,43 +299,18 @@ var _ = g.Describe("[sig-etcd][Feature:DisasterRecovery][Disruptive]", func() { ) }) -func waitForPodsTolerateClientTimeout(c corev1client.PodInterface, label labels.Selector, predicate func(corev1.Pod) bool, count int, timeout time.Duration) { - err := wait.Poll(10*time.Second, timeout, func() (bool, error) { - p, e := exutil.GetPodNamesByFilter(c, label, predicate) - if e != nil { - framework.Logf("Saw an error waiting for etcd pods to become available: %v", e) - // TODO 
tolerate transient etcd timeout only and fail other errors - return false, nil - } - if len(p) != count { - return false, nil - } - return true, nil - }) - o.Expect(err).NotTo(o.HaveOccurred()) -} - -func scaleEtcdQuorum(client kubernetes.Interface, replicas int) error { +func scaleEtcdQuorum(client kubernetes.Interface, replicas int32) error { etcdQGScale, err := client.AppsV1().Deployments("openshift-etcd").GetScale(context.Background(), "etcd-quorum-guard", metav1.GetOptions{}) if err != nil { return err } - if etcdQGScale.Spec.Replicas == int32(replicas) { + if etcdQGScale.Spec.Replicas == replicas { return nil } framework.Logf("Scale etcd-quorum-guard to %d replicas", replicas) - etcdQGScale.Spec.Replicas = int32(replicas) + etcdQGScale.Spec.Replicas = replicas _, err = client.AppsV1().Deployments("openshift-etcd").UpdateScale(context.Background(), "etcd-quorum-guard", etcdQGScale, metav1.UpdateOptions{}) - if err != nil { - return err - } - - etcdQGScale, err = client.AppsV1().Deployments("openshift-etcd").GetScale(context.Background(), "etcd-quorum-guard", metav1.GetOptions{}) - if err != nil { - return err - } - o.Expect(etcdQGScale.Spec.Replicas).To(o.Equal(int32(replicas))) - return nil + return err } func getMachineNameByNodeName(oc *exutil.CLI, name string) string {
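
The core reliability pattern the second patch applies throughout is to wrap every API call made after quorum loss in a polling loop, since the apiserver may be intermittently unavailable until the restore settles. Below is a minimal standalone sketch of that pattern, not part of the patches above: it assumes a kubernetes.Interface client, and deleteNodeTolerantly is a hypothetical helper name chosen for illustration.

package dr

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"
)

// deleteNodeTolerantly retries deletion of a node until it succeeds or the
// node is already gone, tolerating the transient API errors expected while
// the control plane is still recovering from a restore. Only exhausting the
// timeout surfaces an error to the caller.
func deleteNodeTolerantly(client kubernetes.Interface, name string) error {
	return wait.PollImmediate(5*time.Second, 5*time.Minute, func() (bool, error) {
		err := client.CoreV1().Nodes().Delete(context.Background(), name, metav1.DeleteOptions{})
		if err != nil && !errors.IsNotFound(err) {
			// Log the transient failure and retry on the next poll interval.
			framework.Logf("Retrying deletion of node %s: %v", name, err)
			return false, nil
		}
		return true, nil
	})
}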