diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go
index b40e5f71a8d7..5cc17f458a52 100644
--- a/test/extended/two_node/tnf_recovery.go
+++ b/test/extended/two_node/tnf_recovery.go
@@ -42,7 +42,7 @@ type hypervisorExtendedConfig struct {
 	HypervisorKnownHostsPath string
 }
 
-var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive] Two Node with Fencing etcd recovery", func() {
+var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Disruptive][Serial] Two Node with Fencing etcd recovery", func() {
 	defer g.GinkgoRecover()
 
 	var (
@@ -234,6 +234,27 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			&nodeB, true, false, // member on node B expected started == true, learner == false
 			membersHealthyAfterDoubleReboot, pollInterval)
 	})
+
+	g.It("should recover from etcd process crash [Skipped:KnownIssue]", func() {
+		// Kill the etcd process/container on one node to simulate a process
+		// crash and exercise Pacemaker's ability to detect and restart etcd.
+		// Currently skipped due to OCPBUGS-59238: a rapid podman-etcd restart
+		// fails on unpatched clusters.
+		g.GinkgoT().Printf("Randomly selected %s (%s) for etcd process crash and %s (%s) to survive\n",
+			targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)
+
+		g.By(fmt.Sprintf("Killing etcd process/container on %s", targetNode.Name))
+		_, err := util.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
+			"bash", "-c", "podman kill etcd 2>/dev/null")
+		o.Expect(err).To(o.BeNil(), "Expected to kill etcd process without command errors")
+
+		g.By("Waiting for cluster to recover - both nodes become started voting members")
+		// Retry validation every 45 seconds for up to 6 minutes (8 attempts).
+		validateEtcdRecoveryState(oc, etcdClientFactory,
+			&peerNode,
+			&targetNode, true, false, // targetNode expected started == true, learner == false
+			6*time.Minute, 45*time.Second)
+	})
 })
 
 func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {