Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Simplify check
  • Loading branch information
fonta-rh committed Oct 23, 2025
commit cff85ef32b49fd81d63074c37db6d862f912e1c8
23 changes: 5 additions & 18 deletions test/extended/two_node/tnf_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
g.It("should recover from etcd process crash", func() {
// Note: This test kills the etcd process/container on one node to simulate
// a process crash, testing Pacemaker's ability to detect and restart etcd
survivedNode := peerNode
g.GinkgoT().Printf("Randomly selected %s (%s) for etcd process crash and %s (%s) to survive\n",
targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)

Expand All @@ -248,25 +247,13 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
"bash", "-c", "podman kill etcd 2>/dev/null || pkill -9 etcd 2>/dev/null || systemctl stop etcd 2>/dev/null || true")
o.Expect(err).To(o.BeNil(), "Expected to kill etcd process without command errors")

g.By("Waiting for Pacemaker to detect etcd failure and begin recovery")
// Give Pacemaker time to detect the failure and start recovery
time.Sleep(30 * time.Second)

g.By(fmt.Sprintf("Ensuring %s becomes leader and %s rejoins as learner", peerNode.Name, targetNode.Name))
validateEtcdRecoveryState(etcdClientFactory,
&survivedNode, true, false, // survivedNode expected started == true, learner == false
&targetNode, false, true, // targetNode expected started == false, learner == true
memberIsLeaderTimeout, pollInterval)

g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
validateEtcdRecoveryState(etcdClientFactory,
&survivedNode, true, false, // survivedNode expected started == true, learner == false
&targetNode, true, true, // targetNode expected started == true, learner == true
memberRejoinedLearnerTimeout, pollInterval)
g.By("Waiting for cluster to recover - both nodes become started voting members")
// Wait for Pacemaker to detect failure, restart etcd, and complete full recovery
time.Sleep(5 * time.Minute)

g.By(fmt.Sprintf("Ensuring %s is promoted back to voting member", targetNode.Name))
g.By("Ensuring both nodes are started and voting members after recovery")
validateEtcdRecoveryState(etcdClientFactory,
&survivedNode, true, false, // survivedNode expected started == true, learner == false
&peerNode, true, false, // peerNode expected started == true, learner == false
&targetNode, true, false, // targetNode expected started == true, learner == false
memberPromotedVotingTimeout, pollInterval)

Expand Down