Commits (29; changes shown from 28)
72e92c8
Improve etcd recovery tests for dual-replica
clobrano Jul 14, 2025
a8db197
OCPEDGE-1484: [TNF] Added kubelet failure tests in two-node recovery …
dhensel-rh Oct 17, 2025
baf1ea1
Fixing tab vs. space in common.go file
dhensel-rh Oct 27, 2025
fbd1f26
improving logging
dhensel-rh Nov 12, 2025
399994d
OCPEDGE-1484: [TNF] Added kubelet failure tests in two-node recovery …
dhensel-rh Oct 17, 2025
8131f4c
improving logging
dhensel-rh Nov 12, 2025
11342e1
Fix build errors and enhance kubelet service detection
dhensel-rh Nov 24, 2025
9527b8a
Fix monitoring operator timeout by using essential operator validation
dhensel-rh Nov 25, 2025
2f3b1e1
Fix constraint ID parsing to extract actual ID instead of 'resource'
dhensel-rh Nov 25, 2025
89f5d04
Add extensive debug logging to constraint ID parsing
dhensel-rh Dec 1, 2025
9634130
Add constraint ID validation to prevent invalid results
dhensel-rh Dec 1, 2025
2968671
Fix make update issues and enhance constraint ID parsing
dhensel-rh Dec 1, 2025
f7054e9
Enhance IsServiceRunning with comprehensive debugging and multiple fa…
dhensel-rh Dec 1, 2025
642e661
Simplify IsServiceRunning to use only primary systemctl is-active check
dhensel-rh Dec 1, 2025
45feb67
changes to common.go
dhensel-rh Dec 1, 2025
6f3e7f2
Remove tnf_recovery.go and revisionbump README.md
dhensel-rh Dec 1, 2025
b0374c0
removing etcd_status_test.go
dhensel-rh Dec 1, 2025
d453998
Fix missing constants in tnf_kubelet_disruption.go
dhensel-rh Dec 1, 2025
56973f6
add change to see if kubelet is active
dhensel-rh Dec 2, 2025
66227e1
revert changes
dhensel-rh Dec 2, 2025
8159316
removing extra file
dhensel-rh Dec 2, 2025
31d0519
adding file back
dhensel-rh Dec 2, 2025
03a414a
removing some constants that are not needed
dhensel-rh Dec 4, 2025
9688854
change code to use ban instead of contraintId
dhensel-rh Dec 8, 2025
acb113a
change to fix kubelet test from immediately failing
dhensel-rh Dec 8, 2025
7d371c7
addressing check nodeReady state from Egli
dhensel-rh Dec 8, 2025
41ac7c2
Adding changes from Egli feedback and improving logging
dhensel-rh Dec 10, 2025
692751b
changing how I interact with nodes
dhensel-rh Dec 11, 2025
dce6692
cleaning up redundant logging
dhensel-rh Dec 11, 2025
299 changes: 299 additions & 0 deletions test/extended/two_node/tnf_kubelet_disruption.go
@@ -0,0 +1,299 @@
package two_node

import (
"context"
"fmt"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
v1 "github.com/openshift/api/config/v1"
"github.com/openshift/origin/test/extended/etcd/helpers"
"github.com/openshift/origin/test/extended/two_node/utils"
"github.com/openshift/origin/test/extended/util"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
nodeutil "k8s.io/kubernetes/pkg/util/node"
"k8s.io/kubernetes/test/e2e/framework"
)

const (
kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios
kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore
kubeletPollInterval = 10 * time.Second // Poll interval for kubelet status checks
kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop
)

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node with Fencing cluster", func() {
defer g.GinkgoRecover()

var (
oc = util.NewCLIWithoutNamespace("").SetNamespace("openshift-etcd").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
)

g.BeforeEach(func() {
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)

kubeClient := oc.KubeClient()
etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient)

g.By("Verifying comprehensive etcd cluster status before starting kubelet disruption test")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "BeforeEach validation", etcdClientFactory)
}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy before starting test")

g.By("Validating essential operators are available before kubelet disruption")
o.Eventually(func() error {
return utils.ValidateEssentialOperatorsAvailable(oc)
}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available before kubelet disruption")
})

g.AfterEach(func() {
// Cleanup: Remove any resource bans that may have been created during the test
// This ensures the device under test is in the same state the test started in
nodeList, err := utils.GetNodes(oc, utils.AllNodes)
if err != nil {
framework.Logf("Critical: Failed to retrieve nodes for cleanup - test isolation cannot be guaranteed: %v", err)
return
}

// Track if node count is unexpected, but attempt cleanup anyway
nodeCountUnexpected := len(nodeList.Items) != 2
if nodeCountUnexpected {
framework.Logf("Warning: Expected 2 nodes but found %d - attempting cleanup anyway", len(nodeList.Items))
}

// Attempt cleanup if we have at least 1 node
if len(nodeList.Items) >= 1 {
// Use the last available node for cleanup commands (prefer second node if available)
cleanupNode := nodeList.Items[0]
if len(nodeList.Items) >= 2 {
cleanupNode = nodeList.Items[1]
}

g.By("Cleanup: Clearing any kubelet resource bans that may exist")
framework.Logf("Cleanup: Clearing all bans and failures for kubelet-clone resource using node %s", cleanupNode.Name)
cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, "kubelet-clone")
if cleanupErr != nil {
framework.Logf("Warning: Failed to clear kubelet-clone resource during cleanup: %v (this is expected if no bans were active)", cleanupErr)
} else {
framework.Logf("Successfully cleared all bans and failures for kubelet-clone resource during cleanup")
}

g.By("Cleanup: Waiting for all nodes to become Ready after resource ban cleanup")
for _, node := range nodeList.Items {
// Use a non-blocking check with logging instead of assertion
ready := false
for i := 0; i < int(kubeletRestoreTimeout/kubeletPollInterval); i++ {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
if err == nil && nodeutil.IsNodeReady(nodeObj) {
ready = true
framework.Logf("Node %s is Ready after cleanup", node.Name)
break
}
time.Sleep(kubeletPollInterval)
}
if !ready {
framework.Logf("Warning: Node %s did not become Ready within timeout after cleanup", node.Name)
}
}

g.By("Cleanup: Validating etcd cluster status after test cleanup")
// Use a non-blocking check with logging instead of assertion
etcdHealthy := false
for i := 0; i < int(kubeletRestoreTimeout/kubeletPollInterval); i++ {
if err := utils.LogEtcdClusterStatus(oc, "AfterEach cleanup", etcdClientFactory); err == nil {
etcdHealthy = true
framework.Logf("Etcd cluster is healthy after cleanup")
break
}
time.Sleep(kubeletPollInterval)
}
if !etcdHealthy {
framework.Logf("Warning: Etcd cluster did not become healthy within timeout after cleanup")
}
} else {
framework.Logf("Critical: Cannot perform cleanup - no nodes available")
}

// Log node count issue but don't fail - cleanup should always complete
if nodeCountUnexpected {
framework.Logf("Warning: Expected exactly 2 nodes for two-node cluster but found %d - cluster topology may have changed during test", len(nodeList.Items))
}
})

g.It("Should recover from single node kubelet service disruption", func() {
Review comment (Contributor):

Let's lowercase these "Should"s; don't forget that this word ends up in the middle of the test title.

"Should" -> "should"

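For illustration only, a hypothetical snippet (not part of this diff) showing the suggested rename; Ginkgo joins the Describe and It texts, so the full test name would read "... Two Node with Fencing cluster should recover from single node kubelet service disruption":

// Hypothetical rename suggested by the reviewer; the test body is unchanged.
g.It("should recover from single node kubelet service disruption", func() {
	// ...
})
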
nodeList, err := utils.GetNodes(oc, utils.AllNodes)
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")

nodes := nodeList.Items
framework.Logf("Found nodes: %s and %s for kubelet disruption test", nodes[0].Name, nodes[1].Name)

g.By("Ensuring both nodes are healthy before starting kubelet disruption test")
for _, node := range nodes {
o.Expect(nodeutil.IsNodeReady(&node)).To(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name))
}

targetNode := nodes[0]
survivingNode := nodes[1]

g.By(fmt.Sprintf("Banning kubelet resource from node: %s", targetNode.Name))
err = utils.AddConstraint(oc, survivingNode.Name, "kubelet-clone", targetNode.Name)
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to ban kubelet resource from node %s without errors", targetNode.Name))

// Register cleanup to ensure ban is removed even if test fails
g.DeferCleanup(func() {
framework.Logf("DeferCleanup: Ensuring kubelet-clone ban is removed")
cleanupErr := utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
if cleanupErr != nil {
framework.Logf("DeferCleanup: Warning: Failed to clear kubelet-clone ban: %v (this is expected if already cleared)", cleanupErr)
} else {
framework.Logf("DeferCleanup: Successfully cleared kubelet-clone ban")
}
})

g.By(fmt.Sprintf("Checking that node %s is not in state Ready due to kubelet resource ban", targetNode.Name))
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), targetNode.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", targetNode.Name, err)
return false
}
return !nodeutil.IsNodeReady(nodeObj)
}, kubeletDisruptionTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s is not in state Ready after kubelet resource ban is applied", targetNode.Name))

g.By(fmt.Sprintf("Ensuring surviving node %s remains Ready during kubelet disruption", survivingNode.Name))
o.Consistently(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), survivingNode.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", survivingNode.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, 2*time.Minute, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Surviving node %s should remain Ready during kubelet disruption", survivingNode.Name))

g.By("Validating etcd cluster remains healthy with surviving node")
o.Consistently(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name)
}, kubeletDisruptionTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name))

g.By("Clearing kubelet resource bans to allow normal operation")
err = utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
o.Expect(err).To(o.BeNil(), "Expected to clear kubelet resource bans without errors")

g.By("Waiting for target node to become Ready after kubelet resource unban")
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), targetNode.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", targetNode.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should become Ready after kubelet resource ban removal", targetNode.Name))

g.By("Validating both nodes are Ready after kubelet resource ban removal")
for _, node := range nodes {
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", node.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready after kubelet resource ban removal", node.Name))
}

g.By("Validating comprehensive etcd cluster recovery after kubelet resource ban removal")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "resource ban removal recovery", etcdClientFactory)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy after kubelet resource ban removal")

g.By("Ensuring both etcd members are healthy after kubelet resource ban removal")
for _, node := range nodes {
o.Eventually(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, node.Name)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should be healthy after kubelet resource ban removal", node.Name))
}

g.By("Validating essential operators recovery after kubelet resource ban disruption")
o.Eventually(func() error {
return utils.ValidateEssentialOperatorsAvailable(oc)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available after kubelet resource ban removal")
})

g.It("Should properly stop kubelet service and verify automatic restart on target node", func() {
Review comment (Contributor):

Same here,
"Should" -> "should"

nodeList, err := utils.GetNodes(oc, utils.AllNodes)
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")

nodes := nodeList.Items
framework.Logf("Found nodes: %s and %s for kubelet disruption test", nodes[0].Name, nodes[1].Name)

g.By("Ensuring both nodes are healthy before starting kubelet disruption test")
for _, node := range nodes {
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", node.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, nodeIsHealthyTimeout, pollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name))
}

targetNode := nodes[0]
survivingNode := nodes[1]

g.By(fmt.Sprintf("Verifying kubelet service is initially running on target node: %s", targetNode.Name))
o.Eventually(func() bool {
return utils.IsServiceRunning(oc, targetNode.Name, "kubelet")
}, kubeletGracePeriod, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should be running initially on node %s", targetNode.Name))

g.By(fmt.Sprintf("Stopping kubelet service on target node: %s", targetNode.Name))
err = utils.StopKubeletService(oc, targetNode.Name)
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to stop kubelet service on node %s without errors", targetNode.Name))

g.By("Validating etcd cluster eventually becomes healthy with surviving node during kubelet disruption")
o.Eventually(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name)
}, kubeletDisruptionTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet service disruption", survivingNode.Name))

g.By("Waiting for kubelet service to automatically restart on target node")
o.Eventually(func() bool {
return utils.IsServiceRunning(oc, targetNode.Name, "kubelet")
Review comment (Contributor):

I'm not sure this will work: once you stop the kubelet with systemd, does it start back up? If the kubelet is not up, I don't think you'll be able to run a debug pod on that node; you'll need to use the SSH framework that Jeremy implemented in order to check and restart the kubelet.

Ideally you would want to crash the kubelet to test this out, so that the always-restart stanza of kubelet.service kicks in. https://github.com/openshift/machine-config-operator/blob/main/templates/master/01-master-kubelet/_base/units/kubelet.service.yaml#L48-L49

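A minimal sketch of the crash-based approach the reviewer describes, assuming a hypothetical SSH helper such as utils.RunSSHCommand (the repository's actual SSH framework may expose a different API); this is an illustration, not part of the diff:

// Crash kubelet with SIGKILL so systemd treats the exit as a failure and the
// Restart=always stanza of kubelet.service restarts it, unlike "systemctl stop",
// which systemd treats as an intentional stop.
g.By(fmt.Sprintf("Crashing kubelet on target node: %s", targetNode.Name))
_, err = utils.RunSSHCommand(targetNode.Name, "sudo pkill -9 -x kubelet") // hypothetical SSH helper
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to crash kubelet over SSH")

// Poll over SSH rather than via a debug pod, since the node cannot schedule pods
// while its kubelet is down. (Requires "strings" in the import block.)
o.Eventually(func() bool {
	out, sshErr := utils.RunSSHCommand(targetNode.Name, "systemctl is-active kubelet") // hypothetical SSH helper
	return sshErr == nil && strings.TrimSpace(out) == "active"
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(),
	fmt.Sprintf("Kubelet should be restarted by systemd on node %s after being killed", targetNode.Name))
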
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should automatically restart on node %s", targetNode.Name))

g.By("Validating both nodes are Ready after kubelet service automatic restart")
for _, node := range nodes {
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", node.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready after kubelet automatic restart", node.Name))
}

g.By("Ensuring both etcd members are healthy after kubelet service automatic restart")
for _, node := range nodes {
o.Eventually(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, node.Name)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should be healthy after kubelet automatic restart", node.Name))
}

g.By("Validating comprehensive etcd cluster recovery after kubelet service automatic restart")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "kubelet service restart recovery", etcdClientFactory)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy after kubelet automatic restart")

g.By("Validating essential operators recovery after kubelet service automatic restart")
o.Eventually(func() error {
return utils.ValidateEssentialOperatorsAvailable(oc)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available after kubelet automatic restart")
})

})