Commits (29)
72e92c8
Improve etcd recovery tests for dual-replica
clobrano Jul 14, 2025
a8db197
OCPEDGE-1484: [TNF] Added kubelet failure tests in two-node recovery …
dhensel-rh Oct 17, 2025
baf1ea1
Fixing tab vs. space in common.go file
dhensel-rh Oct 27, 2025
fbd1f26
improving logging
dhensel-rh Nov 12, 2025
399994d
OCPEDGE-1484: [TNF] Added kubelet failure tests in two-node recovery …
dhensel-rh Oct 17, 2025
8131f4c
improving logging
dhensel-rh Nov 12, 2025
11342e1
Fix build errors and enhance kubelet service detection
dhensel-rh Nov 24, 2025
9527b8a
Fix monitoring operator timeout by using essential operator validation
dhensel-rh Nov 25, 2025
2f3b1e1
Fix constraint ID parsing to extract actual ID instead of 'resource'
dhensel-rh Nov 25, 2025
89f5d04
Add extensive debug logging to constraint ID parsing
dhensel-rh Dec 1, 2025
9634130
Add constraint ID validation to prevent invalid results
dhensel-rh Dec 1, 2025
2968671
Fix make update issues and enhance constraint ID parsing
dhensel-rh Dec 1, 2025
f7054e9
Enhance IsServiceRunning with comprehensive debugging and multiple fa…
dhensel-rh Dec 1, 2025
642e661
Simplify IsServiceRunning to use only primary systemctl is-active check
dhensel-rh Dec 1, 2025
45feb67
changes to common.go
dhensel-rh Dec 1, 2025
6f3e7f2
Remove tnf_recovery.go and revisionbump README.md
dhensel-rh Dec 1, 2025
b0374c0
removing etcd_status_test.go
dhensel-rh Dec 1, 2025
d453998
Fix missing constants in tnf_kubelet_disruption.go
dhensel-rh Dec 1, 2025
56973f6
add change to see if kubelet is active
dhensel-rh Dec 2, 2025
66227e1
revert changes
dhensel-rh Dec 2, 2025
8159316
removing extra file
dhensel-rh Dec 2, 2025
31d0519
adding file back
dhensel-rh Dec 2, 2025
03a414a
removing some constants that are not needed
dhensel-rh Dec 4, 2025
9688854
change code to use ban instead of contraintId
dhensel-rh Dec 8, 2025
acb113a
change to fix kubelet test from immediately failing
dhensel-rh Dec 8, 2025
7d371c7
addressing check nodeReady state from Egli
dhensel-rh Dec 8, 2025
41ac7c2
Adding changes from Egli feedback and improving logging
dhensel-rh Dec 10, 2025
692751b
changing how I interact with nodes
dhensel-rh Dec 11, 2025
dce6692
cleaning up redundant logging
dhensel-rh Dec 11, 2025
279 changes: 279 additions & 0 deletions test/extended/two_node/tnf_kubelet_disruption.go
@@ -0,0 +1,279 @@
package two_node

import (
"context"
"fmt"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"
v1 "github.com/openshift/api/config/v1"
"github.com/openshift/origin/test/extended/etcd/helpers"
"github.com/openshift/origin/test/extended/two_node/utils"
"github.com/openshift/origin/test/extended/util"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
nodeutil "k8s.io/kubernetes/pkg/util/node"
"k8s.io/kubernetes/test/e2e/framework"
)

const (
kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios
kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore
kubeletPollInterval = 10 * time.Second // Poll interval for kubelet status checks
kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop
)

var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node with Fencing cluster", func() {
defer g.GinkgoRecover()

var (
oc = util.NewCLIWithoutNamespace("").SetNamespace("openshift-etcd").AsAdmin()
etcdClientFactory *helpers.EtcdClientFactoryImpl
)

g.BeforeEach(func() {
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)

g.By("Verifying comprehensive etcd cluster status before starting kubelet disruption test")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "BeforeEach validation")
}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy before starting test")

kubeClient := oc.KubeClient()
etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient)

g.By("Validating essential operators are available before kubelet disruption")
o.Eventually(func() error {
return utils.ValidateEssentialOperatorsAvailable(oc)
}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available before kubelet disruption")
})

g.AfterEach(func() {
// Cleanup: Remove any resource bans that may have been created during the test
// This ensures the device under test is in the same state the test started in
nodeList, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
Review comment (Contributor): Let's use the util function GetNodes(oc, AllNodes) here.
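A minimal sketch of that suggestion; the helper's exact signature is assumed from the reviewer's wording and is not shown in this diff:

// hypothetical: would replace the direct CoreV1().Nodes().List call above
nodeList, err := utils.GetNodes(oc, utils.AllNodes)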

if err != nil {
framework.Logf("Warning: Failed to retrieve nodes during cleanup: %v", err)
Review comment (Contributor): What happens if this fails to clean up? Would that be considered a failed state?
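If the answer is that cleanup must succeed, one way to surface it as a spec failure instead of a log line (an assumption about the desired behavior, not what the PR currently does):

// hypothetical: fail the spec when cleanup cannot list nodes to clear resource bans
o.Expect(err).ShouldNot(o.HaveOccurred(), "cleanup should be able to list nodes to clear resource bans")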

return
}

if len(nodeList.Items) == 2 {
Review comment (Contributor): If we don't get two nodes, would that be a failure?
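If a node count other than two should fail the test rather than silently skip cleanup, one option (again an assumption about intent, not current behavior):

// hypothetical: assert the two-node shape instead of guarding on it
o.Expect(nodeList.Items).To(o.HaveLen(2), "cleanup expects exactly 2 nodes in a two-node cluster")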

survivingNode := nodeList.Items[1] // Use the second node as the surviving node for cleanup commands

g.By("Cleanup: Clearing any kubelet resource bans that may exist")
framework.Logf("Cleanup: Clearing all bans and failures for kubelet-clone resource")
cleanupErr := utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
if cleanupErr != nil {
framework.Logf("Warning: Failed to clear kubelet-clone resource during cleanup: %v (this is expected if no bans were active)", cleanupErr)
} else {
framework.Logf("Successfully cleared all bans and failures for kubelet-clone resource during cleanup")
}

g.By("Cleanup: Waiting for all nodes to become Ready after resource ban cleanup")
for _, node := range nodeList.Items {
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", node.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready after cleanup", node.Name))
}

g.By("Cleanup: Validating etcd cluster status after test cleanup")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "AfterEach cleanup")
}, kubeletRestoreTimeout, kubeletPollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be healthy after test cleanup")
}
})

g.It("Should recover from single node kubelet service disruption", func() {
Review comment (Contributor): Let's lowercase these "Should"s; don't forget that this word ends up in the middle of the full test title (Ginkgo concatenates the Describe and It strings).
"Should" -> "should"

nodeList, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
Review comment (Contributor): Let's use the util function here: utils.GetNodes(oc, utils.AllNodes).

o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")

nodes := nodeList.Items
framework.Logf("Found nodes: %s and %s for kubelet disruption test", nodes[0].Name, nodes[1].Name)

g.By("Ensuring both nodes are healthy before starting kubelet disruption test")
for _, node := range nodes {
o.Expect(nodeutil.IsNodeReady(&node)).To(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name))
}

targetNode := nodes[0]
survivingNode := nodes[1]

g.By(fmt.Sprintf("Banning kubelet resource from node: %s", targetNode.Name))
err = utils.AddConstraint(oc, survivingNode.Name, "kubelet-clone", targetNode.Name)
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to ban kubelet resource from node %s without errors", targetNode.Name))

g.By("Checking that the node is not in state Ready due to kubelet resource ban")
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), targetNode.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", targetNode.Name, err)
return false
}
return !nodeutil.IsNodeReady(nodeObj)
}, kubeletDisruptionTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should become NotReady after the kubelet resource ban is applied", targetNode.Name))

g.By(fmt.Sprintf("Ensuring surviving node %s remains Ready during kubelet disruption", survivingNode.Name))
o.Consistently(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), survivingNode.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", survivingNode.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, 2*time.Minute, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Surviving node %s should remain Ready during kubelet disruption", survivingNode.Name))

g.By("Validating etcd cluster remains healthy with surviving node")
o.Consistently(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name)
}, kubeletDisruptionTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name))

g.By("Clearing kubelet resource bans to allow normal operation")
err = utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
o.Expect(err).To(o.BeNil(), "Expected to clear kubelet resource bans without errors")

g.By("Waiting for target node to become Ready after kubelet resource unban")
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), targetNode.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", targetNode.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should become Ready after kubelet resource ban removal", targetNode.Name))

g.By("Validating both nodes are Ready after kubelet resource ban removal")
for _, node := range nodes {
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", node.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready after kubelet resource ban removal", node.Name))
}

g.By("Validating comprehensive etcd cluster recovery after kubelet resource ban removal")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "resource ban removal recovery")
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy after kubelet resource ban removal")

g.By("Ensuring both etcd members are healthy after kubelet resource ban removal")
for _, node := range nodes {
o.Eventually(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, node.Name)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should be healthy after kubelet resource ban removal", node.Name))
}

g.By("Validating essential operators recovery after kubelet resource ban disruption")
o.Eventually(func() error {
return utils.ValidateEssentialOperatorsAvailable(oc)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available after kubelet resource ban removal")
})

g.It("Should properly stop kubelet service and verify automatic restart on target node", func() {
Review comment (Contributor): Same here: "Should" -> "should".

nodeList, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
Review comment (Contributor): Same thing here about the util function.

o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")

nodes := nodeList.Items
framework.Logf("Found nodes: %s and %s for kubelet disruption test", nodes[0].Name, nodes[1].Name)

g.By("Ensuring both nodes are healthy before starting kubelet disruption test")
for _, node := range nodes {
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", node.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, nodeIsHealthyTimeout, pollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name))
}

targetNode := nodes[0]
survivingNode := nodes[1]

framework.Logf("Starting kubelet service disruption test: target node=%s, surviving node=%s",
targetNode.Name, survivingNode.Name)

g.By(fmt.Sprintf("Verifying kubelet service is initially running on target node: %s", targetNode.Name))
framework.Logf("Checking initial kubelet service status on target node %s", targetNode.Name)
o.Eventually(func() bool {
return utils.IsServiceRunning(oc, targetNode.Name, "kubelet")
}, kubeletGracePeriod, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should be running initially on node %s", targetNode.Name))
framework.Logf("Confirmed kubelet service is running initially on target node %s", targetNode.Name)

g.By(fmt.Sprintf("Stopping kubelet service on target node: %s", targetNode.Name))
framework.Logf("Attempting to stop kubelet service on target node %s", targetNode.Name)
err = utils.StopKubeletService(oc, targetNode.Name)
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to stop kubelet service on node %s without errors", targetNode.Name))
framework.Logf("Successfully stopped kubelet service on target node %s", targetNode.Name)

g.By("Validating etcd cluster eventually becomes healthy with surviving node during kubelet disruption")
framework.Logf("Starting etcd health validation on surviving node %s (timeout: %v)", survivingNode.Name, kubeletDisruptionTimeout)
o.Eventually(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name)
}, kubeletDisruptionTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet service disruption", survivingNode.Name))
framework.Logf("Confirmed etcd member %s remains healthy during kubelet disruption", survivingNode.Name)

g.By("Waiting for kubelet service to automatically restart on target node")
framework.Logf("Monitoring kubelet service for automatic restart on target node %s (timeout: %v)", targetNode.Name, kubeletRestoreTimeout)
o.Eventually(func() bool {
return utils.IsServiceRunning(oc, targetNode.Name, "kubelet")
Review comment (Contributor): I'm not sure this will work. Once you stop the kubelet with systemd, does it start back up? If the kubelet is not up, I don't think you'll be able to run a debug pod on that node; you'll need to use the ssh framework that Jeremy implemented in order to check and restart the kubelet.

Ideally you would want to crash the kubelet to test this, so that the always-restart stanza of kubelet.service kicks in: https://github.com/openshift/machine-config-operator/blob/main/templates/master/01-master-kubelet/_base/units/kubelet.service.yaml#L48-L49
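A minimal sketch of the crash-instead-of-stop idea; runCommandOverSSH is a placeholder, since the in-repo SSH helper's real name and signature are not shown in this diff:

// hypothetical: kill the kubelet process instead of `systemctl stop kubelet`,
// so the unit's Restart=always policy (see the linked template) restarts it on its own.
// An SSH runner is assumed because a debug pod may not schedule while kubelet is down.
_, err = runCommandOverSSH(targetNode.Name, "sudo pkill -KILL -x kubelet")
o.Expect(err).ShouldNot(o.HaveOccurred(), fmt.Sprintf("expected to crash kubelet on node %s", targetNode.Name))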

}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should automatically restart on node %s", targetNode.Name))
framework.Logf("Kubelet service successfully restarted automatically on target node %s", targetNode.Name)

g.By("Validating both nodes are Ready after kubelet service automatic restart")
framework.Logf("Starting node readiness validation after kubelet restart")
for _, node := range nodes {
framework.Logf("Checking readiness of node %s", node.Name)
o.Eventually(func() bool {
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Error getting node %s: %v", node.Name, err)
return false
}
return nodeutil.IsNodeReady(nodeObj)
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready after kubelet automatic restart", node.Name))
framework.Logf("Node %s is Ready after kubelet restart", node.Name)
}
framework.Logf("All nodes are Ready after kubelet service restart")

g.By("Ensuring both etcd members are healthy after kubelet service automatic restart")
framework.Logf("Starting etcd member health validation after kubelet restart")
for _, node := range nodes {
framework.Logf("Validating etcd member health on node %s", node.Name)
o.Eventually(func() error {
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, node.Name)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should be healthy after kubelet automatic restart", node.Name))
framework.Logf("Etcd member on node %s is healthy after kubelet restart", node.Name)
}
framework.Logf("All etcd members are healthy after kubelet service restart")

g.By("Validating comprehensive etcd cluster recovery after kubelet service automatic restart")
framework.Logf("Starting comprehensive etcd cluster validation after kubelet restart")
o.Eventually(func() error {
return utils.LogEtcdClusterStatus(oc, "kubelet service restart recovery")
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy after kubelet automatic restart")
framework.Logf("Comprehensive etcd cluster validation completed successfully after kubelet restart")

g.By("Validating essential operators recovery after kubelet service automatic restart")
framework.Logf("Starting essential operators availability validation after kubelet restart")
o.Eventually(func() error {
return utils.ValidateEssentialOperatorsAvailable(oc)
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available after kubelet automatic restart")
framework.Logf("All cluster operators are available after kubelet restart")

framework.Logf("Kubelet service disruption test completed successfully - full recovery validated")
})

})