OCPEDGE-1484: [TNF] kubelet disruption test #30290
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,299 @@ | ||
| package two_node | ||
|
|
||
| import ( | ||
| "context" | ||
| "fmt" | ||
| "time" | ||
|
|
||
| g "github.com/onsi/ginkgo/v2" | ||
| o "github.com/onsi/gomega" | ||
| v1 "github.com/openshift/api/config/v1" | ||
| "github.com/openshift/origin/test/extended/etcd/helpers" | ||
| "github.com/openshift/origin/test/extended/two_node/utils" | ||
| "github.com/openshift/origin/test/extended/util" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| nodeutil "k8s.io/kubernetes/pkg/util/node" | ||
| "k8s.io/kubernetes/test/e2e/framework" | ||
| ) | ||
|
|
||
| const ( | ||
| kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios | ||
| kubeletRestoreTimeout = 5 * time.Minute // Time to wait for kubelet service restore | ||
| kubeletPollInterval = 10 * time.Second // Poll interval for kubelet status checks | ||
| kubeletGracePeriod = 30 * time.Second // Grace period for kubelet to start/stop | ||
| ) | ||
|
|
||
| var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node with Fencing cluster", func() { | ||
| defer g.GinkgoRecover() | ||
|
|
||
| var ( | ||
| oc = util.NewCLIWithoutNamespace("").SetNamespace("openshift-etcd").AsAdmin() | ||
| etcdClientFactory *helpers.EtcdClientFactoryImpl | ||
| ) | ||
|
|
||
| g.BeforeEach(func() { | ||
| utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode) | ||
|
|
||
| kubeClient := oc.KubeClient() | ||
| etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient) | ||
|
|
||
| g.By("Verifying comprehensive etcd cluster status before starting kubelet disruption test") | ||
| o.Eventually(func() error { | ||
| return utils.LogEtcdClusterStatus(oc, "BeforeEach validation", etcdClientFactory) | ||
| }, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy before starting test") | ||
|
|
||
| g.By("Validating essential operators are available before kubelet disruption") | ||
| o.Eventually(func() error { | ||
| return utils.ValidateEssentialOperatorsAvailable(oc) | ||
| }, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available before kubelet disruption") | ||
| }) | ||
|
|
||
| g.AfterEach(func() { | ||
| // Cleanup: Remove any resource bans that may have been created during the test | ||
| // This ensures the cluster under test is returned to the state it was in when the test started | ||
| nodeList, err := utils.GetNodes(oc, utils.AllNodes) | ||
| if err != nil { | ||
| framework.Logf("Critical: Failed to retrieve nodes for cleanup - test isolation cannot be guaranteed: %v", err) | ||
| return | ||
| } | ||
|
|
||
| // Track if node count is unexpected, but attempt cleanup anyway | ||
| nodeCountUnexpected := len(nodeList.Items) != 2 | ||
| if nodeCountUnexpected { | ||
| framework.Logf("Warning: Expected 2 nodes but found %d - attempting cleanup anyway", len(nodeList.Items)) | ||
| } | ||
|
|
||
| // Attempt cleanup if we have at least 1 node | ||
| if len(nodeList.Items) >= 1 { | ||
| // Use the last available node for cleanup commands (prefer second node if available) | ||
| cleanupNode := nodeList.Items[0] | ||
| if len(nodeList.Items) >= 2 { | ||
| cleanupNode = nodeList.Items[1] | ||
| } | ||
|
|
||
| g.By("Cleanup: Clearing any kubelet resource bans that may exist") | ||
| framework.Logf("Cleanup: Clearing all bans and failures for kubelet-clone resource using node %s", cleanupNode.Name) | ||
| cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, "kubelet-clone") | ||
| if cleanupErr != nil { | ||
| framework.Logf("Warning: Failed to clear kubelet-clone resource during cleanup: %v (this is expected if no bans were active)", cleanupErr) | ||
| } else { | ||
| framework.Logf("Successfully cleared all bans and failures for kubelet-clone resource during cleanup") | ||
| } | ||
|
|
||
| g.By("Cleanup: Waiting for all nodes to become Ready after resource ban cleanup") | ||
| for _, node := range nodeList.Items { | ||
| // Use a non-blocking check with logging instead of assertion | ||
| ready := false | ||
| for i := 0; i < int(kubeletRestoreTimeout/kubeletPollInterval); i++ { | ||
| nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{}) | ||
| if err == nil && nodeutil.IsNodeReady(nodeObj) { | ||
| ready = true | ||
| framework.Logf("Node %s is Ready after cleanup", node.Name) | ||
| break | ||
| } | ||
| time.Sleep(kubeletPollInterval) | ||
| } | ||
| if !ready { | ||
| framework.Logf("Warning: Node %s did not become Ready within timeout after cleanup", node.Name) | ||
| } | ||
| } | ||
|
|
||
| g.By("Cleanup: Validating etcd cluster status after test cleanup") | ||
| // Use a non-blocking check with logging instead of assertion | ||
| etcdHealthy := false | ||
| for i := 0; i < int(kubeletRestoreTimeout/kubeletPollInterval); i++ { | ||
| if err := utils.LogEtcdClusterStatus(oc, "AfterEach cleanup", etcdClientFactory); err == nil { | ||
| etcdHealthy = true | ||
| framework.Logf("Etcd cluster is healthy after cleanup") | ||
| break | ||
| } | ||
| time.Sleep(kubeletPollInterval) | ||
| } | ||
| if !etcdHealthy { | ||
| framework.Logf("Warning: Etcd cluster did not become healthy within timeout after cleanup") | ||
| } | ||
| } else { | ||
| framework.Logf("Critical: Cannot perform cleanup - no nodes available") | ||
| } | ||
|
|
||
| // Log node count issue but don't fail - cleanup should always complete | ||
| if nodeCountUnexpected { | ||
| framework.Logf("Warning: Expected exactly 2 nodes for two-node cluster but found %d - cluster topology may have changed during test", len(nodeList.Items)) | ||
| } | ||
| }) | ||
|
|
||
| g.It("Should recover from single node kubelet service disruption", func() { | ||
| nodeList, err := utils.GetNodes(oc, utils.AllNodes) | ||
| o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") | ||
| o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster") | ||
|
|
||
| nodes := nodeList.Items | ||
| framework.Logf("Found nodes: %s and %s for kubelet disruption test", nodes[0].Name, nodes[1].Name) | ||
|
|
||
| g.By("Ensuring both nodes are healthy before starting kubelet disruption test") | ||
| for _, node := range nodes { | ||
| if ready := nodeutil.IsNodeReady(&node); !ready { | ||
| o.Expect(ready).Should(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name)) | ||
| } | ||
| } | ||
|
|
||
| targetNode := nodes[0] | ||
| survivingNode := nodes[1] | ||
|
|
||
| g.By(fmt.Sprintf("Banning kubelet resource from node: %s", targetNode.Name)) | ||
| err = utils.AddConstraint(oc, survivingNode.Name, "kubelet-clone", targetNode.Name) | ||
| o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to ban kubelet resource from node %s without errors", targetNode.Name)) | ||
|
|
||
| // Register cleanup to ensure ban is removed even if test fails | ||
| g.DeferCleanup(func() { | ||
| framework.Logf("DeferCleanup: Ensuring kubelet-clone ban is removed") | ||
| cleanupErr := utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone") | ||
| if cleanupErr != nil { | ||
| framework.Logf("DeferCleanup: Warning: Failed to clear kubelet-clone ban: %v (this is expected if already cleared)", cleanupErr) | ||
| } else { | ||
| framework.Logf("DeferCleanup: Successfully cleared kubelet-clone ban") | ||
| } | ||
| }) | ||
|
|
||
| g.By(fmt.Sprintf("Checking that node %s is not in state Ready due to kubelet resource ban", targetNode.Name)) | ||
| o.Eventually(func() bool { | ||
| nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), targetNode.Name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| framework.Logf("Error getting node %s: %v", targetNode.Name, err) | ||
| return false | ||
| } | ||
| return !nodeutil.IsNodeReady(nodeObj) | ||
| }, kubeletDisruptionTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s is not in state Ready after kubelet resource ban is applied", targetNode.Name)) | ||
|
|
||
| g.By(fmt.Sprintf("Ensuring surviving node %s remains Ready during kubelet disruption", survivingNode.Name)) | ||
| o.Consistently(func() bool { | ||
| nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), survivingNode.Name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| framework.Logf("Error getting node %s: %v", survivingNode.Name, err) | ||
| return false | ||
| } | ||
| return nodeutil.IsNodeReady(nodeObj) | ||
| }, 2*time.Minute, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Surviving node %s should remain Ready during kubelet disruption", survivingNode.Name)) | ||
|
|
||
| g.By("Validating etcd cluster remains healthy with surviving node") | ||
| o.Consistently(func() error { | ||
| return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name) | ||
| }, kubeletDisruptionTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name)) | ||
|
|
||
| g.By("Clearing kubelet resource bans to allow normal operation") | ||
| err = utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone") | ||
| o.Expect(err).To(o.BeNil(), "Expected to clear kubelet resource bans without errors") | ||
|
|
||
| g.By("Waiting for target node to become Ready after kubelet resource unban") | ||
| o.Eventually(func() bool { | ||
| nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), targetNode.Name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| framework.Logf("Error getting node %s: %v", targetNode.Name, err) | ||
| return false | ||
| } | ||
| return nodeutil.IsNodeReady(nodeObj) | ||
| }, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should become Ready after kubelet resource ban removal", targetNode.Name)) | ||
|
|
||
| g.By("Validating both nodes are Ready after kubelet resource ban removal") | ||
| for _, node := range nodes { | ||
| o.Eventually(func() bool { | ||
| nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| framework.Logf("Error getting node %s: %v", node.Name, err) | ||
| return false | ||
| } | ||
| return nodeutil.IsNodeReady(nodeObj) | ||
| }, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready after kubelet resource ban removal", node.Name)) | ||
| } | ||
|
|
||
| g.By("Validating comprehensive etcd cluster recovery after kubelet resource ban removal") | ||
| o.Eventually(func() error { | ||
| return utils.LogEtcdClusterStatus(oc, "resource ban removal recovery", etcdClientFactory) | ||
| }, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy after kubelet resource ban removal") | ||
|
|
||
| g.By("Ensuring both etcd members are healthy after kubelet resource ban removal") | ||
| for _, node := range nodes { | ||
| o.Eventually(func() error { | ||
| return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, node.Name) | ||
| }, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should be healthy after kubelet resource ban removal", node.Name)) | ||
| } | ||
|
|
||
| g.By("Validating essential operators recovery after kubelet resource ban disruption") | ||
| o.Eventually(func() error { | ||
| return utils.ValidateEssentialOperatorsAvailable(oc) | ||
| }, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available after kubelet resource ban removal") | ||
| }) | ||
|
|
||
| g.It("Should properly stop kubelet service and verify automatic restart on target node", func() { | ||
|
Contributor: Same here |
||
| nodeList, err := utils.GetNodes(oc, utils.AllNodes) | ||
| o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") | ||
| o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster") | ||
|
|
||
| nodes := nodeList.Items | ||
| framework.Logf("Found nodes: %s and %s for kubelet disruption test", nodes[0].Name, nodes[1].Name) | ||
|
|
||
| g.By("Ensuring both nodes are healthy before starting kubelet disruption test") | ||
| for _, node := range nodes { | ||
| o.Eventually(func() bool { | ||
| nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| framework.Logf("Error getting node %s: %v", node.Name, err) | ||
| return false | ||
| } | ||
| return nodeutil.IsNodeReady(nodeObj) | ||
| }, nodeIsHealthyTimeout, pollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name)) | ||
| } | ||
|
|
||
| targetNode := nodes[0] | ||
| survivingNode := nodes[1] | ||
|
|
||
| g.By(fmt.Sprintf("Verifying kubelet service is initially running on target node: %s", targetNode.Name)) | ||
| o.Eventually(func() bool { | ||
| return utils.IsServiceRunning(oc, targetNode.Name, "kubelet") | ||
| }, kubeletGracePeriod, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should be running initially on node %s", targetNode.Name)) | ||
|
|
||
| g.By(fmt.Sprintf("Stopping kubelet service on target node: %s", targetNode.Name)) | ||
| err = utils.StopKubeletService(oc, targetNode.Name) | ||
| o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to stop kubelet service on node %s without errors", targetNode.Name)) | ||
|
|
||
| g.By("Validating etcd cluster eventually becomes healthy with surviving node during kubelet disruption") | ||
| o.Eventually(func() error { | ||
| return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name) | ||
| }, kubeletDisruptionTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet service disruption", survivingNode.Name)) | ||
|
|
||
| g.By("Waiting for kubelet service to automatically restart on target node") | ||
| o.Eventually(func() bool { | ||
| return utils.IsServiceRunning(oc, targetNode.Name, "kubelet") | ||
| }, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should automatically restart on node %s", targetNode.Name)) | ||
|
|
||
| g.By("Validating both nodes are Ready after kubelet service automatic restart") | ||
| for _, node := range nodes { | ||
| o.Eventually(func() bool { | ||
| nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| framework.Logf("Error getting node %s: %v", node.Name, err) | ||
| return false | ||
| } | ||
| return nodeutil.IsNodeReady(nodeObj) | ||
| }, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready after kubelet automatic restart", node.Name)) | ||
| } | ||
|
|
||
| g.By("Ensuring both etcd members are healthy after kubelet service automatic restart") | ||
| for _, node := range nodes { | ||
| o.Eventually(func() error { | ||
| return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, node.Name) | ||
| }, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should be healthy after kubelet automatic restart", node.Name)) | ||
| } | ||
|
|
||
| g.By("Validating comprehensive etcd cluster recovery after kubelet service automatic restart") | ||
| o.Eventually(func() error { | ||
| return utils.LogEtcdClusterStatus(oc, "kubelet service restart recovery", etcdClientFactory) | ||
| }, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be fully healthy after kubelet automatic restart") | ||
|
|
||
| g.By("Validating essential operators recovery after kubelet service automatic restart") | ||
| o.Eventually(func() error { | ||
| return utils.ValidateEssentialOperatorsAvailable(oc) | ||
| }, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential cluster operators should be available after kubelet automatic restart") | ||
| }) | ||
|
|
||
| }) | ||
Contributor: Let's lowercase these "Should"s; don't forget that this word ends up in the middle of the full test title: "Should" -> "should"
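The `utils.AddConstraint` and `utils.RemoveConstraint` helpers used throughout the test are not part of this diff. Because the test bans and clears a `kubelet-clone` resource (and the cleanup logs mention "bans and failures"), they presumably wrap pacemaker's `pcs resource ban` / `pcs resource clear` commands run on one of the nodes. The sketch below is a hypothetical illustration of that idea only: the function names, the `oc debug node` invocation, and the exact `pcs` arguments are assumptions, and the real helpers in `test/extended/two_node/utils` may be implemented differently.

```go
package tnfsketch

import (
	"fmt"
	"os/exec"
)

// banKubeletClone illustrates what utils.AddConstraint might do for the
// "kubelet-clone" resource: place a pacemaker location ban so the clone is
// kept off bannedNode. execNode is the node the command is issued from.
func banKubeletClone(execNode, bannedNode string) error {
	// `oc debug node/<name> -- chroot /host ...` runs a host-level command on the node.
	cmd := exec.Command("oc", "debug", "node/"+execNode, "--",
		"chroot", "/host", "pcs", "resource", "ban", "kubelet-clone", bannedNode)
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("pcs resource ban failed: %v: %s", err, out)
	}
	return nil
}

// clearKubeletClone mirrors utils.RemoveConstraint: `pcs resource clear`
// removes the location constraints created by ban/move so the clone may run
// on every node again. (Per the test's log messages, the real helper may also
// run `pcs resource cleanup` to reset failure counts.)
func clearKubeletClone(execNode string) error {
	cmd := exec.Command("oc", "debug", "node/"+execNode, "--",
		"chroot", "/host", "pcs", "resource", "clear", "kubelet-clone")
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("pcs resource clear failed: %v: %s", err, out)
	}
	return nil
}
```

Issuing the ban from the surviving node, as the test does, is presumably what keeps the command reliable while the target node's kubelet is down.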