Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add e2e to validate pods can use MIG devices
  • Loading branch information
tkashem committed Nov 6, 2025
commit 1b60bc968b96a63016d8febdee45318f683c9fe4
134 changes: 134 additions & 0 deletions test/extended/dra/dra.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"os"
"strings"
"time"

g "github.com/onsi/ginkgo/v2"
Expand All @@ -16,12 +17,17 @@ import (

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
types "k8s.io/apimachinery/pkg/types"
clientgodynamic "k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/kubernetes/test/e2e/framework"
e2epodutil "k8s.io/kubernetes/test/e2e/framework/pod"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
)

const (
Expand Down Expand Up @@ -453,5 +459,133 @@ var _ = g.Describe("[sig-node] [Suite:openshift/dra-gpu-validation] [Feature:Dyn
spec.Test(ctx, g.GinkgoTB())
})
})

g.Context("[MIGEnabled=true]", func() {
	g.BeforeAll(func(ctx context.Context) {
		// we will use a custom MIG configuration: the "gpu-e2e" profile
		// partitions GPU 0 into three MIG slices of distinct sizes and
		// leaves the remaining GPUs with MIG disabled; "all-disabled" is
		// the default profile applied to unlabeled nodes.
		config := corev1.ConfigMap{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "dra-e2e-mig-parted-config",
				Namespace: operator.Namespace(),
			},
			Data: map[string]string{
				"config.yaml": `
version: v1
mig-configs:
  all-disabled:
    - devices: all
      mig-enabled: false
  gpu-e2e:
    - devices: [0]
      mig-enabled: true
      mig-devices:
        "3g.20gb": 1
        "2g.10gb": 1
        "1g.5gb": 1
    - devices: [1, 2, 3, 4, 5, 6, 7]
      mig-enabled: false
`},
		}
		o.Expect(helper.EnsureConfigMap(ctx, clientset, &config)).Should(o.BeNil())

		g.By("configuring nvidia-mig-manager")
		// Patch the GPU operator ClusterPolicy:
		//   - mig.strategy "mixed" lets MIG and non-MIG GPUs coexist
		//   - migManager.config points at the ConfigMap created above
		//   - WITH_REBOOT=true permits a node reboot if required to flip MIG mode
		//   - WITH_WORKLOAD=false skips the CUDA workload validation pod
		patchBytes := `
{
  "spec": {
    "mig": {
      "strategy": "mixed"
    },
    "migManager": {
      "config": {
        "default": "all-disabled",
        "name": "dra-e2e-mig-parted-config"
      },
      "env": [
        {
          "name": "WITH_REBOOT",
          "value": "true"
        }
      ]
    },
    "validator": {
      "cuda": {
        "env": [
          {
            "name": "WITH_WORKLOAD",
            "value": "false"
          }
        ]
      }
    }
  }
}
`
		resource := dynamic.Resource(schema.GroupVersionResource{Group: "nvidia.com", Version: "v1", Resource: "clusterpolicies"})
		policy, err := resource.Patch(ctx, "cluster-policy", types.MergePatchType, []byte(patchBytes), metav1.PatchOptions{})
		o.Expect(err).Should(o.BeNil())
		t.Logf("gpu operator cluster policy: \n%s\n", framework.PrettyPrintJSON(policy))

		// constant message: no Sprintf needed (go vet flags a
		// format call without arguments)
		g.By("waiting for nvidia-mig-manager to be ready")
		o.Expect(operator.MIGManagerReady(ctx, node)).Should(o.BeNil())
	})

	g.It("one pod, three containers, asking for 3g.20gb, 2g.10gb, and 1g.5gb respectively", func(ctx context.Context) {
		// comparator used for order-insensitive slice comparison below
		ascending := func(x, y string) bool {
			return strings.Compare(x, y) < 0 // Ascending order
		}

		// MIG devices we want to setup
		want := []string{"3g.20gb", "2g.10gb", "1g.5gb"}

		// apply the desired MIG configuration by labeling the node;
		// nvidia-mig-manager watches this label and reconfigures the GPU
		g.By(fmt.Sprintf("labeling node: %s for nvidia.com/mig.config: %s", node.Name, "gpu-e2e"))
		err := helper.EnsureNodeLabel(ctx, clientset, node.Name, "nvidia.com/mig.config", "gpu-e2e")
		o.Expect(err).Should(o.BeNil())

		g.By("waiting for the gpu driver to advertise the expected MIG devices")
		advertised := nvidia.NvidiaGPUs{}
		o.Eventually(ctx, func(ctx context.Context) error {
			got, err := operator.ListMIGDevicesUsingNvidiaSMI(ctx, node)
			if err != nil {
				return err
			}
			if !cmp.Equal(want, got.Names(), cmpopts.SortSlices(ascending)) {
				return fmt.Errorf("still waiting for MIG devices to show up, want: %v, got: %v", want, got)
			}
			advertised = got
			return nil
		}).WithPolling(10*time.Second).Should(o.BeNil(), "timeout waiting for expected mig devices")
		t.Logf("the gpu driver is advertising these mig devices %v", advertised)

		// TODO: the DRA driver does not pick up the MIG slices, restarting the plugin seems to do the trick
		o.Expect(driver.RemovePluginFromNode(ctx, node)).Should(o.BeNil())
		g.By("waiting for nvidia-dra-driver-gpu to be ready")
		o.Expect(driver.Ready(ctx, node)).To(o.Succeed(), "nvidia-dra-driver-gpu should be ready")

		g.By("waiting for the dra driver to advertise the mig devices in its resourceslices")
		o.Eventually(ctx, func(ctx context.Context) error {
			all, err := driver.ListPublishedDevicesFromResourceSlice(ctx, node)
			if err != nil {
				return err
			}
			migs := all.FilterBy(func(gpu nvidia.NvidiaGPU) bool { return gpu.Type == "mig" })

			if want, got := advertised.UUIDs(), migs.UUIDs(); !cmp.Equal(want, got, cmpopts.SortSlices(ascending)) {
				return fmt.Errorf("still waiting for the dra driver to publish the MIG devices, want: %v, got: %v", want, got)
			}
			t.Logf("the dra driver has published the mig devices in its resourceslices: %s", framework.PrettyPrintJSON(migs))
			return nil
		}).WithPolling(time.Second).Should(o.BeNil(), "timeout while waiting for the dra driver to advertise its resources")

		// run the actual pod-level validation against the advertised devices
		mig := gpuMIGSpec{
			f:       f,
			class:   driver.Class(),
			node:    node,
			uuids:   advertised.UUIDs(),
			devices: advertised.Names(),
		}
		mig.Test(ctx, g.GinkgoTB())
	})
})
})
})
12 changes: 12 additions & 0 deletions test/extended/dra/helper/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,18 @@ func EnsureNodeLabel(ctx context.Context, clientset kubernetes.Interface, node s
return err
}

// EnsureConfigMap creates want in its namespace, treating an
// AlreadyExists response from the apiserver as success (idempotent
// create). Note that an existing object is NOT compared against or
// updated to match want.
func EnsureConfigMap(ctx context.Context, clientset kubernetes.Interface, want *corev1.ConfigMap) error {
	client := clientset.CoreV1().ConfigMaps(want.Namespace)
	// use the caller's ctx (not context.Background()) so cancellation
	// and deadlines propagate to the API call
	_, err := client.Create(ctx, want, metav1.CreateOptions{})
	if err != nil && !apierrors.IsAlreadyExists(err) {
		return err
	}
	return nil
}

func GetLogs(ctx context.Context, clientset kubernetes.Interface, namespace, name, ctr string) (string, error) {
client := clientset.CoreV1().Pods(namespace)
options := corev1.PodLogOptions{Container: ctr}
Expand Down
138 changes: 138 additions & 0 deletions test/extended/dra/mig_spec.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
package dra

import (
"context"
"fmt"
"strings"
"testing"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"

corev1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/test/e2e/framework"
e2epodutil "k8s.io/kubernetes/test/e2e/framework/pod"
"k8s.io/utils/ptr"

helper "github.com/openshift/origin/test/extended/dra/helper"
nvidia "github.com/openshift/origin/test/extended/dra/nvidia"
)

// gpuMIGSpec exercises a use case with static MIG devices.
// reference:
// - https://docs.nvidia.com/datacenter/tesla/mig-user-guide/
// - https://github.com/NVIDIA/mig-parted
type gpuMIGSpec struct {
	// e2e framework; supplies the test namespace and client set
	f *framework.Framework
	// attribute domain used to qualify DRA device attributes in the CEL
	// selector and MatchAttribute constraint (e.g. "<class>/parentUUID")
	// — presumably the DRA driver's name; confirm against driver.Class()
	class string
	// the node onto which the pod is expected to run
	node *corev1.Node
	// MIG devices, same MIG device can appear twice or more
	devices []string
	// UUIDs of the MIG devices advertised by the gpu driver
	uuids []string
}

// One pod, N containers, each asking for a MIG device on a shared mig-enabled GPU.
// Test builds a ResourceClaimTemplate with one request per entry in
// spec.devices, runs a pod with one container per request, and verifies
// (via nvidia-smi -L inside each container) that the containers were
// handed exactly the advertised MIG devices.
func (spec gpuMIGSpec) Test(ctx context.Context, t testing.TB) {
	namespace := spec.f.Namespace.Name
	clientset := spec.f.ClientSet

	// create a resource claim template that contains a request for each mig device
	template := &resourceapi.ResourceClaimTemplate{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: namespace,
			Name:      "mig-devices",
		},
	}
	for i, device := range spec.devices {
		// request names must be valid identifiers, so "3g.20gb" -> "3g-20gb-<i>";
		// the index suffix keeps names unique when a profile repeats
		name := fmt.Sprintf("%s-%d", strings.ReplaceAll(device, ".", "-"), i)
		template.Spec.Spec.Devices.Requests = append(template.Spec.Spec.Devices.Requests, resourceapi.DeviceRequest{
			Name:            name,
			DeviceClassName: "mig.nvidia.com",
			Selectors: []resourceapi.DeviceSelector{
				{
					// match a device whose advertised MIG profile equals this entry
					CEL: &resourceapi.CELDeviceSelector{
						Expression: "device.attributes['" + spec.class + "'].profile == '" + device + "'",
					},
				},
			},
		})
	}
	// constrain all requests to devices sharing the same parent GPU
	template.Spec.Spec.Devices.Constraints = []resourceapi.DeviceConstraint{
		{
			MatchAttribute: ptr.To(resourceapi.FullyQualifiedName(spec.class + "/parentUUID")),
		},
	}

	// one pod, N container(s), each wants a MIG device
	pod := helper.NewPod(namespace, "pod")
	for i, request := range template.Spec.Spec.Devices.Requests {
		ctr := corev1.Container{
			Name:    fmt.Sprintf("ctr%d", i),
			Image:   "ubuntu:22.04",
			Command: []string{"bash", "-c"},
			// print the visible devices, then idle until terminated
			Args: []string{"nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"},
		}
		ctr.Resources.Claims = []corev1.ResourceClaim{{Name: "mig-devices", Request: request.Name}}
		pod.Spec.Containers = append(pod.Spec.Containers, ctr)
	}
	pod.Spec.ResourceClaims = []corev1.PodResourceClaim{
		{
			Name:                      "mig-devices",
			ResourceClaimTemplateName: ptr.To(template.Name),
		},
	}
	// GPU nodes are tainted; tolerate so the pod can schedule there
	pod.Spec.Tolerations = []corev1.Toleration{
		{
			Key:      "nvidia.com/gpu",
			Operator: corev1.TolerationOpExists,
			Effect:   corev1.TaintEffectNoSchedule,
		},
	}

	g.By("creating external claim and pod")
	_, err := clientset.ResourceV1beta1().ResourceClaimTemplates(namespace).Create(ctx, template, metav1.CreateOptions{})
	o.Expect(err).To(o.BeNil())

	pod, err = clientset.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
	o.Expect(err).To(o.BeNil())

	// dump pod and claims on exit to aid debugging of failures
	g.DeferCleanup(func(ctx context.Context) {
		g.By(fmt.Sprintf("listing resources in namespace: %s", namespace))
		t.Logf("pod in test namespace: %s\n%s", namespace, framework.PrettyPrintJSON(pod))

		result, err := clientset.ResourceV1beta1().ResourceClaims(namespace).List(ctx, metav1.ListOptions{})
		o.Expect(err).Should(o.BeNil())
		t.Logf("resource claim in test namespace: %s\n%s", namespace, framework.PrettyPrintJSON(result))
	})

	g.By(fmt.Sprintf("waiting for pod %s/%s to be running", pod.Namespace, pod.Name))
	err = e2epodutil.WaitForPodRunningInNamespace(ctx, spec.f.ClientSet, pod)
	o.Expect(err).To(o.BeNil())

	// the pod should run on the expected node
	pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, pod.Name, metav1.GetOptions{})
	o.Expect(err).To(o.BeNil())
	o.Expect(pod.Spec.NodeName).To(o.Equal(spec.node.Name))

	// the claim must be allocated with one result per requested device
	claim, err := helper.GetResourceClaimFor(ctx, clientset, pod)
	o.Expect(err).To(o.BeNil())
	o.Expect(claim).ToNot(o.BeNil())

	o.Expect(claim.Status.Allocation).NotTo(o.BeNil())
	o.Expect(len(claim.Status.Allocation.Devices.Results)).To(o.Equal(len(spec.devices)))

	migUsed := nvidia.NvidiaGPUs{}
	for _, ctr := range pod.Spec.Containers {
		g.By(fmt.Sprintf("running nvidia-smi command into pod %s/%s container: %s", pod.Namespace, pod.Name, ctr.Name))
		lines, err := helper.ExecIntoContainer(ctx, t, spec.f, pod.Name, pod.Namespace, ctr.Name,
			[]string{"nvidia-smi", "-L"})
		o.Expect(err).To(o.BeNil())
		got := nvidia.ExtractMIGDeviceInfoFromNvidiaSMILines(lines)
		migUsed = append(migUsed, got...)
	}
	// allocation order is not guaranteed to match the advertised order
	// (and the same profile may appear more than once), so compare the
	// UUIDs as sets rather than position-by-position
	o.Expect(migUsed.UUIDs()).To(o.ConsistOf(spec.uuids))
}
Loading