Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add e2e to validate pods can use MIG devices
  • Loading branch information
tkashem committed Nov 6, 2025
commit 1b60bc968b96a63016d8febdee45318f683c9fe4
134 changes: 134 additions & 0 deletions test/extended/dra/dra.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"os"
"strings"
"time"

g "github.com/onsi/ginkgo/v2"
Expand All @@ -16,12 +17,17 @@ import (

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
types "k8s.io/apimachinery/pkg/types"
clientgodynamic "k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/kubernetes/test/e2e/framework"
e2epodutil "k8s.io/kubernetes/test/e2e/framework/pod"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
)

const (
Expand Down Expand Up @@ -453,5 +459,133 @@ var _ = g.Describe("[sig-node] [Suite:openshift/dra-gpu-validation] [Feature:Dyn
spec.Test(ctx, g.GinkgoTB())
})
})

g.Context("[MIGEnabled=true]", func() {
	g.BeforeAll(func(ctx context.Context) {
		// we will use a custom MIG configuration: the "gpu-e2e" profile
		// partitions GPU 0 into three MIG slices of distinct sizes and
		// leaves the remaining GPUs with MIG disabled; "all-disabled" is
		// the default profile applied to unlabeled nodes.
		config := corev1.ConfigMap{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "dra-e2e-mig-parted-config",
				Namespace: operator.Namespace(),
			},
			Data: map[string]string{
				"config.yaml": `
version: v1
mig-configs:
  all-disabled:
    - devices: all
      mig-enabled: false
  gpu-e2e:
    - devices: [0]
      mig-enabled: true
      mig-devices:
        "3g.20gb": 1
        "2g.10gb": 1
        "1g.5gb": 1
    - devices: [1, 2, 3, 4, 5, 6, 7]
      mig-enabled: false
`},
		}
		o.Expect(helper.EnsureConfigMap(ctx, clientset, &config)).Should(o.BeNil())

		g.By("configuring nvidia-mig-manager")
		// Patch the GPU operator ClusterPolicy:
		//   - mig.strategy "mixed" lets MIG and non-MIG GPUs coexist
		//   - migManager.config points at the ConfigMap created above
		//   - WITH_REBOOT=true permits a node reboot if required to flip MIG mode
		//   - WITH_WORKLOAD=false skips the CUDA workload validation pod
		patchBytes := `
{
  "spec": {
    "mig": {
      "strategy": "mixed"
    },
    "migManager": {
      "config": {
        "default": "all-disabled",
        "name": "dra-e2e-mig-parted-config"
      },
      "env": [
        {
          "name": "WITH_REBOOT",
          "value": "true"
        }
      ]
    },
    "validator": {
      "cuda": {
        "env": [
          {
            "name": "WITH_WORKLOAD",
            "value": "false"
          }
        ]
      }
    }
  }
}
`
		resource := dynamic.Resource(schema.GroupVersionResource{Group: "nvidia.com", Version: "v1", Resource: "clusterpolicies"})
		policy, err := resource.Patch(ctx, "cluster-policy", types.MergePatchType, []byte(patchBytes), metav1.PatchOptions{})
		o.Expect(err).Should(o.BeNil())
		t.Logf("gpu operator cluster policy: \n%s\n", framework.PrettyPrintJSON(policy))

		// constant message: no Sprintf needed (go vet flags a
		// format call without arguments)
		g.By("waiting for nvidia-mig-manager to be ready")
		o.Expect(operator.MIGManagerReady(ctx, node)).Should(o.BeNil())
	})

	g.It("one pod, three containers, asking for 3g.20gb, 2g.10gb, and 1g.5gb respectively", func(ctx context.Context) {
		// comparator used for order-insensitive slice comparison below
		ascending := func(x, y string) bool {
			return strings.Compare(x, y) < 0 // Ascending order
		}

		// MIG devices we want to setup
		want := []string{"3g.20gb", "2g.10gb", "1g.5gb"}

		// apply the desired MIG configuration by labeling the node;
		// nvidia-mig-manager watches this label and reconfigures the GPU
		g.By(fmt.Sprintf("labeling node: %s for nvidia.com/mig.config: %s", node.Name, "gpu-e2e"))
		err := helper.EnsureNodeLabel(ctx, clientset, node.Name, "nvidia.com/mig.config", "gpu-e2e")
		o.Expect(err).Should(o.BeNil())

		g.By("waiting for the gpu driver to advertise the expected MIG devices")
		advertised := nvidia.NvidiaGPUs{}
		o.Eventually(ctx, func(ctx context.Context) error {
			got, err := operator.ListMIGDevicesUsingNvidiaSMI(ctx, node)
			if err != nil {
				return err
			}
			if !cmp.Equal(want, got.Names(), cmpopts.SortSlices(ascending)) {
				return fmt.Errorf("still waiting for MIG devices to show up, want: %v, got: %v", want, got)
			}
			advertised = got
			return nil
		}).WithPolling(10*time.Second).Should(o.BeNil(), "timeout waiting for expected mig devices")
		t.Logf("the gpu driver is advertising these mig devices %v", advertised)

		// TODO: the DRA driver does not pick up the MIG slices, restarting the plugin seems to do the trick
		o.Expect(driver.RemovePluginFromNode(ctx, node)).Should(o.BeNil())
		g.By("waiting for nvidia-dra-driver-gpu to be ready")
		o.Expect(driver.Ready(ctx, node)).To(o.Succeed(), "nvidia-dra-driver-gpu should be ready")

		g.By("waiting for the dra driver to advertise the mig devices in its resourceslices")
		o.Eventually(ctx, func(ctx context.Context) error {
			all, err := driver.ListPublishedDevicesFromResourceSlice(ctx, node)
			if err != nil {
				return err
			}
			migs := all.FilterBy(func(gpu nvidia.NvidiaGPU) bool { return gpu.Type == "mig" })

			if want, got := advertised.UUIDs(), migs.UUIDs(); !cmp.Equal(want, got, cmpopts.SortSlices(ascending)) {
				return fmt.Errorf("still waiting for the dra driver to publish the MIG devices, want: %v, got: %v", want, got)
			}
			t.Logf("the dra driver has published the mig devices in its resourceslices: %s", framework.PrettyPrintJSON(migs))
			return nil
		}).WithPolling(time.Second).Should(o.BeNil(), "timeout while waiting for the dra driver to advertise its resources")

		// run the actual pod-level validation against the advertised devices
		mig := gpuMIGSpec{
			f:       f,
			class:   driver.Class(),
			node:    node,
			uuids:   advertised.UUIDs(),
			devices: advertised.Names(),
		}
		mig.Test(ctx, g.GinkgoTB())
	})
})
})
})
12 changes: 12 additions & 0 deletions test/extended/dra/helper/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,18 @@ func EnsureNodeLabel(ctx context.Context, clientset kubernetes.Interface, node s
return err
}

// EnsureConfigMap creates want in its namespace, treating an
// AlreadyExists response from the apiserver as success (idempotent
// create). Note that an existing object is NOT compared against or
// updated to match want.
func EnsureConfigMap(ctx context.Context, clientset kubernetes.Interface, want *corev1.ConfigMap) error {
	client := clientset.CoreV1().ConfigMaps(want.Namespace)
	// use the caller's ctx (not context.Background()) so cancellation
	// and deadlines propagate to the API call
	_, err := client.Create(ctx, want, metav1.CreateOptions{})
	if err != nil && !apierrors.IsAlreadyExists(err) {
		return err
	}
	return nil
}

func GetLogs(ctx context.Context, clientset kubernetes.Interface, namespace, name, ctr string) (string, error) {
client := clientset.CoreV1().Pods(namespace)
options := corev1.PodLogOptions{Container: ctr}
Expand Down
138 changes: 138 additions & 0 deletions test/extended/dra/mig_spec.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
package dra

import (
"context"
"fmt"
"strings"
"testing"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"

corev1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/test/e2e/framework"
e2epodutil "k8s.io/kubernetes/test/e2e/framework/pod"
"k8s.io/utils/ptr"

helper "github.com/openshift/origin/test/extended/dra/helper"
nvidia "github.com/openshift/origin/test/extended/dra/nvidia"
)

// gpuMIGSpec exercises a use case with static MIG devices.
// reference:
// - https://docs.nvidia.com/datacenter/tesla/mig-user-guide/
// - https://github.com/NVIDIA/mig-parted
type gpuMIGSpec struct {
	// e2e framework; supplies the test namespace and client set
	f *framework.Framework
	// attribute domain used to qualify DRA device attributes in the CEL
	// selector and MatchAttribute constraint (e.g. "<class>/parentUUID")
	// — presumably the DRA driver's name; confirm against driver.Class()
	class string
	// the node onto which the pod is expected to run
	node *corev1.Node
	// MIG devices, same MIG device can appear twice or more
	devices []string
	// UUIDs of the MIG devices advertised by the gpu driver
	uuids []string
}

// One pod, N containers, each asking for a MIG device on a shared mig-enabled GPU.
// Test builds a ResourceClaimTemplate with one request per entry in
// spec.devices, runs a pod with one container per request, and verifies
// (via nvidia-smi -L inside each container) that the containers were
// handed exactly the advertised MIG devices.
func (spec gpuMIGSpec) Test(ctx context.Context, t testing.TB) {
	namespace := spec.f.Namespace.Name
	clientset := spec.f.ClientSet

	// create a resource claim template that contains a request for each mig device
	template := &resourceapi.ResourceClaimTemplate{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: namespace,
			Name:      "mig-devices",
		},
	}
	for i, device := range spec.devices {
		// request names must be valid identifiers, so "3g.20gb" -> "3g-20gb-<i>";
		// the index suffix keeps names unique when a profile repeats
		name := fmt.Sprintf("%s-%d", strings.ReplaceAll(device, ".", "-"), i)
		template.Spec.Spec.Devices.Requests = append(template.Spec.Spec.Devices.Requests, resourceapi.DeviceRequest{
			Name:            name,
			DeviceClassName: "mig.nvidia.com",
			Selectors: []resourceapi.DeviceSelector{
				{
					// match a device whose advertised MIG profile equals this entry
					CEL: &resourceapi.CELDeviceSelector{
						Expression: "device.attributes['" + spec.class + "'].profile == '" + device + "'",
					},
				},
			},
		})
	}
	// constrain all requests to devices sharing the same parent GPU
	template.Spec.Spec.Devices.Constraints = []resourceapi.DeviceConstraint{
		{
			MatchAttribute: ptr.To(resourceapi.FullyQualifiedName(spec.class + "/parentUUID")),
		},
	}

	// one pod, N container(s), each wants a MIG device
	pod := helper.NewPod(namespace, "pod")
	for i, request := range template.Spec.Spec.Devices.Requests {
		ctr := corev1.Container{
			Name:    fmt.Sprintf("ctr%d", i),
			Image:   "ubuntu:22.04",
			Command: []string{"bash", "-c"},
			// print the visible devices, then idle until terminated
			Args: []string{"nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"},
		}
		ctr.Resources.Claims = []corev1.ResourceClaim{{Name: "mig-devices", Request: request.Name}}
		pod.Spec.Containers = append(pod.Spec.Containers, ctr)
	}
	pod.Spec.ResourceClaims = []corev1.PodResourceClaim{
		{
			Name:                      "mig-devices",
			ResourceClaimTemplateName: ptr.To(template.Name),
		},
	}
	// GPU nodes are tainted; tolerate so the pod can schedule there
	pod.Spec.Tolerations = []corev1.Toleration{
		{
			Key:      "nvidia.com/gpu",
			Operator: corev1.TolerationOpExists,
			Effect:   corev1.TaintEffectNoSchedule,
		},
	}

	g.By("creating external claim and pod")
	_, err := clientset.ResourceV1beta1().ResourceClaimTemplates(namespace).Create(ctx, template, metav1.CreateOptions{})
	o.Expect(err).To(o.BeNil())

	pod, err = clientset.CoreV1().Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
	o.Expect(err).To(o.BeNil())

	// dump pod and claims on exit to aid debugging of failures
	g.DeferCleanup(func(ctx context.Context) {
		g.By(fmt.Sprintf("listing resources in namespace: %s", namespace))
		t.Logf("pod in test namespace: %s\n%s", namespace, framework.PrettyPrintJSON(pod))

		result, err := clientset.ResourceV1beta1().ResourceClaims(namespace).List(ctx, metav1.ListOptions{})
		o.Expect(err).Should(o.BeNil())
		t.Logf("resource claim in test namespace: %s\n%s", namespace, framework.PrettyPrintJSON(result))
	})

	g.By(fmt.Sprintf("waiting for pod %s/%s to be running", pod.Namespace, pod.Name))
	err = e2epodutil.WaitForPodRunningInNamespace(ctx, spec.f.ClientSet, pod)
	o.Expect(err).To(o.BeNil())

	// the pod should run on the expected node
	pod, err = clientset.CoreV1().Pods(namespace).Get(ctx, pod.Name, metav1.GetOptions{})
	o.Expect(err).To(o.BeNil())
	o.Expect(pod.Spec.NodeName).To(o.Equal(spec.node.Name))

	// the claim must be allocated with one result per requested device
	claim, err := helper.GetResourceClaimFor(ctx, clientset, pod)
	o.Expect(err).To(o.BeNil())
	o.Expect(claim).ToNot(o.BeNil())

	o.Expect(claim.Status.Allocation).NotTo(o.BeNil())
	o.Expect(len(claim.Status.Allocation.Devices.Results)).To(o.Equal(len(spec.devices)))

	migUsed := nvidia.NvidiaGPUs{}
	for _, ctr := range pod.Spec.Containers {
		g.By(fmt.Sprintf("running nvidia-smi command into pod %s/%s container: %s", pod.Namespace, pod.Name, ctr.Name))
		lines, err := helper.ExecIntoContainer(ctx, t, spec.f, pod.Name, pod.Namespace, ctr.Name,
			[]string{"nvidia-smi", "-L"})
		o.Expect(err).To(o.BeNil())
		got := nvidia.ExtractMIGDeviceInfoFromNvidiaSMILines(lines)
		migUsed = append(migUsed, got...)
	}
	// allocation order is not guaranteed to match the advertised order
	// (and the same profile may appear more than once), so compare the
	// UUIDs as sets rather than position-by-position
	o.Expect(migUsed.UUIDs()).To(o.ConsistOf(spec.uuids))
}
Loading