Skip to content

Commit 5d90e53

Browse files
authored
fix: mpi flow and add resourceClaim (#3446)
Signed-off-by: Rohan Varma <[email protected]>
1 parent 2626126 commit 5d90e53

File tree

11 files changed

+380
-14
lines changed

11 files changed

+380
-14
lines changed

deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10173,6 +10173,26 @@ spec:
1017310173
Resources requested and limits for this component, including CPU, memory,
1017410174
GPUs/devices, and any runtime-specific resources.
1017510175
properties:
10176+
claims:
10177+
items:
10178+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
10179+
properties:
10180+
name:
10181+
description: |-
10182+
Name must match the name of one entry in pod.spec.resourceClaims of
10183+
the Pod where this field is used. It makes that resource available
10184+
inside a container.
10185+
type: string
10186+
request:
10187+
description: |-
10188+
Request is the name chosen for a request in the referenced claim.
10189+
If empty, everything from the claim is made available, otherwise
10190+
only the result of this request.
10191+
type: string
10192+
required:
10193+
- name
10194+
type: object
10195+
type: array
1017610196
limits:
1017710197
properties:
1017810198
cpu:

deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10307,6 +10307,26 @@ spec:
1030710307
Resources requested and limits for this component, including CPU, memory,
1030810308
GPUs/devices, and any runtime-specific resources.
1030910309
properties:
10310+
claims:
10311+
items:
10312+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
10313+
properties:
10314+
name:
10315+
description: |-
10316+
Name must match the name of one entry in pod.spec.resourceClaims of
10317+
the Pod where this field is used. It makes that resource available
10318+
inside a container.
10319+
type: string
10320+
request:
10321+
description: |-
10322+
Request is the name chosen for a request in the referenced claim.
10323+
If empty, everything from the claim is made available, otherwise
10324+
only the result of this request.
10325+
type: string
10326+
required:
10327+
- name
10328+
type: object
10329+
type: array
1031010330
limits:
1031110331
properties:
1031210332
cpu:

deploy/cloud/operator/api/dynamo/common/common.go

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,8 +32,9 @@ type ResourceItem struct {
3232
}
3333

3434
type Resources struct {
35-
Requests *ResourceItem `json:"requests,omitempty"`
36-
Limits *ResourceItem `json:"limits,omitempty"`
35+
Requests *ResourceItem `json:"requests,omitempty"`
36+
Limits *ResourceItem `json:"limits,omitempty"`
37+
Claims []corev1.ResourceClaim `json:"claims,omitempty"`
3738
}
3839

3940
type DeploymentTargetHPAConf struct {

deploy/cloud/operator/api/dynamo/common/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10173,6 +10173,26 @@ spec:
1017310173
Resources requested and limits for this component, including CPU, memory,
1017410174
GPUs/devices, and any runtime-specific resources.
1017510175
properties:
10176+
claims:
10177+
items:
10178+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
10179+
properties:
10180+
name:
10181+
description: |-
10182+
Name must match the name of one entry in pod.spec.resourceClaims of
10183+
the Pod where this field is used. It makes that resource available
10184+
inside a container.
10185+
type: string
10186+
request:
10187+
description: |-
10188+
Request is the name chosen for a request in the referenced claim.
10189+
If empty, everything from the claim is made available, otherwise
10190+
only the result of this request.
10191+
type: string
10192+
required:
10193+
- name
10194+
type: object
10195+
type: array
1017610196
limits:
1017710197
properties:
1017810198
cpu:

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10307,6 +10307,26 @@ spec:
1030710307
Resources requested and limits for this component, including CPU, memory,
1030810308
GPUs/devices, and any runtime-specific resources.
1030910309
properties:
10310+
claims:
10311+
items:
10312+
description: ResourceClaim references one entry in PodSpec.ResourceClaims.
10313+
properties:
10314+
name:
10315+
description: |-
10316+
Name must match the name of one entry in pod.spec.resourceClaims of
10317+
the Pod where this field is used. It makes that resource available
10318+
inside a container.
10319+
type: string
10320+
request:
10321+
description: |-
10322+
Request is the name chosen for a request in the referenced claim.
10323+
If empty, everything from the claim is made available, otherwise
10324+
only the result of this request.
10325+
type: string
10326+
required:
10327+
- name
10328+
type: object
10329+
type: array
1031010330
limits:
1031110331
properties:
1031210332
cpu:

deploy/cloud/operator/internal/controller_common/resource.go

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -468,6 +468,12 @@ func GetResourcesConfig(resources *common.Resources) (*corev1.ResourceRequiremen
468468
currentResources.Requests[corev1.ResourceName(k)] = q
469469
}
470470
}
471+
if resources.Claims != nil {
472+
if currentResources.Claims == nil {
473+
currentResources.Claims = make([]corev1.ResourceClaim, 0)
474+
}
475+
currentResources.Claims = append(currentResources.Claims, resources.Claims...)
476+
}
471477
return currentResources, nil
472478
}
473479

deploy/cloud/operator/internal/dynamo/backend_trtllm.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -143,12 +143,12 @@ func (b *TRTLLMBackend) setupLeaderContainer(container *corev1.Container, number
143143

144144
// Build mpirun command with explicit SSH configuration and environment variables
145145
// Wrap the entire command (trtllm-llmapi-launch + original command) in bash -c for proper shell interpretation
146-
wrappedCommand := fmt.Sprintf("bash -c 'source /opt/dynamo/venv/bin/activate && trtllm-llmapi-launch %s'", originalCommand)
146+
wrappedCommand := fmt.Sprintf("bash -c 'trtllm-llmapi-launch %s'", originalCommand)
147147

148148
// Generate environment variable flags for mpirun
149149
envVarsStr := generateEnvVarFlags(container.Env)
150150

151-
mpirunCmd := fmt.Sprintf("mpirun --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args \"-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" %s %s",
151+
mpirunCmd := fmt.Sprintf("mpirun --allow-run-as-root --oversubscribe -n %d -H %s --mca pml ob1 --mca plm_rsh_args \"-p %d -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa\" %s %s",
152152
totalGPUs,
153153
workerHosts,
154154
commonconsts.MpiRunSshPort,

0 commit comments

Comments
 (0)