Skip to content

Commit 74aaf23

Browse files
committed
fix: increase shm default size and make it configurable
1 parent ae4fb58 commit 74aaf23

File tree

10 files changed

+104
-33
lines changed

10 files changed

+104
-33
lines changed

deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10241,6 +10241,18 @@ spec:
1024110241
serviceName:
1024210242
description: contains the name of the component
1024310243
type: string
10244+
sharedMemory:
10245+
description: SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size).
10246+
properties:
10247+
disabled:
10248+
type: boolean
10249+
size:
10250+
anyOf:
10251+
- type: integer
10252+
- type: string
10253+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
10254+
x-kubernetes-int-or-string: true
10255+
type: object
1024410256
type: object
1024510257
status:
1024610258
description: Status reflects the current observed state of the component deployment.

deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10340,6 +10340,18 @@ spec:
1034010340
serviceName:
1034110341
description: contains the name of the component
1034210342
type: string
10343+
sharedMemory:
10344+
description: SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size).
10345+
properties:
10346+
disabled:
10347+
type: boolean
10348+
size:
10349+
anyOf:
10350+
- type: integer
10351+
- type: string
10352+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
10353+
x-kubernetes-int-or-string: true
10354+
type: object
1034310355
type: object
1034410356
description: |-
1034510357
Services allows per-service overrides of the component deployment settings.

deploy/cloud/operator/api/v1alpha1/common.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,8 @@ type Autoscaling struct {
4444
Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
4545
Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
4646
}
47+
48+
type SharedMemorySpec struct {
49+
Disabled bool `json:"disabled,omitempty"`
50+
Size resource.Quantity `json:"size,omitempty"`
51+
}

deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ type DynamoComponentDeploymentSharedSpec struct {
9292
// Ingress config to expose the component outside the cluster (or through a service mesh).
9393
Ingress *IngressSpec `json:"ingress,omitempty"`
9494

95+
// SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size).
96+
SharedMemory *SharedMemorySpec `json:"sharedMemory,omitempty"`
97+
9598
// +optional
9699
// ExtraPodMetadata adds labels/annotations to the created Pods.
97100
ExtraPodMetadata *dynamoCommon.ExtraPodMetadata `json:"extraPodMetadata,omitempty"`

deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10241,6 +10241,18 @@ spec:
1024110241
serviceName:
1024210242
description: contains the name of the component
1024310243
type: string
10244+
sharedMemory:
10245+
description: SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size).
10246+
properties:
10247+
disabled:
10248+
type: boolean
10249+
size:
10250+
anyOf:
10251+
- type: integer
10252+
- type: string
10253+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
10254+
x-kubernetes-int-or-string: true
10255+
type: object
1024410256
type: object
1024510257
status:
1024610258
description: Status reflects the current observed state of the component deployment.

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10340,6 +10340,18 @@ spec:
1034010340
serviceName:
1034110341
description: contains the name of the component
1034210342
type: string
10343+
sharedMemory:
10344+
description: SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size).
10345+
properties:
10346+
disabled:
10347+
type: boolean
10348+
size:
10349+
anyOf:
10350+
- type: integer
10351+
- type: string
10352+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
10353+
x-kubernetes-int-or-string: true
10354+
type: object
1034310355
type: object
1034410356
description: |-
1034510357
Services allows per-service overrides of the component deployment settings.

deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -793,7 +793,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
793793
VolumeSource: corev1.VolumeSource{
794794
EmptyDir: &corev1.EmptyDirVolumeSource{
795795
Medium: corev1.StorageMediumMemory,
796-
SizeLimit: resource.NewQuantity(5*1024*1024*1024, resource.BinarySI), // 5gi (calculated from memory limit / 4)
796+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
797797
},
798798
},
799799
},
@@ -893,7 +893,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
893893
VolumeSource: corev1.VolumeSource{
894894
EmptyDir: &corev1.EmptyDirVolumeSource{
895895
Medium: corev1.StorageMediumMemory,
896-
SizeLimit: resource.NewQuantity(5*1024*1024*1024, resource.BinarySI), // 5gi (calculated from memory limit / 4)
896+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
897897
},
898898
},
899899
},

deploy/cloud/operator/internal/dynamo/graph.go

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -778,9 +778,10 @@ func GenerateBasePodSpec(
778778
MountPath: *component.PVC.MountPoint,
779779
})
780780
}
781-
shmVolume, shmVolumeMount := generateSharedMemoryVolumeAndMount(&container.Resources)
782-
volumes = append(volumes, shmVolume)
783-
container.VolumeMounts = append(container.VolumeMounts, shmVolumeMount)
781+
if shmVol, shmMount := generateSharedMemoryVolumeAndMount(component.SharedMemory); shmVol != nil && shmMount != nil {
782+
volumes = append(volumes, *shmVol)
783+
container.VolumeMounts = append(container.VolumeMounts, *shmMount)
784+
}
784785

785786
// Apply backend-specific container modifications
786787
multinodeDeployer := MultinodeDeployerFactory(multinodeDeploymentType)
@@ -1179,36 +1180,29 @@ func GenerateBasePodSpecForController(
11791180
return podSpec, nil
11801181
}
11811182

1182-
func generateSharedMemoryVolumeAndMount(resources *corev1.ResourceRequirements) (corev1.Volume, corev1.VolumeMount) {
1183-
sharedMemorySizeLimit := resource.MustParse("512Mi")
1184-
// Check if we have memory limits to work with
1185-
memoryLimit := resources.Limits[corev1.ResourceMemory]
1186-
if !memoryLimit.IsZero() {
1187-
// Use 1/4 of memory limit
1188-
calculatedSize := resource.NewQuantity(memoryLimit.Value()/4, resource.BinarySI)
1189-
// Apply bounds: minimum 512Mi, maximum 8Gi
1190-
minSize := resource.MustParse("512Mi")
1191-
maxSize := resource.MustParse("8Gi")
1192-
1193-
if calculatedSize.Cmp(minSize) > 0 && calculatedSize.Cmp(maxSize) < 0 {
1194-
sharedMemorySizeLimit = *calculatedSize
1195-
} else if calculatedSize.Cmp(maxSize) >= 0 {
1196-
sharedMemorySizeLimit = maxSize // Cap at maximum
1183+
func generateSharedMemoryVolumeAndMount(spec *v1alpha1.SharedMemorySpec) (*corev1.Volume, *corev1.VolumeMount) {
1184+
// Defaults when spec is nil or its fields are unset: shared memory enabled, size 8Gi.
1185+
size := resource.MustParse("8Gi")
1186+
if spec != nil {
1187+
if spec.Disabled {
1188+
return nil, nil
1189+
}
1190+
if !spec.Size.IsZero() {
1191+
size = spec.Size
11971192
}
1198-
// If calculatedSize < minSize, keep the 512Mi base
11991193
}
12001194
volume := corev1.Volume{
12011195
Name: commonconsts.KubeValueNameSharedMemory,
12021196
VolumeSource: corev1.VolumeSource{
12031197
EmptyDir: &corev1.EmptyDirVolumeSource{
12041198
Medium: corev1.StorageMediumMemory,
1205-
SizeLimit: &sharedMemorySizeLimit,
1199+
SizeLimit: &size,
12061200
},
12071201
},
12081202
}
12091203
volumeMount := corev1.VolumeMount{
12101204
Name: commonconsts.KubeValueNameSharedMemory,
12111205
MountPath: "/dev/shm",
12121206
}
1213-
return volume, volumeMount
1207+
return &volume, &volumeMount
12141208
}

deploy/cloud/operator/internal/dynamo/graph_test.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1235,7 +1235,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
12351235
VolumeSource: corev1.VolumeSource{
12361236
EmptyDir: &corev1.EmptyDirVolumeSource{
12371237
Medium: corev1.StorageMediumMemory,
1238-
SizeLimit: resource.NewQuantity(536870912, resource.BinarySI),
1238+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
12391239
},
12401240
},
12411241
},
@@ -1378,7 +1378,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
13781378
VolumeSource: corev1.VolumeSource{
13791379
EmptyDir: &corev1.EmptyDirVolumeSource{
13801380
Medium: corev1.StorageMediumMemory,
1381-
SizeLimit: resource.NewQuantity(536870912, resource.BinarySI),
1381+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
13821382
},
13831383
},
13841384
},
@@ -1733,7 +1733,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
17331733
VolumeSource: corev1.VolumeSource{
17341734
EmptyDir: &corev1.EmptyDirVolumeSource{
17351735
Medium: corev1.StorageMediumMemory,
1736-
SizeLimit: resource.NewQuantity(512*1024*1024, resource.BinarySI),
1736+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
17371737
},
17381738
},
17391739
},
@@ -1883,7 +1883,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
18831883
VolumeSource: corev1.VolumeSource{
18841884
EmptyDir: &corev1.EmptyDirVolumeSource{
18851885
Medium: corev1.StorageMediumMemory,
1886-
SizeLimit: resource.NewQuantity(512*1024*1024, resource.BinarySI),
1886+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
18871887
},
18881888
},
18891889
},
@@ -1989,7 +1989,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
19891989
VolumeSource: corev1.VolumeSource{
19901990
EmptyDir: &corev1.EmptyDirVolumeSource{
19911991
Medium: corev1.StorageMediumMemory,
1992-
SizeLimit: resource.NewQuantity(512*1024*1024, resource.BinarySI),
1992+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
19931993
},
19941994
},
19951995
},
@@ -2134,7 +2134,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
21342134
VolumeSource: corev1.VolumeSource{
21352135
EmptyDir: &corev1.EmptyDirVolumeSource{
21362136
Medium: corev1.StorageMediumMemory,
2137-
SizeLimit: resource.NewQuantity(512*1024*1024, resource.BinarySI),
2137+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
21382138
},
21392139
},
21402140
},
@@ -2509,7 +2509,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
25092509
VolumeSource: corev1.VolumeSource{
25102510
EmptyDir: &corev1.EmptyDirVolumeSource{
25112511
Medium: corev1.StorageMediumMemory,
2512-
SizeLimit: resource.NewQuantity(512*1024*1024, resource.BinarySI),
2512+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
25132513
},
25142514
},
25152515
},
@@ -2648,7 +2648,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
26482648
VolumeSource: corev1.VolumeSource{
26492649
EmptyDir: &corev1.EmptyDirVolumeSource{
26502650
Medium: corev1.StorageMediumMemory,
2651-
SizeLimit: resource.NewQuantity(512*1024*1024, resource.BinarySI),
2651+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
26522652
},
26532653
},
26542654
},
@@ -2755,7 +2755,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
27552755
VolumeSource: corev1.VolumeSource{
27562756
EmptyDir: &corev1.EmptyDirVolumeSource{
27572757
Medium: corev1.StorageMediumMemory,
2758-
SizeLimit: resource.NewQuantity(512*1024*1024, resource.BinarySI),
2758+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
27592759
},
27602760
},
27612761
},
@@ -2899,7 +2899,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
28992899
VolumeSource: corev1.VolumeSource{
29002900
EmptyDir: &corev1.EmptyDirVolumeSource{
29012901
Medium: corev1.StorageMediumMemory,
2902-
SizeLimit: resource.NewQuantity(512*1024*1024, resource.BinarySI),
2902+
SizeLimit: resource.NewQuantity(8*1024*1024*1024, resource.BinarySI),
29032903
},
29042904
},
29052905
},

0 commit comments

Comments
 (0)