Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions deploy/cloud/helm/platform/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.controllerManager.manager.args[0] | string | `"--health-probe-bind-address=:8081"` | Health probe endpoint for Kubernetes health checks |
| dynamo-operator.controllerManager.manager.args[1] | string | `"--metrics-bind-address=127.0.0.1:8080"` | Metrics endpoint for Prometheus scraping (localhost only for security) |
| dynamo-operator.imagePullSecrets | list | `[]` | Secrets for pulling private container images |
| dynamo-operator.dynamo.groveTerminationDelay | string | `"15m"` | How long to wait before forcefully terminating Grove instances |
| dynamo-operator.dynamo.groveTerminationDelay | string | `"4h"` | How long to wait before forcefully terminating Grove instances |
| dynamo-operator.dynamo.internalImages.debugger | string | `"python:3.12-slim"` | Debugger image for troubleshooting deployments |
| dynamo-operator.dynamo.enableRestrictedSecurityContext | bool | `false` | Whether to enable restricted security contexts for enhanced security |
| dynamo-operator.dynamo.dockerRegistry.useKubernetesSecret | bool | `false` | Whether to use Kubernetes secrets for registry authentication |
Expand All @@ -134,9 +134,9 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.dynamo.mpiRun.sshKeygen.enabled | bool | `true` | Whether to enable SSH key generation for MPI Run |
| grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
| kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance |
| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd (all etcd settings should be prefixed with "etcd.") |
| etcd.image.repository | string | `"bitnamilegacy/etcd"` | following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository |
| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance |
| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats (all NATS settings should be prefixed with "nats.") |

### NATS Configuration

Expand Down
12 changes: 6 additions & 6 deletions deploy/cloud/helm/platform/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ dynamo-operator:
# Core Dynamo platform configuration
dynamo:
# -- How long to wait before forcefully terminating Grove instances
groveTerminationDelay: 15m
groveTerminationDelay: 4h

# Internal utility images used by the platform
internalImages:
Expand Down Expand Up @@ -147,10 +147,9 @@ kai-scheduler:
enabled: false

# etcd configuration - distributed key-value store for operator state
# For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd
etcd:

# -- Whether to enable etcd deployment, disable if you want to use an external etcd instance
# -- Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd (all etcd settings should be prefixed with "etcd.")
enabled: true

image:
Expand Down Expand Up @@ -195,9 +194,8 @@ etcd:
tolerations: []

# NATS configuration - messaging system for operator communication
# For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats
nats:
# -- Whether to enable NATS deployment, disable if you want to use an external NATS instance
# -- Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats (all NATS settings should be prefixed with "nats.")
enabled: true

# TLS Certificate Authority configuration for secure communication
Expand Down Expand Up @@ -338,7 +336,9 @@ nats:
# token: << $TOKEN >>
# jetstream:
# max_memory_store: << 1GB >>
merge: {}
merge:
# 10MB, which allows for larger context sizes: the default NATS max payload size is 1MB, and 256K tokens (each token being an int32, 4 bytes) is enough to tip over that 1MB limit.
max_payload: 10485760
patch: []

############################################################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -887,7 +887,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
TimeoutSeconds: 5,
PeriodSeconds: 10,
SuccessThreshold: 0,
FailureThreshold: 60,
FailureThreshold: 720,
},
},
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
},
PeriodSeconds: 10,
TimeoutSeconds: 5,
FailureThreshold: 60,
FailureThreshold: 720, // 10s * 720 = 7200s = 2h
}

container.Env = append(container.Env, []corev1.EnvVar{
Expand Down
4 changes: 2 additions & 2 deletions deploy/cloud/operator/internal/dynamo/graph_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1937,7 +1937,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
TimeoutSeconds: 5,
PeriodSeconds: 10,
SuccessThreshold: 0,
FailureThreshold: 60,
FailureThreshold: 720,
},
},
},
Expand Down Expand Up @@ -4721,7 +4721,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
},
PeriodSeconds: 10,
TimeoutSeconds: 5,
FailureThreshold: 60,
FailureThreshold: 720,
},
Ports: []corev1.ContainerPort{
{
Expand Down
Loading