diff --git a/deploy/cloud/helm/platform/README.md b/deploy/cloud/helm/platform/README.md index b3f794553c..8c3aefa972 100644 --- a/deploy/cloud/helm/platform/README.md +++ b/deploy/cloud/helm/platform/README.md @@ -113,7 +113,7 @@ The chart includes built-in validation to prevent all operator conflicts: | dynamo-operator.controllerManager.manager.args[0] | string | `"--health-probe-bind-address=:8081"` | Health probe endpoint for Kubernetes health checks | | dynamo-operator.controllerManager.manager.args[1] | string | `"--metrics-bind-address=127.0.0.1:8080"` | Metrics endpoint for Prometheus scraping (localhost only for security) | | dynamo-operator.imagePullSecrets | list | `[]` | Secrets for pulling private container images | -| dynamo-operator.dynamo.groveTerminationDelay | string | `"15m"` | How long to wait before forcefully terminating Grove instances | +| dynamo-operator.dynamo.groveTerminationDelay | string | `"4h"` | How long to wait before forcefully terminating Grove instances | | dynamo-operator.dynamo.internalImages.debugger | string | `"python:3.12-slim"` | Debugger image for troubleshooting deployments | | dynamo-operator.dynamo.enableRestrictedSecurityContext | bool | `false` | Whether to enable restricted security contexts for enhanced security | | dynamo-operator.dynamo.dockerRegistry.useKubernetesSecret | bool | `false` | Whether to use Kubernetes secrets for registry authentication | @@ -134,9 +134,9 @@ The chart includes built-in validation to prevent all operator conflicts: | dynamo-operator.dynamo.mpiRun.sshKeygen.enabled | bool | `true` | Whether to enable SSH key generation for MPI Run | | grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide | | kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide | -| 
etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance | +| etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see https://github.com/bitnami/charts/tree/main/bitnami/etcd (all etcd settings should be prefixed with "etcd."). | | etcd.image.repository | string | `"bitnamilegacy/etcd"` | following bitnami announcement for brownout - https://github.com/bitnami/charts/tree/main/bitnami/etcd#%EF%B8%8F-important-notice-upcoming-changes-to-the-bitnami-catalog, we need to use the legacy repository until we migrate to the new "secure" repository | -| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance | +| nats.enabled | bool | `true` | Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see https://github.com/nats-io/k8s/tree/main/helm/charts/nats (all NATS settings should be prefixed with "nats.").
| ### NATS Configuration diff --git a/deploy/cloud/helm/platform/values.yaml b/deploy/cloud/helm/platform/values.yaml index 2d2cc469be..f1df7228bf 100644 --- a/deploy/cloud/helm/platform/values.yaml +++ b/deploy/cloud/helm/platform/values.yaml @@ -74,7 +74,7 @@ dynamo-operator: # Core Dynamo platform configuration dynamo: # -- How long to wait before forcefully terminating Grove instances - groveTerminationDelay: 15m + groveTerminationDelay: 4h # Internal utility images used by the platform internalImages: @@ -147,10 +147,9 @@ kai-scheduler: enabled: false # etcd configuration - distributed key-value store for operator state -# For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd etcd: - # -- Whether to enable etcd deployment, disable if you want to use an external etcd instance + # -- Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see https://github.com/bitnami/charts/tree/main/bitnami/etcd (all etcd settings should be prefixed with "etcd."). enabled: true image: @@ -195,9 +194,8 @@ etcd: tolerations: [] # NATS configuration - messaging system for operator communication -# For complete configuration options, see: https://github.com/nats-io/k8s/tree/main/helm/charts/nats nats: - # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance + # -- Whether to enable NATS deployment, disable if you want to use an external NATS instance. For complete configuration options, see https://github.com/nats-io/k8s/tree/main/helm/charts/nats (all NATS settings should be prefixed with "nats.").
enabled: true # TLS Certificate Authority configuration for secure communication @@ -338,7 +336,9 @@ nats: # token: << $TOKEN >> # jetstream: # max_memory_store: << 1GB >> - merge: {} + merge: + # 10MB, which allows for larger context sizes: the default NATS max payload size is 1MB, and 256K tokens (int32, 4 bytes each) tip over that 1MB limit. + max_payload: 10485760 patch: [] ############################################################ diff --git a/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go b/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go index 0137b98490..3f57d42efc 100644 --- a/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go +++ b/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go @@ -887,7 +887,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. TimeoutSeconds: 5, PeriodSeconds: 10, SuccessThreshold: 0, - FailureThreshold: 60, + FailureThreshold: 720, }, }, }, diff --git a/deploy/cloud/operator/internal/dynamo/component_worker.go b/deploy/cloud/operator/internal/dynamo/component_worker.go index bd86aa095d..d63812ef07 100644 --- a/deploy/cloud/operator/internal/dynamo/component_worker.go +++ b/deploy/cloud/operator/internal/dynamo/component_worker.go @@ -67,7 +67,7 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont }, PeriodSeconds: 10, TimeoutSeconds: 5, - FailureThreshold: 60, + FailureThreshold: 720, // 10s * 720 = 7200s = 2h } container.Env = append(container.Env, []corev1.EnvVar{ diff --git a/deploy/cloud/operator/internal/dynamo/graph_test.go b/deploy/cloud/operator/internal/dynamo/graph_test.go index db7bfe1b19..a8c29cfbf5 100644 --- a/deploy/cloud/operator/internal/dynamo/graph_test.go +++ b/deploy/cloud/operator/internal/dynamo/graph_test.go @@ -1937,7 +1937,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
TimeoutSeconds: 5, PeriodSeconds: 10, SuccessThreshold: 0, - FailureThreshold: 60, + FailureThreshold: 720, }, }, }, @@ -4721,7 +4721,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { }, PeriodSeconds: 10, TimeoutSeconds: 5, - FailureThreshold: 60, + FailureThreshold: 720, }, Ports: []corev1.ContainerPort{ {