From b539fd260765eeddbf75361ff4837f2fd8be61d1 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 1 Dec 2025 14:12:36 -0700 Subject: [PATCH 01/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- .../nvidia.com_dgdscalingadapters.yaml | 233 ++++++++++++ .../operator/templates/manager-rbac.yaml | 3 + .../api/v1alpha1/dgdscalingadapter_types.go | 200 ++++++++++ .../api/v1alpha1/zz_generated.deepcopy.go | 151 ++++++++ deploy/cloud/operator/cmd/main.go | 10 + .../bases/nvidia.com_dgdscalingadapters.yaml | 233 ++++++++++++ deploy/cloud/operator/config/rbac/role.yaml | 20 +- .../cloud/operator/internal/consts/consts.go | 1 + .../dgdscalingadapter_controller.go | 350 ++++++++++++++++++ .../dynamographdeployment_controller.go | 91 +++++ 10 files changed, 1283 insertions(+), 9 deletions(-) create mode 100644 deploy/cloud/helm/crds/templates/nvidia.com_dgdscalingadapters.yaml create mode 100644 deploy/cloud/operator/api/v1alpha1/dgdscalingadapter_types.go create mode 100644 deploy/cloud/operator/config/crd/bases/nvidia.com_dgdscalingadapters.yaml create mode 100644 deploy/cloud/operator/internal/controller/dgdscalingadapter_controller.go diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dgdscalingadapters.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dgdscalingadapters.yaml new file mode 100644 index 0000000000..bb3816a76d --- /dev/null +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dgdscalingadapters.yaml @@ -0,0 +1,233 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: dgdscalingadapters.nvidia.com +spec: + group: nvidia.com + names: + kind: DGDScalingAdapter + listKind: DGDScalingAdapterList + plural: dgdscalingadapters + shortNames: + - dgdsa + singular: dgdscalingadapter + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: DynamoGraphDeployment name + jsonPath: .spec.dgdRef.name + name: DGD + type: string + - description: Service name + jsonPath: .spec.dgdRef.service + name: SERVICE + type: string + - description: Desired replicas + jsonPath: .spec.replicas + name: DESIRED + type: integer + - description: Current replicas + jsonPath: .status.replicas + name: CURRENT + type: integer + - description: Ready status + jsonPath: .status.conditions[?(@.type=='Ready')].status + name: READY + type: string + - jsonPath: .metadata.creationTimestamp + name: AGE + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + DGDScalingAdapter provides a scaling interface for individual services + within a DynamoGraphDeployment. It implements the Kubernetes scale + subresource, enabling integration with HPA, KEDA, and custom autoscalers. + + The adapter acts as an intermediary between autoscalers and the DGD, + ensuring that only the adapter controller modifies the DGD's service replicas. + This prevents conflicts when multiple autoscaling mechanisms are in play. 
+ properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: DGDScalingAdapterSpec defines the desired state of DGDScalingAdapter + properties: + dgdRef: + description: DGDRef references the DynamoGraphDeployment and the specific service to scale. + properties: + name: + description: Name of the DynamoGraphDeployment + minLength: 1 + type: string + service: + description: Service is the key name of the service within the DGD's spec.services map to scale + minLength: 1 + type: string + required: + - name + - service + type: object + replicas: + description: |- + Replicas is the desired number of replicas for the target service. + This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. + format: int32 + minimum: 0 + type: integer + scalingPolicy: + description: |- + ScalingPolicy defines optional constraints for scaling behavior. + These constraints are enforced by the adapter controller, providing + an additional safety layer beyond HPA's own min/max settings. + properties: + maxReplicas: + description: |- + MaxReplicas is the upper bound for scaling. + The adapter will not scale above this value even if the autoscaler requests it. + format: int32 + minimum: 1 + type: integer + minReplicas: + description: |- + MinReplicas is the lower bound for scaling. 
+ The adapter will not scale below this value even if the autoscaler requests it. + format: int32 + minimum: 0 + type: integer + scaleDownStabilizationSeconds: + default: 0 + description: |- + ScaleDownStabilizationSeconds is the time to wait before scaling down + after the last scale operation. This provides additional protection against + rapid scale oscillations beyond what HPA provides. + format: int32 + minimum: 0 + type: integer + type: object + required: + - dgdRef + - replicas + type: object + status: + description: DGDScalingAdapterStatus defines the observed state of DGDScalingAdapter + properties: + conditions: + description: Conditions represent the latest available observations of the adapter's state. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. 
+ The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + lastScaleTime: + description: LastScaleTime is the last time the adapter scaled the target service. + format: date-time + type: string + replicas: + description: |- + Replicas is the current number of replicas for the target service. + This is synced from the DGD's service replicas. + format: int32 + type: integer + selector: + description: |- + Selector is a label selector string for the pods managed by this adapter. + Required for HPA compatibility via the scale subresource. 
+ type: string + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.selector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} diff --git a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml index 8ab42c0988..e8896b74d3 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml @@ -366,6 +366,7 @@ rules: - apiGroups: - nvidia.com resources: + - dgdscalingadapters - dynamocomponentdeployments - dynamographdeploymentrequests - dynamographdeployments @@ -381,6 +382,7 @@ rules: - apiGroups: - nvidia.com resources: + - dgdscalingadapters/finalizers - dynamocomponentdeployments/finalizers - dynamographdeploymentrequests/finalizers - dynamographdeployments/finalizers @@ -390,6 +392,7 @@ rules: - apiGroups: - nvidia.com resources: + - dgdscalingadapters/status - dynamocomponentdeployments/status - dynamographdeploymentrequests/status - dynamographdeployments/status diff --git a/deploy/cloud/operator/api/v1alpha1/dgdscalingadapter_types.go b/deploy/cloud/operator/api/v1alpha1/dgdscalingadapter_types.go new file mode 100644 index 0000000000..cd299955a3 --- /dev/null +++ b/deploy/cloud/operator/api/v1alpha1/dgdscalingadapter_types.go @@ -0,0 +1,200 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// DGDScalingAdapterSpec defines the desired state of DGDScalingAdapter +type DGDScalingAdapterSpec struct { + // Replicas is the desired number of replicas for the target service. + // This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. + // +kubebuilder:validation:Required + // +kubebuilder:validation:Minimum=0 + Replicas int32 `json:"replicas"` + + // DGDRef references the DynamoGraphDeployment and the specific service to scale. + // +kubebuilder:validation:Required + DGDRef DGDServiceRef `json:"dgdRef"` + + // ScalingPolicy defines optional constraints for scaling behavior. + // These constraints are enforced by the adapter controller, providing + // an additional safety layer beyond HPA's own min/max settings. + // +optional + ScalingPolicy *ScalingPolicy `json:"scalingPolicy,omitempty"` +} + +// DGDServiceRef identifies a specific service within a DynamoGraphDeployment +type DGDServiceRef struct { + // Name of the DynamoGraphDeployment + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + Name string `json:"name"` + + // Service is the key name of the service within the DGD's spec.services map to scale + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + Service string `json:"service"` +} + +// ScalingPolicy defines constraints and behavior for scaling operations +type ScalingPolicy struct { + // MinReplicas is the lower bound for scaling. 
+ // The adapter will not scale below this value even if the autoscaler requests it. + // +kubebuilder:validation:Minimum=0 + // +optional + MinReplicas *int32 `json:"minReplicas,omitempty"` + + // MaxReplicas is the upper bound for scaling. + // The adapter will not scale above this value even if the autoscaler requests it. + // +kubebuilder:validation:Minimum=1 + // +optional + MaxReplicas *int32 `json:"maxReplicas,omitempty"` + + // ScaleDownStabilizationSeconds is the time to wait before scaling down + // after the last scale operation. This provides additional protection against + // rapid scale oscillations beyond what HPA provides. + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=0 + // +optional + ScaleDownStabilizationSeconds *int32 `json:"scaleDownStabilizationSeconds,omitempty"` +} + +// DGDScalingAdapterStatus defines the observed state of DGDScalingAdapter +type DGDScalingAdapterStatus struct { + // Replicas is the current number of replicas for the target service. + // This is synced from the DGD's service replicas. + // +optional + Replicas int32 `json:"replicas,omitempty"` + + // Selector is a label selector string for the pods managed by this adapter. + // Required for HPA compatibility via the scale subresource. + // +optional + Selector string `json:"selector,omitempty"` + + // LastScaleTime is the last time the adapter scaled the target service. + // +optional + LastScaleTime *metav1.Time `json:"lastScaleTime,omitempty"` + + // Conditions represent the latest available observations of the adapter's state. 
+ // +optional + // +patchMergeKey=type + // +patchStrategy=merge + // +listType=map + // +listMapKey=type + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas,selectorpath=.status.selector +// +kubebuilder:printcolumn:name="DGD",type="string",JSONPath=".spec.dgdRef.name",description="DynamoGraphDeployment name" +// +kubebuilder:printcolumn:name="SERVICE",type="string",JSONPath=".spec.dgdRef.service",description="Service name" +// +kubebuilder:printcolumn:name="DESIRED",type="integer",JSONPath=".spec.replicas",description="Desired replicas" +// +kubebuilder:printcolumn:name="CURRENT",type="integer",JSONPath=".status.replicas",description="Current replicas" +// +kubebuilder:printcolumn:name="READY",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status",description="Ready status" +// +kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" +// +kubebuilder:resource:shortName={dgdsa} + +// DGDScalingAdapter provides a scaling interface for individual services +// within a DynamoGraphDeployment. It implements the Kubernetes scale +// subresource, enabling integration with HPA, KEDA, and custom autoscalers. +// +// The adapter acts as an intermediary between autoscalers and the DGD, +// ensuring that only the adapter controller modifies the DGD's service replicas. +// This prevents conflicts when multiple autoscaling mechanisms are in play. 
+type DGDScalingAdapter struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec DGDScalingAdapterSpec `json:"spec,omitempty"` + Status DGDScalingAdapterStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// DGDScalingAdapterList contains a list of DGDScalingAdapter +type DGDScalingAdapterList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []DGDScalingAdapter `json:"items"` +} + +func init() { + SchemeBuilder.Register(&DGDScalingAdapter{}, &DGDScalingAdapterList{}) +} + +// Condition types for DGDScalingAdapter +const ( + // ConditionTypeReady indicates the adapter is synced with DGD and functioning correctly + ConditionTypeAdapterReady = "Ready" +) + +// Condition reasons for DGDScalingAdapter +const ( + // ReasonDGDNotFound indicates the referenced DGD does not exist + ReasonDGDNotFound = "DGDNotFound" + // ReasonServiceNotFound indicates the referenced service does not exist in the DGD + ReasonServiceNotFound = "ServiceNotFound" + // ReasonSynced indicates the adapter is successfully synced with the DGD + ReasonSynced = "Synced" + // ReasonScalingPolicyViolation indicates a scaling request was blocked by policy + ReasonScalingPolicyViolation = "ScalingPolicyViolation" +) + +// SetCondition updates or adds a condition to the adapter's status +func (a *DGDScalingAdapter) SetCondition(condType string, status metav1.ConditionStatus, reason, message string) { + now := metav1.Now() + condition := metav1.Condition{ + Type: condType, + Status: status, + LastTransitionTime: now, + Reason: reason, + Message: message, + ObservedGeneration: a.Generation, + } + + // Update existing condition or append new one + for i, c := range a.Status.Conditions { + if c.Type == condType { + // Only update if status or reason changed + if c.Status != status || c.Reason != reason || c.Message != message { + a.Status.Conditions[i] = condition + } + return + } + } + 
a.Status.Conditions = append(a.Status.Conditions, condition) +} + +// GetCondition returns the condition with the given type, or nil if not found +func (a *DGDScalingAdapter) GetCondition(condType string) *metav1.Condition { + for i := range a.Status.Conditions { + if a.Status.Conditions[i].Type == condType { + return &a.Status.Conditions[i] + } + } + return nil +} + +// IsReady returns true if the adapter is in Ready state +func (a *DGDScalingAdapter) IsReady() bool { + cond := a.GetCondition(ConditionTypeAdapterReady) + return cond != nil && cond.Status == metav1.ConditionTrue +} diff --git a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go index 56d33cd498..513df1db15 100644 --- a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -129,6 +129,127 @@ func (in *ConfigMapKeySelector) DeepCopy() *ConfigMapKeySelector { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DGDScalingAdapter) DeepCopyInto(out *DGDScalingAdapter) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDScalingAdapter. +func (in *DGDScalingAdapter) DeepCopy() *DGDScalingAdapter { + if in == nil { + return nil + } + out := new(DGDScalingAdapter) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DGDScalingAdapter) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DGDScalingAdapterList) DeepCopyInto(out *DGDScalingAdapterList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]DGDScalingAdapter, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDScalingAdapterList. +func (in *DGDScalingAdapterList) DeepCopy() *DGDScalingAdapterList { + if in == nil { + return nil + } + out := new(DGDScalingAdapterList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DGDScalingAdapterList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DGDScalingAdapterSpec) DeepCopyInto(out *DGDScalingAdapterSpec) { + *out = *in + out.DGDRef = in.DGDRef + if in.ScalingPolicy != nil { + in, out := &in.ScalingPolicy, &out.ScalingPolicy + *out = new(ScalingPolicy) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDScalingAdapterSpec. +func (in *DGDScalingAdapterSpec) DeepCopy() *DGDScalingAdapterSpec { + if in == nil { + return nil + } + out := new(DGDScalingAdapterSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DGDScalingAdapterStatus) DeepCopyInto(out *DGDScalingAdapterStatus) { + *out = *in + if in.LastScaleTime != nil { + in, out := &in.LastScaleTime, &out.LastScaleTime + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDScalingAdapterStatus. +func (in *DGDScalingAdapterStatus) DeepCopy() *DGDScalingAdapterStatus { + if in == nil { + return nil + } + out := new(DGDScalingAdapterStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DGDServiceRef) DeepCopyInto(out *DGDServiceRef) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDServiceRef. +func (in *DGDServiceRef) DeepCopy() *DGDServiceRef { + if in == nil { + return nil + } + out := new(DGDServiceRef) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeploymentOverridesSpec) DeepCopyInto(out *DeploymentOverridesSpec) { *out = *in @@ -1085,6 +1206,36 @@ func (in *Resources) DeepCopy() *Resources { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ScalingPolicy) DeepCopyInto(out *ScalingPolicy) { + *out = *in + if in.MinReplicas != nil { + in, out := &in.MinReplicas, &out.MinReplicas + *out = new(int32) + **out = **in + } + if in.MaxReplicas != nil { + in, out := &in.MaxReplicas, &out.MaxReplicas + *out = new(int32) + **out = **in + } + if in.ScaleDownStabilizationSeconds != nil { + in, out := &in.ScaleDownStabilizationSeconds, &out.ScaleDownStabilizationSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScalingPolicy. +func (in *ScalingPolicy) DeepCopy() *ScalingPolicy { + if in == nil { + return nil + } + out := new(ScalingPolicy) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SharedMemorySpec) DeepCopyInto(out *SharedMemorySpec) { *out = *in diff --git a/deploy/cloud/operator/cmd/main.go b/deploy/cloud/operator/cmd/main.go index 4d79cfe3f0..aedd775e54 100644 --- a/deploy/cloud/operator/cmd/main.go +++ b/deploy/cloud/operator/cmd/main.go @@ -578,6 +578,16 @@ func main() { os.Exit(1) } + if err = (&controller.DGDScalingAdapterReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("dgdscalingadapter"), + Config: ctrlConfig, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "DGDScalingAdapter") + os.Exit(1) + } + if err = (&controller.DynamoGraphDeploymentRequestReconciler{ Client: mgr.GetClient(), Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"), diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dgdscalingadapters.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dgdscalingadapters.yaml new file mode 100644 index 0000000000..bb3816a76d --- /dev/null +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dgdscalingadapters.yaml @@ -0,0 +1,233 @@ +# 
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + helm.sh/resource-policy: keep + name: dgdscalingadapters.nvidia.com +spec: + group: nvidia.com + names: + kind: DGDScalingAdapter + listKind: DGDScalingAdapterList + plural: dgdscalingadapters + shortNames: + - dgdsa + singular: dgdscalingadapter + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: DynamoGraphDeployment name + jsonPath: .spec.dgdRef.name + name: DGD + type: string + - description: Service name + jsonPath: .spec.dgdRef.service + name: SERVICE + type: string + - description: Desired replicas + jsonPath: .spec.replicas + name: DESIRED + type: integer + - description: Current replicas + jsonPath: .status.replicas + name: CURRENT + type: integer + - description: Ready status + jsonPath: .status.conditions[?(@.type=='Ready')].status + name: READY + type: string + - jsonPath: .metadata.creationTimestamp + name: AGE + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + DGDScalingAdapter provides a scaling interface for individual services + within a DynamoGraphDeployment. 
It implements the Kubernetes scale + subresource, enabling integration with HPA, KEDA, and custom autoscalers. + + The adapter acts as an intermediary between autoscalers and the DGD, + ensuring that only the adapter controller modifies the DGD's service replicas. + This prevents conflicts when multiple autoscaling mechanisms are in play. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: DGDScalingAdapterSpec defines the desired state of DGDScalingAdapter + properties: + dgdRef: + description: DGDRef references the DynamoGraphDeployment and the specific service to scale. + properties: + name: + description: Name of the DynamoGraphDeployment + minLength: 1 + type: string + service: + description: Service is the key name of the service within the DGD's spec.services map to scale + minLength: 1 + type: string + required: + - name + - service + type: object + replicas: + description: |- + Replicas is the desired number of replicas for the target service. + This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. + format: int32 + minimum: 0 + type: integer + scalingPolicy: + description: |- + ScalingPolicy defines optional constraints for scaling behavior. 
+ These constraints are enforced by the adapter controller, providing + an additional safety layer beyond HPA's own min/max settings. + properties: + maxReplicas: + description: |- + MaxReplicas is the upper bound for scaling. + The adapter will not scale above this value even if the autoscaler requests it. + format: int32 + minimum: 1 + type: integer + minReplicas: + description: |- + MinReplicas is the lower bound for scaling. + The adapter will not scale below this value even if the autoscaler requests it. + format: int32 + minimum: 0 + type: integer + scaleDownStabilizationSeconds: + default: 0 + description: |- + ScaleDownStabilizationSeconds is the time to wait before scaling down + after the last scale operation. This provides additional protection against + rapid scale oscillations beyond what HPA provides. + format: int32 + minimum: 0 + type: integer + type: object + required: + - dgdRef + - replicas + type: object + status: + description: DGDScalingAdapterStatus defines the observed state of DGDScalingAdapter + properties: + conditions: + description: Conditions represent the latest available observations of the adapter's state. + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. 
+ For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + lastScaleTime: + description: LastScaleTime is the last time the adapter scaled the target service. + format: date-time + type: string + replicas: + description: |- + Replicas is the current number of replicas for the target service. + This is synced from the DGD's service replicas. + format: int32 + type: integer + selector: + description: |- + Selector is a label selector string for the pods managed by this adapter. + Required for HPA compatibility via the scale subresource. 
+ type: string + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.selector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} diff --git a/deploy/cloud/operator/config/rbac/role.yaml b/deploy/cloud/operator/config/rbac/role.yaml index b473aa1ad7..1cdcead130 100644 --- a/deploy/cloud/operator/config/rbac/role.yaml +++ b/deploy/cloud/operator/config/rbac/role.yaml @@ -179,6 +179,7 @@ rules: - apiGroups: - nvidia.com resources: + - dgdscalingadapters - dynamocomponentdeployments - dynamographdeploymentrequests - dynamographdeployments @@ -194,15 +195,7 @@ rules: - apiGroups: - nvidia.com resources: - - dynamocomponentdeployments/finalizers - - dynamographdeploymentrequests/finalizers - - dynamographdeployments/finalizers - - dynamomodels/finalizers - verbs: - - update -- apiGroups: - - nvidia.com - resources: + - dgdscalingadapters/status - dynamocomponentdeployments/status - dynamographdeploymentrequests/status - dynamographdeployments/status @@ -211,6 +204,15 @@ rules: - get - patch - update +- apiGroups: + - nvidia.com + resources: + - dynamocomponentdeployments/finalizers + - dynamographdeploymentrequests/finalizers + - dynamographdeployments/finalizers + - dynamomodels/finalizers + verbs: + - update - apiGroups: - scheduling.run.ai resources: diff --git a/deploy/cloud/operator/internal/consts/consts.go b/deploy/cloud/operator/internal/consts/consts.go index 882f9f18d9..13fb938442 100644 --- a/deploy/cloud/operator/internal/consts/consts.go +++ b/deploy/cloud/operator/internal/consts/consts.go @@ -54,6 +54,7 @@ const ( KubeLabelValueTrue = "true" KubeLabelDynamoComponentPod = "nvidia.com/dynamo-component-pod" + KubeLabelServiceName = "nvidia.com/service-name" KubeResourceGPUNvidia = "nvidia.com/gpu" diff --git a/deploy/cloud/operator/internal/controller/dgdscalingadapter_controller.go b/deploy/cloud/operator/internal/controller/dgdscalingadapter_controller.go new 
file mode 100644 index 0000000000..85d254fafe --- /dev/null +++ b/deploy/cloud/operator/internal/controller/dgdscalingadapter_controller.go @@ -0,0 +1,350 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controller + +import ( + "context" + "fmt" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" + commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" +) + +// DGDScalingAdapterReconciler reconciles a DGDScalingAdapter object +type DGDScalingAdapterReconciler struct { + client.Client + Scheme *runtime.Scheme + Recorder 
record.EventRecorder + Config commonController.Config +} + +// +kubebuilder:rbac:groups=nvidia.com,resources=dgdscalingadapters,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=nvidia.com,resources=dgdscalingadapters/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;update;patch + +// Reconcile implements the reconciliation loop for DGDScalingAdapter +func (r *DGDScalingAdapterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + + // 1. Fetch the DGDScalingAdapter + adapter := &nvidiacomv1alpha1.DGDScalingAdapter{} + if err := r.Get(ctx, req.NamespacedName, adapter); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + // Skip reconciliation if being deleted + if !adapter.GetDeletionTimestamp().IsZero() { + logger.V(1).Info("Adapter is being deleted, skipping reconciliation") + return ctrl.Result{}, nil + } + + // 2. Fetch the referenced DGD + dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{} + dgdKey := types.NamespacedName{ + Name: adapter.Spec.DGDRef.Name, + Namespace: adapter.Namespace, + } + if err := r.Get(ctx, dgdKey, dgd); err != nil { + if errors.IsNotFound(err) { + logger.Error(err, "Referenced DGD not found", "dgd", dgdKey) + adapter.SetCondition( + nvidiacomv1alpha1.ConditionTypeAdapterReady, + metav1.ConditionFalse, + nvidiacomv1alpha1.ReasonDGDNotFound, + fmt.Sprintf("DGD %s not found", dgdKey), + ) + statusErr := r.Status().Update(ctx, adapter) + if statusErr != nil { + logger.Error(statusErr, "Failed to update adapter status") + } + return ctrl.Result{}, err + } + return ctrl.Result{}, err + } + + // 3. 
Find the target service in DGD's spec.services map + component, exists := dgd.Spec.Services[adapter.Spec.DGDRef.Service] + if !exists { + logger.Error(nil, "Service not found in DGD", + "service", adapter.Spec.DGDRef.Service, + "dgd", dgd.Name, + "availableServices", getServiceKeys(dgd.Spec.Services)) + adapter.SetCondition( + nvidiacomv1alpha1.ConditionTypeAdapterReady, + metav1.ConditionFalse, + nvidiacomv1alpha1.ReasonServiceNotFound, + fmt.Sprintf("Service %s not found in DGD %s", adapter.Spec.DGDRef.Service, dgd.Name), + ) + statusErr := r.Status().Update(ctx, adapter) + if statusErr != nil { + logger.Error(statusErr, "Failed to update adapter status") + } + return ctrl.Result{}, fmt.Errorf("service %s not found in DGD", adapter.Spec.DGDRef.Service) + } + + // Get current replicas from DGD (default to 1 if not set) + currentReplicas := int32(1) + if component.Replicas != nil { + currentReplicas = *component.Replicas + } + + // 4. Detect out-of-band DGD changes (Scenario 1: User manually edited DGD) + // If DGD replicas differ from adapter status, DGD was modified externally + if currentReplicas != adapter.Status.Replicas { + logger.Info("Detected out-of-band DGD change, syncing adapter from DGD", + "service", adapter.Spec.DGDRef.Service, + "dgdReplicas", currentReplicas, + "adapterStatusReplicas", adapter.Status.Replicas) + + // Sync adapter spec from DGD (treat DGD as source of truth for out-of-band changes) + adapter.Spec.Replicas = currentReplicas + if err := r.Update(ctx, adapter); err != nil { + logger.Error(err, "Failed to sync adapter spec from DGD") + return ctrl.Result{}, err + } + + r.Recorder.Eventf(adapter, corev1.EventTypeNormal, "Synced", + "Synced adapter from DGD manual edit: replicas=%d", currentReplicas) + } + + // 5. 
Apply scaling policy constraints + desiredReplicas, policyViolation := r.applyScalingPolicy(adapter, adapter.Spec.Replicas) + if policyViolation != "" { + logger.Info("Scaling policy violation", "violation", policyViolation, "desired", adapter.Spec.Replicas, "constrained", desiredReplicas) + r.Recorder.Eventf(adapter, corev1.EventTypeWarning, "ScalingPolicyViolation", policyViolation) + } + + // 6. Check scale-down stabilization + if desiredReplicas < currentReplicas { + if !r.canScaleDown(adapter) { + logger.Info("Scale-down blocked by stabilization window", + "current", currentReplicas, + "desired", desiredReplicas, + "lastScaleTime", adapter.Status.LastScaleTime) + desiredReplicas = currentReplicas + } + } + + // 7. Update DGD if replicas changed + if currentReplicas != desiredReplicas { + // Update the service's replicas in DGD + component.Replicas = &desiredReplicas + dgd.Spec.Services[adapter.Spec.DGDRef.Service] = component + + if err := r.Update(ctx, dgd); err != nil { + logger.Error(err, "Failed to update DGD") + r.Recorder.Eventf(adapter, corev1.EventTypeWarning, "UpdateFailed", + "Failed to update DGD %s: %v", dgd.Name, err) + return ctrl.Result{}, err + } + + logger.Info("Scaled service", + "dgd", dgd.Name, + "service", adapter.Spec.DGDRef.Service, + "from", currentReplicas, + "to", desiredReplicas) + + r.Recorder.Eventf(adapter, corev1.EventTypeNormal, "Scaled", + "Scaled service %s from %d to %d replicas", adapter.Spec.DGDRef.Service, currentReplicas, desiredReplicas) + + // Record scaling event + now := metav1.Now() + adapter.Status.LastScaleTime = &now + } + + // 8. 
Update adapter status + adapter.Status.Replicas = desiredReplicas + adapter.Status.Selector = r.buildPodSelector(dgd, adapter.Spec.DGDRef.Service) + + adapter.SetCondition( + nvidiacomv1alpha1.ConditionTypeAdapterReady, + metav1.ConditionTrue, + nvidiacomv1alpha1.ReasonSynced, + "Adapter synced with DGD", + ) + + if err := r.Status().Update(ctx, adapter); err != nil { + logger.Error(err, "Failed to update adapter status") + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +// applyScalingPolicy enforces min/max constraints +// Returns the constrained replica count and a violation message (empty if no violation) +func (r *DGDScalingAdapterReconciler) applyScalingPolicy(adapter *nvidiacomv1alpha1.DGDScalingAdapter, desired int32) (int32, string) { + if adapter.Spec.ScalingPolicy == nil { + return desired, "" + } + + policy := adapter.Spec.ScalingPolicy + + if policy.MinReplicas != nil && desired < *policy.MinReplicas { + return *policy.MinReplicas, fmt.Sprintf("Desired replicas %d below minimum %d", desired, *policy.MinReplicas) + } + + if policy.MaxReplicas != nil && desired > *policy.MaxReplicas { + return *policy.MaxReplicas, fmt.Sprintf("Desired replicas %d exceeds maximum %d", desired, *policy.MaxReplicas) + } + + return desired, "" +} + +// canScaleDown checks if scale-down is allowed based on stabilization window +func (r *DGDScalingAdapterReconciler) canScaleDown(adapter *nvidiacomv1alpha1.DGDScalingAdapter) bool { + if adapter.Spec.ScalingPolicy == nil || + adapter.Spec.ScalingPolicy.ScaleDownStabilizationSeconds == nil || + *adapter.Spec.ScalingPolicy.ScaleDownStabilizationSeconds == 0 { + return true + } + + if adapter.Status.LastScaleTime == nil { + return true + } + + stabilization := time.Duration(*adapter.Spec.ScalingPolicy.ScaleDownStabilizationSeconds) * time.Second + return time.Since(adapter.Status.LastScaleTime.Time) >= stabilization +} + +// buildPodSelector constructs a label selector for the pods managed by this service +func (r 
*DGDScalingAdapterReconciler) buildPodSelector(dgd *nvidiacomv1alpha1.DynamoGraphDeployment, serviceName string) string { + return fmt.Sprintf("%s=%s,%s=%s", + consts.KubeLabelDynamoGraphDeploymentName, dgd.Name, + consts.KubeLabelServiceName, serviceName) +} + +// getServiceKeys returns the keys of the services map for logging purposes +func getServiceKeys(services map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) []string { + keys := make([]string, 0, len(services)) + for k := range services { + keys = append(keys, k) + } + return keys +} + +// SetupWithManager sets up the controller with the Manager +func (r *DGDScalingAdapterReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&nvidiacomv1alpha1.DGDScalingAdapter{}, builder.WithPredicates( + predicate.GenerationChangedPredicate{}, + )). + Named("dgdscalingadapter"). + // Watch DGDs to sync status when DGD service replicas change + Watches( + &nvidiacomv1alpha1.DynamoGraphDeployment{}, + handler.EnqueueRequestsFromMapFunc(r.findAdaptersForDGD), + builder.WithPredicates(predicate.Funcs{ + CreateFunc: func(ce event.CreateEvent) bool { return false }, + DeleteFunc: func(de event.DeleteEvent) bool { return true }, + UpdateFunc: func(ue event.UpdateEvent) bool { + // Only trigger on spec changes (not status) + oldDGD, okOld := ue.ObjectOld.(*nvidiacomv1alpha1.DynamoGraphDeployment) + newDGD, okNew := ue.ObjectNew.(*nvidiacomv1alpha1.DynamoGraphDeployment) + if !okOld || !okNew { + return false + } + // Trigger if services map changed + return !servicesEqual(oldDGD.Spec.Services, newDGD.Spec.Services) + }, + GenericFunc: func(ge event.GenericEvent) bool { return false }, + }), + ). + WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)). 
+ Complete(r) +} + +// servicesEqual compares two services maps to detect changes in replica counts +func servicesEqual(old, new map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) bool { + if len(old) != len(new) { + return false + } + + for key, oldSvc := range old { + newSvc, exists := new[key] + if !exists { + return false + } + + // Compare replicas + oldReplicas := int32(1) + if oldSvc.Replicas != nil { + oldReplicas = *oldSvc.Replicas + } + + newReplicas := int32(1) + if newSvc.Replicas != nil { + newReplicas = *newSvc.Replicas + } + + if oldReplicas != newReplicas { + return false + } + } + + return true +} + +// findAdaptersForDGD maps DGD changes to adapter reconcile requests +// Uses label selector to efficiently query only adapters for this specific DGD +func (r *DGDScalingAdapterReconciler) findAdaptersForDGD(ctx context.Context, obj client.Object) []reconcile.Request { + dgd, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeployment) + if !ok { + return nil + } + + // Use label selector to filter at API level (more efficient than in-memory filtering) + adapterList := &nvidiacomv1alpha1.DGDScalingAdapterList{} + if err := r.List(ctx, adapterList, + client.InNamespace(dgd.Namespace), + client.MatchingLabels{consts.KubeLabelDynamoGraphDeploymentName: dgd.Name}, + ); err != nil { + log.FromContext(ctx).Error(err, "Failed to list adapters for DGD", "dgd", dgd.Name) + return nil + } + + // All returned adapters are guaranteed to belong to this DGD + requests := make([]reconcile.Request, 0, len(adapterList.Items)) + for i := range adapterList.Items { + requests = append(requests, reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: adapterList.Items[i].Name, + Namespace: adapterList.Items[i].Namespace, + }, + }) + } + + return requests +} diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 22dcdb5490..0b7ca2acc5 
100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -86,6 +86,7 @@ type DynamoGraphDeploymentReconciler struct { // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update +// +kubebuilder:rbac:groups=nvidia.com,resources=dgdscalingadapters,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch // +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch @@ -225,6 +226,13 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context return "", "", "", fmt.Errorf("failed to reconcile top-level PVCs: %w", err) } + // Reconcile DGDScalingAdapters for each service + err = r.reconcileScalingAdapters(ctx, dynamoDeployment) + if err != nil { + logger.Error(err, "Failed to reconcile scaling adapters") + return "", "", "", fmt.Errorf("failed to reconcile scaling adapters: %w", err) + } + // Reconcile the SA, Role and RoleBinding if k8s discovery is enabled err = r.reconcileK8sDiscoveryResources(ctx, dynamoDeployment) if err != nil { @@ -607,6 +615,82 @@ func (r *DynamoGraphDeploymentReconciler) reconcilePVCs(ctx context.Context, dyn return nil } +// reconcileScalingAdapters ensures a DGDScalingAdapter exists for each service in the DGD +// This enables pluggable autoscaling via HPA, KEDA, or Planner +func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error { 
+ logger := log.FromContext(ctx) + + // Create or update an adapter for each service using SyncResource pattern + for serviceName, component := range dynamoDeployment.Spec.Services { + // Get current replicas (default to 1 if not set) + currentReplicas := int32(1) + if component.Replicas != nil { + currentReplicas = *component.Replicas + } + + // Use SyncResource to handle creation/updates + _, _, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DGDScalingAdapter, bool, error) { + adapterName := generateAdapterName(dynamoDeployment.Name, serviceName) + adapter := &nvidiacomv1alpha1.DGDScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: adapterName, + Namespace: dynamoDeployment.Namespace, + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name, + consts.KubeLabelServiceName: serviceName, + }, + }, + Spec: nvidiacomv1alpha1.DGDScalingAdapterSpec{ + Replicas: currentReplicas, + DGDRef: nvidiacomv1alpha1.DGDServiceRef{ + Name: dynamoDeployment.Name, + Service: serviceName, + }, + }, + } + return adapter, false, nil + }) + + if err != nil { + logger.Error(err, "Failed to sync DGDScalingAdapter", "service", serviceName) + return err + } + } + + // Clean up orphaned adapters (services that no longer exist in DGD) + adapterList := &nvidiacomv1alpha1.DGDScalingAdapterList{} + if err := r.List(ctx, adapterList, + client.InNamespace(dynamoDeployment.Namespace), + client.MatchingLabels{consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name}, + ); err != nil { + logger.Error(err, "Failed to list DGDScalingAdapters") + return err + } + + for i := range adapterList.Items { + adapter := &adapterList.Items[i] + serviceName := adapter.Spec.DGDRef.Service + + // Check if service still exists in DGD + if _, exists := dynamoDeployment.Spec.Services[serviceName]; !exists { + logger.Info("Deleting orphaned DGDScalingAdapter", "adapter", adapter.Name, "service", serviceName) 
+ if err := r.Delete(ctx, adapter); err != nil && !errors.IsNotFound(err) { + logger.Error(err, "Failed to delete orphaned adapter", "adapter", adapter.Name) + return err + } + r.Recorder.Eventf(dynamoDeployment, corev1.EventTypeNormal, "AdapterDeleted", + "Deleted orphaned scaling adapter %s for removed service %s", adapter.Name, serviceName) + } + } + + return nil +} + +// generateAdapterName creates a consistent name for a DGDScalingAdapter +func generateAdapterName(dgdName, serviceName string) string { + return fmt.Sprintf("%s-%s", dgdName, serviceName) +} + func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error { // for now doing nothing return nil @@ -626,6 +710,13 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err UpdateFunc: func(de event.UpdateEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true }, })). + Owns(&nvidiacomv1alpha1.DGDScalingAdapter{}, builder.WithPredicates(predicate.Funcs{ + // ignore creation cause we don't want to be called again after we create the adapter + CreateFunc: func(ce event.CreateEvent) bool { return false }, + DeleteFunc: func(de event.DeleteEvent) bool { return true }, + UpdateFunc: func(de event.UpdateEvent) bool { return false }, // Adapter updates are handled by adapter controller + GenericFunc: func(ge event.GenericEvent) bool { return false }, + })). 
Owns(&corev1.PersistentVolumeClaim{}, builder.WithPredicates(predicate.Funcs{ // ignore creation cause we don't want to be called again after we create the PVC CreateFunc: func(ce event.CreateEvent) bool { return false }, From 71cad604aabf223676e60776584c9098f5aeed74 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 1 Dec 2025 14:52:31 -0700 Subject: [PATCH 02/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- ...dynamographdeploymentscalingadapters.yaml} | 16 +- .../operator/templates/manager-rbac.yaml | 5 +- ...amographdeploymentscalingadapter_types.go} | 42 +-- .../api/v1alpha1/zz_generated.deepcopy.go | 242 +++++++++--------- deploy/cloud/operator/cmd/main.go | 2 +- ...dynamographdeploymentscalingadapters.yaml} | 16 +- deploy/cloud/operator/config/rbac/role.yaml | 21 +- .../operator/internal/controller/common.go | 40 +++ .../dynamographdeployment_controller.go | 24 +- ...aphdeploymentscalingadapter_controller.go} | 70 ++--- 10 files changed, 239 insertions(+), 239 deletions(-) rename deploy/cloud/helm/crds/templates/{nvidia.com_dgdscalingadapters.yaml => nvidia.com_dynamographdeploymentscalingadapters.yaml} (94%) rename deploy/cloud/operator/api/v1alpha1/{dgdscalingadapter_types.go => dynamographdeploymentscalingadapter_types.go} (80%) rename deploy/cloud/operator/config/crd/bases/{nvidia.com_dgdscalingadapters.yaml => nvidia.com_dynamographdeploymentscalingadapters.yaml} (94%) rename deploy/cloud/operator/internal/controller/{dgdscalingadapter_controller.go => dynamographdeploymentscalingadapter_controller.go} (82%) diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dgdscalingadapters.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml similarity index 94% rename from deploy/cloud/helm/crds/templates/nvidia.com_dgdscalingadapters.yaml rename to deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml index bb3816a76d..bc2fad3e21 100644 --- 
a/deploy/cloud/helm/crds/templates/nvidia.com_dgdscalingadapters.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml @@ -20,16 +20,16 @@ metadata: annotations: controller-gen.kubebuilder.io/version: v0.16.4 helm.sh/resource-policy: keep - name: dgdscalingadapters.nvidia.com + name: dynamographdeploymentscalingadapters.nvidia.com spec: group: nvidia.com names: - kind: DGDScalingAdapter - listKind: DGDScalingAdapterList - plural: dgdscalingadapters + kind: DynamoGraphDeploymentScalingAdapter + listKind: DynamoGraphDeploymentScalingAdapterList + plural: dynamographdeploymentscalingadapters shortNames: - dgdsa - singular: dgdscalingadapter + singular: dynamographdeploymentscalingadapter scope: Namespaced versions: - additionalPrinterColumns: @@ -60,7 +60,7 @@ spec: schema: openAPIV3Schema: description: |- - DGDScalingAdapter provides a scaling interface for individual services + DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services within a DynamoGraphDeployment. It implements the Kubernetes scale subresource, enabling integration with HPA, KEDA, and custom autoscalers. @@ -86,7 +86,7 @@ spec: metadata: type: object spec: - description: DGDScalingAdapterSpec defines the desired state of DGDScalingAdapter + description: DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter properties: dgdRef: description: DGDRef references the DynamoGraphDeployment and the specific service to scale. @@ -145,7 +145,7 @@ spec: - replicas type: object status: - description: DGDScalingAdapterStatus defines the observed state of DGDScalingAdapter + description: DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter properties: conditions: description: Conditions represent the latest available observations of the adapter's state. 
diff --git a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml index e8896b74d3..7ae1eb6c5d 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml @@ -366,10 +366,10 @@ rules: - apiGroups: - nvidia.com resources: - - dgdscalingadapters - dynamocomponentdeployments - dynamographdeploymentrequests - dynamographdeployments + - dynamographdeploymentscalingadapters - dynamomodels verbs: - create @@ -382,7 +382,6 @@ rules: - apiGroups: - nvidia.com resources: - - dgdscalingadapters/finalizers - dynamocomponentdeployments/finalizers - dynamographdeploymentrequests/finalizers - dynamographdeployments/finalizers @@ -392,10 +391,10 @@ rules: - apiGroups: - nvidia.com resources: - - dgdscalingadapters/status - dynamocomponentdeployments/status - dynamographdeploymentrequests/status - dynamographdeployments/status + - dynamographdeploymentscalingadapters/status - dynamomodels/status verbs: - get diff --git a/deploy/cloud/operator/api/v1alpha1/dgdscalingadapter_types.go b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go similarity index 80% rename from deploy/cloud/operator/api/v1alpha1/dgdscalingadapter_types.go rename to deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go index cd299955a3..fa0b3c7f90 100644 --- a/deploy/cloud/operator/api/v1alpha1/dgdscalingadapter_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go @@ -21,8 +21,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// DGDScalingAdapterSpec defines the desired state of DGDScalingAdapter -type DGDScalingAdapterSpec struct { +// DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter +type DynamoGraphDeploymentScalingAdapterSpec struct { // 
Replicas is the desired number of replicas for the target service. // This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. // +kubebuilder:validation:Required @@ -31,7 +31,7 @@ type DGDScalingAdapterSpec struct { // DGDRef references the DynamoGraphDeployment and the specific service to scale. // +kubebuilder:validation:Required - DGDRef DGDServiceRef `json:"dgdRef"` + DGDRef DynamoGraphDeploymentServiceRef `json:"dgdRef"` // ScalingPolicy defines optional constraints for scaling behavior. // These constraints are enforced by the adapter controller, providing @@ -40,8 +40,8 @@ type DGDScalingAdapterSpec struct { ScalingPolicy *ScalingPolicy `json:"scalingPolicy,omitempty"` } -// DGDServiceRef identifies a specific service within a DynamoGraphDeployment -type DGDServiceRef struct { +// DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment +type DynamoGraphDeploymentServiceRef struct { // Name of the DynamoGraphDeployment // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 @@ -76,8 +76,8 @@ type ScalingPolicy struct { ScaleDownStabilizationSeconds *int32 `json:"scaleDownStabilizationSeconds,omitempty"` } -// DGDScalingAdapterStatus defines the observed state of DGDScalingAdapter -type DGDScalingAdapterStatus struct { +// DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter +type DynamoGraphDeploymentScalingAdapterStatus struct { // Replicas is the current number of replicas for the target service. // This is synced from the DGD's service replicas. 
// +optional @@ -112,41 +112,41 @@ type DGDScalingAdapterStatus struct { // +kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // +kubebuilder:resource:shortName={dgdsa} -// DGDScalingAdapter provides a scaling interface for individual services +// DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services // within a DynamoGraphDeployment. It implements the Kubernetes scale // subresource, enabling integration with HPA, KEDA, and custom autoscalers. // // The adapter acts as an intermediary between autoscalers and the DGD, // ensuring that only the adapter controller modifies the DGD's service replicas. // This prevents conflicts when multiple autoscaling mechanisms are in play. -type DGDScalingAdapter struct { +type DynamoGraphDeploymentScalingAdapter struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Spec DGDScalingAdapterSpec `json:"spec,omitempty"` - Status DGDScalingAdapterStatus `json:"status,omitempty"` + Spec DynamoGraphDeploymentScalingAdapterSpec `json:"spec,omitempty"` + Status DynamoGraphDeploymentScalingAdapterStatus `json:"status,omitempty"` } // +kubebuilder:object:root=true -// DGDScalingAdapterList contains a list of DGDScalingAdapter -type DGDScalingAdapterList struct { +// DynamoGraphDeploymentScalingAdapterList contains a list of DynamoGraphDeploymentScalingAdapter +type DynamoGraphDeploymentScalingAdapterList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata,omitempty"` - Items []DGDScalingAdapter `json:"items"` + Items []DynamoGraphDeploymentScalingAdapter `json:"items"` } func init() { - SchemeBuilder.Register(&DGDScalingAdapter{}, &DGDScalingAdapterList{}) + SchemeBuilder.Register(&DynamoGraphDeploymentScalingAdapter{}, &DynamoGraphDeploymentScalingAdapterList{}) } -// Condition types for DGDScalingAdapter +// Condition types for DynamoGraphDeploymentScalingAdapter const ( - // ConditionTypeReady indicates the 
adapter is synced with DGD and functioning correctly + // ConditionTypeAdapterReady indicates the adapter is synced with DGD and functioning correctly ConditionTypeAdapterReady = "Ready" ) -// Condition reasons for DGDScalingAdapter +// Condition reasons for DynamoGraphDeploymentScalingAdapter const ( // ReasonDGDNotFound indicates the referenced DGD does not exist ReasonDGDNotFound = "DGDNotFound" @@ -159,7 +159,7 @@ const ( ) // SetCondition updates or adds a condition to the adapter's status -func (a *DGDScalingAdapter) SetCondition(condType string, status metav1.ConditionStatus, reason, message string) { +func (a *DynamoGraphDeploymentScalingAdapter) SetCondition(condType string, status metav1.ConditionStatus, reason, message string) { now := metav1.Now() condition := metav1.Condition{ Type: condType, @@ -184,7 +184,7 @@ func (a *DGDScalingAdapter) SetCondition(condType string, status metav1.Conditio } // GetCondition returns the condition with the given type, or nil if not found -func (a *DGDScalingAdapter) GetCondition(condType string) *metav1.Condition { +func (a *DynamoGraphDeploymentScalingAdapter) GetCondition(condType string) *metav1.Condition { for i := range a.Status.Conditions { if a.Status.Conditions[i].Type == condType { return &a.Status.Conditions[i] @@ -194,7 +194,7 @@ func (a *DGDScalingAdapter) GetCondition(condType string) *metav1.Condition { } // IsReady returns true if the adapter is in Ready state -func (a *DGDScalingAdapter) IsReady() bool { +func (a *DynamoGraphDeploymentScalingAdapter) IsReady() bool { cond := a.GetCondition(ConditionTypeAdapterReady) return cond != nil && cond.Status == metav1.ConditionTrue } diff --git a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go index 513df1db15..6f05c9de43 100644 --- a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -129,127 +129,6 @@ func (in 
*ConfigMapKeySelector) DeepCopy() *ConfigMapKeySelector { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DGDScalingAdapter) DeepCopyInto(out *DGDScalingAdapter) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) - in.Status.DeepCopyInto(&out.Status) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDScalingAdapter. -func (in *DGDScalingAdapter) DeepCopy() *DGDScalingAdapter { - if in == nil { - return nil - } - out := new(DGDScalingAdapter) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *DGDScalingAdapter) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DGDScalingAdapterList) DeepCopyInto(out *DGDScalingAdapterList) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]DGDScalingAdapter, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDScalingAdapterList. -func (in *DGDScalingAdapterList) DeepCopy() *DGDScalingAdapterList { - if in == nil { - return nil - } - out := new(DGDScalingAdapterList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
-func (in *DGDScalingAdapterList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DGDScalingAdapterSpec) DeepCopyInto(out *DGDScalingAdapterSpec) { - *out = *in - out.DGDRef = in.DGDRef - if in.ScalingPolicy != nil { - in, out := &in.ScalingPolicy, &out.ScalingPolicy - *out = new(ScalingPolicy) - (*in).DeepCopyInto(*out) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDScalingAdapterSpec. -func (in *DGDScalingAdapterSpec) DeepCopy() *DGDScalingAdapterSpec { - if in == nil { - return nil - } - out := new(DGDScalingAdapterSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DGDScalingAdapterStatus) DeepCopyInto(out *DGDScalingAdapterStatus) { - *out = *in - if in.LastScaleTime != nil { - in, out := &in.LastScaleTime, &out.LastScaleTime - *out = (*in).DeepCopy() - } - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]metav1.Condition, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDScalingAdapterStatus. -func (in *DGDScalingAdapterStatus) DeepCopy() *DGDScalingAdapterStatus { - if in == nil { - return nil - } - out := new(DGDScalingAdapterStatus) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *DGDServiceRef) DeepCopyInto(out *DGDServiceRef) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DGDServiceRef. 
-func (in *DGDServiceRef) DeepCopy() *DGDServiceRef { - if in == nil { - return nil - } - out := new(DGDServiceRef) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DeploymentOverridesSpec) DeepCopyInto(out *DeploymentOverridesSpec) { *out = *in @@ -720,6 +599,127 @@ func (in *DynamoGraphDeploymentRequestStatus) DeepCopy() *DynamoGraphDeploymentR return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamoGraphDeploymentScalingAdapter) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapter) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapter. +func (in *DynamoGraphDeploymentScalingAdapter) DeepCopy() *DynamoGraphDeploymentScalingAdapter { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentScalingAdapter) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DynamoGraphDeploymentScalingAdapter) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DynamoGraphDeploymentScalingAdapterList) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapterList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]DynamoGraphDeploymentScalingAdapter, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapterList. +func (in *DynamoGraphDeploymentScalingAdapterList) DeepCopy() *DynamoGraphDeploymentScalingAdapterList { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentScalingAdapterList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *DynamoGraphDeploymentScalingAdapterList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamoGraphDeploymentScalingAdapterSpec) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapterSpec) { + *out = *in + out.DGDRef = in.DGDRef + if in.ScalingPolicy != nil { + in, out := &in.ScalingPolicy, &out.ScalingPolicy + *out = new(ScalingPolicy) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapterSpec. +func (in *DynamoGraphDeploymentScalingAdapterSpec) DeepCopy() *DynamoGraphDeploymentScalingAdapterSpec { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentScalingAdapterSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DynamoGraphDeploymentScalingAdapterStatus) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapterStatus) { + *out = *in + if in.LastScaleTime != nil { + in, out := &in.LastScaleTime, &out.LastScaleTime + *out = (*in).DeepCopy() + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapterStatus. +func (in *DynamoGraphDeploymentScalingAdapterStatus) DeepCopy() *DynamoGraphDeploymentScalingAdapterStatus { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentScalingAdapterStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DynamoGraphDeploymentServiceRef) DeepCopyInto(out *DynamoGraphDeploymentServiceRef) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentServiceRef. +func (in *DynamoGraphDeploymentServiceRef) DeepCopy() *DynamoGraphDeploymentServiceRef { + if in == nil { + return nil + } + out := new(DynamoGraphDeploymentServiceRef) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *DynamoGraphDeploymentSpec) DeepCopyInto(out *DynamoGraphDeploymentSpec) { *out = *in diff --git a/deploy/cloud/operator/cmd/main.go b/deploy/cloud/operator/cmd/main.go index aedd775e54..dc1a33b262 100644 --- a/deploy/cloud/operator/cmd/main.go +++ b/deploy/cloud/operator/cmd/main.go @@ -578,7 +578,7 @@ func main() { os.Exit(1) } - if err = (&controller.DGDScalingAdapterReconciler{ + if err = (&controller.DynamoGraphDeploymentScalingAdapterReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), Recorder: mgr.GetEventRecorderFor("dgdscalingadapter"), diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dgdscalingadapters.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml similarity index 94% rename from deploy/cloud/operator/config/crd/bases/nvidia.com_dgdscalingadapters.yaml rename to deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml index bb3816a76d..bc2fad3e21 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dgdscalingadapters.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml @@ -20,16 +20,16 @@ metadata: annotations: controller-gen.kubebuilder.io/version: v0.16.4 helm.sh/resource-policy: keep - name: dgdscalingadapters.nvidia.com + name: dynamographdeploymentscalingadapters.nvidia.com spec: group: nvidia.com names: - kind: DGDScalingAdapter - listKind: DGDScalingAdapterList - plural: dgdscalingadapters + kind: DynamoGraphDeploymentScalingAdapter + listKind: DynamoGraphDeploymentScalingAdapterList + plural: dynamographdeploymentscalingadapters shortNames: - dgdsa - singular: dgdscalingadapter + singular: dynamographdeploymentscalingadapter scope: Namespaced versions: - additionalPrinterColumns: @@ -60,7 +60,7 @@ spec: schema: openAPIV3Schema: description: |- - DGDScalingAdapter provides a scaling interface for individual services + DynamoGraphDeploymentScalingAdapter provides a 
scaling interface for individual services within a DynamoGraphDeployment. It implements the Kubernetes scale subresource, enabling integration with HPA, KEDA, and custom autoscalers. @@ -86,7 +86,7 @@ spec: metadata: type: object spec: - description: DGDScalingAdapterSpec defines the desired state of DGDScalingAdapter + description: DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter properties: dgdRef: description: DGDRef references the DynamoGraphDeployment and the specific service to scale. @@ -145,7 +145,7 @@ spec: - replicas type: object status: - description: DGDScalingAdapterStatus defines the observed state of DGDScalingAdapter + description: DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter properties: conditions: description: Conditions represent the latest available observations of the adapter's state. diff --git a/deploy/cloud/operator/config/rbac/role.yaml b/deploy/cloud/operator/config/rbac/role.yaml index 1cdcead130..f45ac9beee 100644 --- a/deploy/cloud/operator/config/rbac/role.yaml +++ b/deploy/cloud/operator/config/rbac/role.yaml @@ -183,6 +183,7 @@ rules: - dynamocomponentdeployments - dynamographdeploymentrequests - dynamographdeployments + - dynamographdeploymentscalingadapters - dynamomodels verbs: - create @@ -195,24 +196,24 @@ rules: - apiGroups: - nvidia.com resources: - - dgdscalingadapters/status + - dynamocomponentdeployments/finalizers + - dynamographdeploymentrequests/finalizers + - dynamographdeployments/finalizers + - dynamomodels/finalizers + verbs: + - update +- apiGroups: + - nvidia.com + resources: - dynamocomponentdeployments/status - dynamographdeploymentrequests/status - dynamographdeployments/status + - dynamographdeploymentscalingadapters/status - dynamomodels/status verbs: - get - patch - update -- apiGroups: - - nvidia.com - resources: - - dynamocomponentdeployments/finalizers - - 
dynamographdeploymentrequests/finalizers - - dynamographdeployments/finalizers - - dynamomodels/finalizers - verbs: - - update - apiGroups: - scheduling.run.ai resources: diff --git a/deploy/cloud/operator/internal/controller/common.go b/deploy/cloud/operator/internal/controller/common.go index 70a70fdead..e41cbe1deb 100644 --- a/deploy/cloud/operator/internal/controller/common.go +++ b/deploy/cloud/operator/internal/controller/common.go @@ -53,3 +53,43 @@ type dockerSecretRetriever interface { // returns a list of secret names associated with the docker registry GetSecrets(namespace, registry string) ([]string, error) } + +// getServiceKeys returns the keys of the services map for logging purposes +func getServiceKeys(services map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec) []string { + keys := make([]string, 0, len(services)) + for k := range services { + keys = append(keys, k) + } + return keys +} + +// servicesEqual compares two services maps to detect changes in replica counts +func servicesEqual(old, new map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec) bool { + if len(old) != len(new) { + return false + } + + for key, oldSvc := range old { + newSvc, exists := new[key] + if !exists { + return false + } + + // Compare replicas + oldReplicas := int32(1) + if oldSvc.Replicas != nil { + oldReplicas = *oldSvc.Replicas + } + + newReplicas := int32(1) + if newSvc.Replicas != nil { + newReplicas = *newSvc.Replicas + } + + if oldReplicas != newReplicas { + return false + } + } + + return true +} diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 0b7ca2acc5..4207682273 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -226,7 +226,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx 
context.Context return "", "", "", fmt.Errorf("failed to reconcile top-level PVCs: %w", err) } - // Reconcile DGDScalingAdapters for each service + // Reconcile DynamoGraphDeploymentScalingAdapters for each service err = r.reconcileScalingAdapters(ctx, dynamoDeployment) if err != nil { logger.Error(err, "Failed to reconcile scaling adapters") @@ -615,7 +615,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcilePVCs(ctx context.Context, dyn return nil } -// reconcileScalingAdapters ensures a DGDScalingAdapter exists for each service in the DGD +// reconcileScalingAdapters ensures a DynamoGraphDeploymentScalingAdapter exists for each service in the DGD // This enables pluggable autoscaling via HPA, KEDA, or Planner func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error { logger := log.FromContext(ctx) @@ -629,9 +629,9 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C } // Use SyncResource to handle creation/updates - _, _, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DGDScalingAdapter, bool, error) { + _, _, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter, bool, error) { adapterName := generateAdapterName(dynamoDeployment.Name, serviceName) - adapter := &nvidiacomv1alpha1.DGDScalingAdapter{ + adapter := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{ ObjectMeta: metav1.ObjectMeta{ Name: adapterName, Namespace: dynamoDeployment.Namespace, @@ -640,9 +640,9 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C consts.KubeLabelServiceName: serviceName, }, }, - Spec: nvidiacomv1alpha1.DGDScalingAdapterSpec{ + Spec: nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ Replicas: currentReplicas, - DGDRef: nvidiacomv1alpha1.DGDServiceRef{ + 
DGDRef: nvidiacomv1alpha1.DynamoGraphDeploymentServiceRef{ Name: dynamoDeployment.Name, Service: serviceName, }, @@ -652,18 +652,18 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C }) if err != nil { - logger.Error(err, "Failed to sync DGDScalingAdapter", "service", serviceName) + logger.Error(err, "Failed to sync DynamoGraphDeploymentScalingAdapter", "service", serviceName) return err } } // Clean up orphaned adapters (services that no longer exist in DGD) - adapterList := &nvidiacomv1alpha1.DGDScalingAdapterList{} + adapterList := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterList{} if err := r.List(ctx, adapterList, client.InNamespace(dynamoDeployment.Namespace), client.MatchingLabels{consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name}, ); err != nil { - logger.Error(err, "Failed to list DGDScalingAdapters") + logger.Error(err, "Failed to list DynamoGraphDeploymentScalingAdapters") return err } @@ -673,7 +673,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C // Check if service still exists in DGD if _, exists := dynamoDeployment.Spec.Services[serviceName]; !exists { - logger.Info("Deleting orphaned DGDScalingAdapter", "adapter", adapter.Name, "service", serviceName) + logger.Info("Deleting orphaned DynamoGraphDeploymentScalingAdapter", "adapter", adapter.Name, "service", serviceName) if err := r.Delete(ctx, adapter); err != nil && !errors.IsNotFound(err) { logger.Error(err, "Failed to delete orphaned adapter", "adapter", adapter.Name) return err @@ -686,7 +686,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C return nil } -// generateAdapterName creates a consistent name for a DGDScalingAdapter +// generateAdapterName creates a consistent name for a DynamoGraphDeploymentScalingAdapter func generateAdapterName(dgdName, serviceName string) string { return fmt.Sprintf("%s-%s", dgdName, serviceName) } @@ -710,7 +710,7 @@ func (r 
*DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err UpdateFunc: func(de event.UpdateEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true }, })). - Owns(&nvidiacomv1alpha1.DGDScalingAdapter{}, builder.WithPredicates(predicate.Funcs{ + Owns(&nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{}, builder.WithPredicates(predicate.Funcs{ // ignore creation cause we don't want to be called again after we create the adapter CreateFunc: func(ce event.CreateEvent) bool { return false }, DeleteFunc: func(de event.DeleteEvent) bool { return true }, diff --git a/deploy/cloud/operator/internal/controller/dgdscalingadapter_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go similarity index 82% rename from deploy/cloud/operator/internal/controller/dgdscalingadapter_controller.go rename to deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go index 85d254fafe..6510ff44c0 100644 --- a/deploy/cloud/operator/internal/controller/dgdscalingadapter_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go @@ -42,24 +42,24 @@ import ( commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common" ) -// DGDScalingAdapterReconciler reconciles a DGDScalingAdapter object -type DGDScalingAdapterReconciler struct { +// DynamoGraphDeploymentScalingAdapterReconciler reconciles a DynamoGraphDeploymentScalingAdapter object +type DynamoGraphDeploymentScalingAdapterReconciler struct { client.Client Scheme *runtime.Scheme Recorder record.EventRecorder Config commonController.Config } -// +kubebuilder:rbac:groups=nvidia.com,resources=dgdscalingadapters,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=nvidia.com,resources=dgdscalingadapters/status,verbs=get;update;patch +// 
+kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;update;patch -// Reconcile implements the reconciliation loop for DGDScalingAdapter -func (r *DGDScalingAdapterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +// Reconcile implements the reconciliation loop for DynamoGraphDeploymentScalingAdapter +func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) - // 1. Fetch the DGDScalingAdapter - adapter := &nvidiacomv1alpha1.DGDScalingAdapter{} + // 1. Fetch the DynamoGraphDeploymentScalingAdapter + adapter := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{} if err := r.Get(ctx, req.NamespacedName, adapter); err != nil { return ctrl.Result{}, client.IgnoreNotFound(err) } @@ -205,7 +205,7 @@ func (r *DGDScalingAdapterReconciler) Reconcile(ctx context.Context, req ctrl.Re // applyScalingPolicy enforces min/max constraints // Returns the constrained replica count and a violation message (empty if no violation) -func (r *DGDScalingAdapterReconciler) applyScalingPolicy(adapter *nvidiacomv1alpha1.DGDScalingAdapter, desired int32) (int32, string) { +func (r *DynamoGraphDeploymentScalingAdapterReconciler) applyScalingPolicy(adapter *nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter, desired int32) (int32, string) { if adapter.Spec.ScalingPolicy == nil { return desired, "" } @@ -224,7 +224,7 @@ func (r *DGDScalingAdapterReconciler) applyScalingPolicy(adapter *nvidiacomv1alp } // canScaleDown checks if scale-down is allowed based on stabilization window -func (r *DGDScalingAdapterReconciler) canScaleDown(adapter 
*nvidiacomv1alpha1.DGDScalingAdapter) bool { +func (r *DynamoGraphDeploymentScalingAdapterReconciler) canScaleDown(adapter *nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter) bool { if adapter.Spec.ScalingPolicy == nil || adapter.Spec.ScalingPolicy.ScaleDownStabilizationSeconds == nil || *adapter.Spec.ScalingPolicy.ScaleDownStabilizationSeconds == 0 { @@ -240,25 +240,16 @@ func (r *DGDScalingAdapterReconciler) canScaleDown(adapter *nvidiacomv1alpha1.DG } // buildPodSelector constructs a label selector for the pods managed by this service -func (r *DGDScalingAdapterReconciler) buildPodSelector(dgd *nvidiacomv1alpha1.DynamoGraphDeployment, serviceName string) string { +func (r *DynamoGraphDeploymentScalingAdapterReconciler) buildPodSelector(dgd *nvidiacomv1alpha1.DynamoGraphDeployment, serviceName string) string { return fmt.Sprintf("%s=%s,%s=%s", consts.KubeLabelDynamoGraphDeploymentName, dgd.Name, consts.KubeLabelServiceName, serviceName) } -// getServiceKeys returns the keys of the services map for logging purposes -func getServiceKeys(services map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) []string { - keys := make([]string, 0, len(services)) - for k := range services { - keys = append(keys, k) - } - return keys -} - // SetupWithManager sets up the controller with the Manager -func (r *DGDScalingAdapterReconciler) SetupWithManager(mgr ctrl.Manager) error { +func (r *DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&nvidiacomv1alpha1.DGDScalingAdapter{}, builder.WithPredicates( + For(&nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{}, builder.WithPredicates( predicate.GenerationChangedPredicate{}, )). Named("dgdscalingadapter"). 
@@ -286,47 +277,16 @@ func (r *DGDScalingAdapterReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(r) } -// servicesEqual compares two services maps to detect changes in replica counts -func servicesEqual(old, new map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) bool { - if len(old) != len(new) { - return false - } - - for key, oldSvc := range old { - newSvc, exists := new[key] - if !exists { - return false - } - - // Compare replicas - oldReplicas := int32(1) - if oldSvc.Replicas != nil { - oldReplicas = *oldSvc.Replicas - } - - newReplicas := int32(1) - if newSvc.Replicas != nil { - newReplicas = *newSvc.Replicas - } - - if oldReplicas != newReplicas { - return false - } - } - - return true -} - // findAdaptersForDGD maps DGD changes to adapter reconcile requests // Uses label selector to efficiently query only adapters for this specific DGD -func (r *DGDScalingAdapterReconciler) findAdaptersForDGD(ctx context.Context, obj client.Object) []reconcile.Request { +func (r *DynamoGraphDeploymentScalingAdapterReconciler) findAdaptersForDGD(ctx context.Context, obj client.Object) []reconcile.Request { dgd, ok := obj.(*nvidiacomv1alpha1.DynamoGraphDeployment) if !ok { return nil } // Use label selector to filter at API level (more efficient than in-memory filtering) - adapterList := &nvidiacomv1alpha1.DGDScalingAdapterList{} + adapterList := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterList{} if err := r.List(ctx, adapterList, client.InNamespace(dgd.Namespace), client.MatchingLabels{consts.KubeLabelDynamoGraphDeploymentName: dgd.Name}, From 2c7bd1752d68060ecc7b071ebf5a528e242846df Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 1 Dec 2025 15:09:13 -0700 Subject: [PATCH 03/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- .../internal/controller/dynamographdeployment_controller.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 4207682273..6f250d8b8f 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -687,8 +687,9 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C } // generateAdapterName creates a consistent name for a DynamoGraphDeploymentScalingAdapter +// Service names are lowercased to comply with Kubernetes DNS subdomain naming requirements func generateAdapterName(dgdName, serviceName string) string { - return fmt.Sprintf("%s-%s", dgdName, serviceName) + return fmt.Sprintf("%s-%s", dgdName, strings.ToLower(serviceName)) } func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error { From 6746518267deebdf9fabaa3b95ec2b01e37195ea Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Tue, 2 Dec 2025 11:18:37 -0700 Subject: [PATCH 04/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- ...nvidia.com_dynamocomponentdeployments.yaml | 591 ------------------ .../nvidia.com_dynamographdeployments.yaml | 591 ------------------ ..._dynamographdeploymentscalingadapters.yaml | 101 +-- deploy/cloud/operator/api/v1alpha1/common.go | 9 - .../dynamocomponentdeployment_types.go | 2 - ...namographdeploymentscalingadapter_types.go | 102 +-- .../api/v1alpha1/zz_generated.deepcopy.go | 77 +-- ...nvidia.com_dynamocomponentdeployments.yaml | 591 ------------------ .../nvidia.com_dynamographdeployments.yaml | 591 ------------------ ..._dynamographdeploymentscalingadapters.yaml | 101 +-- .../cloud/operator/internal/consts/consts.go | 3 - .../dynamocomponentdeployment_controller.go | 72 --- .../dynamographdeployment_controller.go | 2 +- ...raphdeploymentscalingadapter_controller.go | 102 +-- 
.../cloud/operator/internal/dynamo/graph.go | 3 +- .../operator/internal/dynamo/graph_test.go | 28 +- .../dynamocomponentdeployment_test.go | 25 - .../validation/dynamographdeployment_test.go | 22 - .../internal/webhook/validation/shared.go | 28 - .../webhook/validation/shared_test.go | 43 -- docs/_sections/k8s_deployment.rst | 1 + docs/kubernetes/api_reference.md | 172 +++-- docs/kubernetes/autoscaling.md | 402 ++++++++++++ 23 files changed, 554 insertions(+), 3105 deletions(-) create mode 100644 docs/kubernetes/autoscaling.md diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml index 558a5b973d..39b04bf3f0 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml @@ -76,597 +76,6 @@ spec: Annotations to add to generated Kubernetes resources for this component (such as Pod, Service, and Ingress when applicable). type: object - autoscaling: - description: Autoscaling config for this component (replica range, target utilization, etc.). - properties: - behavior: - description: |- - HorizontalPodAutoscalerBehavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). - properties: - scaleDown: - description: |- - scaleDown is scaling policy for scaling Down. - If not set, the default value is to allow to scale down to minReplicas pods, with a - 300 second stabilization window (i.e., the highest recommendation for - the last 300sec is used). - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. - If not set, use the default values: - - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. - - For scale down: allow all pods to be removed in a 15s window. 
- items: - description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. - type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer - tolerance: - anyOf: - - type: integer - - type: string - description: |- - tolerance is the tolerance on the ratio between the current and desired - metric value under which no updates are made to the desired number of - replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not - set, the default cluster-wide tolerance is applied (by default 10%). - - For example, if autoscaling is configured with a memory consumption target of 100Mi, - and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be - triggered when the actual consumption falls below 95Mi or exceeds 101Mi. 
- - This is an alpha field and requires enabling the HPAConfigurableTolerance - feature gate. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - scaleUp: - description: |- - scaleUp is scaling policy for scaling Up. - If not set, the default value is the higher of: - * increase no more than 4 pods per 60 seconds - * double the number of pods per 60 seconds - No stabilization is used. - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. - If not set, use the default values: - - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. - - For scale down: allow all pods to be removed in a 15s window. - items: - description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. - type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. 
- StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer - tolerance: - anyOf: - - type: integer - - type: string - description: |- - tolerance is the tolerance on the ratio between the current and desired - metric value under which no updates are made to the desired number of - replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not - set, the default cluster-wide tolerance is applied (by default 10%). - - For example, if autoscaling is configured with a memory consumption target of 100Mi, - and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be - triggered when the actual consumption falls below 95Mi or exceeds 101Mi. - - This is an alpha field and requires enabling the HPAConfigurableTolerance - feature gate. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - type: object - enabled: - type: boolean - maxReplicas: - type: integer - metrics: - items: - description: |- - MetricSpec specifies how to scale based on a single metric - (only `type` and one other matching field should be set at once). - properties: - containerResource: - description: |- - containerResource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing a single container in - each pod of the current scale target (e.g. CPU or memory). Such metrics are - built in to Kubernetes, and have special scaling options on top of those - available to normal per-pod metrics using the "pods" source. 
- properties: - container: - description: container is the name of the container in the pods of the scaling target - type: string - name: - description: name is the name of the resource in question. - type: string - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - container - - name - - target - type: object - external: - description: |- - external refers to a global metric that is not associated - with any Kubernetes object. It allows autoscaling based on information - coming from components running outside of cluster - (for example length of queue in cloud messaging service, or - QPS from loadbalancer running outside of cluster). 
- properties: - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - object: - description: |- - object refers to a metric describing a single kubernetes object - (for example, hits-per-second on an Ingress object). 
- properties: - describedObject: - description: describedObject specifies the descriptions of a object,such as kind,name apiVersion - properties: - apiVersion: - description: apiVersion is the API version of the referent - type: string - kind: - description: 'kind is the kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - name: - description: 'name is the name of the referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string - required: - - kind - - name - type: object - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. 
- items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - describedObject - - metric - - target - type: object - pods: - description: |- - pods refers to a metric describing each pod in the current scale target - (for example, transactions-processed-per-second). The values will be - averaged together before being compared to the target value. - properties: - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. 
- items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - resource: - description: |- - resource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing each pod in the - current scale target (e.g. CPU or memory). Such metrics are built in to - Kubernetes, and have special scaling options on top of those available - to normal per-pod metrics using the "pods" source. - properties: - name: - description: name is the name of the resource in question. - type: string - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - name - - target - type: object - type: - description: |- - type is the type of metric source. It should be one of "ContainerResource", "External", - "Object", "Pods" or "Resource", each mapping to a matching field in the object. - type: string - required: - - type - type: object - type: array - minReplicas: - type: integer - type: object backendFramework: description: BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm") enum: diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml index ba2b19fef9..7814ec4ea9 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml @@ -218,597 +218,6 @@ spec: Annotations to add to generated Kubernetes resources for this component (such as Pod, Service, and Ingress when applicable). type: object - autoscaling: - description: Autoscaling config for this component (replica range, target utilization, etc.). - properties: - behavior: - description: |- - HorizontalPodAutoscalerBehavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). - properties: - scaleDown: - description: |- - scaleDown is scaling policy for scaling Down. - If not set, the default value is to allow to scale down to minReplicas pods, with a - 300 second stabilization window (i.e., the highest recommendation for - the last 300sec is used). - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. 
- If not set, use the default values: - - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. - - For scale down: allow all pods to be removed in a 15s window. - items: - description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. - type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer - tolerance: - anyOf: - - type: integer - - type: string - description: |- - tolerance is the tolerance on the ratio between the current and desired - metric value under which no updates are made to the desired number of - replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not - set, the default cluster-wide tolerance is applied (by default 10%). 
- - For example, if autoscaling is configured with a memory consumption target of 100Mi, - and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be - triggered when the actual consumption falls below 95Mi or exceeds 101Mi. - - This is an alpha field and requires enabling the HPAConfigurableTolerance - feature gate. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - scaleUp: - description: |- - scaleUp is scaling policy for scaling Up. - If not set, the default value is the higher of: - * increase no more than 4 pods per 60 seconds - * double the number of pods per 60 seconds - No stabilization is used. - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. - If not set, use the default values: - - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. - - For scale down: allow all pods to be removed in a 15s window. - items: - description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. 
- type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer - tolerance: - anyOf: - - type: integer - - type: string - description: |- - tolerance is the tolerance on the ratio between the current and desired - metric value under which no updates are made to the desired number of - replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not - set, the default cluster-wide tolerance is applied (by default 10%). - - For example, if autoscaling is configured with a memory consumption target of 100Mi, - and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be - triggered when the actual consumption falls below 95Mi or exceeds 101Mi. - - This is an alpha field and requires enabling the HPAConfigurableTolerance - feature gate. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - type: object - enabled: - type: boolean - maxReplicas: - type: integer - metrics: - items: - description: |- - MetricSpec specifies how to scale based on a single metric - (only `type` and one other matching field should be set at once). - properties: - containerResource: - description: |- - containerResource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing a single container in - each pod of the current scale target (e.g. CPU or memory). 
Such metrics are - built in to Kubernetes, and have special scaling options on top of those - available to normal per-pod metrics using the "pods" source. - properties: - container: - description: container is the name of the container in the pods of the scaling target - type: string - name: - description: name is the name of the resource in question. - type: string - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - container - - name - - target - type: object - external: - description: |- - external refers to a global metric that is not associated - with any Kubernetes object. It allows autoscaling based on information - coming from components running outside of cluster - (for example length of queue in cloud messaging service, or - QPS from loadbalancer running outside of cluster). 
- properties: - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - object: - description: |- - object refers to a metric describing a single kubernetes object - (for example, hits-per-second on an Ingress object). 
- properties: - describedObject: - description: describedObject specifies the descriptions of a object,such as kind,name apiVersion - properties: - apiVersion: - description: apiVersion is the API version of the referent - type: string - kind: - description: 'kind is the kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - name: - description: 'name is the name of the referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string - required: - - kind - - name - type: object - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. 
- items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - describedObject - - metric - - target - type: object - pods: - description: |- - pods refers to a metric describing each pod in the current scale target - (for example, transactions-processed-per-second). The values will be - averaged together before being compared to the target value. - properties: - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. 
- items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - resource: - description: |- - resource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing each pod in the - current scale target (e.g. CPU or memory). Such metrics are built in to - Kubernetes, and have special scaling options on top of those available - to normal per-pod metrics using the "pods" source. - properties: - name: - description: name is the name of the resource in question. - type: string - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - name - - target - type: object - type: - description: |- - type is the type of metric source. It should be one of "ContainerResource", "External", - "Object", "Pods" or "Resource", each mapping to a matching field in the object. - type: string - required: - - type - type: object - type: array - minReplicas: - type: integer - type: object componentType: description: ComponentType indicates the role of this component (for example, "main"). type: string diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml index bc2fad3e21..4a9ecb3b3b 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml @@ -41,18 +41,10 @@ spec: jsonPath: .spec.dgdRef.service name: SERVICE type: string - - description: Desired replicas - jsonPath: .spec.replicas - name: DESIRED - type: integer - description: Current replicas jsonPath: .status.replicas - name: CURRENT + name: REPLICAS type: integer - - description: Ready status - jsonPath: .status.conditions[?(@.type=='Ready')].status - name: READY - type: string - jsonPath: .metadata.creationTimestamp name: AGE type: date @@ -110,36 +102,6 @@ spec: format: int32 minimum: 0 type: integer - scalingPolicy: - description: |- - ScalingPolicy defines optional constraints for scaling behavior. - These constraints are enforced by the adapter controller, providing - an additional safety layer beyond HPA's own min/max settings. - properties: - maxReplicas: - description: |- - MaxReplicas is the upper bound for scaling. 
- The adapter will not scale above this value even if the autoscaler requests it. - format: int32 - minimum: 1 - type: integer - minReplicas: - description: |- - MinReplicas is the lower bound for scaling. - The adapter will not scale below this value even if the autoscaler requests it. - format: int32 - minimum: 0 - type: integer - scaleDownStabilizationSeconds: - default: 0 - description: |- - ScaleDownStabilizationSeconds is the time to wait before scaling down - after the last scale operation. This provides additional protection against - rapid scale oscillations beyond what HPA provides. - format: int32 - minimum: 0 - type: integer - type: object required: - dgdRef - replicas @@ -147,65 +109,6 @@ spec: status: description: DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter properties: - conditions: - description: Conditions represent the latest available observations of the adapter's state. - items: - description: Condition contains details for one aspect of the current state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. 
- format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map lastScaleTime: description: LastScaleTime is the last time the adapter scaled the target service. format: date-time @@ -213,7 +116,7 @@ spec: replicas: description: |- Replicas is the current number of replicas for the target service. - This is synced from the DGD's service replicas. + This is synced from the DGD's service replicas and is required for the scale subresource. 
format: int32 type: integer selector: diff --git a/deploy/cloud/operator/api/v1alpha1/common.go b/deploy/cloud/operator/api/v1alpha1/common.go index 5673fd5cfd..f967c6dbca 100644 --- a/deploy/cloud/operator/api/v1alpha1/common.go +++ b/deploy/cloud/operator/api/v1alpha1/common.go @@ -18,7 +18,6 @@ package v1alpha1 import ( - autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" ) @@ -53,14 +52,6 @@ type VolumeMount struct { UseAsCompilationCache bool `json:"useAsCompilationCache,omitempty"` } -type Autoscaling struct { - Enabled bool `json:"enabled,omitempty"` - MinReplicas int `json:"minReplicas,omitempty"` - MaxReplicas int `json:"maxReplicas,omitempty"` - Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` - Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` -} - type SharedMemorySpec struct { Disabled bool `json:"disabled,omitempty"` Size resource.Quantity `json:"size,omitempty"` diff --git a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go index 8f484057ab..06202948c6 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go @@ -74,8 +74,6 @@ type DynamoComponentDeploymentSharedSpec struct { // Resources requested and limits for this component, including CPU, memory, // GPUs/devices, and any runtime-specific resources. Resources *Resources `json:"resources,omitempty"` - // Autoscaling config for this component (replica range, target utilization, etc.). - Autoscaling *Autoscaling `json:"autoscaling,omitempty"` // Envs defines additional environment variables to inject into the component containers. 
Envs []corev1.EnvVar `json:"envs,omitempty"` // EnvFromSecret references a Secret whose key/value pairs will be exposed as diff --git a/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go index fa0b3c7f90..eccf7de2f2 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go @@ -32,12 +32,6 @@ type DynamoGraphDeploymentScalingAdapterSpec struct { // DGDRef references the DynamoGraphDeployment and the specific service to scale. // +kubebuilder:validation:Required DGDRef DynamoGraphDeploymentServiceRef `json:"dgdRef"` - - // ScalingPolicy defines optional constraints for scaling behavior. - // These constraints are enforced by the adapter controller, providing - // an additional safety layer beyond HPA's own min/max settings. - // +optional - ScalingPolicy *ScalingPolicy `json:"scalingPolicy,omitempty"` } // DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment @@ -53,33 +47,10 @@ type DynamoGraphDeploymentServiceRef struct { Service string `json:"service"` } -// ScalingPolicy defines constraints and behavior for scaling operations -type ScalingPolicy struct { - // MinReplicas is the lower bound for scaling. - // The adapter will not scale below this value even if the autoscaler requests it. - // +kubebuilder:validation:Minimum=0 - // +optional - MinReplicas *int32 `json:"minReplicas,omitempty"` - - // MaxReplicas is the upper bound for scaling. - // The adapter will not scale above this value even if the autoscaler requests it. - // +kubebuilder:validation:Minimum=1 - // +optional - MaxReplicas *int32 `json:"maxReplicas,omitempty"` - - // ScaleDownStabilizationSeconds is the time to wait before scaling down - // after the last scale operation. 
This provides additional protection against - // rapid scale oscillations beyond what HPA provides. - // +kubebuilder:validation:Minimum=0 - // +kubebuilder:default=0 - // +optional - ScaleDownStabilizationSeconds *int32 `json:"scaleDownStabilizationSeconds,omitempty"` -} - // DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter type DynamoGraphDeploymentScalingAdapterStatus struct { // Replicas is the current number of replicas for the target service. - // This is synced from the DGD's service replicas. + // This is synced from the DGD's service replicas and is required for the scale subresource. // +optional Replicas int32 `json:"replicas,omitempty"` @@ -91,14 +62,6 @@ type DynamoGraphDeploymentScalingAdapterStatus struct { // LastScaleTime is the last time the adapter scaled the target service. // +optional LastScaleTime *metav1.Time `json:"lastScaleTime,omitempty"` - - // Conditions represent the latest available observations of the adapter's state. 
- // +optional - // +patchMergeKey=type - // +patchStrategy=merge - // +listType=map - // +listMapKey=type - Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` } // +kubebuilder:object:root=true @@ -106,9 +69,7 @@ type DynamoGraphDeploymentScalingAdapterStatus struct { // +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas,selectorpath=.status.selector // +kubebuilder:printcolumn:name="DGD",type="string",JSONPath=".spec.dgdRef.name",description="DynamoGraphDeployment name" // +kubebuilder:printcolumn:name="SERVICE",type="string",JSONPath=".spec.dgdRef.service",description="Service name" -// +kubebuilder:printcolumn:name="DESIRED",type="integer",JSONPath=".spec.replicas",description="Desired replicas" -// +kubebuilder:printcolumn:name="CURRENT",type="integer",JSONPath=".status.replicas",description="Current replicas" -// +kubebuilder:printcolumn:name="READY",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status",description="Ready status" +// +kubebuilder:printcolumn:name="REPLICAS",type="integer",JSONPath=".status.replicas",description="Current replicas" // +kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // +kubebuilder:resource:shortName={dgdsa} @@ -139,62 +100,3 @@ type DynamoGraphDeploymentScalingAdapterList struct { func init() { SchemeBuilder.Register(&DynamoGraphDeploymentScalingAdapter{}, &DynamoGraphDeploymentScalingAdapterList{}) } - -// Condition types for DynamoGraphDeploymentScalingAdapter -const ( - // ConditionTypeAdapterReady indicates the adapter is synced with DGD and functioning correctly - ConditionTypeAdapterReady = "Ready" -) - -// Condition reasons for DynamoGraphDeploymentScalingAdapter -const ( - // ReasonDGDNotFound indicates the referenced DGD does not exist - ReasonDGDNotFound = "DGDNotFound" - // ReasonServiceNotFound indicates the referenced service does not exist in the DGD - ReasonServiceNotFound = 
"ServiceNotFound" - // ReasonSynced indicates the adapter is successfully synced with the DGD - ReasonSynced = "Synced" - // ReasonScalingPolicyViolation indicates a scaling request was blocked by policy - ReasonScalingPolicyViolation = "ScalingPolicyViolation" -) - -// SetCondition updates or adds a condition to the adapter's status -func (a *DynamoGraphDeploymentScalingAdapter) SetCondition(condType string, status metav1.ConditionStatus, reason, message string) { - now := metav1.Now() - condition := metav1.Condition{ - Type: condType, - Status: status, - LastTransitionTime: now, - Reason: reason, - Message: message, - ObservedGeneration: a.Generation, - } - - // Update existing condition or append new one - for i, c := range a.Status.Conditions { - if c.Type == condType { - // Only update if status or reason changed - if c.Status != status || c.Reason != reason || c.Message != message { - a.Status.Conditions[i] = condition - } - return - } - } - a.Status.Conditions = append(a.Status.Conditions, condition) -} - -// GetCondition returns the condition with the given type, or nil if not found -func (a *DynamoGraphDeploymentScalingAdapter) GetCondition(condType string) *metav1.Condition { - for i := range a.Status.Conditions { - if a.Status.Conditions[i].Type == condType { - return &a.Status.Conditions[i] - } - } - return nil -} - -// IsReady returns true if the adapter is in Ready state -func (a *DynamoGraphDeploymentScalingAdapter) IsReady() bool { - cond := a.GetCondition(ConditionTypeAdapterReady) - return cond != nil && cond.Status == metav1.ConditionTrue -} diff --git a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go index 6f05c9de43..da95b14745 100644 --- a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -38,40 +38,12 @@ limitations under the License. 
package v1alpha1 import ( - "k8s.io/api/autoscaling/v2" "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *Autoscaling) DeepCopyInto(out *Autoscaling) { - *out = *in - if in.Behavior != nil { - in, out := &in.Behavior, &out.Behavior - *out = new(v2.HorizontalPodAutoscalerBehavior) - (*in).DeepCopyInto(*out) - } - if in.Metrics != nil { - in, out := &in.Metrics, &out.Metrics - *out = make([]v2.MetricSpec, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Autoscaling. -func (in *Autoscaling) DeepCopy() *Autoscaling { - if in == nil { - return nil - } - out := new(Autoscaling) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *BaseCRD) DeepCopyInto(out *BaseCRD) { *out = *in @@ -304,11 +276,6 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent *out = new(Resources) (*in).DeepCopyInto(*out) } - if in.Autoscaling != nil { - in, out := &in.Autoscaling, &out.Autoscaling - *out = new(Autoscaling) - (*in).DeepCopyInto(*out) - } if in.Envs != nil { in, out := &in.Envs, &out.Envs *out = make([]v1.EnvVar, len(*in)) @@ -604,7 +571,7 @@ func (in *DynamoGraphDeploymentScalingAdapter) DeepCopyInto(out *DynamoGraphDepl *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) + out.Spec = in.Spec in.Status.DeepCopyInto(&out.Status) } @@ -662,11 +629,6 @@ func (in *DynamoGraphDeploymentScalingAdapterList) DeepCopyObject() runtime.Obje func (in *DynamoGraphDeploymentScalingAdapterSpec) DeepCopyInto(out *DynamoGraphDeploymentScalingAdapterSpec) { *out = *in out.DGDRef = in.DGDRef - if in.ScalingPolicy != nil { - in, out := &in.ScalingPolicy, &out.ScalingPolicy - *out = new(ScalingPolicy) - (*in).DeepCopyInto(*out) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapterSpec. @@ -686,13 +648,6 @@ func (in *DynamoGraphDeploymentScalingAdapterStatus) DeepCopyInto(out *DynamoGra in, out := &in.LastScaleTime, &out.LastScaleTime *out = (*in).DeepCopy() } - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]metav1.Condition, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentScalingAdapterStatus. @@ -1206,36 +1161,6 @@ func (in *Resources) DeepCopy() *Resources { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *ScalingPolicy) DeepCopyInto(out *ScalingPolicy) { - *out = *in - if in.MinReplicas != nil { - in, out := &in.MinReplicas, &out.MinReplicas - *out = new(int32) - **out = **in - } - if in.MaxReplicas != nil { - in, out := &in.MaxReplicas, &out.MaxReplicas - *out = new(int32) - **out = **in - } - if in.ScaleDownStabilizationSeconds != nil { - in, out := &in.ScaleDownStabilizationSeconds, &out.ScaleDownStabilizationSeconds - *out = new(int32) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScalingPolicy. -func (in *ScalingPolicy) DeepCopy() *ScalingPolicy { - if in == nil { - return nil - } - out := new(ScalingPolicy) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SharedMemorySpec) DeepCopyInto(out *SharedMemorySpec) { *out = *in diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml index 558a5b973d..39b04bf3f0 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml @@ -76,597 +76,6 @@ spec: Annotations to add to generated Kubernetes resources for this component (such as Pod, Service, and Ingress when applicable). type: object - autoscaling: - description: Autoscaling config for this component (replica range, target utilization, etc.). - properties: - behavior: - description: |- - HorizontalPodAutoscalerBehavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). - properties: - scaleDown: - description: |- - scaleDown is scaling policy for scaling Down. 
- If not set, the default value is to allow to scale down to minReplicas pods, with a - 300 second stabilization window (i.e., the highest recommendation for - the last 300sec is used). - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. - If not set, use the default values: - - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. - - For scale down: allow all pods to be removed in a 15s window. - items: - description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. - type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). 
- format: int32 - type: integer - tolerance: - anyOf: - - type: integer - - type: string - description: |- - tolerance is the tolerance on the ratio between the current and desired - metric value under which no updates are made to the desired number of - replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not - set, the default cluster-wide tolerance is applied (by default 10%). - - For example, if autoscaling is configured with a memory consumption target of 100Mi, - and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be - triggered when the actual consumption falls below 95Mi or exceeds 101Mi. - - This is an alpha field and requires enabling the HPAConfigurableTolerance - feature gate. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - scaleUp: - description: |- - scaleUp is scaling policy for scaling Up. - If not set, the default value is the higher of: - * increase no more than 4 pods per 60 seconds - * double the number of pods per 60 seconds - No stabilization is used. - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. - If not set, use the default values: - - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. - - For scale down: allow all pods to be removed in a 15s window. - items: - description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling policy. 
- type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. - type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer - tolerance: - anyOf: - - type: integer - - type: string - description: |- - tolerance is the tolerance on the ratio between the current and desired - metric value under which no updates are made to the desired number of - replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not - set, the default cluster-wide tolerance is applied (by default 10%). - - For example, if autoscaling is configured with a memory consumption target of 100Mi, - and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be - triggered when the actual consumption falls below 95Mi or exceeds 101Mi. - - This is an alpha field and requires enabling the HPAConfigurableTolerance - feature gate. 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - type: object - enabled: - type: boolean - maxReplicas: - type: integer - metrics: - items: - description: |- - MetricSpec specifies how to scale based on a single metric - (only `type` and one other matching field should be set at once). - properties: - containerResource: - description: |- - containerResource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing a single container in - each pod of the current scale target (e.g. CPU or memory). Such metrics are - built in to Kubernetes, and have special scaling options on top of those - available to normal per-pod metrics using the "pods" source. - properties: - container: - description: container is the name of the container in the pods of the scaling target - type: string - name: - description: name is the name of the resource in question. - type: string - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. 
- Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - container - - name - - target - type: object - external: - description: |- - external refers to a global metric that is not associated - with any Kubernetes object. It allows autoscaling based on information - coming from components running outside of cluster - (for example length of queue in cloud messaging service, or - QPS from loadbalancer running outside of cluster). - properties: - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. 
- items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. 
- Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - object: - description: |- - object refers to a metric describing a single kubernetes object - (for example, hits-per-second on an Ingress object). 
- properties: - describedObject: - description: describedObject specifies the descriptions of a object,such as kind,name apiVersion - properties: - apiVersion: - description: apiVersion is the API version of the referent - type: string - kind: - description: 'kind is the kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - name: - description: 'name is the name of the referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string - required: - - kind - - name - type: object - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. 
- items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - describedObject - - metric - - target - type: object - pods: - description: |- - pods refers to a metric describing each pod in the current scale target - (for example, transactions-processed-per-second). The values will be - averaged together before being compared to the target value. - properties: - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. 
- items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - resource: - description: |- - resource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing each pod in the - current scale target (e.g. CPU or memory). Such metrics are built in to - Kubernetes, and have special scaling options on top of those available - to normal per-pod metrics using the "pods" source. - properties: - name: - description: name is the name of the resource in question. - type: string - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - name - - target - type: object - type: - description: |- - type is the type of metric source. It should be one of "ContainerResource", "External", - "Object", "Pods" or "Resource", each mapping to a matching field in the object. - type: string - required: - - type - type: object - type: array - minReplicas: - type: integer - type: object backendFramework: description: BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm") enum: diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml index ba2b19fef9..7814ec4ea9 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml @@ -218,597 +218,6 @@ spec: Annotations to add to generated Kubernetes resources for this component (such as Pod, Service, and Ingress when applicable). type: object - autoscaling: - description: Autoscaling config for this component (replica range, target utilization, etc.). - properties: - behavior: - description: |- - HorizontalPodAutoscalerBehavior configures the scaling behavior of the target - in both Up and Down directions (scaleUp and scaleDown fields respectively). - properties: - scaleDown: - description: |- - scaleDown is scaling policy for scaling Down. - If not set, the default value is to allow to scale down to minReplicas pods, with a - 300 second stabilization window (i.e., the highest recommendation for - the last 300sec is used). - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. 
- If not set, use the default values: - - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. - - For scale down: allow all pods to be removed in a 15s window. - items: - description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. - type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer - tolerance: - anyOf: - - type: integer - - type: string - description: |- - tolerance is the tolerance on the ratio between the current and desired - metric value under which no updates are made to the desired number of - replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not - set, the default cluster-wide tolerance is applied (by default 10%). 
- - For example, if autoscaling is configured with a memory consumption target of 100Mi, - and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be - triggered when the actual consumption falls below 95Mi or exceeds 101Mi. - - This is an alpha field and requires enabling the HPAConfigurableTolerance - feature gate. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - scaleUp: - description: |- - scaleUp is scaling policy for scaling Up. - If not set, the default value is the higher of: - * increase no more than 4 pods per 60 seconds - * double the number of pods per 60 seconds - No stabilization is used. - properties: - policies: - description: |- - policies is a list of potential scaling polices which can be used during scaling. - If not set, use the default values: - - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. - - For scale down: allow all pods to be removed in a 15s window. - items: - description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. - properties: - periodSeconds: - description: |- - periodSeconds specifies the window of time for which the policy should hold true. - PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). - format: int32 - type: integer - type: - description: type is used to specify the scaling policy. - type: string - value: - description: |- - value contains the amount of change which is permitted by the policy. - It must be greater than zero - format: int32 - type: integer - required: - - periodSeconds - - type - - value - type: object - type: array - x-kubernetes-list-type: atomic - selectPolicy: - description: |- - selectPolicy is used to specify which policy should be used. - If not set, the default value Max is used. 
- type: string - stabilizationWindowSeconds: - description: |- - stabilizationWindowSeconds is the number of seconds for which past recommendations should be - considered while scaling up or scaling down. - StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). - If not set, use the default values: - - For scale up: 0 (i.e. no stabilization is done). - - For scale down: 300 (i.e. the stabilization window is 300 seconds long). - format: int32 - type: integer - tolerance: - anyOf: - - type: integer - - type: string - description: |- - tolerance is the tolerance on the ratio between the current and desired - metric value under which no updates are made to the desired number of - replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not - set, the default cluster-wide tolerance is applied (by default 10%). - - For example, if autoscaling is configured with a memory consumption target of 100Mi, - and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be - triggered when the actual consumption falls below 95Mi or exceeds 101Mi. - - This is an alpha field and requires enabling the HPAConfigurableTolerance - feature gate. - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: object - type: object - enabled: - type: boolean - maxReplicas: - type: integer - metrics: - items: - description: |- - MetricSpec specifies how to scale based on a single metric - (only `type` and one other matching field should be set at once). - properties: - containerResource: - description: |- - containerResource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing a single container in - each pod of the current scale target (e.g. CPU or memory). 
Such metrics are - built in to Kubernetes, and have special scaling options on top of those - available to normal per-pod metrics using the "pods" source. - properties: - container: - description: container is the name of the container in the pods of the scaling target - type: string - name: - description: name is the name of the resource in question. - type: string - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - container - - name - - target - type: object - external: - description: |- - external refers to a global metric that is not associated - with any Kubernetes object. It allows autoscaling based on information - coming from components running outside of cluster - (for example length of queue in cloud messaging service, or - QPS from loadbalancer running outside of cluster). 
- properties: - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. 
- type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - object: - description: |- - object refers to a metric describing a single kubernetes object - (for example, hits-per-second on an Ingress object). 
- properties: - describedObject: - description: describedObject specifies the descriptions of a object,such as kind,name apiVersion - properties: - apiVersion: - description: apiVersion is the API version of the referent - type: string - kind: - description: 'kind is the kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - name: - description: 'name is the name of the referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' - type: string - required: - - kind - - name - type: object - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. 
- items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - describedObject - - metric - - target - type: object - pods: - description: |- - pods refers to a metric describing each pod in the current scale target - (for example, transactions-processed-per-second). The values will be - averaged together before being compared to the target value. - properties: - metric: - description: metric identifies the target metric by name and selector - properties: - name: - description: name is the name of the given metric - type: string - selector: - description: |- - selector is the string-encoded form of a standard kubernetes label selector for the given metric - When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. - When unset, just the metricName will be used to gather metrics. - properties: - matchExpressions: - description: matchExpressions is a list of label selector requirements. The requirements are ANDed. - items: - description: |- - A label selector requirement is a selector that contains values, a key, and an operator that - relates the key and values. - properties: - key: - description: key is the label key that the selector applies to. - type: string - operator: - description: |- - operator represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists and DoesNotExist. - type: string - values: - description: |- - values is an array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. This array is replaced during a strategic - merge patch. 
- items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - description: |- - matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels - map is equivalent to an element of matchExpressions, whose key field is "key", the - operator is "In", and the values array contains only "value". The requirements are ANDed. - type: object - type: object - x-kubernetes-map-type: atomic - required: - - name - type: object - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - metric - - target - type: object - resource: - description: |- - resource refers to a resource metric (such as those specified in - requests and limits) known to Kubernetes describing each pod in the - current scale target (e.g. CPU or memory). Such metrics are built in to - Kubernetes, and have special scaling options on top of those available - to normal per-pod metrics using the "pods" source. - properties: - name: - description: name is the name of the resource in question. - type: string - target: - description: target specifies the target value for the given metric - properties: - averageUtilization: - description: |- - averageUtilization is the target value of the average of the - resource metric across all relevant pods, represented as a percentage of - the requested value of the resource for the pods. - Currently only valid for Resource metric source type - format: int32 - type: integer - averageValue: - anyOf: - - type: integer - - type: string - description: |- - averageValue is the target value of the average of the - metric across all relevant pods (as a quantity) - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - type: - description: type represents whether the metric type is Utilization, Value, or AverageValue - type: string - value: - anyOf: - - type: integer - - type: string - description: value is the target value of the metric (as a quantity). 
- pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - required: - - type - type: object - required: - - name - - target - type: object - type: - description: |- - type is the type of metric source. It should be one of "ContainerResource", "External", - "Object", "Pods" or "Resource", each mapping to a matching field in the object. - type: string - required: - - type - type: object - type: array - minReplicas: - type: integer - type: object componentType: description: ComponentType indicates the role of this component (for example, "main"). type: string diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml index bc2fad3e21..4a9ecb3b3b 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml @@ -41,18 +41,10 @@ spec: jsonPath: .spec.dgdRef.service name: SERVICE type: string - - description: Desired replicas - jsonPath: .spec.replicas - name: DESIRED - type: integer - description: Current replicas jsonPath: .status.replicas - name: CURRENT + name: REPLICAS type: integer - - description: Ready status - jsonPath: .status.conditions[?(@.type=='Ready')].status - name: READY - type: string - jsonPath: .metadata.creationTimestamp name: AGE type: date @@ -110,36 +102,6 @@ spec: format: int32 minimum: 0 type: integer - scalingPolicy: - description: |- - ScalingPolicy defines optional constraints for scaling behavior. - These constraints are enforced by the adapter controller, providing - an additional safety layer beyond HPA's own min/max settings. - properties: - maxReplicas: - description: |- - MaxReplicas is the upper bound for scaling. 
- The adapter will not scale above this value even if the autoscaler requests it. - format: int32 - minimum: 1 - type: integer - minReplicas: - description: |- - MinReplicas is the lower bound for scaling. - The adapter will not scale below this value even if the autoscaler requests it. - format: int32 - minimum: 0 - type: integer - scaleDownStabilizationSeconds: - default: 0 - description: |- - ScaleDownStabilizationSeconds is the time to wait before scaling down - after the last scale operation. This provides additional protection against - rapid scale oscillations beyond what HPA provides. - format: int32 - minimum: 0 - type: integer - type: object required: - dgdRef - replicas @@ -147,65 +109,6 @@ spec: status: description: DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter properties: - conditions: - description: Conditions represent the latest available observations of the adapter's state. - items: - description: Condition contains details for one aspect of the current state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. 
- format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map lastScaleTime: description: LastScaleTime is the last time the adapter scaled the target service. format: date-time @@ -213,7 +116,7 @@ spec: replicas: description: |- Replicas is the current number of replicas for the target service. - This is synced from the DGD's service replicas. + This is synced from the DGD's service replicas and is required for the scale subresource. 
format: int32 type: integer selector: diff --git a/deploy/cloud/operator/internal/consts/consts.go b/deploy/cloud/operator/internal/consts/consts.go index 13fb938442..6dd3bc0712 100644 --- a/deploy/cloud/operator/internal/consts/consts.go +++ b/deploy/cloud/operator/internal/consts/consts.go @@ -7,8 +7,6 @@ import ( ) const ( - HPACPUDefaultAverageUtilization = 80 - DefaultUserId = "default" DefaultOrgId = "default" @@ -54,7 +52,6 @@ const ( KubeLabelValueTrue = "true" KubeLabelDynamoComponentPod = "nvidia.com/dynamo-component-pod" - KubeLabelServiceName = "nvidia.com/service-name" KubeResourceGPUNvidia = "nvidia.com/gpu" diff --git a/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go index 307bf7ac05..88d92e2f42 100644 --- a/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller.go @@ -338,21 +338,6 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req } deployment = obj - - // create or update api-server hpa - modified_, _, err = commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*autoscalingv2.HorizontalPodAutoscaler, bool, error) { - return r.generateHPA(generateResourceOption{ - dynamoComponentDeployment: dynamoComponentDeployment, - }) - }) - if err != nil { - return ctrl.Result{}, err - } - - if modified_ { - modified = true - } - } // create or update api-server service @@ -1114,63 +1099,6 @@ type generateResourceOption struct { instanceID *int } -func (r *DynamoComponentDeploymentReconciler) generateHPA(opt generateResourceOption) (*autoscalingv2.HorizontalPodAutoscaler, bool, error) { - labels := r.getKubeLabels(opt.dynamoComponentDeployment) - - annotations := r.getKubeAnnotations(opt.dynamoComponentDeployment) - - kubeName := 
r.getKubeName(opt.dynamoComponentDeployment, false) - - kubeNs := opt.dynamoComponentDeployment.Namespace - - hpaConf := opt.dynamoComponentDeployment.Spec.Autoscaling - - kubeHpa := &autoscalingv2.HorizontalPodAutoscaler{ - ObjectMeta: metav1.ObjectMeta{ - Name: kubeName, - Namespace: kubeNs, - Labels: labels, - Annotations: annotations, - }, - } - - if hpaConf == nil || !hpaConf.Enabled { - // if hpa is not enabled, we need to delete the hpa - return kubeHpa, true, nil - } - - minReplica := int32(hpaConf.MinReplicas) - - kubeHpa.Spec = autoscalingv2.HorizontalPodAutoscalerSpec{ - MinReplicas: &minReplica, - MaxReplicas: int32(hpaConf.MaxReplicas), - ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{ - APIVersion: "apps/v1", - Kind: "Deployment", - Name: kubeName, - }, - Metrics: hpaConf.Metrics, - } - - if len(kubeHpa.Spec.Metrics) == 0 { - averageUtilization := int32(commonconsts.HPACPUDefaultAverageUtilization) - kubeHpa.Spec.Metrics = []autoscalingv2.MetricSpec{ - { - Type: autoscalingv2.ResourceMetricSourceType, - Resource: &autoscalingv2.ResourceMetricSource{ - Name: corev1.ResourceCPU, - Target: autoscalingv2.MetricTarget{ - Type: autoscalingv2.UtilizationMetricType, - AverageUtilization: &averageUtilization, - }, - }, - }, - } - } - - return kubeHpa, false, nil -} - //nolint:gocyclo,nakedret func (r *DynamoComponentDeploymentReconciler) generatePodTemplateSpec(ctx context.Context, opt generateResourceOption, role dynamo.Role) (podTemplateSpec *corev1.PodTemplateSpec, err error) { podLabels := r.getKubeLabels(opt.dynamoComponentDeployment) diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 6f250d8b8f..5ba60b5fde 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -637,7 +637,7 @@ func (r 
*DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C Namespace: dynamoDeployment.Namespace, Labels: map[string]string{ consts.KubeLabelDynamoGraphDeploymentName: dynamoDeployment.Name, - consts.KubeLabelServiceName: serviceName, + consts.KubeLabelDynamoComponent: serviceName, }, }, Spec: nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go index 6510ff44c0..895dbbe97a 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go @@ -20,7 +20,6 @@ package controller import ( "context" "fmt" - "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -79,16 +78,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co if err := r.Get(ctx, dgdKey, dgd); err != nil { if errors.IsNotFound(err) { logger.Error(err, "Referenced DGD not found", "dgd", dgdKey) - adapter.SetCondition( - nvidiacomv1alpha1.ConditionTypeAdapterReady, - metav1.ConditionFalse, - nvidiacomv1alpha1.ReasonDGDNotFound, - fmt.Sprintf("DGD %s not found", dgdKey), - ) - statusErr := r.Status().Update(ctx, adapter) - if statusErr != nil { - logger.Error(statusErr, "Failed to update adapter status") - } + // DGD doesn't exist, can't proceed return ctrl.Result{}, err } return ctrl.Result{}, err @@ -101,16 +91,6 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co "service", adapter.Spec.DGDRef.Service, "dgd", dgd.Name, "availableServices", getServiceKeys(dgd.Spec.Services)) - adapter.SetCondition( - nvidiacomv1alpha1.ConditionTypeAdapterReady, - metav1.ConditionFalse, - nvidiacomv1alpha1.ReasonServiceNotFound, - fmt.Sprintf("Service %s not found in DGD %s", 
adapter.Spec.DGDRef.Service, dgd.Name), - ) - statusErr := r.Status().Update(ctx, adapter) - if statusErr != nil { - logger.Error(statusErr, "Failed to update adapter status") - } return ctrl.Result{}, fmt.Errorf("service %s not found in DGD", adapter.Spec.DGDRef.Service) } @@ -139,28 +119,10 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co "Synced adapter from DGD manual edit: replicas=%d", currentReplicas) } - // 5. Apply scaling policy constraints - desiredReplicas, policyViolation := r.applyScalingPolicy(adapter, adapter.Spec.Replicas) - if policyViolation != "" { - logger.Info("Scaling policy violation", "violation", policyViolation, "desired", adapter.Spec.Replicas, "constrained", desiredReplicas) - r.Recorder.Eventf(adapter, corev1.EventTypeWarning, "ScalingPolicyViolation", policyViolation) - } - - // 6. Check scale-down stabilization - if desiredReplicas < currentReplicas { - if !r.canScaleDown(adapter) { - logger.Info("Scale-down blocked by stabilization window", - "current", currentReplicas, - "desired", desiredReplicas, - "lastScaleTime", adapter.Status.LastScaleTime) - desiredReplicas = currentReplicas - } - } - - // 7. Update DGD if replicas changed - if currentReplicas != desiredReplicas { + // 5. 
Update DGD if replicas changed + if currentReplicas != adapter.Spec.Replicas { // Update the service's replicas in DGD - component.Replicas = &desiredReplicas + component.Replicas = &adapter.Spec.Replicas dgd.Spec.Services[adapter.Spec.DGDRef.Service] = component if err := r.Update(ctx, dgd); err != nil { @@ -174,27 +136,20 @@ "dgd", dgd.Name, "service", adapter.Spec.DGDRef.Service, "from", currentReplicas, - "to", desiredReplicas) + "to", adapter.Spec.Replicas) r.Recorder.Eventf(adapter, corev1.EventTypeNormal, "Scaled", - "Scaled service %s from %d to %d replicas", adapter.Spec.DGDRef.Service, currentReplicas, desiredReplicas) + "Scaled service %s from %d to %d replicas", adapter.Spec.DGDRef.Service, currentReplicas, adapter.Spec.Replicas) // Record scaling event now := metav1.Now() adapter.Status.LastScaleTime = &now } - // 8. Update adapter status - adapter.Status.Replicas = desiredReplicas + // 6. Update adapter status + adapter.Status.Replicas = adapter.Spec.Replicas adapter.Status.Selector = r.buildPodSelector(dgd, adapter.Spec.DGDRef.Service) - adapter.SetCondition( - nvidiacomv1alpha1.ConditionTypeAdapterReady, - metav1.ConditionTrue, - nvidiacomv1alpha1.ReasonSynced, - "Adapter synced with DGD", - ) - if err := r.Status().Update(ctx, adapter); err != nil { logger.Error(err, "Failed to update adapter status") return ctrl.Result{}, err @@ -203,47 +158,14 @@ return ctrl.Result{}, nil } -// applyScalingPolicy enforces min/max constraints -// Returns the constrained replica count and a violation message (empty if no violation) -func (r *DynamoGraphDeploymentScalingAdapterReconciler) applyScalingPolicy(adapter *nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter, desired int32) (int32, string) { - if adapter.Spec.ScalingPolicy == nil { - return desired, "" - } - - policy := adapter.Spec.ScalingPolicy 
- - if policy.MinReplicas != nil && desired < *policy.MinReplicas { - return *policy.MinReplicas, fmt.Sprintf("Desired replicas %d below minimum %d", desired, *policy.MinReplicas) - } - - if policy.MaxReplicas != nil && desired > *policy.MaxReplicas { - return *policy.MaxReplicas, fmt.Sprintf("Desired replicas %d exceeds maximum %d", desired, *policy.MaxReplicas) - } - - return desired, "" -} - -// canScaleDown checks if scale-down is allowed based on stabilization window -func (r *DynamoGraphDeploymentScalingAdapterReconciler) canScaleDown(adapter *nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter) bool { - if adapter.Spec.ScalingPolicy == nil || - adapter.Spec.ScalingPolicy.ScaleDownStabilizationSeconds == nil || - *adapter.Spec.ScalingPolicy.ScaleDownStabilizationSeconds == 0 { - return true - } - - if adapter.Status.LastScaleTime == nil { - return true - } - - stabilization := time.Duration(*adapter.Spec.ScalingPolicy.ScaleDownStabilizationSeconds) * time.Second - return time.Since(adapter.Status.LastScaleTime.Time) >= stabilization -} - // buildPodSelector constructs a label selector for the pods managed by this service func (r *DynamoGraphDeploymentScalingAdapterReconciler) buildPodSelector(dgd *nvidiacomv1alpha1.DynamoGraphDeployment, serviceName string) string { + // Pods are labeled with: + // - nvidia.com/dynamo-graph-deployment-name = dgd.Name + // - nvidia.com/dynamo-component = serviceName (the key from spec.services map) return fmt.Sprintf("%s=%s,%s=%s", consts.KubeLabelDynamoGraphDeploymentName, dgd.Name, - consts.KubeLabelServiceName, serviceName) + consts.KubeLabelDynamoComponent, serviceName) } // SetupWithManager sets up the controller with the Manager diff --git a/deploy/cloud/operator/internal/dynamo/graph.go b/deploy/cloud/operator/internal/dynamo/graph.go index 706dcec234..e644e5e881 100644 --- a/deploy/cloud/operator/internal/dynamo/graph.go +++ b/deploy/cloud/operator/internal/dynamo/graph.go @@ -1034,7 +1034,7 @@ func 
GenerateGrovePodCliqueSet( PodSpec: *podSpec, }, } - labels, err := generateLabels(component, dynamoDeployment, r.Name) + labels, err := generateLabels(component, dynamoDeployment, serviceName) if err != nil { return nil, fmt.Errorf("failed to generate labels: %w", err) } @@ -1075,6 +1075,7 @@ func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dyn labels := make(map[string]string) labels[commonconsts.KubeLabelDynamoSelector] = GetDynamoComponentName(dynamoDeployment, componentName) labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = dynamoDeployment.Name + labels[commonconsts.KubeLabelDynamoComponent] = componentName if component.DynamoNamespace != nil { labels[commonconsts.KubeLabelDynamoNamespace] = *component.DynamoNamespace } diff --git a/deploy/cloud/operator/internal/dynamo/graph_test.go b/deploy/cloud/operator/internal/dynamo/graph_test.go index d93a60459b..6a126cf445 100644 --- a/deploy/cloud/operator/internal/dynamo/graph_test.go +++ b/deploy/cloud/operator/internal/dynamo/graph_test.go @@ -121,7 +121,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, }, }, }, @@ -153,7 +152,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, }, }, }, @@ -229,7 +227,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, }, }, }, @@ -261,7 +258,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, }, }, }, @@ -341,7 +337,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: 
"default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, Ingress: &v1alpha1.IngressSpec{ Enabled: true, Host: "test-dynamographdeployment", @@ -377,7 +372,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, }, }, }, @@ -465,7 +459,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, Envs: []corev1.EnvVar{ { Name: "DYN_DEPLOYMENT_CONFIG", @@ -503,7 +496,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, Envs: []corev1.EnvVar{ { Name: "DYN_DEPLOYMENT_CONFIG", @@ -599,7 +591,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment", }, - Autoscaling: nil, ExtraPodSpec: &v1alpha1.ExtraPodSpec{ MainContainer: &corev1.Container{ Command: []string{"sh", "-c"}, @@ -644,7 +635,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) { Custom: map[string]string{}, }, }, - Autoscaling: nil, Envs: []corev1.EnvVar{ { Name: "TEST_ENV", @@ -1307,6 +1297,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "frontend", Labels: map[string]string{ commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend", + commonconsts.KubeLabelDynamoComponent: "Frontend", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend, commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component", @@ -1483,6 +1474,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Labels: map[string]string{ 
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner", + commonconsts.KubeLabelDynamoComponent: "Planner", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner, commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", @@ -1884,8 +1876,9 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker, commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, - commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-ldr", + commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", + commonconsts.KubeLabelDynamoComponent: "worker", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", "nvidia.com/label1": "label1", "nvidia.com/label2": "label2", @@ -2059,8 +2052,9 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker, commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, - commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-wkr", + commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", + commonconsts.KubeLabelDynamoComponent: "worker", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", "nvidia.com/label1": "label1", "nvidia.com/label2": "label2", @@ -2200,6 +2194,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { 
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend", commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend, commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", + commonconsts.KubeLabelDynamoComponent: "Frontend", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", }, Annotations: map[string]string{}, @@ -2358,6 +2353,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Name: "planner", Labels: map[string]string{ commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner", + commonconsts.KubeLabelDynamoComponent: "Planner", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner, @@ -2779,7 +2775,8 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { { Name: "worker-ldr", Labels: map[string]string{ - commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-ldr", + commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker", + commonconsts.KubeLabelDynamoComponent: "worker", commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker, commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", @@ -2943,7 +2940,8 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Labels: map[string]string{ commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker, commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, - commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-wkr", + commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker", + commonconsts.KubeLabelDynamoComponent: "worker", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", 
commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", "nvidia.com/label1": "label1", @@ -3084,6 +3082,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", + commonconsts.KubeLabelDynamoComponent: "Frontend", commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", }, Annotations: map[string]string{}, @@ -3243,6 +3242,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { Labels: map[string]string{ commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue, commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner", + commonconsts.KubeLabelDynamoComponent: "Planner", commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment", commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner, commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment", diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go index 0324856dfd..f38240c8ee 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go @@ -47,11 +47,6 @@ func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) { Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{ DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ Replicas: &validReplicas, - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 1, - MaxReplicas: 10, - }, }, BackendFramework: "sglang", }, @@ -74,26 +69,6 @@ func 
TestDynamoComponentDeploymentValidator_Validate(t *testing.T) { wantErr: true, errMsg: "spec.replicas must be non-negative", }, - { - name: "invalid autoscaling", - deployment: &nvidiacomv1alpha1.DynamoComponentDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-deployment", - Namespace: "default", - }, - Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{ - DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 5, - MaxReplicas: 3, - }, - }, - }, - }, - wantErr: true, - errMsg: "spec.autoscaling.maxReplicas must be > minReplicas", - }, { name: "invalid ingress", deployment: &nvidiacomv1alpha1.DynamoComponentDeployment{ diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go index 75c18dd33f..de354c25bc 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go @@ -93,28 +93,6 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) { wantErr: true, errMsg: "spec.services[main].replicas must be non-negative", }, - { - name: "service with invalid autoscaling", - deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-graph", - Namespace: "default", - }, - Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{ - Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - "prefill": { - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 10, - MaxReplicas: 5, - }, - }, - }, - }, - }, - wantErr: true, - errMsg: "spec.services[prefill].autoscaling.maxReplicas must be > minReplicas", - }, { name: "service with invalid ingress", deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{ diff --git 
a/deploy/cloud/operator/internal/webhook/validation/shared.go b/deploy/cloud/operator/internal/webhook/validation/shared.go index 5348193f3f..f22f40abed 100644 --- a/deploy/cloud/operator/internal/webhook/validation/shared.go +++ b/deploy/cloud/operator/internal/webhook/validation/shared.go @@ -48,13 +48,6 @@ func (v *SharedSpecValidator) Validate() error { return fmt.Errorf("%s.replicas must be non-negative", v.fieldPath) } - // Validate autoscaling configuration if specified - if v.spec.Autoscaling != nil { - if err := v.validateAutoscaling(); err != nil { - return err - } - } - // Validate ingress configuration if enabled if v.spec.Ingress != nil && v.spec.Ingress.Enabled { if err := v.validateIngress(); err != nil { @@ -77,27 +70,6 @@ func (v *SharedSpecValidator) Validate() error { return nil } -// validateAutoscaling validates the autoscaling configuration. -func (v *SharedSpecValidator) validateAutoscaling() error { - autoscaling := v.spec.Autoscaling - - if !autoscaling.Enabled { - return nil - } - - // Validate minReplicas - if autoscaling.MinReplicas < 1 { - return fmt.Errorf("%s.autoscaling.minReplicas must be >= 1", v.fieldPath) - } - - // Validate maxReplicas - if autoscaling.MaxReplicas <= autoscaling.MinReplicas { - return fmt.Errorf("%s.autoscaling.maxReplicas must be > minReplicas", v.fieldPath) - } - - return nil -} - // validateIngress validates the ingress configuration. 
func (v *SharedSpecValidator) validateIngress() error { if v.spec.Ingress.Host == "" { diff --git a/deploy/cloud/operator/internal/webhook/validation/shared_test.go b/deploy/cloud/operator/internal/webhook/validation/shared_test.go index 472bb7d990..0d009b4f0f 100644 --- a/deploy/cloud/operator/internal/webhook/validation/shared_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/shared_test.go @@ -41,11 +41,6 @@ func TestSharedSpecValidator_Validate(t *testing.T) { name: "valid spec with all fields", spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ Replicas: &validReplicas, - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 1, - MaxReplicas: 10, - }, Ingress: &nvidiacomv1alpha1.IngressSpec{ Enabled: true, Host: "example.com", @@ -77,44 +72,6 @@ func TestSharedSpecValidator_Validate(t *testing.T) { wantErr: true, errMsg: "spec.replicas must be non-negative", }, - { - name: "autoscaling minReplicas too low", - spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 0, - MaxReplicas: 10, - }, - }, - fieldPath: "spec", - wantErr: true, - errMsg: "spec.autoscaling.minReplicas must be >= 1", - }, - { - name: "autoscaling maxReplicas less than minReplicas", - spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: true, - MinReplicas: 5, - MaxReplicas: 3, - }, - }, - fieldPath: "spec", - wantErr: true, - errMsg: "spec.autoscaling.maxReplicas must be > minReplicas", - }, - { - name: "autoscaling disabled - no validation", - spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ - Autoscaling: &nvidiacomv1alpha1.Autoscaling{ - Enabled: false, - MinReplicas: 0, - MaxReplicas: 0, - }, - }, - fieldPath: "spec", - wantErr: false, - }, { name: "ingress enabled without host", spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ diff --git 
a/docs/_sections/k8s_deployment.rst b/docs/_sections/k8s_deployment.rst index 81d06513cb..cdd7d2029a 100644 --- a/docs/_sections/k8s_deployment.rst +++ b/docs/_sections/k8s_deployment.rst @@ -10,3 +10,4 @@ Deployment Guide Webhooks <../kubernetes/webhooks> Minikube Setup <../kubernetes/deployment/minikube> Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide> + Autoscaling <../kubernetes/autoscaling> diff --git a/docs/kubernetes/api_reference.md b/docs/kubernetes/api_reference.md index 09e7415769..d16b71e700 100644 --- a/docs/kubernetes/api_reference.md +++ b/docs/kubernetes/api_reference.md @@ -37,31 +37,11 @@ Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API - [DynamoComponentDeployment](#dynamocomponentdeployment) - [DynamoGraphDeployment](#dynamographdeployment) - [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest) +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) - [DynamoModel](#dynamomodel) -#### Autoscaling - - - - - - - -_Appears in:_ -- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) -- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) - -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `enabled` _boolean_ | | | | -| `minReplicas` _integer_ | | | | -| `maxReplicas` _integer_ | | | | -| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | | | | -| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | | | | - - #### ConfigMapKeySelector @@ -78,7 +58,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: \{\}
| +| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: {}
| | `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | | @@ -96,11 +76,11 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR name. | | Optional: \{\}
| -| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace. | | Optional: \{\}
| -| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.
These are merged with auto-generated labels from the profiling process. | | Optional: \{\}
| -| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: \{\}
| -| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
This image is used for both temporary DGDs created during online profiling and the final DGD.
If omitted, the image from the base config file (e.g., disagg.yaml) is used.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: \{\}
| +| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR name. | | Optional: {}
| +| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace. | | Optional: {}
| +| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.
These are merged with auto-generated labels from the profiling process. | | Optional: {}
| +| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: {}
| +| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
This image is used for both temporary DGDs created during online profiling and the final DGD.
If omitted, the image from the base config file (e.g., disagg.yaml) is used.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: {}
| #### DeploymentStatus @@ -162,10 +142,9 @@ _Appears in:_ | `serviceName` _string_ | The name of the component | | | | `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). | | | | `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). | | | -| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| +| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: {}
| | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | -| `autoscaling` _[Autoscaling](#autoscaling)_ | Autoscaling config for this component (replica range, target utilization, etc.). | | | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | | `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as
environment variables in the component containers. | | | | `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | | @@ -199,10 +178,9 @@ _Appears in:_ | `serviceName` _string_ | The name of the component | | | | `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). | | | | `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). | | | -| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| +| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: {}
| | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | -| `autoscaling` _[Autoscaling](#autoscaling)_ | Autoscaling config for this component (replica range, target utilization, etc.). | | | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | | `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as
environment variables in the component containers. | | | | `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | | @@ -244,6 +222,7 @@ DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests It serves as the primary interface for users to request model deployments with specific performance and resource constraints, enabling SLA-driven deployments. + Lifecycle: 1. Initial → Pending: Validates spec and prepares for profiling 2. Pending → Profiling: Creates and runs profiling job (online or AIC) @@ -252,6 +231,7 @@ Lifecycle: 5. Ready: Terminal state when DGD is operational or spec is available 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted + The spec becomes immutable once profiling starts. Users must delete and recreate the DGDR to modify configuration after this point. @@ -283,12 +263,12 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
This is a high-level identifier for easy reference in kubectl output and logs.
The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\}
| -| `backend` _string_ | Backend specifies the inference backend to use.
The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm]
Required: \{\}
| -| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\}
| -| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level
modelName and backend fields and should not be specified in this config. | | Required: \{\}
| +| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
This is a high-level identifier for easy reference in kubectl output and logs.
The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: {}
| +| `backend` _string_ | Backend specifies the inference backend to use.
The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm]
Required: {}
| +| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: {}
| +| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level
modelName and backend fields and should not be specified in this config. | | Required: {}
| | `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment
after profiling completes. If false, only the spec is generated and stored in status.
Users can then manually create a DGD using the generated spec. | false | | -| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.
Only applicable when AutoApply is true. | | Optional: \{\}
| +| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.
Only applicable when AutoApply is true. | | Optional: {}
| #### DynamoGraphDeploymentRequestStatus @@ -306,12 +286,90 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `state` _string_ | State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
Empty string ("") represents the initial state before initialization. | | | -| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.
This field is populated by the controller and shown in kubectl output. | | Optional: \{\}
| +| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.
This field is populated by the controller and shown in kubectl output. | | Optional: {}
| | `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.
Used to detect spec changes and enforce immutability after profiling starts. | | | | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.
Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
Conditions are merged by type on patch updates. | | | -| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.
Format: "configmap/" | | Optional: \{\}
| -| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification
including metadata, based on profiling results. Users can extract this to create
a DGD manually, or it's used automatically when autoApply is true.
Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: \{\}
Optional: \{\}
| -| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.
Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\}
| +| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.
Format: "configmap/" | | Optional: {}
| +| `generatedDeployment` _[RawExtension](#rawextension)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification
including metadata, based on profiling results. Users can extract this to create
a DGD manually, or it's used automatically when autoApply is true.
Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: {}
Optional: {}
| +| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.
Contains name, namespace, state, and creation status of the managed DGD. | | Optional: {}
| + + +#### DynamoGraphDeploymentScalingAdapter + + + +DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services +within a DynamoGraphDeployment. It implements the Kubernetes scale +subresource, enabling integration with HPA, KEDA, and custom autoscalers. + + +The adapter acts as an intermediary between autoscalers and the DGD, +ensuring that only the adapter controller modifies the DGD's service replicas. +This prevents conflicts when multiple autoscaling mechanisms are in play. + + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | | +| `kind` _string_ | `DynamoGraphDeploymentScalingAdapter` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)_ | | | | +| `status` _[DynamoGraphDeploymentScalingAdapterStatus](#dynamographdeploymentscalingadapterstatus)_ | | | | + + +#### DynamoGraphDeploymentScalingAdapterSpec + + + +DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.
This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. | | Minimum: 0
Required: {}
| +| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. | | Required: {}
| + + +#### DynamoGraphDeploymentScalingAdapterStatus + + + +DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `replicas` _integer_ | Replicas is the current number of replicas for the target service.
This is synced from the DGD's service replicas and is required for the scale subresource. | | | +| `selector` _string_ | Selector is a label selector string for the pods managed by this adapter.
Required for HPA compatibility via the scale subresource. | | | +| `lastScaleTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | LastScaleTime is the last time the adapter scaled the target service. | | | + + +#### DynamoGraphDeploymentServiceRef + + + +DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment + + + +_Appears in:_ +- [DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `name` _string_ | Name of the DynamoGraphDeployment | | MinLength: 1
Required: {}
| +| `service` _string_ | Service is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1
Required: {}
| #### DynamoGraphDeploymentSpec @@ -327,9 +385,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.
Each PVC must have a unique name that can be referenced in component specifications. | | MaxItems: 100
Optional: \{\}
| -| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | MaxProperties: 25
Optional: \{\}
| -| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless
overridden by service-specific configuration. | | Optional: \{\}
| +| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.
Each PVC must have a unique name that can be referenced in component specifications. | | MaxItems: 100
Optional: {}
| +| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | MaxProperties: 25
Optional: {}
| +| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless
overridden by service-specific configuration. | | Optional: {}
| | `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm]
| @@ -382,8 +440,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `modelName` _string_ | ModelName is the full model identifier (e.g., "meta-llama/Llama-3.3-70B-Instruct-lora") | | Required: \{\}
| -| `baseModelName` _string_ | BaseModelName is the base model identifier that matches the service label
This is used to discover endpoints via headless services | | Required: \{\}
| +| `modelName` _string_ | ModelName is the full model identifier (e.g., "meta-llama/Llama-3.3-70B-Instruct-lora") | | Required: {}
| +| `baseModelName` _string_ | BaseModelName is the base model identifier that matches the service label
This is used to discover endpoints via headless services | | Required: {}
| | `modelType` _string_ | ModelType specifies the type of model (e.g., "base", "lora", "adapter") | base | Enum: [base lora adapter]
| | `source` _[ModelSource](#modelsource)_ | Source specifies the model source location (only applicable for lora model type) | | | @@ -518,7 +576,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name is the base model identifier (e.g., "llama-3-70b-instruct-v1") | | Required: \{\}
| +| `name` _string_ | Name is the base model identifier (e.g., "llama-3-70b-instruct-v1") | | Required: {}
| | `revision` _string_ | Revision is the model revision/version (optional) | | | @@ -535,7 +593,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `uri` _string_ | URI is the model source URI
Supported formats:
- S3: s3://bucket/path/to/model
- HuggingFace: hf://org/model@revision_sha | | Required: \{\}
| +| `uri` _string_ | URI is the model source URI
Supported formats:
- S3: s3://bucket/path/to/model
- HuggingFace: hf://org/model@revision_sha | | Required: {}
| #### MultinodeSpec @@ -569,9 +627,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `create` _boolean_ | Create indicates to create a new PVC | | | -| `name` _string_ | Name is the name of the PVC | | Required: \{\}
| +| `name` _string_ | Name is the name of the PVC | | Required: {}
| | `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | | -| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | | +| `size` _[Quantity](#quantity)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | | | `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | | @@ -590,12 +648,12 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.
The profiler will validate the configuration and report any errors. | | Optional: \{\}
Type: object
| -| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment
base config file (disagg.yaml). This is separate from the profiling config above.
The path to this config will be set as engine.config in the profiling config. | | Optional: \{\}
| -| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.
This image contains the profiler code and dependencies needed for SLA-based profiling.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: \{\}
| -| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.
If specified, all profiling artifacts (logs, plots, configs, raw data) will be written
to this PVC instead of an ephemeral emptyDir volume. This allows users to access
complete profiling results after the job completes by mounting the PVC.
The PVC must exist in the same namespace as the DGDR.
If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.
Note: ConfigMaps are still created regardless of this setting for planner integration. | | Optional: \{\}
| -| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.
If not specified, no resource requests or limits are set. | | Optional: \{\}
| -| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.
For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. | | Optional: \{\}
| +| `config` _[JSON](#json)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.
The profiler will validate the configuration and report any errors. | | Optional: {}
Type: object
| +| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment
base config file (disagg.yaml). This is separate from the profiling config above.
The path to this config will be set as engine.config in the profiling config. | | Optional: {}
| +| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.
This image contains the profiler code and dependencies needed for SLA-based profiling.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: {}
| +| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.
If specified, all profiling artifacts (logs, plots, configs, raw data) will be written
to this PVC instead of an ephemeral emptyDir volume. This allows users to access
complete profiling results after the job completes by mounting the PVC.
The PVC must exist in the same namespace as the DGDR.
If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.
Note: ConfigMaps are still created regardless of this setting for planner integration. | | Optional: {}
| +| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.
If not specified, no resource requests or limits are set. | | Optional: {}
| +| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.
For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. | | Optional: {}
| #### ResourceItem @@ -653,7 +711,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `disabled` _boolean_ | | | | -| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | | | | +| `size` _[Quantity](#quantity)_ | | | | #### VolumeMount @@ -670,7 +728,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: \{\}
| +| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: {}
| | `mountPoint` _string_ | MountPoint specifies where to mount the volume.
If useAsCompilationCache is true and mountPoint is not specified,
a backend-specific default will be used. | | | | `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.
When true, backend-specific environment variables will be set and default mount points may be used. | false | | diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md new file mode 100644 index 0000000000..0f159577b2 --- /dev/null +++ b/docs/kubernetes/autoscaling.md @@ -0,0 +1,402 @@ +# Autoscaling + +This guide explains how to configure autoscaling for DynamoGraphDeployment (DGD) services. Dynamo supports multiple autoscaling strategies to meet different use cases, from simple CPU-based scaling to sophisticated LLM-aware optimization. + +## Overview + +Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAdapter` (DGDSA) resource. When you deploy a DGD, the operator automatically creates one adapter per service. These adapters implement the Kubernetes [Scale subresource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), enabling integration with: + +| Autoscaler | Description | Best For | +|------------|-------------|----------| +| **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads | +| **Kubernetes HPA** | Native horizontal pod autoscaling | Simple CPU/memory-based scaling | +| **KEDA** | Event-driven autoscaling | Queue-based or external metrics | +| **Custom Controllers** | Any scale-subresource-compatible controller | Custom requirements | + +## Architecture + +``` +┌──────────────────────────────────┐ ┌─────────────────────────────────────┐ +│ DynamoGraphDeployment │ │ Scaling Adapters (auto-created) │ +│ "my-llm-deployment" │ │ (one per service) │ +├──────────────────────────────────┤ ├─────────────────────────────────────┤ +│ │ │ │ +│ spec.services: │ │ ┌─────────────────────────────┐ │ ┌──────────────────┐ +│ │ │ │ my-llm-deployment-frontend │◄───┼──────│ Autoscalers │ +│ ┌────────────────────────┐◄───┼──────────┼──│ spec.replicas: 2 │ │ │ │ +│ │ frontend: 2 replicas │ │ │ 
└─────────────────────────────┘ │ │ • Planner │ +│ └────────────────────────┘ │ │ │ │ • HPA │ +│ │ │ ┌─────────────────────────────┐ │ │ • KEDA │ +│ ┌────────────────────────┐◄───┼──────────┼──│ my-llm-deployment-prefill │◄───┼──────│ • Custom │ +│ │ prefill: 4 replicas │ │ │ │ spec.replicas: 4 │ │ │ │ +│ └────────────────────────┘ │ │ └─────────────────────────────┘ │ └──────────────────┘ +│ │ │ │ +│ ┌────────────────────────┐◄───┼──────────┼──┌─────────────────────────────┐ │ +│ │ decode: 8 replicas │ │ │ │ my-llm-deployment-decode │◄───┼────── +│ └────────────────────────┘ │ │ │ spec.replicas: 8 │ │ +│ │ │ └─────────────────────────────┘ │ +└──────────────────────────────────┘ └─────────────────────────────────────┘ +``` + +**How it works:** + +1. You deploy a DGD with services (frontend, prefill, decode, etc.) +2. The operator auto-creates one DGDSA per service +3. Autoscalers (HPA, KEDA, Planner) target the adapters via `/scale` subresource +4. Adapter controller syncs replica changes to the DGD +5. DGD controller reconciles the underlying pods + +## Viewing Scaling Adapters + +After deploying a DGD, verify the auto-created adapters: + +```bash +kubectl get dgdsa -n + +# Example output: +# NAME DGD SERVICE REPLICAS AGE +# my-llm-deployment-frontend my-llm-deployment frontend 2 5m +# my-llm-deployment-prefill my-llm-deployment prefill 4 5m +# my-llm-deployment-decode my-llm-deployment decode 8 5m +``` + +## Autoscaling with Dynamo Planner + +The Dynamo Planner is an LLM-aware autoscaler that optimizes scaling decisions based on inference-specific metrics like Time To First Token (TTFT), Inter-Token Latency (ITL), and KV cache utilization. + +**When to use Planner:** +- You want LLM-optimized autoscaling out of the box +- You need coordinated scaling across prefill/decode services +- You want SLA-driven scaling (e.g., target TTFT < 500ms) + +**How Planner works:** + +Planner is deployed as a service component within your DGD. It: +1. 
Queries Prometheus for frontend metrics (request rate, latency, etc.) +2. Uses profiling data to predict optimal replica counts +3. Scales prefill/decode workers to meet SLA targets + +**Deployment:** + +The recommended way to deploy Planner is via `DynamoGraphDeploymentRequest` (DGDR), which automatically: +1. Profiles your model to find optimal configurations +2. Generates a DGD with Planner included +3. Deploys the optimized configuration + +See the [SLA Planner Quick Start](../planner/sla_planner_quickstart.md) for complete instructions. + +**Manual Planner deployment:** + +You can also manually add Planner to your DGD. Example configurations are available in: +- `examples/backends/vllm/deploy/disagg_planner.yaml` +- `examples/backends/sglang/deploy/disagg_planner.yaml` +- `examples/backends/trtllm/deploy/disagg_planner.yaml` + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-llm-deployment + namespace: llm-serving +spec: + backendFramework: vllm + services: + frontend: + replicas: 2 + componentType: frontend + prefill: + replicas: 4 + componentType: worker + subComponentType: prefill + decode: + replicas: 8 + componentType: worker + subComponentType: decode + # Planner service + planner: + replicas: 1 + componentType: planner + # Planner requires profiling data and Prometheus access + # See examples/backends/*/deploy/disagg_planner.yaml for full configuration +``` + +For more details, see the [SLA Planner documentation](../planner/sla_planner.md). + +## Autoscaling with Kubernetes HPA + +The Horizontal Pod Autoscaler (HPA) is Kubernetes' native autoscaling solution. 
+ +**When to use HPA:** +- You have simple, predictable scaling requirements +- You want to use standard Kubernetes tooling +- You need CPU or memory-based scaling + +### Basic HPA (CPU-based) + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: frontend-hpa + namespace: llm-serving +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: my-llm-deployment-frontend + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 0 +``` + +### HPA with Custom Metrics + +To use LLM-specific metrics, you need [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter) or similar: + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: decode-hpa + namespace: llm-serving +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: my-llm-deployment-decode + minReplicas: 2 + maxReplicas: 20 + metrics: + # Scale based on KV cache utilization + - type: Pods + pods: + metric: + name: vllm_gpu_cache_usage_perc + target: + type: AverageValue + averageValue: "70" + # Also consider queue depth + - type: External + external: + metric: + name: vllm_num_requests_waiting + selector: + matchLabels: + service: decode + target: + type: AverageValue + averageValue: "5" +``` + +## Autoscaling with KEDA + +KEDA extends Kubernetes with event-driven autoscaling, supporting 50+ scalers. 
+ +**When to use KEDA:** +- You need event-driven scaling (e.g., queue depth) +- You want to scale to zero when idle +- You need complex scaling triggers + +### KEDA with Prometheus + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: prefill-scaledobject + namespace: llm-serving +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: my-llm-deployment-prefill + minReplicaCount: 1 + maxReplicaCount: 15 + pollingInterval: 15 + cooldownPeriod: 120 + triggers: + - type: prometheus + metadata: + serverAddress: http://prometheus-server.monitoring.svc.cluster.local:9090 + metricName: vllm_queue_depth + query: | + sum(vllm_num_requests_waiting{ + namespace="llm-serving", + dynamo_graph_deployment="my-llm-deployment", + service="prefill" + }) + threshold: "10" +``` + +## Mixed Autoscaling + +You can use different autoscaling strategies for different services: + +```yaml +# DGD with three services +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: my-llm-deployment + namespace: llm-serving +spec: + services: + frontend: + replicas: 2 # Managed by HPA (CPU-based) + prefill: + replicas: 3 # Managed by KEDA (queue-based) + decode: + replicas: 6 # Managed by Planner (LLM-optimized) + +--- +# HPA for Frontend +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: frontend-hpa + namespace: llm-serving +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: my-llm-deployment-frontend + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + +--- +# KEDA for Prefill +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: prefill-scaledobject + namespace: llm-serving +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: 
my-llm-deployment-prefill + minReplicaCount: 1 + maxReplicaCount: 12 + triggers: + - type: prometheus + metadata: + serverAddress: http://prometheus-server.monitoring.svc.cluster.local:9090 + query: sum(vllm_num_requests_waiting{service="prefill"}) + threshold: "10" + +# Decode is managed by Planner (no additional config needed) +``` + +## Manual Scaling + +You can manually scale a service by patching the adapter: + +```bash +kubectl patch dgdsa my-llm-deployment-decode -n llm-serving \ + --type='json' -p='[{"op": "replace", "path": "/spec/replicas", "value": 10}]' +``` + +> **Note**: If an autoscaler is managing the adapter, your change will be overwritten on the next evaluation cycle. + +## Best Practices + +### 1. Choose One Autoscaler Per Service + +Avoid configuring multiple autoscalers for the same service: + +| Configuration | Status | +|---------------|--------| +| HPA for frontend, Planner for prefill/decode | ✅ Good | +| KEDA for all services | ✅ Good | +| Planner only (default) | ✅ Good | +| HPA + Planner both targeting decode | ❌ Bad - they will fight | + +### 2. Use Appropriate Metrics + +| Service Type | Recommended Metrics | +|--------------|---------------------| +| Frontend | CPU utilization, request rate | +| Prefill | Queue depth, TTFT (Time To First Token) | +| Decode | KV cache utilization, ITL (Inter-Token Latency) | + +### 3. Configure Stabilization Windows + +Prevent thrashing with appropriate stabilization: + +```yaml +# HPA +behavior: + scaleDown: + stabilizationWindowSeconds: 300 # Wait 5 min before scaling down + scaleUp: + stabilizationWindowSeconds: 0 # Scale up immediately + +# KEDA +spec: + cooldownPeriod: 300 +``` + +### 4. 
Set Sensible Min/Max Replicas + +Always configure minimum and maximum replicas in your HPA/KEDA to prevent: +- Scaling to zero (unless intentional) +- Unbounded scaling that exhausts cluster resources + +## Troubleshooting + +### Adapters Not Created + +```bash +# Check DGD status +kubectl describe dgd my-llm-deployment -n llm-serving + +# Check operator logs +kubectl logs -n dynamo-system deployment/dynamo-operator +``` + +### Scaling Not Working + +```bash +# Check adapter status +kubectl describe dgdsa my-llm-deployment-decode -n llm-serving + +# Check HPA status +kubectl describe hpa decode-hpa -n llm-serving + +# Verify metrics are available +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 +``` + +### Rapid Scaling Up and Down + +If you see unstable scaling: + +1. Check if multiple autoscalers are targeting the same adapter +2. Increase stabilization window in HPA behavior +3. Increase cooldown period in KEDA ScaledObject + +## References + +- [Kubernetes HPA Documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) +- [KEDA Documentation](https://keda.sh/) +- [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter) +- [Planner Documentation](../planner/sla_planner.md) + From c9ab33e1e38cb13b39dc177aa2b1260959b7806c Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Tue, 2 Dec 2025 13:05:50 -0700 Subject: [PATCH 05/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- deploy/cloud/operator/config/rbac/role.yaml | 1 - .../dynamographdeployment_controller.go | 2 +- docs/kubernetes/api_reference.md | 79 +++++++++---------- 3 files changed, 39 insertions(+), 43 deletions(-) diff --git a/deploy/cloud/operator/config/rbac/role.yaml b/deploy/cloud/operator/config/rbac/role.yaml index f45ac9beee..2a3a00c6f8 100644 --- a/deploy/cloud/operator/config/rbac/role.yaml +++ b/deploy/cloud/operator/config/rbac/role.yaml @@ -179,7 +179,6 @@ rules: - apiGroups: - nvidia.com resources: - - 
dgdscalingadapters - dynamocomponentdeployments - dynamographdeploymentrequests - dynamographdeployments diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 5ba60b5fde..ae561b9a24 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -86,7 +86,7 @@ type DynamoGraphDeploymentReconciler struct { // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update -// +kubebuilder:rbac:groups=nvidia.com,resources=dgdscalingadapters,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch // +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch diff --git a/docs/kubernetes/api_reference.md b/docs/kubernetes/api_reference.md index d16b71e700..5218dd85ac 100644 --- a/docs/kubernetes/api_reference.md +++ b/docs/kubernetes/api_reference.md @@ -58,7 +58,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: {}
| +| `name` _string_ | Name of the ConfigMap containing the desired data. | | Required: \{\}
| | `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml | | @@ -76,11 +76,11 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR name. | | Optional: {}
| -| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace. | | Optional: {}
| -| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.
These are merged with auto-generated labels from the profiling process. | | Optional: {}
| -| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: {}
| -| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
This image is used for both temporary DGDs created during online profiling and the final DGD.
If omitted, the image from the base config file (e.g., disagg.yaml) is used.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: {}
| +| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR name. | | Optional: \{\}
| +| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.
If not specified, defaults to the DGDR namespace. | | Optional: \{\}
| +| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.
These are merged with auto-generated labels from the profiling process. | | Optional: \{\}
| +| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. | | Optional: \{\}
| +| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.
This image is used for both temporary DGDs created during online profiling and the final DGD.
If omitted, the image from the base config file (e.g., disagg.yaml) is used.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Optional: \{\}
| #### DeploymentStatus @@ -142,7 +142,7 @@ _Appears in:_ | `serviceName` _string_ | The name of the component | | | | `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). | | | | `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). | | | -| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: {}
| +| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | @@ -178,7 +178,7 @@ _Appears in:_ | `serviceName` _string_ | The name of the component | | | | `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). | | | | `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). | | | -| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: {}
| +| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | @@ -222,7 +222,6 @@ DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests It serves as the primary interface for users to request model deployments with specific performance and resource constraints, enabling SLA-driven deployments. - Lifecycle: 1. Initial → Pending: Validates spec and prepares for profiling 2. Pending → Profiling: Creates and runs profiling job (online or AIC) @@ -231,7 +230,6 @@ Lifecycle: 5. Ready: Terminal state when DGD is operational or spec is available 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted - The spec becomes immutable once profiling starts. Users must delete and recreate the DGDR to modify configuration after this point. @@ -263,12 +261,12 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
This is a high-level identifier for easy reference in kubectl output and logs.
The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: {}
| -| `backend` _string_ | Backend specifies the inference backend to use.
The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm]
Required: {}
| -| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: {}
| -| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level
modelName and backend fields and should not be specified in this config. | | Required: {}
| +| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").
This is a high-level identifier for easy reference in kubectl output and logs.
The controller automatically sets this value in profilingConfig.config.deployment.model. | | Required: \{\}
| +| `backend` _string_ | Backend specifies the inference backend to use.
The controller automatically sets this value in profilingConfig.config.engine.backend. | | Enum: [vllm sglang trtllm]
Required: \{\}
| +| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU
resources from the Kubernetes cluster nodes. When enabled, the profiler will override
any manually specified hardware configuration (min_num_gpus_per_engine, max_num_gpus_per_engine,
num_gpus_per_node) with values detected from the cluster.
Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\}
| +| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.
This configuration is passed directly to the profiler.
The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).
Note: deployment.model and engine.backend are automatically set from the high-level
modelName and backend fields and should not be specified in this config. | | Required: \{\}
| | `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment
after profiling completes. If false, only the spec is generated and stored in status.
Users can then manually create a DGD using the generated spec. | false | | -| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.
Only applicable when AutoApply is true. | | Optional: {}
| +| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.
Only applicable when AutoApply is true. | | Optional: \{\}
| #### DynamoGraphDeploymentRequestStatus @@ -286,12 +284,12 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `state` _string_ | State is a high-level textual status of the deployment request lifecycle.
Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
Empty string ("") represents the initial state before initialization. | | | -| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.
This field is populated by the controller and shown in kubectl output. | | Optional: {}
| +| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.
This field is populated by the controller and shown in kubectl output. | | Optional: \{\}
| | `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.
Used to detect spec changes and enforce immutability after profiling starts. | | | | `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.
Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
Conditions are merged by type on patch updates. | | | -| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.
Format: "configmap/" | | Optional: {}
| -| `generatedDeployment` _[RawExtension](#rawextension)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification
including metadata, based on profiling results. Users can extract this to create
a DGD manually, or it's used automatically when autoApply is true.
Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: {}
Optional: {}
| -| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.
Contains name, namespace, state, and creation status of the managed DGD. | | Optional: {}
| +| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.
Format: "configmap/" | | Optional: \{\}
| +| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification
including metadata, based on profiling results. Users can extract this to create
a DGD manually, or it's used automatically when autoApply is true.
Stored as RawExtension to preserve all fields including metadata. | | EmbeddedResource: \{\}
Optional: \{\}
| +| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.
Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\}
| #### DynamoGraphDeploymentScalingAdapter @@ -302,7 +300,6 @@ DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual within a DynamoGraphDeployment. It implements the Kubernetes scale subresource, enabling integration with HPA, KEDA, and custom autoscalers. - The adapter acts as an intermediary between autoscalers and the DGD, ensuring that only the adapter controller modifies the DGD's service replicas. This prevents conflicts when multiple autoscaling mechanisms are in play. @@ -333,8 +330,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.
This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. | | Minimum: 0
Required: {}
| -| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. | | Required: {}
| +| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.
This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. | | Minimum: 0
Required: \{\}
| +| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. | | Required: \{\}
| #### DynamoGraphDeploymentScalingAdapterStatus @@ -368,8 +365,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name of the DynamoGraphDeployment | | MinLength: 1
Required: {}
| -| `service` _string_ | Service is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1
Required: {}
| +| `name` _string_ | Name of the DynamoGraphDeployment | | MinLength: 1
Required: \{\}
| +| `service` _string_ | Service is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1
Required: \{\}
| #### DynamoGraphDeploymentSpec @@ -385,9 +382,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.
Each PVC must have a unique name that can be referenced in component specifications. | | MaxItems: 100
Optional: {}
| -| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | MaxProperties: 25
Optional: {}
| -| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless
overridden by service-specific configuration. | | Optional: {}
| +| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.
Each PVC must have a unique name that can be referenced in component specifications. | | MaxItems: 100
Optional: \{\}
| +| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. | | MaxProperties: 25
Optional: \{\}
| +| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless
overridden by service-specific configuration. | | Optional: \{\}
| | `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). | | Enum: [sglang vllm trtllm]
| @@ -440,8 +437,8 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `modelName` _string_ | ModelName is the full model identifier (e.g., "meta-llama/Llama-3.3-70B-Instruct-lora") | | Required: {}
| -| `baseModelName` _string_ | BaseModelName is the base model identifier that matches the service label
This is used to discover endpoints via headless services | | Required: {}
| +| `modelName` _string_ | ModelName is the full model identifier (e.g., "meta-llama/Llama-3.3-70B-Instruct-lora") | | Required: \{\}
| +| `baseModelName` _string_ | BaseModelName is the base model identifier that matches the service label
This is used to discover endpoints via headless services | | Required: \{\}
| | `modelType` _string_ | ModelType specifies the type of model (e.g., "base", "lora", "adapter") | base | Enum: [base lora adapter]
| | `source` _[ModelSource](#modelsource)_ | Source specifies the model source location (only applicable for lora model type) | | | @@ -576,7 +573,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name is the base model identifier (e.g., "llama-3-70b-instruct-v1") | | Required: {}
| +| `name` _string_ | Name is the base model identifier (e.g., "llama-3-70b-instruct-v1") | | Required: \{\}
| | `revision` _string_ | Revision is the model revision/version (optional) | | | @@ -593,7 +590,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `uri` _string_ | URI is the model source URI
Supported formats:
- S3: s3://bucket/path/to/model
- HuggingFace: hf://org/model@revision_sha | | Required: {}
| +| `uri` _string_ | URI is the model source URI
Supported formats:
- S3: s3://bucket/path/to/model
- HuggingFace: hf://org/model@revision_sha | | Required: \{\}
| #### MultinodeSpec @@ -627,9 +624,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `create` _boolean_ | Create indicates to create a new PVC | | | -| `name` _string_ | Name is the name of the PVC | | Required: {}
| +| `name` _string_ | Name is the name of the PVC | | Required: \{\}
| | `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. | | | -| `size` _[Quantity](#quantity)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | | +| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. | | | | `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. | | | @@ -648,12 +645,12 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `config` _[JSON](#json)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.
The profiler will validate the configuration and report any errors. | | Optional: {}
Type: object
| -| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment
base config file (disagg.yaml). This is separate from the profiling config above.
The path to this config will be set as engine.config in the profiling config. | | Optional: {}
| -| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.
This image contains the profiler code and dependencies needed for SLA-based profiling.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: {}
| -| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.
If specified, all profiling artifacts (logs, plots, configs, raw data) will be written
to this PVC instead of an ephemeral emptyDir volume. This allows users to access
complete profiling results after the job completes by mounting the PVC.
The PVC must exist in the same namespace as the DGDR.
If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.
Note: ConfigMaps are still created regardless of this setting for planner integration. | | Optional: {}
| -| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.
If not specified, no resource requests or limits are set. | | Optional: {}
| -| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.
For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. | | Optional: {}
| +| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.
The profiler will validate the configuration and report any errors. | | Optional: \{\}
Type: object
| +| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment
base config file (disagg.yaml). This is separate from the profiling config above.
The path to this config will be set as engine.config in the profiling config. | | Optional: \{\}
| +| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.
This image contains the profiler code and dependencies needed for SLA-based profiling.
Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" | | Required: \{\}
| +| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.
If specified, all profiling artifacts (logs, plots, configs, raw data) will be written
to this PVC instead of an ephemeral emptyDir volume. This allows users to access
complete profiling results after the job completes by mounting the PVC.
The PVC must exist in the same namespace as the DGDR.
If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.
Note: ConfigMaps are still created regardless of this setting for planner integration. | | Optional: \{\}
| +| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.
If not specified, no resource requests or limits are set. | | Optional: \{\}
| +| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.
For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. | | Optional: \{\}
| #### ResourceItem @@ -711,7 +708,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `disabled` _boolean_ | | | | -| `size` _[Quantity](#quantity)_ | | | | +| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | | | | #### VolumeMount @@ -728,7 +725,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: {}
| +| `name` _string_ | Name references a PVC name defined in the top-level PVCs map | | Required: \{\}
| | `mountPoint` _string_ | MountPoint specifies where to mount the volume.
If useAsCompilationCache is true and mountPoint is not specified,
a backend-specific default will be used. | | | | `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.
When true, backend-specific environment variables will be set and default mount points may be used. | false | | From eca67c7f016ec9fa797791c8db16c22377deadd8 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Wed, 3 Dec 2025 08:55:32 -0700 Subject: [PATCH 06/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- docs/kubernetes/autoscaling.md | 184 +++++++++++++++++++++++++++++++-- 1 file changed, 178 insertions(+), 6 deletions(-) diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index 0f159577b2..76a938ab0e 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -199,6 +199,154 @@ spec: averageValue: "5" ``` +### HPA with Dynamo Metrics + +Dynamo exports several metrics useful for autoscaling. These are available at the `/metrics` endpoint on each frontend pod. + +> **See also**: For a complete list of all Dynamo metrics, see the [Metrics Reference](../observability/metrics.md). For Prometheus and Grafana setup, see the [Prometheus and Grafana Setup Guide](../observability/prometheus-grafana.md). 
+ +#### Available Dynamo Metrics + +| Metric | Type | Description | Good for scaling | +|--------|------|-------------|------------------| +| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in HTTP queue | ✅ Prefill | +| `dynamo_frontend_inflight_requests` | Gauge | Concurrent requests to engine | ✅ All services | +| `dynamo_frontend_time_to_first_token_seconds` | Histogram | TTFT latency | ✅ Prefill | +| `dynamo_frontend_inter_token_latency_seconds` | Histogram | ITL latency | ✅ Decode | +| `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | ⚠️ General | +| `kvstats_gpu_cache_usage_percent` | Gauge | GPU KV cache usage (0-1) | ✅ Decode | + +#### Metric Labels + +Dynamo metrics include these labels for filtering: + +| Label | Description | Example | +|-------|-------------|---------| +| `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dgd-name}`) | `llm-serving-my-deployment` | +| `model` | Model being served | `meta-llama/Llama-3-70B` | + +> **Note**: When you have multiple DGDs in the same namespace, use `dynamo_namespace` to filter metrics for a specific DGD. + +#### Example: Scale Prefill Based on TTFT + +This example scales **Prefill workers** when Time To First Token (TTFT) exceeds 500ms. Note that TTFT is measured at the Frontend, but reflects Prefill performance. 
+ +First, configure Prometheus Adapter to expose the TTFT metric: + +```yaml +# Prometheus Adapter ConfigMap (add to your existing config) +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-adapter-config + namespace: monitoring +data: + config.yaml: | + rules: + # TTFT p95 from frontend - used to scale prefill + - seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}' + resources: + overrides: + namespace: {resource: "namespace"} + name: + as: "dynamo_ttft_p95_seconds" + metricsQuery: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) + by (le, namespace, dynamo_namespace) + ) +``` + +Then create the HPA targeting the Prefill adapter: + +```yaml +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: prefill-ttft-hpa + namespace: llm-serving +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: my-llm-deployment-prefill # ← Target: PREFILL adapter + minReplicas: 1 + maxReplicas: 10 + metrics: + - type: External + external: + metric: + name: dynamo_ttft_p95_seconds + selector: + matchLabels: + # Filter by DGD using dynamo_namespace label + dynamo_namespace: "llm-serving-my-llm-deployment" + target: + type: Value + value: "500m" # Scale up when TTFT p95 > 500ms + behavior: + scaleDown: + stabilizationWindowSeconds: 300 # Wait 5 min before scaling down + policies: + - type: Pods + value: 1 + periodSeconds: 60 + scaleUp: + stabilizationWindowSeconds: 0 # Scale up immediately + policies: + - type: Pods + value: 2 + periodSeconds: 30 +``` + +**How it works:** +1. Frontend pods export `dynamo_frontend_time_to_first_token_seconds` histogram +2. Prometheus Adapter calculates p95 TTFT per `dynamo_namespace` +3. HPA monitors this metric for your specific DGD +4. When TTFT p95 > 500ms, HPA scales up the Prefill adapter +5. Adapter controller syncs the replica count to the DGD +6. 
More Prefill workers are created, reducing TTFT + +#### Example: Scale Decode Based on Queue Depth + +```yaml +# Prometheus Adapter rule +rules: +- seriesQuery: 'dynamo_frontend_queued_requests{namespace!=""}' + resources: + overrides: + namespace: {resource: "namespace"} + name: + as: "dynamo_queued_requests" + metricsQuery: | + sum(<<.Series>>{<<.LabelMatchers>>}) by (namespace, dynamo_namespace) + +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: decode-queue-hpa + namespace: llm-serving +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: my-llm-deployment-decode + minReplicas: 2 + maxReplicas: 20 + metrics: + - type: External + external: + metric: + name: dynamo_queued_requests + selector: + matchLabels: + dynamo_namespace: "llm-serving-my-llm-deployment" + target: + type: Value + value: "10" # Scale up when queue > 10 requests +``` + ## Autoscaling with KEDA KEDA extends Kubernetes with event-driven autoscaling, supporting 50+ scalers. @@ -331,11 +479,11 @@ Avoid configuring multiple autoscalers for the same service: ### 2. Use Appropriate Metrics -| Service Type | Recommended Metrics | -|--------------|---------------------| -| Frontend | CPU utilization, request rate | -| Prefill | Queue depth, TTFT (Time To First Token) | -| Decode | KV cache utilization, ITL (Inter-Token Latency) | +| Service Type | Recommended Metrics | Dynamo Metric | +|--------------|---------------------|---------------| +| Frontend | CPU utilization, request rate | `dynamo_frontend_requests_total` | +| Prefill | Queue depth, TTFT | `dynamo_frontend_queued_requests`, `dynamo_frontend_time_to_first_token_seconds` | +| Decode | KV cache utilization, ITL | `kvstats_gpu_cache_usage_percent`, `dynamo_frontend_inter_token_latency_seconds` | ### 3. 
Configure Stabilization Windows @@ -381,8 +529,30 @@ kubectl describe dgdsa my-llm-deployment-decode -n llm-serving # Check HPA status kubectl describe hpa decode-hpa -n llm-serving -# Verify metrics are available +# Verify metrics are available in Kubernetes metrics API kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 +kubectl get --raw /apis/external.metrics.k8s.io/v1beta1 +``` + +### Metrics Not Available + +If HPA shows `` for metrics: + +```bash +# Check if Dynamo metrics are being scraped +kubectl port-forward -n llm-serving pod/ 8000:8000 +curl http://localhost:8000/metrics | grep dynamo_frontend + +# Example output: +# dynamo_frontend_queued_requests{model="meta-llama/Llama-3-70B"} 2 +# dynamo_frontend_inflight_requests{model="meta-llama/Llama-3-70B"} 5 + +# Verify Prometheus is scraping the metrics +kubectl port-forward -n monitoring svc/prometheus-server 9090:9090 +# Then query: dynamo_frontend_time_to_first_token_seconds_bucket + +# Check Prometheus Adapter logs +kubectl logs -n monitoring deployment/prometheus-adapter ``` ### Rapid Scaling Up and Down @@ -399,4 +569,6 @@ If you see unstable scaling: - [KEDA Documentation](https://keda.sh/) - [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter) - [Planner Documentation](../planner/sla_planner.md) +- [Dynamo Metrics Reference](../observability/metrics.md) +- [Prometheus and Grafana Setup](../observability/prometheus-grafana.md) From 3245e878ce129dee9d2307cc3bef8ed8980d1a05 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Wed, 3 Dec 2025 09:23:03 -0700 Subject: [PATCH 07/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- docs/kubernetes/autoscaling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index 76a938ab0e..2dc7cdb563 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -252,7 +252,7 @@ data: as: "dynamo_ttft_p95_seconds" 
metricsQuery: | histogram_quantile(0.95, - sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) by (le, namespace, dynamo_namespace) ) ``` From 1576a50ef983163df6310f49325c265d6e6a9bb8 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Wed, 3 Dec 2025 16:26:22 -0700 Subject: [PATCH 08/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- docs/kubernetes/autoscaling.md | 73 +++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index 2dc7cdb563..dd8e7a3b48 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -231,30 +231,45 @@ Dynamo metrics include these labels for filtering: This example scales **Prefill workers** when Time To First Token (TTFT) exceeds 500ms. Note that TTFT is measured at the Frontend, but reflects Prefill performance. -First, configure Prometheus Adapter to expose the TTFT metric: +First, configure Prometheus Adapter to expose the TTFT metric. 
Add this to your Helm values file (e.g., `prometheus-adapter-values.yaml`): ```yaml -# Prometheus Adapter ConfigMap (add to your existing config) -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-adapter-config - namespace: monitoring -data: - config.yaml: | - rules: - # TTFT p95 from frontend - used to scale prefill - - seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}' - resources: - overrides: - namespace: {resource: "namespace"} - name: - as: "dynamo_ttft_p95_seconds" - metricsQuery: | - histogram_quantile(0.95, - sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) - by (le, namespace, dynamo_namespace) - ) +# prometheus-adapter-values.yaml +prometheus: + url: http://prometheus-kube-prometheus-prometheus.monitoring.svc + port: 9090 + +rules: + external: + # TTFT p95 from frontend - used to scale prefill + - seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}' + resources: + overrides: + namespace: {resource: "namespace"} + name: + as: "dynamo_ttft_p95_seconds" + metricsQuery: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) + by (le, namespace, dynamo_namespace) + ) +``` + +Then install or upgrade the Helm release: + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update + +helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapter \ + -n monitoring --create-namespace \ + -f prometheus-adapter-values.yaml +``` + +Verify the metric is available: + +```bash +kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/namespaces//dynamo_ttft_p95_seconds" | jq ``` Then create the HPA targeting the Prefill adapter: @@ -286,11 +301,11 @@ spec: value: "500m" # Scale up when TTFT p95 > 500ms behavior: scaleDown: - stabilizationWindowSeconds: 300 # Wait 5 min before scaling down + stabilizationWindowSeconds: 60 # Wait 1 
min before scaling down policies: - type: Pods value: 1 - periodSeconds: 60 + periodSeconds: 30 scaleUp: stabilizationWindowSeconds: 0 # Scale up immediately policies: @@ -309,9 +324,10 @@ spec: #### Example: Scale Decode Based on Queue Depth +Add this rule to your `prometheus-adapter-values.yaml` (alongside the TTFT rule): + ```yaml -# Prometheus Adapter rule -rules: +# Add to rules.external in prometheus-adapter-values.yaml - seriesQuery: 'dynamo_frontend_queued_requests{namespace!=""}' resources: overrides: @@ -320,8 +336,11 @@ rules: as: "dynamo_queued_requests" metricsQuery: | sum(<<.Series>>{<<.LabelMatchers>>}) by (namespace, dynamo_namespace) +``` ---- +Then create the HPA: + +```yaml apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: From 0d322d0f4ee796d3768cccf3b02f7dcad42c2a63 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Wed, 3 Dec 2025 17:31:45 -0700 Subject: [PATCH 09/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- docs/kubernetes/autoscaling.md | 73 +++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index dd8e7a3b48..5d09006e62 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -222,16 +222,43 @@ Dynamo metrics include these labels for filtering: | Label | Description | Example | |-------|-------------|---------| -| `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dgd-name}`) | `llm-serving-my-deployment` | +| `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dynamoNamespace}`) | `default-sglang-agg` | | `model` | Model being served | `meta-llama/Llama-3-70B` | > **Note**: When you have multiple DGDs in the same namespace, use `dynamo_namespace` to filter metrics for a specific DGD. 
-#### Example: Scale Prefill Based on TTFT +#### Example: Scale Decode Service Based on TTFT -This example scales **Prefill workers** when Time To First Token (TTFT) exceeds 500ms. Note that TTFT is measured at the Frontend, but reflects Prefill performance. +This example uses the `sglang-agg` DGD from `examples/backends/sglang/deploy/agg.yaml`: -First, configure Prometheus Adapter to expose the TTFT metric. Add this to your Helm values file (e.g., `prometheus-adapter-values.yaml`): +```yaml +# examples/backends/sglang/deploy/agg.yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg +spec: + services: + Frontend: + dynamoNamespace: sglang-agg + componentType: frontend + replicas: 1 + # ... + decode: + dynamoNamespace: sglang-agg + componentType: worker + replicas: 1 + resources: + limits: + gpu: "1" + # ... +``` + +When deployed in namespace `default`, the `dynamo_namespace` label will be `default-sglang-agg`. + +**Step 1: Configure Prometheus Adapter** + +Add this to your Helm values file (e.g., `prometheus-adapter-values.yaml`): ```yaml # prometheus-adapter-values.yaml @@ -241,7 +268,7 @@ prometheus: rules: external: - # TTFT p95 from frontend - used to scale prefill + # TTFT p95 from frontend - used to scale decode - seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}' resources: overrides: @@ -255,7 +282,7 @@ rules: ) ``` -Then install or upgrade the Helm release: +**Step 2: Install Prometheus Adapter** ```bash helm repo add prometheus-community https://prometheus-community.github.io/helm-charts @@ -266,25 +293,24 @@ helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapte -f prometheus-adapter-values.yaml ``` -Verify the metric is available: +**Step 3: Verify the metric is available** ```bash kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/namespaces//dynamo_ttft_p95_seconds" | jq ``` -Then create the HPA targeting the Prefill adapter: +**Step 4: Create the 
HPA** ```yaml apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: - name: prefill-ttft-hpa - namespace: llm-serving + name: sglang-agg-decode-hpa spec: scaleTargetRef: apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeploymentScalingAdapter - name: my-llm-deployment-prefill # ← Target: PREFILL adapter + name: sglang-agg-decode # ← DGD name + service name (lowercase) minReplicas: 1 maxReplicas: 10 metrics: @@ -294,8 +320,7 @@ spec: name: dynamo_ttft_p95_seconds selector: matchLabels: - # Filter by DGD using dynamo_namespace label - dynamo_namespace: "llm-serving-my-llm-deployment" + dynamo_namespace: "default-sglang-agg" # ← {namespace}-{dynamoNamespace} target: type: Value value: "500m" # Scale up when TTFT p95 > 500ms @@ -317,12 +342,12 @@ spec: **How it works:** 1. Frontend pods export `dynamo_frontend_time_to_first_token_seconds` histogram 2. Prometheus Adapter calculates p95 TTFT per `dynamo_namespace` -3. HPA monitors this metric for your specific DGD -4. When TTFT p95 > 500ms, HPA scales up the Prefill adapter -5. Adapter controller syncs the replica count to the DGD -6. More Prefill workers are created, reducing TTFT +3. HPA monitors this metric filtered by `dynamo_namespace: "default-sglang-agg"` +4. When TTFT p95 > 500ms, HPA scales up the `sglang-agg-decode` adapter +5. Adapter controller syncs the replica count to the DGD's `decode` service +6. 
More decode workers are created, reducing TTFT -#### Example: Scale Decode Based on Queue Depth +#### Example: Scale Based on Queue Depth Add this rule to your `prometheus-adapter-values.yaml` (alongside the TTFT rule): @@ -344,15 +369,15 @@ Then create the HPA: apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: - name: decode-queue-hpa - namespace: llm-serving + name: sglang-agg-decode-queue-hpa + namespace: default spec: scaleTargetRef: apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeploymentScalingAdapter - name: my-llm-deployment-decode - minReplicas: 2 - maxReplicas: 20 + name: sglang-agg-decode + minReplicas: 1 + maxReplicas: 10 metrics: - type: External external: @@ -360,7 +385,7 @@ spec: name: dynamo_queued_requests selector: matchLabels: - dynamo_namespace: "llm-serving-my-llm-deployment" + dynamo_namespace: "default-sglang-agg" target: type: Value value: "10" # Scale up when queue > 10 requests From 2dc6a795c5a793f4197974f7fd5efdbe41310358 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Wed, 3 Dec 2025 17:43:38 -0700 Subject: [PATCH 10/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- docs/kubernetes/autoscaling.md | 146 +++++++++++++++++++++++++++++---- 1 file changed, 128 insertions(+), 18 deletions(-) diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index 5d09006e62..2e2b68c845 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -391,44 +391,154 @@ spec: value: "10" # Scale up when queue > 10 requests ``` -## Autoscaling with KEDA +## Autoscaling with KEDA (Recommended) -KEDA extends Kubernetes with event-driven autoscaling, supporting 50+ scalers. +KEDA (Kubernetes Event-driven Autoscaling) extends Kubernetes with event-driven autoscaling, supporting 50+ scalers including Prometheus. 
+ +**Advantages over HPA + Prometheus Adapter:** +- No Prometheus Adapter configuration needed +- PromQL queries are defined in the ScaledObject itself (declarative, per-deployment) +- Easy to update - just `kubectl apply` the ScaledObject +- Can scale to zero when idle +- Supports multiple triggers per object **When to use KEDA:** -- You need event-driven scaling (e.g., queue depth) +- You want simpler configuration (no Prometheus Adapter to manage) +- You need event-driven scaling (e.g., queue depth, Kafka, etc.) - You want to scale to zero when idle -- You need complex scaling triggers -### KEDA with Prometheus +### Installing KEDA + +```bash +# Add KEDA Helm repo +helm repo add kedacore https://kedacore.github.io/charts +helm repo update + +# Install KEDA +helm install keda kedacore/keda \ + --namespace keda \ + --create-namespace + +# Verify installation +kubectl get pods -n keda +``` + +> **Note**: If you have Prometheus Adapter installed, either uninstall it first (`helm uninstall prometheus-adapter -n monitoring`) or install KEDA with `--set metricsServer.enabled=false` to avoid API conflicts. 
+ +### Example: Scale Decode Based on TTFT + +Using the `sglang-agg` DGD from `examples/backends/sglang/deploy/agg.yaml`: ```yaml apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: prefill-scaledobject - namespace: llm-serving + name: sglang-agg-decode-scaler + namespace: default spec: scaleTargetRef: apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeploymentScalingAdapter - name: my-llm-deployment-prefill + name: sglang-agg-decode + minReplicaCount: 1 + maxReplicaCount: 10 + pollingInterval: 15 # Check metrics every 15 seconds + cooldownPeriod: 60 # Wait 60s before scaling down + triggers: + - type: prometheus + metadata: + # Update this URL to match your Prometheus service + serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 + metricName: dynamo_ttft_p95 + query: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) + by (le) + ) + threshold: "0.5" # Scale up when TTFT p95 > 500ms (0.5 seconds) + activationThreshold: "0.1" # Start scaling when TTFT > 100ms +``` + +Apply it: + +```bash +kubectl apply -f sglang-agg-decode-scaler.yaml +``` + +### Verify KEDA Scaling + +```bash +# Check ScaledObject status +kubectl get scaledobject -n default + +# KEDA creates an HPA under the hood - you can see it +kubectl get hpa -n default + +# Example output: +# NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS +# keda-hpa-sglang-agg-decode-scaler DynamoGraphDeploymentScalingAdapter/sglang-agg-decode 45m/500m 1 10 1 + +# Get detailed status +kubectl describe scaledobject sglang-agg-decode-scaler -n default +``` + +### Example: Scale Based on Queue Depth + +```yaml +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: sglang-agg-decode-queue-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: nvidia.com/v1alpha1 + kind: DynamoGraphDeploymentScalingAdapter + name: sglang-agg-decode minReplicaCount: 1 - maxReplicaCount: 15 + 
maxReplicaCount: 10 pollingInterval: 15 - cooldownPeriod: 120 + cooldownPeriod: 60 triggers: - type: prometheus metadata: - serverAddress: http://prometheus-server.monitoring.svc.cluster.local:9090 - metricName: vllm_queue_depth + serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 + metricName: dynamo_queued_requests query: | - sum(vllm_num_requests_waiting{ - namespace="llm-serving", - dynamo_graph_deployment="my-llm-deployment", - service="prefill" - }) - threshold: "10" + sum(dynamo_frontend_queued_requests{dynamo_namespace="default-sglang-agg"}) + threshold: "10" # Scale up when queue > 10 requests +``` + +### How KEDA Works + +KEDA creates and manages an HPA under the hood: + +``` +┌──────────────────────────────────────────────────────────────────────┐ +│ You create: ScaledObject │ +│ - scaleTargetRef: sglang-agg-decode │ +│ - triggers: prometheus query │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ KEDA Operator automatically creates: HPA │ +│ - name: keda-hpa-sglang-agg-decode-scaler │ +│ - scaleTargetRef: sglang-agg-decode │ +│ - metrics: External (from KEDA metrics server) │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ DynamoGraphDeploymentScalingAdapter: sglang-agg-decode │ +│ - spec.replicas: updated by HPA │ +└──────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ DynamoGraphDeployment: sglang-agg │ +│ - spec.services.decode.replicas: synced from adapter │ +└──────────────────────────────────────────────────────────────────────┘ ``` ## Mixed Autoscaling From 66fddabf082b06430398be6da19bc2e920aebf5d Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Wed, 3 Dec 2025 17:49:17 -0700 Subject: [PATCH 
11/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- docs/kubernetes/autoscaling.md | 272 ++++++++++++--------------------- 1 file changed, 95 insertions(+), 177 deletions(-) diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index 2e2b68c845..40c19ef690 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -1,6 +1,39 @@ # Autoscaling -This guide explains how to configure autoscaling for DynamoGraphDeployment (DGD) services. Dynamo supports multiple autoscaling strategies to meet different use cases, from simple CPU-based scaling to sophisticated LLM-aware optimization. +This guide explains how to configure autoscaling for DynamoGraphDeployment (DGD) services using the `sglang-agg` example from `examples/backends/sglang/deploy/agg.yaml`. + +## Example DGD + +All examples in this guide use the following DGD: + +```yaml +# examples/backends/sglang/deploy/agg.yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg + namespace: default +spec: + services: + Frontend: + dynamoNamespace: sglang-agg + componentType: frontend + replicas: 1 + + decode: + dynamoNamespace: sglang-agg + componentType: worker + replicas: 1 + resources: + limits: + gpu: "1" +``` + +**Key identifiers:** +- **DGD name**: `sglang-agg` +- **Namespace**: `default` +- **Services**: `Frontend`, `decode` +- **dynamo_namespace label**: `default-sglang-agg` (used for metric filtering) ## Overview @@ -8,9 +41,9 @@ Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAd | Autoscaler | Description | Best For | |------------|-------------|----------| -| **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads | +| **KEDA** | Event-driven autoscaling (recommended) | Most use cases | | **Kubernetes HPA** | Native horizontal pod autoscaling | Simple CPU/memory-based scaling | -| **KEDA** | Event-driven autoscaling | Queue-based or external 
metrics | +| **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads | | **Custom Controllers** | Any scale-subresource-compatible controller | Custom requirements | ## Architecture @@ -18,46 +51,41 @@ Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAd ``` ┌──────────────────────────────────┐ ┌─────────────────────────────────────┐ │ DynamoGraphDeployment │ │ Scaling Adapters (auto-created) │ -│ "my-llm-deployment" │ │ (one per service) │ +│ "sglang-agg" │ │ (one per service) │ ├──────────────────────────────────┤ ├─────────────────────────────────────┤ │ │ │ │ │ spec.services: │ │ ┌─────────────────────────────┐ │ ┌──────────────────┐ -│ │ │ │ my-llm-deployment-frontend │◄───┼──────│ Autoscalers │ -│ ┌────────────────────────┐◄───┼──────────┼──│ spec.replicas: 2 │ │ │ │ -│ │ frontend: 2 replicas │ │ │ └─────────────────────────────┘ │ │ • Planner │ +│ │ │ │ sglang-agg-frontend │◄───┼──────│ Autoscalers │ +│ ┌────────────────────────┐◄───┼──────────┼──│ spec.replicas: 1 │ │ │ │ +│ │ Frontend: 1 replica │ │ │ └─────────────────────────────┘ │ │ • KEDA │ │ └────────────────────────┘ │ │ │ │ • HPA │ -│ │ │ ┌─────────────────────────────┐ │ │ • KEDA │ -│ ┌────────────────────────┐◄───┼──────────┼──│ my-llm-deployment-prefill │◄───┼──────│ • Custom │ -│ │ prefill: 4 replicas │ │ │ │ spec.replicas: 4 │ │ │ │ +│ │ │ ┌─────────────────────────────┐ │ │ • Planner │ +│ ┌────────────────────────┐◄───┼──────────┼──│ sglang-agg-decode │◄───┼──────│ • Custom │ +│ │ decode: 1 replica │ │ │ │ spec.replicas: 1 │ │ │ │ │ └────────────────────────┘ │ │ └─────────────────────────────┘ │ └──────────────────┘ │ │ │ │ -│ ┌────────────────────────┐◄───┼──────────┼──┌─────────────────────────────┐ │ -│ │ decode: 8 replicas │ │ │ │ my-llm-deployment-decode │◄───┼────── -│ └────────────────────────┘ │ │ │ spec.replicas: 8 │ │ -│ │ │ └─────────────────────────────┘ │ └──────────────────────────────────┘ 
└─────────────────────────────────────┘ ``` **How it works:** -1. You deploy a DGD with services (frontend, prefill, decode, etc.) +1. You deploy a DGD with services (Frontend, decode) 2. The operator auto-creates one DGDSA per service -3. Autoscalers (HPA, KEDA, Planner) target the adapters via `/scale` subresource +3. Autoscalers (KEDA, HPA, Planner) target the adapters via `/scale` subresource 4. Adapter controller syncs replica changes to the DGD 5. DGD controller reconciles the underlying pods ## Viewing Scaling Adapters -After deploying a DGD, verify the auto-created adapters: +After deploying the `sglang-agg` DGD, verify the auto-created adapters: ```bash -kubectl get dgdsa -n +kubectl get dgdsa -n default # Example output: -# NAME DGD SERVICE REPLICAS AGE -# my-llm-deployment-frontend my-llm-deployment frontend 2 5m -# my-llm-deployment-prefill my-llm-deployment prefill 4 5m -# my-llm-deployment-decode my-llm-deployment decode 8 5m +# NAME DGD SERVICE REPLICAS AGE +# sglang-agg-frontend sglang-agg Frontend 1 5m +# sglang-agg-decode sglang-agg decode 1 5m ``` ## Autoscaling with Dynamo Planner @@ -78,48 +106,13 @@ Planner is deployed as a service component within your DGD. It: **Deployment:** -The recommended way to deploy Planner is via `DynamoGraphDeploymentRequest` (DGDR), which automatically: -1. Profiles your model to find optimal configurations -2. Generates a DGD with Planner included -3. Deploys the optimized configuration +The recommended way to deploy Planner is via `DynamoGraphDeploymentRequest` (DGDR). See the [SLA Planner Quick Start](../planner/sla_planner_quickstart.md) for complete instructions. -See the [SLA Planner Quick Start](../planner/sla_planner_quickstart.md) for complete instructions. - -**Manual Planner deployment:** - -You can also manually add Planner to your DGD. 
Example configurations are available in: +Example configurations with Planner: - `examples/backends/vllm/deploy/disagg_planner.yaml` - `examples/backends/sglang/deploy/disagg_planner.yaml` - `examples/backends/trtllm/deploy/disagg_planner.yaml` -```yaml -apiVersion: nvidia.com/v1alpha1 -kind: DynamoGraphDeployment -metadata: - name: my-llm-deployment - namespace: llm-serving -spec: - backendFramework: vllm - services: - frontend: - replicas: 2 - componentType: frontend - prefill: - replicas: 4 - componentType: worker - subComponentType: prefill - decode: - replicas: 8 - componentType: worker - subComponentType: decode - # Planner service - planner: - replicas: 1 - componentType: planner - # Planner requires profiling data and Prometheus access - # See examples/backends/*/deploy/disagg_planner.yaml for full configuration -``` - For more details, see the [SLA Planner documentation](../planner/sla_planner.md). ## Autoscaling with Kubernetes HPA @@ -131,19 +124,21 @@ The Horizontal Pod Autoscaler (HPA) is Kubernetes' native autoscaling solution. - You want to use standard Kubernetes tooling - You need CPU or memory-based scaling +> **Note**: For custom metrics (like TTFT or queue depth), consider using [KEDA](#autoscaling-with-keda-recommended) instead - it's simpler to configure. 
+ ### Basic HPA (CPU-based) ```yaml apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: - name: frontend-hpa - namespace: llm-serving + name: sglang-agg-frontend-hpa + namespace: default spec: scaleTargetRef: apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeploymentScalingAdapter - name: my-llm-deployment-frontend + name: sglang-agg-frontend minReplicas: 1 maxReplicas: 10 metrics: @@ -160,45 +155,6 @@ spec: stabilizationWindowSeconds: 0 ``` -### HPA with Custom Metrics - -To use LLM-specific metrics, you need [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter) or similar: - -```yaml -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: decode-hpa - namespace: llm-serving -spec: - scaleTargetRef: - apiVersion: nvidia.com/v1alpha1 - kind: DynamoGraphDeploymentScalingAdapter - name: my-llm-deployment-decode - minReplicas: 2 - maxReplicas: 20 - metrics: - # Scale based on KV cache utilization - - type: Pods - pods: - metric: - name: vllm_gpu_cache_usage_perc - target: - type: AverageValue - averageValue: "70" - # Also consider queue depth - - type: External - external: - metric: - name: vllm_num_requests_waiting - selector: - matchLabels: - service: decode - target: - type: AverageValue - averageValue: "5" -``` - ### HPA with Dynamo Metrics Dynamo exports several metrics useful for autoscaling. These are available at the `/metrics` endpoint on each frontend pod. @@ -209,9 +165,9 @@ Dynamo exports several metrics useful for autoscaling. 
These are available at th | Metric | Type | Description | Good for scaling | |--------|------|-------------|------------------| -| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in HTTP queue | ✅ Prefill | +| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in HTTP queue | ✅ Workers | | `dynamo_frontend_inflight_requests` | Gauge | Concurrent requests to engine | ✅ All services | -| `dynamo_frontend_time_to_first_token_seconds` | Histogram | TTFT latency | ✅ Prefill | +| `dynamo_frontend_time_to_first_token_seconds` | Histogram | TTFT latency | ✅ Workers | | `dynamo_frontend_inter_token_latency_seconds` | Histogram | ITL latency | ✅ Decode | | `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | ⚠️ General | | `kvstats_gpu_cache_usage_percent` | Gauge | GPU KV cache usage (0-1) | ✅ Decode | @@ -223,38 +179,13 @@ Dynamo metrics include these labels for filtering: | Label | Description | Example | |-------|-------------|---------| | `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dynamoNamespace}`) | `default-sglang-agg` | -| `model` | Model being served | `meta-llama/Llama-3-70B` | +| `model` | Model being served | `Qwen/Qwen3-0.6B` | > **Note**: When you have multiple DGDs in the same namespace, use `dynamo_namespace` to filter metrics for a specific DGD. #### Example: Scale Decode Service Based on TTFT -This example uses the `sglang-agg` DGD from `examples/backends/sglang/deploy/agg.yaml`: - -```yaml -# examples/backends/sglang/deploy/agg.yaml -apiVersion: nvidia.com/v1alpha1 -kind: DynamoGraphDeployment -metadata: - name: sglang-agg -spec: - services: - Frontend: - dynamoNamespace: sglang-agg - componentType: frontend - replicas: 1 - # ... - decode: - dynamoNamespace: sglang-agg - componentType: worker - replicas: 1 - resources: - limits: - gpu: "1" - # ... -``` - -When deployed in namespace `default`, the `dynamo_namespace` label will be `default-sglang-agg`. 
+Using HPA with Prometheus Adapter requires configuring external metrics. **Step 1: Configure Prometheus Adapter** @@ -543,38 +474,23 @@ KEDA creates and manages an HPA under the hood: ## Mixed Autoscaling -You can use different autoscaling strategies for different services: +For disaggregated deployments (prefill + decode), you can use different autoscaling strategies for different services: ```yaml -# DGD with three services -apiVersion: nvidia.com/v1alpha1 -kind: DynamoGraphDeployment -metadata: - name: my-llm-deployment - namespace: llm-serving -spec: - services: - frontend: - replicas: 2 # Managed by HPA (CPU-based) - prefill: - replicas: 3 # Managed by KEDA (queue-based) - decode: - replicas: 6 # Managed by Planner (LLM-optimized) - --- -# HPA for Frontend +# HPA for Frontend (CPU-based) apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: - name: frontend-hpa - namespace: llm-serving + name: sglang-agg-frontend-hpa + namespace: default spec: scaleTargetRef: apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeploymentScalingAdapter - name: my-llm-deployment-frontend + name: sglang-agg-frontend minReplicas: 1 - maxReplicas: 10 + maxReplicas: 5 metrics: - type: Resource resource: @@ -584,27 +500,29 @@ spec: averageUtilization: 70 --- -# KEDA for Prefill +# KEDA for Decode (TTFT-based) apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: - name: prefill-scaledobject - namespace: llm-serving + name: sglang-agg-decode-scaler + namespace: default spec: scaleTargetRef: apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeploymentScalingAdapter - name: my-llm-deployment-prefill + name: sglang-agg-decode minReplicaCount: 1 - maxReplicaCount: 12 + maxReplicaCount: 10 triggers: - type: prometheus metadata: - serverAddress: http://prometheus-server.monitoring.svc.cluster.local:9090 - query: sum(vllm_num_requests_waiting{service="prefill"}) - threshold: "10" - -# Decode is managed by Planner (no additional config needed) + serverAddress: 
http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 + query: | + histogram_quantile(0.95, + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) + by (le) + ) + threshold: "0.5" ``` ## Manual Scaling @@ -612,8 +530,8 @@ spec: You can manually scale a service by patching the adapter: ```bash -kubectl patch dgdsa my-llm-deployment-decode -n llm-serving \ - --type='json' -p='[{"op": "replace", "path": "/spec/replicas", "value": 10}]' +kubectl patch dgdsa sglang-agg-decode -n default \ + --type='json' -p='[{"op": "replace", "path": "/spec/replicas", "value": 3}]' ``` > **Note**: If an autoscaler is managing the adapter, your change will be overwritten on the next evaluation cycle. @@ -668,7 +586,7 @@ Always configure minimum and maximum replicas in your HPA/KEDA to prevent: ```bash # Check DGD status -kubectl describe dgd my-llm-deployment -n llm-serving +kubectl describe dgd sglang-agg -n default # Check operator logs kubectl logs -n dynamo-system deployment/dynamo-operator @@ -678,35 +596,35 @@ kubectl logs -n dynamo-system deployment/dynamo-operator ```bash # Check adapter status -kubectl describe dgdsa my-llm-deployment-decode -n llm-serving +kubectl describe dgdsa sglang-agg-decode -n default -# Check HPA status -kubectl describe hpa decode-hpa -n llm-serving +# Check HPA/KEDA status +kubectl describe hpa sglang-agg-decode-hpa -n default +kubectl describe scaledobject sglang-agg-decode-scaler -n default # Verify metrics are available in Kubernetes metrics API -kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 kubectl get --raw /apis/external.metrics.k8s.io/v1beta1 ``` ### Metrics Not Available -If HPA shows `` for metrics: +If HPA/KEDA shows `` for metrics: ```bash # Check if Dynamo metrics are being scraped -kubectl port-forward -n llm-serving pod/ 8000:8000 +kubectl port-forward -n default svc/sglang-agg-frontend 8000:8000 curl http://localhost:8000/metrics | grep dynamo_frontend # Example 
output: -# dynamo_frontend_queued_requests{model="meta-llama/Llama-3-70B"} 2 -# dynamo_frontend_inflight_requests{model="meta-llama/Llama-3-70B"} 5 +# dynamo_frontend_queued_requests{model="Qwen/Qwen3-0.6B"} 2 +# dynamo_frontend_inflight_requests{model="Qwen/Qwen3-0.6B"} 5 # Verify Prometheus is scraping the metrics -kubectl port-forward -n monitoring svc/prometheus-server 9090:9090 +kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090 # Then query: dynamo_frontend_time_to_first_token_seconds_bucket -# Check Prometheus Adapter logs -kubectl logs -n monitoring deployment/prometheus-adapter +# Check KEDA operator logs +kubectl logs -n keda deployment/keda-operator ``` ### Rapid Scaling Up and Down @@ -714,8 +632,8 @@ kubectl logs -n monitoring deployment/prometheus-adapter If you see unstable scaling: 1. Check if multiple autoscalers are targeting the same adapter -2. Increase stabilization window in HPA behavior -3. Increase cooldown period in KEDA ScaledObject +2. Increase `cooldownPeriod` in KEDA ScaledObject +3. 
Increase `stabilizationWindowSeconds` in HPA behavior ## References From f8bed11e9c0408a1676cbc3cb4b23d259612b6e7 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Wed, 3 Dec 2025 17:53:46 -0700 Subject: [PATCH 12/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- docs/kubernetes/autoscaling.md | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index 40c19ef690..3167d3a3b3 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -527,14 +527,23 @@ spec: ## Manual Scaling -You can manually scale a service by patching the adapter: +You can manually scale a service using the scale subresource: ```bash -kubectl patch dgdsa sglang-agg-decode -n default \ - --type='json' -p='[{"op": "replace", "path": "/spec/replicas", "value": 3}]' +kubectl scale dgdsa sglang-agg-decode -n default --replicas=3 ``` -> **Note**: If an autoscaler is managing the adapter, your change will be overwritten on the next evaluation cycle. +Verify the scaling: + +```bash +kubectl get dgdsa sglang-agg-decode -n default + +# Output: +# NAME DGD SERVICE REPLICAS AGE +# sglang-agg-decode sglang-agg decode 3 10m +``` + +> **Note**: If an autoscaler (KEDA, HPA, Planner) is managing the adapter, your change will be overwritten on the next evaluation cycle. 
## Best Practices From de1a1573adad76d0e8a79706ce5b6462af72b04e Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Wed, 3 Dec 2025 18:02:50 -0700 Subject: [PATCH 13/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- docs/kubernetes/autoscaling.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index 3167d3a3b3..d51d0ad6bf 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -185,7 +185,7 @@ Dynamo metrics include these labels for filtering: #### Example: Scale Decode Service Based on TTFT -Using HPA with Prometheus Adapter requires configuring external metrics. +Using HPA with Prometheus Adapter requires configuring external metrics. **Step 1: Configure Prometheus Adapter** @@ -208,7 +208,7 @@ rules: as: "dynamo_ttft_p95_seconds" metricsQuery: | histogram_quantile(0.95, - sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m])) by (le, namespace, dynamo_namespace) ) ``` @@ -383,7 +383,7 @@ spec: metricName: dynamo_ttft_p95 query: | histogram_quantile(0.95, - sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) by (le) ) threshold: "0.5" # Scale up when TTFT p95 > 500ms (0.5 seconds) @@ -519,7 +519,7 @@ spec: serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090 query: | histogram_quantile(0.95, - sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) + sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m])) by (le) ) threshold: "0.5" From 7ec54a622f56af134b7e3c790ac26f76668ffe22 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Thu, 4 Dec 
2025 18:55:50 -0700 Subject: [PATCH 14/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- ...nvidia.com_dynamocomponentdeployments.yaml | 596 ++++++++++++++++++ .../nvidia.com_dynamographdeployments.yaml | 596 ++++++++++++++++++ ..._dynamographdeploymentscalingadapters.yaml | 8 +- deploy/cloud/operator/api/v1alpha1/common.go | 17 + .../dynamocomponentdeployment_types.go | 4 + ...namographdeploymentscalingadapter_types.go | 6 +- .../api/v1alpha1/zz_generated.deepcopy.go | 33 + ...nvidia.com_dynamocomponentdeployments.yaml | 596 ++++++++++++++++++ .../nvidia.com_dynamographdeployments.yaml | 596 ++++++++++++++++++ ..._dynamographdeploymentscalingadapters.yaml | 8 +- .../dynamographdeployment_controller.go | 6 +- ...raphdeploymentscalingadapter_controller.go | 16 +- .../validation/dynamocomponentdeployment.go | 5 +- .../validation/dynamographdeployment.go | 20 +- .../internal/webhook/validation/shared.go | 17 + docs/kubernetes/api_reference.md | 27 +- 16 files changed, 2523 insertions(+), 28 deletions(-) diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml index 39b04bf3f0..9ac97c1430 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml @@ -76,6 +76,602 @@ spec: Annotations to add to generated Kubernetes resources for this component (such as Pod, Service, and Ingress when applicable). type: object + autoscaling: + description: |- + Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + for migration guidance. This field will be removed in a future API version. + properties: + behavior: + description: 'Deprecated: This field is ignored.' 
+ properties: + scaleDown: + description: |- + scaleDown is scaling policy for scaling Down. + If not set, the default value is to allow to scale down to minReplicas pods, with a + 300 second stabilization window (i.e., the highest recommendation for + the last 300sec is used). + properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. + items: + description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. + If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). 
+ format: int32 + type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scaleUp: + description: |- + scaleUp is scaling policy for scaling Up. + If not set, the default value is the higher of: + * increase no more than 4 pods per 60 seconds + * double the number of pods per 60 seconds + No stabilization is used. + properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. + items: + description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the scaling policy. 
+ type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. + If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + type: object + enabled: + description: 'Deprecated: This field is ignored.' + type: boolean + maxReplicas: + description: 'Deprecated: This field is ignored.' 
+ type: integer + metrics: + description: 'Deprecated: This field is ignored.' + items: + description: |- + MetricSpec specifies how to scale based on a single metric + (only `type` and one other matching field should be set at once). + properties: + containerResource: + description: |- + containerResource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing a single container in + each pod of the current scale target (e.g. CPU or memory). Such metrics are + built in to Kubernetes, and have special scaling options on top of those + available to normal per-pod metrics using the "pods" source. + properties: + container: + description: container is the name of the container in the pods of the scaling target + type: string + name: + description: name is the name of the resource in question. + type: string + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - container + - name + - target + type: object + external: + description: |- + external refers to a global metric that is not associated + with any Kubernetes object. It allows autoscaling based on information + coming from components running outside of cluster + (for example length of queue in cloud messaging service, or + QPS from loadbalancer running outside of cluster). + properties: + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - metric + - target + type: object + object: + description: |- + object refers to a metric describing a single kubernetes object + (for example, hits-per-second on an Ingress object). 
+ properties: + describedObject: + description: describedObject specifies the descriptions of a object,such as kind,name apiVersion + properties: + apiVersion: + description: apiVersion is the API version of the referent + type: string + kind: + description: 'kind is the kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'name is the name of the referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + required: + - kind + - name + type: object + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - describedObject + - metric + - target + type: object + pods: + description: |- + pods refers to a metric describing each pod in the current scale target + (for example, transactions-processed-per-second). The values will be + averaged together before being compared to the target value. + properties: + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - metric + - target + type: object + resource: + description: |- + resource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing each pod in the + current scale target (e.g. CPU or memory). Such metrics are built in to + Kubernetes, and have special scaling options on top of those available + to normal per-pod metrics using the "pods" source. + properties: + name: + description: name is the name of the resource in question. + type: string + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - name + - target + type: object + type: + description: |- + type is the type of metric source. It should be one of "ContainerResource", "External", + "Object", "Pods" or "Resource", each mapping to a matching field in the object. + type: string + required: + - type + type: object + type: array + minReplicas: + description: 'Deprecated: This field is ignored.' + type: integer + type: object backendFramework: description: BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm") enum: diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml index 7814ec4ea9..a9c94eeaa3 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml @@ -218,6 +218,602 @@ spec: Annotations to add to generated Kubernetes resources for this component (such as Pod, Service, and Ingress when applicable). type: object + autoscaling: + description: |- + Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + for migration guidance. This field will be removed in a future API version. + properties: + behavior: + description: 'Deprecated: This field is ignored.' + properties: + scaleDown: + description: |- + scaleDown is scaling policy for scaling Down. + If not set, the default value is to allow to scale down to minReplicas pods, with a + 300 second stabilization window (i.e., the highest recommendation for + the last 300sec is used). 
+ properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. + items: + description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. + If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. 
If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scaleUp: + description: |- + scaleUp is scaling policy for scaling Up. + If not set, the default value is the higher of: + * increase no more than 4 pods per 60 seconds + * double the number of pods per 60 seconds + No stabilization is used. + properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. + items: + description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. 
+ If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + type: object + enabled: + description: 'Deprecated: This field is ignored.' + type: boolean + maxReplicas: + description: 'Deprecated: This field is ignored.' + type: integer + metrics: + description: 'Deprecated: This field is ignored.' + items: + description: |- + MetricSpec specifies how to scale based on a single metric + (only `type` and one other matching field should be set at once). 
+ properties: + containerResource: + description: |- + containerResource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing a single container in + each pod of the current scale target (e.g. CPU or memory). Such metrics are + built in to Kubernetes, and have special scaling options on top of those + available to normal per-pod metrics using the "pods" source. + properties: + container: + description: container is the name of the container in the pods of the scaling target + type: string + name: + description: name is the name of the resource in question. + type: string + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - container + - name + - target + type: object + external: + description: |- + external refers to a global metric that is not associated + with any Kubernetes object. It allows autoscaling based on information + coming from components running outside of cluster + (for example length of queue in cloud messaging service, or + QPS from loadbalancer running outside of cluster). + properties: + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - metric + - target + type: object + object: + description: |- + object refers to a metric describing a single kubernetes object + (for example, hits-per-second on an Ingress object). 
+ properties: + describedObject: + description: describedObject specifies the descriptions of a object,such as kind,name apiVersion + properties: + apiVersion: + description: apiVersion is the API version of the referent + type: string + kind: + description: 'kind is the kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'name is the name of the referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + required: + - kind + - name + type: object + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - describedObject + - metric + - target + type: object + pods: + description: |- + pods refers to a metric describing each pod in the current scale target + (for example, transactions-processed-per-second). The values will be + averaged together before being compared to the target value. + properties: + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - metric + - target + type: object + resource: + description: |- + resource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing each pod in the + current scale target (e.g. CPU or memory). Such metrics are built in to + Kubernetes, and have special scaling options on top of those available + to normal per-pod metrics using the "pods" source. + properties: + name: + description: name is the name of the resource in question. + type: string + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - name + - target + type: object + type: + description: |- + type is the type of metric source. It should be one of "ContainerResource", "External", + "Object", "Pods" or "Resource", each mapping to a matching field in the object. + type: string + required: + - type + type: object + type: array + minReplicas: + description: 'Deprecated: This field is ignored.' + type: integer + type: object componentType: description: ComponentType indicates the role of this component (for example, "main"). type: string diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml index 4a9ecb3b3b..f822bb91db 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentscalingadapters.yaml @@ -38,7 +38,7 @@ spec: name: DGD type: string - description: Service name - jsonPath: .spec.dgdRef.service + jsonPath: .spec.dgdRef.serviceName name: SERVICE type: string - description: Current replicas @@ -87,13 +87,13 @@ spec: description: Name of the DynamoGraphDeployment minLength: 1 type: string - service: - description: Service is the key name of the service within the DGD's spec.services map to scale + serviceName: + description: ServiceName is the key name of the service within the DGD's spec.services map to scale minLength: 1 type: string required: - name - - service + - serviceName type: object replicas: description: |- diff --git a/deploy/cloud/operator/api/v1alpha1/common.go b/deploy/cloud/operator/api/v1alpha1/common.go index f967c6dbca..cb181523cc 100644 --- a/deploy/cloud/operator/api/v1alpha1/common.go +++ 
b/deploy/cloud/operator/api/v1alpha1/common.go @@ -18,6 +18,7 @@ package v1alpha1 import ( + autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" ) @@ -52,6 +53,22 @@ type VolumeMount struct { UseAsCompilationCache bool `json:"useAsCompilationCache,omitempty"` } +// Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter +// with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md +// for migration guidance. This field will be removed in a future API version. +type Autoscaling struct { + // Deprecated: This field is ignored. + Enabled bool `json:"enabled,omitempty"` + // Deprecated: This field is ignored. + MinReplicas int `json:"minReplicas,omitempty"` + // Deprecated: This field is ignored. + MaxReplicas int `json:"maxReplicas,omitempty"` + // Deprecated: This field is ignored. + Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"` + // Deprecated: This field is ignored. + Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` +} + type SharedMemorySpec struct { Disabled bool `json:"disabled,omitempty"` Size resource.Quantity `json:"size,omitempty"` diff --git a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go index 06202948c6..5328a93cf5 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go @@ -74,6 +74,10 @@ type DynamoComponentDeploymentSharedSpec struct { // Resources requested and limits for this component, including CPU, memory, // GPUs/devices, and any runtime-specific resources. Resources *Resources `json:"resources,omitempty"` + // Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + // with HPA, KEDA, or Planner for autoscaling instead. 
See docs/kubernetes/autoscaling.md + // for migration guidance. This field will be removed in a future API version. + Autoscaling *Autoscaling `json:"autoscaling,omitempty"` // Envs defines additional environment variables to inject into the component containers. Envs []corev1.EnvVar `json:"envs,omitempty"` // EnvFromSecret references a Secret whose key/value pairs will be exposed as diff --git a/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go index eccf7de2f2..d4da1a0ccf 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentscalingadapter_types.go @@ -41,10 +41,10 @@ type DynamoGraphDeploymentServiceRef struct { // +kubebuilder:validation:MinLength=1 Name string `json:"name"` - // Service is the key name of the service within the DGD's spec.services map to scale + // ServiceName is the key name of the service within the DGD's spec.services map to scale // +kubebuilder:validation:Required // +kubebuilder:validation:MinLength=1 - Service string `json:"service"` + ServiceName string `json:"serviceName"` } // DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter @@ -68,7 +68,7 @@ type DynamoGraphDeploymentScalingAdapterStatus struct { // +kubebuilder:subresource:status // +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas,selectorpath=.status.selector // +kubebuilder:printcolumn:name="DGD",type="string",JSONPath=".spec.dgdRef.name",description="DynamoGraphDeployment name" -// +kubebuilder:printcolumn:name="SERVICE",type="string",JSONPath=".spec.dgdRef.service",description="Service name" +// +kubebuilder:printcolumn:name="SERVICE",type="string",JSONPath=".spec.dgdRef.serviceName",description="Service name" // 
+kubebuilder:printcolumn:name="REPLICAS",type="integer",JSONPath=".status.replicas",description="Current replicas" // +kubebuilder:printcolumn:name="AGE",type="date",JSONPath=".metadata.creationTimestamp" // +kubebuilder:resource:shortName={dgdsa} diff --git a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go index da95b14745..69b32f1d96 100644 --- a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -38,12 +38,40 @@ limitations under the License. package v1alpha1 import ( + "k8s.io/api/autoscaling/v2" "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Autoscaling) DeepCopyInto(out *Autoscaling) { + *out = *in + if in.Behavior != nil { + in, out := &in.Behavior, &out.Behavior + *out = new(v2.HorizontalPodAutoscalerBehavior) + (*in).DeepCopyInto(*out) + } + if in.Metrics != nil { + in, out := &in.Metrics, &out.Metrics + *out = make([]v2.MetricSpec, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Autoscaling. +func (in *Autoscaling) DeepCopy() *Autoscaling { + if in == nil { + return nil + } + out := new(Autoscaling) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *BaseCRD) DeepCopyInto(out *BaseCRD) { *out = *in @@ -276,6 +304,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent *out = new(Resources) (*in).DeepCopyInto(*out) } + if in.Autoscaling != nil { + in, out := &in.Autoscaling, &out.Autoscaling + *out = new(Autoscaling) + (*in).DeepCopyInto(*out) + } if in.Envs != nil { in, out := &in.Envs, &out.Envs *out = make([]v1.EnvVar, len(*in)) diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml index 39b04bf3f0..9ac97c1430 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml @@ -76,6 +76,602 @@ spec: Annotations to add to generated Kubernetes resources for this component (such as Pod, Service, and Ingress when applicable). type: object + autoscaling: + description: |- + Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + for migration guidance. This field will be removed in a future API version. + properties: + behavior: + description: 'Deprecated: This field is ignored.' + properties: + scaleDown: + description: |- + scaleDown is scaling policy for scaling Down. + If not set, the default value is to allow to scale down to minReplicas pods, with a + 300 second stabilization window (i.e., the highest recommendation for + the last 300sec is used). + properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. 
+ items: + description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. + If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. 
+ + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scaleUp: + description: |- + scaleUp is scaling policy for scaling Up. + If not set, the default value is the higher of: + * increase no more than 4 pods per 60 seconds + * double the number of pods per 60 seconds + No stabilization is used. + properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. + items: + description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. + If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. 
+ StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + type: object + enabled: + description: 'Deprecated: This field is ignored.' + type: boolean + maxReplicas: + description: 'Deprecated: This field is ignored.' + type: integer + metrics: + description: 'Deprecated: This field is ignored.' + items: + description: |- + MetricSpec specifies how to scale based on a single metric + (only `type` and one other matching field should be set at once). + properties: + containerResource: + description: |- + containerResource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing a single container in + each pod of the current scale target (e.g. CPU or memory). 
Such metrics are + built in to Kubernetes, and have special scaling options on top of those + available to normal per-pod metrics using the "pods" source. + properties: + container: + description: container is the name of the container in the pods of the scaling target + type: string + name: + description: name is the name of the resource in question. + type: string + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - container + - name + - target + type: object + external: + description: |- + external refers to a global metric that is not associated + with any Kubernetes object. It allows autoscaling based on information + coming from components running outside of cluster + (for example length of queue in cloud messaging service, or + QPS from loadbalancer running outside of cluster). 
+ properties: + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. 
+ type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - metric + - target + type: object + object: + description: |- + object refers to a metric describing a single kubernetes object + (for example, hits-per-second on an Ingress object). 
+ properties: + describedObject: + description: describedObject specifies the descriptions of a object,such as kind,name apiVersion + properties: + apiVersion: + description: apiVersion is the API version of the referent + type: string + kind: + description: 'kind is the kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'name is the name of the referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + required: + - kind + - name + type: object + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - describedObject + - metric + - target + type: object + pods: + description: |- + pods refers to a metric describing each pod in the current scale target + (for example, transactions-processed-per-second). The values will be + averaged together before being compared to the target value. + properties: + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - metric + - target + type: object + resource: + description: |- + resource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing each pod in the + current scale target (e.g. CPU or memory). Such metrics are built in to + Kubernetes, and have special scaling options on top of those available + to normal per-pod metrics using the "pods" source. + properties: + name: + description: name is the name of the resource in question. + type: string + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - name + - target + type: object + type: + description: |- + type is the type of metric source. It should be one of "ContainerResource", "External", + "Object", "Pods" or "Resource", each mapping to a matching field in the object. + type: string + required: + - type + type: object + type: array + minReplicas: + description: 'Deprecated: This field is ignored.' + type: integer + type: object backendFramework: description: BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm") enum: diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml index 7814ec4ea9..a9c94eeaa3 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml @@ -218,6 +218,602 @@ spec: Annotations to add to generated Kubernetes resources for this component (such as Pod, Service, and Ingress when applicable). type: object + autoscaling: + description: |- + Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter + with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md + for migration guidance. This field will be removed in a future API version. + properties: + behavior: + description: 'Deprecated: This field is ignored.' + properties: + scaleDown: + description: |- + scaleDown is scaling policy for scaling Down. + If not set, the default value is to allow to scale down to minReplicas pods, with a + 300 second stabilization window (i.e., the highest recommendation for + the last 300sec is used). 
+ properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. + items: + description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. + If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. 
If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + scaleUp: + description: |- + scaleUp is scaling policy for scaling Up. + If not set, the default value is the higher of: + * increase no more than 4 pods per 60 seconds + * double the number of pods per 60 seconds + No stabilization is used. + properties: + policies: + description: |- + policies is a list of potential scaling polices which can be used during scaling. + If not set, use the default values: + - For scale up: allow doubling the number of pods, or an absolute change of 4 pods in a 15s window. + - For scale down: allow all pods to be removed in a 15s window. + items: + description: HPAScalingPolicy is a single policy which must hold true for a specified past interval. + properties: + periodSeconds: + description: |- + periodSeconds specifies the window of time for which the policy should hold true. + PeriodSeconds must be greater than zero and less than or equal to 1800 (30 min). + format: int32 + type: integer + type: + description: type is used to specify the scaling policy. + type: string + value: + description: |- + value contains the amount of change which is permitted by the policy. + It must be greater than zero + format: int32 + type: integer + required: + - periodSeconds + - type + - value + type: object + type: array + x-kubernetes-list-type: atomic + selectPolicy: + description: |- + selectPolicy is used to specify which policy should be used. 
+ If not set, the default value Max is used. + type: string + stabilizationWindowSeconds: + description: |- + stabilizationWindowSeconds is the number of seconds for which past recommendations should be + considered while scaling up or scaling down. + StabilizationWindowSeconds must be greater than or equal to zero and less than or equal to 3600 (one hour). + If not set, use the default values: + - For scale up: 0 (i.e. no stabilization is done). + - For scale down: 300 (i.e. the stabilization window is 300 seconds long). + format: int32 + type: integer + tolerance: + anyOf: + - type: integer + - type: string + description: |- + tolerance is the tolerance on the ratio between the current and desired + metric value under which no updates are made to the desired number of + replicas (e.g. 0.01 for 1%). Must be greater than or equal to zero. If not + set, the default cluster-wide tolerance is applied (by default 10%). + + For example, if autoscaling is configured with a memory consumption target of 100Mi, + and scale-down and scale-up tolerances of 5% and 1% respectively, scaling will be + triggered when the actual consumption falls below 95Mi or exceeds 101Mi. + + This is an alpha field and requires enabling the HPAConfigurableTolerance + feature gate. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + type: object + enabled: + description: 'Deprecated: This field is ignored.' + type: boolean + maxReplicas: + description: 'Deprecated: This field is ignored.' + type: integer + metrics: + description: 'Deprecated: This field is ignored.' + items: + description: |- + MetricSpec specifies how to scale based on a single metric + (only `type` and one other matching field should be set at once). 
+ properties: + containerResource: + description: |- + containerResource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing a single container in + each pod of the current scale target (e.g. CPU or memory). Such metrics are + built in to Kubernetes, and have special scaling options on top of those + available to normal per-pod metrics using the "pods" source. + properties: + container: + description: container is the name of the container in the pods of the scaling target + type: string + name: + description: name is the name of the resource in question. + type: string + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - container + - name + - target + type: object + external: + description: |- + external refers to a global metric that is not associated + with any Kubernetes object. It allows autoscaling based on information + coming from components running outside of cluster + (for example length of queue in cloud messaging service, or + QPS from loadbalancer running outside of cluster). + properties: + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - metric + - target + type: object + object: + description: |- + object refers to a metric describing a single kubernetes object + (for example, hits-per-second on an Ingress object). 
+ properties: + describedObject: + description: describedObject specifies the descriptions of a object,such as kind,name apiVersion + properties: + apiVersion: + description: apiVersion is the API version of the referent + type: string + kind: + description: 'kind is the kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + name: + description: 'name is the name of the referent; More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + required: + - kind + - name + type: object + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - describedObject + - metric + - target + type: object + pods: + description: |- + pods refers to a metric describing each pod in the current scale target + (for example, transactions-processed-per-second). The values will be + averaged together before being compared to the target value. + properties: + metric: + description: metric identifies the target metric by name and selector + properties: + name: + description: name is the name of the given metric + type: string + selector: + description: |- + selector is the string-encoded form of a standard kubernetes label selector for the given metric + When set, it is passed as an additional parameter to the metrics server for more specific metrics scoping. + When unset, just the metricName will be used to gather metrics. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + required: + - name + type: object + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - metric + - target + type: object + resource: + description: |- + resource refers to a resource metric (such as those specified in + requests and limits) known to Kubernetes describing each pod in the + current scale target (e.g. CPU or memory). Such metrics are built in to + Kubernetes, and have special scaling options on top of those available + to normal per-pod metrics using the "pods" source. + properties: + name: + description: name is the name of the resource in question. + type: string + target: + description: target specifies the target value for the given metric + properties: + averageUtilization: + description: |- + averageUtilization is the target value of the average of the + resource metric across all relevant pods, represented as a percentage of + the requested value of the resource for the pods. + Currently only valid for Resource metric source type + format: int32 + type: integer + averageValue: + anyOf: + - type: integer + - type: string + description: |- + averageValue is the target value of the average of the + metric across all relevant pods (as a quantity) + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: + description: type represents whether the metric type is Utilization, Value, or AverageValue + type: string + value: + anyOf: + - type: integer + - type: string + description: value is the target value of the metric (as a quantity). 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - type + type: object + required: + - name + - target + type: object + type: + description: |- + type is the type of metric source. It should be one of "ContainerResource", "External", + "Object", "Pods" or "Resource", each mapping to a matching field in the object. + type: string + required: + - type + type: object + type: array + minReplicas: + description: 'Deprecated: This field is ignored.' + type: integer + type: object componentType: description: ComponentType indicates the role of this component (for example, "main"). type: string diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml index 4a9ecb3b3b..f822bb91db 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentscalingadapters.yaml @@ -38,7 +38,7 @@ spec: name: DGD type: string - description: Service name - jsonPath: .spec.dgdRef.service + jsonPath: .spec.dgdRef.serviceName name: SERVICE type: string - description: Current replicas @@ -87,13 +87,13 @@ spec: description: Name of the DynamoGraphDeployment minLength: 1 type: string - service: - description: Service is the key name of the service within the DGD's spec.services map to scale + serviceName: + description: ServiceName is the key name of the service within the DGD's spec.services map to scale minLength: 1 type: string required: - name - - service + - serviceName type: object replicas: description: |- diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index ae561b9a24..3f48ac9d01 100644 --- 
a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -643,8 +643,8 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C Spec: nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ Replicas: currentReplicas, DGDRef: nvidiacomv1alpha1.DynamoGraphDeploymentServiceRef{ - Name: dynamoDeployment.Name, - Service: serviceName, + Name: dynamoDeployment.Name, + ServiceName: serviceName, }, }, } @@ -669,7 +669,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C for i := range adapterList.Items { adapter := &adapterList.Items[i] - serviceName := adapter.Spec.DGDRef.Service + serviceName := adapter.Spec.DGDRef.ServiceName // Check if service still exists in DGD if _, exists := dynamoDeployment.Spec.Services[serviceName]; !exists { diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go index 895dbbe97a..4994f21e1c 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go @@ -85,13 +85,13 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co } // 3. 
Find the target service in DGD's spec.services map - component, exists := dgd.Spec.Services[adapter.Spec.DGDRef.Service] + component, exists := dgd.Spec.Services[adapter.Spec.DGDRef.ServiceName] if !exists { logger.Error(nil, "Service not found in DGD", - "service", adapter.Spec.DGDRef.Service, + "service", adapter.Spec.DGDRef.ServiceName, "dgd", dgd.Name, "availableServices", getServiceKeys(dgd.Spec.Services)) - return ctrl.Result{}, fmt.Errorf("service %s not found in DGD", adapter.Spec.DGDRef.Service) + return ctrl.Result{}, fmt.Errorf("service %s not found in DGD", adapter.Spec.DGDRef.ServiceName) } // Get current replicas from DGD (default to 1 if not set) @@ -104,7 +104,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co // If DGD replicas differ from adapter status, DGD was modified externally if currentReplicas != adapter.Status.Replicas { logger.Info("Detected out-of-band DGD change, syncing adapter from DGD", - "service", adapter.Spec.DGDRef.Service, + "service", adapter.Spec.DGDRef.ServiceName, "dgdReplicas", currentReplicas, "adapterStatusReplicas", adapter.Status.Replicas) @@ -123,7 +123,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co if currentReplicas != adapter.Spec.Replicas { // Update the service's replicas in DGD component.Replicas = &adapter.Spec.Replicas - dgd.Spec.Services[adapter.Spec.DGDRef.Service] = component + dgd.Spec.Services[adapter.Spec.DGDRef.ServiceName] = component if err := r.Update(ctx, dgd); err != nil { logger.Error(err, "Failed to update DGD") @@ -134,12 +134,12 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co logger.Info("Scaled service", "dgd", dgd.Name, - "service", adapter.Spec.DGDRef.Service, + "service", adapter.Spec.DGDRef.ServiceName, "from", currentReplicas, "to", adapter.Spec.Replicas) r.Recorder.Eventf(adapter, corev1.EventTypeNormal, "Scaled", - "Scaled service %s from %d to %d replicas", 
adapter.Spec.DGDRef.Service, currentReplicas, adapter.Spec.Replicas) + "Scaled service %s from %d to %d replicas", adapter.Spec.DGDRef.ServiceName, currentReplicas, adapter.Spec.Replicas) // Record scaling event now := metav1.Now() @@ -148,7 +148,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co // 7. Update adapter status adapter.Status.Replicas = adapter.Spec.Replicas - adapter.Status.Selector = r.buildPodSelector(dgd, adapter.Spec.DGDRef.Service) + adapter.Status.Selector = r.buildPodSelector(dgd, adapter.Spec.DGDRef.ServiceName) if err := r.Status().Update(ctx, adapter); err != nil { logger.Error(err, "Failed to update adapter status") diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go index c77303fde2..621d2c5eb4 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go @@ -46,9 +46,12 @@ func (v *DynamoComponentDeploymentValidator) Validate() (admission.Warnings, err return nil, err } + // Collect deprecation warnings + warnings := sharedValidator.GetWarnings() + // DCD-specific validation would go here (currently none) - return nil, nil + return warnings, nil } // ValidateUpdate performs stateful validation comparing old and new DynamoComponentDeployment. 
diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go index e6bf9e3893..1e66d8ae42 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go @@ -51,14 +51,18 @@ func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error) return nil, err } + var allWarnings admission.Warnings + // Validate each service for serviceName, service := range v.deployment.Spec.Services { - if err := v.validateService(serviceName, service); err != nil { + warnings, err := v.validateService(serviceName, service) + if err != nil { return nil, err } + allWarnings = append(allWarnings, warnings...) } - return nil, nil + return allWarnings, nil } // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment. @@ -74,11 +78,19 @@ func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.D } // validateService validates a single service configuration using SharedSpecValidator. -func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) error { +// Returns warnings and error. +func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) { // Use SharedSpecValidator to validate service spec (which is a DynamoComponentDeploymentSharedSpec) fieldPath := fmt.Sprintf("spec.services[%s]", serviceName) sharedValidator := NewSharedSpecValidator(service, fieldPath) - return sharedValidator.Validate() + + if err := sharedValidator.Validate(); err != nil { + return nil, err + } + + // Collect deprecation warnings + warnings := sharedValidator.GetWarnings() + return warnings, nil } // validatePVCs validates the PVC configurations. 
diff --git a/deploy/cloud/operator/internal/webhook/validation/shared.go b/deploy/cloud/operator/internal/webhook/validation/shared.go index f22f40abed..c93f63fb82 100644 --- a/deploy/cloud/operator/internal/webhook/validation/shared.go +++ b/deploy/cloud/operator/internal/webhook/validation/shared.go @@ -105,3 +105,20 @@ func (v *SharedSpecValidator) validateSharedMemory() error { } return nil } + +// GetWarnings returns deprecation warnings for the spec. +// This should be called after Validate() to collect any deprecation notices. +func (v *SharedSpecValidator) GetWarnings() []string { + var warnings []string + + // Check for deprecated autoscaling field + //nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users + if v.spec.Autoscaling != nil { + warnings = append(warnings, fmt.Sprintf( + "%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+ + "with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md", + v.fieldPath)) + } + + return warnings +} diff --git a/docs/kubernetes/api_reference.md b/docs/kubernetes/api_reference.md index 5218dd85ac..e943dabd81 100644 --- a/docs/kubernetes/api_reference.md +++ b/docs/kubernetes/api_reference.md @@ -42,6 +42,29 @@ Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API +#### Autoscaling + + + +Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter +with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md +for migration guidance. This field will be removed in a future API version. + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `enabled` _boolean_ | Deprecated: This field is ignored. 
| | | +| `minReplicas` _integer_ | Deprecated: This field is ignored. | | | +| `maxReplicas` _integer_ | Deprecated: This field is ignored. | | | +| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | Deprecated: This field is ignored. | | | +| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | Deprecated: This field is ignored. | | | + + #### ConfigMapKeySelector @@ -145,6 +168,7 @@ _Appears in:_ | `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | +| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
for migration guidance. This field will be removed in a future API version. | | | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | | `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as
environment variables in the component containers. | | | | `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | | @@ -181,6 +205,7 @@ _Appears in:_ | `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.
The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\}
| | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | | | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,
GPUs/devices, and any runtime-specific resources. | | | +| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
for migration guidance. This field will be removed in a future API version. | | | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | | | `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as
environment variables in the component containers. | | | | `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | | @@ -366,7 +391,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `name` _string_ | Name of the DynamoGraphDeployment | | MinLength: 1
Required: \{\}
| -| `service` _string_ | Service is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1
Required: \{\}
| +| `serviceName` _string_ | ServiceName is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1
Required: \{\}
| #### DynamoGraphDeploymentSpec From 7706428833d6c2951daa5ff44cd0bb22bed3886d Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 8 Dec 2025 15:23:32 -0700 Subject: [PATCH 15/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- ...nvidia.com_dynamocomponentdeployments.yaml | 20 ++++- .../nvidia.com_dynamographdeployments.yaml | 20 ++++- deploy/cloud/operator/api/v1alpha1/common.go | 12 +++ .../dynamocomponentdeployment_types.go | 10 ++- .../api/v1alpha1/zz_generated.deepcopy.go | 20 +++++ ...nvidia.com_dynamocomponentdeployments.yaml | 20 ++++- .../nvidia.com_dynamographdeployments.yaml | 20 ++++- .../dynamographdeployment_controller.go | 18 +++-- ...raphdeploymentscalingadapter_controller.go | 2 +- .../cloud/operator/internal/webhook/common.go | 53 +++++++++++++ .../validation/dynamocomponentdeployment.go | 8 +- .../validation/dynamographdeployment.go | 77 +++++++++++++++++-- .../dynamographdeployment_handler.go | 18 ++++- .../internal/webhook/validation/shared.go | 44 +++++------ .../webhook/validation/shared_test.go | 2 +- docs/kubernetes/api_reference.md | 25 +++++- 16 files changed, 313 insertions(+), 56 deletions(-) diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml index 9ac97c1430..c90e3bdfe7 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml @@ -10189,8 +10189,12 @@ spec: type: integer type: object replicas: - description: Replicas is the desired number of Pods for this component when autoscaling is not used. + description: |- + Replicas is the desired number of Pods for this component. + When scalingAdapter is enabled (default), this field is managed by the + DynamoGraphDeploymentScalingAdapter and should not be modified directly. 
format: int32 + minimum: 0 type: integer resources: description: |- @@ -10269,6 +10273,20 @@ spec: type: string type: object type: object + scalingAdapter: + description: |- + ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + the service using the Scale subresource. When disabled, replicas can be modified directly. + properties: + disable: + default: false + description: |- + Disable indicates whether the ScalingAdapter should be disabled for this service. + When false (default), a DGDSA is created and owns the replicas field. + When true, no DGDSA is created and replicas can be modified directly in the DGD. + type: boolean + type: object serviceName: description: The name of the component type: string diff --git a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml index a9c94eeaa3..4db1e902b8 100644 --- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml +++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml @@ -10324,8 +10324,12 @@ spec: type: integer type: object replicas: - description: Replicas is the desired number of Pods for this component when autoscaling is not used. + description: |- + Replicas is the desired number of Pods for this component. + When scalingAdapter is enabled (default), this field is managed by the + DynamoGraphDeploymentScalingAdapter and should not be modified directly. format: int32 + minimum: 0 type: integer resources: description: |- @@ -10404,6 +10408,20 @@ spec: type: string type: object type: object + scalingAdapter: + description: |- + ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. 
+ When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + the service using the Scale subresource. When disabled, replicas can be modified directly. + properties: + disable: + default: false + description: |- + Disable indicates whether the ScalingAdapter should be disabled for this service. + When false (default), a DGDSA is created and owns the replicas field. + When true, no DGDSA is created and replicas can be modified directly in the DGD. + type: boolean + type: object serviceName: description: The name of the component type: string diff --git a/deploy/cloud/operator/api/v1alpha1/common.go b/deploy/cloud/operator/api/v1alpha1/common.go index cb181523cc..b68dd818c0 100644 --- a/deploy/cloud/operator/api/v1alpha1/common.go +++ b/deploy/cloud/operator/api/v1alpha1/common.go @@ -123,3 +123,15 @@ type ExtraPodSpec struct { *corev1.PodSpec `json:",inline"` MainContainer *corev1.Container `json:"mainContainer,omitempty"` } + +// ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter +// for replica management. When enabled (default), the DGDSA owns the replicas field and +// external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource. +type ScalingAdapter struct { + // Disable indicates whether the ScalingAdapter should be disabled for this service. + // When false (default), a DGDSA is created and owns the replicas field. + // When true, no DGDSA is created and replicas can be modified directly in the DGD. 
+ // +optional + // +kubebuilder:default=false + Disable bool `json:"disable,omitempty"` +} diff --git a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go index 5328a93cf5..8a2abb78f2 100644 --- a/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go +++ b/deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go @@ -110,10 +110,18 @@ type DynamoComponentDeploymentSharedSpec struct { LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty"` // ReadinessProbe to signal when the container is ready to receive traffic. ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty"` - // Replicas is the desired number of Pods for this component when autoscaling is not used. + // Replicas is the desired number of Pods for this component. + // When scalingAdapter is enabled (default), this field is managed by the + // DynamoGraphDeploymentScalingAdapter and should not be modified directly. + // +kubebuilder:validation:Minimum=0 Replicas *int32 `json:"replicas,omitempty"` // Multinode is the configuration for multinode components. Multinode *MultinodeSpec `json:"multinode,omitempty"` + // ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + // When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + // the service using the Scale subresource. When disabled, replicas can be modified directly. 
+ // +optional + ScalingAdapter *ScalingAdapter `json:"scalingAdapter,omitempty"` } type MultinodeSpec struct { diff --git a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go index 69b32f1d96..d3ecbb44ec 100644 --- a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -371,6 +371,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent *out = new(MultinodeSpec) **out = **in } + if in.ScalingAdapter != nil { + in, out := &in.ScalingAdapter, &out.ScalingAdapter + *out = new(ScalingAdapter) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoComponentDeploymentSharedSpec. @@ -1194,6 +1199,21 @@ func (in *Resources) DeepCopy() *Resources { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScalingAdapter) DeepCopyInto(out *ScalingAdapter) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScalingAdapter. +func (in *ScalingAdapter) DeepCopy() *ScalingAdapter { + if in == nil { + return nil + } + out := new(ScalingAdapter) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *SharedMemorySpec) DeepCopyInto(out *SharedMemorySpec) { *out = *in diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml index 9ac97c1430..c90e3bdfe7 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml @@ -10189,8 +10189,12 @@ spec: type: integer type: object replicas: - description: Replicas is the desired number of Pods for this component when autoscaling is not used. + description: |- + Replicas is the desired number of Pods for this component. + When scalingAdapter is enabled (default), this field is managed by the + DynamoGraphDeploymentScalingAdapter and should not be modified directly. format: int32 + minimum: 0 type: integer resources: description: |- @@ -10269,6 +10273,20 @@ spec: type: string type: object type: object + scalingAdapter: + description: |- + ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + the service using the Scale subresource. When disabled, replicas can be modified directly. + properties: + disable: + default: false + description: |- + Disable indicates whether the ScalingAdapter should be disabled for this service. + When false (default), a DGDSA is created and owns the replicas field. + When true, no DGDSA is created and replicas can be modified directly in the DGD. 
+ type: boolean + type: object serviceName: description: The name of the component type: string diff --git a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml index a9c94eeaa3..4db1e902b8 100644 --- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml +++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml @@ -10324,8 +10324,12 @@ spec: type: integer type: object replicas: - description: Replicas is the desired number of Pods for this component when autoscaling is not used. + description: |- + Replicas is the desired number of Pods for this component. + When scalingAdapter is enabled (default), this field is managed by the + DynamoGraphDeploymentScalingAdapter and should not be modified directly. format: int32 + minimum: 0 type: integer resources: description: |- @@ -10404,6 +10408,20 @@ spec: type: string type: object type: object + scalingAdapter: + description: |- + ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter. + When enabled (default), replicas are managed via DGDSA and external autoscalers can scale + the service using the Scale subresource. When disabled, replicas can be modified directly. + properties: + disable: + default: false + description: |- + Disable indicates whether the ScalingAdapter should be disabled for this service. + When false (default), a DGDSA is created and owns the replicas field. + When true, no DGDSA is created and replicas can be modified directly in the DGD. 
+ type: boolean + type: object serviceName: description: The name of the component type: string diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go index 3f48ac9d01..823818ac1e 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go @@ -616,19 +616,24 @@ func (r *DynamoGraphDeploymentReconciler) reconcilePVCs(ctx context.Context, dyn } // reconcileScalingAdapters ensures a DynamoGraphDeploymentScalingAdapter exists for each service in the DGD -// This enables pluggable autoscaling via HPA, KEDA, or Planner +// that has scaling adapter enabled (default). Services with scalingAdapter.disable=true will not have a DGDSA. +// This enables pluggable autoscaling via HPA, KEDA, or Planner. func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error { logger := log.FromContext(ctx) - // Create or update an adapter for each service using SyncResource pattern + // Process each service - SyncResource handles create, update, and delete via toDelete flag for serviceName, component := range dynamoDeployment.Spec.Services { + // Check if scaling adapter is disabled for this service + scalingAdapterDisabled := component.ScalingAdapter != nil && component.ScalingAdapter.Disable + // Get current replicas (default to 1 if not set) currentReplicas := int32(1) if component.Replicas != nil { currentReplicas = *component.Replicas } - // Use SyncResource to handle creation/updates + // Use SyncResource to handle creation/updates/deletion + // When toDelete=true, SyncResource will delete the existing resource if it exists _, _, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) 
(*nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter, bool, error) { adapterName := generateAdapterName(dynamoDeployment.Name, serviceName) adapter := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{ @@ -648,7 +653,8 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C }, }, } - return adapter, false, nil + // Return toDelete=true if scaling adapter is disabled + return adapter, scalingAdapterDisabled, nil }) if err != nil { @@ -657,7 +663,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C } } - // Clean up orphaned adapters (services that no longer exist in DGD) + // Clean up adapters for services that were removed from DGD entirely adapterList := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterList{} if err := r.List(ctx, adapterList, client.InNamespace(dynamoDeployment.Namespace), @@ -671,7 +677,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C adapter := &adapterList.Items[i] serviceName := adapter.Spec.DGDRef.ServiceName - // Check if service still exists in DGD + // Delete adapter if service no longer exists in DGD if _, exists := dynamoDeployment.Spec.Services[serviceName]; !exists { logger.Info("Deleting orphaned DynamoGraphDeploymentScalingAdapter", "adapter", adapter.Name, "service", serviceName) if err := r.Delete(ctx, adapter); err != nil && !errors.IsNotFound(err) { diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go index 4994f21e1c..4a0577d1de 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go @@ -86,7 +86,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co // 3. 
Find the target service in DGD's spec.services map component, exists := dgd.Spec.Services[adapter.Spec.DGDRef.ServiceName] - if !exists { + if !exists || component == nil { logger.Error(nil, "Service not found in DGD", "service", adapter.Spec.DGDRef.ServiceName, "dgd", dgd.Name, diff --git a/deploy/cloud/operator/internal/webhook/common.go b/deploy/cloud/operator/internal/webhook/common.go index 6333738739..c18edd98f4 100644 --- a/deploy/cloud/operator/internal/webhook/common.go +++ b/deploy/cloud/operator/internal/webhook/common.go @@ -19,7 +19,9 @@ package webhook import ( "context" + "strings" + authenticationv1 "k8s.io/api/authentication/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -118,3 +120,54 @@ func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool { return false } + +// DGDReplicasModifierSuffixes defines suffixes for service accounts that are authorized +// to modify DGD replicas when scaling adapter is enabled. +// Service accounts matching any of these suffixes are allowed regardless of namespace. +var DGDReplicasModifierSuffixes = []string{ + // Dynamo operator controller manager (handles DGDSA reconciliation) + // Example: "dynamo-platform-dynamo-operator-controller-manager" + "-dynamo-operator-controller-manager", + + // Planner service account (manages DGD replicas for autoscaling) + // Example: "planner-serviceaccount" + "planner-serviceaccount", +} + +// CanModifyDGDReplicas checks if the request comes from a service account authorized +// to modify DGD replicas when scaling adapter is enabled. 
+// Service accounts are identified by username format: system:serviceaccount:<namespace>:<name>
+//
+// Authorized service accounts (by suffix):
+// - *-dynamo-operator-controller-manager (for DGDSA reconciliation)
+// - *planner-serviceaccount (for Planner autoscaling)
+func CanModifyDGDReplicas(userInfo authenticationv1.UserInfo) bool {
+	username := userInfo.Username
+
+	// Service accounts have username format: system:serviceaccount:<namespace>:<name>
+	if !strings.HasPrefix(username, "system:serviceaccount:") {
+		return false
+	}
+
+	// Parse: system:serviceaccount:<namespace>:<name>
+	parts := strings.Split(username, ":")
+	if len(parts) != 4 {
+		return false
+	}
+
+	namespace := parts[2]
+	saName := parts[3]
+
+	// Check against authorized suffixes
+	for _, suffix := range DGDReplicasModifierSuffixes {
+		if strings.HasSuffix(saName, suffix) {
+			webhookCommonLog.V(1).Info("allowing DGD replicas modification",
+				"serviceAccount", saName,
+				"namespace", namespace,
+				"matchedSuffix", suffix)
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go
index 621d2c5eb4..c0e0628834 100644
--- a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go
+++ b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go
@@ -42,16 +42,10 @@ func NewDynamoComponentDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoC
 func (v *DynamoComponentDeploymentValidator) Validate() (admission.Warnings, error) {
 	// Validate shared spec fields using SharedSpecValidator
 	sharedValidator := NewSharedSpecValidator(&v.deployment.Spec.DynamoComponentDeploymentSharedSpec, "spec")
-	if err := sharedValidator.Validate(); err != nil {
-		return nil, err
-	}
-
-	// Collect deprecation warnings
-	warnings := sharedValidator.GetWarnings()
 
 	// DCD-specific validation would go here (currently none)
-	return warnings, nil
+	return 
sharedValidator.Validate() } // ValidateUpdate performs stateful validation comparing old and new DynamoComponentDeployment. diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go index 1e66d8ae42..d880501626 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go @@ -22,6 +22,8 @@ import ( "fmt" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook" + authenticationv1 "k8s.io/api/authentication/v1" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" ) @@ -68,29 +70,88 @@ func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error) // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment. // Returns warnings and error. func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment) (admission.Warnings, error) { + return v.ValidateUpdateWithUserInfo(old, nil) +} + +// ValidateUpdateWithUserInfo performs stateful validation with user identity checking. +// When userInfo is provided, it validates that only allowed controllers can modify +// replicas for services with scaling adapter enabled. +// Returns warnings and error. 
+func (v *DynamoGraphDeploymentValidator) ValidateUpdateWithUserInfo(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) (admission.Warnings, error) { // Validate that BackendFramework is not changed (immutable) if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework { warning := "Changing spec.backendFramework may cause unexpected behavior" return admission.Warnings{warning}, fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation") } + // Validate replicas changes for services with scaling adapter enabled + if userInfo != nil { + if err := v.validateReplicasChanges(old, *userInfo); err != nil { + return nil, err + } + } + return nil, nil } +// validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled. +// Only authorized service accounts (operator controller, planner) can modify these fields. +func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo authenticationv1.UserInfo) error { + // If the request comes from an authorized service account, allow the change + if internalwebhook.CanModifyDGDReplicas(userInfo) { + return nil + } + + var errs []error + + for serviceName, newService := range v.deployment.Spec.Services { + // Check if scaling adapter is enabled for this service (enabled by default) + scalingAdapterEnabled := true + if newService.ScalingAdapter != nil && newService.ScalingAdapter.Disable { + scalingAdapterEnabled = false + } + + if !scalingAdapterEnabled { + // Scaling adapter is disabled, users can modify replicas directly + continue + } + + // Get old service (if exists) + oldService, exists := old.Spec.Services[serviceName] + if !exists { + // New service, no comparison needed + continue + } + + // Check if replicas changed + oldReplicas := int32(1) // default + if oldService.Replicas != nil { + oldReplicas = *oldService.Replicas + } + + newReplicas := int32(1) // default + 
if newService.Replicas != nil { + newReplicas = *newService.Replicas + } + + if oldReplicas != newReplicas { + errs = append(errs, fmt.Errorf( + "spec.services[%s].replicas cannot be modified directly when scaling adapter is enabled; "+ + "scale or update the related DynamoGraphDeploymentScalingAdapter instead", + serviceName)) + } + } + + return errors.Join(errs...) +} + // validateService validates a single service configuration using SharedSpecValidator. // Returns warnings and error. func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) { // Use SharedSpecValidator to validate service spec (which is a DynamoComponentDeploymentSharedSpec) fieldPath := fmt.Sprintf("spec.services[%s]", serviceName) sharedValidator := NewSharedSpecValidator(service, fieldPath) - - if err := sharedValidator.Validate(); err != nil { - return nil, err - } - - // Collect deprecation warnings - warnings := sharedValidator.GetWarnings() - return warnings, nil + return sharedValidator.Validate() } // validatePVCs validates the PVC configurations. 
diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go index 074a4c5cc2..f200e91486 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go @@ -91,9 +91,23 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb return warnings, err } - // Validate stateful rules (immutability) - updateWarnings, err := validator.ValidateUpdate(oldDeployment) + // Get user info from admission request context for identity-based validation + req, err := admission.RequestFromContext(ctx) if err != nil { + logger.Error(err, "failed to get admission request from context, skipping user-based validation") + // Fall back to basic validation without user info + updateWarnings, err := validator.ValidateUpdate(oldDeployment) + if err != nil { + return updateWarnings, err + } + warnings = append(warnings, updateWarnings...) + return warnings, nil + } + + // Validate stateful rules (immutability + replicas protection) + updateWarnings, err := validator.ValidateUpdateWithUserInfo(oldDeployment, &req.UserInfo) + if err != nil { + logger.Info("validation failed", "error", err.Error(), "user", req.UserInfo.Username) return updateWarnings, err } diff --git a/deploy/cloud/operator/internal/webhook/validation/shared.go b/deploy/cloud/operator/internal/webhook/validation/shared.go index c93f63fb82..30edb0500d 100644 --- a/deploy/cloud/operator/internal/webhook/validation/shared.go +++ b/deploy/cloud/operator/internal/webhook/validation/shared.go @@ -21,6 +21,7 @@ import ( "fmt" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" ) // SharedSpecValidator validates DynamoComponentDeploymentSharedSpec fields. 
@@ -41,33 +42,45 @@ func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSha } // Validate performs validation on the shared spec fields. -// Returns an error if validation fails. -func (v *SharedSpecValidator) Validate() error { +// Returns warnings (e.g., deprecation notices) and error if validation fails. +func (v *SharedSpecValidator) Validate() (admission.Warnings, error) { // Validate replicas if specified if v.spec.Replicas != nil && *v.spec.Replicas < 0 { - return fmt.Errorf("%s.replicas must be non-negative", v.fieldPath) + return nil, fmt.Errorf("%s.replicas must be non-negative", v.fieldPath) } // Validate ingress configuration if enabled if v.spec.Ingress != nil && v.spec.Ingress.Enabled { if err := v.validateIngress(); err != nil { - return err + return nil, err } } // Validate volume mounts if err := v.validateVolumeMounts(); err != nil { - return err + return nil, err } // Validate shared memory if v.spec.SharedMemory != nil { if err := v.validateSharedMemory(); err != nil { - return err + return nil, err } } - return nil + // Collect warnings (e.g., deprecation notices) + var warnings admission.Warnings + + // Check for deprecated autoscaling field + //nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users + if v.spec.Autoscaling != nil { + warnings = append(warnings, fmt.Sprintf( + "%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+ + "with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md", + v.fieldPath)) + } + + return warnings, nil } // validateIngress validates the ingress configuration. @@ -105,20 +118,3 @@ func (v *SharedSpecValidator) validateSharedMemory() error { } return nil } - -// GetWarnings returns deprecation warnings for the spec. -// This should be called after Validate() to collect any deprecation notices. 
-func (v *SharedSpecValidator) GetWarnings() []string { - var warnings []string - - // Check for deprecated autoscaling field - //nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users - if v.spec.Autoscaling != nil { - warnings = append(warnings, fmt.Sprintf( - "%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+ - "with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md", - v.fieldPath)) - } - - return warnings -} diff --git a/deploy/cloud/operator/internal/webhook/validation/shared_test.go b/deploy/cloud/operator/internal/webhook/validation/shared_test.go index 0d009b4f0f..97cbe87e24 100644 --- a/deploy/cloud/operator/internal/webhook/validation/shared_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/shared_test.go @@ -184,7 +184,7 @@ func TestSharedSpecValidator_Validate(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { validator := NewSharedSpecValidator(tt.spec, tt.fieldPath) - err := validator.Validate() + _, err := validator.Validate() if (err != nil) != tt.wantErr { t.Errorf("SharedSpecValidator.Validate() error = %v, wantErr %v", err, tt.wantErr) diff --git a/docs/kubernetes/api_reference.md b/docs/kubernetes/api_reference.md index e943dabd81..4ae3246155 100644 --- a/docs/kubernetes/api_reference.md +++ b/docs/kubernetes/api_reference.md @@ -179,8 +179,9 @@ _Appears in:_ | `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.
It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field
that allows overriding the main container configuration. | | | | `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | | | `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | | -| `replicas` _integer_ | Replicas is the desired number of Pods for this component when autoscaling is not used. | | | +| `replicas` _integer_ | Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled (default), this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0
| | `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | | +| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly. | | | #### DynamoComponentDeploymentSpec @@ -216,8 +217,9 @@ _Appears in:_ | `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.
It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field
that allows overriding the main container configuration. | | | | `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | | | `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | | -| `replicas` _integer_ | Replicas is the desired number of Pods for this component when autoscaling is not used. | | | +| `replicas` _integer_ | Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled (default), this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0
| | `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | | +| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly. | | | #### DynamoGraphDeployment @@ -718,6 +720,25 @@ _Appears in:_ | `claims` _[ResourceClaim](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourceclaim-v1-core) array_ | Claims specifies resource claims for dynamic resource allocation | | | +#### ScalingAdapter + + + +ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter +for replica management. When enabled (default), the DGDSA owns the replicas field and +external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource. + + + +_Appears in:_ +- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec) +- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `disable` _boolean_ | Disable indicates whether the ScalingAdapter should be disabled for this service.
When false (default), a DGDSA is created and owns the replicas field.
When true, no DGDSA is created and replicas can be modified directly in the DGD. | false | | + + #### SharedMemorySpec From f1cb53b65f73de58cc2456cf1aedce6819afa932 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 8 Dec 2025 16:53:18 -0700 Subject: [PATCH 16/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- .../dynamographdeployment_controller_test.go | 321 ++++++++++++++++++ .../webhook/validation/shared_test.go | 50 +++ docs/kubernetes/autoscaling.md | 82 ++++- 3 files changed, 451 insertions(+), 2 deletions(-) create mode 100644 deploy/cloud/operator/internal/controller/dynamographdeployment_controller_test.go diff --git a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller_test.go b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller_test.go new file mode 100644 index 0000000000..a217fd403c --- /dev/null +++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller_test.go @@ -0,0 +1,321 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controller + +import ( + "context" + "testing" + + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestDynamoGraphDeploymentReconciler_reconcileScalingAdapters(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + tests := []struct { + name string + dgd *v1alpha1.DynamoGraphDeployment + existingAdapters []v1alpha1.DynamoGraphDeploymentScalingAdapter + expectedAdapterCount int + expectedAdapters map[string]int32 // map of adapter name to expected replicas + expectDeleted []string // adapter names that should be deleted + }{ + { + name: "creates adapters for all services", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + "decode": { + Replicas: ptr.To(int32(3)), + }, + }, + }, + }, + expectedAdapterCount: 2, + expectedAdapters: map[string]int32{ + "test-dgd-frontend": 2, + "test-dgd-decode": 3, + }, + }, + { + name: "uses default replicas when not specified", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "worker": {}, + }, + }, + }, + expectedAdapterCount: 1, + expectedAdapters: map[string]int32{ + 
"test-dgd-worker": 1, // default replicas + }, + }, + { + name: "skips adapter creation when disabled", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + "decode": { + Replicas: ptr.To(int32(3)), + ScalingAdapter: &v1alpha1.ScalingAdapter{ + Disable: true, + }, + }, + }, + }, + }, + expectedAdapterCount: 1, + expectedAdapters: map[string]int32{ + "test-dgd-frontend": 2, + }, + }, + { + name: "deletes adapter when service is removed", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + UID: "test-uid", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + }, + }, + }, + existingAdapters: []v1alpha1.DynamoGraphDeploymentScalingAdapter{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "nvidia.com/v1alpha1", + Kind: "DynamoGraphDeployment", + Name: "test-dgd", + UID: "test-uid", + }, + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 2, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-removed", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "nvidia.com/v1alpha1", + Kind: "DynamoGraphDeployment", + Name: "test-dgd", + UID: "test-uid", + }, + }, + }, + Spec: 
v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 1, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "removed", + }, + }, + }, + }, + expectedAdapterCount: 1, + expectedAdapters: map[string]int32{ + "test-dgd-frontend": 2, + }, + expectDeleted: []string{"test-dgd-removed"}, + }, + { + name: "deletes adapter when scalingAdapter.disable is set to true", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + UID: "test-uid", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + ScalingAdapter: &v1alpha1.ScalingAdapter{ + Disable: true, + }, + }, + }, + }, + }, + existingAdapters: []v1alpha1.DynamoGraphDeploymentScalingAdapter{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "nvidia.com/v1alpha1", + Kind: "DynamoGraphDeployment", + Name: "test-dgd", + UID: "test-uid", + }, + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 2, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + }, + }, + expectedAdapterCount: 0, + expectedAdapters: map[string]int32{}, + expectDeleted: []string{"test-dgd-frontend"}, + }, + { + name: "adapter name uses lowercase service name", + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "MyService": { + Replicas: ptr.To(int32(1)), + }, + }, + }, + }, + expectedAdapterCount: 1, + expectedAdapters: map[string]int32{ + "my-dgd-myservice": 1, // lowercase + }, + }, + } 
+ + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Build initial objects + var initObjs []client.Object + initObjs = append(initObjs, tt.dgd) + for i := range tt.existingAdapters { + initObjs = append(initObjs, &tt.existingAdapters[i]) + } + + // Create fake client + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(initObjs...). + Build() + + // Create reconciler + r := &DynamoGraphDeploymentReconciler{ + Client: fakeClient, + Recorder: record.NewFakeRecorder(10), + } + + // Run reconcileScalingAdapters + ctx := context.Background() + err := r.reconcileScalingAdapters(ctx, tt.dgd) + if err != nil { + t.Fatalf("reconcileScalingAdapters() error = %v", err) + } + + // Verify adapters + adapterList := &v1alpha1.DynamoGraphDeploymentScalingAdapterList{} + if err := fakeClient.List(ctx, adapterList, client.InNamespace("default")); err != nil { + t.Fatalf("Failed to list adapters: %v", err) + } + + if len(adapterList.Items) != tt.expectedAdapterCount { + t.Errorf("Expected %d adapters, got %d", tt.expectedAdapterCount, len(adapterList.Items)) + } + + // Check expected adapters exist with correct replicas + for name, expectedReplicas := range tt.expectedAdapters { + adapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{} + err := fakeClient.Get(ctx, types.NamespacedName{Name: name, Namespace: "default"}, adapter) + if err != nil { + t.Errorf("Expected adapter %s to exist, but got error: %v", name, err) + continue + } + if adapter.Spec.Replicas != expectedReplicas { + t.Errorf("Adapter %s has replicas=%d, expected %d", name, adapter.Spec.Replicas, expectedReplicas) + } + } + + // Check that deleted adapters don't exist + for _, name := range tt.expectDeleted { + adapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{} + err := fakeClient.Get(ctx, types.NamespacedName{Name: name, Namespace: "default"}, adapter) + if err == nil { + t.Errorf("Expected adapter %s to be deleted, but it still exists", name) + } + } + }) 
+ } +} diff --git a/deploy/cloud/operator/internal/webhook/validation/shared_test.go b/deploy/cloud/operator/internal/webhook/validation/shared_test.go index 97cbe87e24..b7a2687cbd 100644 --- a/deploy/cloud/operator/internal/webhook/validation/shared_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/shared_test.go @@ -197,3 +197,53 @@ func TestSharedSpecValidator_Validate(t *testing.T) { }) } } + +func TestSharedSpecValidator_Validate_Warnings(t *testing.T) { + validReplicas := int32(3) + + tests := []struct { + name string + spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec + fieldPath string + wantWarnings int + }{ + { + name: "no warnings for spec without autoscaling", + spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ + Replicas: &validReplicas, + }, + fieldPath: "spec", + wantWarnings: 0, + }, + { + name: "warning for deprecated autoscaling field enabled", + spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{ + Replicas: &validReplicas, + //nolint:staticcheck // SA1019: Intentionally testing deprecated field + Autoscaling: &nvidiacomv1alpha1.Autoscaling{ + Enabled: true, + MinReplicas: 1, + MaxReplicas: 10, + }, + }, + fieldPath: "spec", + wantWarnings: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + validator := NewSharedSpecValidator(tt.spec, tt.fieldPath) + warnings, err := validator.Validate() + + if err != nil { + t.Errorf("SharedSpecValidator.Validate() unexpected error = %v", err) + return + } + + if len(warnings) != tt.wantWarnings { + t.Errorf("SharedSpecValidator.Validate() warnings count = %d, want %d", len(warnings), tt.wantWarnings) + } + }) + } +} diff --git a/docs/kubernetes/autoscaling.md b/docs/kubernetes/autoscaling.md index d51d0ad6bf..8adaf09107 100644 --- a/docs/kubernetes/autoscaling.md +++ b/docs/kubernetes/autoscaling.md @@ -37,7 +37,7 @@ spec: ## Overview -Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAdapter` (DGDSA) 
resource. When you deploy a DGD, the operator automatically creates one adapter per service. These adapters implement the Kubernetes [Scale subresource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), enabling integration with: +Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAdapter` (DGDSA) resource. When you deploy a DGD, the operator automatically creates one adapter per service (unless explicitly disabled). These adapters implement the Kubernetes [Scale subresource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), enabling integration with: | Autoscaler | Description | Best For | |------------|-------------|----------| @@ -46,6 +46,8 @@ Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAd | **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads | | **Custom Controllers** | Any scale-subresource-compatible controller | Custom requirements | +> **⚠️ Deprecation Notice**: The `spec.services[X].autoscaling` field in DGD is **deprecated and ignored**. Use DGDSA with HPA, KEDA, or Planner instead. If you have existing DGDs with `autoscaling` configured, you'll see a warning. Remove the field to silence the warning. + ## Architecture ``` @@ -88,6 +90,61 @@ kubectl get dgdsa -n default # sglang-agg-decode sglang-agg decode 1 5m ``` +## Replica Ownership Model + +When DGDSA is enabled (the default), it becomes the **source of truth** for replica counts. This follows the same pattern as Kubernetes Deployments owning ReplicaSets. + +### How It Works + +1. **DGDSA owns replicas**: Autoscalers (HPA, KEDA, Planner) update the DGDSA's `spec.replicas` +2. **DGDSA syncs to DGD**: The DGDSA controller writes the replica count to the DGD's service +3. 
**Direct DGD edits blocked**: A validating webhook prevents users from directly editing `spec.services[X].replicas` in the DGD +4. **Controllers allowed**: Only authorized controllers (operator, Planner) can modify DGD replicas + +### Manual Scaling with DGDSA Enabled + +When DGDSA is enabled, use `kubectl scale` on the adapter (not the DGD): + +```bash +# ✅ Correct - scale via DGDSA +kubectl scale dgdsa sglang-agg-decode --replicas=3 + +# ❌ Blocked - direct DGD edit rejected by webhook +kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}' +# Error: spec.services[decode].replicas cannot be modified directly when scaling adapter is enabled; +# use 'kubectl scale dgdsa/sglang-agg-decode --replicas=3' or update the DynamoGraphDeploymentScalingAdapter instead +``` + +## Disabling DGDSA for a Service + +If you want to manage replicas directly in the DGD (without autoscaling), you can disable the scaling adapter per service: + +```yaml +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: sglang-agg +spec: + services: + Frontend: + replicas: 2 + scalingAdapter: + disable: true # ← No DGDSA created, direct edits allowed + + decode: + replicas: 1 # ← DGDSA created by default, managed via adapter +``` + +**When to disable DGDSA:** +- You want simple, manual replica management +- You don't need autoscaling for that service +- You prefer direct DGD edits over adapter-based scaling + +**When to keep DGDSA enabled (default):** +- You want to use HPA, KEDA, or Planner for autoscaling +- You want a clear separation between "desired scale" (adapter) and "deployment config" (DGD) +- You want protection against accidental direct replica edits + ## Autoscaling with Dynamo Planner The Dynamo Planner is an LLM-aware autoscaler that optimizes scaling decisions based on inference-specific metrics like Time To First Token (TTFT), Inter-Token Latency (ITL), and KV cache utilization. 
@@ -527,7 +584,9 @@ spec: ## Manual Scaling -You can manually scale a service using the scale subresource: +### With DGDSA Enabled (Default) + +When DGDSA is enabled (the default), scale via the adapter: ```bash kubectl scale dgdsa sglang-agg-decode -n default --replicas=3 @@ -545,6 +604,25 @@ kubectl get dgdsa sglang-agg-decode -n default > **Note**: If an autoscaler (KEDA, HPA, Planner) is managing the adapter, your change will be overwritten on the next evaluation cycle. +### With DGDSA Disabled + +If you've disabled the scaling adapter for a service, edit the DGD directly: + +```bash +kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}' +``` + +Or edit the YAML: + +```yaml +spec: + services: + decode: + replicas: 3 + scalingAdapter: + disable: true +``` + ## Best Practices ### 1. Choose One Autoscaler Per Service From f4abc599ad91e3958bf79afe6d5ac7b4e87199ff Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 8 Dec 2025 17:44:51 -0700 Subject: [PATCH 17/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- ...raphdeploymentscalingadapter_controller.go | 23 ++----------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go index 4a0577d1de..edaa4323ae 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller.go @@ -100,26 +100,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co currentReplicas = *component.Replicas } - // 4. 
Detect out-of-band DGD changes (Scenario 1: User manually edited DGD) - // If DGD replicas differ from adapter status, DGD was modified externally - if currentReplicas != adapter.Status.Replicas { - logger.Info("Detected out-of-band DGD change, syncing adapter from DGD", - "service", adapter.Spec.DGDRef.ServiceName, - "dgdReplicas", currentReplicas, - "adapterStatusReplicas", adapter.Status.Replicas) - - // Sync adapter spec from DGD (treat DGD as source of truth for out-of-band changes) - adapter.Spec.Replicas = currentReplicas - if err := r.Update(ctx, adapter); err != nil { - logger.Error(err, "Failed to sync adapter spec from DGD") - return ctrl.Result{}, err - } - - r.Recorder.Eventf(adapter, corev1.EventTypeNormal, "Synced", - "Synced adapter from DGD manual edit: replicas=%d", currentReplicas) - } - - // 5. Update DGD if replicas changed + // 4. Update DGD if replicas changed (DGDSA is the source of truth) if currentReplicas != adapter.Spec.Replicas { // Update the service's replicas in DGD component.Replicas = &adapter.Spec.Replicas @@ -146,7 +127,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co adapter.Status.LastScaleTime = &now } - // 7. Update adapter status + // 5. 
Update adapter status adapter.Status.Replicas = adapter.Spec.Replicas adapter.Status.Selector = r.buildPodSelector(dgd, adapter.Spec.DGDRef.ServiceName) From e3a25ac4a56e810add4af5ef0ac02498cd03bdd4 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 8 Dec 2025 18:07:44 -0700 Subject: [PATCH 18/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- ...eploymentscalingadapter_controller_test.go | 512 ++++++++++++++++++ 1 file changed, 512 insertions(+) create mode 100644 deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go new file mode 100644 index 0000000000..ccbaf68327 --- /dev/null +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go @@ -0,0 +1,512 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package controller + +import ( + "context" + "testing" + + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" + "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + tests := []struct { + name string + adapter *v1alpha1.DynamoGraphDeploymentScalingAdapter + dgd *v1alpha1.DynamoGraphDeployment + expectedDGDReplicas int32 + expectedStatusReplicas int32 + expectError bool + expectRequeue bool + }{ + { + name: "updates DGD replicas when DGDSA spec differs", + adapter: &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 5, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + }, + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + }, + }, + }, + expectedDGDReplicas: 5, + expectedStatusReplicas: 5, + expectError: false, + }, + { + name: "no update when replicas already match", + adapter: &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: 
"default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 3, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + }, + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(3)), + }, + }, + }, + }, + expectedDGDReplicas: 3, + expectedStatusReplicas: 3, + expectError: false, + }, + { + name: "uses default replicas (1) when DGD service has no replicas set", + adapter: &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-worker", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 4, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "worker", + }, + }, + }, + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "worker": {}, // no replicas set + }, + }, + }, + expectedDGDReplicas: 4, + expectedStatusReplicas: 4, + expectError: false, + }, + { + name: "error when service not found in DGD", + adapter: &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-missing", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 2, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "nonexistent", + }, + }, + }, + dgd: &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + 
"Frontend": { + Replicas: ptr.To(int32(1)), + }, + }, + }, + }, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Build initial objects + var initObjs []client.Object + initObjs = append(initObjs, tt.adapter, tt.dgd) + + // Create fake client with status subresource support + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(initObjs...). + WithStatusSubresource(&v1alpha1.DynamoGraphDeploymentScalingAdapter{}). + Build() + + // Create reconciler + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + Scheme: scheme.Scheme, + Recorder: record.NewFakeRecorder(10), + } + + // Run Reconcile + ctx := context.Background() + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: tt.adapter.Name, + Namespace: tt.adapter.Namespace, + }, + } + + result, err := r.Reconcile(ctx, req) + + // Check error expectation + if tt.expectError && err == nil { + t.Errorf("Expected error, but got none") + } + if !tt.expectError && err != nil { + t.Errorf("Unexpected error: %v", err) + } + + // Skip further checks if error was expected + if tt.expectError { + return + } + + // Check requeue + if tt.expectRequeue && result.Requeue == false && result.RequeueAfter == 0 { + t.Errorf("Expected requeue, but got none") + } + + // Verify DGD replicas were updated + updatedDGD := &v1alpha1.DynamoGraphDeployment{} + if err := fakeClient.Get(ctx, types.NamespacedName{Name: tt.dgd.Name, Namespace: tt.dgd.Namespace}, updatedDGD); err != nil { + t.Fatalf("Failed to get updated DGD: %v", err) + } + + service, exists := updatedDGD.Spec.Services[tt.adapter.Spec.DGDRef.ServiceName] + if !exists { + t.Fatalf("Service %s not found in updated DGD", tt.adapter.Spec.DGDRef.ServiceName) + } + + actualReplicas := int32(1) + if service.Replicas != nil { + actualReplicas = *service.Replicas + } + + if actualReplicas != tt.expectedDGDReplicas { + t.Errorf("DGD service replicas = %d, expected %d", 
actualReplicas, tt.expectedDGDReplicas) + } + + // Verify adapter status was updated + updatedAdapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{} + if err := fakeClient.Get(ctx, types.NamespacedName{Name: tt.adapter.Name, Namespace: tt.adapter.Namespace}, updatedAdapter); err != nil { + t.Fatalf("Failed to get updated adapter: %v", err) + } + + if updatedAdapter.Status.Replicas != tt.expectedStatusReplicas { + t.Errorf("Adapter status.replicas = %d, expected %d", updatedAdapter.Status.Replicas, tt.expectedStatusReplicas) + } + + // Verify selector is set + if updatedAdapter.Status.Selector == "" { + t.Errorf("Adapter status.selector is empty, expected non-empty") + } + }) + } +} + +func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile_NotFound(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + // Create fake client with no objects + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). 
+ Build() + + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + Scheme: scheme.Scheme, + Recorder: record.NewFakeRecorder(10), + } + + ctx := context.Background() + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: "nonexistent", + Namespace: "default", + }, + } + + // Should return no error when adapter not found (client.IgnoreNotFound) + result, err := r.Reconcile(ctx, req) + if err != nil { + t.Errorf("Expected no error for not found adapter, got: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("Expected no requeueAfter for not found adapter, got: %v", result.RequeueAfter) + } +} + +func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile_DGDNotFound(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + adapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 5, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "nonexistent-dgd", + ServiceName: "Frontend", + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(adapter). 
+ Build() + + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + Scheme: scheme.Scheme, + Recorder: record.NewFakeRecorder(10), + } + + ctx := context.Background() + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: adapter.Name, + Namespace: adapter.Namespace, + }, + } + + // Should return error when DGD not found + _, err := r.Reconcile(ctx, req) + if err == nil { + t.Errorf("Expected error when DGD not found, got none") + } +} + +func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile_BeingDeleted(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + now := metav1.Now() + adapter := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + DeletionTimestamp: &now, + Finalizers: []string{"test-finalizer"}, // Required for deletion timestamp to be set + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + Replicas: 5, + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + } + + dgd := &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + Spec: v1alpha1.DynamoGraphDeploymentSpec{ + Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{ + "Frontend": { + Replicas: ptr.To(int32(2)), + }, + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(adapter, dgd). 
+ Build() + + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + Scheme: scheme.Scheme, + Recorder: record.NewFakeRecorder(10), + } + + ctx := context.Background() + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: adapter.Name, + Namespace: adapter.Namespace, + }, + } + + // Should return no error and skip reconciliation + result, err := r.Reconcile(ctx, req) + if err != nil { + t.Errorf("Expected no error for deleting adapter, got: %v", err) + } + if result.RequeueAfter != 0 { + t.Errorf("Expected no requeueAfter for deleting adapter, got: %v", result.RequeueAfter) + } + + // DGD replicas should NOT be updated (still 2) + updatedDGD := &v1alpha1.DynamoGraphDeployment{} + if err := fakeClient.Get(ctx, types.NamespacedName{Name: dgd.Name, Namespace: dgd.Namespace}, updatedDGD); err != nil { + t.Fatalf("Failed to get DGD: %v", err) + } + + if *updatedDGD.Spec.Services["Frontend"].Replicas != 2 { + t.Errorf("DGD replicas should remain unchanged, got %d", *updatedDGD.Spec.Services["Frontend"].Replicas) + } +} + +func TestDynamoGraphDeploymentScalingAdapterReconciler_findAdaptersForDGD(t *testing.T) { + // Register custom types with the scheme + if err := v1alpha1.AddToScheme(scheme.Scheme); err != nil { + t.Fatalf("Failed to add v1alpha1 to scheme: %v", err) + } + + dgd := &v1alpha1.DynamoGraphDeployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd", + Namespace: "default", + }, + } + + // Adapters belonging to test-dgd + adapter1 := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-dgd-frontend", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "Frontend", + }, + }, + } + + adapter2 := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: 
metav1.ObjectMeta{ + Name: "test-dgd-decode", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "test-dgd", + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "test-dgd", + ServiceName: "decode", + }, + }, + } + + // Adapter belonging to different DGD + adapterOther := &v1alpha1.DynamoGraphDeploymentScalingAdapter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "other-dgd-frontend", + Namespace: "default", + Labels: map[string]string{ + consts.KubeLabelDynamoGraphDeploymentName: "other-dgd", + }, + }, + Spec: v1alpha1.DynamoGraphDeploymentScalingAdapterSpec{ + DGDRef: v1alpha1.DynamoGraphDeploymentServiceRef{ + Name: "other-dgd", + ServiceName: "Frontend", + }, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(adapter1, adapter2, adapterOther). + Build() + + r := &DynamoGraphDeploymentScalingAdapterReconciler{ + Client: fakeClient, + } + + ctx := context.Background() + requests := r.findAdaptersForDGD(ctx, dgd) + + // Should return 2 requests (for test-dgd adapters only) + if len(requests) != 2 { + t.Errorf("findAdaptersForDGD() returned %d requests, expected 2", len(requests)) + } + + // Verify correct adapters are returned + expectedNames := map[string]bool{ + "test-dgd-frontend": true, + "test-dgd-decode": true, + } + + for _, req := range requests { + if !expectedNames[req.Name] { + t.Errorf("Unexpected adapter in results: %s", req.Name) + } + } +} From 03604d6d7f596a1162cfe766f22fb4ab86519533 Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Mon, 8 Dec 2025 18:27:34 -0700 Subject: [PATCH 19/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- .../dynamographdeploymentscalingadapter_controller_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go 
b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go index ccbaf68327..33c6b9f5e8 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentscalingadapter_controller_test.go @@ -219,7 +219,7 @@ func TestDynamoGraphDeploymentScalingAdapterReconciler_Reconcile(t *testing.T) { } // Check requeue - if tt.expectRequeue && result.Requeue == false && result.RequeueAfter == 0 { + if tt.expectRequeue && result.RequeueAfter == 0 { t.Errorf("Expected requeue, but got none") } From 159f30b1aded94a9805fd823d00babee95e5a1cb Mon Sep 17 00:00:00 2001 From: Julien Mancuso Date: Tue, 9 Dec 2025 07:39:20 -0700 Subject: [PATCH 20/20] fix: add scaling adapter Signed-off-by: Julien Mancuso --- .../validation/dynamographdeployment.go | 43 +++++++++++-------- .../dynamographdeployment_handler.go | 22 +++++----- .../validation/dynamographdeployment_test.go | 3 +- 3 files changed, 38 insertions(+), 30 deletions(-) diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go index d880501626..00a1668806 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go @@ -68,37 +68,42 @@ func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error) } // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment. +// userInfo is used for identity-based validation (replica protection). +// If userInfo is nil, replica changes for DGDSA-enabled services are rejected (fail closed). // Returns warnings and error. 
-func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment) (admission.Warnings, error) { - return v.ValidateUpdateWithUserInfo(old, nil) -} +func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) (admission.Warnings, error) { + var warnings admission.Warnings -// ValidateUpdateWithUserInfo performs stateful validation with user identity checking. -// When userInfo is provided, it validates that only allowed controllers can modify -// replicas for services with scaling adapter enabled. -// Returns warnings and error. -func (v *DynamoGraphDeploymentValidator) ValidateUpdateWithUserInfo(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) (admission.Warnings, error) { - // Validate that BackendFramework is not changed (immutable) - if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework { - warning := "Changing spec.backendFramework may cause unexpected behavior" - return admission.Warnings{warning}, fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation") + // Validate immutable fields + if err := v.validateImmutableFields(old, &warnings); err != nil { + return warnings, err } // Validate replicas changes for services with scaling adapter enabled - if userInfo != nil { - if err := v.validateReplicasChanges(old, *userInfo); err != nil { - return nil, err - } + // Pass userInfo (may be nil - will fail closed for DGDSA-enabled services) + if err := v.validateReplicasChanges(old, userInfo); err != nil { + return warnings, err } - return nil, nil + return warnings, nil +} + +// validateImmutableFields checks that immutable fields have not been changed. +// Appends warnings to the provided slice. 
+func (v *DynamoGraphDeploymentValidator) validateImmutableFields(old *nvidiacomv1alpha1.DynamoGraphDeployment, warnings *admission.Warnings) error { + if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework { + *warnings = append(*warnings, "Changing spec.backendFramework may cause unexpected behavior") + return fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation") + } + return nil } // validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled. // Only authorized service accounts (operator controller, planner) can modify these fields. -func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo authenticationv1.UserInfo) error { +// If userInfo is nil, all replica changes for DGDSA-enabled services are rejected (fail closed). +func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) error { // If the request comes from an authorized service account, allow the change - if internalwebhook.CanModifyDGDReplicas(userInfo) { + if userInfo != nil && internalwebhook.CanModifyDGDReplicas(*userInfo) { return nil } diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go index f200e91486..e98bd03442 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go @@ -23,6 +23,7 @@ import ( nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1" internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook" + authenticationv1 "k8s.io/api/authentication/v1" "k8s.io/apimachinery/pkg/runtime" "sigs.k8s.io/controller-runtime/pkg/log" 
"sigs.k8s.io/controller-runtime/pkg/manager" @@ -92,22 +93,23 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb } // Get user info from admission request context for identity-based validation + var userInfo *authenticationv1.UserInfo req, err := admission.RequestFromContext(ctx) if err != nil { - logger.Error(err, "failed to get admission request from context, skipping user-based validation") - // Fall back to basic validation without user info - updateWarnings, err := validator.ValidateUpdate(oldDeployment) - if err != nil { - return updateWarnings, err - } - warnings = append(warnings, updateWarnings...) - return warnings, nil + logger.Error(err, "failed to get admission request from context, replica changes for DGDSA-enabled services will be rejected") + // userInfo remains nil - validateReplicasChanges will fail closed + } else { + userInfo = &req.UserInfo } // Validate stateful rules (immutability + replicas protection) - updateWarnings, err := validator.ValidateUpdateWithUserInfo(oldDeployment, &req.UserInfo) + updateWarnings, err := validator.ValidateUpdate(oldDeployment, userInfo) if err != nil { - logger.Info("validation failed", "error", err.Error(), "user", req.UserInfo.Username) + username := "" + if userInfo != nil { + username = userInfo.Username + } + logger.Info("validation failed", "error", err.Error(), "user", username) return updateWarnings, err } diff --git a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go index de354c25bc..71228327b6 100644 --- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go +++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go @@ -419,7 +419,8 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { validator := 
NewDynamoGraphDeploymentValidator(tt.newDeployment) - warnings, err := validator.ValidateUpdate(tt.oldDeployment) + // Pass nil userInfo - these tests don't modify replicas, so it's safe + warnings, err := validator.ValidateUpdate(tt.oldDeployment, nil) if (err != nil) != tt.wantErr { t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)