Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix: add scaling adapter
Signed-off-by: Julien Mancuso <[email protected]>
  • Loading branch information
julienmancuso committed Dec 8, 2025
commit 7706428833d6c2951daa5ff44cd0bb22bed3886d
Original file line number Diff line number Diff line change
Expand Up @@ -10189,8 +10189,12 @@ spec:
type: integer
type: object
replicas:
description: Replicas is the desired number of Pods for this component when autoscaling is not used.
description: |-
Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled (default), this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly.
format: int32
minimum: 0
type: integer
resources:
description: |-
Expand Down Expand Up @@ -10269,6 +10273,20 @@ spec:
type: string
type: object
type: object
scalingAdapter:
description: |-
ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly.
properties:
disable:
default: false
description: |-
Disable indicates whether the ScalingAdapter should be disabled for this service.
When false (default), a DGDSA is created and owns the replicas field.
When true, no DGDSA is created and replicas can be modified directly in the DGD.
type: boolean
type: object
serviceName:
description: The name of the component
type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10324,8 +10324,12 @@ spec:
type: integer
type: object
replicas:
description: Replicas is the desired number of Pods for this component when autoscaling is not used.
description: |-
Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled (default), this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly.
format: int32
minimum: 0
type: integer
resources:
description: |-
Expand Down Expand Up @@ -10404,6 +10408,20 @@ spec:
type: string
type: object
type: object
scalingAdapter:
description: |-
ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly.
properties:
disable:
default: false
description: |-
Disable indicates whether the ScalingAdapter should be disabled for this service.
When false (default), a DGDSA is created and owns the replicas field.
When true, no DGDSA is created and replicas can be modified directly in the DGD.
type: boolean
type: object
serviceName:
description: The name of the component
type: string
Expand Down
12 changes: 12 additions & 0 deletions deploy/cloud/operator/api/v1alpha1/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,15 @@ type ExtraPodSpec struct {
*corev1.PodSpec `json:",inline"`
MainContainer *corev1.Container `json:"mainContainer,omitempty"`
}

// ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter
// (DGDSA) for replica management. When enabled (default), the DGDSA owns the replicas field and
// external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource.
//
// The zero value leaves the adapter enabled. The DynamoGraphDeployment reconciler also
// treats an unset (nil) scalingAdapter on a service as enabled; only an explicit
// disable=true turns the adapter off.
type ScalingAdapter struct {
// Disable indicates whether the ScalingAdapter should be disabled for this service.
// When false (default), a DGDSA is created and owns the replicas field.
// When true, no DGDSA is created and replicas can be modified directly in the DGD.
// +optional
// +kubebuilder:default=false
Disable bool `json:"disable,omitempty"`
}
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,18 @@ type DynamoComponentDeploymentSharedSpec struct {
LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty"`
// ReadinessProbe to signal when the container is ready to receive traffic.
ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty"`
// Replicas is the desired number of Pods for this component when autoscaling is not used.
// Replicas is the desired number of Pods for this component.
// When scalingAdapter is enabled (default), this field is managed by the
// DynamoGraphDeploymentScalingAdapter and should not be modified directly.
// +kubebuilder:validation:Minimum=0
Replicas *int32 `json:"replicas,omitempty"`
// Multinode is the configuration for multinode components.
Multinode *MultinodeSpec `json:"multinode,omitempty"`
// ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
// When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
// the service using the Scale subresource. When disabled, replicas can be modified directly.
// +optional
ScalingAdapter *ScalingAdapter `json:"scalingAdapter,omitempty"`
}

type MultinodeSpec struct {
Expand Down
20 changes: 20 additions & 0 deletions deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -10189,8 +10189,12 @@ spec:
type: integer
type: object
replicas:
description: Replicas is the desired number of Pods for this component when autoscaling is not used.
description: |-
Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled (default), this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly.
format: int32
minimum: 0
type: integer
resources:
description: |-
Expand Down Expand Up @@ -10269,6 +10273,20 @@ spec:
type: string
type: object
type: object
scalingAdapter:
description: |-
ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly.
properties:
disable:
default: false
description: |-
Disable indicates whether the ScalingAdapter should be disabled for this service.
When false (default), a DGDSA is created and owns the replicas field.
When true, no DGDSA is created and replicas can be modified directly in the DGD.
type: boolean
type: object
serviceName:
description: The name of the component
type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10324,8 +10324,12 @@ spec:
type: integer
type: object
replicas:
description: Replicas is the desired number of Pods for this component when autoscaling is not used.
description: |-
Replicas is the desired number of Pods for this component.
When scalingAdapter is enabled (default), this field is managed by the
DynamoGraphDeploymentScalingAdapter and should not be modified directly.
format: int32
minimum: 0
type: integer
resources:
description: |-
Expand Down Expand Up @@ -10404,6 +10408,20 @@ spec:
type: string
type: object
type: object
scalingAdapter:
description: |-
ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.
When enabled (default), replicas are managed via DGDSA and external autoscalers can scale
the service using the Scale subresource. When disabled, replicas can be modified directly.
properties:
disable:
default: false
description: |-
Disable indicates whether the ScalingAdapter should be disabled for this service.
When false (default), a DGDSA is created and owns the replicas field.
When true, no DGDSA is created and replicas can be modified directly in the DGD.
type: boolean
type: object
serviceName:
description: The name of the component
type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -616,19 +616,24 @@ func (r *DynamoGraphDeploymentReconciler) reconcilePVCs(ctx context.Context, dyn
}

// reconcileScalingAdapters ensures a DynamoGraphDeploymentScalingAdapter exists for each service in the DGD
// This enables pluggable autoscaling via HPA, KEDA, or Planner
// that has scaling adapter enabled (default). Services with scalingAdapter.disable=true will not have a DGDSA.
// This enables pluggable autoscaling via HPA, KEDA, or Planner.
func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error {
logger := log.FromContext(ctx)

// Create or update an adapter for each service using SyncResource pattern
// Process each service - SyncResource handles create, update, and delete via toDelete flag
for serviceName, component := range dynamoDeployment.Spec.Services {
// Check if scaling adapter is disabled for this service
scalingAdapterDisabled := component.ScalingAdapter != nil && component.ScalingAdapter.Disable

// Get current replicas (default to 1 if not set)
currentReplicas := int32(1)
if component.Replicas != nil {
currentReplicas = *component.Replicas
}

// Use SyncResource to handle creation/updates
// Use SyncResource to handle creation/updates/deletion
// When toDelete=true, SyncResource will delete the existing resource if it exists
_, _, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter, bool, error) {
adapterName := generateAdapterName(dynamoDeployment.Name, serviceName)
adapter := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapter{
Expand All @@ -648,7 +653,8 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C
},
},
}
return adapter, false, nil
// Return toDelete=true if scaling adapter is disabled
return adapter, scalingAdapterDisabled, nil
})

if err != nil {
Expand All @@ -657,7 +663,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C
}
}

// Clean up orphaned adapters (services that no longer exist in DGD)
// Clean up adapters for services that were removed from DGD entirely
adapterList := &nvidiacomv1alpha1.DynamoGraphDeploymentScalingAdapterList{}
if err := r.List(ctx, adapterList,
client.InNamespace(dynamoDeployment.Namespace),
Expand All @@ -671,7 +677,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileScalingAdapters(ctx context.C
adapter := &adapterList.Items[i]
serviceName := adapter.Spec.DGDRef.ServiceName

// Check if service still exists in DGD
// Delete adapter if service no longer exists in DGD
if _, exists := dynamoDeployment.Spec.Services[serviceName]; !exists {
logger.Info("Deleting orphaned DynamoGraphDeploymentScalingAdapter", "adapter", adapter.Name, "service", serviceName)
if err := r.Delete(ctx, adapter); err != nil && !errors.IsNotFound(err) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) Reconcile(ctx context.Co

// 3. Find the target service in DGD's spec.services map
component, exists := dgd.Spec.Services[adapter.Spec.DGDRef.ServiceName]
if !exists {
if !exists || component == nil {
logger.Error(nil, "Service not found in DGD",
"service", adapter.Spec.DGDRef.ServiceName,
"dgd", dgd.Name,
Expand Down
53 changes: 53 additions & 0 deletions deploy/cloud/operator/internal/webhook/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ package webhook

import (
"context"
"strings"

authenticationv1 "k8s.io/api/authentication/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
Expand Down Expand Up @@ -118,3 +120,54 @@ func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool {

return false
}

// DGDReplicasModifierSuffixes lists the service-account name suffixes that are
// permitted to change DGD replicas while the scaling adapter is enabled.
// Only the service-account name is compared against these entries, so a
// matching account is accepted no matter which namespace it lives in.
var DGDReplicasModifierSuffixes = []string{
	// Operator controller manager, which reconciles DGDSA resources,
	// e.g. "dynamo-platform-dynamo-operator-controller-manager".
	"-dynamo-operator-controller-manager",

	// Planner service account, which adjusts DGD replicas when autoscaling,
	// e.g. "planner-serviceaccount".
	"planner-serviceaccount",
}

// CanModifyDGDReplicas reports whether the requesting identity is a service
// account that may change DGD replicas while the scaling adapter is enabled.
//
// Only usernames of the form system:serviceaccount:<namespace>:<name> are
// considered; anything else is rejected. The <name> part is then compared
// against DGDReplicasModifierSuffixes:
//   - *-dynamo-operator-controller-manager (for DGDSA reconciliation)
//   - *planner-serviceaccount (for Planner autoscaling)
//
// The namespace is included in the debug log entry but plays no part in the
// decision.
func CanModifyDGDReplicas(userInfo authenticationv1.UserInfo) bool {
	const serviceAccountPrefix = "system:serviceaccount:"

	if !strings.HasPrefix(userInfo.Username, serviceAccountPrefix) {
		return false
	}

	// What remains after the prefix must be exactly "<namespace>:<name>".
	fields := strings.Split(strings.TrimPrefix(userInfo.Username, serviceAccountPrefix), ":")
	if len(fields) != 2 {
		return false
	}
	ns, name := fields[0], fields[1]

	// The first matching suffix wins; the match is logged at verbosity 1.
	for i := range DGDReplicasModifierSuffixes {
		allowed := DGDReplicasModifierSuffixes[i]
		if !strings.HasSuffix(name, allowed) {
			continue
		}
		webhookCommonLog.V(1).Info("allowing DGD replicas modification",
			"serviceAccount", name,
			"namespace", ns,
			"matchedSuffix", allowed)
		return true
	}

	return false
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,10 @@ func NewDynamoComponentDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoC
func (v *DynamoComponentDeploymentValidator) Validate() (admission.Warnings, error) {
	// Validate shared spec fields using SharedSpecValidator, rooted at the
	// "spec" field path.
	sharedValidator := NewSharedSpecValidator(&v.deployment.Spec.DynamoComponentDeploymentSharedSpec, "spec")

	// DCD-specific validation would go here (currently none)

	// The shared validator's warnings and error are passed through unchanged,
	// so there is a single return path and no separate warning collection.
	return sharedValidator.Validate()
}

// ValidateUpdate performs stateful validation comparing old and new DynamoComponentDeployment.
Expand Down
Loading
Loading