Skip to content
Merged
3 changes: 1 addition & 2 deletions deploy/cloud/helm/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
export ENABLE_LWS="${ENABLE_LWS:=false}"
export ENABLE_GROVE="${ENABLE_GROVE:=false}"

# Add command line options
INTERACTIVE=false
Expand Down Expand Up @@ -165,7 +164,7 @@ echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX"
echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
echo "INSTALL_CRDS: $INSTALL_CRDS"

envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} ${ENABLE_GROVE}' < dynamo-platform-values.yaml > generated-values.yaml
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}' < dynamo-platform-values.yaml > generated-values.yaml
echo "generated file contents:"
cat generated-values.yaml

Expand Down
1 change: 0 additions & 1 deletion deploy/cloud/helm/dynamo-platform-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ dynamo-operator:

dynamo:
enableLWS: ${ENABLE_LWS}
enableGrove: ${ENABLE_GROVE}
ingress:
enabled: ${INGRESS_ENABLED}
className: ${INGRESS_CLASS}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ spec:
{{- if .Values.dynamo.enableLWS }}
- --enable-lws
{{- end }}
{{- if .Values.dynamo.enableGrove }}
- --enable-grove
{{- if .Values.dynamo.groveTerminationDelay }}
- --grove-termination-delay={{ .Values.dynamo.groveTerminationDelay }}
{{- end }}
command:
- /manager
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ rules:
- patch
- update
- watch
{{- if .Values.dynamo.enableGrove }}
- apiGroups:
- grove.io
resources:
Expand All @@ -129,7 +128,6 @@ rules:
- patch
- update
- watch
{{- end }}
- apiGroups:
- apps
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ dynamo:
annotations: {}

enableLWS: false
enableGrove: false
groveTerminationDelay: 15m

internalImages:
debugger: python:3.12-slim
Expand Down
2 changes: 1 addition & 1 deletion deploy/cloud/helm/platform/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dynamo-operator:
imagePullSecrets: []
dynamo:
enableLWS: false
enableGrove: false
groveTerminationDelay: 15m
internalImages:
debugger: python:3.12-slim
enableRestrictedSecurityContext: false
Expand Down
28 changes: 20 additions & 8 deletions deploy/cloud/operator/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
// to ensure that exec-entrypoint and run can make use of them.
clientv3 "go.etcd.io/etcd/client/v3"
corev1 "k8s.io/api/core/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth"
Expand All @@ -50,6 +51,7 @@ import (

grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/etcd"
Expand All @@ -73,6 +75,10 @@ func init() {
utilruntime.Must(volcanoscheme.AddToScheme(scheme))

utilruntime.Must(grovev1alpha1.AddToScheme(scheme))

utilruntime.Must(apiextensionsv1.AddToScheme(scheme))

utilruntime.Must(istioclientsetscheme.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

Expand All @@ -92,7 +98,7 @@ func main() {
var ingressControllerTLSSecretName string
var ingressHostSuffix string
var enableLWS bool
var enableGrove bool
var groveTerminationDelay time.Duration
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand Down Expand Up @@ -120,22 +126,23 @@ func main() {
"The suffix to use for the ingress host")
flag.BoolVar(&enableLWS, "enable-lws", false,
"If set, enable leader worker set")
flag.BoolVar(&enableGrove, "enable-grove", false,
"If set, enable grove")
flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay,
"The termination delay for Grove PodGangSets")
opts := zap.Options{
Development: true,
}
opts.BindFlags(flag.CommandLine)
flag.Parse()

utilruntime.Must(istioclientsetscheme.AddToScheme(scheme))

ctrlConfig := commonController.Config{
RestrictedNamespace: restrictedNamespace,
EnableLWS: enableLWS,
EnableGrove: enableGrove,
EtcdAddress: etcdAddr,
NatsAddress: natsAddr,
Grove: commonController.GroveConfig{
Enabled: false, // Will be set after Grove discovery
TerminationDelay: groveTerminationDelay,
},
EtcdAddress: etcdAddr,
NatsAddress: natsAddr,
IngressConfig: commonController.IngressConfig{
VirtualServiceGateway: istioVirtualServiceGateway,
IngressControllerClassName: ingressControllerClassName,
Expand Down Expand Up @@ -201,6 +208,11 @@ func main() {
os.Exit(1)
}

// Detect Grove availability using discovery client
setupLog.Info("Detecting Grove availability...")
groveEnabled := commonController.DetectGroveAvailability(mainCtx, mgr)
ctrlConfig.Grove.Enabled = groveEnabled

// Create etcd client
cli, err := clientv3.New(clientv3.Config{
Endpoints: []string{etcdAddr},
Expand Down
4 changes: 4 additions & 0 deletions deploy/cloud/operator/internal/consts/consts.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package consts

import "time"

const (
HPACPUDefaultAverageUtilization = 80

Expand Down Expand Up @@ -37,4 +39,6 @@ const (
PlannerServiceAccountName = "planner-serviceaccount"

DefaultIngressSuffix = "local"

DefaultGroveTerminationDelay = 15 * time.Minute
)
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ type Resource interface {

func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx)
if r.Config.EnableGrove {
if r.Config.Grove.Enabled {
// check if explicit opt out of grove
if dynamoDeployment.Annotations[consts.KubeAnnotationEnableGrove] == consts.KubeLabelValueFalse {
logger.Info("Grove is explicitly disabled for this deployment, skipping grove resources reconciliation")
Expand Down Expand Up @@ -308,7 +308,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
GenericFunc: func(ge event.GenericEvent) bool { return true },
})).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config))
if r.Config.EnableGrove {
if r.Config.Grove.Enabled {
ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{
// ignore creation cause we don't want to be called again after we create the pod gang set
CreateFunc: func(ce event.CreateEvent) bool { return false },
Expand Down
49 changes: 48 additions & 1 deletion deploy/cloud/operator/internal/controller_common/predicate.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,28 @@ package controller_common
import (
"context"
"strings"
"time"

"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/discovery"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)

type GroveConfig struct {
// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster
Enabled bool
// TerminationDelay configures the termination delay for Grove PodGangSets
TerminationDelay time.Duration
}

type Config struct {
// Enable resources filtering, only the resources belonging to the given namespace will be handled.
RestrictedNamespace string
EnableLWS bool
EnableGrove bool
Grove GroveConfig
EtcdAddress string
NatsAddress string
IngressConfig IngressConfig
Expand All @@ -48,6 +58,43 @@ func (i *IngressConfig) UseVirtualService() bool {
return i.VirtualServiceGateway != ""
}

// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool {
logger := log.FromContext(ctx)

// Use the discovery client to check if Grove API groups are available
cfg := mgr.GetConfig()
if cfg == nil {
logger.Info("Grove detection failed, no discovery client available")
return false
}

// Try to create a discovery client
discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg)
if err != nil {
logger.Error(err, "Grove detection failed, could not create discovery client")
return false
}

// Check if grove.io API group is available
apiGroups, err := discoveryClient.ServerGroups()
if err != nil {
logger.Error(err, "Grove detection failed, could not list server groups")
return false
}

for _, group := range apiGroups.Groups {
if group.Name == "grove.io" {
logger.Info("Grove is available, grove.io API group found")
return true
}
}

logger.Info("Grove not available, grove.io API group not found")
return false
}

func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
return predicate.NewPredicateFuncs(func(o client.Object) bool {
l := log.FromContext(context.Background())
Expand Down
3 changes: 3 additions & 0 deletions deploy/cloud/operator/internal/dynamo/graph.go
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,9 @@ func GenerateGrovePodGangSet(ctx context.Context, dynamoDeployment *v1alpha1.Dyn
gangSet.Name = dynamoDeployment.Name
gangSet.Namespace = dynamoDeployment.Namespace
gangSet.Spec.Replicas = 1
if controllerConfig.Grove.TerminationDelay > 0 {
gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay}
}
for componentName, component := range dynamoDeployment.Spec.Services {
container := corev1.Container{
Name: "main",
Expand Down
5 changes: 5 additions & 0 deletions deploy/cloud/operator/internal/dynamo/graph_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"reflect"
"sort"
"testing"
"time"

grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/dynamo/common"
Expand Down Expand Up @@ -1136,6 +1137,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
controllerConfig: controller_common.Config{
EtcdAddress: "etcd-address",
NatsAddress: "nats-address",
Grove: controller_common.GroveConfig{
TerminationDelay: 15 * time.Minute,
},
},
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -1272,6 +1276,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
Spec: grovev1alpha1.PodGangSetSpec{
Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{
TerminationDelay: &metav1.Duration{Duration: 15 * time.Minute},
Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
{
Name: "frontend",
Expand Down
Loading