diff --git a/demos/cuj1-eks.md b/demos/cuj1-eks.md index 2fd5b4e41..4db45f6de 100644 --- a/demos/cuj1-eks.md +++ b/demos/cuj1-eks.md @@ -97,30 +97,10 @@ spec: nvidia.com/gpu: 1 limits: nvidia.com/gpu: 1 - # Inject AICR-standard GPU node scheduling. kubeflow-trainer v2.2.0 replaced - # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309). - runtimePatches: - - manager: aicr.nvidia.com/demo - trainingRuntimeSpec: - template: - spec: - replicatedJobs: - - name: node - template: - spec: - template: - spec: - nodeSelector: - nodeGroup: gpu-worker - tolerations: - - key: dedicated - operator: Equal - value: worker-workload - effect: NoSchedule - - key: dedicated - operator: Equal - value: worker-workload - effect: NoExecute + # No podTemplateOverrides / runtimePatches needed — the torch-distributed + # ClusterTrainingRuntime carries the cluster-aware nodeSelector and + # tolerations baked in at bundle time from --accelerated-node-selector / + # --accelerated-node-toleration flags. runtimeRef: name: torch-distributed apiGroup: trainer.kubeflow.org diff --git a/demos/cuj1-gke.md b/demos/cuj1-gke.md index 98215b7b8..a1879d27c 100644 --- a/demos/cuj1-gke.md +++ b/demos/cuj1-gke.md @@ -99,32 +99,10 @@ spec: nvidia.com/gpu: 1 limits: nvidia.com/gpu: 1 - # Inject GKE GPU node scheduling. Matches the snapshot/bundle/validate - # tolerations above (`dedicated=gpu-workload:NoSchedule` plus the GKE-managed - # `nvidia.com/gpu=present:NoSchedule` taint). kubeflow-trainer v2.2.0 replaced - # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309). 
- runtimePatches: - - manager: aicr.nvidia.com/demo - trainingRuntimeSpec: - template: - spec: - replicatedJobs: - - name: node - template: - spec: - template: - spec: - nodeSelector: - nodeGroup: gpu-worker - tolerations: - - key: dedicated - operator: Equal - value: gpu-workload - effect: NoSchedule - - key: nvidia.com/gpu - operator: Equal - value: present - effect: NoSchedule + # No podTemplateOverrides / runtimePatches needed — the torch-distributed + # ClusterTrainingRuntime carries the cluster-aware nodeSelector and + # tolerations baked in at bundle time from --accelerated-node-selector / + # --accelerated-node-toleration flags. runtimeRef: name: torch-distributed apiGroup: trainer.kubeflow.org diff --git a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml index 012668cd0..a65bb1f01 100644 --- a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml +++ b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml @@ -41,6 +41,22 @@ spec: spec: template: spec: + # nodeSelector and tolerations are injected by the AICR bundler + # from --accelerated-node-selector / --accelerated-node-toleration + # flags via the registry's nodeScheduling.accelerated paths + # (see recipes/registry.yaml). This lets users submit a bare + # TrainJob with no podTemplateOverrides / runtimePatches — the + # runtime carries the per-cluster scheduling vocabulary baked + # in at bundle time. $kft falls back to an empty dict when unset. + {{- $kft := index .Values "kubeflow-trainer" | default dict }} + {{- with $kft.acceleratedNodeSelector }} + nodeSelector: + {{- toYaml . | nindent 20 }} + {{- end }} + {{- with $kft.acceleratedTolerations }} + tolerations: + {{- toYaml . 
| nindent 20 }} + {{- end }} containers: - name: node image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime diff --git a/recipes/registry.yaml b/recipes/registry.yaml index ff94a24ba..af170a7b1 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -495,3 +495,13 @@ components: tolerationPaths: - manager.tolerations - jobset.controller.tolerations + # accelerated paths target top-level keys; consumed by the + # torch-distributed ClusterTrainingRuntime template in + # components/kubeflow-trainer/manifests/. Lets users submit a bare + # TrainJob with no podTemplateOverrides / runtimePatches — the + # runtime carries the per-cluster scheduling baked in at bundle time. + accelerated: + nodeSelectorPaths: + - acceleratedNodeSelector + tolerationPaths: + - acceleratedTolerations