diff --git a/demos/cuj1-eks.md b/demos/cuj1-eks.md
index 2fd5b4e41..4db45f6de 100644
--- a/demos/cuj1-eks.md
+++ b/demos/cuj1-eks.md
@@ -97,30 +97,10 @@ spec:
         nvidia.com/gpu: 1
       limits:
         nvidia.com/gpu: 1
-  # Inject AICR-standard GPU node scheduling. kubeflow-trainer v2.2.0 replaced
-  # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309).
-  runtimePatches:
-    - manager: aicr.nvidia.com/demo
-      trainingRuntimeSpec:
-        template:
-          spec:
-            replicatedJobs:
-              - name: node
-                template:
-                  spec:
-                    template:
-                      spec:
-                        nodeSelector:
-                          nodeGroup: gpu-worker
-                        tolerations:
-                          - key: dedicated
-                            operator: Equal
-                            value: worker-workload
-                            effect: NoSchedule
-                          - key: dedicated
-                            operator: Equal
-                            value: worker-workload
-                            effect: NoExecute
+  # No podTemplateOverrides / runtimePatches needed — the torch-distributed
+  # ClusterTrainingRuntime already carries the cluster-specific nodeSelector
+  # and tolerations, baked in at bundle time from the
+  # --accelerated-node-selector / --accelerated-node-toleration flags.
   runtimeRef:
     name: torch-distributed
     apiGroup: trainer.kubeflow.org
diff --git a/demos/cuj1-gke.md b/demos/cuj1-gke.md
index 98215b7b8..a1879d27c 100644
--- a/demos/cuj1-gke.md
+++ b/demos/cuj1-gke.md
@@ -99,32 +99,10 @@ spec:
         nvidia.com/gpu: 1
       limits:
         nvidia.com/gpu: 1
-  # Inject GKE GPU node scheduling. Matches the snapshot/bundle/validate
-  # tolerations above (`dedicated=gpu-workload:NoSchedule` plus the GKE-managed
-  # `nvidia.com/gpu=present:NoSchedule` taint). kubeflow-trainer v2.2.0 replaced
-  # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309).
-  runtimePatches:
-    - manager: aicr.nvidia.com/demo
-      trainingRuntimeSpec:
-        template:
-          spec:
-            replicatedJobs:
-              - name: node
-                template:
-                  spec:
-                    template:
-                      spec:
-                        nodeSelector:
-                          nodeGroup: gpu-worker
-                        tolerations:
-                          - key: dedicated
-                            operator: Equal
-                            value: gpu-workload
-                            effect: NoSchedule
-                          - key: nvidia.com/gpu
-                            operator: Equal
-                            value: present
-                            effect: NoSchedule
+  # No podTemplateOverrides / runtimePatches needed — the torch-distributed
+  # ClusterTrainingRuntime already carries the cluster-specific nodeSelector
+  # and tolerations, baked in at bundle time from the
+  # --accelerated-node-selector / --accelerated-node-toleration flags.
   runtimeRef:
     name: torch-distributed
     apiGroup: trainer.kubeflow.org
diff --git a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
index 012668cd0..a65bb1f01 100644
--- a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
+++ b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
@@ -41,6 +41,22 @@ spec:
             spec:
               template:
                 spec:
+                  # nodeSelector and tolerations are injected by the AICR bundler
+                  # from --accelerated-node-selector / --accelerated-node-toleration
+                  # flags via the registry's nodeScheduling.accelerated paths
+                  # (see recipes/registry.yaml), so a bare TrainJob needs no
+                  # podTemplateOverrides / runtimePatches. Both keys are optional:
+                  # a missing kubeflow-trainer values map falls back to an empty
+                  # dict, and each block is omitted when its value is unset.
+                  {{- $kft := index .Values "kubeflow-trainer" | default dict }}
+                  {{- with $kft.acceleratedNodeSelector }}
+                  nodeSelector:
+                    {{- toYaml . | nindent 20 }}
+                  {{- end }}
+                  {{- with $kft.acceleratedTolerations }}
+                  tolerations:
+                    {{- toYaml . | nindent 20 }}
+                  {{- end }}
                   containers:
                     - name: node
                       image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
diff --git a/recipes/registry.yaml b/recipes/registry.yaml
index ff94a24ba..af170a7b1 100644
--- a/recipes/registry.yaml
+++ b/recipes/registry.yaml
@@ -495,3 +495,13 @@ components:
         tolerationPaths:
           - manager.tolerations
           - jobset.controller.tolerations
+      # accelerated paths target top-level keys of this component's values;
+      # they are read by the torch-distributed ClusterTrainingRuntime template
+      # in components/kubeflow-trainer/manifests/. This lets users submit a
+      # bare TrainJob with no podTemplateOverrides / runtimePatches — the
+      # runtime carries the per-cluster scheduling baked in at bundle time.
+      accelerated:
+        nodeSelectorPaths:
+          - acceleratedNodeSelector
+        tolerationPaths:
+          - acceleratedTolerations