migrating to cuda_config
KrishnanPrash committed Jul 28, 2025
commit 45397330b492062b0a0081f660c138ed2f62f264
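
For orientation, a minimal before/after sketch of the field mapping this commit applies to the engine configs below. The key names come from the diff itself; the 2-space indentation and the abbreviated batch-size list are illustrative, since the diff view strips leading whitespace.

Before (flat PyTorch-backend fields):

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 256   # ... full list as in the diffs below
kv_cache_dtype: fp8
moe_config:
  backend: WideEP

After (nested config sections):

cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 256   # ... full list as in the diffs below
kv_cache_config:
  dtype: fp8
moe_config:
  backend: WIDEEP
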
@@ -4,7 +4,7 @@ backend: pytorch

# WideEP related settings
moe_config:
-backend: WideEP
+backend: WIDEEP
# moe_max_num_tokens will default to max_num_tokens if left unspecified.
#
# If you want to set this value explicitly, one recommendation is below:
@@ -20,18 +20,20 @@ enable_attention_dp: true
max_batch_size: 256
max_num_tokens: 256
max_seq_len: 8448

kv_cache_config:
free_gpu_memory_fraction: 0.7
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-kv_cache_dtype: fp8
+dtype: fp8

+cuda_graph_config:
+enable_padding: true
+batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256
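
Reassembled from the added lines above, the migrated section of this config would read roughly as follows; the nesting is implied by the new keys, but the exact indentation is assumed because the diff view drops leading whitespace.

kv_cache_config:
  free_gpu_memory_fraction: 0.7
  dtype: fp8

cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256

The second config in this commit, below, applies the same migration to the WideEP setup that uses the EPLB load balancer.
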
@@ -16,7 +16,7 @@ backend: pytorch

# WideEP related settings
moe_config:
-backend: WideEP
+backend: WIDEEP
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

# TP/EP/PP/DP
@@ -36,25 +36,28 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory.
free_gpu_memory_fraction: 0.30
+dtype: fp8


# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-# NOTE: For larger max batch size, you may want to add larger cuda graph
-# batch sizes below to match.
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
+cuda_graph_config:
+enable_padding: true
+# NOTE: For larger max batch size, you may want to
+# add larger cuda graph batch sizes below to match.
+batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256


print_iter_log: true
-kv_cache_dtype: fp8
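
The NOTEs kept in the hunk above refer to two upstream TensorRT-LLM changes: the pytorch_backend_config section was flattened to the top level (PR #4603), and the overlap-scheduler switch was renamed and inverted from enable_overlap_scheduler to disable_overlap_scheduler, with the scheduler now on by default. A rough sketch of that earlier migration, reconstructed from the NOTE rather than from this diff:

# Old style (nested, scheduler opt-in); layout assumed, pre-#4603:
pytorch_backend_config:
  enable_overlap_scheduler: true

# Current style (flattened, scheduler on unless disabled):
disable_overlap_scheduler: false

The other NOTE is a reminder that if max_batch_size were raised above 256, correspondingly larger entries (for example 512, an illustrative value) should be appended to cuda_graph_config.batch_sizes. The third config in this commit, below, is touched more lightly: its visible hunks only rename the MoE backend and relocate the kv-cache dtype.
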
@@ -16,7 +16,7 @@ backend: pytorch

# WideEP related settings
moe_config:
-backend: WideEP
+backend: WIDEEP
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

# TP/EP/PP/DP
@@ -31,12 +31,11 @@ max_seq_len: 8192

kv_cache_config:
free_gpu_memory_fraction: 0.75
+dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs

# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true
-print_iter_log: true
-# NOTE: This dtype must match in both prefill/decode configs
-kv_cache_dtype: fp8
+print_iter_log: true
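
The inline NOTE that travels with the relocated dtype field is worth spelling out: kv_cache_config.dtype has to be set to the same value in the prefill and the decode engine configs. Presumably this is because the KV-cache blocks produced by the prefill engine are consumed by the decode engine, so a dtype mismatch would make them incompatible; that rationale is an inference, while the requirement itself is stated in the NOTE. A minimal sketch of the matching sections:

# In the prefill engine config:
kv_cache_config:
  dtype: fp8

# In the decode engine config (must match the prefill value):
kv_cache_config:
  dtype: fp8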