migrating to cuda_config
KrishnanPrash committed Jul 28, 2025
commit 45397330b492062b0a0081f660c138ed2f62f264
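
For orientation, a minimal before/after sketch of the field mapping this commit applies to the engine configs below. The key names come from the diff itself; the 2-space indentation and the abbreviated batch-size list are illustrative, since the diff view strips leading whitespace.

Before (flat PyTorch-backend fields):

use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 256   # ... full list as in the diffs below
kv_cache_dtype: fp8
moe_config:
  backend: WideEP

After (nested config sections):

cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 256   # ... full list as in the diffs below
kv_cache_config:
  dtype: fp8
moe_config:
  backend: WIDEEP
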
@@ -4,7 +4,7 @@ backend: pytorch

# WideEP related settings
moe_config:
-backend: WideEP
+backend: WIDEEP
# moe_max_num_tokens will default to max_num_tokens if left unspecified.
#
# If you want to set this value explicitly, one recommendation is below:
@@ -20,18 +20,20 @@ enable_attention_dp: true
max_batch_size: 256
max_num_tokens: 256
max_seq_len: 8448

kv_cache_config:
free_gpu_memory_fraction: 0.7
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
-kv_cache_dtype: fp8
+dtype: fp8

+cuda_graph_config:
+enable_padding: true
+batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256
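
Reassembled from the added lines above, the migrated section of this config would read roughly as follows; the nesting is implied by the new keys, but the exact indentation is assumed because the diff view drops leading whitespace.

kv_cache_config:
  free_gpu_memory_fraction: 0.7
  dtype: fp8

cuda_graph_config:
  enable_padding: true
  batch_sizes:
  - 1
  - 2
  - 4
  - 8
  - 16
  - 32
  - 64
  - 128
  - 256

The second config in this commit, below, applies the same migration to the WideEP setup that uses the EPLB load balancer.
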
@@ -16,7 +16,7 @@ backend: pytorch

# WideEP related settings
moe_config:
-backend: WideEP
+backend: WIDEEP
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

# TP/EP/PP/DP
@@ -36,25 +36,28 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory.
free_gpu_memory_fraction: 0.30
+dtype: fp8


# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-# NOTE: For larger max batch size, you may want to add larger cuda graph
-# batch sizes below to match.
-cuda_graph_batch_sizes:
-- 1
-- 2
-- 4
-- 8
-- 16
-- 32
-- 64
-- 128
-- 256
+cuda_graph_config:
+enable_padding: true
+# NOTE: For larger max batch size, you may want to
+# add larger cuda graph batch sizes below to match.
+batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256


print_iter_log: true
-kv_cache_dtype: fp8
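
The NOTEs kept in the hunk above refer to two upstream TensorRT-LLM changes: the pytorch_backend_config section was flattened to the top level (PR #4603), and the overlap-scheduler switch was renamed and inverted from enable_overlap_scheduler to disable_overlap_scheduler, with the scheduler now on by default. A rough sketch of that earlier migration, reconstructed from the NOTE rather than from this diff:

# Old style (nested, scheduler opt-in); layout assumed, pre-#4603:
pytorch_backend_config:
  enable_overlap_scheduler: true

# Current style (flattened, scheduler on unless disabled):
disable_overlap_scheduler: false

The other NOTE is a reminder that if max_batch_size were raised above 256, correspondingly larger entries (for example 512, an illustrative value) should be appended to cuda_graph_config.batch_sizes. The third config in this commit, below, is touched more lightly: its visible hunks only rename the MoE backend and relocate the kv-cache dtype.
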
@@ -16,7 +16,7 @@ backend: pytorch

# WideEP related settings
moe_config:
-backend: WideEP
+backend: WIDEEP
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml

# TP/EP/PP/DP
@@ -31,12 +31,11 @@ max_seq_len: 8192

kv_cache_config:
free_gpu_memory_fraction: 0.75
+dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs

# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true
-print_iter_log: true
-# NOTE: This dtype must match in both prefill/decode configs
-kv_cache_dtype: fp8
+print_iter_log: true
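
The inline NOTE that travels with the relocated dtype field is worth spelling out: kv_cache_config.dtype has to be set to the same value in the prefill and the decode engine configs. Presumably this is because the KV-cache blocks produced by the prefill engine are consumed by the decode engine, so a dtype mismatch would make them incompatible; that rationale is an inference, while the requirement itself is stated in the NOTE. A minimal sketch of the matching sections:

# In the prefill engine config:
kv_cache_config:
  dtype: fp8

# In the decode engine config (must match the prefill value):
kv_cache_config:
  dtype: fp8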