moved allreduce_strategy to BaseLlmArgs
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
MrGeva committed Nov 10, 2025
commit c8ea3ca85863b4fba17c0026bad0335d5a9ce22f
19 changes: 0 additions & 19 deletions tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -123,25 +123,6 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings):

     device: str = Field(default="cuda", description="The device to use for the model.", frozen=True)
 
-    allreduce_strategy: Literal[
-        "AUTO",
-        "NCCL",
-        "ONESHOT",
-        "TWOSHOT",
-        "MIN_LATENCY",
-        "LOWPRECISION",
-        "UB",
-        "MNNVL",
-        "NCCL_SYMMETRIC",
-    ] = Field(
-        default="AUTO",
-        description="AllReduce strategy for distributed inference. Options: AUTO (automatic selection), "
-        "NCCL (NCCL-based), ONESHOT (single-phase fusion kernel), TWOSHOT (two-phase fusion kernel), "
-        "MIN_LATENCY (minimum latency heuristic), LOWPRECISION (low precision allreduce), "
-        "UB (unified buffer), MNNVL (multi-node NVLINK), NCCL_SYMMETRIC (NCCL symmetric). "
-        "AUTO is recommended for most use cases.",
-    )
-
     # TODO: see if we can just remove this field and use kv_cache_config.dtype instead?
     kv_cache_dtype: str = Field(
         default="auto",
18 changes: 12 additions & 6 deletions tensorrt_llm/llmapi/llm_args.py
@@ -1577,6 +1577,18 @@ class BaseLlmArgs(StrictBaseModel):
         default=None,
         description="The expert parallel size for MoE models' expert weights.")
 
+    allreduce_strategy: Optional[Literal[
+        'AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT',
+        'LOWPRECISION', 'MNNVL', 'NCCL_SYMMETRIC']] = Field(
+            default='AUTO',
+            description=
+            "AllReduce strategy for distributed inference. Options: AUTO (automatic selection), "
+            "NCCL (NCCL-based), ONESHOT (single-phase fusion kernel), TWOSHOT (two-phase fusion kernel), "
+            "MINLATENCY (minimum latency heuristic), LOWPRECISION (low precision allreduce), "
+            "UB (unified buffer), MNNVL (multi-node NVLINK), NCCL_SYMMETRIC (NCCL symmetric). "
+            "AUTO is recommended for most use cases.",
+            status="beta")
+
     enable_attention_dp: bool = Field(
         default=False,
         description="Enable attention data parallel.",
@@ -2531,12 +2543,6 @@ class TorchLlmArgs(BaseLlmArgs):
status="prototype",
)

allreduce_strategy: Optional[Literal[
'AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT',
'LOWPRECISION', 'MNNVL',
'NCCL_SYMMETRIC']] = Field(default='AUTO',
description="Allreduce strategy to use.",
status="beta")
checkpoint_loader: Optional[object] = Field(
default=None,
description=
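With this commit, allreduce_strategy is declared once on BaseLlmArgs and inherited by backend argument classes such as TorchLlmArgs, instead of being re-declared per backend. A minimal usage sketch, assuming the field is passed through the LLM API as a keyword argument like other BaseLlmArgs fields; the checkpoint path and parallel size below are placeholders:

    # Sketch: selecting an allreduce strategy through the LLM API.
    # "NCCL" is one of the accepted Literal values; the default is "AUTO".
    from tensorrt_llm import LLM

    llm = LLM(
        model="/path/to/checkpoint",  # placeholder model path
        tensor_parallel_size=2,       # allreduce is only exercised with TP > 1
        allreduce_strategy="NCCL",    # now a BaseLlmArgs field shared across backends
    )

Declaring the field on the base class keeps the option set and its documentation in one place, at the cost of exposing the knob to backends that may not act on it.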