moved allreduce_strategy to BaseLlmArgs
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
MrGeva committed Nov 10, 2025
commit c8ea3ca85863b4fba17c0026bad0335d5a9ce22f
19 changes: 0 additions & 19 deletions tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -123,25 +123,6 @@ class AutoDeployConfig(DynamicYamlMixInForSettings, BaseSettings):

     device: str = Field(default="cuda", description="The device to use for the model.", frozen=True)
 
-    allreduce_strategy: Literal[
-        "AUTO",
-        "NCCL",
-        "ONESHOT",
-        "TWOSHOT",
-        "MIN_LATENCY",
-        "LOWPRECISION",
-        "UB",
-        "MNNVL",
-        "NCCL_SYMMETRIC",
-    ] = Field(
-        default="AUTO",
-        description="AllReduce strategy for distributed inference. Options: AUTO (automatic selection), "
-        "NCCL (NCCL-based), ONESHOT (single-phase fusion kernel), TWOSHOT (two-phase fusion kernel), "
-        "MIN_LATENCY (minimum latency heuristic), LOWPRECISION (low precision allreduce), "
-        "UB (unified buffer), MNNVL (multi-node NVLINK), NCCL_SYMMETRIC (NCCL symmetric). "
-        "AUTO is recommended for most use cases.",
-    )
-
     # TODO: see if we can just remove this field and use kv_cache_config.dtype instead?
     kv_cache_dtype: str = Field(
         default="auto",
18 changes: 12 additions & 6 deletions tensorrt_llm/llmapi/llm_args.py
@@ -1577,6 +1577,18 @@ class BaseLlmArgs(StrictBaseModel):
         default=None,
         description="The expert parallel size for MoE models' expert weights.")
 
+    allreduce_strategy: Optional[Literal[
+        'AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT',
+        'LOWPRECISION', 'MNNVL', 'NCCL_SYMMETRIC']] = Field(
+            default='AUTO',
+            description=
+            "AllReduce strategy for distributed inference. Options: AUTO (automatic selection), "
+            "NCCL (NCCL-based), ONESHOT (single-phase fusion kernel), TWOSHOT (two-phase fusion kernel), "
+            "MINLATENCY (minimum latency heuristic), LOWPRECISION (low precision allreduce), "
+            "UB (unified buffer), MNNVL (multi-node NVLINK), NCCL_SYMMETRIC (NCCL symmetric). "
+            "AUTO is recommended for most use cases.",
+            status="beta")
+
     enable_attention_dp: bool = Field(
         default=False,
         description="Enable attention data parallel.",
@@ -2531,12 +2543,6 @@ class TorchLlmArgs(BaseLlmArgs):
status="prototype",
)

allreduce_strategy: Optional[Literal[
'AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT',
'LOWPRECISION', 'MNNVL',
'NCCL_SYMMETRIC']] = Field(default='AUTO',
description="Allreduce strategy to use.",
status="beta")
checkpoint_loader: Optional[object] = Field(
default=None,
description=
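With this commit, allreduce_strategy is declared once on BaseLlmArgs and inherited by backend argument classes such as TorchLlmArgs, instead of being re-declared per backend. A minimal usage sketch, assuming the field is passed through the LLM API as a keyword argument like other BaseLlmArgs fields; the checkpoint path and parallel size below are placeholders:

    # Sketch: selecting an allreduce strategy through the LLM API.
    # "NCCL" is one of the accepted Literal values; the default is "AUTO".
    from tensorrt_llm import LLM

    llm = LLM(
        model="/path/to/checkpoint",  # placeholder model path
        tensor_parallel_size=2,       # allreduce is only exercised with TP > 1
        allreduce_strategy="NCCL",    # now a BaseLlmArgs field shared across backends
    )

Declaring the field on the base class keeps the option set and its documentation in one place, at the cost of exposing the knob to backends that may not act on it.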