[None][feat] Not CUDA graph captured eagle3 one-model draft loop #10251
base: main
tensorrt_llm/_torch/models/modeling_speculative.py

```diff
@@ -796,6 +796,7 @@ def __init__(self, model: TModel, model_config: ModelConfig[TConfig]):
         assert key in model_config.extra_attrs
         model_config.extra_attrs[key].update(value)
         self.layer_idx = -1
+        self.enable_cuda_graph_for_draft_model = spec_config.enable_cuda_graph_for_draft_model

     def forward(
         self,
@@ -823,33 +824,15 @@ def forward(
         if attn_metadata.padded_num_tokens is not None:
             hidden_states = hidden_states[:attn_metadata.num_tokens]

+        is_capturing = torch.cuda.is_current_stream_capturing()
+
         if self.draft_model is not None:
-            # get logits
-            logits = self.logits_processor.forward(
-                hidden_states[spec_metadata.gather_ids],
-                self.lm_head,
-                attn_metadata,
-                True,
-            )
-            mtp_input_ids = input_ids
-            mtp_position_ids = position_ids
-            if attn_metadata.padded_num_tokens is not None:
-                if input_ids is not None:
-                    # Slice along the first dimension
-                    mtp_input_ids = input_ids[:attn_metadata.num_tokens]
-                if position_ids is not None:
-                    # Slice along the last dimension
-                    mtp_position_ids = position_ids[:, :attn_metadata.
-                                                    num_tokens]
-
-            # get accepted tokens and next draft tokens
-            return self.spec_worker(input_ids=mtp_input_ids,
-                                    position_ids=mtp_position_ids,
-                                    hidden_states=hidden_states,
-                                    logits=logits,
-                                    attn_metadata=attn_metadata,
-                                    spec_metadata=spec_metadata,
-                                    draft_model=self.draft_model)
+            if is_capturing and not self.enable_cuda_graph_for_draft_model:
+                return hidden_states
+            else:
+                return self.forward_draft(hidden_states, input_ids,
+                                          position_ids, attn_metadata,
+                                          spec_metadata)
```
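For readers unfamiliar with the gating trick the diff relies on, here is a minimal, self-contained sketch (not the PR's classes; `GatedModule` and `draft_step` are hypothetical stand-ins): during capture the module returns its hidden states early, and the skipped step runs eagerly after each replay.

```python
import torch

class GatedModule(torch.nn.Module):
    def __init__(self, draft_in_graph: bool):
        super().__init__()
        self.draft_in_graph = draft_in_graph
        self.linear = torch.nn.Linear(16, 16)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.linear(x)
        # While the current stream is being captured, optionally stop here so
        # the draft step stays outside the CUDA graph.
        if torch.cuda.is_current_stream_capturing() and not self.draft_in_graph:
            return hidden
        return self.draft_step(hidden)

    def draft_step(self, hidden: torch.Tensor) -> torch.Tensor:
        return hidden * 2  # stand-in for the real draft-token loop

model = GatedModule(draft_in_graph=False).cuda()
static_x = torch.randn(4, 16, device="cuda")

# Warm up on a side stream before capture, as the PyTorch CUDA-graph docs advise.
side = torch.cuda.Stream()
side.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side):
    model(static_x)
torch.cuda.current_stream().wait_stream(side)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_hidden = model(static_x)  # capture stops at the early return

graph.replay()
out = model.draft_step(static_hidden)  # the skipped step runs eagerly
```

Because `static_hidden` is the tensor recorded at capture time, each `graph.replay()` refreshes it in place, so the eager `draft_step` always sees the latest hidden states.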
|
Comment on lines +827 to +835
Contributor
**Fix return type annotation.** The method's return type is annotated as …
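The visible part of the comment stops mid-sentence; presumably the annotation should cover every path the reworked `forward` can return through. A hedged sketch under that assumption (the class name appears in the repository, but the exact signature and the spec worker's return type are guesses, not the file's actual code):

```python
from typing import Any, Dict, Union

import torch

class SpecDecOneEngineForCausalLM(torch.nn.Module):
    def forward(self, input_ids, position_ids, attn_metadata, spec_metadata,
                **kwargs) -> Union[torch.Tensor, Dict[str, Any]]:
        # Three possible returns: plain logits (no draft model), raw
        # hidden_states (the new early return while a CUDA graph is being
        # captured), or the spec worker's accepted/draft-token output.
        ...
```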
```diff
         else:
             logits = self.logits_processor.forward(
                 hidden_states,
@@ -860,6 +843,34 @@ def forward(

         return logits

+    def forward_draft(self, hidden_states, input_ids, position_ids,
+                      attn_metadata, spec_metadata):
+        # get logits
+        logits = self.logits_processor.forward(
+            hidden_states[spec_metadata.gather_ids],
+            self.lm_head,
+            attn_metadata,
+            True,
+        )
+        mtp_input_ids = input_ids
+        mtp_position_ids = position_ids
+        if attn_metadata.padded_num_tokens is not None:
+            if input_ids is not None:
+                # Slice along the first dimension
+                mtp_input_ids = input_ids[:attn_metadata.num_tokens]
+            if position_ids is not None:
+                # Slice along the last dimension
+                mtp_position_ids = position_ids[:, :attn_metadata.num_tokens]
+
+        # get accepted tokens and next draft tokens
+        return self.spec_worker(input_ids=mtp_input_ids,
+                                position_ids=mtp_position_ids,
+                                hidden_states=hidden_states,
+                                logits=logits,
+                                attn_metadata=attn_metadata,
+                                spec_metadata=spec_metadata,
+                                draft_model=self.draft_model)
+
     def load_weights(self,
                      weights: Dict,
                      weight_mapper: Optional[BaseWeightMapper] = None,
```
tensorrt_llm/_torch/pyexecutor/model_engine.py

```diff
@@ -338,6 +338,7 @@ def __init__(
             ) or self.model_is_wrapped
             self.max_draft_len = spec_config.max_draft_len
             self.max_total_draft_tokens = spec_config.max_total_draft_tokens
+            self.enable_cuda_graph_for_draft_model = spec_config.enable_cuda_graph_for_draft_model
         else:
             self.without_logits = False
             self.max_draft_len = 0
```
Contributor

**Guard the draft-only replay path.** Right now the post-replay `forward_draft` call is gated only by `enable_cuda_graph_for_draft_model`, with no check that speculative decoding is active.
You likely only intend to run `forward_draft` for the Eagle3 one-model path, when the draft loop was deliberately kept out of the captured graph.

Proposed fix: initialize the flag safely and gate the `forward_draft` call:

```diff
@@
-        self.llm_args = llm_args
-        self.original_max_draft_len = spec_config.max_draft_len if spec_config is not None else 0
-        self.original_max_total_draft_tokens = spec_config.max_total_draft_tokens if spec_config is not None else 0
+        self.llm_args = llm_args
+        self.original_max_draft_len = spec_config.max_draft_len if spec_config is not None else 0
+        self.original_max_total_draft_tokens = spec_config.max_total_draft_tokens if spec_config is not None else 0
@@
-        self.spec_config = spec_config
-        self.is_spec_decode = spec_config is not None
+        self.spec_config = spec_config
+        self.is_spec_decode = spec_config is not None
+        # Default to True so non-speculative executors never take the draft-only path.
+        self.enable_cuda_graph_for_draft_model = (
+            spec_config.enable_cuda_graph_for_draft_model
+            if spec_config is not None else True
+        )
         self.sparse_attention_config = None if is_draft_model else llm_args.sparse_attention_config
         self.enable_spec_decode = self.is_spec_decode
         self.is_draft_model = is_draft_model
@@
-                else:
-                    with MoeLoadBalancerIterContext(moe_load_balancer):
-                        outputs = self.cuda_graph_runner.replay(key, inputs)
-                    if not self.enable_cuda_graph_for_draft_model:
-                        outputs = self.model.forward_draft(
-                            outputs, inputs['input_ids'],
-                            inputs['position_ids'],
-                            inputs['attn_metadata'],
-                            inputs['spec_metadata'])
+                else:
+                    with MoeLoadBalancerIterContext(moe_load_balancer):
+                        outputs = self.cuda_graph_runner.replay(key, inputs)
+                    # When speculative decoding is enabled but we opted out of
+                    # capturing the draft loop in the CUDA graph, run the
+                    # draft-only pass after replay.
+                    if (self.enable_spec_decode
+                            and not self.enable_cuda_graph_for_draft_model):
+                        outputs = self.model.forward_draft(
+                            outputs,
+                            inputs['input_ids'],
+                            inputs['position_ids'],
+                            inputs['attn_metadata'],
+                            inputs['spec_metadata'],
+                        )
```

This keeps non-speculative flows and non-Eagle3 models on the existing path while enabling the new "draft outside CUDA graph" behavior only where it is explicitly enabled.

Also applies to: 3269-3274
|
```diff
@@ -3265,6 +3266,12 @@ def capture_postprocess_fn(inputs: Dict[str, Any]):
             else:
                 with MoeLoadBalancerIterContext(moe_load_balancer):
                     outputs = self.cuda_graph_runner.replay(key, inputs)
+                if not self.enable_cuda_graph_for_draft_model:
+                    outputs = self.model.forward_draft(
+                        outputs, inputs['input_ids'],
+                        inputs['position_ids'],
+                        inputs['attn_metadata'],
+                        inputs['spec_metadata'])

         if self.forward_pass_callable is not None:
             self.forward_pass_callable()
```
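The hunk above is the code the earlier comment wants guarded. A self-contained sketch of the gated control flow, with the suggested `enable_spec_decode` check folded in (`runner` and `model` are hypothetical stand-ins, not TensorRT-LLM's actual classes):

```python
from typing import Any, Dict

def replay_with_optional_draft(runner: Any, model: Any, key: Any,
                               inputs: Dict[str, Any],
                               enable_spec_decode: bool,
                               draft_in_graph: bool) -> Any:
    """Replay the captured graph, then run the draft loop eagerly when it
    was deliberately left out of the capture (a sketch, not repository API)."""
    outputs = runner.replay(key, inputs)
    if enable_spec_decode and not draft_in_graph:
        # `outputs` holds the hidden states the truncated forward returned
        # during capture; forward_draft turns them into draft tokens.
        outputs = model.forward_draft(outputs,
                                      inputs['input_ids'],
                                      inputs['position_ids'],
                                      inputs['attn_metadata'],
                                      inputs['spec_metadata'])
    return outputs
```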
**Potential `AttributeError` when `spec_config` is `None`.** The `spec_config` variable can be `None` (assigned via `getattr(model_config, 'spec_config', None)` on line 741). Accessing `spec_config.enable_cuda_graph_for_draft_model` directly without a null check will raise an `AttributeError`.

Note: defaulting to `True` preserves backward-compatible behavior (CUDA graph capture enabled by default).
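The committable suggestion itself is collapsed in the page, but the note above and the other reviewer's diff imply a guard along these lines (`resolve_draft_cuda_graph_flag` is an illustrative helper, not repository code):

```python
from typing import Any, Optional

def resolve_draft_cuda_graph_flag(spec_config: Optional[Any]) -> bool:
    """Read the flag only when a speculative config exists; default to True
    so CUDA graph capture of the draft loop stays the default behavior."""
    if spec_config is None:
        return True
    return getattr(spec_config, "enable_cuda_graph_for_draft_model", True)
```

With this helper, `self.enable_cuda_graph_for_draft_model = resolve_draft_cuda_graph_flag(spec_config)` stays safe for non-speculative executors.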