We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent f156221 · commit 64e3bfa · Copy full SHA for 64e3bfa
tensorrt_llm/_torch/speculative/model_drafter.py
```diff
@@ -151,8 +151,10 @@ def _create_draft_request_for_request(
             assert num_draft_tokens == 0
             return self._create_context_request(request, input_tokens)

-        # No tokens accepted - generation request
-        elif num_accepted_tokens == 0:
+        # No tokens accepted - generation request. This only applies to speculation algorithms
+        # that need to recompute KV cache for accepted tokens like eagle3.
+        elif num_accepted_tokens == 0 or not self.spec_config.spec_dec_mode.needs_kv_cache_recompute(
+        ):
             return self._create_generation_request(request, input_tokens)

         # Tokens accepted - chunked context request
```
0 commit comments