diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index d04f9a25352..eb8619308b3 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -870,7 +870,7 @@ def _executor_loop(self):
                     self.resource_manager.prepare_resources(scheduled_batch)
                     if self.drafter is not None:
                         self.drafter.prepare_draft_tokens(
-                            scheduled_batch, self.resource_manager)
+                            scheduled_batch, self.resource_manager, iter_stats)
 
                     batch_outputs = self._forward_step(scheduled_batch)
 
diff --git a/tensorrt_llm/_torch/speculative/drafter.py b/tensorrt_llm/_torch/speculative/drafter.py
index e08044cbb4f..abaa05985f1 100644
--- a/tensorrt_llm/_torch/speculative/drafter.py
+++ b/tensorrt_llm/_torch/speculative/drafter.py
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
 from typing import Optional
 
+from tensorrt_llm.bindings.executor import IterationStats
+
 from ..pyexecutor.resource_manager import ResourceManager
 from ..pyexecutor.scheduler import ScheduledRequests
 
@@ -13,6 +15,7 @@ def prepare_draft_tokens(
         self,
         scheduled_requests: ScheduledRequests,
         resource_manager: Optional[ResourceManager] = None,
+        iter_stats: Optional[IterationStats] = None,
     ) -> None:
         """
         Prepare the drafter tokens for the forward computation this step.
diff --git a/tensorrt_llm/_torch/speculative/model_drafter.py b/tensorrt_llm/_torch/speculative/model_drafter.py
index 53d7af3d360..38707d0e47e 100644
--- a/tensorrt_llm/_torch/speculative/model_drafter.py
+++ b/tensorrt_llm/_torch/speculative/model_drafter.py
@@ -6,6 +6,7 @@
 import torch
 
 from tensorrt_llm._utils import nvtx_range
+from tensorrt_llm.bindings.executor import IterationStats
 from tensorrt_llm.logger import logger
 
 from ..pyexecutor.llm_request import LlmRequest, LlmRequestState, SamplingConfig
@@ -297,6 +298,7 @@ def prepare_draft_tokens(
         self,
         scheduled_requests: ScheduledRequests,
         resource_manager: Optional[ResourceManager] = None,
+        iter_stats: Optional[IterationStats] = None,
     ) -> None:
         """
         Prepare draft tokens for the scheduled requests.
diff --git a/tensorrt_llm/_torch/speculative/ngram.py b/tensorrt_llm/_torch/speculative/ngram.py
index 9113900ef94..473e257bb1d 100644
--- a/tensorrt_llm/_torch/speculative/ngram.py
+++ b/tensorrt_llm/_torch/speculative/ngram.py
@@ -1,7 +1,9 @@
+import time
 from itertools import chain
 
 from ordered_set import OrderedSet
 
+from tensorrt_llm.bindings.executor import IterationStats
 from tensorrt_llm.logger import logger
 
 from ..pyexecutor.llm_request import *
@@ -174,6 +176,21 @@ def prepare_draft_tokens(
         self,
         scheduled_requests: ScheduledRequests,
         resource_manager: Optional[ResourceManager] = None,
+        iter_stats: Optional[IterationStats] = None,
+    ) -> None:
+
+        if iter_stats is not None:
+            start_time = time.time()
+
+        self._prepare_draft_tokens(scheduled_requests)
+
+        if iter_stats is not None:
+            self._update_ngram_iter_stats(scheduled_requests, iter_stats,
+                                          start_time)
+
+    def _prepare_draft_tokens(
+        self,
+        scheduled_requests: ScheduledRequests,
     ) -> None:
         # Sort by request_id when py_batch_idx is None as a fallback.
         # This happens in the disagg case: for a set of new requests, we draft
@@ -195,6 +212,46 @@ def prepare_draft_tokens(
             )
             # Pad length to `self.max_draft_len`
             if len(draft_tokens) > 0:
-                pad_length = self.max_draft_len - len(draft_tokens)
+                draft_length = len(draft_tokens)
+                pad_length = self.max_draft_len - draft_length
                 draft_tokens.extend([request.py_end_id] * pad_length)
+            else:
+                draft_length = 0
             request.py_draft_tokens = draft_tokens
+            request.py_draft_length = draft_length
+
+    def _update_ngram_iter_stats(
+        self,
+        scheduled_requests: ScheduledRequests,
+        iter_stats: IterationStats,
+        start_time: float,
+    ) -> None:
+        """
+        Collect iteration statistics for the draft tokens produced by the NGram drafter.
+        """
+        now_time = time.time()
+
+        total_num_draft_tokens = 0
+        total_num_accepted_tokens = 0
+        num_requests_with_draft_tokens = 0
+        for request in scheduled_requests.generation_requests:
+            if request.py_last_draft_tokens is not None:
+                total_num_draft_tokens += request.py_draft_length
+                total_num_accepted_tokens += request.py_num_accepted_draft_tokens
+                num_requests_with_draft_tokens += 1
+
+        if num_requests_with_draft_tokens > 0:
+            iter_stats.specdec_stats.iter_latency_ms = (now_time -
+                                                        start_time) * 1e3
+            iter_stats.specdec_stats.num_draft_tokens = total_num_draft_tokens
+            iter_stats.specdec_stats.num_accepted_tokens = total_num_accepted_tokens
+            iter_stats.specdec_stats.num_requests_with_draft_tokens = num_requests_with_draft_tokens
+            iter_stats.specdec_stats.acceptance_length = (
+                total_num_accepted_tokens +
+                num_requests_with_draft_tokens) / num_requests_with_draft_tokens
+        else:
+            iter_stats.specdec_stats.iter_latency_ms = 0.0
+            iter_stats.specdec_stats.num_draft_tokens = 0
+            iter_stats.specdec_stats.num_accepted_tokens = 0
+            iter_stats.specdec_stats.num_requests_with_draft_tokens = 0
+            iter_stats.specdec_stats.acceptance_length = 1.0
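For reference, the statistics update added in `_update_ngram_iter_stats` reduces to a small aggregation over per-request counters. Below is a minimal, standalone sketch of that logic using plain dataclasses so it runs without the TensorRT-LLM bindings; `_FakeSpecDecStats`, `_FakeRequest`, and `aggregate_specdec_stats` are illustrative stand-ins, not names from the patch.

```python
import time
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class _FakeSpecDecStats:
    """Stand-in for iter_stats.specdec_stats; not the real bindings object."""
    iter_latency_ms: float = 0.0
    num_draft_tokens: int = 0
    num_accepted_tokens: int = 0
    num_requests_with_draft_tokens: int = 0
    acceptance_length: float = 1.0


@dataclass
class _FakeRequest:
    """Only the request fields the aggregation reads."""
    py_last_draft_tokens: Optional[List[int]] = None
    py_draft_length: int = 0
    py_num_accepted_draft_tokens: int = 0


def aggregate_specdec_stats(requests, stats, start_time):
    """Mirror the aggregation performed by _update_ngram_iter_stats."""
    total_draft = total_accepted = with_draft = 0
    for req in requests:
        if req.py_last_draft_tokens is not None:
            total_draft += req.py_draft_length
            total_accepted += req.py_num_accepted_draft_tokens
            with_draft += 1

    if with_draft > 0:
        stats.iter_latency_ms = (time.time() - start_time) * 1e3
        stats.num_draft_tokens = total_draft
        stats.num_accepted_tokens = total_accepted
        stats.num_requests_with_draft_tokens = with_draft
        # acceptance_length includes the guaranteed bonus token:
        # 1 + accepted / requests_with_drafts.
        stats.acceptance_length = (total_accepted + with_draft) / with_draft
    return stats


if __name__ == "__main__":
    requests = [
        _FakeRequest(py_last_draft_tokens=[1, 2, 3], py_draft_length=3,
                     py_num_accepted_draft_tokens=2),
        _FakeRequest(py_last_draft_tokens=[4, 5, 6], py_draft_length=3,
                     py_num_accepted_draft_tokens=0),
        _FakeRequest(),  # request without drafts is skipped
    ]
    stats = aggregate_specdec_stats(requests, _FakeSpecDecStats(), time.time())
    print(stats.num_draft_tokens, stats.num_accepted_tokens,
          stats.acceptance_length)  # -> 6 2 2.0
```

With two requests that each drafted 3 tokens and accepted 2 and 0 of them, the sketch reports 6 draft tokens, 2 accepted tokens, and an acceptance length of (2 + 2) / 2 = 2.0, i.e. one guaranteed token per step plus one accepted draft token on average.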