2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -13,7 +13,7 @@
"HF_HOME": "/huggingface",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history"
},
"workspaceFolder": "/workspaces/tensorrt_llm",
"workspaceFolder": "/workspaces/tensorrt_llm", # zuker-remove me
"initializeCommand": "cd ${localWorkspaceFolder} && ./.devcontainer/make_env.py",
// Note: sourcing .profile is required since we use a local user and the python interpreter is
// global (/usr/bin/python). In this case, pip will default to a local user path which is not
2 changes: 1 addition & 1 deletion .devcontainer/docker-compose.yml
@@ -23,7 +23,7 @@ services:

volumes:
- ${SOURCE_DIR}:/workspaces/tensorrt_llm
- ${LOCAL_HF_HOME}:/huggingface # HF cache
#- ${LOCAL_HF_HOME}:/huggingface # HF cache

environment:
- CCACHE_DIR=/workspaces/tensorrt_llm/cpp/.ccache
7 changes: 6 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -18,6 +18,7 @@ def __init__(
spec_metadata: Optional[SpecMetadata] = None,
use_mrope: bool = False,
max_beam_width: int = 1,
lora_params: Optional[dict] = None,
) -> None:
"""
Stores a CUDA graph and its associated input buffers.
@@ -54,6 +55,7 @@ def __init__(

self.attn_metadata = attn_metadata
self.spec_metadata = spec_metadata
self.lora_params = lora_params
self._output = None
self._graph = None
self.optional_extra_model_inputs = ["mrope_position_deltas"]
@@ -74,8 +76,8 @@ def capture(
"inputs_embeds": None,
"spec_metadata": self.spec_metadata,
"mrope_position_deltas": self.mrope_position_deltas,
"lora_params": self.lora_params,
}

# We have to do warm up runs to initialize PyTorch's
# internal states according to the docs:
# https://pytorch.org/docs/stable/notes/cuda.html#cuda-graph-semantics
@@ -108,6 +110,9 @@ def run(self, inputs: Dict[str, Any]) -> torch.Tensor:
"spec_metadata does not match the spec_metadata instance that was used to "
"capture this graph.")

if "lora_params" in inputs:
self.lora_params = inputs["lora_params"]

input_ids = inputs["input_ids"]
position_ids = inputs["position_ids"]
seqlen = input_ids.shape[0]
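The runner change above follows a store-and-refresh pattern for lora_params: the dict is stored next to the captured graph, baked into the input buffers at capture time, and replaced whenever run() receives a fresh value. A minimal, framework-free sketch of that pattern (GraphRunnerSketch is an illustrative stand-in, not the real DecodingCUDAGraphRunner):

from typing import Any, Dict, Optional


class GraphRunnerSketch:
    """Illustrative stand-in: keeps lora_params alongside a captured graph's buffers."""

    def __init__(self, lora_params: Optional[dict] = None) -> None:
        self.lora_params = lora_params  # stored at construction, consumed at capture

    def capture_inputs(self) -> Dict[str, Any]:
        # At capture time the forward pass sees whatever lora_params is currently stored.
        return {"inputs_embeds": None, "lora_params": self.lora_params}

    def run(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        # Mirrors the diff: refresh the stored reference if the caller provided one.
        if "lora_params" in inputs:
            self.lora_params = inputs["lora_params"]
        return self.capture_inputs()


runner = GraphRunnerSketch(lora_params={"task_uid": 0})
print(runner.run({"lora_params": {"task_uid": 1}}))
# -> {'inputs_embeds': None, 'lora_params': {'task_uid': 1}}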
139 changes: 118 additions & 21 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -27,7 +27,7 @@
from tensorrt_llm.inputs.multimodal import (MultimodalParams,
MultimodalRuntimeData)
from tensorrt_llm.logger import logger
from tensorrt_llm.lora_helper import LoraConfig
from tensorrt_llm.lora_helper import LoraConfig, LoraManager
from tensorrt_llm.lora_manager import LoraModelConfig
from tensorrt_llm.mapping import CpType, Mapping
from tensorrt_llm.models.modeling_utils import QuantAlgo
@@ -297,6 +297,16 @@ def __init__(
)

attn_backend = pytorch_backend_config.attn_backend

self.lora_manager: Optional[LoraManager] = None
self.lora_prefetch_requests_list = None
# TODO smor- do we want to get the request inside the lora config?
# TODO smor- what happens if you get target modules?
# TODO smor- answer and guard against this
if lora_config is not None and lora_config.lora_request is not None:
self.lora_prefetch_requests_list = lora_config.lora_request
self.has_lora_prefetched = False

self.model = self._load_model(
model_path,
mapping=self.mapping,
@@ -475,6 +485,25 @@ def set_lora_model_config(self, lora_target_modules: list[str],
hidden_size=self.model.config.hidden_size,
dtype=torch_dtype_to_str(self.model.config.torch_dtype))

def set_lora_manager(self, resource_manager: ResourceManager):
peft_cache_manager = resource_manager.get_resource_manager(
ResourceManagerType.PEFT_CACHE_MANAGER)
if peft_cache_manager is not None:
self.lora_manager = peft_cache_manager.get_lora_manager()

def prefetch_lora_dirs(self):
if self.lora_prefetch_requests_list is None:
return

for request in self.lora_prefetch_requests_list:
self.lora_manager.load_from_ckpt(
[request.path],
model_config=self.lora_model_config,
runtime_mapping=None,
uids=[request.adapter_id])

self.has_lora_prefetched = True

@property
def use_mrope(self):
use_mrope = False
@@ -560,6 +589,24 @@ def warmup(self, resource_manager: ResourceManager) -> None:
self.cuda_graph_dummy_request = None

def get_cuda_graph_warmup_request(batch_size, draft_len):
lora_configs = []
if self.has_lora_prefetched:
from tensorrt_llm.bindings import executor as tllm

available_uids = sorted(
list(self.lora_manager.cpp_lora_weights.keys()))

for request_id in range(batch_size):
uid = available_uids[request_id % len(available_uids)]

weights = self.lora_manager.cpp_lora_weights[uid]
config = self.lora_manager.cpp_lora_config[uid]

lora_config = tllm.LoraConfig(task_id=uid,
weights=weights,
config=config)
lora_configs.append(lora_config)

# Divide by max_beam_width to get an approximation of the number of requests that can be run in parallel.
available_blocks = kv_cache_manager.get_num_free_blocks(
) // self.max_beam_width
@@ -568,12 +615,21 @@ def get_cuda_graph_warmup_request(batch_size, draft_len):
result.context_requests = []
# Add (batch_size - 1) dummy requests with seq_len=1.
# Should only need one more page per request.

short_requests_lora = lora_configs[:batch_size -
1] if lora_configs else None
lora_request_for_max = [
lora_configs[batch_size - 1]
] if lora_configs and len(lora_configs) >= batch_size else None

requests = kv_cache_manager.add_dummy_requests(
list(range(batch_size - 1)),
is_gen=True,
max_num_draft_tokens=draft_len,
use_mrope=use_mrope,
max_beam_width=self.max_beam_width)
max_beam_width=self.max_beam_width,
lora_request=short_requests_lora,
)
# Divide by max_beam_width to get an approximation of the number of tokens that can be added to the final request.
available_tokens = kv_cache_manager.get_num_available_tokens(
draft_len)
@@ -587,7 +643,8 @@ def get_cuda_graph_warmup_request(batch_size, draft_len):
is_gen=True,
max_num_draft_tokens=draft_len,
use_mrope=use_mrope,
max_beam_width=self.max_beam_width)[0]
max_beam_width=self.max_beam_width,
lora_request=lora_request_for_max)[0]
# Add the longest request before all other seq_len=1 request to simulate the padding CUDA graph case.
# This batch contains both the longest request and the shortest requests,
# it also contains the maximum number of requests and the maximum token number,
@@ -792,7 +849,7 @@ def release_batch(result: ScheduledRequests | None):
self.forward(batch,
new_tensors_device=None,
resource_manager=resource_manager)
torch.cuda.synchronize()
torch.cuda.synchronize()  # SMOR: supposed to fail here, verify

if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled:
piecewise_cuda_graph_num_tokens = sorted(
@@ -975,6 +1032,7 @@ def _round_up_batch_size(self, batch_size: int) -> int:
def _maybe_get_cuda_graph(
self,
batch: ScheduledRequests,
resource_manager: Optional[ResourceManager] = None
) -> Optional[DecodingCUDAGraphRunner]:
"""
Get a CUDA graph runner or return None (e.g. if CUDA graphs are disabled
@@ -1021,13 +1079,46 @@ def _maybe_get_cuda_graph(
else:
spec_metadata = None

lora_params = None
if self.has_lora_prefetched:
peft_cache_manager = resource_manager.get_resource_manager(
ResourceManagerType.PEFT_CACHE_MANAGER)

context_requests = batch.context_requests
generation_requests = batch.generation_requests

if len(context_requests) > 0 and len(generation_requests) > 0:
raise ValueError(
"SMOR, non empty context and generation requests isn't tested yet"
)

if len(context_requests) > 0:
raise ValueError("SMOR, context requests isn't tested yet")

for generation_request in generation_requests:
peft_cache_manager.add_request_peft(generation_request)

py_lora_task_layer_module_configs = peft_cache_manager.impl.ensure_batch(
context_requests, generation_requests, False)
for req in context_requests:
req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
req.
py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None
for req in generation_requests:
req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
req.
py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None

lora_params = self._get_lora_params_from_requests(
batch, attn_metadata) # TODO smor consider just creating shape?

# Initialize nested dictionary if needed
if batch_size not in self._cuda_graphs:
self._cuda_graphs[batch_size] = {}

self._cuda_graphs[batch_size][draft_len] = DecodingCUDAGraphRunner(
batch_size, "cuda", attn_metadata, spec_metadata, self.use_mrope,
self.max_beam_width)
self.max_beam_width, lora_params)
return self._cuda_graphs[batch_size][draft_len]

def __del__(self) -> None:
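The per-request assignment in the hunk above amounts to a dictionary lookup with a None default: ensure_batch returns a mapping keyed by request id, and each request either picks up its entry or gets None. A small self-contained sketch of that mapping (RequestSketch is a placeholder, not the real LlmRequest type):

class RequestSketch:
    """Placeholder request: only the two fields the loop above touches."""

    def __init__(self, request_id: int) -> None:
        self.py_request_id = request_id
        self.py_lora_task_layer_module_configs = None


# ensure_batch returns a mapping keyed by request id; requests absent from it get None.
configs_by_request_id = {11: ["layer-module-config"], 12: ["other-config"]}
requests = [RequestSketch(11), RequestSketch(13)]
for req in requests:
    req.py_lora_task_layer_module_configs = configs_by_request_id.get(req.py_request_id)
print([r.py_lora_task_layer_module_configs for r in requests])
# -> [['layer-module-config'], None]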
@@ -1230,7 +1321,8 @@ def _prepare_tp_inputs(
attn_metadata: AttentionMetadata,
spec_metadata: Optional[SpecMetadata] = None,
new_tensors_device: Optional[SampleStateTensors] = None,
cache_indirection_buffer: Optional[torch.Tensor] = None):
cache_indirection_buffer: Optional[torch.Tensor] = None,
lora_params: Optional[dict] = None):
"""
Prepare inputs for Pytorch Model.
"""
@@ -1590,8 +1682,9 @@ def previous_seq_slots_device():

attn_metadata.prepare()

lora_params = self._get_lora_params_from_requests(
scheduled_requests, attn_metadata)
if lora_params is None:
lora_params = self._get_lora_params_from_requests(
scheduled_requests, attn_metadata)

# Prepare inputs
inputs = {
@@ -2124,14 +2217,14 @@ def _get_lora_params_from_requests(self,
return lora_params

@nvtx_range("_prepare_inputs")
def _prepare_inputs(
self,
scheduled_requests: ScheduledRequests,
kv_cache_manager: KVCacheManager,
attn_metadata: AttentionMetadata,
spec_metadata: Optional[SpecMetadata] = None,
new_tensors_device: Optional[SampleStateTensors] = None,
cache_indirection_buffer: Optional[torch.Tensor] = None):
def _prepare_inputs(self,
scheduled_requests: ScheduledRequests,
kv_cache_manager: KVCacheManager,
attn_metadata: AttentionMetadata,
spec_metadata: Optional[SpecMetadata] = None,
new_tensors_device: Optional[SampleStateTensors] = None,
cache_indirection_buffer: Optional[torch.Tensor] = None,
lora_params: Optional[dict] = None):
if self.mapping is not None and 'cp_type' in self.mapping.cp_config:
cp_type = self.mapping.cp_config['cp_type']
if CpType.STAR == cp_type:
@@ -2143,7 +2236,8 @@ def _prepare_inputs(
return self._prepare_tp_inputs(scheduled_requests, kv_cache_manager,
attn_metadata, spec_metadata,
new_tensors_device,
cache_indirection_buffer)
cache_indirection_buffer,
lora_params)

@torch.inference_mode()
@with_model_extra_attrs(lambda self: self.model.extra_attrs)
@@ -2187,20 +2281,25 @@ def forward(
gather_context_logits)
with self._maybe_pad_batch(scheduled_requests, kv_cache_manager,
spec_resource_manager) as scheduled_requests:
maybe_graph = self._maybe_get_cuda_graph(scheduled_requests)
maybe_graph = self._maybe_get_cuda_graph(
scheduled_requests, resource_manager=resource_manager)
if maybe_graph is not None:
attn_metadata = maybe_graph.attn_metadata
spec_metadata = maybe_graph.spec_metadata
# Don't use pre-captured lora_params - always get fresh ones from current requests
lora_params = None
else:
attn_metadata = self.attn_metadata
lora_params = None
if self.enable_spec_decode:
spec_metadata = self.spec_metadata
else:
spec_metadata = None

inputs, gather_ids = self._prepare_inputs(
scheduled_requests, kv_cache_manager, attn_metadata,
spec_metadata, new_tensors_device, cache_indirection_buffer)
spec_metadata, new_tensors_device, cache_indirection_buffer,
lora_params)

self.iter_counter += 1

@@ -2223,7 +2322,6 @@ def capture_forward_fn(inputs: Dict[str, Any]):
self._cuda_graph_mem_pool,
)
self._cuda_graph_mem_pool = pool

# here we don't need to use context since cuda graph capture didn't run kernel.
# maybe we need a cleaner way to do this.
outputs = maybe_graph.run(inputs)
@@ -2232,7 +2330,6 @@ def capture_forward_fn(inputs: Dict[str, Any]):
outputs = maybe_graph.run(inputs)

self._execute_logit_post_processors(scheduled_requests, outputs)

return outputs

def model_forward(self, **kwargs):
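The prefetch path added to model_engine.py only reads path and adapter_id from each entry of lora_config.lora_request, so any object carrying those two attributes would satisfy it. The dataclasses below are hypothetical stand-ins for that shape, not the public LoraConfig API:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class PrefetchRequestSketch:
    """Hypothetical stand-in for one LoRA prefetch entry."""
    adapter_id: int
    path: str


@dataclass
class LoraConfigSketch:
    """Hypothetical stand-in for the lora_request field the engine reads."""
    lora_request: Optional[List[PrefetchRequestSketch]] = None


cfg = LoraConfigSketch(lora_request=[
    PrefetchRequestSketch(adapter_id=0, path="/adapters/math-lora"),
    PrefetchRequestSketch(adapter_id=1, path="/adapters/code-lora"),
])
# prefetch_lora_dirs() iterates these entries and calls
# lora_manager.load_from_ckpt([entry.path], ..., uids=[entry.adapter_id]) once per entry.
for entry in cfg.lora_request:
    print(entry.adapter_id, entry.path)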
5 changes: 5 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -217,6 +217,8 @@ def __init__(self,
self.micro_batches: List[BatchStatePP
| None] = [None] * self.num_micro_batches
self.send_handles = [None] * self.num_micro_batches
self.model_engine.set_lora_manager(self.resource_manager)
self.model_engine.prefetch_lora_dirs()

self.inflight_req_ids = ReqIdsSet()

@@ -296,6 +298,9 @@ def is_warmup(self, value: bool):
if self.draft_model_engine is not None:
self.draft_model_engine.is_warmup = value

def get_lora_manager(self):
return self.model_engine.lora_manager

def start_worker(self):
with self.worker_lock:
if self.worker_started == False:
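The two calls added to PyExecutor.__init__ are order-sensitive, since prefetch_lora_dirs() dereferences the manager installed by set_lora_manager(). A compact, self-contained illustration with placeholder classes (not the real ModelEngine or PeftCacheManager):

from typing import List, Optional


class LoraManagerSketch:
    """Placeholder manager: stands in for what the PEFT cache manager returns."""

    def load(self, path: str, uid: int) -> None:
        print(f"prefetched adapter {uid} from {path}")


class ModelEngineSketch:
    """Placeholder engine showing the dependency between the two new calls."""

    def __init__(self, prefetch_paths: Optional[List[str]] = None) -> None:
        self.lora_manager: Optional[LoraManagerSketch] = None
        self.lora_prefetch_requests_list = prefetch_paths
        self.has_lora_prefetched = False

    def set_lora_manager(self, manager: LoraManagerSketch) -> None:
        self.lora_manager = manager  # must run before prefetch_lora_dirs()

    def prefetch_lora_dirs(self) -> None:
        if self.lora_prefetch_requests_list is None:
            return
        for uid, path in enumerate(self.lora_prefetch_requests_list):
            # Would raise AttributeError if set_lora_manager() had not run first.
            self.lora_manager.load(path, uid)
        self.has_lora_prefetched = True


engine = ModelEngineSketch(prefetch_paths=["/adapters/math-lora"])
engine.set_lora_manager(LoraManagerSketch())
engine.prefetch_lora_dirs()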