diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
index df674a94968..307246f5182 100644
--- a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
+++ b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -18,6 +18,7 @@ def __init__(
         spec_metadata: Optional[SpecMetadata] = None,
         use_mrope: bool = False,
         max_beam_width: int = 1,
+        lora_params: Optional[dict] = None,
     ) -> None:
         """
         Stores a CUDA graph and its associated input buffers.
@@ -54,6 +55,7 @@ def __init__(
 
         self.attn_metadata = attn_metadata
         self.spec_metadata = spec_metadata
+        self.lora_params = lora_params
         self._output = None
         self._graph = None
         self.optional_extra_model_inputs = ["mrope_position_deltas"]
@@ -74,8 +76,8 @@ def capture(
             "inputs_embeds": None,
             "spec_metadata": self.spec_metadata,
             "mrope_position_deltas": self.mrope_position_deltas,
+            "lora_params": self.lora_params,
         }
-
         # We have to do warm up runs to initialize PyTorch's
         # internal states according to the docs:
         # https://pytorch.org/docs/stable/notes/cuda.html#cuda-graph-semantics
@@ -108,6 +110,9 @@ def run(self, inputs: Dict[str, Any]) -> torch.Tensor:
                 "spec_metadata does not match the spec_metadata instance that was used to "
                 "capture this graph.")
 
+        if "lora_params" in inputs:
+            self.lora_params = inputs["lora_params"]
+
         input_ids = inputs["input_ids"]
         position_ids = inputs["position_ids"]
         seqlen = input_ids.shape[0]
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index a34f03edb55..4302f5c7dd6 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -27,7 +27,7 @@
 from tensorrt_llm.inputs.multimodal import (MultimodalParams,
                                             MultimodalRuntimeData)
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_helper import LoraConfig
-from tensorrt_llm.lora_manager import LoraModelConfig
+from tensorrt_llm.lora_manager import LoraManager, LoraModelConfig
 from tensorrt_llm.mapping import CpType, Mapping
 from tensorrt_llm.models.modeling_utils import QuantAlgo
@@ -297,6 +297,16 @@ def __init__(
         )
 
         attn_backend = pytorch_backend_config.attn_backend
+
+        self.lora_manager: Optional[LoraManager] = None
+        self.lora_prefetch_requests_list = None
+        # TODO smor- do we want to get the request inside the lora config?
+        # TODO smor- what happens if you get target modules?
+ # TODO smor- answer and guard against this + if lora_config is not None and lora_config.lora_request is not None: + self.lora_prefetch_requests_list = lora_config.lora_request + self.has_lora_prefetched = False + self.model = self._load_model( model_path, mapping=self.mapping, @@ -475,6 +485,25 @@ def set_lora_model_config(self, lora_target_modules: list[str], hidden_size=self.model.config.hidden_size, dtype=torch_dtype_to_str(self.model.config.torch_dtype)) + def set_lora_manager(self, resource_manager: ResourceManager): + peft_cache_manager = resource_manager.get_resource_manager( + ResourceManagerType.PEFT_CACHE_MANAGER) + if peft_cache_manager is not None: + self.lora_manager = peft_cache_manager.get_lora_manager() + + def prefetch_lora_dirs(self): + if self.lora_prefetch_requests_list is None: + return + + for request in self.lora_prefetch_requests_list: + self.lora_manager.load_from_ckpt( + [request.path], + model_config=self.lora_model_config, + runtime_mapping=None, + uids=[request.adapter_id]) + + self.has_lora_prefetched = True + @property def use_mrope(self): use_mrope = False @@ -560,6 +589,24 @@ def warmup(self, resource_manager: ResourceManager) -> None: self.cuda_graph_dummy_request = None def get_cuda_graph_warmup_request(batch_size, draft_len): + lora_configs = [] + if self.has_lora_prefetched: + from tensorrt_llm.bindings import executor as tllm + + available_uids = sorted( + list(self.lora_manager.cpp_lora_weights.keys())) + + for request_id in range(batch_size): + uid = available_uids[request_id % len(available_uids)] + + weights = self.lora_manager.cpp_lora_weights[uid] + config = self.lora_manager.cpp_lora_config[uid] + + lora_config = tllm.LoraConfig(task_id=uid, + weights=weights, + config=config) + lora_configs.append(lora_config) + # Divide by max_beam_width to get an approximation of the number of requests that can be run in parallel. available_blocks = kv_cache_manager.get_num_free_blocks( ) // self.max_beam_width @@ -568,12 +615,21 @@ def get_cuda_graph_warmup_request(batch_size, draft_len): result.context_requests = [] # Add (batch_size - 1) dummy requests with seq_len=1. # Should only need one more page per request. + + short_requests_lora = lora_configs[:batch_size - + 1] if lora_configs else None + lora_request_for_max = [ + lora_configs[batch_size - 1] + ] if lora_configs and len(lora_configs) >= batch_size else None + requests = kv_cache_manager.add_dummy_requests( list(range(batch_size - 1)), is_gen=True, max_num_draft_tokens=draft_len, use_mrope=use_mrope, - max_beam_width=self.max_beam_width) + max_beam_width=self.max_beam_width, + lora_request=short_requests_lora, + ) # Divide by max_beam_width to get an approximation of the number of tokens that can be added to the final request. available_tokens = kv_cache_manager.get_num_available_tokens( draft_len) @@ -587,7 +643,8 @@ def get_cuda_graph_warmup_request(batch_size, draft_len): is_gen=True, max_num_draft_tokens=draft_len, use_mrope=use_mrope, - max_beam_width=self.max_beam_width)[0] + max_beam_width=self.max_beam_width, + lora_request=lora_request_for_max)[0] # Add the longest request before all other seq_len=1 request to simulate the padding CUDA graph case. 
             # This batch contains both the longest request and the shortest requests,
             # it also contains the maximum number of requests and the maximum token number,
@@ -975,6 +1032,7 @@ def _round_up_batch_size(self, batch_size: int) -> int:
     def _maybe_get_cuda_graph(
         self,
         batch: ScheduledRequests,
+        resource_manager: Optional[ResourceManager] = None
     ) -> Optional[DecodingCUDAGraphRunner]:
         """
         Get a CUDA graph runner or return None (e.g. if CUDA graphs are disabled
@@ -1021,13 +1079,47 @@ def _maybe_get_cuda_graph(
         else:
             spec_metadata = None
 
+        lora_params = None
+        if self.has_lora_prefetched:
+            peft_cache_manager = resource_manager.get_resource_manager(
+                ResourceManagerType.PEFT_CACHE_MANAGER)
+
+            context_requests = batch.context_requests
+            generation_requests = batch.generation_requests
+
+            if len(context_requests) > 0 and len(generation_requests) > 0:
+                raise NotImplementedError(
+                    "Batches with both context and generation requests are "
+                    "not supported with LoRA CUDA graphs yet")
+
+            if len(context_requests) > 0:
+                raise NotImplementedError(
+                    "Context requests are not supported with LoRA CUDA graphs yet")
+
+            for generation_request in generation_requests:
+                peft_cache_manager.add_request_peft(generation_request)
+
+            py_lora_task_layer_module_configs = peft_cache_manager.impl.ensure_batch(
+                context_requests, generation_requests, False)
+            for req in context_requests:
+                req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
+                    req.
+                    py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None
+            for req in generation_requests:
+                req.py_lora_task_layer_module_configs = py_lora_task_layer_module_configs[
+                    req.
+                    py_request_id] if req.py_request_id in py_lora_task_layer_module_configs else None
+
+            lora_params = self._get_lora_params_from_requests(
+                batch, attn_metadata)  # TODO smor consider just creating shape?
+
         # Initialize nested dictionary if needed
         if batch_size not in self._cuda_graphs:
             self._cuda_graphs[batch_size] = {}
 
         self._cuda_graphs[batch_size][draft_len] = DecodingCUDAGraphRunner(
             batch_size, "cuda", attn_metadata, spec_metadata, self.use_mrope,
-            self.max_beam_width)
+            self.max_beam_width, lora_params)
         return self._cuda_graphs[batch_size][draft_len]
 
     def __del__(self) -> None:
@@ -1230,7 +1321,8 @@ def _prepare_tp_inputs(
            attn_metadata: AttentionMetadata,
            spec_metadata: Optional[SpecMetadata] = None,
            new_tensors_device: Optional[SampleStateTensors] = None,
-           cache_indirection_buffer: Optional[torch.Tensor] = None):
+           cache_indirection_buffer: Optional[torch.Tensor] = None,
+           lora_params: Optional[dict] = None):
         """
         Prepare inputs for Pytorch Model.
""" @@ -1590,8 +1682,9 @@ def previous_seq_slots_device(): attn_metadata.prepare() - lora_params = self._get_lora_params_from_requests( - scheduled_requests, attn_metadata) + if lora_params is None: + lora_params = self._get_lora_params_from_requests( + scheduled_requests, attn_metadata) # Prepare inputs inputs = { @@ -2124,14 +2217,14 @@ def _get_lora_params_from_requests(self, return lora_params @nvtx_range("_prepare_inputs") - def _prepare_inputs( - self, - scheduled_requests: ScheduledRequests, - kv_cache_manager: KVCacheManager, - attn_metadata: AttentionMetadata, - spec_metadata: Optional[SpecMetadata] = None, - new_tensors_device: Optional[SampleStateTensors] = None, - cache_indirection_buffer: Optional[torch.Tensor] = None): + def _prepare_inputs(self, + scheduled_requests: ScheduledRequests, + kv_cache_manager: KVCacheManager, + attn_metadata: AttentionMetadata, + spec_metadata: Optional[SpecMetadata] = None, + new_tensors_device: Optional[SampleStateTensors] = None, + cache_indirection_buffer: Optional[torch.Tensor] = None, + lora_params: Optional[dict] = None): if self.mapping is not None and 'cp_type' in self.mapping.cp_config: cp_type = self.mapping.cp_config['cp_type'] if CpType.STAR == cp_type: @@ -2143,7 +2236,8 @@ def _prepare_inputs( return self._prepare_tp_inputs(scheduled_requests, kv_cache_manager, attn_metadata, spec_metadata, new_tensors_device, - cache_indirection_buffer) + cache_indirection_buffer, + lora_params) @torch.inference_mode() @with_model_extra_attrs(lambda self: self.model.extra_attrs) @@ -2187,12 +2281,16 @@ def forward( gather_context_logits) with self._maybe_pad_batch(scheduled_requests, kv_cache_manager, spec_resource_manager) as scheduled_requests: - maybe_graph = self._maybe_get_cuda_graph(scheduled_requests) + maybe_graph = self._maybe_get_cuda_graph( + scheduled_requests, resource_manager=resource_manager) if maybe_graph is not None: attn_metadata = maybe_graph.attn_metadata spec_metadata = maybe_graph.spec_metadata + # Don't use pre-captured lora_params - always get fresh ones from current requests + lora_params = None else: attn_metadata = self.attn_metadata + lora_params = None if self.enable_spec_decode: spec_metadata = self.spec_metadata else: @@ -2200,7 +2298,8 @@ def forward( inputs, gather_ids = self._prepare_inputs( scheduled_requests, kv_cache_manager, attn_metadata, - spec_metadata, new_tensors_device, cache_indirection_buffer) + spec_metadata, new_tensors_device, cache_indirection_buffer, + lora_params) self.iter_counter += 1 @@ -2223,7 +2322,6 @@ def capture_forward_fn(inputs: Dict[str, Any]): self._cuda_graph_mem_pool, ) self._cuda_graph_mem_pool = pool - # here we don't need to use context since cuda graph capture didn't run kernel. # maybe we need a cleaner way to do this. 
outputs = maybe_graph.run(inputs) @@ -2232,7 +2330,6 @@ def capture_forward_fn(inputs: Dict[str, Any]): outputs = maybe_graph.run(inputs) self._execute_logit_post_processors(scheduled_requests, outputs) - return outputs def model_forward(self, **kwargs): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 4cebfae58b1..38cca966d58 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -217,6 +217,8 @@ def __init__(self, self.micro_batches: List[BatchStatePP | None] = [None] * self.num_micro_batches self.send_handles = [None] * self.num_micro_batches + self.model_engine.set_lora_manager(self.resource_manager) + self.model_engine.prefetch_lora_dirs() self.inflight_req_ids = ReqIdsSet() @@ -296,6 +298,9 @@ def is_warmup(self, value: bool): if self.draft_model_engine is not None: self.draft_model_engine.is_warmup = value + def get_lora_manager(self): + return self.model_engine.lora_manager + def start_worker(self): with self.worker_lock: if self.worker_started == False: diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 9a5b42166dc..cf04cef941c 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -428,20 +428,21 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests): self.impl.add_token(req.py_request_id) def add_dummy_requests( - self, - request_ids: List[int], - # Note that token_nums should be past_kv_len + input_len (without - # spec decoding). The draft tokens will be added in this function, - # so we don't need to take care of it in the caller. When preparing - # token_nums, we should not take the draft tokens into account, so - # don't use the kv_cache_manager.max_seq_len, which includes both - # extra tokens and draft tokens. - token_nums: Optional[List[int]] = None, - is_gen: bool = False, - prepare_resource: bool = True, - max_num_draft_tokens: int = 0, - use_mrope: bool = False, - max_beam_width: int = 1, + self, + request_ids: List[int], + # Note that token_nums should be past_kv_len + input_len (without + # spec decoding). The draft tokens will be added in this function, + # so we don't need to take care of it in the caller. When preparing + # token_nums, we should not take the draft tokens into account, so + # don't use the kv_cache_manager.max_seq_len, which includes both + # extra tokens and draft tokens. + token_nums: Optional[List[int]] = None, + is_gen: bool = False, + prepare_resource: bool = True, + max_num_draft_tokens: int = 0, + use_mrope: bool = False, + max_beam_width: int = 1, + lora_request: Optional[List] = None, # TODO smor fill type hint ): beam_width = max_beam_width requests = [] @@ -461,6 +462,16 @@ def add_dummy_requests( # Using 1 instead of 0 prevents NaN during warmup in e.g. 
Deepseek mrope_position_deltas = torch.zeros( 1, device="cuda", dtype=torch.int32) if use_mrope else None + + lora_task_id = None + lora_weights = None + lora_config = None + + if lora_request is not None and i < len(lora_request): + lora_task_id = lora_request[i].task_id + lora_weights = lora_request[i].weights + lora_config = lora_request[i].config + req = LlmRequest(request_id=req_id, max_new_tokens=1, input_tokens=[1] * token_num, @@ -468,7 +479,10 @@ def add_dummy_requests( sampling_params._get_sampling_config()), is_streaming=False, mrope_position_deltas=mrope_position_deltas, - encoder_input_tokens=encoder_input_tokens) + encoder_input_tokens=encoder_input_tokens, + lora_task_id=lora_task_id, + lora_weights=lora_weights, + lora_config=lora_config) req.is_dummy_request = True req.paged_kv_block_ids = [] if prepare_resource: @@ -1009,7 +1023,6 @@ def __init__(self, model_config: ModelConfig, world_config: WorldConfig | None = None): import tensorrt_llm.bindings as _tb - peft_cache_manager_config = _tb.PeftCacheManagerConfig( num_host_module_layer=peft_cache_config.num_host_module_layer, num_device_module_layer=peft_cache_config.num_device_module_layer, @@ -1075,6 +1088,10 @@ def ensure_batch(self, return self.impl.ensure_batch(context_batch, generation_batch, reset_gpu_cache) + def get_lora_manager(self): + assert self._lora_manager is not None, "Lora manager not initialized" + return self._lora_manager + def get_max_resource_count(self) -> int: return 0 diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 6d5ec9c1d78..959a4b33deb 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -163,12 +163,7 @@ def _create_engine(): if getattr(executor_config, "backend", "") == "pytorch" and lora_config is not None: - from tensorrt_llm._torch.pyexecutor.resource_manager import \ - ResourceManagerType - peft_cache_manager = self.engine.resource_manager.resource_managers.get( - ResourceManagerType.PEFT_CACHE_MANAGER) - self._lora_manager = LoraManager( - cpp_peft_cache_manager=peft_cache_manager.impl) + self._lora_manager = self.engine.get_lora_manager() lora_model_config = self.engine.model_engine.lora_model_config assert lora_model_config is not None self._lora_model_config = lora_model_config diff --git a/tensorrt_llm/lora_helper.py b/tensorrt_llm/lora_helper.py index 37f5d534f7d..0cc8aec6722 100644 --- a/tensorrt_llm/lora_helper.py +++ b/tensorrt_llm/lora_helper.py @@ -14,7 +14,7 @@ # limitations under the License. 
 from dataclasses import dataclass, field
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 from ._utils import DictConversion
 
@@ -88,6 +88,7 @@ class LoraConfig(DictConversion):
     trtllm_modules_to_hf_modules: Dict[str, str] = field(default_factory=dict)
     max_loras: Optional[int] = None
     max_cpu_loras: Optional[int] = None
+    lora_request: Optional[List[Any]] = None  # TODO smor: replace Any with a concrete LoRA request type
 
     def __post_init__(self):
         assert self.lora_ckpt_source in [
diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py
index 7440715474c..760743f27fc 100644
--- a/tensorrt_llm/lora_manager.py
+++ b/tensorrt_llm/lora_manager.py
@@ -8,7 +8,7 @@
 from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
 
 import numpy as np
 import torch
@@ -702,6 +702,12 @@ def __init__(
         self._cpp_lora_weights: Dict[str, torch.Tensor] = {}  # on cpu
         self._cpp_lora_config: Dict[str, torch.Tensor] = {}  # on cpu
         self.lora_target_modules: List[str] = []
+        self._cpp_peft_cache_manager: Optional[tb_internal.batch_manager.PeftCacheManager] = None
+
+    def set_cpp_peft_cache_manager(
+        self, cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheManager
+    ):
+        self._cpp_peft_cache_manager = cpp_peft_cache_manager
 
     def is_adapter_in_cpu_cache(self, adapter_uid: int) -> bool:
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index 6b78c46bd73..b70ddb77acc 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -5,7 +5,7 @@
 from tensorrt_llm import LLM
 from tensorrt_llm.llmapi import KvCacheConfig
-from tensorrt_llm.llmapi.llm_args import PeftCacheConfig
+from tensorrt_llm.llmapi.llm_args import CudaGraphConfig, PeftCacheConfig
 from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
 from tensorrt_llm.metrics import MetricNames
 from tensorrt_llm.sampling_params import SamplingParams
@@ -818,3 +818,92 @@ def test_max_num_token_check(self):
                    match="should not exceed max_num_tokens"):
             ids = [random.randint(10, 100) for _ in range(101)]
             llm.generate([ids])
+
+
+@pytest.mark.parametrize("cuda_graph_config",
+                         [None, CudaGraphConfig(max_batch_size=1)])
+def test_lora_dir_with_graph(cuda_graph_config):
+    lora_req = LoRARequest(
+        "task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")
+
+    lora_config = LoraConfig(
+        lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
+        max_lora_rank=8,
+        lora_request=[lora_req])
+
+    llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
+              lora_config=lora_config,
+              cuda_graph_config=cuda_graph_config)
+
+    prompts = [
+        "美国的首都在哪里? \n答案:",
+    ]
+    references = [
+        "美国的首都是华盛顿。\n\n美国的",
+    ]
+    sampling_params = SamplingParams(max_tokens=20)
+    lora_request = [lora_req]
+
+    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
+
+    assert similar(outputs[0].outputs[0].text, references[0])
+
+
+def test_lora_graph_single_request():
+    lora_req = LoRARequest(
+        "task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")
+
+    lora_config = LoraConfig(
+        lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
+        max_lora_rank=8,
+        lora_request=[lora_req])
+
+    llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
+              lora_config=lora_config,
+              cuda_graph_config=CudaGraphConfig(max_batch_size=1))
+
+    prompts = [
+        "美国的首都在哪里? \n答案:",
+    ]
+    references = [
+        "美国的首都是华盛顿。\n\n美国的",
+    ]
+    sampling_params = SamplingParams(max_tokens=20)
+    lora_request = [lora_req]
+
+    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
+
+    assert similar(outputs[0].outputs[0].text, references[0])
+
+
+def test_lora_graph_multiple_requests():
+    lora_req = LoRARequest(
+        "task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")
+    lora_req2 = LoRARequest(
+        "task-1", 1, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")
+
+    lora_requests = [lora_req, lora_req2]
+    lora_config = LoraConfig(
+        lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
+        max_lora_rank=8,
+        lora_request=lora_requests)
+
+    llm = LLM(
+        model=f"{llm_models_root()}/llama-models/llama-7b-hf",
+        lora_config=lora_config,
+        cuda_graph_config=CudaGraphConfig(max_batch_size=2))
+
+    prompts = [
+        "美国的首都在哪里? \n答案:",
+        "美国的首都在哪里? \n答案:",
+    ]
+    references = [
+        "美国的首都是华盛顿。\n\n美国的",
+        "美国的首都是华盛顿。\n\n美国的",
+    ]
+    sampling_params = SamplingParams(max_tokens=20)
+
+    outputs = llm.generate(prompts, sampling_params, lora_request=lora_requests)
+
+    assert similar(outputs[0].outputs[0].text, references[0])
+    assert similar(outputs[1].outputs[0].text, references[1])