1 change: 0 additions & 1 deletion tests/integration/defs/.test_durations
@@ -640,7 +640,6 @@
"examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp8]": 314.3205590210273,
"examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo]": 329.1954380639363,
"examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb]": 216.2645359209855,
"examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]": 3641.9526145930286,
"examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]": 1654.751242957951,
"examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8]": 20655.04908744397,
"examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8]": 13962.460933016031,
183 changes: 1 addition & 182 deletions tests/integration/defs/examples/test_llama.py
@@ -18,13 +18,10 @@
import os
import re
import shutil
import subprocess
from copy import deepcopy

import defs.ci_profiler
import pytest
from defs.common import (convert_weights, generate_summary_cmd,
get_cpp_benchmark, get_trt_llm_lib_dir, parse_output,
from defs.common import (convert_weights, generate_summary_cmd, parse_output,
quantize_data, similar,
test_llm_torch_multi_lora_support,
test_multi_lora_support, venv_check_call,
@@ -2683,184 +2680,6 @@ def test_llm_llama_v1_multiple_lora_1gpu(data_type, lora_data_type,
venv_check_call(llm_venv, run_cmd)


@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("llama_model_root", ['llama-v2-13b-hf'], indirect=True)
@pytest.mark.parametrize("llm_lora_model_root", ["chinese-llama-2-lora-13b"],
ids=["chinese_lora"],
indirect=True)
def test_llm_llama_v2_lora_benchmark_2gpu(llama_example_root, llama_model_root,
llm_venv, llm_root, cmodel_dir,
engine_dir, llm_lora_model_root):
"benchmark llama with multi lora on 2gpu"
print("Build engines...")

num_layers = 40
num_lora_mods = 7
max_lora_rank = 64
max_len = 1024
max_batch = 32
eos_id = 2
num_loras = (8, 16)
num_requests = 1024

model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama-lora",
model_path=llama_model_root,
gpus=2,
tp_size=2,
data_type="float16")

print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={max_batch}",
f"--max_input_len={max_len}",
f"--max_seq_len={2 * max_len}",
"--gemm_plugin=float16",
"--lora_plugin=float16",
"--use_paged_context_fmha=enable",
"--lora_target_modules",
"attn_q",
"attn_k",
"attn_v",
"attn_dense",
"mlp_h_to_4h",
"mlp_4h_to_h",
"mlp_gate",
f"--max_lora_rank={max_lora_rank}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)

print("Convert LoRA to cpp format")
convert_cmd = [
"python",
f"{llama_example_root}/../../../hf_lora_convert.py",
f"-i={llm_lora_model_root}",
"--storage-type=float16",
f"-o={llm_venv.get_working_directory()}/lora_cpp",
]
check_call(" ".join(convert_cmd), shell=True, env=llm_venv._new_env)

print("Prepare datasets")
benchmark_root = f"{llama_example_root}/../../../../benchmarks/cpp"
lora_eg = f"{llm_venv.get_working_directory()}/lora-eg"
base_dataset_cmd = [
f"mkdir -p {lora_eg}/data",
"&&",
"python",
f"{benchmark_root}/prepare_dataset.py",
f"--output={lora_eg}/data/token-norm-dist.json",
f"--tokenizer={llama_model_root}",
"token-norm-dist",
f"--num-requests={num_requests}",
"--input-mean=256",
"--input-stdev=16",
"--output-mean=128",
"--output-stdev 24",
]
check_call(" ".join(base_dataset_cmd), shell=True, env=llm_venv._new_env)

for nloras in num_loras:
lora_dataset_cmd = [
"python",
f"{benchmark_root}/prepare_dataset.py",
f"--output={lora_eg}/data/token-norm-dist-lora-{nloras}.json",
f"--rand-task-id 0 {nloras-1}",
f"--tokenizer={llama_model_root}",
"token-norm-dist",
f"--num-requests={num_requests}",
"--input-mean=256",
"--input-stdev=16",
"--output-mean=128",
"--output-stdev 24",
]
check_call(" ".join(lora_dataset_cmd),
shell=True,
env=llm_venv._new_env)

print("Generate random lora weights for 16 adapters")

lora_weights_cmd = [
"python", f"{benchmark_root}/utils/generate_rand_loras.py",
f"{llm_venv.get_working_directory()}/lora_cpp", f"{lora_eg}/loras", "16"
]
check_call(" ".join(lora_weights_cmd), shell=True, env=llm_venv._new_env)

benchmark_exe = get_cpp_benchmark('gptManagerBenchmark', llm_root)
envs = deepcopy(os.environ)
_ = envs.pop("CUDA_VISIBLE_DEVICES", "")
envs[
"LD_LIBRARY_PATH"] = f'{get_trt_llm_lib_dir(llm_venv)}:{os.path.dirname(benchmark_exe)}:{envs.get("LD_LIBRARY_PATH", "")}'

print(
f'CUDA_VISIBLE_DEVICES: {os.environ.get("CUDA_VISIBLE_DEVICES", None)}')

print("Perform base model benchmarking")
check_call(f"mkdir -p {lora_eg}/log-base-lora", shell=True, env=envs)
base_benchmark_cmd = [
f"{benchmark_exe}",
f"--engine_dir={engine_dir}",
"--type=IFB",
f"--dataset={lora_eg}/data/token-norm-dist.json",
"--lora_host_cache_bytes=8589934592",
f"--lora_num_device_mod_layers={32 * num_layers * num_lora_mods * max_lora_rank}",
"--kv_cache_free_gpu_mem_fraction=0.70",
"--log_level=info",
f"--eos_id={eos_id}",
]
mpi_cmd = [
"mpirun",
"-n",
"2",
"--allow-run-as-root",
"--output-filename",
f"{lora_eg}/log-base-lora",
]
base_benchmark_cmd = mpi_cmd + base_benchmark_cmd
print(
f"Running gptManagerBenchmark using base cmd: {' '.join(base_benchmark_cmd)}"
)
subprocess.check_output(base_benchmark_cmd, env=envs)
# check_call(" ".join(base_benchmark_cmd), env=envs)

print("Perform lora model benchmarking")
for nloras in num_loras:
check_call(f"mkdir -p {lora_eg}/log-lora-{nloras}",
shell=True,
env=envs)
lora_benchmark_cmd = [
f"{benchmark_exe}",
f"--engine_dir={engine_dir}",
"--type=IFB",
f"--dataset={lora_eg}/data/token-norm-dist-lora-{nloras}.json",
"--lora_host_cache_bytes=8589934592",
f"--lora_num_device_mod_layers={16 * num_layers * num_lora_mods * max_lora_rank}",
"--kv_cache_free_gpu_mem_fraction=0.70",
"--log_level=info",
f"--eos_id={eos_id}",
f"--lora_dir={lora_eg}/loras",
]
mpi_cmd = [
"mpirun",
"-n",
"2",
"--allow-run-as-root",
"--output-filename",
f"{lora_eg}/log-lora-{nloras}",
]
lora_benchmark_cmd = mpi_cmd + lora_benchmark_cmd
print(
f"Running gptManagerBenchmark using lora cmd: {' '.join(lora_benchmark_cmd)}"
)
subprocess.check_output(lora_benchmark_cmd, env=envs)
# check_call(lora_benchmark_cmd, env=envs)


@pytest.mark.timeout(7200)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
1 change: 0 additions & 1 deletion tests/integration/test_lists/qa/llm_function_core.txt
@@ -111,7 +111,6 @@ examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-lla
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp8]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo]
examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb]
examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8] TIMEOUT (120)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8] TIMEOUT (90)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
1 change: 0 additions & 1 deletion tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -159,7 +159,6 @@ l0_dgx_h200:
- accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_tp2cp2
- accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
- examples/test_llama.py::test_llm_llama_long_alpaca_8gpu_summary[pg64317-tp4pp2-nb:4]
- examples/test_llama.py::test_llm_llama_v2_lora_benchmark_2gpu[chinese_lora-llama-v2-13b-hf]
- examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora]
- unittest/llmapi/test_llm_multi_gpu.py -m "gpu2 and part0"
- unittest/llmapi/test_llm_multi_gpu.py -m "gpu2 and part1"
3 changes: 3 additions & 0 deletions tests/integration/test_lists/waives.txt
@@ -419,3 +419,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutla
accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] SKIP (https://nvbugs/5702795)
accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5698897)
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560)
test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560)
test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5633700)