From b3e8fa29601567a86d62e90b5cffbcd89d5ad6f4 Mon Sep 17 00:00:00 2001 From: Eran Geva <19514940+MrGeva@users.noreply.github.com> Date: Mon, 11 Aug 2025 08:33:13 +0300 Subject: [PATCH 01/15] [None][test] Test trtllm-bench AD vs, PT BEs on H100 single gpu (#6487) Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com> Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com> Co-authored-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com> --- .../integration/test_lists/test-db/l0_a30.yml | 3 +- .../test_lists/test-db/l0_b200.yml | 2 +- .../test_lists/test-db/l0_h100.yml | 1 + .../unit/singlegpu/test_ad_trtllm_bench.py | 566 +++++++++++++++++- 4 files changed, 549 insertions(+), 23 deletions(-) diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml index ce8058136fa..5ec16996e7c 100644 --- a/tests/integration/test_lists/test-db/l0_a30.yml +++ b/tests/integration/test_lists/test-db/l0_a30.yml @@ -18,8 +18,7 @@ l0_a30: - unittest/_torch/modeling -k "modeling_phi3" - unittest/_torch/modeling -k "modeling_qwen" - unittest/_torch/modeling -k "modeling_qwen_moe" - - unittest/_torch/modeling -k "modeling_exaone4" - - unittest/_torch/auto_deploy/unit/singlegpu + - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - unittest/_torch/test_beam_search.py - condition: ranges: diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 730cd016743..26b4b2a0a88 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -70,7 +70,7 @@ l0_b200: - unittest/_torch/modeling -k "modeling_mixtral" - unittest/_torch/modeling -k "modeling_deepseek" - unittest/_torch/modeling -k "modeling_gpt_oss" - - unittest/_torch/auto_deploy/unit/singlegpu + - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" - 
unittest/_torch/speculative/test_eagle3.py - unittest/_torch/speculative/test_kv_cache_reuse.py - unittest/_torch/speculative/test_dynamic_spec_decode.py diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 43ee39de1af..1a8fded524b 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -15,6 +15,7 @@ l0_h100: tests: # ------------- PyTorch tests --------------- # Only key models in H100: llama/mixtral/nemotron/deepseek + - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (90) - unittest/_torch -k "modeling_llama" - unittest/_torch/modeling -k "modeling_mixtral" diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py index 2985e662b27..f5ec68e28d9 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py @@ -1,14 +1,231 @@ +import json +import re import subprocess import tempfile from pathlib import Path +import pytest import yaml from _model_test_utils import _hf_model_dir_or_hub_id -from click.testing import CliRunner from utils.cpp_paths import llm_root # noqa: F401 from utils.llm_data import llm_models_root -from tensorrt_llm.commands.bench import main + +def parse_kv_cache_metrics(log_output: str, free_mem_ratio: float = 0.8): + """Parse KV cache metrics from the benchmark log output.""" + metrics = {} + + # Simple patterns based on actual log format + patterns = { + "current_cache_size": r"Current cache size:\s*(\d+)", + "free_mem_pre_mb": r"Free memory before forward pass \(MB\):\s*(\d+)", + "free_mem_post_mb": r"Free memory after forward pass \(MB\):\s*(\d+)", + } + + # Extract metrics using 
simple regex patterns + for metric_name, pattern in patterns.items(): + match = re.search(pattern, log_output, re.IGNORECASE) + if match: + value = int(match.group(1)) + metrics[metric_name] = value + print(f" ✅ Found {metric_name}: {value}") + else: + print(f" ❌ Could not find {metric_name}") + + # Calculate new_cache_size using the same formula as in resize_kv_cache + # new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size + if "free_mem_post_mb" in metrics and "current_cache_size" in metrics: + metrics["new_cache_size"] = int( + metrics["free_mem_post_mb"] * 1024 * 1024 * free_mem_ratio + + metrics["current_cache_size"] + ) + print( + f" ✅ Calculated new_cache_size: {metrics['new_cache_size']} (using free_mem_ratio={free_mem_ratio})" + ) + else: + print(" ❌ Cannot calculate new_cache_size - missing required metrics") + + return metrics + + +def run_benchmark( + model_name: str, + dataset_path: str, + temp_dir: str, + backend: str = "_autodeploy", + report_json_path: str = None, + max_batch_size: int = 32, + num_hidden_layers: int = 2, + free_mem_ratio: float = 0.1, +): + """Run benchmark and capture KV cache metrics from log output.""" + + # Read the test config to get free_mem_ratio + config_path = f"{temp_dir}/extra_llm_api_options.yaml" + + # Build the command to run the benchmark + cmd = [ + "python", + "-m", + "tensorrt_llm.commands.bench", + "--model", + model_name, + "throughput", + "--backend", + backend, + "--dataset", + str(dataset_path), + "--max_batch_size", + str(max_batch_size), + ] + + # Add report_json argument if path is provided + if report_json_path: + cmd.extend(["--report_json", report_json_path]) + + if backend == "_autodeploy": + # Add extra_llm_api_options only for autodeploy backend + cmd.extend(["--extra_llm_api_options", config_path]) + + # Run benchmark as subprocess to capture ALL output + import os + + env = os.environ.copy() + if backend == "pytorch": + env["TLLM_OVERRIDE_LAYER_NUM"] = 
str(num_hidden_layers) + print(f"📋 Using TLLM_OVERRIDE_LAYER_NUM from env: {env['TLLM_OVERRIDE_LAYER_NUM']}") + cmd.extend(["--kv_cache_free_gpu_mem_fraction", str(free_mem_ratio)]) + print(f"🚀 Running benchmark command ({backend} backend): {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True, env=env, timeout=600) + + # Check if the command succeeded + assert result.returncode == 0, ( + f"Benchmark failed with return code {result.returncode}:\n" + f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}" + ) + + # Combine stdout and stderr for parsing + full_log_output = f"{result.stdout}\n{result.stderr}" + + # Parse KV cache metrics from the combined log output (only for autodeploy backend) + kv_cache_metrics = {} + if backend == "_autodeploy": + kv_cache_metrics = parse_kv_cache_metrics(full_log_output, free_mem_ratio) + print("📊 KV Cache Metrics parsed from logs:") + if kv_cache_metrics: + for key, value in kv_cache_metrics.items(): + if "mb" in key.lower(): + print(f" {key}: {value}MB") + else: + print(f" {key}: {value} bytes") + else: + print(" âš ī¸ No KV cache metrics were parsed successfully") + else: + print(f"📊 KV Cache Metrics: Skipped for {backend} backend") + + # Return parsed JSON report with KV cache metrics if requested + if report_json_path and Path(report_json_path).exists(): + with open(report_json_path, "r") as f: + report_data = json.load(f) + + # Add KV cache metrics to the report (only for autodeploy backend) + if backend == "_autodeploy": + report_data["kv_cache_metrics"] = kv_cache_metrics + report_data["backend"] = backend + return report_data + return None + + +def compare_backends_performance( + autodeploy_tokens_per_sec: float, + pytorch_tokens_per_sec: float, + relative_tolerance: float = 0.20, + absolute_tolerance: float = 10.0, +): + """ + Compare performance between autodeploy and pytorch backends. + Fails if autodeploy is significantly worse than pytorch. 
+ + Args: + autodeploy_tokens_per_sec: Performance of autodeploy backend + pytorch_tokens_per_sec: Performance of pytorch backend + relative_tolerance: Relative tolerance (20% by default for backend comparison) + absolute_tolerance: Absolute tolerance (10 tokens/sec by default) + """ + # Calculate performance difference + performance_diff = pytorch_tokens_per_sec - autodeploy_tokens_per_sec + relative_diff = performance_diff / pytorch_tokens_per_sec if pytorch_tokens_per_sec > 0 else 0 + + print("=== BACKEND PERFORMANCE COMPARISON ===") + print(f"PyTorch backend: {pytorch_tokens_per_sec:.2f} tokens/sec/user") + print(f"Autodeploy backend: {autodeploy_tokens_per_sec:.2f} tokens/sec/user") + print(f"Performance difference: {performance_diff:.2f} tokens/sec ({relative_diff:.2%})") + + # If autodeploy is better than or equal to pytorch, always pass + if autodeploy_tokens_per_sec >= pytorch_tokens_per_sec: + print("✅ Autodeploy backend matches or exceeds PyTorch backend performance") + return + + # Autodeploy is slower - check if it's within acceptable tolerance + within_relative_tolerance = relative_diff <= relative_tolerance + within_absolute_tolerance = performance_diff <= absolute_tolerance + + if within_relative_tolerance or within_absolute_tolerance: + print("✅ Autodeploy backend performance within acceptable tolerance") + print( + f" Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute" + ) + else: + assert False, ( + f"Autodeploy backend significantly underperforms compared to PyTorch! 
" + f"Autodeploy: {autodeploy_tokens_per_sec:.2f} tokens/sec/user, " + f"PyTorch: {pytorch_tokens_per_sec:.2f} tokens/sec/user, " + f"Performance gap: {performance_diff:.2f} tokens/sec ({relative_diff:.2%}), " + f"Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute" + ) + + +def assert_performance_within_tolerance( + actual_tokens_per_sec: float, + golden_tokens_per_sec: float, + relative_tolerance: float = 0.15, + absolute_tolerance: float = 10.0, +): + """ + Assert that actual performance is within tolerance of golden result. + Only fails if performance is WORSE than golden - improvements always pass. + + Args: + actual_tokens_per_sec: Measured performance metric + golden_tokens_per_sec: Expected performance metric + relative_tolerance: Relative tolerance (15% by default) + absolute_tolerance: Absolute tolerance (10 tokens/sec by default) + """ + # If actual performance is better than or equal to golden, always pass + if actual_tokens_per_sec >= golden_tokens_per_sec: + print( + f"✅ Performance improvement detected:" + f" {actual_tokens_per_sec:.2f} >= {golden_tokens_per_sec:.2f} tokens/sec/user" + ) + return + + # Performance is worse than golden - check if it's within acceptable tolerance + performance_drop = golden_tokens_per_sec - actual_tokens_per_sec + relative_drop = ( + performance_drop / golden_tokens_per_sec if golden_tokens_per_sec > 0 else float("inf") + ) + + # Performance should be within relative tolerance OR absolute tolerance + within_relative_tolerance = relative_drop <= relative_tolerance + within_absolute_tolerance = performance_drop <= absolute_tolerance + + assert within_relative_tolerance or within_absolute_tolerance, ( + f"Performance regression detected! 
" + f"Actual: {actual_tokens_per_sec:.2f} tokens/sec/user, " + f"Golden: {golden_tokens_per_sec:.2f} tokens/sec/user, " + f"Performance drop: {performance_drop:.2f} tokens/sec ({relative_drop:.2%}), " + f"Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute" + ) def prepare_dataset(root_dir: str, temp_dir: str, model_name: str): @@ -17,7 +234,7 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str): dataset_tool = Path(root_dir, "benchmarks", "cpp", "prepare_dataset.py") script_dir = Path(root_dir, "benchmarks", "cpp") - # Generate a small dataset to run a test. + # Generate a small dataset to run a test - matching workload configuration command = [ "python3", f"{dataset_tool}", @@ -37,7 +254,9 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str): "10", ] print(f"Running command: {' '.join(command)}") - result = subprocess.run(command, cwd=str(script_dir), capture_output=True, text=True) + result = subprocess.run( + command, cwd=str(script_dir), capture_output=True, text=True, timeout=300 + ) if result.returncode != 0: raise RuntimeError(f"Failed to prepare dataset: {result.stderr}") # Grab the stdout and write it to a dataset file for passing to suite. 
@@ -46,22 +265,324 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str): return dataset_path -def run_benchmark(model_name: str, dataset_path: str, temp_dir: str): - runner = CliRunner() +def calculate_expected_kv_cache_metrics(free_mem_ratio: float): + """Calculate expected KV cache metrics based on actual GPU memory.""" + try: + import torch - args = [ - "--model", - model_name, - "throughput", - "--backend", - "_autodeploy", - "--dataset", - dataset_path, - "--extra_llm_api_options", - f"{temp_dir}/model_kwargs.yaml", + if torch.cuda.is_available(): + # Get total GPU memory in MB + _, total_mem_bytes = torch.cuda.mem_get_info(0) + total_mem_mb = total_mem_bytes // (1024 * 1024) + + # Estimate expected values based on model size + # For TinyLlama-1.1B, model should be 2.2GB + estimated_model_size_mb = 2200 # Conservative estimate + # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/6335 check why there is extra consumption + extra_consumption_mb = 2500 + expected_free_mem_range = ( + total_mem_mb - estimated_model_size_mb - extra_consumption_mb, + total_mem_mb - estimated_model_size_mb, + ) + + # Current cache size is typically small initially (16MB range) + expected_current_cache_size = 16777216 + + # Free memory values should be in reasonable range + expected_free_mem_pre_range = expected_free_mem_range + expected_free_mem_post_range = ( + expected_free_mem_range[0] - 1000, + expected_free_mem_range[1] - 500, + ) + + print("📊 GPU Memory Analysis:") + print(f" Total GPU memory: {total_mem_mb}MB") + print( + f" Expected free memory range: {expected_free_mem_range[0]}-{expected_free_mem_range[1]}MB" + ) + + return { + "total_mem_mb": total_mem_mb, + "expected_current_cache_size": expected_current_cache_size, + "expected_free_mem_pre_range": expected_free_mem_pre_range, + "expected_free_mem_post_range": expected_free_mem_post_range, + "free_mem_ratio": free_mem_ratio, + } + else: + return None + except ImportError: + return None + + +def 
validate_kv_cache_metrics_dynamic(kv_cache_metrics: dict, expected_metrics: dict): + """Validate KV cache metrics using dynamic expected values.""" + + # Validate current_cache_size (should be relatively stable) + current_cache_size = kv_cache_metrics.get("current_cache_size") + expected_cache_size = expected_metrics["expected_current_cache_size"] + if current_cache_size: + cache_diff = abs(current_cache_size - expected_cache_size) / expected_cache_size + assert cache_diff <= 0.5, ( # 50% tolerance for cache size + f"Current cache size outside expected range: {current_cache_size} vs expected ~{expected_cache_size}" + ) + print(f" ✅ current_cache_size: {current_cache_size} bytes (within range)") + + # Validate free memory values are in reasonable ranges + free_mem_pre = kv_cache_metrics.get("free_mem_pre_mb") + free_mem_post = kv_cache_metrics.get("free_mem_post_mb") + + if free_mem_pre: + pre_range = expected_metrics["expected_free_mem_pre_range"] + assert pre_range[0] <= free_mem_pre <= pre_range[1], ( + f"Free memory before forward pass outside expected range: " + f"{free_mem_pre}MB not in range {pre_range[0]}-{pre_range[1]}MB" + ) + print(f" ✅ free_mem_pre_mb: {free_mem_pre}MB (within range)") + + if free_mem_post: + post_range = expected_metrics["expected_free_mem_post_range"] + assert post_range[0] <= free_mem_post <= post_range[1], ( + f"Free memory after forward pass outside expected range: " + f"{free_mem_post}MB not in range {post_range[0]}-{post_range[1]}MB" + ) + print(f" ✅ free_mem_post_mb: {free_mem_post}MB (within range)") + + # Validate memory consumption (pre should be > post) + if free_mem_pre and free_mem_post: + memory_consumed = free_mem_pre - free_mem_post + assert memory_consumed > 0, ( + f"Expected memory consumption during forward pass, got {memory_consumed}MB" + ) + assert memory_consumed < 5000, f"Memory consumption too high: {memory_consumed}MB" + print(f" ✅ Memory consumed during forward pass: {memory_consumed}MB (reasonable)") + + # 
Validate calculated new_cache_size + new_cache_size = kv_cache_metrics.get("new_cache_size") + if new_cache_size and free_mem_post and current_cache_size: + expected_new_cache = int( + free_mem_post * 1024 * 1024 * expected_metrics["free_mem_ratio"] + current_cache_size + ) + cache_size_diff = abs(new_cache_size - expected_new_cache) / expected_new_cache + assert cache_size_diff <= 0.01, ( # 1% tolerance for calculated value + f"Calculated new_cache_size mismatch: {new_cache_size} vs expected {expected_new_cache}" + ) + print(f" ✅ new_cache_size: {new_cache_size} bytes (calculation correct)") + + +def extract_performance_metric(report_data, report_name="benchmark"): + """Extract performance metric from a benchmark report with validation.""" + assert report_data is not None, f"Failed to capture {report_name} report" + assert "performance" in report_data, f"Performance metrics not found in {report_name} report" + + tokens_per_sec = report_data["performance"].get("output_throughput_per_user_tok_s") + assert tokens_per_sec is not None, ( + f"output_throughput_per_user_tok_s not found in {report_name} performance metrics" + ) + + return tokens_per_sec + + +def validate_and_extract_kv_cache_metrics(report_data, free_mem_ratio, require_metrics=True): + """ + Validate and extract KV cache metrics from report. + + Args: + report_data: The benchmark report data + free_mem_ratio: Free memory ratio for calculating expected metrics + require_metrics: If True, fail when metrics are missing. If False, just warn. + + Returns: + Tuple of (kv_cache_metrics, expected_metrics) or (None, None) if validation fails + """ + required_metrics = [ + "current_cache_size", + "free_mem_pre_mb", + "free_mem_post_mb", + "new_cache_size", ] - result = runner.invoke(main, args, catch_exceptions=False) - assert result.exit_code == 0 + + # Extract KV cache metrics + kv_cache_metrics = report_data.get("kv_cache_metrics", {}) + + if not kv_cache_metrics: + message = ( + "KV cache metrics not found! 
" + "The autodeploy backend must log memory statistics for this test to pass. " + f"Expected metrics: {', '.join(required_metrics)}" + ) + if require_metrics: + assert False, f"REQUIRED {message}" + else: + print(f"â„šī¸ {message}") + assert False, "KV cache metrics are missing" + + # Check for missing metrics + missing_metrics = [metric for metric in required_metrics if metric not in kv_cache_metrics] + + if missing_metrics: + message = ( + f"Missing required KV cache metrics: {missing_metrics}. " + f"Found metrics: {list(kv_cache_metrics.keys())}. " + f"All of {required_metrics} are required for the test to pass." + ) + if require_metrics: + assert False, message + else: + print(f"â„šī¸ KV cache validation skipped - {message}") + assert False, "KV cache metrics are missing" + + # Calculate expected metrics + expected_metrics = calculate_expected_kv_cache_metrics(free_mem_ratio) + assert expected_metrics, "Could not determine expected metrics for this GPU" + + return kv_cache_metrics, expected_metrics + + +def print_kv_cache_metrics(kv_cache_metrics): + """Print KV cache metrics in a formatted way.""" + print("=== KV CACHE METRICS (DYNAMIC VALIDATION) ===") + for metric_name, actual_value in kv_cache_metrics.items(): + if "mb" in metric_name.lower(): + print(f"{metric_name}: {actual_value}MB") + else: + print(f"{metric_name}: {actual_value} bytes") + + +def trtllm_bench_unified_comparison( + llm_root, # noqa: F811 + comparison_mode="backend", + free_mem_ratio=0.1, + num_hidden_layers=2, + max_batch_size=32, # below this value the kv cache resizing is skipped + golden_tokens_per_sec=1400, + backend_relative_tolerance=0.2, + backend_absolute_tolerance=250.0, + golden_relative_tolerance=0.1, + golden_absolute_tolerance=5.0, +): + """ + Unified test that compares autodeploy backend performance in two modes: + - "backend": compares against pytorch backend performance + - "golden": compares against predefined golden performance values + + Args: + llm_root: Root 
directory for LLM models (pytest fixture) + comparison_mode: Either "backend" or "golden" to determine comparison type + free_mem_ratio: Ratio of free memory to use for KV cache + num_hidden_layers: Number of hidden layers for the model + max_batch_size: Maximum batch size for benchmarking + golden_tokens_per_sec: Golden performance value in tokens/sec/user + backend_relative_tolerance: Relative tolerance for backend comparison + backend_absolute_tolerance: Absolute tolerance for backend comparison + golden_relative_tolerance: Relative tolerance for golden comparison + golden_absolute_tolerance: Absolute tolerance for golden comparison + """ + model_name = _hf_model_dir_or_hub_id( + f"{llm_models_root()}/TinyLlama-1.1B-Chat-v1.0", "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + ) + + with tempfile.TemporaryDirectory() as temp_dir: + with open(f"{temp_dir}/extra_llm_api_options.yaml", "w") as f: + yaml.dump( + { + "model_kwargs": {"num_hidden_layers": num_hidden_layers}, + "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32], + "compile_backend": "torch-opt", + "free_mem_ratio": free_mem_ratio, + "runtime": "trtllm", + }, + f, + ) + + dataset_path = prepare_dataset(llm_root, temp_dir, model_name) + + # Always run autodeploy backend + autodeploy_report_path = f"{temp_dir}/autodeploy_report.json" + print("=== RUNNING AUTODEPLOY BACKEND ===") + autodeploy_report = run_benchmark( + model_name, + dataset_path, + temp_dir, + "_autodeploy", + autodeploy_report_path, + max_batch_size, + num_hidden_layers, + free_mem_ratio, + ) + + # Extract autodeploy performance metrics + autodeploy_tokens_per_sec = extract_performance_metric(autodeploy_report, "autodeploy") + + # Validate and extract KV cache metrics (now required for both modes after user's changes) + kv_cache_metrics, expected_metrics = validate_and_extract_kv_cache_metrics( + autodeploy_report, free_mem_ratio, require_metrics=True + ) + + if comparison_mode == "backend": + # Backend comparison mode: also run pytorch backend + 
pytorch_report_path = f"{temp_dir}/pytorch_report.json" + print("=== RUNNING PYTORCH BACKEND ===") + pytorch_report = run_benchmark( + model_name, + dataset_path, + temp_dir, + "pytorch", + pytorch_report_path, + max_batch_size, + num_hidden_layers, + free_mem_ratio, + ) + + # Extract pytorch performance metrics + pytorch_tokens_per_sec = extract_performance_metric(pytorch_report, "pytorch") + + # Compare backend performance + compare_backends_performance( + autodeploy_tokens_per_sec, + pytorch_tokens_per_sec, + relative_tolerance=backend_relative_tolerance, + absolute_tolerance=backend_absolute_tolerance, + ) + + # Validate KV cache metrics + validate_kv_cache_metrics_dynamic(kv_cache_metrics, expected_metrics) + print("✅ KV Cache Metrics validation passed") + + print("=== BACKEND COMPARISON TEST PASSED ===") + print(f"Autodeploy: {autodeploy_tokens_per_sec:.2f} tokens/sec/user") + print(f"PyTorch: {pytorch_tokens_per_sec:.2f} tokens/sec/user") + + elif comparison_mode == "golden": + # Golden comparison mode: compare against golden values + print("=== PERFORMANCE METRICS ===") + print(f"Measured performance: {autodeploy_tokens_per_sec:.2f} tokens/sec/user") + print(f"Golden performance: {golden_tokens_per_sec:.2f} tokens/sec/user") + + # Print KV cache metrics + print_kv_cache_metrics(kv_cache_metrics) + + # Performance validation + assert_performance_within_tolerance( + autodeploy_tokens_per_sec, + golden_tokens_per_sec, + relative_tolerance=golden_relative_tolerance, + absolute_tolerance=golden_absolute_tolerance, + ) + + # KV cache metrics validation + print( + f"Validating {len(kv_cache_metrics)} KV cache metrics against GPU-specific ranges..." 
+ ) + validate_kv_cache_metrics_dynamic(kv_cache_metrics, expected_metrics) + + print("=== ALL TESTS PASSED ===") + print(f"Performance: ✅ {autodeploy_tokens_per_sec:.2f} tokens/sec/user within bounds") + print("KV Cache Metrics: ✅ All metrics within GPU-specific expected ranges") + + else: + raise ValueError( + f"Invalid comparison_mode: {comparison_mode}. Must be 'backend' or 'golden'" + ) def test_trtllm_bench(llm_root): # noqa: F811 @@ -70,15 +591,20 @@ def test_trtllm_bench(llm_root): # noqa: F811 ) with tempfile.TemporaryDirectory() as temp_dir: - with open(f"{temp_dir}/model_kwargs.yaml", "w") as f: + with open(f"{temp_dir}/extra_llm_api_options.yaml", "w") as f: yaml.dump( { "model_kwargs": {"num_hidden_layers": 2}, "cuda_graph_batch_sizes": [1, 2], - "max_batch_size": 128, }, f, ) dataset_path = prepare_dataset(llm_root, temp_dir, model_name) run_benchmark(model_name, dataset_path, temp_dir) + + +@pytest.mark.no_xdist +def test_trtllm_bench_backend_comparison(llm_root): # noqa: F811 + """Test that compares autodeploy backend performance against pytorch backend.""" + trtllm_bench_unified_comparison(llm_root, comparison_mode="backend") From 62d6c98d68b5a83f05f5c0d04d6fbb056fc19806 Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Mon, 11 Aug 2025 14:38:05 +0800 Subject: [PATCH 02/15] [TRTLLM-5633][infra] Force set changed file diff to empty string for post-merge CI (#6777) Signed-off-by: Yiqing Yan --- jenkins/L0_MergeRequest.groovy | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 95522b2bf26..d00dd66d534 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -591,6 +591,12 @@ def getMergeRequestChangedFileList(pipeline, globalVars) { } def getMergeRequestOneFileChanges(pipeline, globalVars, filePath) { + def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/) + if (env.alternativeTRT || isOfficialPostMergeJob) { + pipeline.echo("Force set changed 
file diff to empty string.") + return "" + } + def githubPrApiUrl = globalVars[GITHUB_PR_API_URL] def diff = "" From 9c358c26e486db89de44ed55e3d210eb198a6556 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Mon, 11 Aug 2025 14:39:58 +0800 Subject: [PATCH 03/15] [None][chore] remove closed bugs (#6772) Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 71643434923..fb5964279e6 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -256,7 +256,6 @@ unittest/trt/attention/test_gpt_attention.py -k "partition3" SKIP (https://nvbug test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] SKIP (https://nvbugs/5414909) unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5418673) unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5418673) -examples/test_llama.py::test_llm_api_lookahead_decoding_1gpu[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugs/5419066) examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5141288) examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5419067) examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4] SKIP (https://nvbugs/5419068) From d6ad4a9d5b0e1a2c8211f3688985d125459c9cde Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Mon, 11 Aug 2025 15:16:25 +0800 Subject: 
[PATCH 04/15] [None][infra] Waive failed tests on main 0811 (#6778) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 1 + tests/unittest/llmapi/test_llm_pytorch.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index fb5964279e6..026eeeca5c4 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -293,3 +293,4 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466) disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b] SKIP (https://nvbugs/5445642) examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530) +examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 541965b588f..e519df1cf2c 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -471,6 +471,7 @@ def test_llama_7b_lora_config_overrides_peft_cache_config(): # TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high # https://jirasw.nvidia.com/browse/TRTLLM-5045 +@pytest.mark.skip(reason="https://nvbugs/5448464") @skip_gpu_memory_less_than_138gb def test_nemotron_nas_lora() -> None: lora_config = LoraConfig(lora_dir=[ From 9a8195ef88b7f8279e3608e32f19f2959e68c671 Mon Sep 17 00:00:00 2001 From: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> Date: Mon, 11 Aug 2025 00:18:17 -0700 Subject: [PATCH 05/15] fix: Ensure that Python stub generation works against libnvidia-ml stubs (#6188) Signed-off-by: Martin Marciniszyn Mehringer 
<11665257+MartinMarciniszyn@users.noreply.github.com> --- cpp/tensorrt_llm/nanobind/CMakeLists.txt | 2 +- cpp/tensorrt_llm/pybind/CMakeLists.txt | 2 +- docker/Dockerfile.multi | 5 +- scripts/build_wheel.py | 185 +++++++++++++---------- 4 files changed, 113 insertions(+), 81 deletions(-) diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt index aa5b3cf45da..af657a625e2 100755 --- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt +++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt @@ -52,6 +52,6 @@ if(NOT WIN32) ${TRTLLM_NB_MODULE} PROPERTIES LINK_FLAGS - "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" + "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" ) endif() diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt index b4809d5135e..bb1d87f9d4b 100755 --- a/cpp/tensorrt_llm/pybind/CMakeLists.txt +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -53,6 +53,6 @@ if(NOT WIN32) ${TRTLLM_PYBIND_MODULE} PROPERTIES LINK_FLAGS - "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" + "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" ) endif() diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index c832481da9f..eeafc8f4a65 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -71,8 +71,9 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999" # Install OpenCV with FFMPEG support -RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/ -RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir 
+RUN pip3 uninstall -y opencv && \ + rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \ + pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir # WARs against security issues inherited from pytorch:25.06 # * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7 diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 52abdbcb844..3041e684c96 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -27,7 +27,7 @@ from shutil import copy, copytree, rmtree from subprocess import DEVNULL, CalledProcessError, check_output, run from textwrap import dedent -from typing import List +from typing import Sequence try: from packaging.requirements import Requirement @@ -120,7 +120,8 @@ def create_venv(project_dir: Path): return venv_prefix -def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool): +def setup_venv(project_dir: Path, requirements_file: Path, + no_venv: bool) -> tuple[Path, Path]: """Creates/updates a venv and installs requirements. Args: @@ -279,6 +280,103 @@ def generate_fmha_cu(project_dir, venv_python): os.chdir(project_dir) +def create_cuda_stub_links(cuda_stub_dir: str): + """ + Creates symbolic links for CUDA stub libraries in the provided directory. + + Args: + cuda_stub_dir (str): Path to the directory containing CUDA stubs. + """ + cuda_stub_path = Path(cuda_stub_dir) + if not cuda_stub_path.exists(): + raise RuntimeError( + f"CUDA stub directory '{cuda_stub_dir}' does not exist.") + + shared_objects = ["cuda.so", + "nvidia-ml.so"] # List of shared object names to process. + + for lib_name in shared_objects: + # Define the full paths for the library (.so) and its versioned link (.so.1). + so = cuda_stub_path / f"lib{lib_name}" # e.g., libcuda.so + so_versioned = cuda_stub_path / f"lib{lib_name}.1" # e.g., libcuda.so.1 + + # Check if the library exists and the versioned link does not. + if so.exists() and not so_versioned.exists(): + try: + # Attempt to create the symbolic link. 
+ so_versioned.symlink_to(so) + except PermissionError: + # Handle permission errors by attempting to use `sudo` to create the link. + try: + build_run(f"sudo ln -s {str(so)} {str(so_versioned)}") + except CalledProcessError as sudo_error: + print( + f"Failed to create symbolic link even with sudo: {sudo_error}" + ) + + +def generate_python_stubs_linux(binding_type: str, venv_python: Path, + deep_ep: bool): + is_nanobind = binding_type == "nanobind" + package = "nanobind" if is_nanobind else "pybind11-stubgen" + build_run(f"\"{venv_python}\" -m pip install {package}") + + env_stub_gen = os.environ.copy() + cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get( + "CUDA_PATH") or "/usr/local/cuda" + cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs" + ld_library_path = env_stub_gen.get("LD_LIBRARY_PATH") + if Path(cuda_stub_dir).exists(): + # Create symbolic links for the CUDA stubs + create_cuda_stub_links(cuda_stub_dir) + env_stub_gen[ + "LD_LIBRARY_PATH"] = f"{ld_library_path}:{cuda_stub_dir}" if ld_library_path else cuda_stub_dir + if is_nanobind: + build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .", + env=env_stub_gen) + else: + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", + env=env_stub_gen) + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code", + env=env_stub_gen) + if deep_ep: + build_run( + f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code", + env=env_stub_gen) + + +def generate_python_stubs_windows(binding_type: str, venv_python: Path, + pkg_dir: Path, lib_dir: Path): + if binding_type == "nanobind": + print("Windows not yet supported for nanobind stubs") + exit(1) + else: + build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen") + stubgen = "stubgen.py" + stubgen_contents = """ + # Loading torch, trt before bindings is required to avoid import errors on windows. 
+ # isort: off + import torch + import tensorrt as trt + # isort: on + import os + import platform + + from pybind11_stubgen import main + + if __name__ == "__main__": + # Load dlls from `libs` directory before launching bindings. + if platform.system() == "Windows": + os.add_dll_directory(r\"{lib_dir}\") + main() + """.format(lib_dir=lib_dir) + (pkg_dir / stubgen).write_text(dedent(stubgen_contents)) + build_run(f"\"{venv_python}\" {stubgen} -o . bindings") + (pkg_dir / stubgen).unlink() + + def main(*, build_type: str = "Release", generator: str = "", @@ -286,7 +384,7 @@ def main(*, dist_dir: Path = None, cuda_architectures: str = None, job_count: int = None, - extra_cmake_vars: List[str] = list(), + extra_cmake_vars: Sequence[str] = tuple(), extra_make_targets: str = "", trt_root: str = '/usr/local/tensorrt', nccl_root: str = None, @@ -361,7 +459,7 @@ def main(*, if on_windows: # Windows does not support multi-device currently. - extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"]) + extra_cmake_vars += ["ENABLE_MULTI_DEVICE=0"] # The Ninja CMake generator is used for our Windows build # (Easier than MSBuild to make compatible with our Docker image) @@ -703,81 +801,14 @@ def get_binding_lib(subdirectory, name): dirs_exist_ok=True) if not skip_stubs: - with working_directory(project_dir): - if binding_type == "nanobind": - build_run(f"\"{venv_python}\" -m pip install nanobind") - else: - build_run( - f"\"{venv_python}\" -m pip install pybind11-stubgen") with working_directory(pkg_dir): if on_windows: - if binding_type == "nanobind": - print("Windows not yet supported for nanobind stubs") - exit(1) - else: - stubgen = "stubgen.py" - stubgen_contents = """ - # Loading torch, trt before bindings is required to avoid import errors on windows. - # isort: off - import torch - import tensorrt as trt - # isort: on - import os - import platform - - from pybind11_stubgen import main - - if __name__ == "__main__": - # Load dlls from `libs` directory before launching bindings. 
- if platform.system() == "Windows": - os.add_dll_directory(r\"{lib_dir}\") - main() - """.format(lib_dir=lib_dir) - (pkg_dir / stubgen).write_text(dedent(stubgen_contents)) - build_run(f"\"{venv_python}\" {stubgen} -o . bindings") - (pkg_dir / stubgen).unlink() - else: - env_ld = os.environ.copy() - - new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real" - if 'LD_LIBRARY_PATH' in env_ld: - new_library_path += f":{env_ld['LD_LIBRARY_PATH']}" - - result = build_run("find /usr -name *libnvidia-ml.so*", - capture_output=True, - text=True) - assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}" - - # Build containers only contain stub version of libnvidia-ml.so and not the real version. - # If real version not in system, we need to create symbolic link to stub version to prevent import errors. - if "libnvidia-ml.so.1" not in result.stdout: - if "libnvidia-ml.so" in result.stdout: - line = result.stdout.splitlines()[0] - path = os.path.dirname(line) - new_library_path += f":{path}" - build_run(f"ln -s {line} {path}/libnvidia-ml.so.1") - else: - print( - f"Failed to find libnvidia-ml.so: {result.stderr}", - file=sys.stderr) - exit(1) - - env_ld["LD_LIBRARY_PATH"] = new_library_path - if binding_type == "nanobind": - build_run( - f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .", - env=env_ld) - else: - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code", - env=env_ld) - if deep_ep_cuda_architectures: - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code", - env=env_ld) - build_run( - f"\"{venv_python}\" -m pybind11_stubgen -o . 
deep_gemm_cpp_tllm --exit-code", - env=env_ld) + generate_python_stubs_windows(binding_type, venv_python, + pkg_dir, lib_dir) + else: # on linux + generate_python_stubs_linux( + binding_type, venv_python, + bool(deep_ep_cuda_architectures)) if not skip_building_wheel: if dist_dir is None: From 83dbc6c75dd1b107bfbdc5d7af943ef3db78be28 Mon Sep 17 00:00:00 2001 From: bhsueh_NV <11360707+byshiue@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:14:52 +0800 Subject: [PATCH 06/15] [TRTLLM-5532][feat] store the block of context request into kv cache (#6683) Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp | 9 ++++++--- tensorrt_llm/_torch/pyexecutor/resource_manager.py | 4 ++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index c032c80757c..d5fa982a37a 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -2043,10 +2043,13 @@ void KVCacheManager::addSequence( void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest) { auto const requestId = llmRequest.mRequestId; - auto& sequence = getSequence(requestId); - if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest()) + if (mSequences.find(requestId) != mSequences.end()) { - mBlockManager.storeContextBlocks(sequence, llmRequest); + auto& sequence = getSequence(requestId); + if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest()) + { + mBlockManager.storeContextBlocks(sequence, llmRequest); + } } } diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index b08c106e7e1..89be7d40e35 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -492,6 +492,10 @@ def update_resources(self, 
scheduled_batch: ScheduledRequests): if request.py_rewind_len > 0: self.rewind_kv_cache(request, request.py_rewind_len) + # For context requests, we store the blocks for reuse. + for request in scheduled_batch.context_requests: + self.impl.store_context_blocks(request) + def free_resources(self, request: LlmRequest): self.impl.remove_sequence(request.py_request_id, request) From a2e9153cb0abba0e5ba4f47404f41cdaf40aa4e4 Mon Sep 17 00:00:00 2001 From: Liao Lanyu <108499334+lancelly@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:25:41 +0800 Subject: [PATCH 07/15] [None][doc] Add K2 tool calling examples (#6667) Signed-off-by: Lanyu Liao Co-authored-by: Lanyu Liao --- examples/models/core/kimi_k2/README.md | 127 +++++++++++ .../kimi_k2/kimi_k2_tool_calling_example.py | 201 ++++++++++++++++++ 2 files changed, 328 insertions(+) create mode 100644 examples/models/core/kimi_k2/README.md create mode 100644 examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py diff --git a/examples/models/core/kimi_k2/README.md b/examples/models/core/kimi_k2/README.md new file mode 100644 index 00000000000..1dd3e353c5a --- /dev/null +++ b/examples/models/core/kimi_k2/README.md @@ -0,0 +1,127 @@ +# K2 (Kimi-K2-Instruct) + +## Overview + +Kimi K2 is Moonshot AI's Mixture-of-Experts model with 32 billion activated parameters and 1 trillion total parameters. It achieves state-of-the-art performance in frontier knowledge, math, and coding among non-thinking models. Notably, K2 also excels in agentic capabilities, demonstrating outstanding performance across complex, multi-step tasks. + +## Prerequisites for Tool Calling in Kimi-K2 + +K2 model supports tool calling functionality. The official guide can be found at: [tool_call_guidance](https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md) + +As described in the official guide, a tool calling process in Kimi-K2 includes: +1. Passing function descriptions to Kimi-K2. +2. 
Kimi-K2 decides to make a function call and returns the necessary information for the function call to the user. +3. The user performs the function call, collects the call results, and passes the function call results to Kimi-K2. +4. Kimi-K2 continues to generate content based on the function call results until the model believes it has obtained sufficient information to respond to the user. + +Tools are the primary way to define callable functions for K2. Each tool requires: +- A unique name +- A clear description +- A JSON schema defining the expected parameters + +A possible example of a tool description (you may refer to [Using tools](https://huggingface.co/docs/hugs/guides/function-calling) for more information) is as follows: +```python +# Collect the tool descriptions in tools +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather information. Call this tool when the user needs to get weather information", + "parameters": { + "type": "object", + "required": ["location"], + "properties": { + "location": { + "type": "string", + "description": "location name", + } + } + } + } +}] +``` + +Kimi currently supports two main approaches for tool calling: +1. *Use openai.OpenAI to send messages to Kimi-K2 together with tool descriptions.* +In this mode, the descriptions of the tools are passed as an argument to `client.chat.completions.create`, and the tool-call details can be read directly from the corresponding fields in the response. +2. *Manually parse the tool-call requests from the outputs generated by Kimi-K2.* +The tool call requests generated by Kimi-K2 are wrapped by <|tool_calls_section_begin|> and <|tool_calls_section_end|>, with each tool call wrapped by <|tool_call_begin|> and <|tool_call_end|>. The tool ID and arguments are separated by <|tool_call_argument_begin|>. The format of the tool ID is functions.{func_name}:{idx}, from which we can parse the function name.
+ +**Note that TensorRT-LLM does not support the first approach for now. If you deploy K2 with TensorRT-LLM, you need to manually parse the tool-call requests from the outputs.** + +The next section is an example that deploys the K2 model using TensorRT-LLM and then manually parses the tool-call results. + +## Example: Manually Parsing Tool-Call Requests from Kimi-K2 Outputs + +First, launch a server using trtllm-serve: + +```bash +cat > ./extra_llm_api_options.yaml < +<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location": "shanghai"}<|tool_call_end|> +<|tool_calls_section_end|>user + +[The tool-call requests parsed from the output]: [{'id': 'functions.get_weather:0', 'type': 'function', 'function': {'name': 'get_weather', 'arguments': '{"location": "shanghai"}'}}] + +[Tool call result]: tool_name=get_weather, tool_result=Cloudy +``` + +The tool call works successfully: +- In `[The original output from Kimi-K2]`, the LLM selects the correct tool `get_weather` and provides the appropriate arguments. +- In `[The tool-call requests parsed from the output]`, the client parses the LLM response. +- In `[Tool call result]`, the client executes the tool function and gets the result. + +Let's try another query, "What's the weather like in beijing today?", using a predefined system prompt to specify the output format as shown below. + +```bash +python kimi_k2_tool_calling_example.py \ + --model "moonshotai/Kimi-K2-Instruct" \ + --prompt "What's the weather like in beijing today?" \ + --specify_output_format +``` + +The output would look like: + +```txt +[The original output from Kimi-K2]: [get_weather(location='beijing')]user + +[The tool-call requests parsed from the output]: [{'type': 'function', 'function': {'name': 'get_weather', 'arguments': {'location': 'beijing'}}}] + +[Tool call result]: tool_name=get_weather, tool_result=Sunny +``` +Once again, the tool call works successfully and the original output from Kimi-K2 is formatted.
+ +**Note that, without guided decoding or other deterministic tool adapters, K2 sometimes deviates from the specified output format. Because TensorRT-LLM does not support K2 with guided decoding for now, you have to parse the tool calls carefully from the raw model output to ensure they meet the required format.** diff --git a/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py b/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py new file mode 100644 index 00000000000..28505477041 --- /dev/null +++ b/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py @@ -0,0 +1,201 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import ast +import json +import re + +from openai import OpenAI + +SPECIFY_OUTPUT_FORMAT_PROMPT = """You are an AI assistant with the role name "assistant." \ +Based on the provided API specifications and conversation history from steps 1 to t, \ +generate the API requests that the assistant should call in step t+1. \ +The API requests should be output in the format [api_name(key1='value1', key2='value2', ...)], \ +replacing api_name with the actual API name, key1, key2, etc., with the actual parameter names, \ +and value1, value2, etc., with the actual parameter values. The output should start with a square bracket "[" and end with a square bracket "]". 
+If there are multiple API requests, separate them with commas, for example: \ +[api_name(key1='value1', key2='value2', ...), api_name(key1='value1', key2='value2', ...), ...]. \ +Do not include any other explanations, prompts, or API call results in the output. +If the API parameter description does not specify otherwise, the parameter is optional \ +(parameters mentioned in the user input need to be included in the output; if not mentioned, they do not need to be included). +If the API parameter description does not specify the required format for the value, use the user's original text for the parameter value. \ +If the API requires no parameters, output the API request directly in the format [api_name()], and do not invent any nonexistent parameter names. + +API Specifications: +{tools}""" + +NOT_SPECIFY_OUTPUT_FORMAT_PROMPT = """Important: Only give the tool call requests, \ +do not include any other explanations, prompts, or API call results in the output. +The tool call requests generated by you are wrapped by \ +<|tool_calls_section_begin|> and <|tool_calls_section_end|>, with each tool call wrapped by <|tool_call_begin|> and <|tool_call_end|>. \ +The tool ID and arguments are separated by <|tool_call_argument_begin|>. The format of the tool ID is functions.func_name:idx, \ +from which we can parse the function name. 
+ +API Specifications: +{tools}""" + + +def get_weather(location: str): + if location.lower() == "beijing": + return "Sunny" + elif location.lower() == "shanghai": + return "Cloudy" + else: + return "Rainy" + + +# Tool name->object mapping for easy calling later +tool_map = {"get_weather": get_weather} + + +# ref: https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md +def extract_tool_call_info(tool_call_rsp: str): + if '<|tool_calls_section_begin|>' not in tool_call_rsp: + # No tool calls + return [] + pattern = r"<\|tool_calls_section_begin\|>(.*?)<\|tool_calls_section_end\|>" + + tool_calls_sections = re.findall(pattern, tool_call_rsp, re.DOTALL) + + # Extract multiple tool calls + func_call_pattern = r"<\|tool_call_begin\|>\s*(?P[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P.*?)\s*<\|tool_call_end\|>" + tool_calls = [] + for match in re.findall(func_call_pattern, tool_calls_sections[0], + re.DOTALL): + function_id, function_args = match + # function_id: functions.get_weather:0 + function_name = function_id.split('.')[1].split(':')[0] + tool_calls.append({ + "id": function_id, + "type": "function", + "function": { + "name": function_name, + "arguments": function_args + } + }) + return tool_calls + + +def parse_specified_format_tool_calls(text: str): + pattern = re.compile(r'(\w+)\s*\(([^)]*)\)') + tool_calls = [] + + for m in pattern.finditer(text): + api_name, kv_body = m.group(1), m.group(2) + + kv_pattern = re.compile(r'(\w+)\s*=\s*([^,]+)') + kwargs = {} + for k, v in kv_pattern.findall(kv_body): + try: + kwargs[k] = ast.literal_eval(v.strip()) + except Exception: + kwargs[k] = v.strip() + + tool_calls.append({ + "type": "function", + "function": { + "name": api_name, + "arguments": kwargs + } + }) + + return tool_calls + + +def get_tools(): + # Collect the tool descriptions in tools + return [{ + "type": "function", + "function": { + "name": "get_weather", + "description": + "Get weather information. 
Call this tool when the user needs to get weather information", + "parameters": { + "type": "object", + "required": ["location"], + "properties": { + "location": { + "type": "string", + "description": "Location name", + } + } + } + } + }] + + +def get_tool_call_requests(args, client): + model = args.model + tools = get_tools() + system_prompt = SPECIFY_OUTPUT_FORMAT_PROMPT if args.specify_output_format else NOT_SPECIFY_OUTPUT_FORMAT_PROMPT.format( + tools=tools) + messages = [{ + "role": "system", + "content": system_prompt + }, { + "role": "user", + "content": args.prompt + }] + + response = client.chat.completions.create(model=model, + messages=messages, + max_tokens=256, + temperature=0.0) + + output = response.choices[0].message.content + tool_calls = parse_specified_format_tool_calls( + output) if args.specify_output_format else extract_tool_call_info( + output) + print(f"[The original output from Kimi-K2]: {output}\n") + print(f"[The tool-call requests parsed from the output]: {tool_calls}\n") + return tool_calls, messages + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", + type=str, + default="moonshotai/Kimi-K2-Instruct") + parser.add_argument("--prompt", + type=str, + default="What's the weather like in Shanghai today?") + parser.add_argument("--specify_output_format", + action="store_true", + default=False) + + args = parser.parse_args() + + # start trt-llm server before running this script + client = OpenAI( + api_key="tensorrt_llm", + base_url="http://localhost:8000/v1", + ) + + tool_calls, messages = get_tool_call_requests(args, client) + + for tool_call in tool_calls: + tool_name = tool_call['function']['name'] + if args.specify_output_format: + tool_arguments = tool_call['function']['arguments'] + else: + tool_arguments = json.loads(tool_call['function']['arguments']) + tool_function = tool_map[tool_name] + tool_result = tool_function(**tool_arguments) + print( + f"[Tool call result]: 
tool_name={tool_name}, tool_result={tool_result}\n" + ) From c9f216fe5f490ac7f7b2a2111484e152ad580775 Mon Sep 17 00:00:00 2001 From: Bo Deng Date: Tue, 5 Aug 2025 07:57:47 +0000 Subject: [PATCH 08/15] [TRTLLM-6675][Infra] nixl doc and test completion - I Signed-off-by: Bo Deng --- .../serve/scripts/benchmark_serving.py | 2 +- .../defs/disaggregated/test_disaggregated.py | 202 +++++++++++++++++- .../test_lists/test-db/l0_dgx_h100.yml | 4 + 3 files changed, 204 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/serve/scripts/benchmark_serving.py b/tensorrt_llm/serve/scripts/benchmark_serving.py index 1aeb87554d9..303688f0017 100644 --- a/tensorrt_llm/serve/scripts/benchmark_serving.py +++ b/tensorrt_llm/serve/scripts/benchmark_serving.py @@ -581,7 +581,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, pt_records = convert_to_pytorch_benchmark_format( args=args, metrics={k: [results[k]] - for k in metrics}, + for k in metrics if k in results}, extra_info={ k: results[k] for k in results if k not in metrics and k not in ignored_metrics diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 2a961553905..c3819b6e1ad 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -14,18 +14,23 @@ # limitations under the License. 
import os +import re import subprocess import pytest -from defs.conftest import skip_arm, skip_no_hopper -from defs.trt_test_alternative import check_call, popen +import yaml +from defs.conftest import llm_models_root, skip_arm, skip_no_hopper +from defs.trt_test_alternative import check_call, check_output, popen from tensorrt_llm.logger import logger def cleanup_output_files(): """Clean up output files from previous runs.""" - for file in ['output.json', 'output_streaming.json']: + for file in [ + 'output.json', 'output_streaming.json', 'ucx_config.yaml', + 'nixl_config.yaml' + ]: try: os.remove(file) except FileNotFoundError: @@ -1051,3 +1056,194 @@ def test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp( "deepseek_v3_lite_fp8_tp1_two_mtp", env=llm_venv._new_env, cwd=llm_venv.get_working_directory()) + + +@pytest.fixture(scope="module") +def benchmark_root(): + llm_root = os.getenv("LLM_ROOT") + return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts") + + +@pytest.fixture(scope="module") +def shared_gpt_path(): + DEFAULT_LLM_MODEL_ROOT = os.path.join("/scratch.trt_llm_data", "llm-models") + LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", DEFAULT_LLM_MODEL_ROOT) + return os.path.join(LLM_MODELS_ROOT, "datasets", + "ShareGPT_V3_unfiltered_cleaned_split.json") + + +@pytest.fixture(scope="function") +def benchmark_model_root(request): + models_root = llm_models_root() + if (request.param == "DeepSeek-V3-Lite-fp8"): + model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "fp8") + elif (request.param == "DeepSeek-V3-Lite-bf16"): + model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "bf16") + elif request.param == "llama-v3-8b-hf": + model_path = os.path.join(models_root, "llama-models-v3", "8B") + elif request.param == "llama-3.1-8b-instruct-hf-fp8": + model_path = os.path.join(models_root, "llama-3.1-model", + "Llama-3.1-8B-Instruct-FP8") + else: + raise ValueError(f"Failed to find the model: {request.param}") + return model_path + + +def 
run_disaggregated_benchmark(example_dir, + config_file, + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=None, + cwd=None): + """Run disaggregated test with given configuration.""" + run_env = env.copy() + run_env["UCX_TLS"] = "^ib" + num_rank = 2 + workers_cmd = [ + 'mpirun', '--allow-run-as-root', '--oversubscribe', '-n', + str(num_rank), 'trtllm-serve', 'disaggregated_mpi_worker', '-c', + config_file + ] + + server_start_timeout = 900 + server_cmd = [ + 'trtllm-serve', 'disaggregated', '--server_start_timeout', + str(server_start_timeout), '-c', config_file + ] + try: + with ( # Start workers + open('output_workers.log', 'w') as output_workers, + popen(workers_cmd, + stdout=output_workers, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as workers_proc, + # Start server + open('output_disagg.log', 'w') as output_disagg, + popen(server_cmd, + stdout=output_disagg, + stderr=subprocess.STDOUT, + env=run_env, + cwd=cwd) as server_proc): + # Ensure the sever has started + client_dir = f"{example_dir}/clients" + client_cmd = [ + 'python3', f'{client_dir}/disagg_client.py', '-c', + f'{example_dir}/disagg_config.yaml', '-p', + f'{client_dir}/prompts.json', '--ignore-eos', + '--server-start-timeout', + str(server_start_timeout) + ] + # Warm up + check_call(client_cmd, + env=env, + poll_procs=[workers_proc, server_proc]) + # Start Benchmark + benchmark_script = os.path.join(benchmark_root, + "benchmark_serving.py") + benchmark_cmd = [ + 'python3', + benchmark_script, + '--model', + benchmark_model_root, + '--tokenizer', + benchmark_model_root, + '--dataset-name', + 'random', + '--dataset-path', + shared_gpt_path, + '--random-input-len', + '256', + '--random-output-len', + '64', + '--random-prefix-len', + '0', + '--num-prompts', + '320', + '--max-concurrency', + '32', + '--host', + 'localhost', + '--port', + '8000', + '--ignore-eos', + '--no-test-input', + '--percentile-metrics', + 'e2el,ttft', + ] + # warm up + check_call(benchmark_cmd, env=env) + 
output = check_output(benchmark_cmd, env=env) + e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)" + ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)" + e2el_match = re.search(e2el_pattern, output) + ttft_match = re.search(ttft_pattern, output) + if e2el_match and ttft_match: + median_e2el = float(e2el_match.group(1)) + median_ttft = float(ttft_match.group(1)) + return median_e2el, median_ttft + else: + raise ValueError("No benchmark result found") + + except Exception: + # Print outputs on error + logger.error("-------- Workers output --------") + with open('output_workers.log', 'r') as f: + logger.error(f.read()) + + logger.error("-------- Disagg server output --------") + with open('output_disagg.log', 'r') as f: + logger.error(f.read()) + raise + finally: + server_proc.terminate() + workers_proc.terminate() + server_proc.wait() + workers_proc.wait() + + +@pytest.mark.parametrize("benchmark_model_root", [ + 'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf', + 'llama-3.1-8b-instruct-hf-fp8' +], + indirect=True) +def test_disaggregated_benchmark_on_diff_backends( + disaggregated_test_root, disaggregated_example_root, llm_venv, + benchmark_model_root, benchmark_root, shared_gpt_path): + base_config_path = os.path.join(os.path.dirname(__file__), "test_configs", + "disagg_config_for_benchmark.yaml") + with open(base_config_path, 'r', encoding='utf-8') as f: + config = yaml.load(f, Loader=yaml.SafeLoader) + config["model"] = benchmark_model_root + with open("ucx_config.yaml", 'w', encoding='utf-8') as ucx_config: + yaml.dump(config, ucx_config) + config["context_servers"]["cache_transceiver_config"][ + "backend"] = "nixl" + config["generation_servers"]["cache_transceiver_config"][ + "backend"] = "nixl" + with open("nixl_config.yaml", 'w', encoding='utf-8') as nixl_config: + yaml.dump(config, nixl_config) + + env = llm_venv._new_env.copy() + nixl_e2el, nixl_ttft = run_disaggregated_benchmark( + disaggregated_example_root, + 
f"{os.path.dirname(__file__)}/nixl_config.yaml", + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=env, + cwd=llm_venv.get_working_directory()) + ucx_e2el, ucx_ttft = run_disaggregated_benchmark( + disaggregated_example_root, + f"{os.path.dirname(__file__)}/ucx_config.yaml", + benchmark_root, + benchmark_model_root, + shared_gpt_path, + env=env, + cwd=llm_venv.get_working_directory()) + print(f"Nixl E2EL: {nixl_e2el} ms, UCX E2EL: {ucx_e2el} ms") + print(f"Nixl TTFT: {nixl_ttft} ms, UCX TTFT: {ucx_ttft} ms") + + assert ucx_e2el > 0 and nixl_e2el > 0 and nixl_e2el < 1.05 * ucx_e2el + assert ucx_ttft > 0 and nixl_ttft > 0 and nixl_ttft < 1.05 * ucx_ttft diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 29d8efac07f..4160efb6529 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -110,6 +110,10 @@ l0_dgx_h100: - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8] - 
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] From 0a6143d69eb2ce45d23d74ad45d55b64ddffe120 Mon Sep 17 00:00:00 2001 From: Bo Deng Date: Tue, 5 Aug 2025 08:18:10 +0000 Subject: [PATCH 09/15] [TRTLLM-6675][Infra] nixl doc and test completion - II Signed-off-by: Bo Deng --- .../accuracy/test_disaggregated_serving.py | 50 +++++++++++++++++++ .../test_lists/qa/llm_function_full.txt | 2 + .../test_lists/qa/llm_function_sanity.txt | 2 + .../test_lists/test-db/l0_dgx_b200.yml | 7 +++ .../test_lists/test-db/l0_dgx_h100.yml | 2 + 5 files changed, 63 insertions(+) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index e0801302eba..a276f172e7a 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -281,6 +281,30 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct" + def test_nixl_backend(self, backend): + ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}} + gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}} + disaggregated_server_config = { + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "urls": ["localhost:8002"] + } + } + with launch_disaggregated_llm(disaggregated_server_config, + ctx_server_config, gen_server_config, + self.MODEL_PATH) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + @pytest.mark.skip_less_device_memory(32000) @pytest.mark.parametrize("disable_overlap_scheduler", [False, True]) def test_auto_dtype(self, disable_overlap_scheduler): @@ 
-565,6 +589,32 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite" MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16" + def test_nixl_backend(self, backend): + ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}} + gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}} + disaggregated_server_config = { + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "urls": ["localhost:8002"] + } + } + with launch_disaggregated_llm(disaggregated_server_config, + ctx_server_config, + gen_server_config, + self.MODEL_PATH, + tensor_parallel_size=4) as llm: + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + @parametrize_with_ids("overlap_scheduler", [True, False]) @parametrize_with_ids("mtp_nextn", [0, pytest.param(2, marks=skip_pre_hopper)]) diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index 037bb0c3d29..da6bac0e419 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -573,6 +573,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-] test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt 
b/tests/integration/test_lists/qa/llm_function_sanity.txt index ae7008f815d..488336a77ab 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -110,6 +110,8 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 29b83ac0778..7c8c92fa186 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -79,3 +79,10 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4] + - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] + - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] + - 
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] + - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 4160efb6529..c4bbca390ef 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -52,6 +52,8 @@ l0_dgx_h100: - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] + - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend - test_e2e.py::test_ptp_quickstart_advanced_bs1 - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8] - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism From dc60530f1c3a9389214d6fe275e88a659eaa8c80 Mon Sep 17 00:00:00 2001 From: Bo Deng Date: Tue, 5 Aug 2025 08:41:25 +0000 Subject: [PATCH 10/15] fix typo Signed-off-by: Bo Deng --- tests/integration/defs/accuracy/test_disaggregated_serving.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index a276f172e7a..fd03bbb036f 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -281,7 +281,7 @@ 
class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct" - def test_nixl_backend(self, backend): + def test_nixl_backend(self): ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}} gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}} disaggregated_server_config = { @@ -589,7 +589,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite" MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16" - def test_nixl_backend(self, backend): + def test_nixl_backend(self): ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}} gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}} disaggregated_server_config = { From 148e3af7b1d8140535c8d87e6a14af91641f35ce Mon Sep 17 00:00:00 2001 From: Bo Deng Date: Wed, 6 Aug 2025 00:32:24 +0000 Subject: [PATCH 11/15] add missing config file Signed-off-by: Bo Deng --- .../disagg_config_for_benchmark.yaml | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml new file mode 100644 index 00000000000..be2ced4b463 --- /dev/null +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml @@ -0,0 +1,29 @@ +model: DeepSeek-V3-Lite/fp8 +hostname: localhost +port: 8000 +backend: "pytorch" +context_servers: + num_instances: 1 + max_batch_size: 2 + max_num_tokens: 384 + max_seq_len: 320 + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + disable_overlap_scheduler: true + cache_transceiver_config: + backend: ucx + max_tokens_in_buffer: 512 + urls: + - "localhost:8001" +generation_servers: + num_instances: 1 + 
tensor_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 2 + max_num_tokens: 384 + max_seq_len: 320 + cache_transceiver_config: + backend: ucx + max_tokens_in_buffer: 512 + urls: + - "localhost:8002" From 0610d8cad47ee0bf8ff6acb7f70ef702c9a7742b Mon Sep 17 00:00:00 2001 From: Bo Deng Date: Wed, 6 Aug 2025 09:58:39 +0000 Subject: [PATCH 12/15] fix accuracy tests Signed-off-by: Bo Deng --- .../accuracy/test_disaggregated_serving.py | 80 ++++++++++++------- .../test_lists/qa/llm_function_full.txt | 2 +- .../test_lists/qa/llm_function_sanity.txt | 2 +- .../test_lists/test-db/l0_dgx_b200.yml | 2 +- .../test_lists/test-db/l0_dgx_h100.yml | 2 +- 5 files changed, 54 insertions(+), 34 deletions(-) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index fd03bbb036f..08a96916ff3 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -281,30 +281,6 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct" - def test_nixl_backend(self): - ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}} - gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}} - disaggregated_server_config = { - "hostname": "localhost", - "port": 8000, - "backend": "pytorch", - "context_servers": { - "num_instances": 1, - "urls": ["localhost:8001"] - }, - "generation_servers": { - "num_instances": 1, - "urls": ["localhost:8002"] - } - } - with launch_disaggregated_llm(disaggregated_server_config, - ctx_server_config, gen_server_config, - self.MODEL_PATH) as llm: - task = MMLU(self.MODEL_NAME) - task.evaluate(llm) - task = GSM8K(self.MODEL_NAME) - task.evaluate(llm) - @pytest.mark.skip_less_device_memory(32000) @pytest.mark.parametrize("disable_overlap_scheduler", 
[False, True]) def test_auto_dtype(self, disable_overlap_scheduler): @@ -590,8 +566,18 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16" def test_nixl_backend(self): - ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}} - gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}} + ctx_server_config = { + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": "nixl" + } + } + gen_server_config = { + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": "nixl" + } + } disaggregated_server_config = { "hostname": "localhost", "port": 8000, @@ -606,10 +592,8 @@ def test_nixl_backend(self): } } with launch_disaggregated_llm(disaggregated_server_config, - ctx_server_config, - gen_server_config, - self.MODEL_PATH, - tensor_parallel_size=4) as llm: + ctx_server_config, gen_server_config, + self.MODEL_PATH) as llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) task = GSM8K(self.MODEL_NAME) @@ -716,6 +700,42 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen3/Qwen3-8B" MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8" + def test_nixl_backend(self): + ctx_server_config = { + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": "nixl" + } + } + gen_server_config = { + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": "nixl" + } + } + ctx_server_config["cache_transceiver_config"] + ctx_server_config["cache_transceiver_config"] + disaggregated_server_config = { + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "urls": ["localhost:8002"] + } + } + with launch_disaggregated_llm(disaggregated_server_config, + ctx_server_config, gen_server_config, + self.MODEL_PATH) as llm: + task = MMLU(self.MODEL_NAME) + 
task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + @pytest.mark.parametrize("overlap_scheduler", [False, True]) def test_auto_dtype(self, overlap_scheduler): ctx_server_config = { diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index da6bac0e419..4a1f44dbb29 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -573,7 +573,7 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index 488336a77ab..42ec8d21a91 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -110,7 +110,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend 
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 7c8c92fa186..ca23535a199 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -84,5 +84,5 @@ l0_dgx_b200: - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] - - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend + - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index c4bbca390ef..798353ddc02 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -52,7 +52,7 @@ l0_dgx_h100: - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] - - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend + - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend - 
test_e2e.py::test_ptp_quickstart_advanced_bs1 - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8] From 57c7e4f0ccdf8f095308291a8f4d7ccc44cb4f51 Mon Sep 17 00:00:00 2001 From: Bo Deng Date: Wed, 6 Aug 2025 20:05:19 -0700 Subject: [PATCH 13/15] fix some tests Signed-off-by: Bo Deng --- .../disagg_config_for_benchmark.yaml | 29 --------- .../defs/disaggregated/test_disaggregated.py | 64 ++++++++++++++----- 2 files changed, 49 insertions(+), 44 deletions(-) delete mode 100644 tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml deleted file mode 100644 index be2ced4b463..00000000000 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml +++ /dev/null @@ -1,29 +0,0 @@ -model: DeepSeek-V3-Lite/fp8 -hostname: localhost -port: 8000 -backend: "pytorch" -context_servers: - num_instances: 1 - max_batch_size: 2 - max_num_tokens: 384 - max_seq_len: 320 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - disable_overlap_scheduler: true - cache_transceiver_config: - backend: ucx - max_tokens_in_buffer: 512 - urls: - - "localhost:8001" -generation_servers: - num_instances: 1 - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 384 - max_seq_len: 320 - cache_transceiver_config: - backend: ucx - max_tokens_in_buffer: 512 - urls: - - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index c3819b6e1ad..9c95defe0ab 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -16,6 +16,7 @@ import os import re import subprocess +import tempfile import pytest 
import yaml @@ -1203,6 +1204,43 @@ def run_disaggregated_benchmark(example_dir, workers_proc.wait() +def get_config_for_benchmark(model_root, backend): + serve_config = { + "model": model_root, + "hostname": "localhost", + "port": 8000, + "backend": "pytorch", + "context_servers": { + "num_instances": 1, + "max_batch_size": 2, + "max_num_tokens": 384, + "max_seq_len": 320, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "disable_overlap_scheduler": True, + "cache_transceiver_config": { + "backend": backend, + "max_tokens_in_buffer": 512, + }, + "urls": ["localhost:8001"] + }, + "generation_servers": { + "num_instances": 1, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "max_batch_size": 2, + "max_num_tokens": 384, + "max_seq_len": 320, + "cache_transceiver_config": { + "backend": backend, + "max_tokens_in_buffer": 512, + }, + "urls": ["localhost:8002"] + } + } + return serve_config + + @pytest.mark.parametrize("benchmark_model_root", [ 'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf', 'llama-3.1-8b-instruct-hf-fp8' @@ -1211,24 +1249,20 @@ def run_disaggregated_benchmark(example_dir, def test_disaggregated_benchmark_on_diff_backends( disaggregated_test_root, disaggregated_example_root, llm_venv, benchmark_model_root, benchmark_root, shared_gpt_path): - base_config_path = os.path.join(os.path.dirname(__file__), "test_configs", - "disagg_config_for_benchmark.yaml") - with open(base_config_path, 'r', encoding='utf-8') as f: - config = yaml.load(f, Loader=yaml.SafeLoader) - config["model"] = benchmark_model_root - with open("ucx_config.yaml", 'w', encoding='utf-8') as ucx_config: - yaml.dump(config, ucx_config) - config["context_servers"]["cache_transceiver_config"][ - "backend"] = "nixl" - config["generation_servers"]["cache_transceiver_config"][ - "backend"] = "nixl" - with open("nixl_config.yaml", 'w', encoding='utf-8') as nixl_config: - yaml.dump(config, nixl_config) + nixl_config = 
get_config_for_benchmark(benchmark_model_root, "nixl") + ucx_config = get_config_for_benchmark(benchmark_model_root, "ucx") + temp_dir = tempfile.TemporaryDirectory() + nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml") + ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml") + with open(nixl_config_path, 'w', encoding='utf-8') as f: + yaml.dump(nixl_config, f) + with open(ucx_config_path, 'w', encoding='utf-8') as f: + yaml.dump(ucx_config, f) env = llm_venv._new_env.copy() nixl_e2el, nixl_ttft = run_disaggregated_benchmark( disaggregated_example_root, - f"{os.path.dirname(__file__)}/nixl_config.yaml", + nixl_config_path, benchmark_root, benchmark_model_root, shared_gpt_path, @@ -1236,7 +1270,7 @@ def test_disaggregated_benchmark_on_diff_backends( cwd=llm_venv.get_working_directory()) ucx_e2el, ucx_ttft = run_disaggregated_benchmark( disaggregated_example_root, - f"{os.path.dirname(__file__)}/ucx_config.yaml", + ucx_config_path, benchmark_root, benchmark_model_root, shared_gpt_path, From fb8c19da325d974152a9ee0d4c1630c391df063d Mon Sep 17 00:00:00 2001 From: Bo Deng Date: Wed, 6 Aug 2025 20:11:39 -0700 Subject: [PATCH 14/15] clean codes Signed-off-by: Bo Deng --- tests/integration/defs/disaggregated/test_disaggregated.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 9c95defe0ab..fa150ea26e7 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -28,10 +28,7 @@ def cleanup_output_files(): """Clean up output files from previous runs.""" - for file in [ - 'output.json', 'output_streaming.json', 'ucx_config.yaml', - 'nixl_config.yaml' - ]: + for file in ['output.json', 'output_streaming.json']: try: os.remove(file) except FileNotFoundError: From a87a577ae382e715fed79cf291d190a1cb263356 Mon Sep 17 00:00:00 2001 
From: Bo Deng Date: Sun, 10 Aug 2025 21:45:45 -0700 Subject: [PATCH 15/15] adjust tests Signed-off-by: Bo Deng --- tests/integration/defs/accuracy/test_disaggregated_serving.py | 2 -- tests/integration/defs/disaggregated/test_disaggregated.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 08a96916ff3..98432a3aab8 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -713,8 +713,6 @@ def test_nixl_backend(self): "backend": "nixl" } } - ctx_server_config["cache_transceiver_config"] - ctx_server_config["cache_transceiver_config"] disaggregated_server_config = { "hostname": "localhost", "port": 8000, diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index fa150ea26e7..c193a358197 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -1211,7 +1211,7 @@ def get_config_for_benchmark(model_root, backend): "num_instances": 1, "max_batch_size": 2, "max_num_tokens": 384, - "max_seq_len": 320, + "max_seq_len": 384, "tensor_parallel_size": 1, "pipeline_parallel_size": 1, "disable_overlap_scheduler": True, @@ -1227,7 +1227,7 @@ def get_config_for_benchmark(model_root, backend): "pipeline_parallel_size": 1, "max_batch_size": 2, "max_num_tokens": 384, - "max_seq_len": 320, + "max_seq_len": 384, "cache_transceiver_config": { "backend": backend, "max_tokens_in_buffer": 512,