diff --git a/requirements.txt b/requirements.txt
index 475490aa87e..d5a3a8ecb96 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -63,3 +63,4 @@ llguidance==0.7.29
 soundfile
 deep_gemm @ git+https://github.com/zongfeijing/DeepGEMM.git@a9d538ef4dff0326fe521c6ca0bfde115703b56a
 triton==3.3.1; platform_machine == "x86_64"
+blobfile
diff --git a/tensorrt_llm/llmapi/tokenizer.py b/tensorrt_llm/llmapi/tokenizer.py
index 858f98289cd..6e5f7bbcee0 100644
--- a/tensorrt_llm/llmapi/tokenizer.py
+++ b/tensorrt_llm/llmapi/tokenizer.py
@@ -160,7 +160,8 @@ def decode_incrementally(
         # HF incremental detokenization implementation is faster than TRTLLM when stream_interval is smaller.
         if (TLLM_INCREMENTAL_DETOKENIZATION_BACKEND == "TRTLLM"
                 or stream_interval >= TLLM_STREAM_INTERVAL_THRESHOLD
-                or spaces_between_special_tokens is False):
+                or spaces_between_special_tokens is False
+                or not hasattr(self.tokenizer, "_tokenizer")):
             return self.trtllm_decode_incrementally(
                 token_ids,
                 prev_text,
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 353a0f71404..f69f02eaeb5 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -99,6 +99,9 @@ Qwen3/Qwen3-235B-A22B:
     quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 85.78
+moonshotai/Kimi-K2-Instruct:
+  - quant_algo: FP8_BLOCK_SCALES
+    accuracy: 94.84
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index b4b76a8161b..485ad7c0295 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -187,6 +187,9 @@ Qwen3/Qwen3-235B-A22B:
     quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 86
+moonshotai/Kimi-K2-Instruct:
+  - quant_algo: FP8_BLOCK_SCALES
+    accuracy: 87.65
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 79.43
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index a7a3ecd57e0..b4adb178498 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1638,6 +1638,49 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         task.evaluate(llm)
 
 
+@pytest.mark.timeout(7200)
+@pytest.mark.skip_less_device_memory(100000)
+class TestKimiK2(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "moonshotai/Kimi-K2-Instruct"
+    MODEL_PATH = f"{llm_models_root()}/Kimi-K2-Instruct"
+
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @skip_pre_hopper
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size",
+        [(8, 1, 8, False, False, True, True, 16)],
+        ids=["latency"])
+    def test_fp8_blockscale(self, tp_size, pp_size, ep_size, fp8kv,
+                            attention_dp, cuda_graph, overlap_scheduler,
+                            max_batch_size):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+        )
+
+        if fp8kv:
+            kv_cache_config.dtype = "fp8"
+
+        mtp_config = None
+        with LLM(f"{llm_models_root()}/Kimi-K2-Instruct",
+                 max_batch_size=max_batch_size,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 trust_remote_code=True,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 speculative_config=mtp_config) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestMinitron4BBaseInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-Mini-4B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/nemotron/nemotron-mini-4b-instruct_vfp8-fp8-bf16-export"
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index 717bf2579df..16606a07958 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -47,6 +47,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_ep
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]