diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py
index dfe84bdecc5..3472c064f69 100644
--- a/tensorrt_llm/_torch/modules/attention.py
+++ b/tensorrt_llm/_torch/modules/attention.py
@@ -2060,9 +2060,10 @@ def forward_sparse_mla_kvcache_bf16(

         # [seq, num_heads, kv_lora_rank], account for padding
         attn_out_latent = attn_out_latent[:, :self.num_heads_tp, :]
-        # TODO: seems we need .contiguous() here when padding enabled before pass to bmm?
         attn_out_latent = attn_out_latent.view(
             [-1, self.num_heads_tp, self.kv_lora_rank])
+        if self.num_heads_tp != padding:
+            attn_out_latent = attn_out_latent.contiguous()

         assert (attn_out_latent.shape[0] == q.shape[0]
                 and attn_out_latent.shape[1] == self.num_heads_tp)
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index e4895ee9e0f..a9482116313 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2409,8 +2409,12 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness):
             (8, 1, 8, 1, False, True, True, True, 24, "_DEFAULT"),
             (8, 1, 8, 0, True, True, True, True, 24, "_DEFAULT"),
             (8, 1, 8, 3, False, False, True, True, 1, "TRTLLM"),
+            (8, 1, 8, 3, False, False, True, True, 1, "_DEFAULT"),
         ],
-        ids=["baseline", "baseline_mtp1", "baseline_fp8kv", "latency"])
+        ids=[
+            "baseline", "baseline_mtp1", "baseline_fp8kv", "latency",
+            "latency_default"
+        ])
     def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
                             max_batch_size, moe_backend):
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h200.yml b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
index efacea5814d..aea8b70d6f3 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -18,7 +18,7 @@ l0_dgx_h200:
 # - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] # OOM
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] # 1h
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline]
-- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency]
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency_default]
 - accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
 - accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]
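
Note: the attention.py hunk resolves the old TODO by making the copy explicit whenever head padding was applied. Below is a minimal standalone sketch of the underlying pitfall: slicing the head dimension of a padded tensor leaves the original strides in place, so the result is non-contiguous until `.contiguous()` materializes a dense copy. The shapes, the `proj` weight, and the `torch.bmm` stand-in are illustrative assumptions, not the actual TensorRT-LLM call sites.

# Hypothetical shapes, not values from the patched module.
import torch

seq_len, num_heads_tp, padding, kv_lora_rank = 4, 6, 8, 16

# The kernel emits output padded to `padding` heads:
# [seq, padded_heads, kv_lora_rank].
attn_out_latent = torch.randn(seq_len, padding, kv_lora_rank)

# Dropping the padded heads slices the middle dimension; the row stride
# stays at padded width, so the view is no longer contiguous in memory.
attn_out_latent = attn_out_latent[:, :num_heads_tp, :]
assert not attn_out_latent.is_contiguous()

# Same guard as the patch: materialize a dense copy only when padding was
# actually applied, so the unpadded fast path pays no extra memcpy.
if num_heads_tp != padding:
    attn_out_latent = attn_out_latent.contiguous()

# A downstream batched matmul now sees a densely packed layout.
proj = torch.randn(seq_len, kv_lora_rank, kv_lora_rank)
out = torch.bmm(attn_out_latent, proj)
print(out.shape)  # torch.Size([4, 6, 16])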