Closed — 556 commits (the diff below shows changes from a single commit)
194a708
[fix] Fix test_attention_mla (#5084)
jinyangyuan-nvidia Jun 10, 2025
6cb2b7d
CI: Allow run (#5101)
IzzyPutterman Jun 10, 2025
fcd7192
[fix] Unwaive test_llama_eagle3 (#5042)
mikeiovine Jun 10, 2025
1b79041
fix: XQA is not enabled when history_length < kMinHistoryTokensPerBlo…
bobboli Jun 11, 2025
580a925
test: conditional disagg and cache aware balancing for deepseek v3 (#…
zhengd-nv Jun 11, 2025
273c6b9
[https://nvbugspro.nvidia.com/bug/5332927][fix] Fix the bug in the ro…
ChristinaZ Jun 11, 2025
035b048
infra: Add timeout and retry for wget in docker image build (#5035)
ZhanruiSunCh Jun 11, 2025
0a9f105
Waive L0 tests (#5111)
yiqingy0 Jun 11, 2025
00991d1
chore: Merge remaining changes from feat/large-ep branch to main (#5039)
syuoni Jun 11, 2025
fdf1c47
[TRTLLM-4995][feat] TRTLLM Sampler log probs support (#4836)
dcampora Jun 11, 2025
e2863a3
chore: bump version to 0.21.0rc2 (#5112)
ZhanruiSunCh Jun 11, 2025
56abae0
test: add more llama_v3.3_70b cases in perf test (#4979)
ruodil Jun 11, 2025
8282d6c
[fix] Fix llama4 min latency (#5117)
liji-nv Jun 11, 2025
a90dd57
[TRTLLM-5082] - Add a bot run option for detailed logs (#4390)
yiqingy0 Jun 11, 2025
11b94fe
test: skip disaggregated tests on arm (#5070)
xinhe-nv Jun 11, 2025
ddfe4fc
[chore] 2025-06-10 update allowlist (#5102)
tburt-nv Jun 11, 2025
ad99a08
[TRTLLM-5581][infra] Update Module Owners (#5052)
poweiw Jun 12, 2025
ee44fa0
chore: rename IOFormatter to BaseCacheFormatter (#5068)
zhengd-nv Jun 12, 2025
c592798
fix: limit process pool size when prefetching (#5088)
zhengd-nv Jun 12, 2025
4319237
Use backend to replace macro to control enablement of MNNVL all reduc…
HuiGao-NV Jun 12, 2025
e692779
Solve underallocation in VSWA+/VGQA (#4667)
netanel-haber Jun 12, 2025
49d7268
[nvbugs/5331013] fix AutoDeploy for PyTorch 25.05 dependency upgrade …
lucaslie Jun 12, 2025
c3b2eb6
test(perf): Add remaining Llama-Nemotron perftests (nano, super, ultr…
venkywonka Jun 12, 2025
0daa709
Fix Llama-3_3-Nemotron-Super-49B-v1 FP8 accuracy threshold configs (#…
moraxu Jun 12, 2025
505678a
update the free_gpu_mem_fraction for H100 qwen3 qa test (#5114)
byshiue Jun 12, 2025
06d9f1e
[test] Use LLM API for Nemotron-H correctness test (#5097)
tomeras91 Jun 12, 2025
d021cc5
test: set enable_attention_dp to False for non-deepseek models and ad…
ruodil Jun 12, 2025
53983ad
[TRTLLM-4932] Add Llama-3.1-Nemotron-Nano-8B-v1-FP8 accuracy tests (#…
moraxu Jun 12, 2025
e462677
Fix logprobs issues. (#5136)
dcampora Jun 12, 2025
4d070d3
chore: fix typo in tests (#5092)
lfr-0531 Jun 12, 2025
10ab979
[fix] Do not reuse dummy request KVCache (#4804)
liji-nv Jun 12, 2025
a97f458
infra: upload imageTag info to artifactory and add ngc_staging to sav…
ZhanruiSunCh Jun 12, 2025
b563696
doc:fix invalid links for trtllm-serve doc (#5145)
nv-guomingz Jun 12, 2025
88cba5f
test: waive the NIXL related tests (#5153)
Shixiaowei02 Jun 12, 2025
59c9588
enh(doc): Add `ci-overview` in `docs/source/reference/` (#5137)
venkywonka Jun 12, 2025
22281cf
doc: Added documentation for enable_trtllm_sampler. (#4990)
dcampora Jun 12, 2025
58d4ca2
fix:remove duplicated trust_remote_code knob from trtllm-serve (#5143)
nv-guomingz Jun 12, 2025
cf35a07
fix:https://nvbugs/5298661 (#5022)
nv-guomingz Jun 12, 2025
8cfb567
fix: Updates to yarn implementation (#5105)
brb-nv Jun 12, 2025
dfeeaf6
Move allreduce_strategy from committed api to reference (#5147)
HuiGao-NV Jun 12, 2025
690873b
[nvbug/5334370][fix] Fix one model EAGLE3 (#5134)
mikeiovine Jun 12, 2025
655bce0
[fix][test] report individual unittests results to jenkins (#5116)
omera-nv Jun 12, 2025
3a04c9f
chore: Include prompt_token_ids only for context-only disagg requests…
pcastonguay Jun 12, 2025
cc2a134
None: fix OOM because of unnecessary mha workspace (#5056)
ttyio Jun 12, 2025
a0b6c63
[feat] trtllmGen MoE routing: added support for top groups and top K …
MatthiasKohl Jun 12, 2025
38a907a
[TRTLLM-5278][feat] Add attention dp support to MTP relaxed acceptanc…
lfr-0531 Jun 13, 2025
4ae46b6
fix: [nvbugs/5324229] Fix broken WInt4AFP8FusedMoEMethod since FusedM…
yuxianq Jun 13, 2025
a891013
[feat] Optimize KV Cache Reuse for MLA (#4869)
zhhuang-nv Jun 13, 2025
fa582cb
test: add more cases for rtx_pro_6000_se and add option kv_cache_dtyp…
ruodil Jun 13, 2025
d9be419
tests: update tests for b200 (#5180)
xinhe-nv Jun 13, 2025
b79eb34
[fix]: Fall back to HMAC to Avoid IPC Serialization Churn (#5074)
yibinl-nvidia Jun 13, 2025
dec326b
[fix] Reenable test return logits (#5160)
dcampora Jun 13, 2025
01bd4c0
Add two MTP disaggregated test (#4546)
Tabrizian Jun 13, 2025
28cd536
[test] Update timeout params in QA test list (#5124)
crazydemo Jun 13, 2025
4d0a5ad
chore: gracefully exit disagg process in tests; better startup and lo…
zhengd-nv Jun 13, 2025
514baf1
[fix] Fix comment to pass guardwords check (#5191)
MatthiasKohl Jun 13, 2025
12e075e
[nvbug 5333996 ][fix] Unload XQA cubins early to avoid static lifetim…
lowsfer Jun 13, 2025
30c5b41
refactoring: port customized kernels with public cutlass version (#5027)
yunruis Jun 13, 2025
b959618
refactor [BREAKING CHANGE]: remove the redundant use_kv_cache field …
nv-guomingz Jun 13, 2025
30d9d0f
test: [CI] Add failed cases into waives.txt (#5178)
xinhe-nv Jun 13, 2025
089be89
feat: Basic skeleton for Gemma3 VLM (#5108)
brb-nv Jun 13, 2025
e96d686
add doc for open-sourced cutlass kernels (#5194)
yunruis Jun 13, 2025
e5be3a9
fix: fix license bug (#5200)
yunruis Jun 13, 2025
8e99370
ucxx: only use ucp_feature_tag to avoid some issues on some platforms (…
chuangz0 Jun 13, 2025
952f33d
CI: move all test cases of TensorRT backend into post merge (#5186)
QiJune Jun 13, 2025
3d87770
[https://nvbugspro.nvidia.com/bug/5295470] support headDim 256 for bl…
PerkzZheng Jun 13, 2025
25aa388
[nvbug/5319281][fix] Stop drafting when we hit the draft model's max …
mikeiovine Jun 13, 2025
06342ff
[feat] Implement model-agnostic one-engine eagle3 (#4778)
nv-yilinf Jun 13, 2025
5f2785f
fix: Fix waive list (#5205)
syuoni Jun 13, 2025
82e280f
feat: add multi-node support for Triton with pytorch backend (#5172)
achartier Jun 13, 2025
97657bf
optimize memset before alltoall communication (#5188)
dongxuy04 Jun 14, 2025
3b7b5a5
refactor [BREAKING CHANGE]: enhance the llm args pytorch config part …
nv-guomingz Jun 14, 2025
b99c5ce
Feat/ds r1 min latency opt round3, add router gemm, fused a gemm, PDL…
yunruis Jun 14, 2025
443b2eb
refactor: Speculative decoding buffers (#5091)
Funatiq Jun 14, 2025
0b60da2
feat: large-scale EP(part 7: DeepEP integration) (#4792)
yuantailing Jun 14, 2025
dc52b67
linting(python): Enable ruff on more files (wave 1/N) (#5140)
2ez4bz Jun 14, 2025
1389f5a
feat: Add support for fp8 rowwise quantization (#4876)
achartier Jun 14, 2025
e055af1
chore: improve disagg test failure detection (#4738)
ixlmar Jun 14, 2025
6bce733
perf: avoid dynamic import overhead in is_llm_response with duck typi…
tongyuantongyu Jun 14, 2025
63bc62d
feat: Enable EPLB to existing MoE models (#5203)
syuoni Jun 15, 2025
dce1dcc
feat: Support post_proc for bench (#5122)
kaiyux Jun 15, 2025
159ffc5
fix: fix cuda graph max batch size for spec decoding cases. (#5076)
lfr-0531 Jun 15, 2025
4eade3a
[fix][test] Speedup Nemotron NAS unittests (#5202)
omera-nv Jun 15, 2025
5a01ba5
use cu for fmha_v2 (#4694)
qsang-nv Jun 15, 2025
39bba63
[TRTLLM-4983] feat: enable overlap scheduler between draft forwards (…
lfr-0531 Jun 15, 2025
109c426
Enable trtllm-bench to run LoRA and add basic e2e perf testing capabi…
amitz-nv Jun 15, 2025
c84e41f
fix: build_config in TorchLlmArgs and avoid arbitrary args (#4972)
Superjomn Jun 16, 2025
7a5e0fd
[fix] Fix Llama4 min-latency import error (#5209)
nv-yilinf Jun 16, 2025
babdd9c
test: Add json_mode_eval for guided decoding evaluation (#5179)
syuoni Jun 16, 2025
3d22f27
test: add more cases for llama_v3.3/3.1 70b fp8 and set enable_attent…
ruodil Jun 16, 2025
2848e01
test: add llama4 models for perf test (#5187)
ruodil Jun 16, 2025
9b616db
test: Add fixture to skip tests based on MPI world size (#5028)
yizhang-nv Jun 16, 2025
ef3fdc8
feat: Add w4a8_mxfp4_fp8 quantization recipe. (#4867)
Tracin Jun 16, 2025
0acf231
[Stress test] Add DeepSeek-R1 stress test (#5033)
Wanli-Jiang Jun 16, 2025
dda6416
refactor: Scheduling based on KV cache state (#4865)
Funatiq Jun 16, 2025
1d2b0d3
use file lock to avoid port conflict (#5123)
chuangz0 Jun 16, 2025
4f9fa9f
feat: MoE trtllm backend kernel update (#5183)
rosenrodt Jun 16, 2025
b6ca677
refactor: remove decoder request from decoder interface (#5129)
Funatiq Jun 16, 2025
8445416
Waive L0 tests (#5233)
yiqingy0 Jun 16, 2025
802f22c
test: [CI] Add failed cases into waives.txt (#5221)
xinhe-nv Jun 16, 2025
64b7f04
[test] split nemotron test cases from examples_test_list (#5238)
crazydemo Jun 16, 2025
03f1a6a
Update DeepSeek R1 perf numbers to latest release/0.20 results (#5235)
litaotju Jun 16, 2025
dd29063
[feat] Add llm args to tune python gc threshold (#5141)
nv-yilinf Jun 16, 2025
cea5dd1
[TRTLLM-5835][feat] Optimized Mamba2Mixer prefill (#5128)
tomeras91 Jun 16, 2025
e607768
Speculation: Draft Target in new FW (#4558)
IzzyPutterman Jun 16, 2025
5c18160
chore: Waive CI failure. (#5252)
SimengLiu-nv Jun 16, 2025
c53bc19
[infra] Make test_chunked_prefill faster (#5248)
mikeiovine Jun 16, 2025
a2e8ae1
Update internal cutlass commit. (#5228)
Tracin Jun 17, 2025
bb23483
test: add more pytorch cases in perf test (#5237)
ruodil Jun 17, 2025
546274d
fix ci (#5259)
QiJune Jun 17, 2025
a49ad79
test: [CI] remove closed bugs (#5218)
xinhe-nv Jun 17, 2025
4b82b8b
[TRTLLM-5330] perf: Optimize MoE supplementary kernels for large-scal…
syuoni Jun 17, 2025
134cb66
fix mla test (#5240)
qsang-nv Jun 17, 2025
6a6b9d2
doc: add document of benchmarking for Qwen3 (#5158)
byshiue Jun 17, 2025
faca19c
update setup.py for special cases (#5227)
qsang-nv Jun 17, 2025
517c1ec
move some test cases of TensorRT backend back (#5232)
QiJune Jun 17, 2025
498fadc
[feat] Add EAGLE3 support for Qwen3 (#5206)
nv-yilinf Jun 17, 2025
2ad8758
[TRTLLM-5786][https://nvbugspro.nvidia.com/bug/5310520][test] Add QA …
crazydemo Jun 17, 2025
ccd9adb
CI: move multi-gpu test cases of tensorrt backend to h200 (#5272)
QiJune Jun 17, 2025
dc3861b
refactor: Unify decoder test with e2e workflow (#5239)
Funatiq Jun 17, 2025
13eef64
[feat] Piecewise cuda graph support for MLA (#4467)
liji-nv Jun 17, 2025
8451a87
chore: Mass integration of release/0.20 (#5082)
amirkl94 Jun 17, 2025
44fb3c1
[TRTLLM-5770] feat: Integrate TRT-LLM Gen FP8 block scale MoE with Py…
DomBrown Jun 17, 2025
f4cdbfc
None - Some clean-ups for the automation pipeline (#5245)
chzblych Jun 17, 2025
f899c4d
Re-implement LlmResponse in Python to reduce host overhead of pybind …
QiJune Jun 17, 2025
5236bb9
delete cubins (#5274)
qsang-nv Jun 17, 2025
dcf18c4
infra[TRTLLM-5635] remove package stage in CI build (#5075)
niukuo Jun 17, 2025
ff32caf
[Infra] - Update dependencies with NGC PyTorch 25.05 and TRT 10.11 (#…
EmmaQiaoCh Jun 17, 2025
9bf69c9
[chore] Remove BaseDraftTokenManager (#5251)
mikeiovine Jun 17, 2025
2df9f87
[infra] Report CI authorization errors to PR (#5175)
tburt-nv Jun 17, 2025
7d55c38
Revert "[infra] Report CI authorization errors to PR" (#5298)
tburt-nv Jun 17, 2025
627062c
refactor: Update decoder buffer and logits management (#4450)
Funatiq Jun 18, 2025
e1e5f72
fix: only set _mpi_session if world_size is > 1 (#5253)
achartier Jun 18, 2025
855036d
update LlmRequest.is_dummy property (#5283)
QiJune Jun 18, 2025
41cfcaa
test: update qa test list (#5305)
crazydemo Jun 18, 2025
3c0fecb
CI: extend model weights load time for dsv3 in stress test. (#5275)
dominicshanshan Jun 18, 2025
f501ce5
[fix][test] move deepseek single gpu tests to post merge (#5280)
omera-nv Jun 18, 2025
8f67e36
Waive L0 tests (#5308)
yiqingy0 Jun 18, 2025
e44f768
feat: Add no_kv_cache_reuse option and streaming support for trtllm s…
yizhang-nv Jun 18, 2025
724e495
chore: partition LLM class into TorchLLM and TrtLLM (#4900)
Superjomn Jun 18, 2025
908463a
[feat]: improve performance of XQA-MLA for sm120 (#5087)
lowsfer Jun 18, 2025
ee26965
doc:update contributing md for internal developers (#5250)
nv-guomingz Jun 18, 2025
3b5d916
test: cherry-pick deepseek rcca cases in main branch (#5307)
ruodil Jun 18, 2025
6711ad9
[TRTLLM-5589] feat: Minor optimizations for tunable FP8 batched GEMM …
hyukn Jun 18, 2025
9ea7bb6
CI: fix TensorRT H200 tests (#5301)
QiJune Jun 18, 2025
3a02489
[TRTLLM-5758] test: Add Bielik-11B-v2.2 Model Support (#5159)
Wanli-Jiang Jun 18, 2025
d76bda7
chore: Refine printed info of CHECK_TYPE. (#5295)
bobboli Jun 18, 2025
38547b9
refactor: Introduce ResourceManagerType enum for resource management …
Funatiq Jun 18, 2025
516bd4d
chore: bump version to 0.21.0rc3 (#5309)
ZhanruiSunCh Jun 18, 2025
f599ee6
test: correct unittest rerun behavior (#5273)
tongyuantongyu Jun 18, 2025
a3a4841
Fix rerun step (#5319)
yiqingy0 Jun 18, 2025
375dd0b
Waive L0 (#5311)
yizhang-nv Jun 18, 2025
610a49f
tests: add multi nodes tests (#5196)
xinhe-nv Jun 18, 2025
0623ffe
feat: Add LLGuidance Support for PyTorch Backend (#5214)
jellysnack Jun 18, 2025
b29ac5b
[Infra] Update 5080 and 5090 case condition due to the driver update …
EmmaQiaoCh Jun 18, 2025
00bdd39
chore: Update README.md to expose meet-up info (#5329)
juney-nvidia Jun 18, 2025
d13d2f4
Remove duplicated test cases (#5323)
HuiGao-NV Jun 18, 2025
857108a
Add disagg slurm scripts (#5243)
qiaoxj07 Jun 18, 2025
e5ee5c5
Unwaive disaggregated serving accuracy tests (#5095)
Tabrizian Jun 18, 2025
a1c5704
[feat] Multi-node CI testing support via Slurm (#4771)
yuanjingx87 Jun 18, 2025
a28a152
[fix][test] remove some cpp test cases from h100 (#5335)
omera-nv Jun 18, 2025
5010f87
[fix][test] remove duplicate test runs (#5241)
omera-nv Jun 18, 2025
d25f93c
chore: skip test_llm_gpt2_medium_fp8 for fp8_pc_pt + quant_lm_head (#…
achartier Jun 18, 2025
0b6d005
[fix][test] clear cuda cache before unittests automatically (#5121)
omera-nv Jun 18, 2025
3946e79
fix[nvbug5298640]: trtllm-llmapi-launch multiple LLM instances (#4727)
Superjomn Jun 18, 2025
1a7c6e7
ci: Split long running jobs into multiple jobs (#5268)
Funatiq Jun 18, 2025
2b23cd5
[feat] Fusion finalize and allreduce for qwenmoe model (#5223)
zongfeijing Jun 19, 2025
6a388b1
chore: remove torch_compile prefix for TorchCompileConfig field membe…
nv-guomingz Jun 19, 2025
6c3210a
[test] add nvfp4 DeepSeek-V3-Lite-mtp tests (#5125)
lfr-0531 Jun 19, 2025
da576bc
Waive L0 test (#5349)
yiqingy0 Jun 19, 2025
decfe2f
chore: bump version to 0.21.0 (#5325)
yiqingy0 Jun 19, 2025
e87cf62
tests: cherry-pick from main branch, add qwen3 test cases and amend t…
ruodil Jun 19, 2025
8686805
[Infra]cherry pick sanity check yml change for 5080 and 5090 from mai…
EmmaQiaoCh Jun 19, 2025
ebc6dbc
doc: cherry pick #5334 (#5368)
MartinMarciniszyn Jun 19, 2025
2d5e202
fix: Fix skip by mpi size fixture (#5355)
yizhang-nv Jun 21, 2025
2b56957
Fix: missing clientId when serialize and deserialize response (cherry…
kaiyux Jun 24, 2025
9e110b2
tests: fix typos in qa test (#5421)
crazydemo Jun 25, 2025
32f50de
nvbugs-5331031; nvbugs-5344203 - address intermittent issues with Mis…
brb-nv Jun 25, 2025
af58393
feat: TRTLLM-5941 Upgrade xgrammar to 0.1.18 (#5364)
Wanli-Jiang Jun 25, 2025
5e50fcc
test: set enable_attention_dp=True in default deepseek settings (#5461)
ruodil Jun 25, 2025
5cd87be
tests: Set kv cache free memory fraction in test case (#5462)
HuiGao-NV Jun 25, 2025
b6d23d5
[Infra] - Waive failed tests on release/0.21 (#5477)
EmmaQiaoCh Jun 25, 2025
fc64f13
Fix permission for local user issues in NGC docker container. (#5373)
MartinMarciniszyn Jun 25, 2025
87ead4e
[nvbug 5273941] fix: broken cyclic reference detect (#5417)
Superjomn Jun 25, 2025
c2799d0
[nvbug/5354825] Fix nougat test image url (#5496)
amukkara Jun 26, 2025
a811077
fix: fix regression in LOCAL_USER (#5517)
ixlmar Jun 26, 2025
30a2a8b
doc: Fix benchmark cmd in disagg scripts (#5516)
kaiyux Jun 26, 2025
312fd47
fix: constrain grepping in docker/Makefile (#5493)
ixlmar Jun 26, 2025
e2054bb
[Infra][release/0.21] - waive failed tests (#5537)
EmmaQiaoCh Jun 27, 2025
b78ad75
ci: unwaive llmapi launch test (#5281)
Superjomn Jun 27, 2025
abb7357
[TRTLLM-5989, TRTLLM-5991, TRTLLM-5993] doc: Update container instruc…
ixlmar Jun 27, 2025
4fc0666
[cherry-pick] [CI] Waive `test_fp8_block_scales_4gpus[ep4-mtp_nextn=0…
venkywonka Jun 27, 2025
647e070
[Infra][release/0.21]Update nccl to 2.27.5 (#5539)
EmmaQiaoCh Jun 29, 2025
d6c81ba
fix [nvbug5351244]: test_mpi_session submit sync/async (#5608)
Superjomn Jun 30, 2025
9fe1dd6
fix:https://nvbugs/5362398 (#5609)
nv-guomingz Jun 30, 2025
1824c44
[nvbug 5300551] test: increase block count in eviction test (#5465)
zhengd-nv Jul 1, 2025
aa0b927
test: add more tests for GB200 with 8 GPUs/2 nodes in L0 tests (#5397)
yizhang-nv Jul 1, 2025
682b164
doc: Fix outdated config in DeepSeek best perf practice doc (#5638)
kaiyux Jul 1, 2025
d5606b0
fix: [https://nvbugs/5355219] Fix bug of Qwen3 235B CI on dgx_gb200 (…
byshiue Jul 2, 2025
92d3a2d
[https://nvbugspro.nvidia.com/bug/5351333][fix] Update to chunking ca…
FrankD412 Jul 2, 2025
a3c0cf0
fix: Investigate Gemma3 1B decoder output discrepancy (#5564)
brb-nv Jul 3, 2025
2f9d061
[Infra] - Waive failed cases on release/0.21 (#5674)
EmmaQiaoCh Jul 3, 2025
14f938e
Doc: Update invalid hugging face URLs (#5683)
Linda-Stadter Jul 3, 2025
8a8d2e9
[NVBUG:5355009] Modify check for fuse_fp4_quant on SM120 (#5651)
farazkh80 Jul 3, 2025
2aacdba
[TRTLLM-6100] fix: Nvbug 5356427: autotuned TRTLLM Gen fp8 block scal…
DomBrown Jul 4, 2025
2b66fe8
[nvbug/5341178][fix] Fix OOM in Llama 4 accuracy test (#5735)
brb-nv Jul 4, 2025
53394e0
test: Move some of the test from post merge to pre-merge, update dgx …
yizhang-nv Jul 4, 2025
b0354ef
[5321981] fix: Fix the Llama3.1 405B hanging issue. (#5698)
hyukn Jul 4, 2025
3e44db1
[Infra][nvbugs/5370968] - Unwaive l0 test (#5750)
yiqingy0 Jul 4, 2025
5ac92bb
[nvbugs/5336321][fix] Enable attention dp = False test case, Fix TRTL…
yizhang-nv Jul 4, 2025
518915b
[nvbug/5337601][fix] Fix disagg + speculative decoding (#5558)
Tabrizian Jul 4, 2025
aa4d0f0
[Infra] - Always use x86 image for the Jenkins agent (#5756)
chzblych Jul 6, 2025
6103466
test: fix some test failure and add llama_nemotron models in perf san…
ruodil Jul 7, 2025
9106b5d
fix: Skip rope scaling for local layers in Gemma3 VLM (#5773)
brb-nv Jul 7, 2025
7524c77
[nvbug 5004744][fix] rewrite completion API to avoid repetitive token…
LinPoly Jul 7, 2025
3a58db8
fix _pad_attention_dp_dummy_request (#5583)
QiJune Jul 7, 2025
06f8327
Fix docker cache mount (#5763)
MartinMarciniszyn Jul 7, 2025
4fa9284
[nvbug/5302638][nvbugs/5310314] fix _handle_cancelled_requests (#5532)
QiJune Jul 7, 2025
d47ac4e
cherry pick #5416 (#5776)
QiJune Jul 7, 2025
0a0ac7b
[nvbug 5304752][fix] enhance _check_arguments to filter illegal reque…
LinPoly Jul 7, 2025
97f4c9e
[nvbug5266240] chore: unwaive test_llm_with_dummy_weights (#5744)
Superjomn Jul 7, 2025
5a50e2b
[https://nvbugspro.nvidia.com/bug/5355054] fallback to cubins for fp8…
PerkzZheng Jul 8, 2025
6062dc6
fix: [https://nvbugspro.nvidia.com/bug/5345215] Unwaive for bug 53452…
bobboli Jul 8, 2025
f8b4077
[nvbugs/5326453] Avoid nesting NCCL grouping in allgather OP (#5789)
QiJune Jul 8, 2025
6d7a2cb
fix: [https://nvbugs/5351130][https://nvbugs/5333654] Unwaive for bug…
bobboli Jul 8, 2025
39ad602
doc: Update gb200 doc (#5840)
yizhang-nv Jul 8, 2025
cbcc55e
test: remove duplicate cases in perf sanity test (#5870)
ruodil Jul 9, 2025
2e21e34
[nvbug 5327706][fix] fix mgmn postprocess error (#5835)
LinPoly Jul 9, 2025
fd94d3c
[nvbugs/5345391] fix: chunked prefill + overlap scheduling (#5761)
Funatiq Jul 9, 2025
ce048ec
cherry-pick: [fix: nvbugs/5355493] Correctly clamp max sequence len t…
netanel-haber Jul 9, 2025
d9e265d
[https://nvbugs/5355316] fix: update torch.compile option to fix trit…
dc3671 Jul 10, 2025
ff9aabb
test: Add Gemma3 unit tests to CI in release/0.21 (#5899)
brb-nv Jul 10, 2025
cd7aeec
tests: Fix lora perf test (#5875)
amirkl94 Jul 10, 2025
8b7422c
fix: [nvbugs/5351130] Adjust DSV3-Lite tests free_gpu_memory_fraction…
bobboli Jul 10, 2025
8429c8b
chore: Port leftover 0.20 (#5907)
amirkl94 Jul 10, 2025
bfa917f
fix [nvbug/5351244]: address remote mpi session submit (#5664)
Superjomn Jul 10, 2025
aeea5b3
fix: [5328141] increase tolerance for test_fp8_block_scale_gemm (#5849)
nekorobov Jul 10, 2025
e831673
fix: timeout and broken pipe in disagg and worker tests (#5827)
zhengd-nv Jul 11, 2025
4905cac
[nvbugs/5333742] fix MTP illegal memory access in cuda graph warmup (…
lfr-0531 Jul 12, 2025
bed78a2
fix: fix index out of bounds error in spec decoding (#5954)
lfr-0531 Jul 14, 2025
332a65b
[nvbugs/5368410][fix] Disable moe allreduce for multi node (#5918)
yizhang-nv Jul 14, 2025
2e7da20
[fix] Release slots with spec decode + disagg (#5975)
Tabrizian Jul 14, 2025
63f4a7a
[TRTLLM-6495] doc: add disclaimer for 3rd party software installation…
nv-guomingz Jul 15, 2025
69a15c8
[None] - Waive L0 tests (#6082)
yiqingy0 Jul 16, 2025
bce13bb
Cherry Pick: PR #6076 (#6088)
ZhanruiSunCh Jul 16, 2025
f6db521
add release notes for 0.21 release (#6049)
QiJune Jul 16, 2025
4d0bcbc
fix: Fix triton backend build [nvbug 5396469] (#6098)
pcastonguay Jul 16, 2025
eeca3ad
[None][infra] Cherry-pick #6128 and #6130 from main branch (#6151)
chzblych Jul 18, 2025
9323de6
[Doc][Qwen3] update qwen3 into support-matrix (#6161)
byshiue Jul 18, 2025
ab4e178
[fix]: Revert commit 388b491 (#6143)
LinPoly Jul 18, 2025
[TRTLLM-5758] test: Add Bielik-11B-v2.2 Model Support (#5159)
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
Wanli-Jiang authored Jun 18, 2025
commit 3a02489e86ccbc3e2baf7be1010744ce70e57286
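
In summary, this commit adds Bielik-11B-v2.2-Instruct coverage to the test suite: GSM8K and MMLU reference accuracies (baseline and FP8), an accuracy-test class for the PyTorch LLM API, two QA-list entries, and a multi-LoRA unit test.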
5 changes: 5 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -92,3 +92,8 @@ nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
     accuracy: 94.16
 kanana-1.5-2.1b-instruct-2505:
   - accuracy: 75.81
+speakleash/Bielik-11B-v2.2-Instruct:
+  - accuracy: 41.51
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 40.41
5 changes: 5 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -179,3 +179,8 @@ nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
     accuracy: 83.36
 kanana-1.5-2.1b-instruct-2505:
   - accuracy: 56.89
+speakleash/Bielik-11B-v2.2-Instruct:
+  - accuracy: 64.47
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 64.36
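
For orientation, a minimal sketch of how a harness could resolve the expected score from reference files like these; the helper name and lookup logic are illustrative assumptions, not the repository's actual implementation:

import yaml

def expected_accuracy(ref_path, model_name, quant_algo=None):
    """Return the reference accuracy for a model/quantization pair."""
    with open(ref_path) as f:
        references = yaml.safe_load(f)
    for entry in references[model_name]:
        # Entries with no quant_algo key are the unquantized baseline,
        # matched when quant_algo is None.
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model_name} with quant_algo={quant_algo}")

For example, expected_accuracy("gsm8k.yaml", "speakleash/Bielik-11B-v2.2-Instruct", quant_algo="FP8") would return 40.41 from the entry above.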
19 changes: 19 additions & 0 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1488,3 +1488,22 @@ def test_auto_dtype(self):
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+class TestBielik11BInstruct(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "speakleash/Bielik-11B-v2.2-Instruct"
+
+    def test_auto_dtype(self):
+        with LLM(f"{llm_models_root()}/Bielik-11B-v2.2-Instruct") as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_hopper
+    def test_fp8(self):
+        with LLM(f"{llm_models_root()}/Bielik-11B-v2.2-Instruct-FP8") as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
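
With the checkpoints available under llm_models_root(), the new tests can be selected by pytest node ID, matching the entries added to the QA list below (assuming the integration defs are collected directly by pytest):

pytest "tests/integration/defs/accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype"
pytest "tests/integration/defs/accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8"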
2 changes: 2 additions & 0 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -484,6 +484,8 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[throughput_latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency]
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
 
 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
50 changes: 50 additions & 0 deletions tests/unittest/llmapi/test_llm_pytorch.py
@@ -304,3 +304,53 @@ def test_codellama_fp8_with_bf16_lora() -> None:
                            lora_request=lora_requests)
 
     assert len(outputs) == 2
+
+
+@skip_gpu_memory_less_than_80gb
+def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
+    from tensorrt_llm._torch.llm import LLM
+
+    model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
+
+    target_modules = ['attn_q', 'attn_k', 'attn_v']
+
+    # Set up temporary directory for LoRA adapters
+    with tempfile.TemporaryDirectory() as lora_dir:
+        print("Creating dummy LoRAs...")
+
+        model = AutoModelForCausalLM.from_pretrained(model_dir,
+                                                     torch_dtype=torch.bfloat16,
+                                                     device_map="auto")
+        hf_modules = ["q_proj", "k_proj", "v_proj"]
+        peft_lora_config = PeftLoraConfig(r=8,
+                                          target_modules=hf_modules,
+                                          bias="none",
+                                          task_type="CAUSAL_LM")
+        lora_paths = []
+        for i in range(2):
+            lora_model = get_peft_model(model, peft_lora_config)
+            for param in lora_model.parameters():
+                param.data.zero_()
+            lora_path = f"{lora_dir}/lora_{i}"
+            lora_model.save_pretrained(lora_path)
+            lora_paths.append(lora_path)
+
+        trtllm_lora_config = LoraConfig(lora_dir=lora_paths,
+                                        lora_target_modules=target_modules,
+                                        max_lora_rank=8)
+        llm = LLM(model_dir, lora_config=trtllm_lora_config)
+
+        prompts = [
+            "Kim był Mikołaj Kopernik i z czego zasłynął?",
+            "Gdzie znajduje się stolica Polski?",
+        ]
+        lora_req1 = LoRARequest("lora-1", 0, lora_paths[0])
+        lora_req2 = LoRARequest("lora-2", 1, lora_paths[1])
+        lora_requests = [lora_req1, lora_req2]
+        sampling_params = SamplingParams(max_tokens=200)
+
+        outputs = llm.generate(prompts,
+                               sampling_params,
+                               lora_request=lora_requests)
+
+        assert len(outputs) == 2
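
Two details of this test are worth noting. Both dummy adapters are saved with every LoRA parameter zeroed, so each adapter's low-rank update is exactly zero and generation should match the base model; the assertion therefore exercises the multi-adapter loading and request-routing path (two LoRARequests paired with two prompts in a single generate call) rather than adapter numerics. The Polish prompts ("Who was Nicolaus Copernicus and what is he famous for?", "Where is the capital of Poland?") reflect that Bielik is a Polish-language model.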