diff --git a/tests/integration/defs/accuracy/references/gpqa_diamond.yaml b/tests/integration/defs/accuracy/references/gpqa_diamond.yaml
index d5ac97761e1..58de6894e6e 100644
--- a/tests/integration/defs/accuracy/references/gpqa_diamond.yaml
+++ b/tests/integration/defs/accuracy/references/gpqa_diamond.yaml
@@ -17,7 +17,17 @@ deepseek-ai/DeepSeek-R1:
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 44.95
   - quant_algo: FP8
-    accuracy: 49.49
+    kv_cache_quant_algo: FP8
+    accuracy: 42.42
+    # GPQA diamond only contains 198 samples, so the score tends to have large variance.
+    # We repeated evaluation 7 times to choose a lower bound score for FP8, 42.42.
+    # random_seed=0: 47.98
+    # random_seed=1: 42.42
+    # random_seed=2: 52.02
+    # random_seed=3: 51.52
+    # random_seed=4: 48.48
+    # random_seed=5: 47.47
+    # random_seed=6: 45.96
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 40.40
 nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index eb78bc87494..58ba08a7719 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -73,6 +73,7 @@ Qwen3/Qwen3-235B-A22B:
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
   - quant_algo: FP8
+    kv_cache_quant_algo: FP8
     accuracy: 92.42
 nvidia/Nemotron-H-8B-Base-8K:
   - accuracy: 46.20
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index e7981413761..f3cffe69428 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -153,6 +153,7 @@ Qwen3/Qwen3-235B-A22B:
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 79.43
   - quant_algo: FP8
+    kv_cache_quant_algo: FP8
     accuracy: 79.26
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 57.97
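
For reference, a minimal sketch (not part of the patch) of how the new GPQA-Diamond FP8 value relates to the seeded runs documented in the YAML comment: the committed reference is the lower bound, i.e. the minimum of the seven per-seed scores. The scores below are copied from that comment; the script itself is illustrative only.

```python
# Illustrative only: verify that the committed FP8 GPQA-Diamond reference (42.42)
# is the minimum over the seven seeded evaluation runs listed in the YAML comment.
seed_scores = {
    0: 47.98,
    1: 42.42,
    2: 52.02,
    3: 51.52,
    4: 48.48,
    5: 47.47,
    6: 45.96,
}

reference = min(seed_scores.values())
assert reference == 42.42  # matches the accuracy added in gpqa_diamond.yaml
print(f"lower-bound reference over {len(seed_scores)} seeds: {reference:.2f}")
```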