diff --git a/fastchat/llm_judge/common.py b/fastchat/llm_judge/common.py index f8d01b46e..f5a463d86 100644 --- a/fastchat/llm_judge/common.py +++ b/fastchat/llm_judge/common.py @@ -24,7 +24,7 @@ TIE_DELTA = 0.1 # Categories that need reference answers -NEED_REF_CATS = ["math", "reasoning", "coding"] +NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"] # Extract scores from judgments two_score_pattern = re.compile("\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]") @@ -42,6 +42,7 @@ "reasoning": 0.0, "stem": 0.1, "humanities": 0.1, + "arena-hard-200": 0.0, } reverse_model_map = {