diff --git a/fastchat/llm_judge/common.py b/fastchat/llm_judge/common.py
index f8d01b46e..f5a463d86 100644
--- a/fastchat/llm_judge/common.py
+++ b/fastchat/llm_judge/common.py
@@ -24,7 +24,7 @@
 TIE_DELTA = 0.1
 
 # Categories that need reference answers
-NEED_REF_CATS = ["math", "reasoning", "coding"]
+NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"]
 
 # Extract scores from judgments
 two_score_pattern = re.compile("\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]")
@@ -42,6 +42,7 @@
     "reasoning": 0.0,
     "stem": 0.1,
     "humanities": 0.1,
+    "arena-hard-200": 0.0,
 }
 
 reverse_model_map = {