Skip to content

Commit d4c3fda

Browse files
Support second turn judgement and single answer grading (#1856)
1 parent 03cb7a6 commit d4c3fda

File tree

2 files changed

+271
-35
lines changed

2 files changed

+271
-35
lines changed

fastchat/llm_judge/common.py

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ def normalize_game_key_dict(judgment_dict):
489489
return ret
490490

491491

492-
def load_model_judgments(filename: str):
492+
def load_pairwise_model_judgments(filename: str):
493493
"""Load model judgments.
494494
495495
The return value is a dict of type:
@@ -532,10 +532,35 @@ def load_model_judgments(filename: str):
532532
return normalized
533533

534534

535-
def resolve_default_judgment_dict(
535+
def load_single_model_judgments(filename: str):
    """Load single-answer grading judgments from a JSON-lines file.

    Every non-blank line of ``filename`` is parsed as one JSON object that
    must provide the keys ``judge``, ``question_id``, ``model``, ``score``
    and ``judgment``.

    The return value is a dict of type:
    Dict[judge: Tuple -> Dict[game_key: tuple -> game_result: dict]]

    where ``game_key`` is ``(question_id, model)`` and ``game_result`` holds
    the record's ``score`` and ``judgment``. If the same judge and game key
    appear more than once, the last record in the file wins.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required keys.
    """
    judge_dict = {}

    # A context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(filename) as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # rather than failing inside json.loads.
            if not line.strip():
                continue

            obj = json.loads(line)
            # JSON arrays decode to lists; a tuple is needed as a dict key.
            judge = tuple(obj["judge"])
            gamekey = (obj["question_id"], obj["model"])

            judge_dict.setdefault(judge, {})[gamekey] = {
                "score": obj["score"],
                "judgment": obj["judgment"],
            }
    return judge_dict
558+
559+
560+
def resolve_pairwise_judgment_dict(
536561
question, model_judgments_normal, model_judgments_math, multi_turn=False
537562
):
538-
"""Return the correct default judge."""
563+
"""Return the correct pairwise judge."""
539564
if multi_turn:
540565
if question["category"] in NEED_REF_CATS:
541566
return model_judgments_math[("gpt-4", "pair-math-v1-multi-turn")]
@@ -547,7 +572,22 @@ def resolve_default_judgment_dict(
547572
return model_judgments_normal[("gpt-4", "pair-v2")]
548573

549574

550-
def get_model_judge_explanation(gamekey, judgment_dict):
575+
def resolve_single_judgment_dict(
    question, model_judgments_normal, model_judgments_math, multi_turn=False
):
    """Pick the single-answer grading judgments that apply to a question.

    Questions whose ``category`` is in ``NEED_REF_CATS`` are looked up in
    ``model_judgments_math``; all others in ``model_judgments_normal``.
    ``multi_turn`` selects the multi-turn variant of the judge prompt.

    Returns the entry stored under the matching ``("gpt-4", <prompt>)`` key;
    a missing key raises ``KeyError``.
    """
    needs_reference = question["category"] in NEED_REF_CATS
    judgments = model_judgments_math if needs_reference else model_judgments_normal

    # Keyed by (multi_turn, needs_reference).
    prompt_names = {
        (True, True): "single-math-v1-multi-turn",
        (True, False): "single-v1-multi-turn",
        (False, True): "single-math-v1",
        (False, False): "single-v1",
    }
    prompt = prompt_names[(bool(multi_turn), needs_reference)]
    return judgments[("gpt-4", prompt)]
588+
589+
590+
def get_pairwise_judge_explanation(gamekey, judgment_dict):
551591
"""Get model judge explanation."""
552592
try:
553593
qid, model_1, model_2 = gamekey
@@ -572,6 +612,24 @@ def get_model_judge_explanation(gamekey, judgment_dict):
572612
return "N/A"
573613

574614

615+
def get_single_judge_explanation(gamekey, judgment_dict):
    """Format the judge's score and judgment for one single-answer game.

    ``gamekey`` is a ``(question_id, model)`` pair used to index
    ``judgment_dict``. Returns a Markdown string naming the model, its score
    and the judgment text, or ``"N/A"`` when the game key or one of the
    ``"judgment"`` / ``"score"`` fields is missing.
    """
    try:
        # Only the model name appears in the rendered text.
        _, model_name = gamekey

        record = judgment_dict[gamekey]
        verdict, rating = record["judgment"], record["score"]

        header = f"**Game 1**. **A**: {model_name}, **Score**: {rating}\n\n"
        body = f"**Judgment**: {verdict}"
        return header + body
    except KeyError:
        return "N/A"
631+
632+
575633
def check_data(questions, model_answers, ref_answers, models, judges):
576634
# check model answers
577635
for m in models:

0 commit comments

Comments
 (0)