@@ -489,7 +489,7 @@ def normalize_game_key_dict(judgment_dict):
489489 return ret
490490
491491
492- def load_model_judgments (filename : str ):
492+ def load_pairwise_model_judgments (filename : str ):
493493 """Load model judgments.
494494
495495 The return value is a dict of type:
@@ -532,10 +532,35 @@ def load_model_judgments(filename: str):
532532 return normalized
533533
534534
535- def resolve_default_judgment_dict (
def load_single_model_judgments(filename: str):
    """Load single-answer grading judgments from a JSON-lines file.

    Every non-blank line of the file must be a JSON object with the keys
    ``judge``, ``question_id``, ``model``, ``score`` and ``judgment``.

    The return value is a dict of type:
    Dict[judge: Tuple -> Dict[game_key: tuple -> game_result: dict]]

    where ``game_key`` is ``(question_id, model)`` and ``game_result`` holds
    the record's ``score`` and ``judgment`` fields. When several records
    share the same judge and game key, the last one in the file wins.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required keys.
    """
    judge_dict = {}

    # Use a context manager so the handle is closed even when parsing fails.
    with open(filename) as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing empty line in the file).
            if not line.strip():
                continue

            obj = json.loads(line)
            judge = tuple(obj["judge"])  # JSON list -> hashable dict key
            gamekey = (obj["question_id"], obj["model"])

            judge_dict.setdefault(judge, {})[gamekey] = {
                "score": obj["score"],
                "judgment": obj["judgment"],
            }
    return judge_dict
558+
559+
560+ def resolve_pairwise_judgment_dict (
536561 question , model_judgments_normal , model_judgments_math , multi_turn = False
537562):
538- """Return the correct default judge."""
563+ """Return the correct pairwise judge."""
539564 if multi_turn :
540565 if question ["category" ] in NEED_REF_CATS :
541566 return model_judgments_math [("gpt-4" , "pair-math-v1-multi-turn" )]
@@ -547,7 +572,22 @@ def resolve_default_judgment_dict(
547572 return model_judgments_normal [("gpt-4" , "pair-v2" )]
548573
549574
550- def get_model_judge_explanation (gamekey , judgment_dict ):
def resolve_single_judgment_dict(
    question, model_judgments_normal, model_judgments_math, multi_turn=False
):
    """Pick the single-answer grading judgment dict that applies to a question.

    Questions whose ``category`` is listed in ``NEED_REF_CATS`` are looked up
    in ``model_judgments_math``; all others in ``model_judgments_normal``.
    The judge key is always ``("gpt-4", <prompt name>)``, where the prompt
    name depends on the category group and on ``multi_turn``.

    Raises:
        KeyError: if ``question`` has no ``"category"`` entry or the selected
            judge key is absent from the chosen judgments dict.
    """
    needs_reference = question["category"] in NEED_REF_CATS

    if needs_reference:
        judgments = model_judgments_math
        if multi_turn:
            prompt_name = "single-math-v1-multi-turn"
        else:
            prompt_name = "single-math-v1"
    else:
        judgments = model_judgments_normal
        if multi_turn:
            prompt_name = "single-v1-multi-turn"
        else:
            prompt_name = "single-v1"

    return judgments[("gpt-4", prompt_name)]
589+
590+ def get_pairwise_judge_explanation (gamekey , judgment_dict ):
551591 """Get model judge explanation."""
552592 try :
553593 qid , model_1 , model_2 = gamekey
@@ -572,6 +612,24 @@ def get_model_judge_explanation(gamekey, judgment_dict):
572612 return "N/A"
573613
574614
def get_single_judge_explanation(gamekey, judgment_dict):
    """Format the judge's score and judgment for one single-answer game.

    ``gamekey`` is unpacked as a ``(question_id, model)`` pair and is also
    used as the lookup key into ``judgment_dict``. The matching record must
    provide ``"judgment"`` and ``"score"``.

    Returns a Markdown-formatted string, or ``"N/A"`` when the key or one of
    the record fields is missing.
    """
    try:
        _, graded_model = gamekey
        record = judgment_dict[gamekey]
        explanation, rating = record["judgment"], record["score"]
    except KeyError:
        # No judgment stored for this key, or the record is incomplete.
        return "N/A"

    header = f"**Game 1**. **A**: {graded_model}, **Score**: {rating}\n\n"
    return header + f"**Judgment**: {explanation}"
631+
632+
575633def check_data (questions , model_answers , ref_answers , models , judges ):
576634 # check model answers
577635 for m in models :
0 commit comments