@@ -284,16 +284,9 @@ def arena_hard_process(leaderboard_table_file, filepath):
     return arena_hard
 
 
-def create_ranking_str(ranking, ranking_difference):
-    if ranking_difference > 0:
-        return f"{int(ranking)} \u2191"
-    elif ranking_difference < 0:
-        return f"{int(ranking)} \u2193"
-    else:
-        return f"{int(ranking)}"
-
-
-def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
+def get_arena_table(
+    arena_df, model_table_df, arena_subset_df=None, hidden_models=None, use_delta=True
+):
     arena_df = arena_df.sort_values(
         by=["final_ranking", "rating"], ascending=[True, False]
     )
@@ -311,25 +304,20 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_model
         arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
         arena_df["final_ranking"] = recompute_final_ranking(arena_df)
 
-        arena_subset_df["final_ranking_no_tie"] = np.arange(1, len(arena_subset_df) + 1)
-        arena_df["final_ranking_no_tie"] = np.arange(1, len(arena_df) + 1)
-
         arena_df = arena_subset_df.join(
             arena_df["final_ranking"], rsuffix="_global", how="inner"
         )
-        arena_df["ranking_difference"] = (
-            arena_df["final_ranking_global"] - arena_df["final_ranking"]
-        )
+
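+        # When use_delta is False, "ranking_difference" keeps the joined global
+        # ranking itself rather than the rank delta.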
+        if use_delta:
+            arena_df["ranking_difference"] = (
+                arena_df["final_ranking_global"] - arena_df["final_ranking"]
+            )
+        else:
+            arena_df["ranking_difference"] = arena_df["final_ranking_global"]
 
         arena_df = arena_df.sort_values(
             by=["final_ranking", "rating"], ascending=[True, False]
         )
-        arena_df["final_ranking"] = arena_df.apply(
-            lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
-            axis=1,
-        )
-
-        arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)
 
     # Handle potential duplicate keys in model_table_df
     model_table_dict = model_table_df.groupby("key").first().to_dict(orient="index")
@@ -343,15 +331,18 @@ def process_row(row):
             return None
 
         ranking = row.get("final_ranking") or row.name + 1
-        result = [ranking]
+        result = [ranking if isinstance(ranking, str) else int(ranking)]
 
         if arena_subset_df is not None:
-            result.append(row.get("ranking_difference", 0))
+            ranking = row.get("ranking_difference", 0)
+            result.append(ranking if isinstance(ranking, str) else int(ranking))
+        else:
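+            # No subset: pad the second rank column so each row still matches
+            # the header count; the styling code treats None as "no comparison".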
+            result.append(None)
 
         result.extend(
             [
                 model_info.get("Model", "Unknown"),
-                f"{round(row['rating'])}",
+                int(round(row["rating"])),
                 f"+{round(row['rating_q975'] - row['rating'])}/-{round(row['rating'] - row['rating_q025'])}",
                 round(row["num_battles"]),
                 model_info.get("Organization", "Unknown"),
@@ -363,7 +354,6 @@ def process_row(row):
                 ),
             ]
         )
-
         return result
 
     values = [
@@ -389,18 +379,6 @@ def update_leaderboard_df(arena_table_vals):
     ]
     elo_dataframe = pd.DataFrame(arena_table_vals, columns=columns)
 
-    def highlight_max(s):
-        return [
-            (
-                "color: green; font-weight: bold"
-                if "\u2191" in str(v)
-                else "color: red; font-weight: bold"
-                if "\u2193" in str(v)
-                else ""
-            )
-            for v in s
-        ]
-
     def highlight_rank_max(s):
         return [
             (
@@ -413,9 +391,49 @@ def highlight_rank_max(s):
             for v in s
         ]
 
-    return elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
-        highlight_rank_max, subset=["Delta"]
+    return elo_dataframe.style.apply(highlight_rank_max, subset=["Delta"])
+
+
+def update_overall_leaderboard_df(arena_table_vals):
+    columns = [
+        "Rank* (UB)",
+        "Rank (StyleCtrl)",
+        "Model",
+        "Arena Score",
+        "95% CI",
+        "Votes",
+        "Organization",
+        "License",
+        "Knowledge Cutoff",
+    ]
+    elo_dataframe = pd.DataFrame(arena_table_vals, columns=columns)
+
+    def highlight_red(s):
+        return [("color: red; font-weight: bold") for v in s]
+
+    def highlight_green(s):
+        return [("color: green; font-weight: bold") for v in s]
+
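+    # compare_func: +1 when the style-control rank is better (numerically
+    # lower) than the UB rank, -1 when worse, 0 for ties or missing values.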
+    def compare_func(row):
+        if row["Rank (StyleCtrl)"] is None:
+            return 0
+        if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]:
+            return 0
+        elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]:
+            return 1
+        else:
+            return -1
+
+    comparison = elo_dataframe.apply(
+        compare_func,
+        axis=1,
     )
+    indices_red = [i for i, value in enumerate(comparison) if value == -1]
+    indices_green = [i for i, value in enumerate(comparison) if value == 1]
+
+    return elo_dataframe.style.apply(
+        highlight_red, subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]]
+    ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]])
 
 
 def build_arena_tab(
@@ -444,6 +462,15 @@ def build_arena_tab(
 
     arena_df = arena_dfs["Overall"]
 
+    arena_overall_sc_df = None
+    if "Overall w/ Style Control" in arena_dfs:
+        arena_overall_sc_df = arena_dfs[
+            "Overall w/ Style Control"
+        ]  # for incorporating style control on the overall leaderboard
+        arena_overall_sc_df = arena_overall_sc_df[
+            arena_overall_sc_df["num_battles"] > 300
+        ]
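+        # Same >300-battle cutoff that update_leaderboard_and_plots applies to
+        # the per-category tables.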
+
     def update_leaderboard_and_plots(category, filters):
         if len(filters) > 0 and "Style Control" in filters:
             cat_name = f"{category} w/ Style Control"
@@ -454,23 +481,26 @@ def update_leaderboard_and_plots(category, filters):
454481
         arena_subset_df = arena_dfs[category]
         arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 300]
+
         elo_subset_results = category_elo_results[category]
 
         baseline_category = cat_name_to_baseline.get(category, "Overall")
         arena_df = arena_dfs[baseline_category]
         arena_values = get_arena_table(
             arena_df,
             model_table_df,
-            arena_subset_df=arena_subset_df if category != "Overall" else None,
+            arena_subset_df=arena_subset_df
+            if category != "Overall"
+            else arena_overall_sc_df,
             hidden_models=(
                 None
                 if len(filters) > 0 and "Show Deprecate" in filters
                 else deprecated_model_name
             ),
+            use_delta=category != "Overall",
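+            # "Overall" passes the style-control results as the subset with
+            # deltas disabled, so both ranking columns appear side by side.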
         )
         if category != "Overall":
             arena_values = update_leaderboard_df(arena_values)
-            # arena_values = highlight_top_models(arena_values)
             arena_values = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
@@ -484,7 +514,7 @@ def update_leaderboard_and_plots(category, filters):
                     "Knowledge Cutoff",
                 ],
                 datatype=[
-                    "str",
+                    "number",
                     "number",
                     "markdown",
                     "number",
@@ -497,13 +527,15 @@ def update_leaderboard_and_plots(category, filters):
                 value=arena_values,
                 elem_id="arena_leaderboard_dataframe",
                 height=1000,
-                column_widths=[70, 70, 210, 90, 90, 90, 120, 150, 100],
+                column_widths=[75, 75, 180, 60, 60, 60, 70, 80, 60],
                 wrap=True,
             )
         else:
+            arena_values = update_overall_leaderboard_df(arena_values)
             arena_values = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
+                    "Rank (StyleCtrl)",
                     "Model",
                     "Arena Score",
                     "95% CI",
@@ -513,6 +545,7 @@ def update_leaderboard_and_plots(category, filters):
                     "Votes",
                     "Organization",
                     "License",
                     "Knowledge Cutoff",
                 ],
                 datatype=[
+                    "number",
                     "number",
                     "markdown",
                     "number",
@@ -525,7 +558,7 @@ def update_leaderboard_and_plots(category, filters):
                 value=arena_values,
                 elem_id="arena_leaderboard_dataframe",
                 height=1000,
-                column_widths=[70, 220, 90, 90, 90, 120, 150, 100],
+                column_widths=[75, 75, 180, 60, 60, 60, 70, 80, 60],
                 wrap=True,
             )
 
@@ -549,7 +582,11 @@ def update_leaderboard_and_plots(category, filters):
 
     # arena table
     arena_table_vals = get_arena_table(
-        arena_df, model_table_df, hidden_models=deprecated_model_name
+        arena_df,
+        model_table_df,
+        hidden_models=deprecated_model_name,
+        arena_subset_df=arena_overall_sc_df,
+        use_delta=False,
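+        # Initial page load: mirrors the "Overall" branch of
+        # update_leaderboard_and_plots above (style-control subset, no deltas).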
     )
 
     md = make_arena_leaderboard_md(arena_df, last_updated_time, vision=vision)
@@ -568,32 +605,23 @@ def update_leaderboard_and_plots(category, filters):
         )
         with gr.Column(scale=2):
             category_checkbox = gr.CheckboxGroup(
-                ["Style Control", "Show Deprecate"], label="Apply filter", info=""
+                ["Style Control Score", "Show Deprecated"],
+                label="Apply filter",
+                info="",
             )
             default_category_details = make_category_arena_leaderboard_md(
                 arena_df, arena_df, name="Overall"
             )
-        with gr.Column(scale=4, variant="panel"):
+        with gr.Column(scale=3, variant="panel"):
             category_deets = gr.Markdown(
                 default_category_details, elem_id="category_deets"
             )
 
-    arena_vals = pd.DataFrame(
-        arena_table_vals,
-        columns=[
-            "Rank* (UB)",
-            "Model",
-            "Arena Score",
-            "95% CI",
-            "Votes",
-            "Organization",
-            "License",
-            "Knowledge Cutoff",
-        ],
-    )
+    arena_vals = update_overall_leaderboard_df(arena_table_vals)
     elo_display_df = gr.Dataframe(
         headers=[
             "Rank* (UB)",
+            "Rank (StyleCtrl)",
             "Model",
             "Arena Elo",
             "95% CI",
@@ -603,6 +631,7 @@ def update_leaderboard_and_plots(category, filters):
             "Knowledge Cutoff",
         ],
         datatype=[
+            "number",
             "number",
             "markdown",
             "number",
@@ -612,11 +641,10 @@ def update_leaderboard_and_plots(category, filters):
             "str",
             "str",
         ],
-        # value=highlight_top_models(arena_vals.style),
-        value=arena_vals.style,
+        value=arena_vals,
         elem_id="arena_leaderboard_dataframe",
         height=1000,
-        column_widths=[70, 220, 90, 90, 90, 120, 150, 100],
+        column_widths=[75, 75, 180, 60, 60, 60, 70, 80, 60],
         wrap=True,
     )
 
@@ -626,6 +654,9 @@ def update_leaderboard_and_plots(category, filters):
 Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval).
 See Figure 1 below for visualization of the confidence intervals of model scores.
 
+**Rank (StyleCtrl)**: the model's ranking with style control, which accounts for factors like response length and markdown usage to decouple model performance from these potential confounding variables.
+See the [blog post](https://blog.lmarena.ai/blog/2024/style-control/) for further details.
+
 Note: in each category, we exclude models with fewer than 300 votes as their confidence intervals can be large.
 """,
             elem_id="leaderboard_markdown",
@@ -833,9 +864,7 @@ def build_category_leaderboard_tab(
     "long_user",
     # "no_refusal",
 ]
-# selected_categories_width = [95, 85, 100, 75, 120, 100, 95, 100,100]
 selected_categories_width = [110, 110, 110, 110, 110, 80, 80, 80, 80]
-# selected_categories_width = [100] * len(selected_categories)
 
 language_categories = [
     "english",