
Commit a7c99d1

add style control delta to overall leaderboard ui (#3585)

Co-authored-by: Wei-Lin Chiang <[email protected]>
1 parent 4c6b259

1 file changed: fastchat/serve/monitor/monitor.py (95 additions, 66 deletions)
@@ -284,16 +284,9 @@ def arena_hard_process(leaderboard_table_file, filepath):
     return arena_hard
 
 
-def create_ranking_str(ranking, ranking_difference):
-    if ranking_difference > 0:
-        return f"{int(ranking)} \u2191"
-    elif ranking_difference < 0:
-        return f"{int(ranking)} \u2193"
-    else:
-        return f"{int(ranking)}"
-
-
-def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
+def get_arena_table(
+    arena_df, model_table_df, arena_subset_df=None, hidden_models=None, use_delta=True
+):
     arena_df = arena_df.sort_values(
         by=["final_ranking", "rating"], ascending=[True, False]
     )
@@ -311,25 +304,20 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_model
         arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
         arena_df["final_ranking"] = recompute_final_ranking(arena_df)
 
-        arena_subset_df["final_ranking_no_tie"] = np.arange(1, len(arena_subset_df) + 1)
-        arena_df["final_ranking_no_tie"] = np.arange(1, len(arena_df) + 1)
-
         arena_df = arena_subset_df.join(
             arena_df["final_ranking"], rsuffix="_global", how="inner"
         )
-        arena_df["ranking_difference"] = (
-            arena_df["final_ranking_global"] - arena_df["final_ranking"]
-        )
+
+        if use_delta:
+            arena_df["ranking_difference"] = (
+                arena_df["final_ranking_global"] - arena_df["final_ranking"]
+            )
+        else:
+            arena_df["ranking_difference"] = arena_df["final_ranking_global"]
 
         arena_df = arena_df.sort_values(
             by=["final_ranking", "rating"], ascending=[True, False]
         )
-        arena_df["final_ranking"] = arena_df.apply(
-            lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
-            axis=1,
-        )
-
-        arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)
 
     # Handle potential duplicate keys in model_table_df
     model_table_dict = model_table_df.groupby("key").first().to_dict(orient="index")
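
As context for the hunk above, a minimal standalone sketch (toy frames, not this module's data) of the two modes the new use_delta flag selects between: a signed rank movement for category tabs, or the joined leaderboard's ranking carried over verbatim for the Overall tab.

import pandas as pd

subset = pd.DataFrame({"final_ranking": [1, 2, 3]}, index=["model-a", "model-b", "model-c"])
overall = pd.DataFrame({"final_ranking": [2, 1, 3]}, index=["model-a", "model-b", "model-c"])

# Same join shape as get_arena_table: the other leaderboard's ranking arrives
# as "final_ranking_global".
joined = subset.join(overall["final_ranking"], rsuffix="_global", how="inner")

for use_delta in (True, False):
    if use_delta:
        # Signed movement: positive means the model ranks better in the subset.
        joined["ranking_difference"] = joined["final_ranking_global"] - joined["final_ranking"]
    else:
        # Absolute rank taken straight from the joined leaderboard.
        joined["ranking_difference"] = joined["final_ranking_global"]
    print(use_delta, joined["ranking_difference"].tolist())
# True [1, -1, 0]
# False [2, 1, 3]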
@@ -343,15 +331,18 @@ def process_row(row):
             return None
 
         ranking = row.get("final_ranking") or row.name + 1
-        result = [ranking]
+        result = [ranking if isinstance(ranking, str) else int(ranking)]
 
         if arena_subset_df is not None:
-            result.append(row.get("ranking_difference", 0))
+            ranking = row.get("ranking_difference", 0)
+            result.append(ranking if isinstance(ranking, str) else int(ranking))
+        else:
+            result.append(None)
 
         result.extend(
             [
                 model_info.get("Model", "Unknown"),
-                f"{round(row['rating'])}",
+                int(round(row["rating"])),
                 f"+{round(row['rating_q975'] - row['rating'])}/-{round(row['rating'] - row['rating_q025'])}",
                 round(row["num_battles"]),
                 model_info.get("Organization", "Unknown"),
@@ -363,7 +354,6 @@ def process_row(row):
                 ),
             ]
         )
-
         return result
 
     values = [
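
A tiny sketch (hypothetical values) of the normalization process_row now applies: a rank is coerced to int unless it is already a pre-formatted string, which matches the rank columns' Gradio datatype switching from "str" to "number" further down.

def normalize_rank(ranking):
    # Keep pre-formatted strings as-is; coerce numeric ranks to int.
    return ranking if isinstance(ranking, str) else int(ranking)

print(normalize_rank(3.0))   # 3
print(normalize_rank("12"))  # 12 (unchanged, still a string)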
@@ -389,18 +379,6 @@ def update_leaderboard_df(arena_table_vals):
     ]
     elo_dataframe = pd.DataFrame(arena_table_vals, columns=columns)
 
-    def highlight_max(s):
-        return [
-            (
-                "color: green; font-weight: bold"
-                if "\u2191" in str(v)
-                else "color: red; font-weight: bold"
-                if "\u2193" in str(v)
-                else ""
-            )
-            for v in s
-        ]
-
     def highlight_rank_max(s):
         return [
             (
@@ -413,9 +391,49 @@ def highlight_rank_max(s):
             for v in s
         ]
 
-    return elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
-        highlight_rank_max, subset=["Delta"]
+    return elo_dataframe.style.apply(highlight_rank_max, subset=["Delta"])
+
+
+def update_overall_leaderboard_df(arena_table_vals):
+    columns = [
+        "Rank* (UB)",
+        "Rank (StyleCtrl)",
+        "Model",
+        "Arena Score",
+        "95% CI",
+        "Votes",
+        "Organization",
+        "License",
+        "Knowledge Cutoff",
+    ]
+    elo_dataframe = pd.DataFrame(arena_table_vals, columns=columns)
+
+    def highlight_red(s):
+        return [("color: red; font-weight: bold") for v in s]
+
+    def highlight_green(s):
+        return [("color: green; font-weight: bold") for v in s]
+
+    def compare_func(row):
+        if row["Rank (StyleCtrl)"] is None:
+            return 0
+        if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]:
+            return 0
+        elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]:
+            return 1
+        else:
+            return -1
+
+    comparison = elo_dataframe.apply(
+        compare_func,
+        axis=1,
     )
+    indices_red = [i for i, value in enumerate(comparison) if value == -1]
+    indices_green = [i for i, value in enumerate(comparison) if value == 1]
+
+    return elo_dataframe.style.apply(
+        highlight_red, subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]]
+    ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]])
 
 
 def build_arena_tab(
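
A self-contained sketch (toy data) of the Styler pattern update_overall_leaderboard_df uses: classify each row, collect positional indices, then color only the "Rank (StyleCtrl)" cells through pd.IndexSlice. Note this relies on the frame keeping its default RangeIndex, so positions and labels coincide.

import pandas as pd

df = pd.DataFrame({"Rank* (UB)": [1, 2, 3], "Rank (StyleCtrl)": [2, 2, 1]})

def compare(row):
    sc, ub = row["Rank (StyleCtrl)"], row["Rank* (UB)"]
    if sc is None or sc == ub:
        return 0
    return 1 if sc < ub else -1  # a lower rank number is better

comparison = df.apply(compare, axis=1)
greens = [i for i, v in enumerate(comparison) if v == 1]
reds = [i for i, v in enumerate(comparison) if v == -1]

styled = df.style.apply(
    lambda s: ["color: green; font-weight: bold"] * len(s),
    subset=pd.IndexSlice[greens, ["Rank (StyleCtrl)"]],
).apply(
    lambda s: ["color: red; font-weight: bold"] * len(s),
    subset=pd.IndexSlice[reds, ["Rank (StyleCtrl)"]],
)
print(styled.to_html()[:120])  # styles are attached per cell in the rendered HTML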
@@ -444,6 +462,15 @@ def build_arena_tab(
 
     arena_df = arena_dfs["Overall"]
 
+    arena_overall_sc_df = None
+    if "Overall w/ Style Control" in arena_dfs:
+        arena_overall_sc_df = arena_dfs[
+            "Overall w/ Style Control"
+        ]  # for incorporating style control on the overall leaderboard
+        arena_overall_sc_df = arena_overall_sc_df[
+            arena_overall_sc_df["num_battles"] > 300
+        ]
+
     def update_leaderboard_and_plots(category, filters):
         if len(filters) > 0 and "Style Control" in filters:
             cat_name = f"{category} w/ Style Control"
@@ -454,23 +481,26 @@ def update_leaderboard_and_plots(category, filters):
 
         arena_subset_df = arena_dfs[category]
         arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 300]
+
         elo_subset_results = category_elo_results[category]
 
         baseline_category = cat_name_to_baseline.get(category, "Overall")
         arena_df = arena_dfs[baseline_category]
         arena_values = get_arena_table(
             arena_df,
             model_table_df,
-            arena_subset_df=arena_subset_df if category != "Overall" else None,
+            arena_subset_df=arena_subset_df
+            if category != "Overall"
+            else arena_overall_sc_df,
             hidden_models=(
                 None
                 if len(filters) > 0 and "Show Deprecate" in filters
                 else deprecated_model_name
             ),
+            use_delta=category != "Overall",
         )
         if category != "Overall":
             arena_values = update_leaderboard_df(arena_values)
-            # arena_values = highlight_top_models(arena_values)
             arena_values = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
@@ -484,7 +514,7 @@ def update_leaderboard_and_plots(category, filters):
                     "Knowledge Cutoff",
                 ],
                 datatype=[
-                    "str",
+                    "number",
                     "number",
                     "markdown",
                     "number",
@@ -497,13 +527,15 @@ def update_leaderboard_and_plots(category, filters):
                 value=arena_values,
                 elem_id="arena_leaderboard_dataframe",
                 height=1000,
-                column_widths=[70, 70, 210, 90, 90, 90, 120, 150, 100],
+                column_widths=[75, 75, 180, 60, 60, 60, 70, 80, 60],
                 wrap=True,
             )
         else:
+            arena_values = update_overall_leaderboard_df(arena_values)
             arena_values = gr.Dataframe(
                 headers=[
                     "Rank* (UB)",
+                    "Rank (StyleCtrl)",
                     "Model",
                     "Arena Score",
                     "95% CI",
@@ -513,6 +545,7 @@ def update_leaderboard_and_plots(category, filters):
                     "Knowledge Cutoff",
                 ],
                 datatype=[
+                    "number",
                     "number",
                     "markdown",
                     "number",
@@ -525,7 +558,7 @@ def update_leaderboard_and_plots(category, filters):
                 value=arena_values,
                 elem_id="arena_leaderboard_dataframe",
                 height=1000,
-                column_widths=[70, 220, 90, 90, 90, 120, 150, 100],
+                column_widths=[75, 75, 180, 60, 60, 60, 70, 80, 60],
                 wrap=True,
             )
 
@@ -549,7 +582,11 @@ def update_leaderboard_and_plots(category, filters):
 
     # arena table
     arena_table_vals = get_arena_table(
-        arena_df, model_table_df, hidden_models=deprecated_model_name
+        arena_df,
+        model_table_df,
+        hidden_models=deprecated_model_name,
+        arena_subset_df=arena_overall_sc_df,
+        use_delta=False,
     )
 
     md = make_arena_leaderboard_md(arena_df, last_updated_time, vision=vision)
@@ -568,32 +605,23 @@ def update_leaderboard_and_plots(category, filters):
         )
         with gr.Column(scale=2):
             category_checkbox = gr.CheckboxGroup(
-                ["Style Control", "Show Deprecate"], label="Apply filter", info=""
+                ["Style Control Score", "Show Deprecated"],
+                label="Apply filter",
+                info="",
             )
         default_category_details = make_category_arena_leaderboard_md(
             arena_df, arena_df, name="Overall"
         )
-        with gr.Column(scale=4, variant="panel"):
+        with gr.Column(scale=3, variant="panel"):
            category_deets = gr.Markdown(
                default_category_details, elem_id="category_deets"
            )
 
-    arena_vals = pd.DataFrame(
-        arena_table_vals,
-        columns=[
-            "Rank* (UB)",
-            "Model",
-            "Arena Score",
-            "95% CI",
-            "Votes",
-            "Organization",
-            "License",
-            "Knowledge Cutoff",
-        ],
-    )
+    arena_vals = update_overall_leaderboard_df(arena_table_vals)
     elo_display_df = gr.Dataframe(
         headers=[
             "Rank* (UB)",
+            "Rank (StyleCtrl)",
             "Model",
             "Arena Elo",
             "95% CI",
@@ -603,6 +631,7 @@ def update_leaderboard_and_plots(category, filters):
             "Knowledge Cutoff",
         ],
         datatype=[
+            "number",
             "number",
             "markdown",
             "number",
@@ -612,11 +641,10 @@ def update_leaderboard_and_plots(category, filters):
             "str",
             "str",
         ],
-        # value=highlight_top_models(arena_vals.style),
-        value=arena_vals.style,
+        value=arena_vals,
         elem_id="arena_leaderboard_dataframe",
         height=1000,
-        column_widths=[70, 220, 90, 90, 90, 120, 150, 100],
+        column_widths=[75, 75, 180, 60, 60, 60, 70, 80, 60],
         wrap=True,
     )
 
@@ -626,6 +654,9 @@ def update_leaderboard_and_plots(category, filters):
     Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (in 95% confidence interval).
     See Figure 1 below for visualization of the confidence intervals of model scores.
 
+    **Rank (StyleCtrl)**: model's ranking with style control, which accounts for factors like response length and markdown usage to decouple model performance from these potential confounding variables.
+    See [blog post](https://blog.lmarena.ai/blog/2024/style-control/) for further details.
+
     Note: in each category, we exclude models with fewer than 300 votes as their confidence intervals can be large.
     """,
         elem_id="leaderboard_markdown",
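
Since the new column leans on the style-control method, here is a very rough illustrative sketch of the idea the note above describes, as presented in the linked blog post rather than taken from this repo's code: battle outcomes are fit with a Bradley-Terry-style logistic regression that includes extra style features (e.g. a response-length difference), so per-model coefficients are estimated while controlling for style. All names and data below are made up.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
n_models, n_battles = 4, 2000
a = rng.integers(0, n_models, n_battles)
b = rng.integers(0, n_models, n_battles)
keep = a != b
a, b = a[keep], b[keep]

# +1/-1 encoding of which model sat on each side of a battle.
X_id = np.zeros((len(a), n_models))
X_id[np.arange(len(a)), a] = 1.0
X_id[np.arange(len(a)), b] = -1.0

# One style feature: normalized response-length difference (random toy data).
len_diff = rng.normal(size=(len(a), 1))
X = np.hstack([X_id, len_diff])

# Simulated ground truth: model strengths plus a length bias that inflates wins.
strength = np.array([1.0, 0.5, 0.0, -0.5])
p_win = 1 / (1 + np.exp(-(X_id @ strength + 0.8 * len_diff.ravel())))
y = rng.random(len(a)) < p_win

fit = LogisticRegression(fit_intercept=False, C=1e6).fit(X, y)
model_scores = fit.coef_[0][:n_models]  # identified up to a constant shift
style_coef = fit.coef_[0][n_models]     # ~0.8: the length effect, now separated out
print(np.round(model_scores - model_scores.mean(), 2), round(style_coef, 2))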
@@ -833,9 +864,7 @@ def build_category_leaderboard_tab(
         "long_user",
         # "no_refusal",
     ]
-    # selected_categories_width = [95, 85, 100, 75, 120, 100, 95, 100,100]
     selected_categories_width = [110, 110, 110, 110, 110, 80, 80, 80, 80]
-    # selected_categories_width = [100] * len(selected_categories)
 
     language_categories = [
         "english",
