diff --git a/fastchat/serve/monitor/basic_stats.py b/fastchat/serve/monitor/basic_stats.py index e1934bb07..3c1a8793d 100644 --- a/fastchat/serve/monitor/basic_stats.py +++ b/fastchat/serve/monitor/basic_stats.py @@ -13,50 +13,60 @@ NUM_SERVERS = 14 +LOG_ROOT_DIR = "~/fastchat_logs" def get_log_files(max_num_files=None): - dates = [] - for month in range(4, 12): - for day in range(1, 33): - dates.append(f"2023-{month:02d}-{day:02d}") - + log_root = os.path.expanduser(LOG_ROOT_DIR) filenames = [] - for d in dates: - for i in range(NUM_SERVERS): - name = os.path.expanduser(f"~/fastchat_logs/server{i}/{d}-conv.json") - if os.path.exists(name): - filenames.append(name) + for i in range(NUM_SERVERS): + for filename in os.listdir(f"{log_root}/server{i}"): + if filename.endswith("-conv.json"): + filepath = f"{log_root}/server{i}/{filename}" + name_tstamp_tuple = (filepath, os.path.getmtime(filepath)) + filenames.append(name_tstamp_tuple) + # sort by tstamp + filenames = sorted(filenames, key=lambda x: x[1]) + filenames = [x[0] for x in filenames] + max_num_files = max_num_files or len(filenames) filenames = filenames[-max_num_files:] return filenames -def load_log_files(log_files): +def load_log_files(filename): data = [] - for filename in tqdm(log_files, desc="read files"): - for retry in range(5): - try: - lines = open(filename).readlines() - break - except FileNotFoundError: - time.sleep(2) - - for l in lines: - row = json.loads(l) - - data.append( - dict( - type=row["type"], - tstamp=row["tstamp"], - model=row.get("model", ""), - models=row.get("models", ["", ""]), - ) + for retry in range(5): + try: + lines = open(filename).readlines() + break + except FileNotFoundError: + time.sleep(2) + + for l in lines: + row = json.loads(l) + data.append( + dict( + type=row["type"], + tstamp=row["tstamp"], + model=row.get("model", ""), + models=row.get("models", ["", ""]), ) - + ) return data +def load_log_files_parallel(log_files, num_threads=16): + data_all = [] + from multiprocessing import Pool + + with Pool(num_threads) as p: + ret_all = list(tqdm(p.imap(load_log_files, log_files), total=len(log_files))) + for ret in ret_all: + data_all.extend(ret) + return data_all + + def get_anony_vote_df(df): anony_vote_df = df[ df["type"].isin(["leftvote", "rightvote", "tievote", "bothbad_vote"]) @@ -77,7 +87,7 @@ def merge_counts(series, on, names): def report_basic_stats(log_files): - df_all = load_log_files(log_files) + df_all = load_log_files_parallel(log_files) df_all = pd.DataFrame(df_all) now_t = df_all["tstamp"].max() df_1_hour = df_all[df_all["tstamp"] > (now_t - 3600)] diff --git a/fastchat/serve/monitor/clean_battle_data.py b/fastchat/serve/monitor/clean_battle_data.py index 09308f570..44c8192aa 100644 --- a/fastchat/serve/monitor/clean_battle_data.py +++ b/fastchat/serve/monitor/clean_battle_data.py @@ -27,6 +27,7 @@ "laion", "chatglm", "chatgpt", + "gpt-4", "openai", "anthropic", "claude", @@ -35,33 +36,26 @@ "lamda", "google", "llama", + "qianwan", + "alibaba", + "mistral", + "zhipu", + "KEG lab", + "01.AI", + "AI2", + "TΓΌlu", + "Tulu", "NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.", "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES.", "API REQUEST ERROR. 
Please increase the number of max tokens.",
     "**API REQUEST ERROR** Reason: The response was blocked.",
+    "**API REQUEST ERROR**",
 ]
 
 for i in range(len(IDENTITY_WORDS)):
     IDENTITY_WORDS[i] = IDENTITY_WORDS[i].lower()
 
 
-def get_log_files(max_num_files=None):
-    dates = []
-    for month in range(4, 13):
-        for day in range(1, 33):
-            dates.append(f"2023-{month:02d}-{day:02d}")
-
-    filenames = []
-    for d in dates:
-        for i in range(NUM_SERVERS):
-            name = os.path.expanduser(f"~/fastchat_logs/server{i}/{d}-conv.json")
-            if os.path.exists(name):
-                filenames.append(name)
-    max_num_files = max_num_files or len(filenames)
-    filenames = filenames[-max_num_files:]
-    return filenames
-
-
 def remove_html(raw):
     if raw.startswith("<h3>"):
         return raw[raw.find(": ") + 2 : -len("</h3>
\n")] @@ -76,29 +70,54 @@ def to_openai_format(messages): return ret -def replace_model_name(old_name): - return ( - old_name.replace("bard", "palm-2") - .replace("claude-v1", "claude-1") - .replace("claude-instant-v1", "claude-instant-1") - .replace("oasst-sft-1-pythia-12b", "oasst-pythia-12b") - ) +def replace_model_name(old_name, tstamp): + replace_dict = { + "bard": "palm-2", + "claude-v1": "claude-1", + "claude-instant-v1": "claude-instant-1", + "oasst-sft-1-pythia-12b": "oasst-pythia-12b", + "claude-2": "claude-2.0", + } + if old_name in ["gpt-4", "gpt-3.5-turbo"]: + if tstamp > 1687849200: + return old_name + "-0613" + else: + return old_name + "-0314" + if old_name in replace_dict: + return replace_dict[old_name] + return old_name -def clean_battle_data(log_files, exclude_model_names): +def read_file(filename): data = [] - for filename in tqdm(log_files, desc="read files"): - for retry in range(5): - try: - lines = open(filename).readlines() - break - except FileNotFoundError: - time.sleep(2) - - for l in lines: - row = json.loads(l) - if row["type"] in VOTES: - data.append(row) + for retry in range(5): + try: + # lines = open(filename).readlines() + for l in open(filename): + row = json.loads(l) + if row["type"] in VOTES: + data.append(row) + break + except FileNotFoundError: + time.sleep(2) + return data + + +def read_file_parallel(log_files, num_threads=16): + data_all = [] + from multiprocessing import Pool + + with Pool(num_threads) as p: + ret_all = list(tqdm(p.imap(read_file, log_files), total=len(log_files))) + for ret in ret_all: + data_all.extend(ret) + return data_all + + +def clean_battle_data( + log_files, exclude_model_names, ban_ip_list=None, sanitize_ip=False +): + data = read_file_parallel(log_files, num_threads=16) convert_type = { "leftvote": "model_a", @@ -112,6 +131,7 @@ def clean_battle_data(log_files, exclude_model_names): ct_anony = 0 ct_invalid = 0 ct_leaked_identity = 0 + ct_banned = 0 battles = [] for row in data: if row["models"][0] is None or row["models"][1] is None: @@ -158,7 +178,9 @@ def clean_battle_data(log_files, exclude_model_names): messages = "" for i in range(2): state = row["states"][i] - for role, msg in state["messages"][state["offset"] :]: + for turn_idx, (role, msg) in enumerate( + state["messages"][state["offset"] :] + ): if msg: messages += msg.lower() for word in IDENTITY_WORDS: @@ -171,10 +193,9 @@ def clean_battle_data(log_files, exclude_model_names): continue # Replace bard with palm - models = [replace_model_name(m) for m in models] - + models = [replace_model_name(m, row["tstamp"]) for m in models] # Exclude certain models - if any(x in exclude_model_names for x in models): + if exclude_model_names and any(x in exclude_model_names for x in models): ct_invalid += 1 continue @@ -188,8 +209,16 @@ def clean_battle_data(log_files, exclude_model_names): ip = row["ip"] if ip not in all_ips: - all_ips[ip] = len(all_ips) - user_id = all_ips[ip] + all_ips[ip] = {"ip": ip, "count": 0, "sanitized_id": len(all_ips)} + all_ips[ip]["count"] += 1 + if sanitize_ip: + user_id = f"arena_user_{all_ips[ip]['sanitized_id']}" + else: + user_id = f"{all_ips[ip]['ip']}" + + if ban_ip_list is not None and ip in ban_ip_list: + ct_banned += 1 + continue # Save the results battles.append( @@ -218,12 +247,19 @@ def clean_battle_data(log_files, exclude_model_names): print( f"#votes: {len(data)}, #invalid votes: {ct_invalid}, " - f"#leaked_identity: {ct_leaked_identity}" + f"#leaked_identity: {ct_leaked_identity} " + f"#banned: {ct_banned} " ) print(f"#battles: 
{len(battles)}, #anony: {ct_anony}") print(f"#models: {len(all_models)}, {all_models}") print(f"last-updated: {last_updated_datetime}") + if ban_ip_list is not None: + for ban_ip in ban_ip_list: + if ban_ip in all_ips: + del all_ips[ban_ip] + print("Top 30 IPs:") + print(sorted(all_ips.values(), key=lambda x: x["count"], reverse=True)[:30]) return battles @@ -234,10 +270,16 @@ def clean_battle_data(log_files, exclude_model_names): "--mode", type=str, choices=["simple", "conv_release"], default="simple" ) parser.add_argument("--exclude-model-names", type=str, nargs="+") + parser.add_argument("--ban-ip-file", type=str) + parser.add_argument("--sanitize-ip", action="store_true", default=False) args = parser.parse_args() log_files = get_log_files(args.max_num_files) - battles = clean_battle_data(log_files, args.exclude_model_names or []) + ban_ip_list = json.load(open(args.ban_ip_file)) if args.ban_ip_file else None + + battles = clean_battle_data( + log_files, args.exclude_model_names or [], ban_ip_list, args.sanitize_ip + ) last_updated_tstamp = battles[-1]["tstamp"] cutoff_date = datetime.datetime.fromtimestamp( last_updated_tstamp, tz=timezone("US/Pacific") diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py index e95f157c8..9b8d8c3fe 100644 --- a/fastchat/serve/monitor/elo_analysis.py +++ b/fastchat/serve/monitor/elo_analysis.py @@ -52,6 +52,41 @@ def get_bootstrap_result(battles, func_compute_elo, num_round=1000): return df[df.median().sort_values(ascending=False).index] +def compute_elo_mle_with_tie(df, SCALE=400, BASE=10, INIT_RATING=1000): + from sklearn.linear_model import LogisticRegression + + models = pd.concat([df["model_a"], df["model_b"]]).unique() + models = pd.Series(np.arange(len(models)), index=models) + + # duplicate battles + df = pd.concat([df, df], ignore_index=True) + p = len(models.index) + n = df.shape[0] + + X = np.zeros([n, p]) + X[np.arange(n), models[df["model_a"]]] = +math.log(BASE) + X[np.arange(n), models[df["model_b"]]] = -math.log(BASE) + + # one A win => two A win + Y = np.zeros(n) + Y[df["winner"] == "model_a"] = 1.0 + + # one tie => one A win + one B win + # find tie + tie (both bad) index + tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)") + tie_idx[len(tie_idx) // 2 :] = False + Y[tie_idx] = 1.0 + + lr = LogisticRegression(fit_intercept=False) + lr.fit(X, Y) + + elo_scores = SCALE * lr.coef_[0] + INIT_RATING + # calibrate llama-13b to 800 if applicable + if "llama-13b" in models.index: + elo_scores += 800 - elo_scores[models["llama-13b"]] + return pd.Series(elo_scores, index=models.index).sort_values(ascending=False) + + def get_median_elo_from_bootstrap(bootstrap_df): median = dict(bootstrap_df.quantile(0.5)) median = {k: int(v + 0.5) for k, v in median.items()} @@ -185,12 +220,12 @@ def visualize_average_win_rate(battles, limit_show_number): return fig -def visualize_bootstrap_elo_rating(df, limit_show_number): +def visualize_bootstrap_elo_rating(df, df_final, limit_show_number): bars = ( pd.DataFrame( dict( lower=df.quantile(0.025), - rating=df.quantile(0.5), + rating=df_final, upper=df.quantile(0.975), ) ) @@ -215,7 +250,7 @@ def visualize_bootstrap_elo_rating(df, limit_show_number): return fig -def report_elo_analysis_results(battles_json): +def report_elo_analysis_results(battles_json, rating_system="bt", num_bootstrap=100): battles = pd.DataFrame(battles_json) battles = battles.sort_values(ascending=True, by=["tstamp"]) # Only use anonymous votes @@ -225,24 +260,45 @@ def 
report_elo_analysis_results(battles_json): # Online update elo_rating_online = compute_elo(battles) - # Bootstrap - bootstrap_df = get_bootstrap_result(battles, compute_elo) - elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df) - model_order = list(elo_rating_median.keys()) - model_order.sort(key=lambda k: -elo_rating_median[k]) + if rating_system == "bt": + bootstrap_df = get_bootstrap_result( + battles, compute_elo_mle_with_tie, num_round=num_bootstrap + ) + elo_rating_final = compute_elo_mle_with_tie(battles) + elif rating_system == "elo": + bootstrap_df = get_bootstrap_result( + battles, compute_elo, num_round=num_bootstrap + ) + elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df) + elo_rating_final = elo_rating_median + + model_order = list(elo_rating_final.keys()) + model_order.sort(key=lambda k: -elo_rating_final[k]) limit_show_number = 25 # limit show number to make plots smaller model_order = model_order[:limit_show_number] + # leaderboard_table_df: elo rating, variance, 95% interval, number of battles + leaderboard_table_df = pd.DataFrame( + { + "rating": elo_rating_final, + "variance": bootstrap_df.var(), + "rating_q975": bootstrap_df.quantile(0.975), + "rating_q025": bootstrap_df.quantile(0.025), + "num_battles": battles["model_a"].value_counts() + + battles["model_b"].value_counts(), + } + ) + # Plots - leaderboard_table = visualize_leaderboard_table(elo_rating_median) + leaderboard_table = visualize_leaderboard_table(elo_rating_final) win_fraction_heatmap = visualize_pairwise_win_fraction(battles_no_ties, model_order) battle_count_heatmap = visualize_battle_count(battles_no_ties, model_order) average_win_rate_bar = visualize_average_win_rate( battles_no_ties, limit_show_number ) bootstrap_elo_rating = visualize_bootstrap_elo_rating( - bootstrap_df, limit_show_number + bootstrap_df, elo_rating_final, limit_show_number ) last_updated_tstamp = battles["tstamp"].max() @@ -251,8 +307,9 @@ def report_elo_analysis_results(battles_json): ).strftime("%Y-%m-%d %H:%M:%S %Z") return { + "rating_system": rating_system, "elo_rating_online": elo_rating_online, - "elo_rating_median": elo_rating_median, + "elo_rating_final": elo_rating_final, "leaderboard_table": leaderboard_table, "win_fraction_heatmap": win_fraction_heatmap, "battle_count_heatmap": battle_count_heatmap, @@ -260,6 +317,8 @@ def report_elo_analysis_results(battles_json): "bootstrap_elo_rating": bootstrap_elo_rating, "last_updated_datetime": last_updated_datetime, "last_updated_tstamp": last_updated_tstamp, + "bootstrap_df": bootstrap_df, + "leaderboard_table_df": leaderboard_table_df, } @@ -274,6 +333,11 @@ def pretty_print_elo_rating(rating): parser = argparse.ArgumentParser() parser.add_argument("--clean-battle-file", type=str) parser.add_argument("--max-num-files", type=int) + parser.add_argument("--num-bootstrap", type=int, default=100) + parser.add_argument( + "--rating-system", type=str, choices=["bt", "elo"], default="bt" + ) + parser.add_argument("--exclude-tie", action="store_true", default=False) args = parser.parse_args() np.random.seed(42) @@ -286,12 +350,14 @@ def pretty_print_elo_rating(rating): log_files = get_log_files(args.max_num_files) battles = clean_battle_data(log_files) - results = report_elo_analysis_results(battles) + results = report_elo_analysis_results( + battles, rating_system=args.rating_system, num_bootstrap=args.num_bootstrap + ) - print("# Online") + print("# Online Elo") pretty_print_elo_rating(results["elo_rating_online"]) print("# Median") - 
pretty_print_elo_rating(results["elo_rating_median"]) + pretty_print_elo_rating(results["elo_rating_final"]) print(f"last update : {results['last_updated_datetime']}") last_updated_tstamp = results["last_updated_tstamp"] diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 59c48eb0e..1998681a0 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -8,11 +8,13 @@ import argparse import ast +import json import pickle import os import threading import time +import pandas as pd import gradio as gr import numpy as np @@ -26,22 +28,41 @@ "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH" ) - basic_component_values = [None] * 6 leader_component_values = [None] * 5 -def make_leaderboard_md(elo_results): +def make_default_md(arena_df, elo_results): + total_votes = sum(arena_df["num_battles"]) // 2 + total_models = len(arena_df) + + leaderboard_md = f""" +# πŸ† LMSYS Chatbot Arena Leaderboard +| [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | + +LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals. +We've collected over **200,000** human preference votes to rank LLMs with the Elo ranking system. +""" + return leaderboard_md + + +def make_arena_leaderboard_md(arena_df): + total_votes = sum(arena_df["num_battles"]) // 2 + total_models = len(arena_df) + leaderboard_md = f""" -# πŸ† Chatbot Arena Leaderboard -| [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | +Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: Jan 9, 2024. -This leaderboard is based on the following three benchmarks. -- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 100K+ user votes to compute Elo ratings. -- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. -- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks. +Contribute your vote πŸ—³οΈ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}). +""" + return leaderboard_md -πŸ’» Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Dec 20, 2023. + +def make_full_leaderboard_md(elo_results): + leaderboard_md = f""" +Two more benchmarks are displayed: **MT-Bench** and **MMLU**. 
+- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. +- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks. """ return leaderboard_md @@ -55,12 +76,17 @@ def make_leaderboard_md_live(elo_results): return leaderboard_md -def update_elo_components(max_num_files, elo_results_file): +def update_elo_components( + max_num_files, elo_results_file, ban_ip_file, exclude_model_names +): log_files = get_log_files(max_num_files) # Leaderboard if elo_results_file is None: # Do live update - battles = clean_battle_data(log_files, []) + ban_ip_list = json.load(open(ban_ip_file)) if ban_ip_file else None + battles = clean_battle_data( + log_files, exclude_model_names, ban_ip_list=ban_ip_list + ) elo_results = report_elo_analysis_results(battles) leader_component_values[0] = make_leaderboard_md_live(elo_results) @@ -93,10 +119,14 @@ def update_elo_components(max_num_files, elo_results_file): basic_component_values[5] = md4 -def update_worker(max_num_files, interval, elo_results_file): +def update_worker( + max_num_files, interval, elo_results_file, ban_ip_file, exclude_model_names +): while True: tic = time.time() - update_elo_components(max_num_files, elo_results_file) + update_elo_components( + max_num_files, elo_results_file, ban_ip_file, exclude_model_names + ) durtaion = time.time() - tic print(f"update duration: {durtaion:.2f} s") time.sleep(max(interval - durtaion, 0)) @@ -168,90 +198,186 @@ def build_basic_stats_tab(): return [md0, plot_1, md1, md2, md3, md4] -def build_leaderboard_tab(elo_results_file, leaderboard_table_file): +def get_full_table(arena_df, model_table_df): + values = [] + for i in range(len(model_table_df)): + row = [] + model_key = model_table_df.iloc[i]["key"] + model_name = model_table_df.iloc[i]["Model"] + # model display name + row.append(model_name) + if model_key in arena_df.index: + idx = arena_df.index.get_loc(model_key) + row.append(round(arena_df.iloc[idx]["rating"])) + else: + row.append(np.nan) + row.append(model_table_df.iloc[i]["MT-bench (score)"]) + row.append(model_table_df.iloc[i]["MMLU"]) + # Organization + row.append(model_table_df.iloc[i]["Organization"]) + # license + row.append(model_table_df.iloc[i]["License"]) + + values.append(row) + values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9) + return values + + +def get_arena_table(arena_df, model_table_df): + # sort by rating + arena_df = arena_df.sort_values(by=["rating"], ascending=False) + values = [] + for i in range(len(arena_df)): + row = [] + model_key = arena_df.index[i] + model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[ + 0 + ] + + # rank + row.append(i + 1) + # model display name + row.append(model_name) + # elo rating + row.append(round(arena_df.iloc[i]["rating"])) + upper_diff = round(arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]) + lower_diff = round(arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]) + row.append(f"+{upper_diff}/-{lower_diff}") + # num battles + row.append(round(arena_df.iloc[i]["num_battles"])) + # Organization + row.append( + model_table_df[model_table_df["key"] == model_key]["Organization"].values[0] + ) + # license + row.append( + model_table_df[model_table_df["key"] == model_key]["License"].values[0] + ) + + values.append(row) + return values + + +def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False): if elo_results_file is None: # Do live 
update - md = "Loading ..." + default_md = "Loading ..." p1 = p2 = p3 = p4 = None else: with open(elo_results_file, "rb") as fin: elo_results = pickle.load(fin) - md = make_leaderboard_md(elo_results) p1 = elo_results["win_fraction_heatmap"] p2 = elo_results["battle_count_heatmap"] p3 = elo_results["bootstrap_elo_rating"] p4 = elo_results["average_win_rate_bar"] + arena_df = elo_results["leaderboard_table_df"] + default_md = make_default_md(arena_df, elo_results) - md_1 = gr.Markdown(md, elem_id="leaderboard_markdown") - + md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown") if leaderboard_table_file: data = load_leaderboard_table_csv(leaderboard_table_file) - headers = [ - "Model", - "Arena Elo rating", - "MT-bench (score)", - "MMLU", - "License", - ] - values = [] - for item in data: - row = [] - for key in headers: - value = item[key] - row.append(value) - values.append(row) - values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9) - - headers[1] = "⭐ " + headers[1] - headers[2] = "πŸ“ˆ " + headers[2] - - gr.Dataframe( - headers=headers, - datatype=["markdown", "number", "number", "number", "str"], - value=values, - elem_id="leaderboard_dataframe", - ) - gr.Markdown( - """ ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis! - If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model). - """, - elem_id="leaderboard_markdown", - ) + model_table_df = pd.DataFrame(data) + + with gr.Tabs() as tabs: + # arena table + arena_table_vals = get_arena_table(arena_df, model_table_df) + with gr.Tab("Arena Elo", id=0): + md = make_arena_leaderboard_md(arena_df) + gr.Markdown(md, elem_id="leaderboard_markdown") + gr.Dataframe( + headers=[ + "Rank", + "πŸ€– Model", + "⭐ Arena Elo", + "πŸ“Š 95% CI", + "πŸ—³οΈ Votes", + "Organization", + "License", + ], + datatype=[ + "str", + "markdown", + "number", + "str", + "number", + "str", + "str", + ], + value=arena_table_vals, + elem_id="arena_leaderboard_dataframe", + height=700, + column_widths=[50, 200, 100, 100, 100, 150, 150], + wrap=True, + ) + with gr.Tab("Full Leaderboard", id=1): + md = make_full_leaderboard_md(elo_results) + gr.Markdown(md, elem_id="leaderboard_markdown") + full_table_vals = get_full_table(arena_df, model_table_df) + gr.Dataframe( + headers=[ + "πŸ€– Model", + "⭐ Arena Elo", + "πŸ“ˆ MT-bench", + "πŸ“š MMLU", + "Organization", + "License", + ], + datatype=["markdown", "number", "number", "number", "str", "str"], + value=full_table_vals, + elem_id="full_leaderboard_dataframe", + column_widths=[200, 100, 100, 100, 150, 150], + height=700, + wrap=True, + ) + if not show_plot: + gr.Markdown( + """ ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis! + If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model). + """, + elem_id="leaderboard_markdown", + ) else: pass - leader_component_values[:] = [md, p1, p2, p3, p4] + leader_component_values[:] = [default_md, p1, p2, p3, p4] - """ - with gr.Row(): - with gr.Column(): - gr.Markdown( - "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. 
B Battles" - ) - plot_1 = gr.Plot(p1, show_label=False) - with gr.Column(): - gr.Markdown( - "#### Figure 2: Battle Count for Each Combination of Models (without Ties)" - ) - plot_2 = gr.Plot(p2, show_label=False) - with gr.Row(): - with gr.Column(): - gr.Markdown( - "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)" - ) - plot_3 = gr.Plot(p3, show_label=False) - with gr.Column(): - gr.Markdown( - "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)" - ) - plot_4 = gr.Plot(p4, show_label=False) - """ + if show_plot: + gr.Markdown( + f"""## More Statistics for Chatbot Arena\n +Below are figures for more statistics. The code for generating them is also included in this [notebook]({notebook_url}). +You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/). + """, + elem_id="leaderboard_markdown", + ) + with gr.Row(): + with gr.Column(): + gr.Markdown( + "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles" + ) + plot_1 = gr.Plot(p1, show_label=False) + with gr.Column(): + gr.Markdown( + "#### Figure 2: Battle Count for Each Combination of Models (without Ties)" + ) + plot_2 = gr.Plot(p2, show_label=False) + with gr.Row(): + with gr.Column(): + gr.Markdown( + "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)" + ) + plot_3 = gr.Plot(p3, show_label=False) + with gr.Column(): + gr.Markdown( + "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)" + ) + plot_4 = gr.Plot(p4, show_label=False) from fastchat.serve.gradio_web_server import acknowledgment_md gr.Markdown(acknowledgment_md) - # return [md_1, plot_1, plot_2, plot_3, plot_4] + if show_plot: + return [md_1, plot_1, plot_2, plot_3, plot_4] return [md_1] @@ -268,7 +394,9 @@ def build_demo(elo_results_file, leaderboard_table_file): with gr.Tabs() as tabs: with gr.Tab("Leaderboard", id=0): leader_components = build_leaderboard_tab( - elo_results_file, leaderboard_table_file + elo_results_file, + leaderboard_table_file, + show_plot=True, ) with gr.Tab("Basic Stats", id=1): @@ -295,6 +423,8 @@ def build_demo(elo_results_file, leaderboard_table_file): parser.add_argument("--max-num-files", type=int) parser.add_argument("--elo-results-file", type=str) parser.add_argument("--leaderboard-table-file", type=str) + parser.add_argument("--ban-ip-file", type=str) + parser.add_argument("--exclude-model-names", type=str, nargs="+") args = parser.parse_args() logger = build_logger("monitor", "monitor.log") @@ -303,7 +433,13 @@ def build_demo(elo_results_file, leaderboard_table_file): if args.elo_results_file is None: # Do live update update_thread = threading.Thread( target=update_worker, - args=(args.max_num_files, args.update_interval, args.elo_results_file), + args=( + args.max_num_files, + args.update_interval, + args.elo_results_file, + args.ban_ip_file, + args.exclude_model_names, + ), ) update_thread.start()