diff --git a/fastchat/constants.py b/fastchat/constants.py
index 53ed55c1c..24e1783af 100644
--- a/fastchat/constants.py
+++ b/fastchat/constants.py
@@ -15,6 +15,7 @@
 CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
 INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
 SLOW_MODEL_MSG = "⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds."
+RATE_LIMIT_MSG = "**RATE LIMIT OF THIS MODEL IS REACHED. PLEASE COME BACK LATER OR TRY OTHER MODELS.**"
 # Maximum input length
 INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000))
 # Maximum conversation turns
diff --git a/fastchat/conversation.py b/fastchat/conversation.py
index 46ffdcd19..ef6e316d1 100644
--- a/fastchat/conversation.py
+++ b/fastchat/conversation.py
@@ -276,7 +276,10 @@ def to_gradio_chatbot(self):

     def to_openai_api_messages(self):
         """Convert the conversation to OpenAI chat completion format."""
-        ret = [{"role": "system", "content": self.system_message}]
+        if self.system_message == "":
+            ret = []
+        else:
+            ret = [{"role": "system", "content": self.system_message}]

         for i, (_, msg) in enumerate(self.messages[self.offset :]):
             if i % 2 == 0:
@@ -679,6 +682,17 @@ def get_conv_template(name: str) -> Conversation:
     )
 )

+# Perplexity AI template
+register_conv_template(
+    Conversation(
+        name="pplxai",
+        system_message="Be precise and concise.",
+        roles=("user", "assistant"),
+        sep_style=None,
+        sep=None,
+    )
+)
+
 # Claude default template
 register_conv_template(
     Conversation(
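
Note on the `to_openai_api_messages` change: templates that set `system_message=""` no longer emit an empty system turn, which some OpenAI-compatible endpoints reject. A minimal sketch of the resulting behavior, using the `pplxai` template registered above (the commented output is what the new code should produce, not captured output):

```python
from fastchat.conversation import get_conv_template

conv = get_conv_template("pplxai")
conv.append_message(conv.roles[0], "What is the capital of France?")
conv.append_message(conv.roles[1], None)  # placeholder for the pending reply

# "Be precise and concise." is non-empty, so a system turn is emitted:
# [{'role': 'system', 'content': 'Be precise and concise.'},
#  {'role': 'user', 'content': 'What is the capital of France?'}]
print(conv.to_openai_api_messages())

conv.system_message = ""
# With an empty system message, the list now starts at the user turn.
print(conv.to_openai_api_messages())
```
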
diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index 3d73356e6..6578f8441 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -1038,8 +1038,12 @@ class ChatGPTAdapter(BaseModelAdapter):
     def match(self, model_path: str):
         return model_path in (
             "gpt-3.5-turbo",
+            "gpt-3.5-turbo-0301",
+            "gpt-3.5-turbo-0613",
             "gpt-3.5-turbo-1106",
             "gpt-4",
+            "gpt-4-0314",
+            "gpt-4-0613",
             "gpt-4-turbo",
         )

@@ -1063,6 +1067,22 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("chatgpt")


+class PplxAIAdapter(BaseModelAdapter):
+    """The model adapter for Perplexity AI"""
+
+    def match(self, model_path: str):
+        return model_path in (
+            "pplx-7b-online",
+            "pplx-70b-online",
+        )
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("pplxai")
+
+
 class ClaudeAdapter(BaseModelAdapter):
     """The model adapter for Claude"""

@@ -1102,6 +1122,19 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("bard")


+class GeminiAdapter(BaseModelAdapter):
+    """The model adapter for Gemini"""
+
+    def match(self, model_path: str):
+        return model_path in ["gemini-pro"]
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("bard")
+
+
 class BiLLaAdapter(BaseModelAdapter):
     """The model adapter for Neutralzz/BiLLa-7B-SFT"""

@@ -1420,7 +1453,7 @@ class MistralAdapter(BaseModelAdapter):
     """The model adapter for Mistral AI models"""

     def match(self, model_path: str):
-        return "mistral" in model_path.lower()
+        return "mistral" in model_path.lower() or "mixtral" in model_path.lower()

     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
@@ -2056,6 +2089,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
 register_model_adapter(PhoenixAdapter)
 register_model_adapter(BardAdapter)
 register_model_adapter(PaLM2Adapter)
+register_model_adapter(GeminiAdapter)
 register_model_adapter(ChatGPTAdapter)
 register_model_adapter(AzureOpenAIAdapter)
 register_model_adapter(ClaudeAdapter)
@@ -2107,6 +2141,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
 register_model_adapter(MicrosoftOrcaAdapter)
 register_model_adapter(XdanAdapter)
 register_model_adapter(YiAdapter)
+register_model_adapter(PplxAIAdapter)
 register_model_adapter(DeepseekCoderAdapter)
 register_model_adapter(DeepseekChatAdapter)
 register_model_adapter(MetaMathAdapter)
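
Context for where the two new API-only adapters are registered: FastChat resolves an adapter by scanning the registered list and returning the first one whose `match` accepts the path, so specific adapters must come before catch-alls. A minimal sketch of that first-match dispatch (simplified; FastChat's real `get_model_adapter` also normalizes the path, and these class bodies are illustrative):

```python
class PplxAIAdapter:
    """API-only adapter: matches exact model names, never loads local weights."""

    def match(self, model_path: str) -> bool:
        return model_path in ("pplx-7b-online", "pplx-70b-online")


class CatchAllAdapter:
    """Fallback that accepts anything, standing in for BaseModelAdapter."""

    def match(self, model_path: str) -> bool:
        return True


# Order matters: specific adapters first, the catch-all last.
adapters = [PplxAIAdapter(), CatchAllAdapter()]


def get_adapter(model_path: str):
    # Return the first registered adapter that claims this model path.
    return next(a for a in adapters if a.match(model_path))


assert isinstance(get_adapter("pplx-7b-online"), PplxAIAdapter)
assert isinstance(get_adapter("vicuna-7b"), CatchAllAdapter)
```
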
"OpenHermes-2.5-Mistral-7B", "https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B", "a mistral-based model fine-tuned on 1M GPT-4 outputs", ) + register_model_info( ["starling-lm-7b-alpha"], "Starling-LM-7B-alpha", "https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha", "an open model trained using RLAIF by Berkeley", ) + register_model_info( ["tulu-2-dpo-70b"], "Tulu 2", "https://huggingface.co/allenai/tulu-2-dpo-70b", - "Tulu 2 by UW/AllenAI", + "an instruction and RLHF model by UW/AllenAI", ) + register_model_info( ["yi-34b-chat", "yi-6b-chat"], "Yi-Chat", "https://huggingface.co/01-ai/Yi-34B-Chat", "A large language model by 01 AI", ) + +register_model_info( + ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"], + "Llama 2", + "https://ai.meta.com/llama/", + "open foundation and fine-tuned chat models by Meta", +) + register_model_info( [ "vicuna-33b", @@ -104,186 +146,203 @@ def get_model_info(name: str) -> ModelInfo: "https://lmsys.org/blog/2023-03-30-vicuna/", "a chat assistant fine-tuned on user-shared conversations by LMSYS", ) -register_model_info( - ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"], - "Llama 2", - "https://ai.meta.com/llama/", - "open foundation and fine-tuned chat models by Meta", -) + register_model_info( ["chatglm3-6b", "chatglm2-6b", "chatglm-6b"], "ChatGLM", "https://chatglm.cn/blog", "an open bilingual dialogue language model by Tsinghua University", ) + register_model_info( ["openchat-3.5"], "OpenChat 3.5", "https://github.com/imoneoi/openchat", "an open model fine-tuned on Mistral-7B using C-RLFT", ) -register_model_info( - ["mistral-7b-instruct"], - "Mistral", - "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1", - "a large language model by Mistral AI team", -) + register_model_info( ["zephyr-7b-beta", "zephyr-7b-alpha"], "Zephyr", "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha", "a chatbot fine-tuned from Mistral by Hugging Face", ) + register_model_info( ["catppt"], "CatPPT", "https://huggingface.co/rishiraj/CatPPT", "a chatbot fine-tuned from a SLERP merged model by Rishiraj Acharya", ) + register_model_info( ["qwen-14b-chat"], "Qwen", "https://huggingface.co/Qwen/Qwen-14B-Chat", "a large language model by Alibaba Cloud", ) + register_model_info( ["codellama-34b-instruct", "codellama-13b-instruct", "codellama-7b-instruct"], "Code Llama", "https://ai.meta.com/blog/code-llama-large-language-model-coding/", "open foundation models for code by Meta", ) + register_model_info( ["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"], "WizardLM", "https://github.com/nlpxucan/WizardLM", "an instruction-following LLM using evol-instruct by Microsoft", ) + register_model_info( ["wizardcoder-15b-v1.0"], "WizardLM", "https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder", "Empowering Code Large Language Models with Evol-Instruct", ) + register_model_info( ["mpt-7b-chat", "mpt-30b-chat"], "MPT-Chat", "https://www.mosaicml.com/blog/mpt-30b", "a chatbot fine-tuned from MPT by MosaicML", ) + register_model_info( ["guanaco-33b", "guanaco-65b"], "Guanaco", "https://github.com/artidoro/qlora", "a model fine-tuned with QLoRA by UW", ) + register_model_info( ["gpt4all-13b-snoozy"], "GPT4All-Snoozy", "https://github.com/nomic-ai/gpt4all", "a finetuned LLaMA model on assistant style data by Nomic AI", ) + register_model_info( ["koala-13b"], "Koala", "https://bair.berkeley.edu/blog/2023/04/03/koala", "a dialogue model for 
 register_model_info(
     ["RWKV-4-Raven-14B"],
     "RWKV-4-Raven",
     "https://huggingface.co/BlinkDL/rwkv-4-raven",
     "an RNN with transformer-level LLM performance",
 )
+
 register_model_info(
     ["alpaca-13b"],
     "Alpaca",
     "https://crfm.stanford.edu/2023/03/13/alpaca.html",
     "a model fine-tuned from LLaMA on instruction-following demonstrations by Stanford",
 )
+
 register_model_info(
     ["oasst-pythia-12b"],
     "OpenAssistant (oasst)",
     "https://open-assistant.io",
     "an Open Assistant for everyone by LAION",
 )
+
 register_model_info(
     ["oasst-sft-7-llama-30b"],
     "OpenAssistant (oasst)",
     "https://open-assistant.io",
     "an Open Assistant for everyone by LAION",
 )
+
 register_model_info(
     ["palm-2"],
     "PaLM 2 Chat",
     "https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023",
     "PaLM 2 for Chat (chat-bison@001) by Google",
 )
+
 register_model_info(
     ["llama-7b", "llama-13b"],
     "LLaMA",
     "https://arxiv.org/abs/2302.13971",
     "open and efficient foundation language models by Meta",
 )
+
 register_model_info(
     ["open-llama-7b-v2-open-instruct", "open-llama-7b-open-instruct"],
     "Open LLaMa (Open Instruct)",
     "https://medium.com/vmware-data-ml-blog/starter-llm-for-the-enterprise-instruction-tuning-openllama-7b-d05fc3bbaccc",
     "Open LLaMa fine-tuned on instruction-following data by VMware",
 )
+
 register_model_info(
     ["dolly-v2-12b"],
     "Dolly",
     "https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm",
     "an instruction-tuned open large language model by Databricks",
 )
+
 register_model_info(
     ["stablelm-tuned-alpha-7b"],
     "StableLM",
     "https://github.com/stability-AI/stableLM",
     "Stability AI language models",
 )
+
 register_model_info(
     ["codet5p-6b"],
     "CodeT5p-6b",
     "https://huggingface.co/Salesforce/codet5p-6b",
     "Code completion model released by Salesforce",
 )
+
 register_model_info(
     ["fastchat-t5-3b", "fastchat-t5-3b-v1.0"],
     "FastChat-T5",
     "https://huggingface.co/lmsys/fastchat-t5-3b-v1.0",
     "a chat assistant fine-tuned from FLAN-T5 by LMSYS",
 )
+
 register_model_info(
     ["phoenix-inst-chat-7b"],
     "Phoenix-7B",
     "https://huggingface.co/FreedomIntelligence/phoenix-inst-chat-7b",
     "a multilingual chat assistant fine-tuned from Bloomz to democratize ChatGPT across languages by CUHK(SZ)",
 )
+
 register_model_info(
     ["realm-7b-v1"],
     "ReaLM",
     "https://github.com/FreedomIntelligence/ReaLM",
     "A chatbot fine-tuned from LLaMA2 with data generated via iterative calls to UserGPT and ChatGPT by CUHK(SZ) and SRIBD.",
 )
+
 register_model_info(
     ["billa-7b-sft"],
     "BiLLa-7B-SFT",
     "https://huggingface.co/Neutralzz/BiLLa-7B-SFT",
     "an instruction-tuned bilingual LLaMA with enhanced reasoning ability by an independent researcher",
 )
+
 register_model_info(
     ["h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2"],
     "h2oGPT-GM-7b",
     "https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
     "an instruction-tuned OpenLLaMA with enhanced conversational ability by H2O.ai",
 )
+
 register_model_info(
     ["baize-v2-7b", "baize-v2-13b"],
     "Baize v2",
     "https://github.com/project-baize/baize-chatbot#v2",
     "A chatbot fine-tuned from LLaMA with ChatGPT self-chat data and Self-Distillation with Feedback (SDF) by UCSD and SYSU.",
 )
+
 register_model_info(
     [
         "airoboros-l2-7b-2.1",
@@ -295,6 +354,7 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/jondurbin/airoboros-l2-70b-2.1",
     "an instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4",
 )
+
 register_model_info(
     [
         "spicyboros-7b-2.2",
@@ -305,24 +365,28 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/jondurbin/spicyboros-70b-2.2",
     "de-aligned versions of the airoboros models",
 )
+
 register_model_info(
     ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"],
     "Robin-v2",
     "https://huggingface.co/OptimalScale/robin-7b-v2-delta",
     "A chatbot fine-tuned from LLaMA-7b, achieving competitive performance on chitchat, commonsense reasoning and instruction-following tasks, by OptimalScale, HKUST.",
 )
+
 register_model_info(
     ["manticore-13b-chat"],
     "Manticore 13B Chat",
     "https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg",
     "A chatbot fine-tuned from LlaMa across several CoT and chat datasets.",
 )
+
 register_model_info(
     ["redpajama-incite-7b-chat"],
     "RedPajama-INCITE-7B-Chat",
     "https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat",
     "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together",
 )
+
 register_model_info(
     [
         "falcon-7b",
@@ -336,60 +400,70 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/tiiuae/falcon-180B",
     "TII's flagship series of large language models",
 )
+
 register_model_info(
     ["tigerbot-7b-sft"],
     "Tigerbot",
     "https://huggingface.co/TigerResearch/tigerbot-7b-sft",
     "TigerBot is a large-scale language model (LLM) with multiple languages and tasks.",
 )
+
 register_model_info(
     ["internlm-chat-7b", "internlm-chat-7b-8k"],
     "InternLM",
     "https://huggingface.co/internlm/internlm-chat-7b",
     "InternLM is a multi-language large-scale language model (LLM), developed by SHLAB.",
 )
+
 register_model_info(
     ["Qwen-7B-Chat"],
     "Qwen",
     "https://huggingface.co/Qwen/Qwen-7B-Chat",
     "Qwen is a multi-language large-scale language model (LLM), developed by Damo Academy.",
 )
+
 register_model_info(
     ["Llama2-Chinese-13b-Chat", "LLama2-Chinese-13B"],
     "Llama2-Chinese",
     "https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat",
     "Llama2-Chinese is a multi-language large-scale language model (LLM), developed by FlagAlpha.",
 )
+
 register_model_info(
     ["Chinese-Alpaca-2-7B", "Chinese-Alpaca-2-13B"],
     "Chinese-Alpaca",
     "https://huggingface.co/hfl/chinese-alpaca-2-13b",
     "New extended Chinese vocabulary beyond Llama-2, open-sourcing the Chinese LLaMA-2 and Alpaca-2 LLMs.",
 )
+
 register_model_info(
     ["Vigogne-2-7B-Instruct", "Vigogne-2-13B-Instruct"],
     "Vigogne-Instruct",
     "https://huggingface.co/bofenghuang/vigogne-2-7b-instruct",
     "Vigogne-Instruct is a French large language model (LLM) optimized for instruction-following, developed by Bofeng Huang",
 )
+
 register_model_info(
     ["Vigogne-2-7B-Chat", "Vigogne-2-13B-Chat"],
     "Vigogne-Chat",
     "https://huggingface.co/bofenghuang/vigogne-2-7b-chat",
     "Vigogne-Chat is a French large language model (LLM) optimized for instruction-following and multi-turn dialogues, developed by Bofeng Huang",
 )
+
 register_model_info(
     ["stable-vicuna-13B-HF"],
     "stable-vicuna",
     "https://huggingface.co/TheBloke/stable-vicuna-13B-HF",
     "StableVicuna is a Vicuna model fine-tuned using RLHF via PPO on various conversational and instructional datasets.",
 )
+
 register_model_info(
-    ["deluxe-chat-v1", "deluxe-chat-v1.1"],
+    ["deluxe-chat-v1", "deluxe-chat-v1.1", "deluxe-chat-v1.2"],
     "DeluxeChat",
     "",
     "Deluxe Chat",
 )
+
 register_model_info(
     [
         "Xwin-LM-7B-V0.1",
@@ -421,7 +495,7 @@ def get_model_info(name: str) -> ModelInfo:
     ["dolphin-2.2.1-mistral-7b"],
     "dolphin-mistral",
     "https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b",
-    "An uncensored fine-tuned [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)",
+    "An uncensored fine-tuned Mistral 7B",
 )

 register_model_info(
@@ -434,6 +508,7 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/BAAI/AquilaChat2-34B",
     "Chat models developed by BAAI team",
 )
+
 register_model_info(
     ["xDAN-L1-Chat-v0.1"],
     "xDAN-L1-Chat",
@@ -447,10 +522,3 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/meta-math",
     "MetaMath is a finetune of Llama2 on [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) that specializes in mathematical reasoning.",
 )
-
-register_model_info(
-    ["upstage/SOLAR-10.7B-Instruct-v1.0"],
-    "SOLAR-10.7B-Instruct",
-    "https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0",
-    "A Llama2 fine-tune developed by upstage.ai that incorporates depth up-scaling.",
-)
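
A note on the `model_info = OrderedDict()` switch: plain dicts already preserve insertion order on Python 3.7+, so this is equivalent in practice, but the OrderedDict makes it explicit that registration order is now load-bearing; `get_model_list` in gradio_web_server.py (later in this diff) ranks dropdown entries by registry position. A small illustration with a simplified signature (the real registry stores ModelInfo namedtuples):

```python
from collections import OrderedDict

model_info = OrderedDict()


def register_model_info(full_names, simple_name, link, description):
    # One entry per alias, all sharing the same metadata.
    for name in full_names:
        model_info[name] = (simple_name, link, description)


register_model_info(
    ["mixtral-8x7b-instruct-v0.1", "mistral-7b-instruct"],
    "Mixtral of experts",
    "https://mistral.ai/news/mixtral-of-experts/",
    "A Mixture-of-Experts model by Mistral AI",
)
register_model_info(
    ["gemini-pro"],
    "Gemini",
    "https://blog.google/technology/ai/google-gemini-pro-imagen-duet-ai-update/",
    "Gemini by Google",
)

# Iteration follows registration order, which is why the most prominent
# models are now registered at the top of the file.
print(list(model_info))
# ['mixtral-8x7b-instruct-v0.1', 'mistral-7b-instruct', 'gemini-pro']
```
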
diff --git a/fastchat/serve/api_provider.py b/fastchat/serve/api_provider.py
index b223d4745..6f3ad7dda 100644
--- a/fastchat/serve/api_provider.py
+++ b/fastchat/serve/api_provider.py
@@ -111,41 +111,58 @@ def anthropic_api_stream_iter(model_name, prompt, temperature, top_p, max_new_to
 def init_palm_chat(model_name):
     import vertexai  # pip3 install google-cloud-aiplatform
     from vertexai.preview.language_models import ChatModel
+    from vertexai.preview.generative_models import GenerativeModel

     project_id = os.environ["GCP_PROJECT_ID"]
     location = "us-central1"
     vertexai.init(project=project_id, location=location)
-    chat_model = ChatModel.from_pretrained(model_name)
-    chat = chat_model.start_chat(examples=[])
+    if model_name in ["palm-2"]:
+        # According to release note, "chat-bison@001" is PaLM 2 for chat.
+        # https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023
+        model_name = "chat-bison@001"
+        chat_model = ChatModel.from_pretrained(model_name)
+        chat = chat_model.start_chat(examples=[])
+    elif model_name in ["gemini-pro"]:
+        model = GenerativeModel(model_name)
+        chat = model.start_chat()
     return chat


-def palm_api_stream_iter(chat, message, temperature, top_p, max_new_tokens):
+def palm_api_stream_iter(model_name, chat, message, temperature, top_p, max_new_tokens):
+    if model_name in ["gemini-pro"]:
+        max_new_tokens = max_new_tokens * 2
     parameters = {
         "temperature": temperature,
         "top_p": top_p,
         "max_output_tokens": max_new_tokens,
     }
     gen_params = {
-        "model": "palm-2",
+        "model": model_name,
         "prompt": message,
     }
     gen_params.update(parameters)
-    logger.info(f"==== request ====\n{gen_params}")
+    if model_name == "palm-2":
+        response = chat.send_message(message, **parameters)
+    else:
+        response = chat.send_message(message, generation_config=parameters, stream=True)

-    response = chat.send_message(message, **parameters)
-    content = response.text
+    logger.info(f"==== request ====\n{gen_params}")

-    pos = 0
-    while pos < len(content):
-        # This is a fancy way to simulate token generation latency combined
-        # with a Poisson process.
-        pos += random.randint(10, 20)
-        time.sleep(random.expovariate(50))
-        data = {
-            "text": content[:pos],
-            "error_code": 0,
+    try:
+        text = ""
+        for chunk in response:
+            text += chunk.text
+            data = {
+                "text": text,
+                "error_code": 0,
+            }
+            yield data
+    except Exception as e:
+        logger.error(f"==== error ====\n{e}")
+        yield {
+            "text": f"**API REQUEST ERROR** Reason: {e}\nPlease try again or increase the number of max tokens.",
+            "error_code": 1,
         }
-        yield data
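
The new `gemini-pro` branch streams partial text from Vertex AI instead of simulating token latency over a complete PaLM response. A standalone sketch of that code path, assuming `google-cloud-aiplatform` is installed, `GCP_PROJECT_ID` is set, and the preview SDK keeps this surface:

```python
import os

import vertexai  # pip3 install google-cloud-aiplatform
from vertexai.preview.generative_models import GenerativeModel

vertexai.init(project=os.environ["GCP_PROJECT_ID"], location="us-central1")
chat = GenerativeModel("gemini-pro").start_chat()

# Mirrors the loop in palm_api_stream_iter: accumulate each streamed delta
# so every yielded payload carries the full text so far.
text = ""
for chunk in chat.send_message(
    "Hello! Who are you?",
    generation_config={"temperature": 0.7, "top_p": 1.0, "max_output_tokens": 512},
    stream=True,
):
    text += chunk.text
    print({"text": text, "error_code": 0})
```
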
diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py
index 0abbb058d..6797290ba 100644
--- a/fastchat/serve/gradio_block_arena_anony.py
+++ b/fastchat/serve/gradio_block_arena_anony.py
@@ -161,14 +161,21 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
 SAMPLING_WEIGHTS = {
     # tier 0
     "gpt-4": 4,
+    "gpt-4-0314": 4,
     "gpt-4-turbo": 4,
-    "gpt-3.5-turbo": 2,
+    "gpt-3.5-turbo-0613": 2,
     "gpt-3.5-turbo-1106": 2,
     "claude-2.1": 4,
     "claude-2.0": 2,
     "claude-1": 2,
     "claude-instant-1": 4,
+    "gemini-pro": 4,
+    "pplx-7b-online": 4,
+    "pplx-70b-online": 4,
+    "solar-10.7b-instruct-v1.0": 2,
+    "mixtral-8x7b-instruct-v0.1": 4,
     "openhermes-2.5-mistral-7b": 2,
+    "dolphin-2.2.1-mistral-7b": 2,
     "wizardlm-70b": 2,
     "starling-lm-7b-alpha": 2,
     "tulu-2-dpo-70b": 2,
@@ -177,8 +184,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
     "openchat-3.5": 2,
     "chatglm3-6b": 2,
     # tier 1
-    "deluxe-chat-v1.1": 4,
-    "palm-2": 1.5,
+    "deluxe-chat-v1.2": 2,
     "llama-2-70b-chat": 1.5,
     "llama-2-13b-chat": 1.5,
     "codellama-34b-instruct": 1.5,
@@ -208,24 +214,57 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
     "llama-13b": 0.1,
     "chatglm-6b": 0.5,
     "deluxe-chat-v1": 4,
+    "palm-2": 1.5,
 }

 # target model sampling weights will be boosted.
 BATTLE_TARGETS = {
-    "gpt-4": {"claude-2.1", "gpt-4-turbo"},
-    "gpt-4-turbo": {"gpt-4", "gpt-3.5-turbo", "gpt-3.5-turbo-1106", "claude-2.1"},
-    "gpt-3.5-turbo": {"claude-instant-1", "gpt-4", "claude-2.1"},
-    "gpt-3.5-turbo-1106": {"claude-instant-1", "gpt-3.5-turbo"},
-    "claude-2.1": {"gpt-4-turbo", "gpt-4", "claude-1"},
-    "claude-2.0": {"gpt-4-turbo", "gpt-4", "claude-1"},
-    "claude-1": {"claude-2.1", "gpt-4", "gpt-3.5-turbo"},
+    "gpt-4": {"gpt-4-0314", "claude-2.1", "gpt-4-turbo"},
+    "gpt-4-0613": {"gpt-4-0314", "claude-2.1", "gpt-4-turbo"},
+    "gpt-4-0314": {"gpt-4-turbo", "gpt-4-0613", "claude-2.1", "gpt-3.5-turbo-0613"},
+    "gpt-4-turbo": {
+        "gpt-4-0613",
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-1106",
+        "claude-2.1",
+    },
+    "gpt-3.5-turbo-0613": {"claude-instant-1", "gpt-4-0613", "claude-2.1"},
+    "gpt-3.5-turbo-1106": {"gpt-4-0613", "claude-instant-1", "gpt-3.5-turbo-0613"},
+    "solar-10.7b-instruct-v1.0": {
+        "mixtral-8x7b-instruct-v0.1",
+        "gpt-3.5-turbo-0613",
+        "llama-2-70b-chat",
+    },
+    "mixtral-8x7b-instruct-v0.1": {
+        "gpt-3.5-turbo-1106",
+        "gpt-3.5-turbo-0613",
+        "gpt-4-turbo",
+        "llama-2-70b-chat",
+    },
+    "claude-2.1": {"gpt-4-turbo", "gpt-4-0613", "claude-1"},
+    "claude-2.0": {"gpt-4-turbo", "gpt-4-0613", "claude-1"},
+    "claude-1": {"claude-2.1", "gpt-4-0613", "gpt-3.5-turbo-0613"},
     "claude-instant-1": {"gpt-3.5-turbo-1106", "claude-2.1"},
-    "deluxe-chat-v1.1": {"gpt-4", "gpt-4-turbo"},
-    "openhermes-2.5-mistral-7b": {"gpt-3.5-turbo", "openchat-3.5", "zephyr-7b-beta"},
-    "starling-lm-7b-alpha": {"gpt-3.5-turbo", "openchat-3.5", "zephyr-7b-beta"},
-    "tulu-2-dpo-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"},
-    "yi-34b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"},
-    "openchat-3.5": {"gpt-3.5-turbo", "llama-2-70b-chat", "zephyr-7b-beta"},
+    "gemini-pro": {"gpt-4-turbo", "gpt-4-0613", "gpt-3.5-turbo-0613"},
+    "deluxe-chat-v1.1": {"gpt-4-0613", "gpt-4-turbo"},
+    "deluxe-chat-v1.2": {"gpt-4-0613", "gpt-4-turbo"},
+    "pplx-7b-online": {"gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "llama-2-70b-chat"},
+    "pplx-70b-online": {"gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "llama-2-70b-chat"},
+    "openhermes-2.5-mistral-7b": {
+        "gpt-3.5-turbo-0613",
+        "openchat-3.5",
+        "zephyr-7b-beta",
+    },
+    "dolphin-2.2.1-mistral-7b": {
+        "gpt-3.5-turbo-0613",
+        "vicuna-33b",
+        "starling-lm-7b-alpha",
+        "openhermes-2.5-mistral-7b",
+    },
+    "starling-lm-7b-alpha": {"gpt-3.5-turbo-0613", "openchat-3.5", "zephyr-7b-beta"},
+    "tulu-2-dpo-70b": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"},
+    "yi-34b-chat": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"},
+    "openchat-3.5": {"gpt-3.5-turbo-0613", "llama-2-70b-chat", "zephyr-7b-beta"},
     "chatglm3-6b": {"yi-34b-chat", "qwen-14b-chat"},
     "qwen-14b-chat": {"vicuna-13b", "llama-2-13b-chat", "llama-2-70b-chat"},
     "zephyr-7b-alpha": {"mistral-7b-instruct", "llama-2-13b-chat"},
@@ -235,7 +274,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
         "llama-2-7b-chat",
         "wizardlm-13b",
     },
-    "llama-2-70b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"},
+    "llama-2-70b-chat": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"},
     "llama-2-13b-chat": {"mistral-7b-instruct", "vicuna-13b", "llama-2-70b-chat"},
     "llama-2-7b-chat": {"mistral-7b-instruct", "vicuna-7b", "llama-2-13b-chat"},
     "mistral-7b-instruct": {
@@ -243,31 +282,29 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
         "llama-2-13b-chat",
         "llama-2-70b-chat",
     },
-    "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo", "claude-instant-1"},
+    "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo-0613", "claude-instant-1"},
     "vicuna-13b": {"llama-2-13b-chat", "llama-2-70b-chat"},
     "vicuna-7b": {"llama-2-7b-chat", "mistral-7b-instruct", "llama-2-13b-chat"},
-    "wizardlm-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"},
-    "palm-2": {"llama-2-13b-chat", "gpt-3.5-turbo"},
+    "wizardlm-70b": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"},
 }

 SAMPLING_BOOST_MODELS = [
-    "tulu-2-dpo-70b",
-    "yi-34b-chat",
+    # "tulu-2-dpo-70b",
+    # "yi-34b-chat",
     "claude-2.1",
-    "wizardlm-70b",
-    "starling-lm-7b-alpha",
-    "openhermes-2.5-mistral-7b",
-    "gpt-3.5-turbo-1106",
-    # "openchat-3.5",
-    # "gpt-4-turbo",
-    # "claude-1",
+    "claude-1",
+    "gpt-4-0613",
+    # "gpt-3.5-turbo-1106",
+    # "gpt-4-0314",
+    "gpt-4-turbo",
+    # "dolphin-2.2.1-mistral-7b",
+    "mixtral-8x7b-instruct-v0.1",
+    "gemini-pro",
+    "solar-10.7b-instruct-v1.0",
 ]

 # outage models won't be sampled.
-OUTAGE_MODELS = [
-    "zephyr-7b-alpha",
-    "falcon-180b-chat",
-]
+OUTAGE_MODELS = []


 def get_sample_weight(model):
@@ -291,6 +328,8 @@ def get_battle_pair():
     model_weights = model_weights / total_weight
     chosen_idx = np.random.choice(len(models), p=model_weights)
     chosen_model = models[chosen_idx]
+    # for p, w in zip(models, model_weights):
+    #     print(p, w)

     rival_models = []
     rival_weights = []
@@ -427,6 +466,7 @@ def bot_response_multi(
             top_p,
             max_new_tokens,
             request,
+            apply_rate_limit=False,
         )
     )

@@ -533,8 +573,8 @@ def build_side_by_side_ui_anony(models):
         )
         max_output_tokens = gr.Slider(
             minimum=16,
-            maximum=1024,
-            value=512,
+            maximum=2048,
+            value=1024,
             step=64,
             interactive=True,
             label="Max output tokens",
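
How the three tables above combine: `get_sample_weight` (not changed by this diff, so sketched here with an assumed boost factor) zeroes out outage models and upweights boosted ones before `get_battle_pair` normalizes the weights and samples a model:

```python
import numpy as np

SAMPLING_WEIGHTS = {"gpt-4-turbo": 4, "gemini-pro": 4, "vicuna-13b": 1.5}
SAMPLING_BOOST_MODELS = ["gemini-pro"]
OUTAGE_MODELS = []

BOOST_FACTOR = 5  # assumed for illustration; not taken from this diff


def get_sample_weight(model):
    # Outage models are never sampled; boosted models get extra weight.
    if model in OUTAGE_MODELS:
        return 0
    weight = SAMPLING_WEIGHTS.get(model, 1.0)
    if model in SAMPLING_BOOST_MODELS:
        weight *= BOOST_FACTOR
    return weight


models = list(SAMPLING_WEIGHTS)
model_weights = np.array([get_sample_weight(m) for m in models], dtype=float)
model_weights = model_weights / model_weights.sum()
chosen_model = models[np.random.choice(len(models), p=model_weights)]
print(chosen_model)
```
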
"gpt-3.5-turbo", "claude-instant-1"}, + "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo-0613", "claude-instant-1"}, "vicuna-13b": {"llama-2-13b-chat", "llama-2-70b-chat"}, "vicuna-7b": {"llama-2-7b-chat", "mistral-7b-instruct", "llama-2-13b-chat"}, - "wizardlm-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, - "palm-2": {"llama-2-13b-chat", "gpt-3.5-turbo"}, + "wizardlm-70b": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"}, } SAMPLING_BOOST_MODELS = [ - "tulu-2-dpo-70b", - "yi-34b-chat", + # "tulu-2-dpo-70b", + # "yi-34b-chat", "claude-2.1", - "wizardlm-70b", - "starling-lm-7b-alpha", - "openhermes-2.5-mistral-7b", - "gpt-3.5-turbo-1106", - # "openchat-3.5", - # "gpt-4-turbo", - # "claude-1", + "claude-1", + "gpt-4-0613", + # "gpt-3.5-turbo-1106", + # "gpt-4-0314", + "gpt-4-turbo", + # "dolphin-2.2.1-mistral-7b", + "mixtral-8x7b-instruct-v0.1", + "gemini-pro", + "solar-10.7b-instruct-v1.0", ] # outage models won't be sampled. -OUTAGE_MODELS = [ - "zephyr-7b-alpha", - "falcon-180b-chat", -] +OUTAGE_MODELS = [] def get_sample_weight(model): @@ -291,6 +328,8 @@ def get_battle_pair(): model_weights = model_weights / total_weight chosen_idx = np.random.choice(len(models), p=model_weights) chosen_model = models[chosen_idx] + # for p, w in zip(models, model_weights): + # print(p, w) rival_models = [] rival_weights = [] @@ -427,6 +466,7 @@ def bot_response_multi( top_p, max_new_tokens, request, + apply_rate_limit=False, ) ) @@ -533,8 +573,8 @@ def build_side_by_side_ui_anony(models): ) max_output_tokens = gr.Slider( minimum=16, - maximum=1024, - value=512, + maximum=2048, + value=1024, step=64, interactive=True, label="Max output tokens", diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 155338002..60823f06b 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -355,8 +355,8 @@ def build_side_by_side_ui_named(models): ) max_output_tokens = gr.Slider( minimum=16, - maximum=1024, - value=512, + maximum=2048, + value=1024, step=64, interactive=True, label="Max output tokens", diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 809833e99..745b527b8 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -21,6 +21,7 @@ ErrorCode, MODERATION_MSG, CONVERSATION_LIMIT_MSG, + RATE_LIMIT_MSG, SERVER_ERROR_MSG, INPUT_CHAR_LEN_LIMIT, CONVERSATION_TURN_LIMIT, @@ -91,10 +92,8 @@ def __init__(self, model_name): self.skip_next = False self.model_name = model_name - if model_name == "palm-2": - # According to release note, "chat-bison@001" is PaLM 2 for chat. 
diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py
index 809833e99..745b527b8 100644
--- a/fastchat/serve/gradio_web_server.py
+++ b/fastchat/serve/gradio_web_server.py
@@ -21,6 +21,7 @@
     ErrorCode,
     MODERATION_MSG,
     CONVERSATION_LIMIT_MSG,
+    RATE_LIMIT_MSG,
     SERVER_ERROR_MSG,
     INPUT_CHAR_LEN_LIMIT,
     CONVERSATION_TURN_LIMIT,
@@ -91,10 +92,8 @@ def __init__(self, model_name):
         self.skip_next = False
         self.model_name = model_name

-        if model_name == "palm-2":
-            # According to release note, "chat-bison@001" is PaLM 2 for chat.
-            # https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023
-            self.palm_chat = init_palm_chat("chat-bison@001")
+        if model_name in ["palm-2", "gemini-pro"]:
+            self.palm_chat = init_palm_chat(model_name)

     def to_gradio_chatbot(self):
         return self.conv.to_gradio_chatbot()
@@ -142,19 +141,24 @@ def get_model_list(
         models += list(openai_compatible_models_info.keys())

     if add_chatgpt:
-        models += ["gpt-3.5-turbo", "gpt-3.5-turbo-1106"]
+        models += [
+            "gpt-4-0314",
+            "gpt-4-0613",
+            "gpt-3.5-turbo-0613",
+            "gpt-3.5-turbo-1106",
+        ]
     if add_claude:
-        models += ["claude-2.0", "claude-2.1", "claude-instant-1"]
+        models += ["claude-2.1", "claude-2.0", "claude-instant-1"]
     if add_palm:
-        models += ["palm-2"]
+        models += ["gemini-pro"]
     models = list(set(models))

-    if "deluxe-chat-v1" in models:
-        del models[models.index("deluxe-chat-v1")]
-    if "deluxe-chat-v1.1" in models:
-        del models[models.index("deluxe-chat-v1.1")]
+    hidden_models = ["gpt-4-0314", "gpt-4-0613"]
+    for hm in hidden_models:
+        if hm in models:
+            del models[models.index(hm)]

-    priority = {k: f"___{i:02d}" for i, k in enumerate(model_info)}
+    priority = {k: f"___{i:03d}" for i, k in enumerate(model_info)}
     models.sort(key=lambda x: priority.get(x, x))
     logger.info(f"Models: {models}")
     return models
@@ -329,7 +332,14 @@ def model_worker_stream_iter(
         yield data


-def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request):
+def bot_response(
+    state,
+    temperature,
+    top_p,
+    max_new_tokens,
+    request: gr.Request,
+    apply_rate_limit=True,
+):
     ip = get_ip(request)
     logger.info(f"bot_response. ip: {ip}")
     start_tstamp = time.time()
@@ -356,7 +366,16 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request)
             api_base=model_info["api_base"],
             api_key=model_info["api_key"],
         )
-    elif model_name in ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-3.5-turbo-1106"]:
+    elif model_name in [
+        "gpt-3.5-turbo",
+        "gpt-3.5-turbo-0301",
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-1106",
+        "gpt-4",
+        "gpt-4-0314",
+        "gpt-4-0613",
+        "gpt-4-turbo",
+    ]:
         # avoid conflict with Azure OpenAI
         assert model_name not in openai_compatible_models_info
         prompt = conv.to_openai_api_messages()
@@ -368,9 +387,14 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request)
         stream_iter = anthropic_api_stream_iter(
             model_name, prompt, temperature, top_p, max_new_tokens
         )
-    elif model_name == "palm-2":
+    elif model_name in ["palm-2", "gemini-pro"]:
         stream_iter = palm_api_stream_iter(
-            state.palm_chat, conv.messages[-2][1], temperature, top_p, max_new_tokens
+            model_name,
+            state.palm_chat,
+            conv.messages[-2][1],
+            temperature,
+            top_p,
+            max_new_tokens,
         )
     else:
         # Query worker address
@@ -689,8 +713,8 @@ def build_single_model_ui(models, add_promotion_links=False):
         )
         max_output_tokens = gr.Slider(
             minimum=16,
-            maximum=1024,
-            value=512,
+            maximum=2048,
+            value=1024,
             step=64,
             interactive=True,
             label="Max output tokens",
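
The net effect of the `get_model_list` changes: the date-pinned GPT-4 endpoints are served but hidden from the non-anonymous dropdowns, and the registry-order sort key now uses three digits so it stays fixed-width beyond 100 registered models. A condensed sketch (the `model_info` stand-in substitutes for the real registry):

```python
model_info = {"gpt-4-turbo": None, "gemini-pro": None}  # stand-in registry


def finalize_model_list(models):
    models = list(set(models))
    # Hidden from direct chat and named battles; still sampled in anony battles.
    for hm in ["gpt-4-0314", "gpt-4-0613"]:
        if hm in models:
            del models[models.index(hm)]
    # "___NNN" sorts before plain names, so registered models come first,
    # in registration order; unknown models fall back to alphabetical order.
    priority = {k: f"___{i:03d}" for i, k in enumerate(model_info)}
    models.sort(key=lambda x: priority.get(x, x))
    return models


print(finalize_model_list(["gemini-pro", "gpt-4-0613", "gpt-4-turbo", "vicuna-13b"]))
# ['gpt-4-turbo', 'gemini-pro', 'vicuna-13b']
```
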
diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py
index 36ff5c8ce..0009c02ad 100644
--- a/fastchat/serve/gradio_web_server_multi.py
+++ b/fastchat/serve/gradio_web_server_multi.py
@@ -85,15 +85,22 @@ def load_demo(url_params, request: gr.Request):
         # Only enable these models in anony battles.
         if args.add_chatgpt:
             models_anony += [
-                "gpt-4",
-                "gpt-3.5-turbo",
-                "gpt-4-turbo",
+                "gpt-4-0314",
+                "gpt-4-0613",
+                "gpt-3.5-turbo-0613",
                 "gpt-3.5-turbo-1106",
             ]
         if args.add_claude:
             models_anony += ["claude-2.1", "claude-2.0", "claude-1", "claude-instant-1"]
         if args.add_palm:
-            models_anony += ["palm-2"]
+            models_anony += ["gemini-pro"]
+        anony_only_models = [
+            "claude-1",
+            "gpt-4-0314",
+            "gpt-4-0613",
+        ]
+        for mdl in anony_only_models:
+            models_anony.append(mdl)
         models_anony = list(set(models_anony))

     side_by_side_anony_updates = load_demo_side_by_side_anony(models_anony, url_params)
diff --git a/fastchat/serve/huggingface_api.py b/fastchat/serve/huggingface_api.py
index 2a49bf5f1..8022fbc93 100644
--- a/fastchat/serve/huggingface_api.py
+++ b/fastchat/serve/huggingface_api.py
@@ -61,7 +61,7 @@ def main(args):
     add_model_args(parser)
     parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--repetition_penalty", type=float, default=1.0)
-    parser.add_argument("--max-new-tokens", type=int, default=512)
+    parser.add_argument("--max-new-tokens", type=int, default=1024)
     parser.add_argument("--debug", action="store_true")
     parser.add_argument("--message", type=str, default="Hello! Who are you?")
     args = parser.parse_args()
diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index 580a2c866..59c48eb0e 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -22,7 +22,9 @@
 from fastchat.utils import build_logger, get_window_url_params_js


-notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
+notebook_url = (
+    "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH"
+)

 basic_component_values = [None] * 6

@@ -39,7 +41,7 @@ def make_leaderboard_md(elo_results):
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.

-💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: November, 2023.
+💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Dec 20, 2023.
 """
     return leaderboard_md