diff --git a/fastchat/constants.py b/fastchat/constants.py
index 53ed55c1c..24e1783af 100644
--- a/fastchat/constants.py
+++ b/fastchat/constants.py
@@ -15,6 +15,7 @@
 CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
 INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
 SLOW_MODEL_MSG = "⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds."
+RATE_LIMIT_MSG = "**RATE LIMIT OF THIS MODEL IS REACHED. PLEASE COME BACK LATER OR TRY OTHER MODELS.**"
 # Maximum input length
 INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000))
 # Maximum conversation turns
diff --git a/fastchat/conversation.py b/fastchat/conversation.py
index 46ffdcd19..ef6e316d1 100644
--- a/fastchat/conversation.py
+++ b/fastchat/conversation.py
@@ -276,7 +276,10 @@ def to_gradio_chatbot(self):

     def to_openai_api_messages(self):
         """Convert the conversation to OpenAI chat completion format."""
-        ret = [{"role": "system", "content": self.system_message}]
+        if self.system_message == "":
+            ret = []
+        else:
+            ret = [{"role": "system", "content": self.system_message}]

         for i, (_, msg) in enumerate(self.messages[self.offset :]):
             if i % 2 == 0:
@@ -679,6 +682,17 @@ def get_conv_template(name: str) -> Conversation:
     )
 )

+# Perplexity AI template
+register_conv_template(
+    Conversation(
+        name="pplxai",
+        system_message="Be precise and concise.",
+        roles=("user", "assistant"),
+        sep_style=None,
+        sep=None,
+    )
+)
+
 # Claude default template
 register_conv_template(
     Conversation(
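
Note on the `to_openai_api_messages` change: templates that set `system_message=""` no longer emit an empty system turn, which some OpenAI-compatible endpoints reject. A minimal sketch of the resulting behavior, using the `pplxai` template registered above (the commented output is what the new code should produce, not captured output):

```python
from fastchat.conversation import get_conv_template

conv = get_conv_template("pplxai")
conv.append_message(conv.roles[0], "What is the capital of France?")
conv.append_message(conv.roles[1], None)  # placeholder for the pending reply

# "Be precise and concise." is non-empty, so a system turn is emitted:
# [{'role': 'system', 'content': 'Be precise and concise.'},
#  {'role': 'user', 'content': 'What is the capital of France?'}]
print(conv.to_openai_api_messages())

conv.system_message = ""
# With an empty system message, the list now starts at the user turn.
print(conv.to_openai_api_messages())
```
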
diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index 3d73356e6..6578f8441 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -1038,8 +1038,12 @@ class ChatGPTAdapter(BaseModelAdapter):
     def match(self, model_path: str):
         return model_path in (
             "gpt-3.5-turbo",
+            "gpt-3.5-turbo-0301",
+            "gpt-3.5-turbo-0613",
             "gpt-3.5-turbo-1106",
             "gpt-4",
+            "gpt-4-0314",
+            "gpt-4-0613",
             "gpt-4-turbo",
         )

@@ -1063,6 +1067,22 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("chatgpt")


+class PplxAIAdapter(BaseModelAdapter):
+    """The model adapter for Perplexity AI"""
+
+    def match(self, model_path: str):
+        return model_path in (
+            "pplx-7b-online",
+            "pplx-70b-online",
+        )
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("pplxai")
+
+
 class ClaudeAdapter(BaseModelAdapter):
     """The model adapter for Claude"""

@@ -1102,6 +1122,19 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("bard")


+class GeminiAdapter(BaseModelAdapter):
+    """The model adapter for Gemini"""
+
+    def match(self, model_path: str):
+        return model_path in ["gemini-pro"]
+
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("bard")
+
+
 class BiLLaAdapter(BaseModelAdapter):
     """The model adapter for Neutralzz/BiLLa-7B-SFT"""

@@ -1420,7 +1453,7 @@ class MistralAdapter(BaseModelAdapter):
     """The model adapter for Mistral AI models"""

     def match(self, model_path: str):
-        return "mistral" in model_path.lower()
+        return "mistral" in model_path.lower() or "mixtral" in model_path.lower()

     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
@@ -2056,6 +2089,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
 register_model_adapter(PhoenixAdapter)
 register_model_adapter(BardAdapter)
 register_model_adapter(PaLM2Adapter)
+register_model_adapter(GeminiAdapter)
 register_model_adapter(ChatGPTAdapter)
 register_model_adapter(AzureOpenAIAdapter)
 register_model_adapter(ClaudeAdapter)
@@ -2107,6 +2141,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
 register_model_adapter(MicrosoftOrcaAdapter)
 register_model_adapter(XdanAdapter)
 register_model_adapter(YiAdapter)
+register_model_adapter(PplxAIAdapter)
 register_model_adapter(DeepseekCoderAdapter)
 register_model_adapter(DeepseekChatAdapter)
 register_model_adapter(MetaMathAdapter)
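
Context for where the two new API-only adapters are registered: FastChat resolves an adapter by scanning the registered list and returning the first one whose `match` accepts the path, so specific adapters must come before catch-alls. A minimal sketch of that first-match dispatch (simplified; FastChat's real `get_model_adapter` also normalizes the path, and these class bodies are illustrative):

```python
class PplxAIAdapter:
    """API-only adapter: matches exact model names, never loads local weights."""

    def match(self, model_path: str) -> bool:
        return model_path in ("pplx-7b-online", "pplx-70b-online")


class CatchAllAdapter:
    """Fallback that accepts anything, standing in for BaseModelAdapter."""

    def match(self, model_path: str) -> bool:
        return True


# Order matters: specific adapters first, the catch-all last.
adapters = [PplxAIAdapter(), CatchAllAdapter()]


def get_adapter(model_path: str):
    # Return the first registered adapter that claims this model path.
    return next(a for a in adapters if a.match(model_path))


assert isinstance(get_adapter("pplx-7b-online"), PplxAIAdapter)
assert isinstance(get_adapter("vicuna-7b"), CatchAllAdapter)
```
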
"OpenHermes-2.5-Mistral-7B", "https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B", "a mistral-based model fine-tuned on 1M GPT-4 outputs", ) + register_model_info( ["starling-lm-7b-alpha"], "Starling-LM-7B-alpha", "https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha", "an open model trained using RLAIF by Berkeley", ) + register_model_info( ["tulu-2-dpo-70b"], "Tulu 2", "https://huggingface.co/allenai/tulu-2-dpo-70b", - "Tulu 2 by UW/AllenAI", + "an instruction and RLHF model by UW/AllenAI", ) + register_model_info( ["yi-34b-chat", "yi-6b-chat"], "Yi-Chat", "https://huggingface.co/01-ai/Yi-34B-Chat", "A large language model by 01 AI", ) + +register_model_info( + ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"], + "Llama 2", + "https://ai.meta.com/llama/", + "open foundation and fine-tuned chat models by Meta", +) + register_model_info( [ "vicuna-33b", @@ -104,186 +146,203 @@ def get_model_info(name: str) -> ModelInfo: "https://lmsys.org/blog/2023-03-30-vicuna/", "a chat assistant fine-tuned on user-shared conversations by LMSYS", ) -register_model_info( - ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"], - "Llama 2", - "https://ai.meta.com/llama/", - "open foundation and fine-tuned chat models by Meta", -) + register_model_info( ["chatglm3-6b", "chatglm2-6b", "chatglm-6b"], "ChatGLM", "https://chatglm.cn/blog", "an open bilingual dialogue language model by Tsinghua University", ) + register_model_info( ["openchat-3.5"], "OpenChat 3.5", "https://github.com/imoneoi/openchat", "an open model fine-tuned on Mistral-7B using C-RLFT", ) -register_model_info( - ["mistral-7b-instruct"], - "Mistral", - "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1", - "a large language model by Mistral AI team", -) + register_model_info( ["zephyr-7b-beta", "zephyr-7b-alpha"], "Zephyr", "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha", "a chatbot fine-tuned from Mistral by Hugging Face", ) + register_model_info( ["catppt"], "CatPPT", "https://huggingface.co/rishiraj/CatPPT", "a chatbot fine-tuned from a SLERP merged model by Rishiraj Acharya", ) + register_model_info( ["qwen-14b-chat"], "Qwen", "https://huggingface.co/Qwen/Qwen-14B-Chat", "a large language model by Alibaba Cloud", ) + register_model_info( ["codellama-34b-instruct", "codellama-13b-instruct", "codellama-7b-instruct"], "Code Llama", "https://ai.meta.com/blog/code-llama-large-language-model-coding/", "open foundation models for code by Meta", ) + register_model_info( ["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"], "WizardLM", "https://github.com/nlpxucan/WizardLM", "an instruction-following LLM using evol-instruct by Microsoft", ) + register_model_info( ["wizardcoder-15b-v1.0"], "WizardLM", "https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder", "Empowering Code Large Language Models with Evol-Instruct", ) + register_model_info( ["mpt-7b-chat", "mpt-30b-chat"], "MPT-Chat", "https://www.mosaicml.com/blog/mpt-30b", "a chatbot fine-tuned from MPT by MosaicML", ) + register_model_info( ["guanaco-33b", "guanaco-65b"], "Guanaco", "https://github.com/artidoro/qlora", "a model fine-tuned with QLoRA by UW", ) + register_model_info( ["gpt4all-13b-snoozy"], "GPT4All-Snoozy", "https://github.com/nomic-ai/gpt4all", "a finetuned LLaMA model on assistant style data by Nomic AI", ) + register_model_info( ["koala-13b"], "Koala", "https://bair.berkeley.edu/blog/2023/04/03/koala", "a dialogue model for 
 register_model_info(
     ["RWKV-4-Raven-14B"],
     "RWKV-4-Raven",
     "https://huggingface.co/BlinkDL/rwkv-4-raven",
     "an RNN with transformer-level LLM performance",
 )
+
 register_model_info(
     ["alpaca-13b"],
     "Alpaca",
     "https://crfm.stanford.edu/2023/03/13/alpaca.html",
     "a model fine-tuned from LLaMA on instruction-following demonstrations by Stanford",
 )
+
 register_model_info(
     ["oasst-pythia-12b"],
     "OpenAssistant (oasst)",
     "https://open-assistant.io",
     "an Open Assistant for everyone by LAION",
 )
+
 register_model_info(
     ["oasst-sft-7-llama-30b"],
     "OpenAssistant (oasst)",
     "https://open-assistant.io",
     "an Open Assistant for everyone by LAION",
 )
+
 register_model_info(
     ["palm-2"],
     "PaLM 2 Chat",
     "https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023",
     "PaLM 2 for Chat (chat-bison@001) by Google",
 )
+
 register_model_info(
     ["llama-7b", "llama-13b"],
     "LLaMA",
     "https://arxiv.org/abs/2302.13971",
     "open and efficient foundation language models by Meta",
 )
+
 register_model_info(
     ["open-llama-7b-v2-open-instruct", "open-llama-7b-open-instruct"],
     "Open LLaMa (Open Instruct)",
     "https://medium.com/vmware-data-ml-blog/starter-llm-for-the-enterprise-instruction-tuning-openllama-7b-d05fc3bbaccc",
     "Open LLaMa fine-tuned on instruction-following data by VMware",
 )
+
 register_model_info(
     ["dolly-v2-12b"],
     "Dolly",
     "https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm",
     "an instruction-tuned open large language model by Databricks",
 )
+
 register_model_info(
     ["stablelm-tuned-alpha-7b"],
     "StableLM",
     "https://github.com/stability-AI/stableLM",
     "Stability AI language models",
 )
+
 register_model_info(
     ["codet5p-6b"],
     "CodeT5p-6b",
     "https://huggingface.co/Salesforce/codet5p-6b",
     "Code completion model released by Salesforce",
 )
+
 register_model_info(
     ["fastchat-t5-3b", "fastchat-t5-3b-v1.0"],
     "FastChat-T5",
     "https://huggingface.co/lmsys/fastchat-t5-3b-v1.0",
     "a chat assistant fine-tuned from FLAN-T5 by LMSYS",
 )
+
 register_model_info(
     ["phoenix-inst-chat-7b"],
     "Phoenix-7B",
     "https://huggingface.co/FreedomIntelligence/phoenix-inst-chat-7b",
     "a multilingual chat assistant fine-tuned from Bloomz to democratize ChatGPT across languages by CUHK(SZ)",
 )
+
 register_model_info(
     ["realm-7b-v1"],
     "ReaLM",
     "https://github.com/FreedomIntelligence/ReaLM",
     "A chatbot fine-tuned from LLaMA2 with data generated via iterative calls to UserGPT and ChatGPT by CUHK(SZ) and SRIBD.",
 )
+
 register_model_info(
     ["billa-7b-sft"],
     "BiLLa-7B-SFT",
     "https://huggingface.co/Neutralzz/BiLLa-7B-SFT",
     "an instruction-tuned bilingual LLaMA with enhanced reasoning ability by an independent researcher",
 )
+
 register_model_info(
     ["h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2"],
     "h2oGPT-GM-7b",
     "https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
     "an instruction-tuned OpenLLaMA with enhanced conversational ability by H2O.ai",
 )
+
 register_model_info(
     ["baize-v2-7b", "baize-v2-13b"],
     "Baize v2",
     "https://github.com/project-baize/baize-chatbot#v2",
     "A chatbot fine-tuned from LLaMA with ChatGPT self-chat data and Self-Distillation with Feedback (SDF) by UCSD and SYSU.",
 )
+
 register_model_info(
     [
         "airoboros-l2-7b-2.1",
@@ -295,6 +354,7 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/jondurbin/airoboros-l2-70b-2.1",
     "an instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4",
 )
+
 register_model_info(
     [
         "spicyboros-7b-2.2",
@@ -305,24 +365,28 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/jondurbin/spicyboros-70b-2.2",
     "de-aligned versions of the airoboros models",
 )
+
 register_model_info(
     ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"],
     "Robin-v2",
     "https://huggingface.co/OptimalScale/robin-7b-v2-delta",
     "A chatbot fine-tuned from LLaMA-7b, achieving competitive performance on chitchat, commonsense reasoning and instruction-following tasks, by OptimalScale, HKUST.",
 )
+
 register_model_info(
     ["manticore-13b-chat"],
     "Manticore 13B Chat",
     "https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg",
     "A chatbot fine-tuned from LlaMa across several CoT and chat datasets.",
 )
+
 register_model_info(
     ["redpajama-incite-7b-chat"],
     "RedPajama-INCITE-7B-Chat",
     "https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat",
     "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together",
 )
+
 register_model_info(
     [
         "falcon-7b",
@@ -336,60 +400,70 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/tiiuae/falcon-180B",
     "TII's flagship series of large language models",
 )
+
 register_model_info(
     ["tigerbot-7b-sft"],
     "Tigerbot",
     "https://huggingface.co/TigerResearch/tigerbot-7b-sft",
     "TigerBot is a large-scale language model (LLM) with multiple languages and tasks.",
 )
+
 register_model_info(
     ["internlm-chat-7b", "internlm-chat-7b-8k"],
     "InternLM",
     "https://huggingface.co/internlm/internlm-chat-7b",
     "InternLM is a multi-language large-scale language model (LLM), developed by SHLAB.",
 )
+
 register_model_info(
     ["Qwen-7B-Chat"],
     "Qwen",
     "https://huggingface.co/Qwen/Qwen-7B-Chat",
     "Qwen is a multi-language large-scale language model (LLM), developed by Damo Academy.",
 )
+
 register_model_info(
     ["Llama2-Chinese-13b-Chat", "LLama2-Chinese-13B"],
     "Llama2-Chinese",
     "https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat",
     "Llama2-Chinese is a multi-language large-scale language model (LLM), developed by FlagAlpha.",
 )
+
 register_model_info(
     ["Chinese-Alpaca-2-7B", "Chinese-Alpaca-2-13B"],
     "Chinese-Alpaca",
     "https://huggingface.co/hfl/chinese-alpaca-2-13b",
     "New extended Chinese vocabulary beyond Llama-2, open-sourcing the Chinese LLaMA-2 and Alpaca-2 LLMs.",
 )
+
 register_model_info(
     ["Vigogne-2-7B-Instruct", "Vigogne-2-13B-Instruct"],
     "Vigogne-Instruct",
     "https://huggingface.co/bofenghuang/vigogne-2-7b-instruct",
     "Vigogne-Instruct is a French large language model (LLM) optimized for instruction-following, developed by Bofeng Huang",
 )
+
 register_model_info(
     ["Vigogne-2-7B-Chat", "Vigogne-2-13B-Chat"],
     "Vigogne-Chat",
     "https://huggingface.co/bofenghuang/vigogne-2-7b-chat",
     "Vigogne-Chat is a French large language model (LLM) optimized for instruction-following and multi-turn dialogues, developed by Bofeng Huang",
 )
+
 register_model_info(
     ["stable-vicuna-13B-HF"],
     "stable-vicuna",
     "https://huggingface.co/TheBloke/stable-vicuna-13B-HF",
     "StableVicuna is a Vicuna model fine-tuned using RLHF via PPO on various conversational and instructional datasets.",
 )
+
 register_model_info(
-    ["deluxe-chat-v1", "deluxe-chat-v1.1"],
+    ["deluxe-chat-v1", "deluxe-chat-v1.1", "deluxe-chat-v1.2"],
     "DeluxeChat",
     "",
     "Deluxe Chat",
 )
+
 register_model_info(
     [
         "Xwin-LM-7B-V0.1",
@@ -421,7 +495,7 @@ def get_model_info(name: str) -> ModelInfo:
     ["dolphin-2.2.1-mistral-7b"],
     "dolphin-mistral",
     "https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b",
-    "An uncensored fine-tuned [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)",
+    "An uncensored fine-tuned Mistral 7B",
 )

 register_model_info(
@@ -434,6 +508,7 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/BAAI/AquilaChat2-34B",
     "Chat models developed by BAAI team",
 )
+
 register_model_info(
     ["xDAN-L1-Chat-v0.1"],
     "xDAN-L1-Chat",
@@ -447,10 +522,3 @@ def get_model_info(name: str) -> ModelInfo:
     "https://huggingface.co/meta-math",
     "MetaMath is a finetune of Llama2 on [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) that specializes in mathematical reasoning.",
 )
-
-register_model_info(
-    ["upstage/SOLAR-10.7B-Instruct-v1.0"],
-    "SOLAR-10.7B-Instruct",
-    "https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0",
-    "A Llama2 fine-tune developed by upstage.ai that incorporates depth up-scaling.",
-)
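
A note on the `model_info = OrderedDict()` switch: plain dicts already preserve insertion order on Python 3.7+, so this is equivalent in practice, but the OrderedDict makes it explicit that registration order is now load-bearing; `get_model_list` in gradio_web_server.py (later in this diff) ranks dropdown entries by registry position. A small illustration with a simplified signature (the real registry stores ModelInfo namedtuples):

```python
from collections import OrderedDict

model_info = OrderedDict()


def register_model_info(full_names, simple_name, link, description):
    # One entry per alias, all sharing the same metadata.
    for name in full_names:
        model_info[name] = (simple_name, link, description)


register_model_info(
    ["mixtral-8x7b-instruct-v0.1", "mistral-7b-instruct"],
    "Mixtral of experts",
    "https://mistral.ai/news/mixtral-of-experts/",
    "A Mixture-of-Experts model by Mistral AI",
)
register_model_info(
    ["gemini-pro"],
    "Gemini",
    "https://blog.google/technology/ai/google-gemini-pro-imagen-duet-ai-update/",
    "Gemini by Google",
)

# Iteration follows registration order, which is why the most prominent
# models are now registered at the top of the file.
print(list(model_info))
# ['mixtral-8x7b-instruct-v0.1', 'mistral-7b-instruct', 'gemini-pro']
```
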
diff --git a/fastchat/serve/api_provider.py b/fastchat/serve/api_provider.py
index b223d4745..6f3ad7dda 100644
--- a/fastchat/serve/api_provider.py
+++ b/fastchat/serve/api_provider.py
@@ -111,41 +111,58 @@ def anthropic_api_stream_iter(model_name, prompt, temperature, top_p, max_new_to
 def init_palm_chat(model_name):
     import vertexai  # pip3 install google-cloud-aiplatform
     from vertexai.preview.language_models import ChatModel
+    from vertexai.preview.generative_models import GenerativeModel

     project_id = os.environ["GCP_PROJECT_ID"]
     location = "us-central1"
     vertexai.init(project=project_id, location=location)
-    chat_model = ChatModel.from_pretrained(model_name)
-    chat = chat_model.start_chat(examples=[])
+    if model_name in ["palm-2"]:
+        # According to release note, "chat-bison@001" is PaLM 2 for chat.
+        # https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023
+        model_name = "chat-bison@001"
+        chat_model = ChatModel.from_pretrained(model_name)
+        chat = chat_model.start_chat(examples=[])
+    elif model_name in ["gemini-pro"]:
+        model = GenerativeModel(model_name)
+        chat = model.start_chat()
     return chat


-def palm_api_stream_iter(chat, message, temperature, top_p, max_new_tokens):
+def palm_api_stream_iter(model_name, chat, message, temperature, top_p, max_new_tokens):
+    if model_name in ["gemini-pro"]:
+        max_new_tokens = max_new_tokens * 2
     parameters = {
         "temperature": temperature,
         "top_p": top_p,
         "max_output_tokens": max_new_tokens,
     }
     gen_params = {
-        "model": "palm-2",
+        "model": model_name,
         "prompt": message,
     }
     gen_params.update(parameters)
-    logger.info(f"==== request ====\n{gen_params}")
+    if model_name == "palm-2":
+        response = chat.send_message(message, **parameters)
+    else:
+        response = chat.send_message(message, generation_config=parameters, stream=True)

-    response = chat.send_message(message, **parameters)
-    content = response.text
+    logger.info(f"==== request ====\n{gen_params}")

-    pos = 0
-    while pos < len(content):
-        # This is a fancy way to simulate token generation latency combined
-        # with a Poisson process.
-        pos += random.randint(10, 20)
-        time.sleep(random.expovariate(50))
-        data = {
-            "text": content[:pos],
-            "error_code": 0,
+    try:
+        text = ""
+        for chunk in response:
+            text += chunk.text
+            data = {
+                "text": text,
+                "error_code": 0,
+            }
+            yield data
+    except Exception as e:
+        logger.error(f"==== error ====\n{e}")
+        yield {
+            "text": f"**API REQUEST ERROR** Reason: {e}\nPlease try again or increase the number of max tokens.",
+            "error_code": 1,
         }
-        yield data
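
The new `gemini-pro` branch streams partial text from Vertex AI instead of simulating token latency over a complete PaLM response. A standalone sketch of that code path, assuming `google-cloud-aiplatform` is installed, `GCP_PROJECT_ID` is set, and the preview SDK keeps this surface:

```python
import os

import vertexai  # pip3 install google-cloud-aiplatform
from vertexai.preview.generative_models import GenerativeModel

vertexai.init(project=os.environ["GCP_PROJECT_ID"], location="us-central1")
chat = GenerativeModel("gemini-pro").start_chat()

# Mirrors the loop in palm_api_stream_iter: accumulate each streamed delta
# so every yielded payload carries the full text so far.
text = ""
for chunk in chat.send_message(
    "Hello! Who are you?",
    generation_config={"temperature": 0.7, "top_p": 1.0, "max_output_tokens": 512},
    stream=True,
):
    text += chunk.text
    print({"text": text, "error_code": 0})
```
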
diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py
index 0abbb058d..6797290ba 100644
--- a/fastchat/serve/gradio_block_arena_anony.py
+++ b/fastchat/serve/gradio_block_arena_anony.py
@@ -161,14 +161,21 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
 SAMPLING_WEIGHTS = {
     # tier 0
     "gpt-4": 4,
+    "gpt-4-0314": 4,
     "gpt-4-turbo": 4,
-    "gpt-3.5-turbo": 2,
+    "gpt-3.5-turbo-0613": 2,
     "gpt-3.5-turbo-1106": 2,
     "claude-2.1": 4,
     "claude-2.0": 2,
     "claude-1": 2,
     "claude-instant-1": 4,
+    "gemini-pro": 4,
+    "pplx-7b-online": 4,
+    "pplx-70b-online": 4,
+    "solar-10.7b-instruct-v1.0": 2,
+    "mixtral-8x7b-instruct-v0.1": 4,
     "openhermes-2.5-mistral-7b": 2,
+    "dolphin-2.2.1-mistral-7b": 2,
     "wizardlm-70b": 2,
     "starling-lm-7b-alpha": 2,
     "tulu-2-dpo-70b": 2,
@@ -177,8 +184,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
     "openchat-3.5": 2,
     "chatglm3-6b": 2,
     # tier 1
-    "deluxe-chat-v1.1": 4,
-    "palm-2": 1.5,
+    "deluxe-chat-v1.2": 2,
     "llama-2-70b-chat": 1.5,
     "llama-2-13b-chat": 1.5,
     "codellama-34b-instruct": 1.5,
@@ -208,24 +214,57 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
     "llama-13b": 0.1,
     "chatglm-6b": 0.5,
     "deluxe-chat-v1": 4,
+    "palm-2": 1.5,
 }

 # target model sampling weights will be boosted.
 BATTLE_TARGETS = {
-    "gpt-4": {"claude-2.1", "gpt-4-turbo"},
-    "gpt-4-turbo": {"gpt-4", "gpt-3.5-turbo", "gpt-3.5-turbo-1106", "claude-2.1"},
-    "gpt-3.5-turbo": {"claude-instant-1", "gpt-4", "claude-2.1"},
-    "gpt-3.5-turbo-1106": {"claude-instant-1", "gpt-3.5-turbo"},
-    "claude-2.1": {"gpt-4-turbo", "gpt-4", "claude-1"},
-    "claude-2.0": {"gpt-4-turbo", "gpt-4", "claude-1"},
-    "claude-1": {"claude-2.1", "gpt-4", "gpt-3.5-turbo"},
+    "gpt-4": {"gpt-4-0314", "claude-2.1", "gpt-4-turbo"},
+    "gpt-4-0613": {"gpt-4-0314", "claude-2.1", "gpt-4-turbo"},
+    "gpt-4-0314": {"gpt-4-turbo", "gpt-4-0613", "claude-2.1", "gpt-3.5-turbo-0613"},
+    "gpt-4-turbo": {
+        "gpt-4-0613",
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-1106",
+        "claude-2.1",
+    },
+    "gpt-3.5-turbo-0613": {"claude-instant-1", "gpt-4-0613", "claude-2.1"},
+    "gpt-3.5-turbo-1106": {"gpt-4-0613", "claude-instant-1", "gpt-3.5-turbo-0613"},
+    "solar-10.7b-instruct-v1.0": {
+        "mixtral-8x7b-instruct-v0.1",
+        "gpt-3.5-turbo-0613",
+        "llama-2-70b-chat",
+    },
+    "mixtral-8x7b-instruct-v0.1": {
+        "gpt-3.5-turbo-1106",
+        "gpt-3.5-turbo-0613",
+        "gpt-4-turbo",
+        "llama-2-70b-chat",
+    },
+    "claude-2.1": {"gpt-4-turbo", "gpt-4-0613", "claude-1"},
+    "claude-2.0": {"gpt-4-turbo", "gpt-4-0613", "claude-1"},
+    "claude-1": {"claude-2.1", "gpt-4-0613", "gpt-3.5-turbo-0613"},
     "claude-instant-1": {"gpt-3.5-turbo-1106", "claude-2.1"},
-    "deluxe-chat-v1.1": {"gpt-4", "gpt-4-turbo"},
-    "openhermes-2.5-mistral-7b": {"gpt-3.5-turbo", "openchat-3.5", "zephyr-7b-beta"},
-    "starling-lm-7b-alpha": {"gpt-3.5-turbo", "openchat-3.5", "zephyr-7b-beta"},
-    "tulu-2-dpo-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"},
-    "yi-34b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"},
-    "openchat-3.5": {"gpt-3.5-turbo", "llama-2-70b-chat", "zephyr-7b-beta"},
+    "gemini-pro": {"gpt-4-turbo", "gpt-4-0613", "gpt-3.5-turbo-0613"},
+    "deluxe-chat-v1.1": {"gpt-4-0613", "gpt-4-turbo"},
+    "deluxe-chat-v1.2": {"gpt-4-0613", "gpt-4-turbo"},
+    "pplx-7b-online": {"gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "llama-2-70b-chat"},
+    "pplx-70b-online": {"gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "llama-2-70b-chat"},
+    "openhermes-2.5-mistral-7b": {
+        "gpt-3.5-turbo-0613",
+        "openchat-3.5",
+        "zephyr-7b-beta",
+    },
+    "dolphin-2.2.1-mistral-7b": {
+        "gpt-3.5-turbo-0613",
+        "vicuna-33b",
+        "starling-lm-7b-alpha",
+        "openhermes-2.5-mistral-7b",
+    },
+    "starling-lm-7b-alpha": {"gpt-3.5-turbo-0613", "openchat-3.5", "zephyr-7b-beta"},
+    "tulu-2-dpo-70b": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"},
+    "yi-34b-chat": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"},
+    "openchat-3.5": {"gpt-3.5-turbo-0613", "llama-2-70b-chat", "zephyr-7b-beta"},
     "chatglm3-6b": {"yi-34b-chat", "qwen-14b-chat"},
     "qwen-14b-chat": {"vicuna-13b", "llama-2-13b-chat", "llama-2-70b-chat"},
     "zephyr-7b-alpha": {"mistral-7b-instruct", "llama-2-13b-chat"},
@@ -235,7 +274,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
         "llama-2-7b-chat",
         "wizardlm-13b",
     },
-    "llama-2-70b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"},
+    "llama-2-70b-chat": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"},
     "llama-2-13b-chat": {"mistral-7b-instruct", "vicuna-13b", "llama-2-70b-chat"},
     "llama-2-7b-chat": {"mistral-7b-instruct", "vicuna-7b", "llama-2-13b-chat"},
     "mistral-7b-instruct": {
@@ -243,31 +282,29 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
         "llama-2-13b-chat",
         "llama-2-70b-chat",
     },
-    "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo", "claude-instant-1"},
+    "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo-0613", "claude-instant-1"},
     "vicuna-13b": {"llama-2-13b-chat", "llama-2-70b-chat"},
     "vicuna-7b": {"llama-2-7b-chat", "mistral-7b-instruct", "llama-2-13b-chat"},
-    "wizardlm-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"},
-    "palm-2": {"llama-2-13b-chat", "gpt-3.5-turbo"},
+    "wizardlm-70b": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"},
 }

 SAMPLING_BOOST_MODELS = [
-    "tulu-2-dpo-70b",
-    "yi-34b-chat",
+    # "tulu-2-dpo-70b",
+    # "yi-34b-chat",
     "claude-2.1",
-    "wizardlm-70b",
-    "starling-lm-7b-alpha",
-    "openhermes-2.5-mistral-7b",
-    "gpt-3.5-turbo-1106",
-    # "openchat-3.5",
-    # "gpt-4-turbo",
-    # "claude-1",
+    "claude-1",
+    "gpt-4-0613",
+    # "gpt-3.5-turbo-1106",
+    # "gpt-4-0314",
+    "gpt-4-turbo",
+    # "dolphin-2.2.1-mistral-7b",
+    "mixtral-8x7b-instruct-v0.1",
+    "gemini-pro",
+    "solar-10.7b-instruct-v1.0",
 ]

 # outage models won't be sampled.
-OUTAGE_MODELS = [
-    "zephyr-7b-alpha",
-    "falcon-180b-chat",
-]
+OUTAGE_MODELS = []


 def get_sample_weight(model):
@@ -291,6 +328,8 @@ def get_battle_pair():
     model_weights = model_weights / total_weight
     chosen_idx = np.random.choice(len(models), p=model_weights)
     chosen_model = models[chosen_idx]
+    # for p, w in zip(models, model_weights):
+    #     print(p, w)

     rival_models = []
     rival_weights = []
@@ -427,6 +466,7 @@ def bot_response_multi(
             top_p,
             max_new_tokens,
             request,
+            apply_rate_limit=False,
         )
     )

@@ -533,8 +573,8 @@ def build_side_by_side_ui_anony(models):
         )
         max_output_tokens = gr.Slider(
             minimum=16,
-            maximum=1024,
-            value=512,
+            maximum=2048,
+            value=1024,
             step=64,
             interactive=True,
             label="Max output tokens",
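
How the three tables above combine: `get_sample_weight` (not changed by this diff, so sketched here with an assumed boost factor) zeroes out outage models and upweights boosted ones before `get_battle_pair` normalizes the weights and samples a model:

```python
import numpy as np

SAMPLING_WEIGHTS = {"gpt-4-turbo": 4, "gemini-pro": 4, "vicuna-13b": 1.5}
SAMPLING_BOOST_MODELS = ["gemini-pro"]
OUTAGE_MODELS = []

BOOST_FACTOR = 5  # assumed for illustration; not taken from this diff


def get_sample_weight(model):
    # Outage models are never sampled; boosted models get extra weight.
    if model in OUTAGE_MODELS:
        return 0
    weight = SAMPLING_WEIGHTS.get(model, 1.0)
    if model in SAMPLING_BOOST_MODELS:
        weight *= BOOST_FACTOR
    return weight


models = list(SAMPLING_WEIGHTS)
model_weights = np.array([get_sample_weight(m) for m in models], dtype=float)
model_weights = model_weights / model_weights.sum()
chosen_model = models[np.random.choice(len(models), p=model_weights)]
print(chosen_model)
```
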
"gpt-3.5-turbo", "claude-instant-1"}, + "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo-0613", "claude-instant-1"}, "vicuna-13b": {"llama-2-13b-chat", "llama-2-70b-chat"}, "vicuna-7b": {"llama-2-7b-chat", "mistral-7b-instruct", "llama-2-13b-chat"}, - "wizardlm-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, - "palm-2": {"llama-2-13b-chat", "gpt-3.5-turbo"}, + "wizardlm-70b": {"gpt-3.5-turbo-0613", "vicuna-33b", "claude-instant-1"}, } SAMPLING_BOOST_MODELS = [ - "tulu-2-dpo-70b", - "yi-34b-chat", + # "tulu-2-dpo-70b", + # "yi-34b-chat", "claude-2.1", - "wizardlm-70b", - "starling-lm-7b-alpha", - "openhermes-2.5-mistral-7b", - "gpt-3.5-turbo-1106", - # "openchat-3.5", - # "gpt-4-turbo", - # "claude-1", + "claude-1", + "gpt-4-0613", + # "gpt-3.5-turbo-1106", + # "gpt-4-0314", + "gpt-4-turbo", + # "dolphin-2.2.1-mistral-7b", + "mixtral-8x7b-instruct-v0.1", + "gemini-pro", + "solar-10.7b-instruct-v1.0", ] # outage models won't be sampled. -OUTAGE_MODELS = [ - "zephyr-7b-alpha", - "falcon-180b-chat", -] +OUTAGE_MODELS = [] def get_sample_weight(model): @@ -291,6 +328,8 @@ def get_battle_pair(): model_weights = model_weights / total_weight chosen_idx = np.random.choice(len(models), p=model_weights) chosen_model = models[chosen_idx] + # for p, w in zip(models, model_weights): + # print(p, w) rival_models = [] rival_weights = [] @@ -427,6 +466,7 @@ def bot_response_multi( top_p, max_new_tokens, request, + apply_rate_limit=False, ) ) @@ -533,8 +573,8 @@ def build_side_by_side_ui_anony(models): ) max_output_tokens = gr.Slider( minimum=16, - maximum=1024, - value=512, + maximum=2048, + value=1024, step=64, interactive=True, label="Max output tokens", diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 155338002..60823f06b 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -355,8 +355,8 @@ def build_side_by_side_ui_named(models): ) max_output_tokens = gr.Slider( minimum=16, - maximum=1024, - value=512, + maximum=2048, + value=1024, step=64, interactive=True, label="Max output tokens", diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 809833e99..745b527b8 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -21,6 +21,7 @@ ErrorCode, MODERATION_MSG, CONVERSATION_LIMIT_MSG, + RATE_LIMIT_MSG, SERVER_ERROR_MSG, INPUT_CHAR_LEN_LIMIT, CONVERSATION_TURN_LIMIT, @@ -91,10 +92,8 @@ def __init__(self, model_name): self.skip_next = False self.model_name = model_name - if model_name == "palm-2": - # According to release note, "chat-bison@001" is PaLM 2 for chat. 
diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py
index 809833e99..745b527b8 100644
--- a/fastchat/serve/gradio_web_server.py
+++ b/fastchat/serve/gradio_web_server.py
@@ -21,6 +21,7 @@
     ErrorCode,
     MODERATION_MSG,
     CONVERSATION_LIMIT_MSG,
+    RATE_LIMIT_MSG,
     SERVER_ERROR_MSG,
     INPUT_CHAR_LEN_LIMIT,
     CONVERSATION_TURN_LIMIT,
@@ -91,10 +92,8 @@ def __init__(self, model_name):
         self.skip_next = False
         self.model_name = model_name

-        if model_name == "palm-2":
-            # According to release note, "chat-bison@001" is PaLM 2 for chat.
-            # https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023
-            self.palm_chat = init_palm_chat("chat-bison@001")
+        if model_name in ["palm-2", "gemini-pro"]:
+            self.palm_chat = init_palm_chat(model_name)

     def to_gradio_chatbot(self):
         return self.conv.to_gradio_chatbot()
@@ -142,19 +141,24 @@ def get_model_list(
         models += list(openai_compatible_models_info.keys())

     if add_chatgpt:
-        models += ["gpt-3.5-turbo", "gpt-3.5-turbo-1106"]
+        models += [
+            "gpt-4-0314",
+            "gpt-4-0613",
+            "gpt-3.5-turbo-0613",
+            "gpt-3.5-turbo-1106",
+        ]
     if add_claude:
-        models += ["claude-2.0", "claude-2.1", "claude-instant-1"]
+        models += ["claude-2.1", "claude-2.0", "claude-instant-1"]
     if add_palm:
-        models += ["palm-2"]
+        models += ["gemini-pro"]
     models = list(set(models))

-    if "deluxe-chat-v1" in models:
-        del models[models.index("deluxe-chat-v1")]
-    if "deluxe-chat-v1.1" in models:
-        del models[models.index("deluxe-chat-v1.1")]
+    hidden_models = ["gpt-4-0314", "gpt-4-0613"]
+    for hm in hidden_models:
+        if hm in models:
+            del models[models.index(hm)]

-    priority = {k: f"___{i:02d}" for i, k in enumerate(model_info)}
+    priority = {k: f"___{i:03d}" for i, k in enumerate(model_info)}
     models.sort(key=lambda x: priority.get(x, x))
     logger.info(f"Models: {models}")
     return models
@@ -329,7 +332,14 @@ def model_worker_stream_iter(
         yield data


-def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request):
+def bot_response(
+    state,
+    temperature,
+    top_p,
+    max_new_tokens,
+    request: gr.Request,
+    apply_rate_limit=True,
+):
     ip = get_ip(request)
     logger.info(f"bot_response. ip: {ip}")
     start_tstamp = time.time()
@@ -356,7 +366,16 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request)
             api_base=model_info["api_base"],
             api_key=model_info["api_key"],
         )
-    elif model_name in ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-3.5-turbo-1106"]:
+    elif model_name in [
+        "gpt-3.5-turbo",
+        "gpt-3.5-turbo-0301",
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-1106",
+        "gpt-4",
+        "gpt-4-0314",
+        "gpt-4-0613",
+        "gpt-4-turbo",
+    ]:
         # avoid conflict with Azure OpenAI
         assert model_name not in openai_compatible_models_info
         prompt = conv.to_openai_api_messages()
@@ -368,9 +387,14 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request)
         stream_iter = anthropic_api_stream_iter(
             model_name, prompt, temperature, top_p, max_new_tokens
         )
-    elif model_name == "palm-2":
+    elif model_name in ["palm-2", "gemini-pro"]:
         stream_iter = palm_api_stream_iter(
-            state.palm_chat, conv.messages[-2][1], temperature, top_p, max_new_tokens
+            model_name,
+            state.palm_chat,
+            conv.messages[-2][1],
+            temperature,
+            top_p,
+            max_new_tokens,
         )
     else:
         # Query worker address
@@ -689,8 +713,8 @@ def build_single_model_ui(models, add_promotion_links=False):
         )
         max_output_tokens = gr.Slider(
             minimum=16,
-            maximum=1024,
-            value=512,
+            maximum=2048,
+            value=1024,
             step=64,
             interactive=True,
             label="Max output tokens",
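
The net effect of the `get_model_list` changes: the date-pinned GPT-4 endpoints are served but hidden from the non-anonymous dropdowns, and the registry-order sort key now uses three digits so it stays fixed-width beyond 100 registered models. A condensed sketch (the `model_info` stand-in substitutes for the real registry):

```python
model_info = {"gpt-4-turbo": None, "gemini-pro": None}  # stand-in registry


def finalize_model_list(models):
    models = list(set(models))
    # Hidden from direct chat and named battles; still sampled in anony battles.
    for hm in ["gpt-4-0314", "gpt-4-0613"]:
        if hm in models:
            del models[models.index(hm)]
    # "___NNN" sorts before plain names, so registered models come first,
    # in registration order; unknown models fall back to alphabetical order.
    priority = {k: f"___{i:03d}" for i, k in enumerate(model_info)}
    models.sort(key=lambda x: priority.get(x, x))
    return models


print(finalize_model_list(["gemini-pro", "gpt-4-0613", "gpt-4-turbo", "vicuna-13b"]))
# ['gpt-4-turbo', 'gemini-pro', 'vicuna-13b']
```
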
diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py
index 36ff5c8ce..0009c02ad 100644
--- a/fastchat/serve/gradio_web_server_multi.py
+++ b/fastchat/serve/gradio_web_server_multi.py
@@ -85,15 +85,22 @@ def load_demo(url_params, request: gr.Request):
         # Only enable these models in anony battles.
         if args.add_chatgpt:
             models_anony += [
-                "gpt-4",
-                "gpt-3.5-turbo",
-                "gpt-4-turbo",
+                "gpt-4-0314",
+                "gpt-4-0613",
+                "gpt-3.5-turbo-0613",
                 "gpt-3.5-turbo-1106",
             ]
         if args.add_claude:
             models_anony += ["claude-2.1", "claude-2.0", "claude-1", "claude-instant-1"]
         if args.add_palm:
-            models_anony += ["palm-2"]
+            models_anony += ["gemini-pro"]
+        anony_only_models = [
+            "claude-1",
+            "gpt-4-0314",
+            "gpt-4-0613",
+        ]
+        for mdl in anony_only_models:
+            models_anony.append(mdl)
         models_anony = list(set(models_anony))

     side_by_side_anony_updates = load_demo_side_by_side_anony(models_anony, url_params)
diff --git a/fastchat/serve/huggingface_api.py b/fastchat/serve/huggingface_api.py
index 2a49bf5f1..8022fbc93 100644
--- a/fastchat/serve/huggingface_api.py
+++ b/fastchat/serve/huggingface_api.py
@@ -61,7 +61,7 @@ def main(args):
     add_model_args(parser)
     parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--repetition_penalty", type=float, default=1.0)
-    parser.add_argument("--max-new-tokens", type=int, default=512)
+    parser.add_argument("--max-new-tokens", type=int, default=1024)
     parser.add_argument("--debug", action="store_true")
     parser.add_argument("--message", type=str, default="Hello! Who are you?")
     args = parser.parse_args()
diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index 580a2c866..59c48eb0e 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -22,7 +22,9 @@
 from fastchat.utils import build_logger, get_window_url_params_js


-notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
+notebook_url = (
+    "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH"
+)

 basic_component_values = [None] * 6

@@ -39,7 +41,7 @@ def make_leaderboard_md(elo_results):
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.

-💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: November, 2023.
+💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Dec 20, 2023.
 """
     return leaderboard_md