diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index 2fe8e6304..50b028afb 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -192,6 +192,7 @@ async def api_model_details(request: Request):
         "--controller-address", type=str, default="http://localhost:21001"
     )
     parser.add_argument("--model-path", type=str, default="lmsys/vicuna-7b-v1.3")
+    parser.add_argument("--quantization", type=str)
     parser.add_argument(
         "--model-names",
         type=lambda s: s.split(","),
@@ -210,7 +211,7 @@ async def api_model_details(request: Request):
     args.model = args.model_path
     if args.num_gpus > 1:
         args.tensor_parallel_size = args.num_gpus
-    if args.quantizaiton:
+    if args.quantization:
         args.quantization = args.quantization
 
     engine_args = AsyncEngineArgs.from_cli_args(args)