diff --git a/README.md b/README.md
index 7e27001..eb1d51e 100644
--- a/README.md
+++ b/README.md
@@ -66,24 +66,29 @@ print(bubble_sort([5, 2, 1, 8, 4]))
 python ./demo/run_demo.py
 usage: run_demo.py [-h] [--model-path MODEL_PATH] [--example-path EXAMPLE_PATH] [--quantize QUANTIZE]
-                   [--fastllm] [--n-gpus N_GPUS] [--gpu GPU] [--cpu] [--auth] [--username yourname]
+                   [--chatglm-cpp] [--fastllm] [--n-gpus N_GPUS] [--gpu GPU] [--cpu] [--auth] [--username yourname]
                    [--password yourpassword] [--port PORT] [--listen ADDRESS]
 # To enable authentication, pass --auth and then set --username and --password, e.g.:
 python run_demo.py --auth --username user --password password
 # To listen on all addresses, specify --listen 0.0.0.0
 ```
+Quantized inference acceleration with [ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) is supported:
+```sh
+python ./demo/run_demo.py --quantize 4 --chatglm-cpp
+```
 
 ### Start the FAST API:
 ```
 python ./demo/fastapicpu.py
-usage: fastapicpu.py [-h] [--model-path MODEL_PATH] [--listen ADDRESS] [--port PORT] [--workders NUM] [--cpu] [--half]
+usage: fastapicpu.py [-h] [--model-path MODEL_PATH] [--listen ADDRESS] [--port PORT] [--workders NUM] [--cpu] [--half] [--quantize QUANTIZE] [--chatglm-cpp]
 # --cpu enables CPU inference, --half enables .half()
 ```
+[ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) quantized inference acceleration is supported here as well; likewise, just add the `--quantize 4 --chatglm-cpp` arguments.
 
 ### API usage example
 ```
-curl -X POST "http://127.0.0.1:3435" \
-    -H 'Content-Type: application/json' \
-    -d '{"lang": "C", "prompt": "# Write a quick sort function"}'
+curl -X POST "http://127.0.0.1:7860" \
+    -H 'Content-Type: application/json' \
+    -d '{"lang": "Python", "prompt": "# Write a quick sort function"}'
 ```
diff --git a/demo/fastapicpu.py b/demo/fastapicpu.py
index fdf100d..cd1edab 100644
--- a/demo/fastapicpu.py
+++ b/demo/fastapicpu.py
@@ -3,6 +3,16 @@
 import uvicorn, json, datetime
 import torch
 import argparse
+
+try:
+    import chatglm_cpp
+    enable_chatglm_cpp = True
+except:
+    print("[WARN] chatglm-cpp not found. Install it by `pip install chatglm-cpp` for better performance. "
+          "Check out https://github.com/li-plus/chatglm.cpp for more details.")
+    enable_chatglm_cpp = False
+
+
 # Parse options
 def add_code_generation_args(parser):
     group = parser.add_argument_group(title="CodeGeeX2 DEMO")
@@ -34,6 +44,15 @@ def add_code_generation_args(parser):
         "--half",
         action="store_true",
     )
+    group.add_argument(
+        "--quantize",
+        type=int,
+        default=None,
+    )
+    group.add_argument(
+        "--chatglm-cpp",
+        action="store_true",
+    )
     return parser
 
 LANGUAGE_TAG = {
@@ -108,37 +127,56 @@ def add_code_generation_args(parser):
 app = FastAPI()
 
 def device():
+    if enable_chatglm_cpp and args.chatglm_cpp:
+        print("Using chatglm-cpp to improve performance")
+        dtype = "f16" if args.half else "f32"
+        if args.quantize in [4, 5, 8]:
+            dtype = f"q{args.quantize}_0"
+        model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
+        return model
+
+    print("chatglm-cpp not enabled, falling back to transformers")
     if not args.cpu:
         if not args.half:
             model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True).cuda()
         else:
             model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True).cuda().half()
+        if args.quantize in [4, 8]:
+            print(f"Model is quantized to INT{args.quantize} format.")
+            model = model.half().quantize(args.quantize)
     else:
         model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
-    return model
+    return model.eval()
 
 @app.post("/")
 async def create_item(request: Request):
     global model, tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
     json_post_raw = await request.json()
     json_post = json.dumps(json_post_raw)
     json_post_list = json.loads(json_post)
     lang = json_post_list.get('lang')
     prompt = json_post_list.get('prompt')
-    max_length = json_post_list.get('max_length')
-    top_p = json_post_list.get('top_p')
-    temperature = json_post_list.get('temperature')
-    top_k = json_post_list.get('top_k')
+    max_length = json_post_list.get('max_length', 128)
+    top_p = json_post_list.get('top_p', 0.95)
+    temperature = json_post_list.get('temperature', 0.2)
+    top_k = json_post_list.get('top_k', 0)
     if lang != "None":
         prompt = LANGUAGE_TAG[lang] + "\n" + prompt
-    response = model.chat(tokenizer,
-                          prompt,
-                          max_length=max_length if max_length else 128,
-                          top_p=top_p if top_p else 0.95,
-                          top_k=top_k if top_k else 0,
-                          temperature=temperature if temperature else 0.2)
+    if enable_chatglm_cpp and args.chatglm_cpp:
+        response = model.generate(prompt,
+                                  max_length=max_length,
+                                  do_sample=temperature > 0,
+                                  top_p=top_p,
+                                  top_k=top_k,
+                                  temperature=temperature)
+    else:
+        response = model.chat(tokenizer,
+                              prompt,
+                              max_length=max_length,
+                              top_p=top_p,
+                              top_k=top_k,
+                              temperature=temperature)
     now = datetime.datetime.now()
     time = now.strftime("%Y-%m-%d %H:%M:%S")
     answer = {
@@ -157,6 +195,6 @@ async def create_item(request: Request):
     parser = argparse.ArgumentParser()
     parser = add_code_generation_args(parser)
     args, _ = parser.parse_known_args()
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
     model = device()
-    model.eval()
     uvicorn.run(app, host=args.listen, port=args.port, workers=args.workers)
diff --git a/demo/run_demo.py b/demo/run_demo.py
index d99b977..ea58ecb 100644
--- a/demo/run_demo.py
+++ b/demo/run_demo.py
@@ -23,6 +23,14 @@
     print("Multiple GPUs support disabled.")
     enable_multiple_gpus = False
 
+try:
+    import chatglm_cpp
+    enable_chatglm_cpp = True
+except:
+    print("[WARN] chatglm-cpp not found. Install it by `pip install chatglm-cpp` for better performance. "
+          "Check out https://github.com/li-plus/chatglm.cpp for more details.")
+    enable_chatglm_cpp = False
+
 
 def get_model(args):
     if not args.cpu:
@@ -42,6 +50,12 @@
         print(f"Running on {args.n_gpus} GPUs.")
         model = load_model_on_gpus(args.model_path, num_gpus=args.n_gpus)
         model = model.eval()
+    elif enable_chatglm_cpp and args.chatglm_cpp:
+        print("Using chatglm-cpp to improve performance")
+        dtype = "f16"
+        if args.quantize in [4, 5, 8]:
+            dtype = f"q{args.quantize}_0"
+        model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
     else:
         model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
         model = model.eval()
@@ -55,7 +69,7 @@
         else:
             model = llm.from_hf(model, dtype="float16")
     else:
-        print("fastllm not installed, using transformers.")
+        print("chatglm-cpp and fastllm not installed, using transformers.")
         if args.quantize in [4, 8]:
             print(f"Model is quantized to INT{args.quantize} format.")
             model = model.half().quantize(args.quantize)
@@ -81,6 +95,10 @@ def add_code_generation_args(parser):
         type=int,
         default=None,
     )
+    group.add_argument(
+        "--chatglm-cpp",
+        action="store_true",
+    )
     group.add_argument(
         "--fastllm",
         action="store_true",
@@ -248,6 +266,16 @@ def predict(
                                  top_k=top_k,
                                  temperature=temperature)
         response = prompt + outputs[0]
+    elif enable_chatglm_cpp and args.chatglm_cpp:
+        inputs = tokenizer([prompt], return_tensors="pt")
+        pipeline = model
+        outputs = pipeline.generate(prompt,
+                                    max_length=inputs['input_ids'].shape[-1] + out_seq_length,
+                                    do_sample=temperature > 0,
+                                    top_p=top_p,
+                                    top_k=top_k,
+                                    temperature=temperature)
+        response = prompt + outputs
     else:
         inputs = tokenizer([prompt], return_tensors="pt")
         inputs = inputs.to(model.device)
diff --git a/docs/zh/inference_zh.md b/docs/zh/inference_zh.md
index d5cd9f7..b45580f 100644
--- a/docs/zh/inference_zh.md
+++ b/docs/zh/inference_zh.md
@@ -7,6 +7,7 @@ CodeGeeX2 是多语言代码生成模型 [CodeGeeX](https://github.com/THUDM/Cod
 - [Multi-GPU inference](#多GPU推理)
 - [Inference on Mac](#Mac推理)
 - [Accelerated inference with fastllm](#fastllm加速推理)
+- [Quantized inference with ChatGLM.cpp](#chatglmcpp-量化推理)
 
 ## Quick Start
 
@@ -168,3 +169,51 @@ outputs = model.chat(tokenizer,
                      temperature=temperature)
 response = outputs[0]
 ```
+
+## Quantized inference with ChatGLM.cpp
+
+[ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) is a LLaMA.cpp-style quantized acceleration solution for all platforms. It supports multiple quantization precisions (q4_0/q4_1/q5_0/q5_1/q8_0) and multiple backends (CPU/CUDA/Metal), and speeds up inference with a single line of code.
+
+First, install chatglm-cpp. To use CUDA acceleration, set the environment variable `CMAKE_ARGS="-DGGML_CUBLAS=ON"`; for CPU-only acceleration, simply drop this variable.
+```sh
+CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install chatglm-cpp -v
+```
+
+A single line of code quantizes and accelerates a Hugging Face model; `dtype` can be set to `q4_0`, `q4_1`, `q5_0`, `q5_1`, `q8_0`, or `f16` for the different quantization types.
+```python
+>>> import chatglm_cpp
+>>>
+>>> pipeline = chatglm_cpp.Pipeline("THUDM/codegeex2-6b", dtype="q4_0")  # Load HF model and quantize it into int4
+Loading checkpoint shards: 100%|███████████████████████████████████████████████| 7/7 [00:09<00:00, 1.33s/it]
+Processing model states: 100%|█████████████████████████████████████████████| 199/199 [00:21<00:00, 9.21it/s]
+...
+>>> print(pipeline.generate("# language: Python\n# write a bubble sort function\n", do_sample=False))
+
+
+def bubble_sort(list):
+    for i in range(len(list) - 1):
+        for j in range(len(list) - 1):
+            if list[j] > list[j + 1]:
+                list[j], list[j + 1] = list[j + 1], list[j]
+    return list
+
+
+print(bubble_sort([5, 4, 3, 2, 1]))
+```
+
+ChatGLM.cpp is already integrated into this repository. Add the options `--quantize 4 --chatglm-cpp` to the demo to enable int4 (q4_0) quantized acceleration, for example:
+```sh
+python ./demo/run_demo.py --quantize 4 --chatglm-cpp
+```
+
+The Fast API also supports ChatGLM.cpp acceleration; start the service with the same arguments:
+```sh
+python ./demo/fastapicpu.py --quantize 4 --chatglm-cpp
+```
+
+Test the service endpoint:
+```sh
+curl -X POST "http://127.0.0.1:7860" \
+    -H 'Content-Type: application/json' \
+    -d '{"lang": "Python", "prompt": "# Write a bubble sort function", "max_length": 512}'
+```
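+
+For reference, the same request can be sent from Python with the `requests` package (a minimal sketch; it assumes the service is reachable at http://127.0.0.1:7860 as in the curl example above):
+```python
+import requests
+
+# Same payload as the curl example above
+payload = {"lang": "Python", "prompt": "# Write a bubble sort function", "max_length": 512}
+resp = requests.post("http://127.0.0.1:7860", json=payload)
+resp.raise_for_status()
+print(resp.json())  # the JSON answer returned by fastapicpu.py
+```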