15 changes: 10 additions & 5 deletions README.md
@@ -66,24 +66,29 @@ print(bubble_sort([5, 2, 1, 8, 4]))
python ./demo/run_demo.py

usage: run_demo.py [-h] [--model-path MODEL_PATH] [--example-path EXAMPLE_PATH] [--quantize QUANTIZE]
[--fastllm] [--n-gpus N_GPUS] [--gpu GPU] [--cpu] [--auth] [--username yourname]
[--chatglm-cpp] [--fastllm] [--n-gpus N_GPUS] [--gpu GPU] [--cpu] [--auth] [--username yourname]
[--password yourpassword]
[--port PORT] [--listen ADDRESS]

# To enable authentication, pass --auth first, then set --username and --password, e.g.:
python run_demo.py --auth --username user --password password  # to listen on all addresses, specify --listen 0.0.0.0
```
Quantized inference acceleration with [ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) is supported:
```sh
python ./demo/run_demo.py --quantize 4 --chatglm-cpp
```
### Start the FastAPI server:
```
python ./demo/fastapicpu.py
usage: fastapicpu.py [-h] [--model-path MODEL_PATH] [--listen ADDRESS] [--port PORT] [--workers NUM] [--cpu] [--half]
usage: fastapicpu.py [-h] [--model-path MODEL_PATH] [--listen ADDRESS] [--port PORT] [--workers NUM] [--cpu] [--half] [--quantize QUANTIZE] [--chatglm-cpp]
# --cpu runs the model on CPU, --half enables .half()
```
Quantized inference acceleration with [ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) is supported here as well; simply add the `--quantize 4 --chatglm-cpp` flags, as shown below.
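For example:
```sh
python ./demo/fastapicpu.py --quantize 4 --chatglm-cpp
```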
### API usage example
```
curl -X POST "http://127.0.0.1:3435" \
-H 'Content-Type: application/json' \
-d '{"lang": "C", "prompt": "# Write a quick sort function"}'
curl -X POST "http://127.0.0.1:7860" \
-H 'Content-Type: application/json' \
-d '{"lang": "Python", "prompt": "# Write a quick sort function"}'
```
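For scripted access, a minimal Python client sketch is shown below. It assumes the server started by `demo/fastapicpu.py` on port 7860; the optional sampling fields (`max_length`, `top_p`, `top_k`, `temperature`) fall back to the server-side defaults when omitted.
```python
import requests

# Hypothetical helper around the demo endpoint; field names match the curl example above.
def generate(prompt, lang="Python", host="http://127.0.0.1:7860"):
    payload = {
        "lang": lang,              # language tag prepended server-side; pass "None" to skip it
        "prompt": prompt,
        "max_length": 256,         # optional; the server defaults to 128
        "top_p": 0.95,
        "top_k": 0,
        "temperature": 0.2,
    }
    resp = requests.post(host, json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()

print(generate("# Write a quick sort function"))
```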


64 changes: 51 additions & 13 deletions demo/fastapicpu.py
@@ -3,6 +3,16 @@
import uvicorn, json, datetime
import torch
import argparse

try:
    import chatglm_cpp
    enable_chatglm_cpp = True
except ImportError:
    print("[WARN] chatglm-cpp not found. Install it by `pip install chatglm-cpp` for better performance. "
          "Check out https://github.com/li-plus/chatglm.cpp for more details.")
    enable_chatglm_cpp = False


# Parse command-line options
def add_code_generation_args(parser):
    group = parser.add_argument_group(title="CodeGeeX2 DEMO")
@@ -34,6 +44,15 @@ def add_code_generation_args(parser):
"--half",
action="store_true",
)
group.add_argument(
"--quantize",
type=int,
default=None,
)
group.add_argument(
"--chatglm-cpp",
action="store_true",
)
return parser

LANGUAGE_TAG = {
Expand Down Expand Up @@ -108,37 +127,56 @@ def add_code_generation_args(parser):

app = FastAPI()

def device():
    if enable_chatglm_cpp and args.chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        dtype = "f16" if args.half else "f32"
        if args.quantize in [4, 5, 8]:
            dtype = f"q{args.quantize}_0"
        model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
        return model

    print("chatglm-cpp not enabled, falling back to transformers")
    if not args.cpu:
        if not args.half:
            model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True).cuda()
        else:
            model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True).cuda().half()
        if args.quantize in [4, 8]:
            print(f"Model is quantized to INT{args.quantize} format.")
            model = model.half().quantize(args.quantize)
    else:
        model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)

    return model
    return model.eval()

@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    lang = json_post_list.get('lang')
    prompt = json_post_list.get('prompt')
    max_length = json_post_list.get('max_length')
    top_p = json_post_list.get('top_p')
    temperature = json_post_list.get('temperature')
    top_k = json_post_list.get('top_k')
    max_length = json_post_list.get('max_length', 128)
    top_p = json_post_list.get('top_p', 0.95)
    temperature = json_post_list.get('temperature', 0.2)
    top_k = json_post_list.get('top_k', 0)
    if lang != "None":
        prompt = LANGUAGE_TAG[lang] + "\n" + prompt
    response = model.chat(tokenizer,
                          prompt,
                          max_length=max_length if max_length else 128,
                          top_p=top_p if top_p else 0.95,
                          top_k=top_k if top_k else 0,
                          temperature=temperature if temperature else 0.2)
    if enable_chatglm_cpp and args.chatglm_cpp:
        response = model.generate(prompt,
                                  max_length=max_length,
                                  do_sample=temperature > 0,
                                  top_p=top_p,
                                  top_k=top_k,
                                  temperature=temperature)
    else:
        response = model.chat(tokenizer,
                              prompt,
                              max_length=max_length,
                              top_p=top_p,
                              top_k=top_k,
                              temperature=temperature)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {
@@ -157,6 +195,6 @@ async def create_item(request: Request):
    parser = argparse.ArgumentParser()
    parser = add_code_generation_args(parser)
    args, _ = parser.parse_known_args()
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    model = device()
    model.eval()
    uvicorn.run(app, host=args.listen, port=args.port, workers=args.workers)
30 changes: 29 additions & 1 deletion demo/run_demo.py
@@ -23,6 +23,14 @@
print("Multiple GPUs support disabled.")
enable_multiple_gpus = False

try:
    import chatglm_cpp
    enable_chatglm_cpp = True
except ImportError:
    print("[WARN] chatglm-cpp not found. Install it by `pip install chatglm-cpp` for better performance. "
          "Check out https://github.com/li-plus/chatglm.cpp for more details.")
    enable_chatglm_cpp = False


def get_model(args):
    if not args.cpu:
@@ -42,6 +50,12 @@ def get_model(args):
print(f"Runing on {args.n_gpus} GPUs.")
model = load_model_on_gpus(args.model_path, num_gpus=args.n_gpus)
model = model.eval()
elif enable_chatglm_cpp and args.chatglm_cpp:
print("Using chatglm-cpp to improve performance")
dtype = "f16"
if args.quantize in [4, 5, 8]:
dtype = f"q{args.quantize}_0"
model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
else:
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
model = model.eval()
@@ -55,7 +69,7 @@
        else:
            model = llm.from_hf(model, dtype="float16")
    else:
        print("fastllm not installed, using transformers.")
        print("chatglm-cpp and fastllm not installed, using transformers.")
        if args.quantize in [4, 8]:
            print(f"Model is quantized to INT{args.quantize} format.")
            model = model.half().quantize(args.quantize)
@@ -81,6 +95,10 @@ def add_code_generation_args(parser):
        type=int,
        default=None,
    )
    group.add_argument(
        "--chatglm-cpp",
        action="store_true",
    )
    group.add_argument(
        "--fastllm",
        action="store_true",
@@ -248,6 +266,16 @@ def predict(
                             top_k=top_k,
                             temperature=temperature)
        response = prompt + outputs[0]
    elif enable_chatglm_cpp and args.chatglm_cpp:
        inputs = tokenizer([prompt], return_tensors="pt")
        pipeline = model
        outputs = pipeline.generate(prompt,
                                    max_length=inputs['input_ids'].shape[-1] + out_seq_length,
                                    do_sample=temperature > 0,
                                    top_p=top_p,
                                    top_k=top_k,
                                    temperature=temperature)
        response = prompt + outputs
    else:
        inputs = tokenizer([prompt], return_tensors="pt")
        inputs = inputs.to(model.device)
49 changes: 49 additions & 0 deletions docs/zh/inference_zh.md
@@ -7,6 +7,7 @@ CodeGeeX2 is the multilingual code generation model [CodeGeeX](https://github.com/THUDM/Cod
- [Multi-GPU inference](#多GPU推理)
- [Mac inference](#Mac推理)
- [Accelerated inference with fastllm](#fastllm加速推理)
- [Quantized inference with ChatGLM.cpp](#chatglmcpp-量化推理)

## Quick start

@@ -168,3 +169,51 @@ outputs = model.chat(tokenizer,
temperature=temperature)
response = outputs[0]
```

## Quantized inference with ChatGLM.cpp

[ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) is a LLaMA.cpp-style quantized acceleration solution that runs on all major platforms. It supports the q4_0/q4_1/q5_0/q5_1/q8_0 quantization precisions and the CPU/CUDA/Metal backends, and a single line of code is enough to accelerate inference.

First install chatglm-cpp. To build with CUDA acceleration, set the environment variable `CMAKE_ARGS="-DGGML_CUBLAS=ON"`; for CPU-only inference, simply omit it.
```sh
CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install chatglm-cpp -v
```
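For the Metal backend on macOS mentioned above, the corresponding build flag (assuming chatglm.cpp exposes a `GGML_METAL` CMake option) would presumably be:
```sh
CMAKE_ARGS="-DGGML_METAL=ON" pip install chatglm-cpp -v
```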

A single line of code quantizes and accelerates a Hugging Face model. `dtype` can be set to `q4_0`, `q4_1`, `q5_0`, `q5_1`, `q8_0`, or `f16` to choose the quantization type.
```python
>>> import chatglm_cpp
>>>
>>> pipeline = chatglm_cpp.Pipeline("THUDM/codegeex2-6b", dtype="q4_0") # Load HF model and quantize it into int4
Loading checkpoint shards: 100%|███████████████████████████████████████████████| 7/7 [00:09<00:00, 1.33s/it]
Processing model states: 100%|█████████████████████████████████████████████| 199/199 [00:21<00:00, 9.21it/s]
...
>>> print(pipeline.generate("# language: Python\n# write a bubble sort function\n", do_sample=False))


def bubble_sort(list):
for i in range(len(list) - 1):
for j in range(len(list) - 1):
if list[j] > list[j + 1]:
list[j], list[j + 1] = list[j + 1], list[j]
return list


print(bubble_sort([5, 4, 3, 2, 1]))
```
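The sampling parameters that this repository's demos pass to `chatglm_cpp.Pipeline.generate` (`max_length`, `do_sample`, `top_p`, `top_k`, `temperature`) can also be used here; a minimal sketch, assuming the same call signature as in `demo/fastapicpu.py`:
```python
# Sampling-based generation; parameter names mirror the calls in demo/fastapicpu.py and demo/run_demo.py.
output = pipeline.generate(
    "# language: Python\n# write a quick sort function\n",
    max_length=512,
    do_sample=True,     # the demos enable sampling whenever temperature > 0
    top_p=0.95,
    top_k=0,
    temperature=0.2,
)
print(output)
```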

ChatGLM.cpp is integrated into this repository; add the `--quantize 4 --chatglm-cpp` options to the demo to enable int4 (q4_0) quantized acceleration, for example:
```sh
python ./demo/run_demo.py --quantize 4 --chatglm-cpp
```

The FastAPI demo supports ChatGLM.cpp acceleration as well; start the service with the same flags:
```sh
python ./demo/fastapicpu.py --quantize 4 --chatglm-cpp
```

Test the service endpoint:
```sh
curl -X POST "http://127.0.0.1:7860" \
-H 'Content-Type: application/json' \
-d '{"lang": "Python", "prompt": "# Write a bubble sort function", "max_length": 512}'
```