15 changes: 10 additions & 5 deletions README.md
@@ -66,24 +66,29 @@ print(bubble_sort([5, 2, 1, 8, 4]))
python ./demo/run_demo.py

usage: run_demo.py [-h] [--model-path MODEL_PATH] [--example-path EXAMPLE_PATH] [--quantize QUANTIZE]
[--fastllm] [--n-gpus N_GPUS] [--gpu GPU] [--cpu] [--auth] [--username yourname]
[--chatglm-cpp] [--fastllm] [--n-gpus N_GPUS] [--gpu GPU] [--cpu] [--auth] [--username yourname]
[--password yourpassword]
[--port PORT] [--listen ADDRESS]

# To enable authentication, pass --auth first, then set --username and --password, e.g.:
python run_demo.py --auth --username user --password password  # to listen on all addresses, specify --listen 0.0.0.0
```
Quantized inference acceleration with [ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) is supported:
```sh
python ./demo/run_demo.py --quantize 4 --chatglm-cpp
```
### Start the FastAPI server:
```
python ./demo/fastapicpu.py
usage: fastapicpu.py [-h] [--model-path MODEL_PATH] [--listen ADDRESS] [--port PORT] [--workers NUM] [--cpu] [--half]
usage: fastapicpu.py [-h] [--model-path MODEL_PATH] [--listen ADDRESS] [--port PORT] [--workers NUM] [--cpu] [--half] [--quantize QUANTIZE] [--chatglm-cpp]
# --cpu runs the model on CPU, --half enables .half()
```
Quantized inference acceleration with [ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) is supported here as well; simply add the `--quantize 4 --chatglm-cpp` flags, as shown below.
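For example:
```sh
python ./demo/fastapicpu.py --quantize 4 --chatglm-cpp
```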
### API usage example
```
curl -X POST "http://127.0.0.1:3435" \
-H 'Content-Type: application/json' \
-d '{"lang": "C", "prompt": "# Write a quick sort function"}'
curl -X POST "http://127.0.0.1:7860" \
-H 'Content-Type: application/json' \
-d '{"lang": "Python", "prompt": "# Write a quick sort function"}'
```
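For scripted access, a minimal Python client sketch is shown below. It assumes the server started by `demo/fastapicpu.py` on port 7860; the optional sampling fields (`max_length`, `top_p`, `top_k`, `temperature`) fall back to the server-side defaults when omitted.
```python
import requests

# Hypothetical helper around the demo endpoint; field names match the curl example above.
def generate(prompt, lang="Python", host="http://127.0.0.1:7860"):
    payload = {
        "lang": lang,              # language tag prepended server-side; pass "None" to skip it
        "prompt": prompt,
        "max_length": 256,         # optional; the server defaults to 128
        "top_p": 0.95,
        "top_k": 0,
        "temperature": 0.2,
    }
    resp = requests.post(host, json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()

print(generate("# Write a quick sort function"))
```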


64 changes: 51 additions & 13 deletions demo/fastapicpu.py
@@ -3,6 +3,16 @@
import uvicorn, json, datetime
import torch
import argparse

try:
    import chatglm_cpp
    enable_chatglm_cpp = True
except ImportError:
    print("[WARN] chatglm-cpp not found. Install it by `pip install chatglm-cpp` for better performance. "
          "Check out https://github.com/li-plus/chatglm.cpp for more details.")
    enable_chatglm_cpp = False


# Parse command-line options
def add_code_generation_args(parser):
    group = parser.add_argument_group(title="CodeGeeX2 DEMO")
@@ -34,6 +44,15 @@ def add_code_generation_args(parser):
"--half",
action="store_true",
)
group.add_argument(
"--quantize",
type=int,
default=None,
)
group.add_argument(
"--chatglm-cpp",
action="store_true",
)
return parser

LANGUAGE_TAG = {
Expand Down Expand Up @@ -108,37 +127,56 @@ def add_code_generation_args(parser):

app = FastAPI()

def device():
    if enable_chatglm_cpp and args.chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        dtype = "f16" if args.half else "f32"
        if args.quantize in [4, 5, 8]:
            dtype = f"q{args.quantize}_0"
        model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
        return model

    print("chatglm-cpp not enabled, falling back to transformers")
    if not args.cpu:
        if not args.half:
            model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True).cuda()
        else:
            model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True).cuda().half()
        if args.quantize in [4, 8]:
            print(f"Model is quantized to INT{args.quantize} format.")
            model = model.half().quantize(args.quantize)
    else:
        model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)

    return model
    return model.eval()

@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    lang = json_post_list.get('lang')
    prompt = json_post_list.get('prompt')
    max_length = json_post_list.get('max_length')
    top_p = json_post_list.get('top_p')
    temperature = json_post_list.get('temperature')
    top_k = json_post_list.get('top_k')
    max_length = json_post_list.get('max_length', 128)
    top_p = json_post_list.get('top_p', 0.95)
    temperature = json_post_list.get('temperature', 0.2)
    top_k = json_post_list.get('top_k', 0)
    if lang != "None":
        prompt = LANGUAGE_TAG[lang] + "\n" + prompt
    response = model.chat(tokenizer,
                          prompt,
                          max_length=max_length if max_length else 128,
                          top_p=top_p if top_p else 0.95,
                          top_k=top_k if top_k else 0,
                          temperature=temperature if temperature else 0.2)
    if enable_chatglm_cpp and args.chatglm_cpp:
        response = model.generate(prompt,
                                  max_length=max_length,
                                  do_sample=temperature > 0,
                                  top_p=top_p,
                                  top_k=top_k,
                                  temperature=temperature)
    else:
        response = model.chat(tokenizer,
                              prompt,
                              max_length=max_length,
                              top_p=top_p,
                              top_k=top_k,
                              temperature=temperature)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {
@@ -157,6 +195,6 @@ async def create_item(request: Request):
    parser = argparse.ArgumentParser()
    parser = add_code_generation_args(parser)
    args, _ = parser.parse_known_args()
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    model = device()
    model.eval()
    uvicorn.run(app, host=args.listen, port=args.port, workers=args.workers)
30 changes: 29 additions & 1 deletion demo/run_demo.py
@@ -23,6 +23,14 @@
print("Multiple GPUs support disabled.")
enable_multiple_gpus = False

try:
    import chatglm_cpp
    enable_chatglm_cpp = True
except ImportError:
    print("[WARN] chatglm-cpp not found. Install it by `pip install chatglm-cpp` for better performance. "
          "Check out https://github.com/li-plus/chatglm.cpp for more details.")
    enable_chatglm_cpp = False


def get_model(args):
    if not args.cpu:
@@ -42,6 +50,12 @@ def get_model(args):
print(f"Runing on {args.n_gpus} GPUs.")
model = load_model_on_gpus(args.model_path, num_gpus=args.n_gpus)
model = model.eval()
elif enable_chatglm_cpp and args.chatglm_cpp:
print("Using chatglm-cpp to improve performance")
dtype = "f16"
if args.quantize in [4, 5, 8]:
dtype = f"q{args.quantize}_0"
model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
else:
model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
model = model.eval()
@@ -55,7 +69,7 @@
        else:
            model = llm.from_hf(model, dtype="float16")
    else:
        print("fastllm not installed, using transformers.")
        print("chatglm-cpp and fastllm not installed, using transformers.")
        if args.quantize in [4, 8]:
            print(f"Model is quantized to INT{args.quantize} format.")
            model = model.half().quantize(args.quantize)
@@ -81,6 +95,10 @@ def add_code_generation_args(parser):
        type=int,
        default=None,
    )
    group.add_argument(
        "--chatglm-cpp",
        action="store_true",
    )
    group.add_argument(
        "--fastllm",
        action="store_true",
@@ -248,6 +266,16 @@ def predict(
                             top_k=top_k,
                             temperature=temperature)
        response = prompt + outputs[0]
    elif enable_chatglm_cpp and args.chatglm_cpp:
        inputs = tokenizer([prompt], return_tensors="pt")
        pipeline = model
        outputs = pipeline.generate(prompt,
                                    max_length=inputs['input_ids'].shape[-1] + out_seq_length,
                                    do_sample=temperature > 0,
                                    top_p=top_p,
                                    top_k=top_k,
                                    temperature=temperature)
        response = prompt + outputs
    else:
        inputs = tokenizer([prompt], return_tensors="pt")
        inputs = inputs.to(model.device)
49 changes: 49 additions & 0 deletions docs/zh/inference_zh.md
@@ -7,6 +7,7 @@ CodeGeeX2 is the multilingual code generation model [CodeGeeX](https://github.com/THUDM/Cod
- [Multi-GPU inference](#多GPU推理)
- [Mac inference](#Mac推理)
- [Accelerated inference with fastllm](#fastllm加速推理)
- [Quantized inference with ChatGLM.cpp](#chatglmcpp-量化推理)

## Quick start

@@ -168,3 +169,51 @@ outputs = model.chat(tokenizer,
temperature=temperature)
response = outputs[0]
```

## Quantized inference with ChatGLM.cpp

[ChatGLM.cpp](https://github.com/li-plus/chatglm.cpp) is a LLaMA.cpp-style quantized acceleration solution that runs on all major platforms. It supports the q4_0/q4_1/q5_0/q5_1/q8_0 quantization precisions and the CPU/CUDA/Metal backends, and a single line of code is enough to accelerate inference.

First install chatglm-cpp. To build with CUDA acceleration, set the environment variable `CMAKE_ARGS="-DGGML_CUBLAS=ON"`; for CPU-only inference, simply omit it.
```sh
CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install chatglm-cpp -v
```
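For the Metal backend on macOS mentioned above, the corresponding build flag (assuming chatglm.cpp exposes a `GGML_METAL` CMake option) would presumably be:
```sh
CMAKE_ARGS="-DGGML_METAL=ON" pip install chatglm-cpp -v
```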

A single line of code quantizes and accelerates a Hugging Face model. `dtype` can be set to `q4_0`, `q4_1`, `q5_0`, `q5_1`, `q8_0`, or `f16` to choose the quantization type.
```python
>>> import chatglm_cpp
>>>
>>> pipeline = chatglm_cpp.Pipeline("THUDM/codegeex2-6b", dtype="q4_0") # Load HF model and quantize it into int4
Loading checkpoint shards: 100%|███████████████████████████████████████████████| 7/7 [00:09<00:00, 1.33s/it]
Processing model states: 100%|█████████████████████████████████████████████| 199/199 [00:21<00:00, 9.21it/s]
...
>>> print(pipeline.generate("# language: Python\n# write a bubble sort function\n", do_sample=False))


def bubble_sort(list):
for i in range(len(list) - 1):
for j in range(len(list) - 1):
if list[j] > list[j + 1]:
list[j], list[j + 1] = list[j + 1], list[j]
return list


print(bubble_sort([5, 4, 3, 2, 1]))
```
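The sampling parameters that this repository's demos pass to `chatglm_cpp.Pipeline.generate` (`max_length`, `do_sample`, `top_p`, `top_k`, `temperature`) can also be used here; a minimal sketch, assuming the same call signature as in `demo/fastapicpu.py`:
```python
# Sampling-based generation; parameter names mirror the calls in demo/fastapicpu.py and demo/run_demo.py.
output = pipeline.generate(
    "# language: Python\n# write a quick sort function\n",
    max_length=512,
    do_sample=True,     # the demos enable sampling whenever temperature > 0
    top_p=0.95,
    top_k=0,
    temperature=0.2,
)
print(output)
```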

ChatGLM.cpp is integrated into this repository; add the `--quantize 4 --chatglm-cpp` options to the demo to enable int4 (q4_0) quantized acceleration, for example:
```sh
python ./demo/run_demo.py --quantize 4 --chatglm-cpp
```

The FastAPI demo supports ChatGLM.cpp acceleration as well; start the service with the same flags:
```sh
python ./demo/fastapicpu.py --quantize 4 --chatglm-cpp
```

Test the service endpoint:
```sh
curl -X POST "http://127.0.0.1:7860" \
-H 'Content-Type: application/json' \
-d '{"lang": "Python", "prompt": "# Write a bubble sort function", "max_length": 512}'
```