From ed8069edade958e68e02ce9aece06bef6567d5a4 Mon Sep 17 00:00:00 2001
From: jq460494839 <460494839@qq.com>
Date: Thu, 21 Dec 2023 11:35:39 +0800
Subject: [PATCH 1/2] NPU needs to be initialized when starting a new process

---
 fastchat/serve/model_worker.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py
index 5e84a4262..44f15b9c0 100644
--- a/fastchat/serve/model_worker.py
+++ b/fastchat/serve/model_worker.py
@@ -31,7 +31,6 @@
     str_to_torch_dtype,
 )
 
-
 worker_id = str(uuid.uuid4())[:8]
 logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
 
@@ -101,6 +100,9 @@ def __init__(
         self.init_heart_beat()
 
     def generate_stream_gate(self, params):
+        if self.device == "npu":
+            import torch_npu
+            torch_npu.npu.set_device("npu:0")
         self.call_ct += 1
 
         try:
@@ -216,8 +218,8 @@ def get_embeddings(self, params):
                 all_embeddings = []
                 all_token_num = 0
                 for i in range(0, input_ids.size(1), self.context_len):
-                    chunk_input_ids = input_ids[:, i : i + self.context_len]
-                    chunk_attention_mask = attention_mask[:, i : i + self.context_len]
+                    chunk_input_ids = input_ids[:, i: i + self.context_len]
+                    chunk_attention_mask = attention_mask[:, i: i + self.context_len]
 
                     chunk_embeddings, token_num = self.__process_embed_chunk(
                         chunk_input_ids, chunk_attention_mask, **model_type_dict

From beab1da639c2b90b6f43f2351bab3175ab0018b6 Mon Sep 17 00:00:00 2001
From: jq460494839 <460494839@qq.com>
Date: Thu, 21 Dec 2023 11:54:51 +0800
Subject: [PATCH 2/2] format

---
 fastchat/serve/model_worker.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py
index 44f15b9c0..93ccaa54a 100644
--- a/fastchat/serve/model_worker.py
+++ b/fastchat/serve/model_worker.py
@@ -102,6 +102,7 @@ def __init__(
     def generate_stream_gate(self, params):
         if self.device == "npu":
             import torch_npu
+
             torch_npu.npu.set_device("npu:0")
         self.call_ct += 1
 
@@ -218,8 +219,8 @@ def get_embeddings(self, params):
                 all_embeddings = []
                 all_token_num = 0
                 for i in range(0, input_ids.size(1), self.context_len):
-                    chunk_input_ids = input_ids[:, i: i + self.context_len]
-                    chunk_attention_mask = attention_mask[:, i: i + self.context_len]
+                    chunk_input_ids = input_ids[:, i : i + self.context_len]
+                    chunk_attention_mask = attention_mask[:, i : i + self.context_len]
 
                     chunk_embeddings, token_num = self.__process_embed_chunk(
                         chunk_input_ids, chunk_attention_mask, **model_type_dict
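
Note (illustration only, not part of the patches): the fix relies on the fact that the Ascend NPU runtime is initialized per process, so a worker process that was started separately from the parent has to import torch_npu and select its device before running inference, which is what generate_stream_gate now does when device == "npu". Below is a minimal standalone sketch of the same pattern, assuming torch_npu is installed and registers the "npu" device with PyTorch; the npu_worker function name, the spawn start method, and the device indices are hypothetical choices for the example, not taken from the patch.

    import torch
    import torch.multiprocessing as mp


    def npu_worker(rank):
        # Each child process sets up its own NPU context; runtime state from
        # the parent process is not inherited, so the import and set_device
        # call happen here, inside the child.
        import torch_npu  # registers the "npu" device type with torch

        torch_npu.npu.set_device(f"npu:{rank}")
        x = torch.ones(2, 2).to(f"npu:{rank}")
        print(rank, x.sum().item())


    if __name__ == "__main__":
        mp.set_start_method("spawn")
        procs = [mp.Process(target=npu_worker, args=(i,)) for i in range(2)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()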