From ed8069edade958e68e02ce9aece06bef6567d5a4 Mon Sep 17 00:00:00 2001
From: jq460494839 <460494839@qq.com>
Date: Thu, 21 Dec 2023 11:35:39 +0800
Subject: [PATCH 1/2] NPU needs to be initialized when starting a new process

---
 fastchat/serve/model_worker.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py
index 5e84a4262..44f15b9c0 100644
--- a/fastchat/serve/model_worker.py
+++ b/fastchat/serve/model_worker.py
@@ -31,7 +31,6 @@
     str_to_torch_dtype,
 )
 
-
 worker_id = str(uuid.uuid4())[:8]
 logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
 
@@ -101,6 +100,9 @@ def __init__(
         self.init_heart_beat()
 
     def generate_stream_gate(self, params):
+        if self.device == "npu":
+            import torch_npu
+            torch_npu.npu.set_device("npu:0")
         self.call_ct += 1
 
         try:
@@ -216,8 +218,8 @@ def get_embeddings(self, params):
                 all_embeddings = []
                 all_token_num = 0
                 for i in range(0, input_ids.size(1), self.context_len):
-                    chunk_input_ids = input_ids[:, i : i + self.context_len]
-                    chunk_attention_mask = attention_mask[:, i : i + self.context_len]
+                    chunk_input_ids = input_ids[:, i: i + self.context_len]
+                    chunk_attention_mask = attention_mask[:, i: i + self.context_len]
 
                     chunk_embeddings, token_num = self.__process_embed_chunk(
                         chunk_input_ids, chunk_attention_mask, **model_type_dict

From beab1da639c2b90b6f43f2351bab3175ab0018b6 Mon Sep 17 00:00:00 2001
From: jq460494839 <460494839@qq.com>
Date: Thu, 21 Dec 2023 11:54:51 +0800
Subject: [PATCH 2/2] format

---
 fastchat/serve/model_worker.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fastchat/serve/model_worker.py b/fastchat/serve/model_worker.py
index 44f15b9c0..93ccaa54a 100644
--- a/fastchat/serve/model_worker.py
+++ b/fastchat/serve/model_worker.py
@@ -102,6 +102,7 @@ def __init__(
     def generate_stream_gate(self, params):
         if self.device == "npu":
             import torch_npu
+
             torch_npu.npu.set_device("npu:0")
         self.call_ct += 1
 
@@ -218,8 +219,8 @@ def get_embeddings(self, params):
                 all_embeddings = []
                 all_token_num = 0
                 for i in range(0, input_ids.size(1), self.context_len):
-                    chunk_input_ids = input_ids[:, i: i + self.context_len]
-                    chunk_attention_mask = attention_mask[:, i: i + self.context_len]
+                    chunk_input_ids = input_ids[:, i : i + self.context_len]
+                    chunk_attention_mask = attention_mask[:, i : i + self.context_len]
 
                     chunk_embeddings, token_num = self.__process_embed_chunk(
                         chunk_input_ids, chunk_attention_mask, **model_type_dict
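
Note (illustration only, not part of the patches): the fix relies on the fact that the Ascend NPU runtime is initialized per process, so a worker process that was started separately from the parent has to import torch_npu and select its device before running inference, which is what generate_stream_gate now does when device == "npu". Below is a minimal standalone sketch of the same pattern, assuming torch_npu is installed and registers the "npu" device with PyTorch; the npu_worker function name, the spawn start method, and the device indices are hypothetical choices for the example, not taken from the patch.

    import torch
    import torch.multiprocessing as mp


    def npu_worker(rank):
        # Each child process sets up its own NPU context; runtime state from
        # the parent process is not inherited, so the import and set_device
        # call happen here, inside the child.
        import torch_npu  # registers the "npu" device type with torch

        torch_npu.npu.set_device(f"npu:{rank}")
        x = torch.ones(2, 2).to(f"npu:{rank}")
        print(rank, x.sum().item())


    if __name__ == "__main__":
        mp.set_start_method("spawn")
        procs = [mp.Process(target=npu_worker, args=(i,)) for i in range(2)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()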