fix tensor parallel not working
CPFLAME committed Aug 4, 2022
commit 40cfb282dd5e752436ca66b41af149fbfd712c2f
2 changes: 1 addition & 1 deletion libai/engine/default.py
@@ -547,7 +547,7 @@ def build_model(cls, cfg):
         model = build_model(cfg.model)
         logger = logging.getLogger(__name__)
         logger.info("Model:\n{}".format(model))
-        model.apply(dist.convert_to_distributed_default_setting)
+        model._apply(dist.convert_to_distributed_default_setting)
         return model

     @classmethod
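Context on the one-line change above: in the PyTorch-style module API that OneFlow follows, `Module.apply(fn)` passes each submodule to `fn` and ignores the return value, while the private `Module._apply(fn)` passes each parameter/buffer tensor to `fn` and stores the returned tensor back into the module. Since `convert_to_distributed_default_setting` now takes a single tensor and returns its global version (see the distributed.py diff below), only `_apply` actually swaps the converted tensors in. A minimal sketch of the distinction, assuming OneFlow's `nn.Module` mirrors PyTorch semantics here:

```python
import oneflow.nn as nn

model = nn.Linear(4, 4)

def on_module(m):
    # Module.apply(fn): fn receives each submodule; the return value is ignored.
    print("apply got:", type(m).__name__)

def on_tensor(t):
    # Module._apply(fn): fn receives each parameter/buffer tensor, and the
    # tensor it returns replaces the one stored in the module.
    print("_apply got tensor of shape", tuple(t.shape))
    return t

model.apply(on_module)    # prints "apply got: Linear"
model._apply(on_tensor)   # prints one line per parameter (weight, bias)
```

Note that `_apply` is a private method, so this change relies on an implementation detail of the module class.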
10 changes: 5 additions & 5 deletions libai/engine/trainer.py
@@ -212,11 +212,11 @@ def write_metrics(
         metrics_dict = all_metrics_dict
         total_losses_reduced = sum(metrics_dict.values())

-        ### newly added
-        total_losses_reduced = sum(metrics_dict.values())
-        if dist.is_main_process():
-            txt = open("/home/chenqiaoling/RWKV-LM/libai/projects/RWKV_V4/results/exp7_libai.txt", "a")
-            txt.write(str(total_losses_reduced.item())+"\n")
+        # ### newly added
+        # total_losses_reduced = sum(metrics_dict.values())
+        # if dist.is_main_process():
+        #     txt = open("/home/chenqiaoling/RWKV-LM/libai/projects/RWKV_V4/results/exp7_libai.txt", "a")
+        #     txt.write(str(total_losses_reduced.item())+"\n")

         storage.put_scalar("{}total_loss".format(prefix), total_losses_reduced)
         if len(metrics_dict) > 1:
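The trainer.py hunk only comments out leftover debugging code that appended the reduced loss to a hard-coded absolute path without ever closing the file. If this kind of scalar dump is needed again, a less fragile pattern is to gate on the main process and let a context manager close the handle; a sketch (`LOG_PATH` is a hypothetical placeholder, `dist.is_main_process` is as used in the original snippet):

```python
from libai.utils import distributed as dist

LOG_PATH = "results/exp7_libai.txt"  # hypothetical path, not one from the repo

def log_total_loss(total_losses_reduced):
    # Only rank 0 writes, and the file is closed even if write() raises.
    if dist.is_main_process():
        with open(LOG_PATH, "a") as f:
            f.write(f"{total_losses_reduced.item()}\n")
```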
16 changes: 8 additions & 8 deletions libai/utils/distributed.py
@@ -387,18 +387,18 @@ def get_num_nodes():
     return flow.env.get_node_size()


-def convert_to_distributed_default_setting(module):
+def convert_to_distributed_default_setting(t):
     """
     Helper function to convert all eager local tensors in :attr:`nn.Module` in the model to
     global tensors with data parallelism as default.
     """
-    for _, v in module.state_dict().items():
-        if not v.is_global:
-            module.to_global(
-                sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                placement=get_layer_placement(0),
-            )
-    return
+    if not t.is_global:
+        return t.to_global(
+            sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
+            placement=get_layer_placement(0),
+        )
+    else:
+        return t


def ttol(tensor, pure_local=False, ranks=None):
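This is the substantive fix. The old version received a whole submodule and, whenever any tensor in its `state_dict()` was still local, re-cast via `module.to_global(...)` with broadcast SBP; judging by the commit title, this presumably clobbered SBP settings already placed on tensor-parallel layers. The rewrite converts one tensor at a time, leaves already-global tensors untouched, and returns the result, which is exactly the contract `Module._apply` expects (see the default.py diff above). A rough sketch of the underlying OneFlow conversion, using a single-rank 1D placement for simplicity (the libai helper instead builds a 2D SBP and placement via `get_nd_sbp` and `get_layer_placement`):

```python
import oneflow as flow

t = flow.randn(2, 3)                           # eager local tensor
placement = flow.placement("cpu", ranks=[0])   # single-rank placement for the sketch
g = t.to_global(sbp=flow.sbp.broadcast, placement=placement)
assert g.is_global                             # now a global tensor
print(g.sbp, g.placement)
```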