Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Previous commit
Next commit
mod for compile
  • Loading branch information
strint committed Sep 14, 2022
commit 725c44075141d0d6800e8ba5608400f61a695ef3
2 changes: 1 addition & 1 deletion configs/common/data/t5_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,5 @@
train_val_test_num_samples=None, # a hint for deferred assignment
splits=[[949.0, 50.0, 1.0]],
weights=[1.0],
num_workers=4,
num_workers=0,
)
8 changes: 6 additions & 2 deletions configs/t5_large_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from .common.models.graph import graph

vocab_file = "../bert_data/bert-base-chinese-vocab.txt"
data_prefix = "../bert_data/loss_compara_content_sentence"
data_prefix = "../bert_data/data/loss_compara_content_sentence"
# data_prefix = "../bert_data/loss_compara_content_sentence"

tokenization.tokenizer.vocab_file = vocab_file
dataloader.train.dataset[0].data_prefix = data_prefix
Expand All @@ -22,13 +23,16 @@
model.cfg.bias_dropout_fusion = False
model.cfg.bias_gelu_fusion = False

graph.debug = 1

train.input_placement_device = "cpu"

train.dist.data_parallel_size=8
train.dist.data_parallel_size=64
train.dist.tensor_parallel_size=1
train.dist.pipeline_parallel_size=1
train.dist.pipeline_num_layers = 2 * model.cfg.hidden_layers


train.train_micro_batch_size = 16
train.amp.enabled = True

Expand Down
1 change: 0 additions & 1 deletion libai/models/utils/graph_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def __init__(

self.model = model
self.is_train = is_train
self.debug(1, only_compile=True)

if is_train:
self.add_optimizer(optimizer, lr_sch=lr_scheduler)
Expand Down
1 change: 1 addition & 0 deletions tools/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ NODE_RANK=${NODE_RANK:-0}
ADDR=${ADDR:-127.0.0.1}
PORT=${PORT:-12345}

export GLOG_v=1
export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true

python3 -m oneflow.distributed.launch \
Expand Down