adapt t5 for any rank compile
strint committed Sep 30, 2022
commit 8de0b595f8ac7cb5670de6fd3706aa692e0d2b97
configs/common/data/test_t5_dataset.py (2 changes: 1 addition & 1 deletion)
@@ -9,7 +9,7 @@

dataloader.train = LazyCall(build_nlp_train_loader)(
    dataset=[
-        LazyCall(T5Dataset)(vocab_size=8, num_samples=1000, enc_seq_len=8, dec_seq_len=8),
+        LazyCall(T5Dataset)(vocab_size=8, num_samples=1024, enc_seq_len=8, dec_seq_len=8),
    ],
    num_workers=4,
)
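Taken together with the parallel layout changed below in configs/test_t5_large_pretrain.py (data_parallel_size=2, train_micro_batch_size=80), the new num_samples=1024 gives 1024 // (2 * 80) = 6 full steps per epoch with 64 samples left over. A minimal sketch of that arithmetic, assuming train_micro_batch_size is the per-rank micro batch and samples are sharded evenly across data-parallel ranks (assumptions, not something this diff states):

# Back-of-the-envelope check using the numbers from this commit; even sharding
# across data-parallel ranks is assumed, not taken from LiBai's loader code.
num_samples = 1024           # configs/common/data/test_t5_dataset.py
data_parallel_size = 2       # configs/test_t5_large_pretrain.py
train_micro_batch_size = 80  # per-rank micro batch (assumed)

samples_per_step = data_parallel_size * train_micro_batch_size  # 160
full_steps, leftover = divmod(num_samples, samples_per_step)
print(full_steps, leftover)  # 6 full steps, 64 samples left over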
configs/test_t5_large_pretrain.py (7 changes: 4 additions & 3 deletions)
@@ -11,7 +11,8 @@
model.cfg.num_attention_heads = 8
model.cfg.vocab_size = 8
model.cfg.hidden_size = 8
-model.cfg.hidden_layers = 6
+# model.cfg.hidden_layers = 6
+model.cfg.hidden_layers = 1
model.cfg.scale_mask_softmax_fusion = False
model.cfg.bias_dropout_fusion = False
model.cfg.bias_gelu_fusion = False
@@ -20,12 +21,12 @@

train.input_placement_device = "cpu"

-train.dist.data_parallel_size=10
+train.dist.data_parallel_size=2
train.dist.tensor_parallel_size=1
train.dist.pipeline_parallel_size=1
train.dist.pipeline_num_layers = 2 * model.cfg.hidden_layers

-train.train_micro_batch_size =100
+train.train_micro_batch_size = 80
train.amp.enabled = True

train.evaluation.evaluator = LazyCall(PPLEvaluator)()
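For reference, a minimal sketch of how these sizes fit together, assuming the usual world_size = data x tensor x pipeline factorization (the exact derivation lives in libai/utils/distributed.py, which this commit also touches); the variable names below only mirror the config keys, they are not LiBai internals:

# Hypothetical arithmetic for the config above.
data_parallel_size = 2
tensor_parallel_size = 1
pipeline_parallel_size = 1
hidden_layers = 1

world_size = data_parallel_size * tensor_parallel_size * pipeline_parallel_size  # 2 ranks
pipeline_num_layers = 2 * hidden_layers  # T5 encoder + decoder stacks -> 2
assert world_size % tensor_parallel_size == 0  # the check asserted in distributed.py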
libai/utils/distributed.py (3 changes: 2 additions & 1 deletion)
@@ -70,7 +70,6 @@ def _init_distributed_env(self, cfg):
        self._world_size = num_gpus_per_node * num_nodes

    def _init_parallel_size(self, cfg):
-
        # tensor parallel size
        self._tensor_parallel_size = min(cfg.tensor_parallel_size, self.world_size)
        assert self.world_size % self._tensor_parallel_size == 0, (
@@ -390,6 +389,8 @@ def get_world_size():


def get_num_nodes():
+    # Note that this is just for dry run compile
+    return 1
    return flow.env.get_node_size()
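Hard-coding return 1 leaves the existing flow.env.get_node_size() call unreachable. A less invasive variant, just a sketch assuming one would rather key the behavior off the ONEFLOW_DRY_RUN_GRAPH_COMPILE variable exported in tools/train.sh than hard-code the value, could look like this:

import os

import oneflow as flow


def get_num_nodes():
    # Pretend there is a single node only while dry-run compiling;
    # otherwise fall back to the real environment query.
    if os.getenv("ONEFLOW_DRY_RUN_GRAPH_COMPILE", "false").lower() == "true":
        return 1
    return flow.env.get_node_size()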


tools/train.sh (12 changes: 10 additions & 2 deletions)
@@ -9,10 +9,18 @@ ADDR=${ADDR:-127.0.0.1}
PORT=${PORT:-12345}

export GLOG_v=1
-#export ONEFLOW_DEBUG_MODE=1
+#export ONEFLOW_DEBUG_MODE=true

+export ONEFLOW_DRY_RUN_GRAPH_COMPILE=true
+export ONEFLOW_DRY_RUN_COMPILE_NODE_NUM=1
+export ONEFLOW_DRY_RUN_COMPILE_DEV_NUM_PER_NODE=80

export ONEFLOW_FUSE_OPTIMIZER_UPDATE_CAST=true

+export ONEFLOW_LAZY_COMPILE_MODE=naive
+#export ONEFLOW_LAZY_COMPILE_MODE=rank_per_thread
+#export ONEFLOW_LAZY_COMPILE_MODE=rank_per_iter

python3 -m oneflow.distributed.launch \
--nproc_per_node $GPUS --nnodes $NODE --node_rank $NODE_RANK --master_addr $ADDR --master_port $PORT \
$FILE --config-file $CONFIG ${@:4}
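The dry-run flags above appear to describe a simulated topology of 1 node with 80 devices, i.e. an 80-rank compile on a single machine. A tiny illustration of that reading (an assumption about what the two COMPILE_* variables mean, not an official OneFlow API):

import os

# Hypothetical reading of the dry-run topology variables set in train.sh.
node_num = int(os.getenv("ONEFLOW_DRY_RUN_COMPILE_NODE_NUM", "1"))
dev_per_node = int(os.getenv("ONEFLOW_DRY_RUN_COMPILE_DEV_NUM_PER_NODE", "1"))
print("simulated world size:", node_num * dev_per_node)  # 1 * 80 = 80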