13b training scripts
Hao Zhang committed May 29, 2023
commit 318891613d7958118ddbae636b3f1f2b83f18d91
2 changes: 1 addition & 1 deletion fastchat/train/train.py
@@ -249,7 +249,7 @@ def make_supervised_data_module(
# train_dataset = dataset_cls(train_raw_data, tokenizer=tokenizer)
# eval_dataset = dataset_cls(eval_raw_data, tokenizer=tokenizer)

rank0_print(f"#train {len(raw_data)}")
rank0_print(f"######## train {len(raw_data)}")
train_dataset = dataset_cls(raw_data, tokenizer=tokenizer)
return dict(train_dataset=train_dataset, eval_dataset=None)

31 changes: 31 additions & 0 deletions scripts/train_vicuna_13b_single_node_slurm.sh
@@ -0,0 +1,31 @@
#!/bin/bash
echo "NODE_RANK="$SLURM_NODEID
echo "MASTER_ADDR="$MASTER_ADDR
echo "MASTER_PORT="$MASTER_PORT
echo "WORLD_SIZE="$WORLD_SIZE
python -m torch.distributed.run --nproc_per_node=16 --nnodes $SLURM_NNODES --node_rank=$SLURM_PROCID \
--master_addr $MASTER_ADDR --master_port $MASTER_PORT \
fastchat/train/train_xformer.py \
--model_name_or_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/llama-13b \
--data_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/sharegpt_20230515_clean_lang_split_identity_v2.json \
--fp16 True \
--output_dir vicuna_13b_full_sharegpt_20230515_v2_32GPU \
--num_train_epochs 3 \
--max_steps 10 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 400 \
--save_total_limit 8 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.04 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--fsdp "full_shard auto_wrap" \
--fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
--model_max_length 2048 \
--gradient_checkpointing True \
--lazy_preprocess True
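As a rough sanity check, the effective global batch size implied by these flags, assuming the 2-node, 16-GPU-per-node allocation requested by the companion sbatch script below (the output directory name also says 32GPU), works out to 128 sequences per optimizer step:

# nodes x gpus_per_node x per_device_train_batch_size x gradient_accumulation_steps
echo $(( 2 * 16 * 4 * 1 ))   # -> 128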
40 changes: 40 additions & 0 deletions scripts/train_vicuna_13b_slurm.sh
@@ -0,0 +1,40 @@
#!/bin/bash
#SBATCH --job-name=hao_13b_full # create a short name for your job
#SBATCH --nodes=2
#SBATCH --gres=gpu:16 # number of gpus per node
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --time=30-00:00:00 # total run time limit (HH:MM:SS)
#SBATCH --reservation=high-profile
#SBATCH --partition=high-profile
#SBATCH --error=/nfs/projects/mbzuai/ext_hao.zhang/hao/slurm_logs/job.%J.%N.13b.err
#SBATCH --output=/nfs/projects/mbzuai/ext_hao.zhang/hao/slurm_logs/job.%J.%N.13b.out
##### Number of total processes
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= " $SLURM_JOB_NODELIST
echo "Number of nodes:= " $SLURM_JOB_NUM_NODES
echo "Ntasks per node:= " $SLURM_NTASKS_PER_NODE
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
# If you want to load things from your .bashrc profile, e.g. cuda drivers, singularity etc
cd /nfs/projects/mbzuai/ext_hao.zhang/
source ~/.bashrc
conda activate hao-env
cd hao/FastChat
free -g 2>&1
lscpu 2>&1
# ******************* These are read internally it seems ***********************************
# ******** Master port, address and world size MUST be passed as variables for DDP to work
export MASTER_PORT=20001
export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
echo "MASTER_PORT"=$MASTER_PORT
#echo "WORLD_SIZE="$WORLD_SIZE
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
#echo "MASTER_ADDR="$MASTER_ADDR
# ******************************************************************************************
echo "Run started at:- "
date
# Actual run of script
#srun python main.py # Use this if you have python in your environment
srun scripts/train_vicuna_13b_single_node_slurm.sh
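A minimal sketch of how this batch script would typically be submitted and monitored (standard SLURM commands; the job ID and node name in the log filename are placeholders that SLURM fills in via the %J and %N patterns in the #SBATCH directives above):

sbatch scripts/train_vicuna_13b_slurm.sh    # submit the 2-node job
squeue -u $USER                             # confirm the job is pending/running
# follow the training log once the job starts (substitute the actual job ID and node name)
tail -f /nfs/projects/mbzuai/ext_hao.zhang/hao/slurm_logs/job.<JOBID>.<NODE>.13b.out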
9 changes: 5 additions & 4 deletions scripts/train_vicuna_30b_single_node_slurm.sh
@@ -6,15 +6,16 @@ echo "WORLD_SIZE="$WORLD_SIZE
python -m torch.distributed.run --nproc_per_node=16 --nnodes $SLURM_NNODES --node_rank=$SLURM_PROCID \
--master_addr $MASTER_ADDR --master_port $MASTER_PORT \
fastchat/train/train_xformer.py \
- --model_name_or_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/llama-30b \
- --data_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/sharegpt_20230515_clean_lang_split_identity_gpt4.json \
+ --model_name_or_path /nfs/projects/mbzuai/ext_hao.zhang/hao/FastChat/vicuna_30b_sharegpt_20230515_48GPU/checkpoint-500 \
+ --data_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/sharegpt_20230515_clean_lang_split_identity.json \
--fp16 True \
- --output_dir vicuna_30b_sharegpt_20230515_48GPU_gpt4 \
+ --output_dir vicuna_30b_sharegpt_20230515_48GPU \
--num_train_epochs 3 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 1 \
- --evaluation_strategy "no" \
+ --evaluation_strategy "steps" \
+ --eval_steps 1500 \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 8 \