13b training scripts
Hao Zhang committed May 29, 2023
commit 318891613d7958118ddbae636b3f1f2b83f18d91
2 changes: 1 addition & 1 deletion fastchat/train/train.py
@@ -249,7 +249,7 @@ def make_supervised_data_module(
# train_dataset = dataset_cls(train_raw_data, tokenizer=tokenizer)
# eval_dataset = dataset_cls(eval_raw_data, tokenizer=tokenizer)

rank0_print(f"#train {len(raw_data)}")
rank0_print(f"######## train {len(raw_data)}")
train_dataset = dataset_cls(raw_data, tokenizer=tokenizer)
return dict(train_dataset=train_dataset, eval_dataset=None)

31 changes: 31 additions & 0 deletions scripts/train_vicuna_13b_single_node_slurm.sh
@@ -0,0 +1,31 @@
#!/bin/bash
echo "NODE_RANK="$SLURM_NODEID
echo "MASTER_ADDR="$MASTER_ADDR
echo "MASTER_PORT="$MASTER_PORT
echo "WORLD_SIZE="$WORLD_SIZE
python -m torch.distributed.run --nproc_per_node=16 --nnodes $SLURM_NNODES --node_rank=$SLURM_PROCID \
--master_addr $MASTER_ADDR --master_port $MASTER_PORT \
fastchat/train/train_xformer.py \
--model_name_or_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/llama-13b \
--data_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/sharegpt_20230515_clean_lang_split_identity_v2.json \
--fp16 True \
--output_dir vicuna_13b_full_sharegpt_20230515_v2_32GPU \
--num_train_epochs 3 \
--max_steps 10 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 400 \
--save_total_limit 8 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.04 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--fsdp "full_shard auto_wrap" \
--fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
--model_max_length 2048 \
--gradient_checkpointing True \
--lazy_preprocess True
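As a rough sanity check, the effective global batch size implied by these flags, assuming the 2-node, 16-GPU-per-node allocation requested by the companion sbatch script below (the output directory name also says 32GPU), works out to 128 sequences per optimizer step:

# nodes x gpus_per_node x per_device_train_batch_size x gradient_accumulation_steps
echo $(( 2 * 16 * 4 * 1 ))   # -> 128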
40 changes: 40 additions & 0 deletions scripts/train_vicuna_13b_slurm.sh
@@ -0,0 +1,40 @@
#!/bin/bash
#SBATCH --job-name=hao_13b_full # create a short name for your job
#SBATCH --nodes=2
#SBATCH --gres=gpu:16 # number of gpus per node
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=96
#SBATCH --time=30-00:00:00 # total run time limit (HH:MM:SS)
#SBATCH --reservation=high-profile
#SBATCH --partition=high-profile
#SBATCH --error=/nfs/projects/mbzuai/ext_hao.zhang/hao/slurm_logs/job.%J.%N.13b.err
#SBATCH --output=/nfs/projects/mbzuai/ext_hao.zhang/hao/slurm_logs/job.%J.%N.13b.out
##### Number of total processes
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= " $SLURM_JOB_NODELIST
echo "Number of nodes:= " $SLURM_JOB_NUM_NODES
echo "Ntasks per node:= " $SLURM_NTASKS_PER_NODE
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
# If you want to load things from your .bashrc profile, e.g. cuda drivers, singularity etc
cd /nfs/projects/mbzuai/ext_hao.zhang/
source ~/.bashrc
conda activate hao-env
cd hao/FastChat
free -g 2>&1
lscpu 2>&1
# ******************* These are read internally it seems ***********************************
# ******** Master port, address and world size MUST be passed as variables for DDP to work
export MASTER_PORT=20001
export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
echo "MASTER_PORT"=$MASTER_PORT
#echo "WORLD_SIZE="$WORLD_SIZE
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
#echo "MASTER_ADDR="$MASTER_ADDR
# ******************************************************************************************
echo "Run started at:- "
date
# Actual run of script
#srun python main.py # Use this if you have python in your environment
srun scripts/train_vicuna_13b_single_node_slurm.sh
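A minimal sketch of how this batch script would typically be submitted and monitored (standard SLURM commands; the job ID and node name in the log filename are placeholders that SLURM fills in via the %J and %N patterns in the #SBATCH directives above):

sbatch scripts/train_vicuna_13b_slurm.sh    # submit the 2-node job
squeue -u $USER                             # confirm the job is pending/running
# follow the training log once the job starts (substitute the actual job ID and node name)
tail -f /nfs/projects/mbzuai/ext_hao.zhang/hao/slurm_logs/job.<JOBID>.<NODE>.13b.out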
9 changes: 5 additions & 4 deletions scripts/train_vicuna_30b_single_node_slurm.sh
@@ -6,15 +6,16 @@ echo "WORLD_SIZE="$WORLD_SIZE
python -m torch.distributed.run --nproc_per_node=16 --nnodes $SLURM_NNODES --node_rank=$SLURM_PROCID \
--master_addr $MASTER_ADDR --master_port $MASTER_PORT \
fastchat/train/train_xformer.py \
- --model_name_or_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/llama-30b \
- --data_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/sharegpt_20230515_clean_lang_split_identity_gpt4.json \
+ --model_name_or_path /nfs/projects/mbzuai/ext_hao.zhang/hao/FastChat/vicuna_30b_sharegpt_20230515_48GPU/checkpoint-500 \
+ --data_path /nfs/projects/mbzuai/ext_hao.zhang/hao/dataset/sharegpt_20230515_clean_lang_split_identity.json \
--fp16 True \
- --output_dir vicuna_30b_sharegpt_20230515_48GPU_gpt4 \
+ --output_dir vicuna_30b_sharegpt_20230515_48GPU \
--num_train_epochs 3 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 1 \
- --evaluation_strategy "no" \
+ --evaluation_strategy "steps" \
+ --eval_steps 1500 \
--save_strategy "steps" \
--save_steps 200 \
--save_total_limit 8 \