Change PyTorch version; the training job is able to launch now

lm-sys · merrymercy · Mar 19, 2023 · Mar 19, 2023 · Mar 19, 2023 · Mar 19, 2023
commit eda603a9d95dbcb4184d3f4b80307b88a5b8f850
diff --git a/README.md b/README.md
@@ -6,16 +6,18 @@ chatbot server
 1. Install skypilot and setup the credentials locally following the instructions [here](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)
 2. Launch the training job with the following line (will be launched on a single node with 4 A100-80GB GPUs)
     ```
-    sky launch -c alpaca -s scripts/train.yaml
+    # WANDB_API_KEY is required for logging. Its value is taken from your local environment.
+    sky launch -c alpaca -s scripts/train.yaml --env WANDB_API_KEY
     ```
-    We can also launch the training job with multiple nodes and different number of GPUs. We will automatically adapt the
-    gradient accumulation steps to the setting (Supported max number of #nodes * #GPUs per node = 32)
+    Or use spot instances (not managed).
     ```
-    sky launch -c alpaca-2 -s --num-nodes 2 --gpus A100-80GB:8 scripts/train.yaml
+    sky launch -c alpaca-spot -s --use-spot scripts/train.yaml --env WANDB_API_KEY
     ```
-    Or using spot (not managed).
+    **The following does not work yet, because the Alpaca code does not support multiple nodes.**
+    We can also launch the training job with multiple nodes and a different number of GPUs. We will automatically adapt the
+    gradient accumulation steps to the setting (the maximum supported value of #nodes * #GPUs per node is 32).
     ```
-    sky launch -c alpaca-spot -s --use-spot scripts/train.yaml
+    sky launch -c alpaca-2 -s --num-nodes 2 --gpus A100-80GB:8 scripts/train.yaml  --env WANDB_API_KEY
     ```
     Managed spot version TO BE ADDED.
 

diff --git a/scripts/train.yaml b/scripts/train.yaml
@@ -33,18 +33,19 @@ setup: |
   conda activate chatbot
 
   # Install pytorch
-  pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
+  pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
 
   # Install huggingface with the LLaMA commit
   git clone https://github.com/huggingface/transformers.git
   cd transformers
-  git checkout 60d51ef5123d949fd8c59cd4d3254e711541d278
+  git checkout 60d51ef # pin to a fixed commit (the latest at the time of writing)
   pip install .
   cd -
 
   # Install alpaca
   git clone https://github.com/tatsu-lab/stanford_alpaca.git
   cd stanford_alpaca
+  git checkout eb5b171 # pin to a fixed commit (the latest at the time of writing)
   pip install -r requirements.txt
   cd -
 
@@ -72,9 +73,7 @@ run: |
   torchrun \
     --nnodes=$NUM_NODES \
     --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \
-    --rdzv_id=100 \
-    --rdzv_backend=c10d \
-    --rdzv_endpoint=$HOST_ADDR:29500 \
+    --master_port=12355 \
     train.py \
     --model_name_or_path /artifacts/llama-hf/llama-7B \
     --data_path /data/alpaca-data.json \