
Commit

WIP
tengomucho committed Jan 24, 2025
1 parent 0f52871 commit 153a90f
Showing 2 changed files with 63 additions and 50 deletions.
66 changes: 16 additions & 50 deletions docs/source/training_tutorials/sft_lora_finetune_llm.mdx
@@ -224,24 +224,25 @@ set -ex

export NEURON_FUSE_SOFTMAX=1
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3
export MALLOC_ARENA_MAX=64
export MALLOC_ARENA_MAX=64 # limit the CPU allocation to avoid potential crashes
export NEURON_CC_FLAGS="--model-type=transformer --distribution-strategy=llm-training --enable-saturate-infinity --cache_dir=/home/ubuntu/cache_dir_neuron/"

PROCESSES_PER_NODE=8
PROCESSES_PER_NODE=4

NUM_EPOCHS=1
TP_DEGREE=2
PP_DEGREE=1
BS=1
GRADIENT_ACCUMULATION_STEPS=8
LOGGING_STEPS=1
MODEL_NAME="meta-llama/Meta-Llama-3-8B"
OUTPUT_DIR=output
OUTPUT_DIR=dolly_llama

if [ "$NEURON_EXTRACT_GRAPHS_ONLY" = "1" ]; then
MAX_STEPS=$((LOGGING_STEPS + 10))
MAX_STEPS=10
NUM_EPOCHS=1
else
MAX_STEPS=-1
NUM_EPOCHS=3
fi


@@ -267,15 +268,19 @@ XLA_USE_BF16=1 torchrun --nproc_per_node $PROCESSES_PER_NODE docs/source/trainin
--overwrite_output_dir
```

Save this script to a file, for instance `sft_lora_finetune_llm.sh`, and pass it to the `neuron_parallel_compile` tool to trigger the compilation.
For convenience, this shell script is available in the repository as [sft_lora_finetune_llm.sh](https://github.com/huggingface/optimum-neuron/blob/main/docs/source/training_tutorials/sft_lora_finetune_llm.sh). You can now pass it to the `neuron_parallel_compile` tool to trigger the compilation:

```bash
neuron_parallel_compile bash docs/source/training_tutorials/sft_lora_finetune_llm.sh
```

<Tip>

Make sure to run this precompilation phase for around 10 training steps to ensure that the compiler has compiled all the necessary graphs. It is usually enough to accumulate and compile all the graphs that will be needed during the actual training.
This precompilation phase runs for around 10 training steps to ensure that the compiler has compiled all the necessary graphs; this is usually enough to accumulate and compile all the graphs that will be needed during the actual training.

</Tip>

_Note: Compiling without a cache can take a while. It will also create dummy files in the `dolly_llama_sharded` during compilation you will have to remove them afterwards. We also need to add `MALLOC_ARENA_MAX=64` to limit the CPU allocation to avoid potential crashes, don't remove it for now._
_Note: Compiling without a cache can take a while. Compilation will also create dummy files in the `dolly_llama` output directory, which you will have to remove afterwards._

```bash
# remove dummy artifacts which are created by the precompilation command
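# Assumed completion (the rest of this block is collapsed in the diff): based on the
# note above, the cleanup removes the dummy artifacts from the `dolly_llama` directory.
rm -rf dolly_llama
```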
@@ -288,52 +293,13 @@ After compilation is done we can start our actual training with a similar command.

We will use `torchrun` to launch our training script. `torchrun` is a tool that automatically spawns one training process per accelerator and handles their coordination. We can pass the number of accelerators via the `--nproc_per_node` argument alongside our hyperparameters.
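As a minimal sketch of that launch pattern (in practice you would pass the full set of flags from `sft_lora_finetune_llm.sh`, shown later in this commit), the invocation looks like this:

```bash
# Minimal torchrun launch sketch: spawn 4 processes, one per Neuron core in use
# (matching PROCESSES_PER_NODE=4 in the script); the remaining hyperparameters
# are omitted here and listed in full in sft_lora_finetune_llm.sh.
XLA_USE_BF16=1 torchrun --nproc_per_node 4 \
  docs/source/training_tutorials/sft_lora_finetune_llm.py \
  --model_id meta-llama/Meta-Llama-3-8B \
  --do_train \
  --output_dir dolly_llama
```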

The difference to the compilation command is that we changed from `max_steps=10` to `num_train_epochs=3`.
The difference from the compilation command is that the actual training uses `num_train_epochs=3` instead of `max_steps=10`; the script selects these values automatically depending on whether `NEURON_EXTRACT_GRAPHS_ONLY` is set.

Launch the training, with the following command.
Launch the training with the same command used in the precompilation step, but without `neuron_parallel_compile`:

```bash
#!/bin/bash
set -ex

export NEURON_FUSE_SOFTMAX=1
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3
export MALLOC_ARENA_MAX=64
export NEURON_CC_FLAGS="--model-type=transformer --distribution-strategy=llm-training --enable-saturate-infinity --cache_dir=/home/ubuntu/cache_dir_neuron/"

PROCESSES_PER_NODE=8
bash docs/source/training_tutorials/sft_lora_finetune_llm.sh

NUM_EPOCHS=1
TP_DEGREE=2
PP_DEGREE=1
BS=1
GRADIENT_ACCUMULATION_STEPS=8
LOGGING_STEPS=1
MODEL_NAME="meta-llama/Meta-Llama-3-8B"
OUTPUT_DIR=output
MAX_STEPS=-1


XLA_USE_BF16=1 torchrun --nproc_per_node $PROCESSES_PER_NODE docs/source/training_tutorials/sft_lora_finetune_llm.py \
--model_id $MODEL_NAME \
--num_train_epochs $NUM_EPOCHS \
--do_train \
--learning_rate 5e-5 \
--warmup_ratio 0.03 \
--max_steps $MAX_STEPS \
--per_device_train_batch_size $BS \
--per_device_eval_batch_size $BS \
--gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
--gradient_checkpointing true \
--bf16 \
--zero_1 false \
--tensor_parallel_size $TP_DEGREE \
--pipeline_parallel_size $PP_DEGREE \
--logging_steps $LOGGING_STEPS \
--save_total_limit 1 \
--output_dir $OUTPUT_DIR \
--lr_scheduler_type "constant" \
--overwrite_output_dir
```

That's it: we successfully trained Llama-3 8B on AWS Trainium!
47 changes: 47 additions & 0 deletions docs/source/training_tutorials/sft_lora_finetune_llm.sh
@@ -0,0 +1,47 @@
#!/bin/bash
set -ex

export NEURON_FUSE_SOFTMAX=1
export NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3
export MALLOC_ARENA_MAX=64 # limit the CPU allocation to avoid potential crashes
export NEURON_CC_FLAGS="--model-type=transformer --distribution-strategy=llm-training --enable-saturate-infinity --cache_dir=/home/ubuntu/cache_dir_neuron/"

PROCESSES_PER_NODE=4

TP_DEGREE=2
PP_DEGREE=1
BS=1
GRADIENT_ACCUMULATION_STEPS=8
LOGGING_STEPS=1
MODEL_NAME="meta-llama/Meta-Llama-3-8B"
OUTPUT_DIR=dolly_llama

if [ "$NEURON_EXTRACT_GRAPHS_ONLY" = "1" ]; then
MAX_STEPS=10
NUM_EPOCHS=1
else
MAX_STEPS=-1
NUM_EPOCHS=3
fi


XLA_USE_BF16=1 torchrun --nproc_per_node $PROCESSES_PER_NODE docs/source/training_tutorials/sft_lora_finetune_llm.py \
--model_id $MODEL_NAME \
--num_train_epochs $NUM_EPOCHS \
--do_train \
--learning_rate 5e-5 \
--warmup_ratio 0.03 \
--max_steps $MAX_STEPS \
--per_device_train_batch_size $BS \
--per_device_eval_batch_size $BS \
--gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
--gradient_checkpointing true \
--bf16 \
--zero_1 false \
--tensor_parallel_size $TP_DEGREE \
--pipeline_parallel_size $PP_DEGREE \
--logging_steps $LOGGING_STEPS \
--save_total_limit 1 \
--output_dir $OUTPUT_DIR \
--lr_scheduler_type "constant" \
--overwrite_output_dir

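For reference, the tutorial above uses this script in two stages: first under `neuron_parallel_compile` to populate the Neuron compilation cache, then directly for the actual training run.

```bash
# Stage 1: precompilation. neuron_parallel_compile runs the script in graph-extraction
# mode (NEURON_EXTRACT_GRAPHS_ONLY is set), so MAX_STEPS=10 and NUM_EPOCHS=1 apply.
neuron_parallel_compile bash docs/source/training_tutorials/sft_lora_finetune_llm.sh

# Stage 2: actual training, using the cached graphs (MAX_STEPS=-1, NUM_EPOCHS=3).
bash docs/source/training_tutorials/sft_lora_finetune_llm.sh
```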