-
Notifications
You must be signed in to change notification settings - Fork 100
/
Copy pathhf_ds_gpt2_base_n32.slurm
168 lines (141 loc) · 4.9 KB
/
hf_ds_gpt2_base_n32.slurm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/bin/bash
#
# Slurm batch job: runs the Transformers run_clm.py example with a GPT-2
# architecture whose dimensions are chosen further down (MSIZE), under a
# DeepSpeed ZeRO stage 3 configuration with CPU offload that this script writes
# itself. The allocation is 32 nodes with 4 GPUs each and a single task per
# node; that task starts one launcher which spawns the per-GPU processes.
#
# Required environment: six_ALL_CCFRWORK (root of the shared work area that
# holds start-prod, the code checkout and the dataset/module/metric caches).
#
#SBATCH --job-name=hf_ds_gpt2_base_n32
#SBATCH --constraint=v100-32g
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=40 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --gres=gpu:4 # number of gpus
#SBATCH --time 00:30:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --error=%x-%j.out # error file name (same to watch just one file)
#SBATCH --account=six@gpu
# Trace every command (-x) and abort on the first failing one (-e).
set -x -e

# Have Python write stdout/stderr unbuffered so the job log stays current.
export PYTHONUNBUFFERED=1

# Everything below hangs off this directory; fail with a clear message rather
# than sourcing or cd-ing into a path that starts at "/".
: "${six_ALL_CCFRWORK:?six_ALL_CCFRWORK must be set}"

# Site environment setup (contents not visible from this script).
source "$six_ALL_CCFRWORK/start-prod"

# Record the GPU state of the node running the batch script in the job log.
nvidia-smi

cd "$six_ALL_CCFRWORK/code/transformers-clm-any-model-config/"

# Keep the Hugging Face caches on the shared work area.
export HF_DATASETS_CACHE="$six_ALL_CCFRWORK/datasets"
export HF_MODULES_CACHE="$six_ALL_CCFRWORK/modules"
export HF_METRICS_CACHE="$six_ALL_CCFRWORK/metrics"

DATASET="stas/openwebtext-10k"

# Rendezvous address: first hostname of the allocation. The nodelist is quoted
# because its compressed form (e.g. node[0-3]) contains glob characters.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# The pipeline's status is that of head, so a failing scontrol does not trip
# set -e; check the result explicitly.
if [[ -z "$MASTER_ADDR" ]]; then
  echo "could not determine MASTER_ADDR from SLURM_JOB_NODELIST" >&2
  exit 1
fi
MASTER_PORT=6000
# adjust depending on the number of the nodes
NNODES=32

MICRO_BATCH_SIZE=4

# Model-size key selecting a row of the table in select_model_dims.
# NOTE(review): the keys look like approximate parameter counts in billions
# (compare the "Model size" printout before launch) - confirm.
# succeeded:
# to try
MSIZE=158
# failed
# MSIZE=181

#######################################
# Map a model-size key to the GPT-2 hidden size and number of layers.
# Globals:   NHIDDEN, NLAYERS (written on success, untouched on failure)
# Arguments: $1 - model-size key, one of the values listed below
# Outputs:   an error message on stderr for an unknown key
# Returns:   0 on success, 1 for an unknown key
#######################################
select_model_dims() {
  case "$1" in
    7)   NHIDDEN=4096;  NLAYERS=36 ;;
    14)  NHIDDEN=6144;  NLAYERS=32 ;;
    18)  NHIDDEN=6144;  NLAYERS=40 ;;
    25)  NHIDDEN=7168;  NLAYERS=40 ;;
    30)  NHIDDEN=7168;  NLAYERS=48 ;;
    39)  NHIDDEN=8192;  NLAYERS=48 ;;
    52)  NHIDDEN=8192;  NLAYERS=64 ;;
    65)  NHIDDEN=9216;  NLAYERS=64 ;;
    81)  NHIDDEN=10240; NLAYERS=64 ;;
    97)  NHIDDEN=11264; NLAYERS=64 ;;
    116) NHIDDEN=12288; NLAYERS=64 ;;
    136) NHIDDEN=13312; NLAYERS=64 ;;
    158) NHIDDEN=14336; NLAYERS=64 ;;
    181) NHIDDEN=15360; NLAYERS=64 ;;
    206) NHIDDEN=16384; NLAYERS=64 ;;
    *)
      echo "invalid MSIZE: $1" >&2
      return 1
      ;;
  esac
}

# Stop here on an unknown key: continuing would build the training command
# with empty n_embd / n_layer overrides.
select_model_dims "$MSIZE" || exit 1

GPUS_PER_NODE=4
NHEADS=32
SEQ_LEN=1024
VOCAB_SIZE=50257
# Per-node launcher command line, assembled one option at a time. It is kept
# as a flat string and exported because the final srun step expands $LAUNCHER
# inside a child bash, where it is word-split; --node_rank is appended there.
LAUNCHER="python -u -m torch.distributed.launch"
LAUNCHER+=" --nproc_per_node $GPUS_PER_NODE"
LAUNCHER+=" --nnodes $NNODES"
LAUNCHER+=" --master_addr $MASTER_ADDR"
LAUNCHER+=" --master_port $MASTER_PORT "
export LAUNCHER
# DeepSpeed configuration handed to the training script via --deepspeed:
# ZeRO stage 3 with optimizer state and parameters both offloaded to pinned
# CPU memory. It is regenerated in the working directory on every run.
# NOTE(review): the "auto" entries are presumably filled in by the Transformers
# DeepSpeed integration from the command-line arguments - confirm.
config_json="./ds_z3_cpu_offload.json"

# Quoted delimiter: the JSON is literal text, nothing in it is meant to be
# expanded by the shell.
cat > "${config_json}" <<'EOT'
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 8,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e14,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_fp16_weights_on_model_save": false
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
EOT
# Import the library from the checkout's src/ directory.
export PYTHONPATH=src
# Offline switches for datasets/transformers, and the USE_TF=0 toggle.
export HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 USE_TF=0

# Training script plus its arguments, assembled one option at a time into a
# single flat string. The string is exported and later expanded unquoted by the
# bash that srun starts, so it is split on whitespace there; for that reason
# the --config_overrides value carries no quotes of its own and must stay free
# of spaces.
CMD=" examples/pytorch/language-modeling/run_clm.py"
CMD+=" --model_type gpt2"
CMD+=" --tokenizer_name gpt2"
CMD+=" --config_overrides n_embd=$NHIDDEN,n_head=$NHEADS,n_layer=$NLAYERS,n_positions=$SEQ_LEN,gradient_checkpointing=true,use_cache=False"
CMD+=" --dataset_name $DATASET"
CMD+=" --output_dir output_dir"
CMD+=" --overwrite_output_dir"
CMD+=" --do_train"
CMD+=" --max_train_samples 1000"
CMD+=" --per_device_train_batch_size $MICRO_BATCH_SIZE"
CMD+=" --num_train_epochs 1"
CMD+=" --warmup_steps 8"
CMD+=" --fp16"
CMD+=" --report_to none"
CMD+=" --deepspeed $config_json "
export CMD
# clear old checkpoint as it'd mismatch while we sort things out
# ${var:?} aborts the script if the variable is unset or empty, instead of
# letting the path collapse to /checkpoints/gpt2-1-node; "--" ends option
# parsing before the path.
rm -rf -- "${six_ALL_CCFRWORK:?}/checkpoints/gpt2-1-node"
# model size
# Prints the parameter-count estimate, rounded to whole billions:
# l * (12*h^2 + 13*h) for the layers, plus v*h and s*h (these look like the
# token and position embedding tables).
python -c "h=$NHIDDEN; l=$NLAYERS; s=$SEQ_LEN; v=$VOCAB_SIZE; print(f'Model size: {(l * (12*h**2 + 13*h) + (v * h) + (s * h) ) / 10**9 :.0f}B')"

# Without pipefail the pipeline below reports tee's exit status, so a failed
# srun would still let this batch script finish with status 0.
set -o pipefail

# to debug - add echo (it exits and prints what it would have launched)
# The single quotes are deliberate: $LAUNCHER, $SLURM_PROCID and $CMD are
# expanded by the bash that srun starts, not by this script (LAUNCHER and CMD
# reach it because they are exported). Output is appended to a log file as well
# as going to the job's own output.
# shellcheck disable=SC2016
srun --jobid "$SLURM_JOBID" bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a hf_ds_gpt2_base_n32_bs4.txt