#!/bin/bash
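# Single-node, multi-GPU (snmg) pretraining of a GPT-2 style model with
# Megatron-LM + DeepSpeed, launched inside a container via SLURM/pyxis.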
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=16
#SBATCH --hint=nomultithread
#SBATCH --output=%x-%j.out
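# NOTE: no GPU request appears above; depending on how the cluster schedules
# GPUs, a directive such as "#SBATCH --gres=gpu:8" may also be required
# (assumption: this partition allocates whole GPU nodes by default).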
set -e -x
WORKSPACE=$(pwd)
DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
CONTAINER_IMAGE=pytorch:22.02-py3-deepspeed
N_GPUS=8
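# All GPUs sit on the single allocated node; the deepspeed launcher below
# starts one worker process per GPU.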
# Paths
DATASET=${WORKSPACE}/my-gpt2_text_document
VOCAB_PATH=${WORKSPACE}/gpt2-vocab.json
MERGE_PATH=${WORKSPACE}/gpt2-merges.txt
CONFIG_JSON=${WORKSPACE}/ds_config.json
# Enable DeepSpeed
USE_DEEPSPEED=1
ZERO_STAGE=1
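# ZeRO stage 1 partitions only the optimizer states across the data-parallel
# ranks; parameters and gradients stay replicated on every rank.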
# Debug model
TP=4
PP=1
LAYERS=8
HIDDEN=512
SEQ=1024
GLOBAL_BATCH=128
MICRO_BATCH=4
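# Resulting layout on 8 GPUs: data-parallel size = N_GPUS / (TP * PP)
# = 8 / (4 * 1) = 2, so gradient accumulation steps =
# GLOBAL_BATCH / (MICRO_BATCH * DP) = 128 / (4 * 2) = 16.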
# 52B
#TP=4
#PP=16
#HIDDEN=8192
#LAYERS=64
#SEQ=1024
#GLOBAL_BATCH=1024
#MICRO_BATCH=4
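# The commented 52B preset needs at least TP * PP = 4 * 16 = 64 GPUs, i.e.
# multiple nodes; it is kept here only as a reference configuration.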
# Megatron Options
options=" \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --num-layers ${LAYERS} \
    --hidden-size ${HIDDEN} \
    --num-attention-heads 32 \
    --seq-length ${SEQ} \
    --loss-scale 12 \
    --max-position-embeddings ${SEQ} \
    --micro-batch-size ${MICRO_BATCH} \
    --global-batch-size ${GLOBAL_BATCH} \
    --train-iters 1000 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --log-interval 1 \
    --eval-iters 40 \
    --eval-interval 1000 \
    --data-path ${DATASET} \
    --vocab-file ${VOCAB_PATH} \
    --merge-file ${MERGE_PATH} \
    --save-interval 1000 \
    --split 98,2,0 \
    --clip-grad 1.0 \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --init-method-std 0.006 \
    --fp16 \
    --checkpoint-activations \
    --distributed-backend nccl \
"
# DeepSpeed Options
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
    echo "Using DeepSpeed"
    options="${options} \
        --deepspeed \
        --deepspeed-activation-checkpointing \
        --deepspeed_config=${CONFIG_JSON} \
        --zero-stage=${ZERO_STAGE} \
    "
fi
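# --deepspeed-activation-checkpointing hands Megatron's activation
# checkpointing over to DeepSpeed's implementation; --deepspeed_config points
# at the JSON file generated below.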
# DeepSpeed Configs
cat <<EOT > $CONFIG_JSON
{
  "train_batch_size": ${GLOBAL_BATCH},
  "train_micro_batch_size_per_gpu": ${MICRO_BATCH},
  "steps_per_print": 1,
  "zero_optimization": {
    "stage": ${ZERO_STAGE}
  },
  "gradient_clipping": 1.0,
  "prescale_gradients": false,
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "wall_clock_breakdown": true
}
EOT
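# Sanity check on the generated config: DeepSpeed enforces
# train_batch_size == micro_batch_per_gpu * grad_accum_steps * DP world size,
# here 128 == 4 * 16 * 2. With "loss_scale": 0, fp16 loss scaling is dynamic,
# starting at 2^initial_scale_power = 2^12 = 4096.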
CMD="${WORKSPACE}/pretrain_gpt.py ${options}"
echo "${CMD}"
# DeepSpeed launcher
LAUNCHER="deepspeed \
    --num_gpus ${N_GPUS} \
"
# Execution command
srun -l \
    --container-image ${CONTAINER_IMAGE} \
    --container-mounts ${WORKSPACE} \
    bash -c "${LAUNCHER} ${CMD}"