diff --git a/ALCF/aws_ofi_nccl_plugin.sh b/ALCF/aws_ofi_nccl_plugin.sh
index ffd1471cd3..fc91a972a6 100644
--- a/ALCF/aws_ofi_nccl_plugin.sh
+++ b/ALCF/aws_ofi_nccl_plugin.sh
@@ -3,8 +3,12 @@
 # AWS NCCL OFI Plugin settings below
 export NCCL_CROSS_NIC=1
 export NCCL_COLLNET_ENABLE=1
-export NCCL_NET="AWS Libfabric"
-export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH
+if [ -n "$SOPHIA" ]; then
+    export NCCL_SOCKET_IFNAME=bond0
+else
+    export NCCL_NET="AWS Libfabric"
+    export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH
+fi
 export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH
 export FI_CXI_DISABLE_HOST_REGISTER=1
 export FI_MR_CACHE_MONITOR=userfaultfd
diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index bc31c322ce..a7c2b60ec6 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -365,7 +365,14 @@ setupLauncher() {
         make_ds_hostfile || exit
         export LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}"
     else
-        if [[ -n "${DIST_LAUNCH}" ]]; then
+        if [ -n "$SOPHIA" ]; then
+            # LAUNCHER="${DIST_LAUNCH} --pmi=pmix --genvall $(which python3) -Wignore ${EXEC}"
+            # dist_launch_cmd="mpiexec --verbose --envall -n ${num_gpus} -ppn ${num_gpus_per_host} --hostfile ${hostfile} --cpu-bind depth -d ${depth}"
+            LAUNCHER="mpiexec --verbose $(env | grep -v -e '"' -e ';' -e '*' -e '-' -e '__' -e '}' -e '|' -e ')' -e esac -e else -e fi | awk -F= '{print "-x", $1}' | tr '\n' ' ') \
+                -n ${num_gpus} --hostfile ${hostfile} $(which python3) -Wignore ${EXEC}"
+            echo "${LAUNCHER}"
+            export LAUNCHER="${LAUNCHER}"
+        elif [[ -n "${DIST_LAUNCH}" ]]; then
             mn=$(get_machine_name)
             if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then
                 LAUNCHER="${DIST_LAUNCH} --pmi=pmix --genvall $(which python3) -Wignore ${EXEC}"
@@ -561,7 +568,7 @@ setParams() {
     ######################################################################
     # +--------[Polaris]-----------------------------------+
     # elif [[ $(hostname) == x3* ]]; then
-    elif [[ "${mn}" == "polaris" || "${mn}" == "sirius" ]]; then
+    elif [[ "${mn}" == "polaris" || "${mn}" == "sirius" || -n "$SOPHIA" ]]; then
         # export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}"
         TP=${TP:-1}               # TP = 2
         export NCCL=${NCCL:-nccl} # NCCL
@@ -594,7 +601,7 @@ setParams() {
         fi
     fi
     # +----------------------------------------------------------------------+
-    export TP="${TP}"
+    export TP="${TP:-1}"
     export PP="${PP:-1}"
     export SP="${SP:-1}"
     export FLASH_ARG="${FLASH_ARG}"
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 592ff2855b..12041b133a 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -380,10 +380,10 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask):
         context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
 
         # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + (
-            self.hidden_size_per_partition,
-        )
-        context_layer = context_layer.view(*new_context_layer_shape)
+        # new_context_layer_shape = context_layer.size()[:-2] + (
+        #     self.hidden_size_per_partition,
+        # )
+        # context_layer = context_layer.view(*new_context_layer_shape)
 
         return context_layer
 
@@ -987,7 +987,7 @@ def forward(
                 ).contiguous()
             else:
                 context_layer = self.dist_attn(
-                    query_layer, key_layer, value_layer, attention_mask
+                    query_layer, key_layer, value_layer, batch_dim_idx=batch_dim_idx, attention_mask=attention_mask
                 )
         else:
             if self.use_flash_attn:
@@ -1024,7 +1024,8 @@
         # =================
         # Output. [sq, b, h]
         # =================
-
+        if not self.use_flash_attn:  ## for deepspeed>=0.14.5
+            context_layer = context_layer.flatten(start_dim=-2)  ## [s, b, hc, hd] -> [s, b, h]
         output, bias = self.dense(context_layer)
 
         return output, bias
diff --git a/soph_mult_qsub_DP.sh b/soph_mult_qsub_DP.sh
new file mode 100644
index 0000000000..ac7a9449dd
--- /dev/null
+++ b/soph_mult_qsub_DP.sh
@@ -0,0 +1,39 @@
+## Author: Eugene Ku
+## Issue: Oddly, Sophia uses more memory than Polaris for some reason.
+## Extra libraries:
+##   1. Apex is required:
+##      a. Download apex; b. cd apex; python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+##   2. pip install deepspeed --upgrade (0.15.1 was used)
+
+export PBS_O_WORKDIR=$(dirname $0 | xargs realpath)
+cd $PBS_O_WORKDIR
+
+## SCALE CONFIGS
+export TRAIN_ITER=5000
+export SP=1
+MICRO_BATCH=1
+export MICRO_BATCH=$(($SP * $MICRO_BATCH)) ## Batch size for a single model copy
+export DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt
+# export NLAYERS=2
+export NLAYERS=10
+export OPT=adamw
+export SAVE_INTERVAL=100
+export GRAD_ACC_STEPS=1
+export NO_FLASH_ATTN=1
+export SOPHIA=1 ## Sophia
+
+## MODEL ARGUMENTS
+if [ -n "$SOPHIA" ]; then
+    . venvs/2024-08-08/bin/activate
+fi
+
+bash $PBS_O_WORKDIR/train_llama_alcf.sh
+
+
+# QUEUE=by-node
+# qsub -V -A datascience -q $QUEUE -l select=2 -l walltime=16:00:00,filesystems=eagle:home $PBS_O_WORKDIR/train_llama_alcf.sh
+
+# QUEUE=debug-scaling
+# qsub -V -A datascience -q $QUEUE -l select=4 -l walltime=1:00:00,filesystems=eagle:home $PBS_O_WORKDIR/train_llama_alcf.sh
+
+# echo Submitted a job with PBS_DIR at $PBS_O_WORKDIR on $QUEUE queue.
\ No newline at end of file
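
Note on the Sophia launcher branch in helpers.sh above: the `env | grep -v ... | awk -F= '{print "-x", $1}'` pipeline forwards the current environment by turning every NAME=value entry that contains none of the filtered tokens into an OpenMPI-style `-x NAME` flag. A minimal Python sketch of the same filtering logic, assuming a hypothetical helper name `env_forward_flags` that is not part of the patch:

    import os

    # Hypothetical mirror of: env | grep -v -e '"' -e ';' ... | awk -F= '{print "-x", $1}'
    # Drop any NAME=value entry containing a token that would break word-splitting
    # when the flags are re-expanded on the mpiexec command line; forward the rest.
    def env_forward_flags():
        bad = ('"', ';', '*', '-', '__', '}', '|', ')', 'esac', 'else', 'fi')
        flags = []
        for name, value in os.environ.items():
            if any(tok in f"{name}={value}" for tok in bad):
                continue
            flags += ["-x", name]
        return flags

    print(" ".join(env_forward_flags()))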
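
On the transformer.py change: the patch comments suggest that with DeepSpeed >= 0.14.5 the dist_attn path returns an unflattened [sq, b, np, hn] tensor, so the old explicit view is commented out and the merge to [sq, b, h] now happens just before the dense projection. A minimal shape sketch with illustrative sizes (the sizes are assumptions, not from the patch):

    import torch

    sq, b, np_heads, hn = 4, 2, 8, 16                  # sequence, batch, heads, head dim
    context_layer = torch.randn(sq, b, np_heads, hn)   # [sq, b, np, hn]

    # flatten(start_dim=-2) merges the last two dims, matching what the old
    # context_layer.view(*(context_layer.size()[:-2] + (h,))) used to do.
    flat = context_layer.flatten(start_dim=-2)
    assert flat.shape == (sq, b, np_heads * hn)        # [sq, b, h]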