diff --git a/ALCF/aws_ofi_nccl_plugin.sh b/ALCF/aws_ofi_nccl_plugin.sh
index ffd1471cd3..fc91a972a6 100644
--- a/ALCF/aws_ofi_nccl_plugin.sh
+++ b/ALCF/aws_ofi_nccl_plugin.sh
@@ -3,8 +3,12 @@
 # AWS NCCL OFI Plugin settings below
 export NCCL_CROSS_NIC=1
 export NCCL_COLLNET_ENABLE=1
-export NCCL_NET="AWS Libfabric"
-export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH
+if [ -n "$SOPHIA" ]; then
+    export NCCL_SOCKET_IFNAME=bond0
+else
+    export NCCL_NET="AWS Libfabric"
+    export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH
+fi
 export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH
 export FI_CXI_DISABLE_HOST_REGISTER=1
 export FI_MR_CACHE_MONITOR=userfaultfd
diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index bc31c322ce..a7c2b60ec6 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -365,7 +365,14 @@ setupLauncher() {
         make_ds_hostfile || exit
         export LAUNCHER="deepspeed --hostfile $hfds --launcher MPICH ${EXEC}"
     else
-        if [[ -n "${DIST_LAUNCH}" ]]; then
+        if [ -n "$SOPHIA" ]; then
+            # LAUNCHER="${DIST_LAUNCH} --pmi=pmix --genvall $(which python3) -Wignore ${EXEC}"
+            # dist_launch_cmd="mpiexec --verbose --envall -n ${num_gpus} -ppn ${num_gpus_per_host} --hostfile ${hostfile} --cpu-bind depth -d ${depth}"
+            LAUNCHER="mpiexec --verbose $(env | grep -v -e '"' -e ';' -e '*' -e '-' -e '__' -e '}' -e '|' -e ')' -e esac -e else -e fi | awk -F= '{print "-x", $1}' | tr '\n' ' ') \
+                -n ${num_gpus} --hostfile ${hostfile} $(which python3) -Wignore ${EXEC}"
+            echo "${LAUNCHER}"
+            export LAUNCHER="${LAUNCHER}"
+        elif [[ -n "${DIST_LAUNCH}" ]]; then
             mn=$(get_machine_name)
             if [[ "${mn}" == "aurora" || "${mn}" == "sunspot" ]]; then
                 LAUNCHER="${DIST_LAUNCH} --pmi=pmix --genvall $(which python3) -Wignore ${EXEC}"
@@ -561,7 +568,7 @@ setParams() {
     ######################################################################
     # +--------[Polaris]-----------------------------------+
     # elif [[ $(hostname) == x3* ]]; then
-    elif [[ "${mn}" == "polaris" || "${mn}" == "sirius" ]]; then
+    elif [[ "${mn}" == "polaris" || "${mn}" == "sirius" || -n "$SOPHIA" ]]; then
         # export LAUNCH_CMD="${LAUNCH_CMD:-deepspeed}"
         TP=${TP:-1}               # TP = 2
         export NCCL=${NCCL:-nccl} # NCCL
@@ -594,7 +601,7 @@ setParams() {
         fi
     fi
     # +----------------------------------------------------------------------+
-    export TP="${TP}"
+    export TP="${TP:-1}"
     export PP="${PP:-1}"
     export SP="${SP:-1}"
     export FLASH_ARG="${FLASH_ARG}"
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 592ff2855b..12041b133a 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -380,10 +380,10 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask):
         context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
 
         # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + (
-            self.hidden_size_per_partition,
-        )
-        context_layer = context_layer.view(*new_context_layer_shape)
+        # new_context_layer_shape = context_layer.size()[:-2] + (
+        #     self.hidden_size_per_partition,
+        # )
+        # context_layer = context_layer.view(*new_context_layer_shape)
 
         return context_layer
 
@@ -987,7 +987,7 @@ def forward(
                 ).contiguous()
             else:
                 context_layer = self.dist_attn(
-                    query_layer, key_layer, value_layer, attention_mask
+                    query_layer, key_layer, value_layer, batch_dim_idx=batch_dim_idx, attention_mask=attention_mask
                 )
         else:
             if self.use_flash_attn:
@@ -1024,7 +1024,8 @@
         # =================
         # Output. [sq, b, h]
         # =================
-
+        if not self.use_flash_attn:  ## for deepspeed>=0.14.5
+            context_layer = context_layer.flatten(start_dim=-2)  ## [s, b, hc, hd] -> [s, b, h]
         output, bias = self.dense(context_layer)
 
         return output, bias
diff --git a/soph_mult_qsub_DP.sh b/soph_mult_qsub_DP.sh
new file mode 100644
index 0000000000..ac7a9449dd
--- /dev/null
+++ b/soph_mult_qsub_DP.sh
@@ -0,0 +1,39 @@
+## Author: Eugene Ku
+## Issue: Oddly, Sophia uses more memory than Polaris for some reason.
+## Extra libraries:
+##   1. Apex is required:
+##      a. Download apex; b. cd apex; python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+##   2. pip install deepspeed --upgrade (0.15.1 was used)
+
+export PBS_O_WORKDIR=$(dirname $0 | xargs realpath)
+cd $PBS_O_WORKDIR
+
+## SCALE CONFIGS
+export TRAIN_ITER=5000
+export SP=1
+MICRO_BATCH=1
+export MICRO_BATCH=$(($SP * $MICRO_BATCH)) ## Batch size for a single model copy
+export DATA_FILE_LIST=./ALCF/data-lists/polaris/books.txt
+# export NLAYERS=2
+export NLAYERS=10
+export OPT=adamw
+export SAVE_INTERVAL=100
+export GRAD_ACC_STEPS=1
+export NO_FLASH_ATTN=1
+export SOPHIA=1 ## Sophia
+
+## MODEL ARGUMENTS
+if [ -n "$SOPHIA" ]; then
+    . venvs/2024-08-08/bin/activate
+fi
+
+bash $PBS_O_WORKDIR/train_llama_alcf.sh
+
+
+# QUEUE=by-node
+# qsub -V -A datascience -q $QUEUE -l select=2 -l walltime=16:00:00,filesystems=eagle:home $PBS_O_WORKDIR/train_llama_alcf.sh
+
+# QUEUE=debug-scaling
+# qsub -V -A datascience -q $QUEUE -l select=4 -l walltime=1:00:00,filesystems=eagle:home $PBS_O_WORKDIR/train_llama_alcf.sh
+
+# echo Submitted a job with PBS_DIR at $PBS_O_WORKDIR on $QUEUE queue.
\ No newline at end of file
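
Note on the Sophia launcher branch in helpers.sh above: the `env | grep -v ... | awk -F= '{print "-x", $1}'` pipeline forwards the current environment by turning every NAME=value entry that contains none of the filtered tokens into an OpenMPI-style `-x NAME` flag. A minimal Python sketch of the same filtering logic, assuming a hypothetical helper name `env_forward_flags` that is not part of the patch:

    import os

    # Hypothetical mirror of: env | grep -v -e '"' -e ';' ... | awk -F= '{print "-x", $1}'
    # Drop any NAME=value entry containing a token that would break word-splitting
    # when the flags are re-expanded on the mpiexec command line; forward the rest.
    def env_forward_flags():
        bad = ('"', ';', '*', '-', '__', '}', '|', ')', 'esac', 'else', 'fi')
        flags = []
        for name, value in os.environ.items():
            if any(tok in f"{name}={value}" for tok in bad):
                continue
            flags += ["-x", name]
        return flags

    print(" ".join(env_forward_flags()))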
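
On the transformer.py change: the patch comments suggest that with DeepSpeed >= 0.14.5 the dist_attn path returns an unflattened [sq, b, np, hn] tensor, so the old explicit view is commented out and the merge to [sq, b, h] now happens just before the dense projection. A minimal shape sketch with illustrative sizes (the sizes are assumptions, not from the patch):

    import torch

    sq, b, np_heads, hn = 4, 2, 8, 16                  # sequence, batch, heads, head dim
    context_layer = torch.randn(sq, b, np_heads, hn)   # [sq, b, np, hn]

    # flatten(start_dim=-2) merges the last two dims, matching what the old
    # context_layer.view(*(context_layer.size()[:-2] + (h,))) used to do.
    flat = context_layer.flatten(start_dim=-2)
    assert flat.shape == (sq, b, np_heads * hn)        # [sq, b, h]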