Skip to content

Commit

Permalink
Fix issue where allocations are scheduled inside of allocations; have spheral_ats check if we are inside an allocation
Browse files Browse the repository at this point in the history
  • Loading branch information
ldowen committed Nov 7, 2024
1 parent d4c9bf7 commit 514f6a6
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 30 deletions.
9 changes: 4 additions & 5 deletions .gitlab/machines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,19 @@
- ruby
- shell
variables:
SCHEDULER_PARAMETERS: "--res=ci --exclusive=user --deadline=now+1hour -N 2 -t 60"
HOSTNAME: 'ruby'
PARTITION: pdebug
BUILD_ALLOC: srun -N 1 -c 36 -p pdebug -t 60
CLEAN_ALLOC: srun -n 20
extends: [.on_toss_4_x86]

.on_lassen:
tags:
- lassen
- shell
variables:
SCHEDULER_PARAMETERS: "-nnodes 1 -W 60 -q pci -alloc_flags atsdisable"
LSB_JOB_STARTER: [ "ENVIRONMENT=BATCH" ]
ENVIRONMENT: "BATCH"
HOSTNAME: 'lassen'
BUILD_ALLOC: lalloc 1 -W 60
CLEAN_ALLOC: lalloc 1 lrun -n 20
LC_MODULES: "cuda/11.1.0"
extends: [.on_blueos_3_ppc64]

Expand Down
20 changes: 10 additions & 10 deletions .gitlab/scripts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
- cd $CI_BUILD_DIR
- echo $SPEC

- $BUILD_ALLOC ./$SCRIPT_DIR/gitlab/build_and_install.py --spec="$SPEC" --tpls-only
- ./$SCRIPT_DIR/gitlab/build_and_install.py --spec="$SPEC" --tpls-only
artifacts:
paths:
- ci-dir.txt
Expand All @@ -27,7 +27,7 @@
script:
- CI_BUILD_DIR=$(cat ci-dir.txt)
- cd $CI_BUILD_DIR && cat job-name.txt
- $BUILD_ALLOC ./$SCRIPT_DIR/devtools/host-config-build.py --host-config gitlab.cmake --build $EXTRA_CMAKE_ARGS
- ./$SCRIPT_DIR/devtools/host-config-build.py --host-config gitlab.cmake --build $EXTRA_CMAKE_ARGS
artifacts:
paths:
- ci-dir.txt
Expand Down Expand Up @@ -55,7 +55,7 @@
- CI_BUILD_DIR=$(cat ci-dir.txt)
- cd $CI_BUILD_DIR && cat job-name.txt

- ./build_gitlab/install/spheral-ats ./build_gitlab/install/$ATS_FILE || exit_code=$?
- ./build_gitlab/install/spheral-ats --ciRun ./build_gitlab/install/$ATS_FILE || exit_code=$?
- cp -r test-logs $CI_PROJECT_DIR
- exit $exit_code
artifacts:
Expand All @@ -76,7 +76,7 @@

- ml load mpifileutils
- cd $SPHERAL_BUILDS_DIR
- $CLEAN_ALLOC drm $CI_BUILD_DIR/..
- drm $CI_BUILD_DIR/..

# ------------------------------------------------------------------------------
# Shared TPL scripts.
Expand All @@ -87,15 +87,15 @@
variables:
GIT_STRATEGY: none
script:
- $BUILD_ALLOC ./$SCRIPT_DIR/devtools/tpl-manager.py --spec-list="$SCRIPT_DIR/devtools/spec-list.json" --spheral-spack-dir=$UPSTREAM_DIR
- ./$SCRIPT_DIR/devtools/tpl-manager.py --spec-list="$SCRIPT_DIR/devtools/spec-list.json" --spheral-spack-dir=$UPSTREAM_DIR

.toss_update_permissions:
stage: update_permissions
variables:
GIT_STRATEGY: none
script:
- ml load mpifileutils
- srun -N 1 -p $PARTITION -n 20 -t 10 dchmod --mode go+rx $UPSTREAM_DIR
- dchmod --mode go+rx $UPSTREAM_DIR

# ------------------------------------------------------------------------------
# Production Installation scripts
Expand All @@ -121,7 +121,7 @@
- INSTALL_DIR=/usr/gapps/Spheral/$SYS_TYPE/spheral-$SPHERAL_REV_STR
- DEV_PKG_NAME=$SYS_TYPE-spheral-dev-pkg-$SPHERAL_REV_STR

- env SPHERAL_REV_STR=$SPHERAL_REV_STR INSTALL_DIR=$INSTALL_DIR SPEC=$SPEC SPACK_PKG_NAME=$SPACK_PKG_NAME BUILD_ALLOC="$BUILD_ALLOC" SCRIPT_DIR=$SCRIPT_DIR
- env SPHERAL_REV_STR=$SPHERAL_REV_STR INSTALL_DIR=$INSTALL_DIR SPEC=$SPEC SPACK_PKG_NAME=$SPACK_PKG_NAME SCRIPT_DIR=$SCRIPT_DIR
bash ./$SCRIPT_DIR/lc/generate-buildcache.sh

- echo $INSTALL_DIR &> install-dir.txt
Expand All @@ -145,7 +145,7 @@
- tar -xzf $DEV_PKG_NAME.tar.gz
- cd $DEV_PKG_NAME

- env INSTALL_DIR=$INSTALL_DIR SPEC=$SPEC SPACK_PKG_NAME=$SPACK_PKG_NAME BUILD_ALLOC="$BUILD_ALLOC" SCRIPT_DIR=$SCRIPT_DIR
- env INSTALL_DIR=$INSTALL_DIR SPEC=$SPEC SPACK_PKG_NAME=$SPACK_PKG_NAME SCRIPT_DIR=$SCRIPT_DIR
bash ./$SCRIPT_DIR/lc/install-from-dev-pkg.sh

artifacts:
Expand All @@ -164,7 +164,7 @@
- chmod go+r /usr/gapps/Spheral/modulefiles/Spheral/"$ALIAS".lua

- ml load mpifileutils
- srun -N 1 -p $PARTITION -n 20 -t 10 dchmod --mode go+rx $INSTALL_DIR
- dchmod --mode go+rx $INSTALL_DIR
- ln -sfn $INSTALL_DIR /usr/gapps/Spheral/$SYS_TYPE/$ALIAS


Expand All @@ -187,7 +187,7 @@
- echo $DIR_LIST

- ml load mpifileutils
- if [[ $DIR_LIST ]]; then $CLEAN_ALLOC drm $DIR_LIST; else echo "No directories to remove at this time."; fi
- if [[ $DIR_LIST ]]; then drm $DIR_LIST; else echo "No directories to remove at this time."; fi
when: always

.merge_pr_rule:
Expand Down
3 changes: 1 addition & 2 deletions scripts/atstest.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#!/usr/bin/env bash

# Running with exec bash -l prevents blueOS from printing stty error messages
exec bash -l @CMAKE_INSTALL_PREFIX@/spheral @CMAKE_INSTALL_PREFIX@/scripts/spheral_ats.py "$@"
@CMAKE_INSTALL_PREFIX@/spheral @CMAKE_INSTALL_PREFIX@/scripts/spheral_ats.py "$@"
8 changes: 3 additions & 5 deletions scripts/lc/install-from-dev-pkg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ trap 'echo "# $BASH_COMMAND"' DEBUG

SPACK_PKG_NAME=${SPACK_PKG_NAME:-'spheral'}
SPACK_URL=${SPACK_URL:-'https://github.com/spack/spack'}
BUILD_ALLOC=${BUILD_ALLOC}
SCRIPT_DIR=${SCRIPT_DIR:-'scripts'}

if [[ -z "${SPEC}" ]]; then
Expand All @@ -20,7 +19,6 @@ echo $SPEC
echo $SPACK_URL
echo $INSTALL_DIR
echo $SCRIPT_DIR
echo $BUILD_ALLOC

rm -rf $INSTALL_DIR
mkdir -p $INSTALL_DIR
Expand All @@ -36,12 +34,12 @@ spack mirror add --unsigned spheral-mirror $PWD/resources/mirror
spack mirror add --unsigned spheral-cache $PWD/resources
spack buildcache update-index $PWD/resources/mirror

$BUILD_ALLOC spack install --fresh --deprecated --no-check-signature --only dependencies $SPACK_PKG_NAME@develop%$SPEC
spack install --fresh --deprecated --no-check-signature --only dependencies $SPACK_PKG_NAME@develop%$SPEC

$BUILD_ALLOC ./$SCRIPT_DIR/devtools/tpl-manager.py --spack-url $SPACK_URL --no-upstream --spheral-spack-dir $INSTALL_DIR/spheral-spack-tpls --spec $SPEC
./$SCRIPT_DIR/devtools/tpl-manager.py --spack-url $SPACK_URL --no-upstream --spheral-spack-dir $INSTALL_DIR/spheral-spack-tpls --spec $SPEC

HOST_CONFIG_FILE=$(ls -t | grep -E "*\.cmake" | head -1)
$BUILD_ALLOC ./$SCRIPT_DIR/devtools/host-config-build.py --host-config $HOST_CONFIG_FILE -i $INSTALL_DIR --build --no-clean
./$SCRIPT_DIR/devtools/host-config-build.py --host-config $HOST_CONFIG_FILE -i $INSTALL_DIR --build --no-clean



Expand Down
31 changes: 23 additions & 8 deletions scripts/spheral_ats.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ def main():
parser.add_argument("--timeLimit", type=int,
default=None,
help="Time limit for allocation.")
parser.add_argument("--ciRun", action="store_true",
help="Option to only be used by the CI")
parser.add_argument("--perfTest", action="store_true",
help="Turn on if doing a performance test.")
parser.add_argument("--atsHelp", action="store_true",
Expand All @@ -141,21 +143,28 @@ def main():
ats_args = install_ats_args()
numNodes = options.numNodes
timeLimit = options.timeLimit
ciRun = False if options.perfTest else True
launch_cmd = ""
blueOS = False
# These are environment variables to suggest we are in an allocation already
# NOTE: CI runs should already be in an allocation so the launch cmd is
# unused in those cases
inAllocVars = []

if hostname:
mac_args = []
if any(x in hostname for x in toss_machine_names):
numNodes = numNodes if numNodes else 2
timeLimit = timeLimit if timeLimit else 120
mac_args = [f"--numNodes {numNodes}"]
inAllocVars = ["SLURM_JOB_NUM_NODES", "SLURM_NNODES"]
launch_cmd = f"salloc --exclusive -N {numNodes} -t {timeLimit} "
if (options.ciRun):
launch_cmd += "-p pdebug "
elif any(x in hostname for x in blueos_machine_names):
blueOS = True
numNodes = numNodes if numNodes else 1
timeLimit = timeLimit if timeLimit else 60
inAllocVars = ["LSB_MAX_NUM_PROCESSORS"]
mac_args = ["--smpi_off", f"--numNodes {numNodes}"]
launch_cmd = f"bsub -nnodes {numNodes} -Is -XF -W {timeLimit} -core_isolation 2 "
ats_args.extend(mac_args)
Expand All @@ -164,7 +173,7 @@ def main():
# Launch ATS
#---------------------------------------------------------------------------
# If doing a CI run, set some more options
if (ciRun):
if (not options.perfTest):
if ("--logs" not in unknown_options):
ats_args.append(f"--logs {test_log_name}")
log_name = test_log_name
Expand All @@ -176,14 +185,20 @@ def main():
ats_args = " ".join(str(x) for x in ats_args)
other_args = " ".join(str(x) for x in unknown_options)
cmd = f"{ats_exe} -e {spheral_exe} {ats_args} {other_args}"
if blueOS:
# Launches using Bsub have issues with '<' being in the command,
# so the entire run statement must be in quotes
run_command = f"{launch_cmd} '{cmd}'"
# Check if we are already in an allocation
inAlloc = any(e in list(os.environ.keys()) for e in inAllocVars)
# If already in allocation, do not do a launch
if inAlloc:
run_command = cmd
else:
run_command = f"{launch_cmd}{cmd}"
if blueOS:
# Launches using Bsub have issues with '<' being in the command,
# so the entire run statement must be in quotes
run_command = f"{launch_cmd} '{cmd}'"
else:
run_command = f"{launch_cmd}{cmd}"
print(f"\nRunning: {run_command}\n")
if (ciRun):
if (options.ciRun):
run_and_report(run_command, log_name, 0)
else:
try:
Expand Down

0 comments on commit 514f6a6

Please sign in to comment.