From 38e4d4c1aaec2c1c1d9033a0df500de9e4507599 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 7 May 2024 19:30:04 +0000 Subject: [PATCH] Update as per feedback, this commit gives slurm user permission to run scripts as root --- ansible/roles/slurm/tasks/main.yml | 32 ++++++++++++++++- .../run_setup_network_storage.sh | 3 ++ .../run_setup_nfs_exports.sh | 3 ++ scripts/setup.py | 32 ++++++++++++----- scripts/setup_network_storage.py | 35 ++++++++++++++++--- scripts/slurmsync.py | 29 +++++++++++---- 6 files changed, 113 insertions(+), 21 deletions(-) create mode 100755 scripts/network_storage_wrappers/run_setup_network_storage.sh create mode 100755 scripts/network_storage_wrappers/run_setup_nfs_exports.sh diff --git a/ansible/roles/slurm/tasks/main.yml b/ansible/roles/slurm/tasks/main.yml index 5cfebd5b1..b69bc8bf7 100644 --- a/ansible/roles/slurm/tasks/main.yml +++ b/ansible/roles/slurm/tasks/main.yml @@ -45,6 +45,16 @@ - '{{slurm_paths.state}}' - '{{slurm_paths.run}}' +- name: Mkdir for network storage wrappers + file: + path: '{{item}}' + state: directory + owner: '{{slurm_user.user}}' + group: '{{slurm_user.group}}' + mode: '0755' + loop: + - '{{slurm_paths.scripts}}/network_storage_wrappers' + - name: Include Install Tasks include_tasks: install.yml @@ -67,13 +77,22 @@ - conf.py - resume.py - setup.py - - setup_network_storage.py - startup.sh - slurmsync.py - suspend.py - util.py - load_bq.py +- name: Copy network storage scripts with root owner + copy: + src: scripts/{{item}} + dest: '{{slurm_paths.scripts}}/{{item}}' + owner: 'root' + group: 'root' + mode: 0o755 + with_items: + - setup_network_storage.py + - name: Copy slurm_gcp_plugins copy: src: scripts/slurm_gcp_plugins @@ -83,6 +102,17 @@ mode: '0644' directory_mode: '0755' +- name: Copy bash scripts + copy: + src: scripts/network_storage_wrappers/{{item}} + dest: '{{ slurm_paths.scripts }}/network_storage_wrappers/{{item}}' + owner: 'root' + group: 'root' + mode: '0644' + with_items: + - run_setup_network_storage.sh + - run_setup_nfs_exports.sh + - name: Copy Jobs copy: src: jobs/ diff --git a/scripts/network_storage_wrappers/run_setup_network_storage.sh b/scripts/network_storage_wrappers/run_setup_network_storage.sh new file mode 100755 index 000000000..9005c5fbe --- /dev/null +++ b/scripts/network_storage_wrappers/run_setup_network_storage.sh @@ -0,0 +1,3 @@ +#!/bin/bash +cd /slurm/scripts/ +python3 -m setup_network_storage "setup_network_storage" diff --git a/scripts/network_storage_wrappers/run_setup_nfs_exports.sh b/scripts/network_storage_wrappers/run_setup_nfs_exports.sh new file mode 100755 index 000000000..ff3f5c54e --- /dev/null +++ b/scripts/network_storage_wrappers/run_setup_nfs_exports.sh @@ -0,0 +1,3 @@ +#!/bin/bash +cd /slurm/scripts/ +python3 -m setup_network_storage "setup_nfs_exports" diff --git a/scripts/setup.py b/scripts/setup.py index bc82f5609..4a7d8f9e3 100755 --- a/scripts/setup.py +++ b/scripts/setup.py @@ -47,10 +47,6 @@ login_nodeset, ) import slurmsync -from setup_network_storage import ( - setup_network_storage, - setup_nfs_exports, -) SETUP_SCRIPT = Path(__file__) @@ -287,11 +283,31 @@ def setup_nss_slurm(): run(r"sed -i 's/\(^\(passwd\|group\):\s\+\)/\1slurm /g' /etc/nsswitch.conf") +def call_setup_network_storage(log): + try: + subprocess.run( + ["sudo", "/slurm/scripts/network_storage_wrappers/run_setup_network_storage.sh"] + ) + log.info("network storage mounted successfully") + except Exception as e: + log.error(e) + + +def call_setup_nfs_exports(log): + try: + subprocess.run(["sudo", "/slurm/scripts/network_storage_wrappers/run_setup_nfs_exports.sh"]) + log.info("nfs exported successfully") + except Exception as e: + log.error(e) + + def setup_sudoers(): content = """ # Allow SlurmUser to manage the slurm daemons slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service +slurm ALL=(ALL) NOPASSWD: /slurm/scripts/network_storage_wrappers/run_setup_network_storage.sh +slurm ALL=(ALL) NOPASSWD: /slurm/scripts/network_storage_wrappers/run_setup_nfs_exports.sh """ sudoers_file = Path("/etc/sudoers.d/slurm") sudoers_file.write_text(content) @@ -399,7 +415,7 @@ def setup_controller(args): if cfg.controller_secondary_disk: setup_secondary_disks() - setup_network_storage(log) + call_setup_network_storage(log) run_custom_scripts() @@ -431,7 +447,7 @@ def setup_controller(args): run("systemctl enable nfs-server", timeout=30) run("systemctl start nfs-server", timeout=30) - setup_nfs_exports() + call_setup_nfs_exports(log) run("systemctl enable --now slurmcmd.timer", timeout=30) log.info("Check status of cluster services") @@ -464,7 +480,7 @@ def setup_login(args): update_system_config("slurmd", sysconf) install_custom_scripts() - setup_network_storage(log) + call_setup_network_storage(log) setup_sudoers() run("systemctl restart munge") run("systemctl enable slurmd", timeout=30) @@ -498,7 +514,7 @@ def setup_compute(args): install_custom_scripts() setup_nss_slurm() - setup_network_storage(log) + call_setup_network_storage(log) has_gpu = run("lspci | grep --ignore-case 'NVIDIA' | wc -l", shell=True).returncode if has_gpu: diff --git a/scripts/setup_network_storage.py b/scripts/setup_network_storage.py index 07f5a5b99..c4ff1a628 100755 --- a/scripts/setup_network_storage.py +++ b/scripts/setup_network_storage.py @@ -20,21 +20,31 @@ import time import shutil +import logging from pathlib import Path from concurrent.futures import as_completed from addict import Dict as NSDict from util import ( + cfg, lkp, dirs, separate, run, host_lookup, + chown_slurm, + config_root_logger, load_config_file, backoff_delay, ) +filename = Path(__file__).name +LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") + +log = logging.getLogger(filename) + + def setup_nfs_exports(): """nfs export all needed directories""" # The controller only needs to set up exports for cluster-internal mounts @@ -78,7 +88,7 @@ def setup_nfs_exports(): run("exportfs -a", timeout=30) -def munge_mount_handler(log): +def munge_mount_handler(): cfg = load_config_file(Path(__file__).with_name("config.yaml")) if not cfg.munge_mount: log.error("Missing munge_mount in cfg") @@ -154,7 +164,7 @@ def munge_mount_handler(log): shutil.rmtree(local_mount) -def mount_fstab(mounts, log): +def mount_fstab(mounts): """Wait on each mount, then make sure all fstab is mounted""" from more_executors import Executors, ExceptionRetryPolicy @@ -250,10 +260,11 @@ def resolve_network_storage(nodeset=None): return list(mounts.values()) -def setup_network_storage(log): +def setup_network_storage(): """prepare network fs mounts and add them to fstab""" log.info("Set up network storage") # filter mounts into two dicts, cluster-internal and external mounts + all_mounts = resolve_network_storage() ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts) if lkp.instance_role == "controller": @@ -308,5 +319,19 @@ def setup_network_storage(log): for entry in fstab_entries: f.write(entry) f.write("\n") - mount_fstab(mounts_by_local(mounts), log) - munge_mount_handler(log) + mount_fstab(mounts_by_local(mounts)) + munge_mount_handler() + + +if __name__ == "__main__": + chown_slurm(LOGFILE, mode=0o600) + config_root_logger(filename, logfile=LOGFILE) + + if len(sys.argv) != 2: + log.error("only 2 argument needed..") + else: + function_name = sys.argv[1] + if function_name == "setup_network_storage": + setup_network_storage() + else: + setup_nfs_exports() diff --git a/scripts/slurmsync.py b/scripts/slurmsync.py index ffe0d8432..908ebe139 100755 --- a/scripts/slurmsync.py +++ b/scripts/slurmsync.py @@ -26,6 +26,7 @@ from itertools import chain from pathlib import Path import yaml +import subprocess import util from util import ( @@ -59,10 +60,6 @@ install_cgroup_conf, install_topology_conf, ) -from setup_network_storage import ( - setup_network_storage, - setup_nfs_exports, -) filename = Path(__file__).name LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log") @@ -97,6 +94,24 @@ def start_instance_op(inst, project=None): ) +def call_setup_network_storage(log): + try: + subprocess.run( + ["sudo", "/slurm/scripts/network_storage_wrappers/run_setup_network_storage.sh"] + ) + log.info("network storage mounted successfully") + except Exception as e: + log.error(e) + + +def call_setup_nfs_exports(log): + try: + subprocess.run(["sudo", "/slurm/scripts/network_storage_wrappers/run_setup_nfs_exports.sh"]) + log.info("nfs exported successfully") + except Exception as e: + log.error(e) + + def start_instances(node_list): log.info("{} instances to start ({})".format(len(node_list), ",".join(node_list))) @@ -486,8 +501,8 @@ def reconfigure_slurm(): install_gres_conf(lkp) install_cgroup_conf(lkp) install_topology_conf(lkp) - setup_network_storage(log) - setup_nfs_exports() + call_setup_network_storage(log) + call_setup_nfs_exports(log) log.info("Restarting slurmctld to make changes take effect.") try: run("sudo systemctl restart slurmctld.service", check=False) @@ -497,7 +512,7 @@ def reconfigure_slurm(): util.run(f"wall '{update_msg}'", timeout=30) log.debug("Done.") elif lkp.instance_role_safe in ["compute", "login"]: - setup_network_storage(log) + call_setup_network_storage(log) log.info("Restarting slurmd to make changes take effect.") run("systemctl restart slurmd") util.run(f"wall '{update_msg}'", timeout=30)