From 24b5d84159f94b81e708c55eda64c811a8ab70f8 Mon Sep 17 00:00:00 2001
From: Harsh Thakkar
Date: Tue, 7 May 2024 19:30:04 +0000
Subject: [PATCH] Update as per feedback, this commit gives slurm user permission to run scripts as root

---
Review notes (free text between the "---" line and the first "diff --git"
is not part of the commit when the patch is applied with `git am`):
  * Line breaks were restored; indentation of context lines is reconstructed.
  * bash_scripts/ is now root-owned so the slurm user cannot replace the
    scripts that sudoers lets it run as root. NOTE(review): this assumes the
    parent scripts directory and setup_network_storage.py are protected the
    same way -- confirm.
  * Both helper scripts abort if `cd` fails instead of running python3 from
    an arbitrary working directory.
  * call_setup_network_storage()/call_setup_nfs_exports() pass check=True so
    a failing helper raises instead of being silently ignored by callers.
  * The new sudoers entries no longer grant (ALL) run-as; root only, like the
    existing systemctl entries.
  * Every fix replaces added lines one-for-one, so hunk headers and the
    diffstat below are unchanged; the `index` blob ids predate these fixes.

 ansible/roles/slurm/tasks/main.yml            | 21 +++++++
 .../bash_scripts/run_setup_network_storage.sh |  3 +
 scripts/bash_scripts/run_setup_nfs_exports.sh |  3 +
 scripts/setup.py                              | 28 +++++++---
 scripts/setup_network_storage.py              | 56 ++++++++++---------
 scripts/slurmsync.py                          | 25 ++++++---
 6 files changed, 94 insertions(+), 42 deletions(-)
 create mode 100755 scripts/bash_scripts/run_setup_network_storage.sh
 create mode 100755 scripts/bash_scripts/run_setup_nfs_exports.sh

diff --git a/ansible/roles/slurm/tasks/main.yml b/ansible/roles/slurm/tasks/main.yml
index 5cfebd5b1..c616111d9 100644
--- a/ansible/roles/slurm/tasks/main.yml
+++ b/ansible/roles/slurm/tasks/main.yml
@@ -45,6 +45,16 @@
     - '{{slurm_paths.state}}'
     - '{{slurm_paths.run}}'
 
+- name: Mkdir for slurm bash scripts
+  file:
+    path: '{{item}}'
+    state: directory
+    owner: root  # must not be writable by slurm: holds scripts it may sudo
+    group: root
+    mode: '0755'
+  loop:
+    - '{{slurm_paths.scripts}}/bash_scripts'
+
 - name: Include Install Tasks
   include_tasks: install.yml
 
@@ -83,6 +93,17 @@
     mode: '0644'
     directory_mode: '0755'
 
+- name: Copy bash scripts
+  copy:
+    src: scripts/bash_scripts/{{item}}
+    dest: '{{ slurm_paths.scripts }}/bash_scripts/{{item}}'
+    owner: "root"
+    group: "root"
+    mode: '0700'
+  with_items:
+    - run_setup_network_storage.sh
+    - run_setup_nfs_exports.sh
+
 - name: Copy Jobs
   copy:
     src: jobs/
diff --git a/scripts/bash_scripts/run_setup_network_storage.sh b/scripts/bash_scripts/run_setup_network_storage.sh
new file mode 100755
index 000000000..4e8219aed
--- /dev/null
+++ b/scripts/bash_scripts/run_setup_network_storage.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd /slurm/scripts/ || exit 1
+python3 -c 'import setup_network_storage; setup_network_storage.setup_network_storage()'
diff --git a/scripts/bash_scripts/run_setup_nfs_exports.sh b/scripts/bash_scripts/run_setup_nfs_exports.sh
new file mode 100755
index 000000000..22284ecc3
--- /dev/null
+++ b/scripts/bash_scripts/run_setup_nfs_exports.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+cd /slurm/scripts/ || exit 1
+python3 -c 'import setup_network_storage; setup_network_storage.setup_nfs_exports()'
diff --git a/scripts/setup.py b/scripts/setup.py
index bc82f5609..975f5e087 100755
--- a/scripts/setup.py
+++ b/scripts/setup.py
@@ -47,10 +47,6 @@
     login_nodeset,
 )
 import slurmsync
-from setup_network_storage import (
-    setup_network_storage,
-    setup_nfs_exports,
-)
 
 SETUP_SCRIPT = Path(__file__)
 
@@ -287,11 +283,27 @@ def setup_nss_slurm():
     run(r"sed -i 's/\(^\(passwd\|group\):\s\+\)/\1slurm /g' /etc/nsswitch.conf")
 
 
+def call_setup_network_storage():
+    """Run the network-storage helper as root; raise CalledProcessError on failure."""
+    cmd = ["sudo", "/slurm/scripts/bash_scripts/run_setup_network_storage.sh"]
+    result = subprocess.run(cmd, check=True)
+    return result.returncode
+
+
+def call_setup_nfs_exports():
+    """Run the NFS-exports helper as root; raise CalledProcessError on failure."""
+    cmd = ["sudo", "/slurm/scripts/bash_scripts/run_setup_nfs_exports.sh"]
+    result = subprocess.run(cmd, check=True)
+    return result.returncode
+
+
 def setup_sudoers():
     content = """
 # Allow SlurmUser to manage the slurm daemons
 slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service
 slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service
+slurm ALL= NOPASSWD: /slurm/scripts/bash_scripts/run_setup_network_storage.sh
+slurm ALL= NOPASSWD: /slurm/scripts/bash_scripts/run_setup_nfs_exports.sh
 """
     sudoers_file = Path("/etc/sudoers.d/slurm")
     sudoers_file.write_text(content)
@@ -399,7 +411,7 @@ def setup_controller(args):
 
     if cfg.controller_secondary_disk:
         setup_secondary_disks()
-    setup_network_storage(log)
+    call_setup_network_storage()
 
     run_custom_scripts()
 
@@ -431,7 +443,7 @@ def setup_controller(args):
     run("systemctl enable nfs-server", timeout=30)
     run("systemctl start nfs-server", timeout=30)
 
-    setup_nfs_exports()
+    call_setup_nfs_exports()
     run("systemctl enable --now slurmcmd.timer", timeout=30)
 
     log.info("Check status of cluster services")
@@ -464,7 +476,7 @@ def setup_login(args):
     update_system_config("slurmd", sysconf)
     install_custom_scripts()
 
-    setup_network_storage(log)
+    call_setup_network_storage()
     setup_sudoers()
     run("systemctl restart munge")
     run("systemctl enable slurmd", timeout=30)
@@ -498,7 +510,7 @@ def setup_compute(args):
     install_custom_scripts()
 
     setup_nss_slurm()
-    setup_network_storage(log)
+    call_setup_network_storage()
 
     has_gpu = run("lspci | grep --ignore-case 'NVIDIA' | wc -l", shell=True).returncode
     if has_gpu:
diff --git a/scripts/setup_network_storage.py b/scripts/setup_network_storage.py
index 07f5a5b99..f86ffcd53 100755
--- a/scripts/setup_network_storage.py
+++ b/scripts/setup_network_storage.py
@@ -78,11 +78,11 @@ def setup_nfs_exports():
     run("exportfs -a", timeout=30)
 
 
-def munge_mount_handler(log):
+def munge_mount_handler():
     cfg = load_config_file(Path(__file__).with_name("config.yaml"))
-    if not cfg.munge_mount:
-        log.error("Missing munge_mount in cfg")
-    elif lkp.instance_role == "controller":
+    # if not cfg.munge_mount:
+    #     # log.error("Missing munge_mount in cfg")
+    if lkp.instance_role == "controller":
         return
 
     mount = cfg.munge_mount
@@ -102,7 +102,7 @@ def munge_mount_handler(log):
 
     munge_key = Path(dirs.munge / "munge.key")
 
-    log.info(f"Mounting munge share to: {local_mount}")
+    # log.info(f"Mounting munge share to: {local_mount}")
     local_mount.mkdir()
     if fs_type.lower() == "gcsfuse".lower():
         if remote_mount is None:
@@ -130,23 +130,23 @@ def munge_mount_handler(log):
             run(cmd, timeout=timeout)
             break
         except Exception as e:
-            log.error(
-                f"munge mount failed: '{cmd}' {e}, try {retry}, waiting {wait:0.2f}s"
-            )
+            # log.error(
+            #     f"munge mount failed: '{cmd}' {e}, try {retry}, waiting {wait:0.2f}s"
+            # )
             time.sleep(wait)
             err = e
             continue
     else:
         raise err
 
-    log.info(f"Copy munge.key from: {local_mount}")
+    # log.info(f"Copy munge.key from: {local_mount}")
     shutil.copy2(Path(local_mount / "munge.key"), munge_key)
 
-    log.info("Restrict permissions of munge.key")
+    # log.info("Restrict permissions of munge.key")
     shutil.chown(munge_key, user="munge", group="munge")
     os.chmod(munge_key, stat.S_IRUSR)
 
-    log.info(f"Unmount {local_mount}")
+    # log.info(f"Unmount {local_mount}")
     if fs_type.lower() == "gcsfuse".lower():
         run(f"fusermount -u {local_mount}", timeout=120)
     else:
@@ -154,19 +154,19 @@ def munge_mount_handler(log):
     shutil.rmtree(local_mount)
 
 
-def mount_fstab(mounts, log):
+def mount_fstab(mounts):
     """Wait on each mount, then make sure all fstab is mounted"""
     from more_executors import Executors, ExceptionRetryPolicy
 
     def mount_path(path):
-        log.info(f"Waiting for '{path}' to be mounted...")
+        # log.info(f"Waiting for '{path}' to be mounted...")
         try:
             run(f"mount {path}", timeout=120)
         except Exception as e:
             exc_type, _, _ = sys.exc_info()
-            log.error(f"mount of path '{path}' failed: {exc_type}: {e}")
+            # log.error(f"mount of path '{path}' failed: {exc_type}: {e}")
             raise e
-        log.info(f"Mount point '{path}' was mounted.")
+        # log.info(f"Mount point '{path}' was mounted.")
 
     MAX_MOUNT_TIMEOUT = 60 * 5
     future_list = []
@@ -250,10 +250,11 @@ def resolve_network_storage(nodeset=None):
     return list(mounts.values())
 
 
-def setup_network_storage(log):
+def setup_network_storage():
     """prepare network fs mounts and add them to fstab"""
-    log.info("Set up network storage")
+    # log.info("Set up network storage")
     # filter mounts into two dicts, cluster-internal and external mounts
+    print("INFO: set up network storage called")
     all_mounts = resolve_network_storage()
     ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts)
     if lkp.instance_role == "controller":
@@ -269,14 +270,14 @@ def setup_network_storage(log):
         server_ip = mount.server_ip or ""
         local_mount.mkdirp()
 
-        log.info(
-            "Setting up mount ({}) {}{} to {}".format(
-                fs_type,
-                server_ip + ":" if fs_type != "gcsfuse" else "",
-                remote_mount,
-                local_mount,
-            )
-        )
+        # log.info(
+        #     "Setting up mount ({}) {}{} to {}".format(
+        #         fs_type,
+        #         server_ip + ":" if fs_type != "gcsfuse" else "",
+        #         remote_mount,
+        #         local_mount,
+        #     )
+        # )
 
         mount_options = mount.mount_options.split(",") if mount.mount_options else []
         if not mount_options or "_netdev" not in mount_options:
@@ -308,5 +309,6 @@ def setup_network_storage(log):
         for entry in fstab_entries:
             f.write(entry)
             f.write("\n")
-    mount_fstab(mounts_by_local(mounts), log)
-    munge_mount_handler(log)
+    mount_fstab(mounts_by_local(mounts))
+    munge_mount_handler()
+    print("INFO: set up network storage finished")
diff --git a/scripts/slurmsync.py b/scripts/slurmsync.py
index ffe0d8432..b1addaf4a 100755
--- a/scripts/slurmsync.py
+++ b/scripts/slurmsync.py
@@ -26,6 +26,7 @@
 from itertools import chain
 from pathlib import Path
 import yaml
+import subprocess
 
 import util
 from util import (
@@ -59,10 +60,6 @@
     install_cgroup_conf,
     install_topology_conf,
 )
-from setup_network_storage import (
-    setup_network_storage,
-    setup_nfs_exports,
-)
 
 filename = Path(__file__).name
 LOGFILE = (Path(cfg.slurm_log_dir if cfg else ".") / filename).with_suffix(".log")
@@ -97,6 +94,20 @@ def start_instance_op(inst, project=None):
     )
 
 
+def call_setup_network_storage():
+    """Run the network-storage helper as root; raise CalledProcessError on failure."""
+    cmd = ["sudo", "/slurm/scripts/bash_scripts/run_setup_network_storage.sh"]
+    result = subprocess.run(cmd, check=True)
+    return result.returncode
+
+
+def call_setup_nfs_exports():
+    """Run the NFS-exports helper as root; raise CalledProcessError on failure."""
+    cmd = ["sudo", "/slurm/scripts/bash_scripts/run_setup_nfs_exports.sh"]
+    result = subprocess.run(cmd, check=True)
+    return result.returncode
+
+
 def start_instances(node_list):
     log.info("{} instances to start ({})".format(len(node_list), ",".join(node_list)))
 
@@ -486,8 +497,8 @@ def reconfigure_slurm():
             install_gres_conf(lkp)
             install_cgroup_conf(lkp)
             install_topology_conf(lkp)
-            setup_network_storage(log)
-            setup_nfs_exports()
+            call_setup_network_storage()
+            call_setup_nfs_exports()
             log.info("Restarting slurmctld to make changes take effect.")
             try:
                 run("sudo systemctl restart slurmctld.service", check=False)
@@ -497,7 +508,7 @@ def reconfigure_slurm():
             util.run(f"wall '{update_msg}'", timeout=30)
             log.debug("Done.")
         elif lkp.instance_role_safe in ["compute", "login"]:
-            setup_network_storage(log)
+            call_setup_network_storage()
             log.info("Restarting slurmd to make changes take effect.")
             run("systemctl restart slurmd")
             util.run(f"wall '{update_msg}'", timeout=30)