Skip to content

Commit

Permalink
Update as per feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
harshthakkar01 committed May 15, 2024
1 parent 07f2b24 commit 6bede70
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 36 deletions.
2 changes: 2 additions & 0 deletions ansible/roles/slurm/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@
- setup_network_storage.py
- startup.sh
- slurmsync.py
- network_wrapper.sh
- network_storage.yml
- suspend.py
- util.py
- load_bq.py
Expand Down
21 changes: 21 additions & 0 deletions scripts/network_storage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Playbook that makes sure a single line is present in a file.
# Expected extra vars (passed with -e):
#   path: file to update; it is created when it does not exist
#   data: line that must be present in the file

- name: Add data to a file
  hosts: localhost
  # No task below reads host facts, so skip the (slow) fact-gathering step.
  gather_facts: false
  tasks:
    - name: Show content to update
      ansible.builtin.debug:
        msg:
          - "Data: {{ data }}"
          - "Path: {{ path }}"

    - name: Create file if it doesn't exist
      ansible.builtin.file:
        path: "{{ path }}"
        state: touch
        # Preserve timestamps of an existing file so repeated runs are
        # idempotent ("ok") instead of always reporting "changed".
        modification_time: preserve
        access_time: preserve

    - name: Ensure data entry is present (lineinfile)
      ansible.builtin.lineinfile:
        path: "{{ path }}"
        line: "{{ data }}"
        state: present
68 changes: 68 additions & 0 deletions scripts/network_wrapper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/bin/bash
# Copyright (C) SchedMD LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Ensure a line is present in a file by running the network_storage.yml
# Ansible playbook.
# Arguments:
#   $1 - path of the file to update
#   $2 - line that must be present in that file
# Returns:
#   0 on success, 1 when an argument is missing, otherwise the exit code of
#   ansible-playbook.
update_file() {
    local path="$1"
    local data="$2"

    if [[ -z "$path" || -z "$data" ]]; then
        echo "Error: update_file requires two arguments: path and data" >&2
        return 1
    fi

    # Escape backslashes first, then double quotes, so both values remain
    # valid JSON string literals whatever the caller passes in.
    local esc_path=${path//\\/\\\\}
    esc_path=${esc_path//\"/\\\"}
    local esc_data=${data//\\/\\\\}
    esc_data=${esc_data//\"/\\\"}

    # Create JSON data
    local json_data="{\"path\": \"${esc_path}\", \"data\": \"${esc_data}\"}"

    # Resolve the playbook relative to this script rather than the caller's
    # working directory, so the function works from any cwd (e.g. under sudo).
    local script_dir
    script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

    # Run Ansible playbook and use network_storage.yml.
    ansible-playbook "${script_dir}/network_storage.yml" -e "$json_data"
    local exit_status=$?
    if [[ $exit_status -ne 0 ]]; then
        echo "ansible playbook failed with exit code $exit_status" >&2
        return "$exit_status"
    fi

    echo "ansible updated file successfully"
}

# Run an arbitrary command and report a failure.
# Arguments:
#   $1   - command to execute
#   $2.. - arguments passed to the command unchanged
# Returns:
#   0 on success, 1 when no command is given, otherwise the command's exit code.
#
# NOTE(review): if this script is granted NOPASSWD in sudoers, run_cmd lets
# that user execute any command as root -- consider restricting it to an
# allow-list of commands.
run_cmd() {
    local command="$1"

    if [[ -z "$command" ]]; then
        echo "Error: run_cmd requires at least one argument (the command to run)" >&2
        return 1
    fi
    shift # Remove the command itself from the arguments

    # "$*" joins the arguments into one word; "$@" inside a string would
    # expand to several words (ShellCheck SC2145).
    echo "Running command: $command $*"
    "$command" "$@" # Execute the command with remaining arguments
    local exit_status=$?
    if [[ $exit_status -ne 0 ]]; then
        echo "Command failed with exit code $exit_status" >&2
        return "$exit_status" # Propagate error
    fi
}

# Entry point: the first argument selects the helper to run and the remaining
# arguments are forwarded to it. The script's exit status is the helper's
# return code; an unknown or missing sub-command prints usage and exits 1 so
# callers can detect the misuse.
case "$1" in
    update_file)
        shift
        update_file "$@"
        ;;
    run_cmd)
        shift
        run_cmd "$@"
        ;;
    *)
        echo "Usage: $0 {update_file|run_cmd} [arguments...]" >&2
        echo " update_file: file_name line_to_add" >&2
        echo " run_cmd: command [arg1] [arg2] ..." >&2
        exit 1
        ;;
esac
7 changes: 4 additions & 3 deletions scripts/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,9 @@ def setup_nss_slurm():
def setup_sudoers():
content = """
# Allow SlurmUser to manage the slurm daemons
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmd.service
slurm ALL= NOPASSWD: /usr/bin/systemctl restart slurmctld.service
slurm ALL=(root:root) NOPASSWD: /usr/bin/systemctl restart slurmd.service
slurm ALL=(root:root) NOPASSWD: /usr/bin/systemctl restart slurmctld.service
slurm ALL=(root:root) NOPASSWD: /slurm/scripts/network_wrapper.sh
"""
sudoers_file = Path("/etc/sudoers.d/slurm")
sudoers_file.write_text(content)
Expand Down Expand Up @@ -431,7 +432,7 @@ def setup_controller(args):
run("systemctl enable nfs-server", timeout=30)
run("systemctl start nfs-server", timeout=30)

setup_nfs_exports()
setup_nfs_exports(log)
run("systemctl enable --now slurmcmd.timer", timeout=30)

log.info("Check status of cluster services")
Expand Down
123 changes: 91 additions & 32 deletions scripts/setup_network_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,9 @@
# limitations under the License.

import os
import sys
import stat
import time
import subprocess

import shutil
from pathlib import Path
from concurrent.futures import as_completed
from addict import Dict as NSDict
Expand All @@ -28,14 +26,70 @@
lkp,
dirs,
separate,
run,
host_lookup,
load_config_file,
backoff_delay,
)


def setup_nfs_exports():
def call_update_file(path, data, log=None):
    """Ensure a line is present in a file via ``network_wrapper.sh update_file``.

    The wrapper script is executed directly when the current process is root
    and through ``sudo -n`` otherwise (``-n`` makes sudo fail instead of
    prompting for a password). The command is passed as an argument list, so
    ``path`` and ``data`` reach the wrapper verbatim even when they contain
    quotes or other shell metacharacters.

    Failures are logged and not raised; the function always returns ``None``.

    Args:
        path: The path to the file to update.
        data: The line to add to the file.
        log: Logger-like object providing ``info`` and ``error``. When omitted,
            a module-level ``logging`` logger is used.
    """
    import logging

    if log is None:
        log = logging.getLogger(__name__)

    cmd = ["/slurm/scripts/network_wrapper.sh", "update_file", str(path), str(data)]
    if os.geteuid() != 0:  # Not root: the wrapper has to be run through sudo.
        cmd = ["sudo", "-n", *cmd]

    try:
        # check=True raises CalledProcessError on a non-zero exit status.
        subprocess.run(cmd, check=True, text=True)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        log.error(f"An error occurred: {e}")
    else:
        log.info("File updated successfully via Ansible.")


def call_run_cmd(command, *args, timeout=None, log=None):
    """Execute a command through ``network_wrapper.sh run_cmd``.

    The wrapper script is executed directly when the current process is root
    and through ``sudo -n`` otherwise (``-n`` makes sudo fail instead of
    prompting for a password).

    ``command`` may contain several words (e.g. ``"mkdir -p"``); it is split
    with shell-like rules. Each entry of ``args`` is converted with ``str`` and
    passed as exactly one argument, without shell interpretation, so
    ``pathlib.Path`` objects and values containing ``#``, backslashes or
    spaces reach the command unchanged.

    Failures (non-zero exit status, timeout, missing executable) are logged
    and not raised; the function always returns ``None``.

    Args:
        command: The command to run, optionally including leading options.
        *args: Additional arguments for the command.
        timeout: Seconds to wait for the command, or ``None`` for no limit.
        log: Logger-like object providing ``info`` and ``error``. When omitted,
            a module-level ``logging`` logger is used.
    """
    import logging
    import shlex

    if log is None:
        log = logging.getLogger(__name__)

    cmd = [
        "/slurm/scripts/network_wrapper.sh",
        "run_cmd",
        *shlex.split(str(command)),
        *(str(arg) for arg in args),
    ]
    if os.geteuid() != 0:  # Not root: the wrapper has to be run through sudo.
        cmd = ["sudo", "-n", *cmd]

    try:
        # check=True raises CalledProcessError on a non-zero exit status;
        # the timeout applies whether or not sudo is involved.
        subprocess.run(cmd, check=True, timeout=timeout, text=True)
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        log.error(f"An error occurred: {e}")
    else:
        log.info(f"Command '{command}' executed successfully.")


def setup_nfs_exports(log):
"""nfs export all needed directories"""
# The controller only needs to set up exports for cluster-internal mounts
# switch the key to remote mount path since that is what needs exporting
Expand Down Expand Up @@ -66,16 +120,17 @@ def setup_nfs_exports():
# export path if corresponding selector boolean is True
exports = []
for path in con_mounts:
Path(path).mkdirp()
run(rf"sed -i '\#{path}#d' /etc/exports", timeout=30)
call_run_cmd("mkdir -p", path, log=log)
call_run_cmd("sed", "-i", rf"\#{path}#d", "/etc/exports", timeout=30, log=log)
exports.append(f"{path} *(rw,no_subtree_check,no_root_squash)")

exportsd = Path("/etc/exports.d")
exportsd.mkdirp()
with (exportsd / "slurm.exports").open("w") as f:
f.write("\n")
f.write("\n".join(exports))
run("exportfs -a", timeout=30)
call_run_cmd("mkdir -p", str(exportsd), log=log)

for export in exports:
call_update_file("/etc/exports.d/slurm.exports", export, log)

call_run_cmd("exportfs -a", timeout=30, log=log)


def munge_mount_handler(log):
Expand Down Expand Up @@ -103,7 +158,7 @@ def munge_mount_handler(log):
munge_key = Path(dirs.munge / "munge.key")

log.info(f"Mounting munge share to: {local_mount}")
local_mount.mkdir()
call_run_cmd("mkdir -p", str(local_mount), log=log)
if fs_type.lower() == "gcsfuse".lower():
if remote_mount is None:
remote_mount = ""
Expand All @@ -127,7 +182,7 @@ def munge_mount_handler(log):
timeout = 120
for retry, wait in enumerate(backoff_delay(0.5, timeout), 1):
try:
run(cmd, timeout=timeout)
call_run_cmd(cmd, timeout=timeout, log=log)
break
except Exception as e:
log.error(
Expand All @@ -140,18 +195,20 @@ def munge_mount_handler(log):
raise err

log.info(f"Copy munge.key from: {local_mount}")
shutil.copy2(Path(local_mount / "munge.key"), munge_key)
call_run_cmd(
"cp", "-r", str(Path(local_mount) / "munge.key"), str(munge_key), log=log
)

log.info("Restrict permissions of munge.key")
shutil.chown(munge_key, user="munge", group="munge")
os.chmod(munge_key, stat.S_IRUSR)
call_run_cmd("chown -r munge:munge", munge_key, log=log)
call_run_cmd("chmod", "0400", munge_key, log=log)

log.info(f"Unmount {local_mount}")
if fs_type.lower() == "gcsfuse".lower():
run(f"fusermount -u {local_mount}", timeout=120)
call_run_cmd("fusermount -u", local_mount, timeout=120, log=log)
else:
run(f"umount {local_mount}", timeout=120)
shutil.rmtree(local_mount)
call_run_cmd("umount", local_mount, timeout=120, log=log)
call_run_cmd("rm -rf", local_mount, log=log)


def mount_fstab(mounts, log):
Expand All @@ -161,11 +218,10 @@ def mount_fstab(mounts, log):
def mount_path(path):
log.info(f"Waiting for '{path}' to be mounted...")
try:
run(f"mount {path}", timeout=120)
call_run_cmd("mount", path, timeout=120, log=log)
except Exception as e:
exc_type, _, _ = sys.exc_info()
log.error(f"mount of path '{path}' failed: {exc_type}: {e}")
raise e
log.error(f"mount of path '{path}' failed: {e}")
return
log.info(f"Mount point '{path}' was mounted.")

MAX_MOUNT_TIMEOUT = 60 * 5
Expand Down Expand Up @@ -253,21 +309,23 @@ def resolve_network_storage(nodeset=None):
def setup_network_storage(log):
"""prepare network fs mounts and add them to fstab"""
log.info("Set up network storage")

# filter mounts into two dicts, cluster-internal and external mounts
all_mounts = resolve_network_storage()
ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts)
if lkp.instance_role == "controller":
mounts = ext_mounts
else:
mounts = ext_mounts + int_mounts

# Determine fstab entries and write them out
fstab_entries = []
for mount in mounts:
local_mount = Path(mount.local_mount)
remote_mount = mount.remote_mount
fs_type = mount.fs_type
server_ip = mount.server_ip or ""
local_mount.mkdirp()
call_run_cmd("mkdir -p", str(local_mount), log=log)

log.info(
"Setting up mount ({}) {}{} to {}".format(
Expand Down Expand Up @@ -299,14 +357,15 @@ def setup_network_storage(log):
)
)

# Copy fstab to fstab.bak and use backup as clean copy to re-evaluate mounts.
fstab = Path("/etc/fstab")
if not Path(fstab.with_suffix(".bak")).is_file():
shutil.copy2(fstab, fstab.with_suffix(".bak"))
shutil.copy2(fstab.with_suffix(".bak"), fstab)
with open(fstab, "a") as f:
f.write("\n")
for entry in fstab_entries:
f.write(entry)
f.write("\n")
call_run_cmd("cp -p", str(fstab), str(fstab.with_suffix(".bak")), log=log)
call_run_cmd("cp -p", str(fstab.with_suffix(".bak")), str(fstab), log=log)

# Update fstab.
for entry in fstab_entries:
call_update_file("/etc/fstab", entry, log)

mount_fstab(mounts_by_local(mounts), log)
munge_mount_handler(log)
2 changes: 1 addition & 1 deletion scripts/slurmsync.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ def reconfigure_slurm():
install_cgroup_conf(lkp)
install_topology_conf(lkp)
setup_network_storage(log)
setup_nfs_exports()
setup_nfs_exports(log)
log.info("Restarting slurmctld to make changes take effect.")
try:
run("sudo systemctl restart slurmctld.service", check=False)
Expand Down

0 comments on commit 6bede70

Please sign in to comment.