| /*****************************************************************************\ |
| * job_container_tmpfs.c - Define job container plugin for creating a |
| * temporary mount namespace for the job, to provide |
| * quota based access to node local memory. |
| ***************************************************************************** |
| * Copyright (C) 2019-2021 Regents of the University of California |
| * Produced at Lawrence Berkeley National Laboratory |
| * Written by Aditi Gaur <agaur@lbl.gov> |
| * All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * SLURM is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with SLURM; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| #include <errno.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <sys/wait.h> |
| #include <sys/mman.h> |
| #include <fcntl.h> |
| #include <sys/mount.h> |
| #include <semaphore.h> |
| |
| #include "src/common/slurm_xlator.h" |
| |
| #include "src/common/env.h" |
| #include "src/common/fd.h" |
| #include "src/common/log.h" |
| #include "src/common/read_config.h" |
| #include "src/common/run_command.h" |
| #include "src/common/stepd_api.h" |
| #include "src/common/uid.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| #include "src/interfaces/switch.h" |
| #include "src/slurmd/slurmstepd/slurmstepd_job.h" |
| |
| #include "read_jcconf.h" |
| |
| static int _create_ns(uint32_t job_id, stepd_step_rec_t *step); |
| static int _delete_ns(uint32_t job_id); |
| |
| #if defined (__APPLE__) |
| extern slurmd_conf_t *conf __attribute__((weak_import)); |
| #else |
| slurmd_conf_t *conf = NULL; |
| #endif |
| |
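| /* Required plugin identification symbols used by the Slurm plugin loader. */ |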
| const char plugin_name[] = "job_container tmpfs plugin"; |
| const char plugin_type[] = "job_container/tmpfs"; |
| const uint32_t plugin_version = SLURM_VERSION_NUMBER; |
| |
| static slurm_jc_conf_t *jc_conf = NULL; |
| static int step_ns_fd = -1; |
| static bool plugin_disabled = false; |
| |
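| /* |
|  * Build the per-job paths under the configured BasePath: the job mount |
|  * directory, the ".ns" file used to pin the namespace, and the ".<jobid>" |
|  * directory used as the source of the private bind mounts. |
|  */ |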
| static void _create_paths(uint32_t job_id, char **job_mount, char **ns_holder, |
| char **src_bind) |
| { |
| xassert(job_mount); |
| |
| xstrfmtcat(*job_mount, "%s/%u", jc_conf->basepath, job_id); |
| |
| if (ns_holder) |
| xstrfmtcat(*ns_holder, "%s/.ns", *job_mount); |
| |
| if (src_bind) |
| xstrfmtcat(*src_bind, "%s/.%u", *job_mount, job_id); |
| } |
| |
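| /* list_find_first() callback: match a cached stepd entry by job id. */ |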
| static int _find_step_in_list(step_loc_t *stepd, uint32_t *job_id) |
| { |
| return (stepd->step_id.job_id == *job_id); |
| } |
| |
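| /* The plugin is disabled when BasePath is unset or set to "none". */ |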
| static bool _is_plugin_disabled(char *basepath) |
| { |
| return ((!basepath) || (!xstrncasecmp(basepath, "none", 4))); |
| } |
| |
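| /* |
|  * Restore or clean up a single BasePath entry: names that are not numeric |
|  * job ids are ignored, and namespaces whose job no longer has a running |
|  * stepd are deleted. |
|  */ |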
| static int _restore_ns(list_t *steps, const char *d_name) |
| { |
| char *endptr; |
| int fd; |
| unsigned long job_id; |
| step_loc_t *stepd; |
| |
| errno = 0; |
| job_id = strtoul(d_name, &endptr, 10); |
| if ((errno != 0) || (job_id >= NO_VAL) || (*endptr != '\0')) { |
| debug3("ignoring %s, could not convert to jobid.", d_name); |
| return SLURM_SUCCESS; |
| } |
| |
| 	/* At this point the directory name looks like a job container. */ |
| log_flag(JOB_CONT, "determine if job %lu is still running", job_id); |
| stepd = list_find_first(steps, (ListFindF)_find_step_in_list, &job_id); |
| if (!stepd) { |
| debug("%s: Job %lu not found, deleting the namespace", |
| __func__, job_id); |
| return _delete_ns(job_id); |
| } |
| |
| fd = stepd_connect(stepd->directory, stepd->nodename, |
| &stepd->step_id, &stepd->protocol_version); |
| if (fd == -1) { |
| error("%s: failed to connect to stepd for %lu.", |
| __func__, job_id); |
| return _delete_ns(job_id); |
| } |
| |
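| 	/* The job's stepd is still running; leave the namespace in place. */ |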
| close(fd); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int init(void) |
| { |
| if (running_in_slurmd()) { |
| /* |
| * Only init the config here for the slurmd. It will be sent by |
| * the slurmd to the slurmstepd at launch time. |
| */ |
| if (!(jc_conf = init_slurm_jc_conf())) { |
| 			error("%s: Configuration not read correctly: does '%s' not exist?", |
| plugin_type, tmpfs_conf_file); |
| return SLURM_ERROR; |
| } |
| plugin_disabled = _is_plugin_disabled(jc_conf->basepath); |
| debug("job_container.conf read successfully"); |
| } |
| |
| debug("%s loaded", plugin_name); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern void fini(void) |
| { |
| debug("%s unloaded", plugin_name); |
| |
| if (step_ns_fd != -1) { |
| close(step_ns_fd); |
| step_ns_fd = -1; |
| } |
| |
| #ifdef MEMORY_LEAK_DEBUG |
| free_jc_conf(); |
| #endif |
| } |
| |
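| /* |
|  * Called at slurmd startup: create BasePath if AutoBasePath is set, then |
|  * walk it and restore or clean up every job directory found there. |
|  */ |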
| extern int container_p_restore(char *dir_name, bool recover) |
| { |
| DIR *dp; |
| struct dirent *ep; |
| list_t *steps; |
| int rc = SLURM_SUCCESS; |
| |
| if (plugin_disabled) |
| return SLURM_SUCCESS; |
| |
| if (jc_conf->auto_basepath) { |
| int fstatus; |
| mode_t omask = umask(S_IWGRP | S_IWOTH); |
| |
| if (jc_conf->basepath[0] != '/') { |
| debug("%s: unable to create ns directory '%s' : does not start with '/'", |
| __func__, jc_conf->basepath); |
| umask(omask); |
| return SLURM_ERROR; |
| } |
| |
| if ((fstatus = mkdirpath(jc_conf->basepath, 0755, true))) { |
| debug("%s: unable to create ns directory '%s' : %s", |
| __func__, jc_conf->basepath, |
| slurm_strerror(fstatus)); |
| umask(omask); |
| return SLURM_ERROR; |
| } |
| |
| umask(omask); |
| } |
| |
| steps = stepd_available(conf->spooldir, conf->node_name); |
| |
| 	/* |
| 	 * Iterate over basepath and restore only the folders that appear |
| 	 * bound to real jobs (i.e. have a .ns file). NOTE: Restoring the |
| 	 * state means either deleting the folder if the job has died and its |
| 	 * resources are free, or mounting it otherwise. |
| 	 */ |
| 	if (!(dp = opendir(jc_conf->basepath))) { |
| 		error("%s: Unable to open %s", __func__, jc_conf->basepath); |
| 		FREE_NULL_LIST(steps); |
| 		return SLURM_ERROR; |
| 	} |
| |
| while ((ep = readdir(dp))) { |
| /* If possible, only check directories */ |
| if ((ep->d_type == DT_DIR) || (ep->d_type == DT_UNKNOWN)) { |
| if (_restore_ns(steps, ep->d_name)) |
| rc = SLURM_ERROR; |
| } |
| } |
| closedir(dp); |
| FREE_NULL_LIST(steps); |
| |
| if (rc) |
| error("Encountered an error while restoring job containers."); |
| |
| return rc; |
| } |
| |
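| /* |
|  * For each directory listed in jc_conf->dirs (except /dev/shm, handled |
|  * separately), create a per-job private directory under "path", chown it |
|  * to the job user and bind mount it over the original location. |
|  */ |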
| static int _mount_private_dirs(char *path, uid_t uid) |
| { |
| char *buffer = NULL, *mount_path = NULL, *save_ptr = NULL, *token; |
| int rc = 0; |
| |
| if (!path) { |
| error("%s: no path to private directories specified.", |
| __func__); |
| return -1; |
| } |
| buffer = xstrdup(jc_conf->dirs); |
| token = strtok_r(buffer, ",", &save_ptr); |
| while (token) { |
| /* skip /dev/shm, this is handled elsewhere */ |
| if (!xstrcmp(token, "/dev/shm")) { |
| token = strtok_r(NULL, ",", &save_ptr); |
| continue; |
| } |
| xstrfmtcat(mount_path, "%s/%s", path, token); |
| for (char *t = mount_path + strlen(path) + 1; *t; t++) { |
| if (*t == '/') |
| *t = '_'; |
| } |
| rc = mkdir(mount_path, 0700); |
| if (rc && errno != EEXIST) { |
| error("%s: Failed to create %s, %m", |
| __func__, mount_path); |
| goto private_mounts_exit; |
| } |
| rc = lchown(mount_path, uid, -1); |
| if (rc) { |
| error("%s: lchown failed for %s: %m", |
| __func__, mount_path); |
| goto private_mounts_exit; |
| } |
| if (mount(mount_path, token, NULL, MS_BIND, NULL)) { |
| error("%s: %s mount failed, %m", __func__, token); |
| rc = -1; |
| goto private_mounts_exit; |
| } |
| token = strtok_r(NULL, ",", &save_ptr); |
| xfree(mount_path); |
| } |
| |
| private_mounts_exit: |
| xfree(buffer); |
| xfree(mount_path); |
| return rc; |
| } |
| |
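| /* |
|  * If /dev/shm is listed in jc_conf->dirs, mount a fresh tmpfs over it, |
|  * unmounting the old one first unless the namespace is shared. |
|  */ |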
| static int _mount_private_shm(void) |
| { |
| char *loc = NULL; |
| int rc = 0; |
| |
| /* return early if "/dev/shm" is not in the mount list */ |
| if (!(loc = xstrcasestr(jc_conf->dirs, "/dev/shm"))) |
| return rc; |
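| 	/* make sure the match is a whole entry, not a prefix of a longer path */ |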
| if (!((loc[8] == ',') || (loc[8] == 0))) |
| return rc; |
| |
| /* handle mounting a new /dev/shm */ |
| if (!jc_conf->shared) { |
| /* |
| * only unmount old /dev/shm if private, otherwise this can |
| * impact the root namespace |
| */ |
| rc = umount("/dev/shm"); |
| if (rc && errno != EINVAL) { |
| error("%s: umount /dev/shm failed: %m", __func__); |
| return rc; |
| } |
| } |
| rc = mount("tmpfs", "/dev/shm", "tmpfs", 0, NULL); |
| if (rc) { |
| error("%s: /dev/shm mount failed: %m", __func__); |
| return -1; |
| } |
| return rc; |
| } |
| |
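| /* |
|  * Detach every mount visible under BasePath inside the new namespace so |
|  * the basepath mounts are not visible to the job. |
|  */ |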
| static int _clean_job_basepath(uint32_t job_id) |
| { |
| DIR *dp; |
| struct dirent *ep; |
| char *path = NULL; |
| |
| if (!(dp = opendir(jc_conf->basepath))) { |
| error("%s: Unable to open %s", __func__, jc_conf->basepath); |
| return SLURM_ERROR; |
| } |
| |
| while ((ep = readdir(dp))) { |
| if (!xstrcmp(ep->d_name, ".") || !xstrcmp(ep->d_name, "..")) |
| continue; |
| /* If possible, only attempt with directories */ |
| if ((ep->d_type == DT_DIR) || (ep->d_type == DT_UNKNOWN)) { |
| xstrfmtcat(path, "%s/%s", |
| jc_conf->basepath, ep->d_name); |
| /* it is not important if this fails */ |
| if (umount2(path, MNT_DETACH)) |
| log_flag(JOB_CONT, "failed to unmount %s for job %u", |
| path, job_id); |
| xfree(path); |
| } |
| } |
| closedir(dp); |
| |
| return SLURM_SUCCESS; |
| } |
| |
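| /* |
|  * Build the environment handed to the InitScript, CloneNSScript and |
|  * CloneNSEpilog helpers: job id, Slurm configuration, node name and, when |
|  * available, the step identity and mount point locations. |
|  */ |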
| static char **_setup_script_env(uint32_t job_id, |
| stepd_step_rec_t *step, |
| char *src_bind, |
| char *ns_holder) |
| { |
| char **env = env_array_create(); |
| |
| env_array_overwrite_fmt(&env, "SLURM_JOB_ID", "%u", job_id); |
| env_array_overwrite_fmt(&env, "SLURM_CONF", "%s", conf->conffile); |
| env_array_overwrite_fmt(&env, "SLURMD_NODENAME", "%s", conf->node_name); |
| if (src_bind) |
| env_array_overwrite_fmt(&env, "SLURM_JOB_MOUNTPOINT_SRC", "%s", |
| src_bind); |
| if (step) { |
| if (step->het_job_id && (step->het_job_id != NO_VAL)) |
| env_array_overwrite_fmt(&env, "SLURM_HET_JOB_ID", "%u", |
| step->het_job_id); |
| env_array_overwrite_fmt(&env, "SLURM_JOB_GID", "%u", |
| step->gid); |
| env_array_overwrite_fmt(&env, "SLURM_JOB_UID", "%u", |
| step->uid); |
| env_array_overwrite_fmt(&env, "SLURM_JOB_USER", "%s", |
| step->user_name); |
| if (step->alias_list) |
| env_array_overwrite_fmt(&env, "SLURM_NODE_ALIASES", |
| "%s", step->alias_list); |
| if (step->cwd) |
| env_array_overwrite_fmt(&env, "SLURM_JOB_WORK_DIR", |
| "%s", step->cwd); |
| } |
| |
| if (ns_holder) |
| env_array_overwrite_fmt(&env, "SLURM_NS", "%s", ns_holder); |
| |
| return env; |
| } |
| |
| static int _create_ns(uint32_t job_id, stepd_step_rec_t *step) |
| { |
| char *job_mount = NULL, *ns_holder = NULL, *src_bind = NULL; |
| char *result = NULL; |
| int fd; |
| int rc = 0; |
| sem_t *sem1 = NULL; |
| sem_t *sem2 = NULL; |
| pid_t cpid; |
| |
| _create_paths(job_id, &job_mount, &ns_holder, &src_bind); |
| |
| if (mkdir(job_mount, 0700)) { |
| error("%s: mkdir %s failed: %m", __func__, job_mount); |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| |
| 	/* |
| 	 * The MS_BIND mount flag makes mount() ignore all other flags except |
| 	 * MS_REC. To make the mount (and all mounts inside it) private, we |
| 	 * need to call mount() a second time with the MS_PRIVATE and MS_REC |
| 	 * flags. |
| 	 */ |
| if (mount(job_mount, job_mount, NULL, MS_BIND, NULL)) { |
| error("%s: Initial base mount failed: %m", __func__); |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| if (mount(job_mount, job_mount, NULL, MS_PRIVATE | MS_REC, NULL)) { |
| 		error("%s: Failed to make base mount private: %m", __func__); |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| |
| fd = open(ns_holder, O_CREAT|O_RDWR, S_IRWXU); |
| if (fd == -1) { |
| error("%s: open failed %s: %m", __func__, ns_holder); |
| rc = -1; |
| goto exit2; |
| } |
| close(fd); |
| |
| 	/* Run the initialization script, if one is configured. */ |
| if (jc_conf->initscript) { |
| run_command_args_t run_command_args = { |
| .max_wait = 10 * MSEC_IN_SEC, |
| .script_path = jc_conf->initscript, |
| .script_type = "initscript", |
| .status = &rc, |
| }; |
| run_command_args.env = _setup_script_env(job_id, step, |
| src_bind, NULL); |
| |
| log_flag(JOB_CONT, "Running InitScript"); |
| result = run_command(&run_command_args); |
| log_flag(JOB_CONT, "InitScript rc: %d, stdout: %s", rc, result); |
| env_array_free(run_command_args.env); |
| xfree(result); |
| |
| if (rc) { |
| error("%s: InitScript: %s failed with rc: %d", |
| __func__, jc_conf->initscript, rc); |
| goto exit2; |
| } |
| } |
| |
| rc = mkdir(src_bind, 0700); |
| if (rc && (errno != EEXIST)) { |
| error("%s: mkdir failed %s, %m", __func__, src_bind); |
| goto exit2; |
| } |
| |
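| 	/* |
| 	 * Two anonymous shared semaphores synchronize parent and child: |
| 	 * sem1 - posted by the child once it has called unshare(CLONE_NEWNS), |
| 	 * sem2 - posted by the parent once /proc/<pid>/ns/mnt is bind mounted |
| 	 *        onto the ns_holder file. |
| 	 */ |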
| sem1 = mmap(NULL, sizeof(*sem1), PROT_READ|PROT_WRITE, |
| MAP_SHARED|MAP_ANONYMOUS, -1, 0); |
| if (sem1 == MAP_FAILED) { |
| error("%s: mmap failed: %m", __func__); |
| rc = -1; |
| goto exit2; |
| } |
| |
| sem2 = mmap(NULL, sizeof(*sem2), PROT_READ|PROT_WRITE, |
| MAP_SHARED|MAP_ANONYMOUS, -1, 0); |
| if (sem2 == MAP_FAILED) { |
| error("%s: mmap failed: %m", __func__); |
| sem_destroy(sem1); |
| munmap(sem1, sizeof(*sem1)); |
| rc = -1; |
| goto exit2; |
| } |
| |
| rc = sem_init(sem1, 1, 0); |
| if (rc) { |
| error("%s: sem_init: %m", __func__); |
| goto exit1; |
| } |
| rc = sem_init(sem2, 1, 0); |
| if (rc) { |
| error("%s: sem_init: %m", __func__); |
| goto exit1; |
| } |
| |
| cpid = fork(); |
| |
| if (cpid == -1) { |
| 		error("%s: fork failed: %m", __func__); |
| rc = -1; |
| goto exit1; |
| } |
| |
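| 	/* |
| 	 * Child: unshare the mount namespace and build the job's private |
| 	 * mounts inside it, then exit. The namespace stays alive because the |
| 	 * parent bind mounts its /proc/<pid>/ns/mnt file. |
| 	 */ |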
| if (cpid == 0) { |
| rc = unshare(CLONE_NEWNS); |
| if (rc) { |
| error("%s: %m", __func__); |
| goto child_exit; |
| } |
| if (sem_post(sem1) < 0) { |
| error("%s: sem_post failed: %m", __func__); |
| rc = -1; |
| goto child_exit; |
| } |
| if (sem_wait(sem2) < 0) { |
| error("%s: sem_wait failed %m", __func__); |
| rc = -1; |
| goto child_exit; |
| } |
| if (!jc_conf->shared) { |
| /* Set root filesystem to private */ |
| if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)) { |
| error("%s: Failed to make root private: %m", |
| __func__); |
| rc = -1; |
| goto child_exit; |
| } |
| } else { |
| /* Set root filesystem to shared */ |
| if (mount(NULL, "/", NULL, MS_SHARED | MS_REC, NULL)) { |
| error("%s: Failed to make root shared: %m", |
| __func__); |
| rc = -1; |
| goto child_exit; |
| } |
| /* Set root filesystem to slave */ |
| if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { |
| error("%s: Failed to make root slave: %m", |
| __func__); |
| rc = -1; |
| goto child_exit; |
| } |
| } |
| |
| /* |
| * Now we have a persistent mount namespace. |
| * Mount private directories inside the namespace. |
| */ |
| if (_mount_private_dirs(src_bind, step->uid) == -1) { |
| rc = -1; |
| goto child_exit; |
| } |
| |
| 		/* |
| 		 * If the slurmd is being restarted this directory already |
| 		 * exists and its ownership should already be correct here. |
| 		 */ |
| rc = chown(src_bind, step->uid, -1); |
| if (rc) { |
| error("%s: chown failed for %s: %m", |
| __func__, src_bind); |
| rc = -1; |
| goto child_exit; |
| } |
| |
| /* |
| * switch/nvidia_imex needs to create an ephemeral device |
| * node under /dev in this new namespace. |
| */ |
| if ((rc = switch_g_fs_init(step))) { |
| error("%s: switch_g_fs_init failed", __func__); |
| rc = -1; |
| goto child_exit; |
| } |
| |
| /* |
| * This umount is to remove the basepath mount from being |
| * visible inside the namespace. So if a user looks up the |
| * mounts inside the job, they will only see their job mount |
| * but not the basepath mount. |
| */ |
| if (jc_conf->shared) |
| rc = _clean_job_basepath(job_id); |
| else |
| rc = umount2(job_mount, MNT_DETACH); |
| if (rc) { |
| error("%s: failed to clean job mount(s): %m", __func__); |
| goto child_exit; |
| } |
| child_exit: |
| sem_destroy(sem1); |
| munmap(sem1, sizeof(*sem1)); |
| sem_destroy(sem2); |
| munmap(sem2, sizeof(*sem2)); |
| |
| if (!rc) { |
| rc = _mount_private_shm(); |
| if (rc) |
| error("%s: could not mount private shm", |
| __func__); |
| } |
| exit(rc); |
| } else { |
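| 		/* |
| 		 * Parent: wait for the child's unshare(), pin the child's |
| 		 * mount namespace, then let the child continue and wait for |
| 		 * it to finish. |
| 		 */ |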
| int wstatus; |
| char *proc_path = NULL; |
| |
| if (sem_wait(sem1) < 0) { |
| 			error("%s: sem_wait failed: %m", __func__); |
| rc = -1; |
| goto exit1; |
| } |
| |
| xstrfmtcat(proc_path, "/proc/%u/ns/mnt", cpid); |
| |
| /* |
| * Bind mount /proc/pid/ns/mnt to hold namespace active |
| * without a process attached to it |
| */ |
| rc = mount(proc_path, ns_holder, NULL, MS_BIND, NULL); |
| xfree(proc_path); |
| if (rc) { |
| error("%s: ns base mount failed: %m", __func__); |
| if (sem_post(sem2) < 0) |
| error("%s: Could not release semaphore: %m", |
| __func__); |
| goto exit1; |
| } |
| if (sem_post(sem2) < 0) { |
| error("%s: sem_post failed: %m", __func__); |
| goto exit1; |
| } |
| |
| if ((waitpid(cpid, &wstatus, 0) != cpid) || WEXITSTATUS(wstatus)) { |
| 			error("%s: waitpid failed or child exited with error", |
| 			      __func__); |
| rc = SLURM_ERROR; |
| goto exit1; |
| } |
| |
| rc = 0; |
| } |
| |
| /* run any post clone initialization script */ |
| if (jc_conf->clonensscript) { |
| run_command_args_t run_command_args = { |
| .max_wait = jc_conf->clonensscript_wait * MSEC_IN_SEC, |
| .script_path = jc_conf->clonensscript, |
| .script_type = "clonensscript", |
| .status = &rc, |
| }; |
| run_command_args.env = _setup_script_env(job_id, step, |
| src_bind, ns_holder); |
| |
| log_flag(JOB_CONT, "Running CloneNSScript"); |
| result = run_command(&run_command_args); |
| log_flag(JOB_CONT, "CloneNSScript rc: %d, stdout: %s", |
| rc, result); |
| xfree(result); |
| env_array_free(run_command_args.env); |
| |
| if (rc) { |
| error("%s: CloneNSScript %s failed with rc=%d", |
| __func__, jc_conf->clonensscript, rc); |
| goto exit2; |
| } |
| } |
| |
| exit1: |
| sem_destroy(sem1); |
| munmap(sem1, sizeof(*sem1)); |
| sem_destroy(sem2); |
| munmap(sem2, sizeof(*sem2)); |
| |
| exit2: |
| if (rc) { |
| int failures; |
| /* cleanup the job mount */ |
| if ((failures = rmdir_recursive(job_mount, false))) { |
| error("%s: failed to remove %d files from %s", |
| __func__, failures, job_mount); |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| if (umount2(job_mount, MNT_DETACH)) |
| error("%s: umount2 %s failed: %m", |
| __func__, job_mount); |
| if (rmdir(job_mount)) |
| error("rmdir %s failed: %m", job_mount); |
| } |
| |
| end_it: |
| xfree(job_mount); |
| xfree(src_bind); |
| xfree(ns_holder); |
| |
| return rc; |
| } |
| |
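| /* |
|  * Open (and cache) a file descriptor for the job's namespace file; the fd |
|  * is returned to the caller, or -1 on failure. |
|  */ |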
| extern int container_p_join_external(uint32_t job_id) |
| { |
| char *job_mount = NULL, *ns_holder = NULL; |
| |
| if (plugin_disabled) |
| return SLURM_SUCCESS; |
| |
| _create_paths(job_id, &job_mount, &ns_holder, NULL); |
| |
| if (step_ns_fd == -1) { |
| step_ns_fd = open(ns_holder, O_RDONLY); |
| if (step_ns_fd == -1) |
| error("%s: %m", __func__); |
| } |
| |
| xfree(job_mount); |
| xfree(ns_holder); |
| |
| return step_ns_fd; |
| } |
| |
| extern int container_p_join(slurm_step_id_t *step_id, uid_t uid, |
| bool step_create) |
| { |
| char *job_mount = NULL, *ns_holder = NULL; |
| int fd; |
| int rc = SLURM_SUCCESS; |
| |
| if (plugin_disabled) |
| return SLURM_SUCCESS; |
| |
| /* |
| * Handle EntireStepInNS setting. If set, the join needs to happen |
| * during the fork+exec chain that creates the slurmstepd process, and |
| * all successive calls within slurmstepd need to be skipped. If not |
| * set, do the opposite. |
| */ |
| if ((!jc_conf->entire_step_in_ns && running_in_slurmd() && |
| step_create) || |
| (jc_conf->entire_step_in_ns && running_in_slurmstepd() && |
| step_id->step_id != SLURM_EXTERN_CONT)) |
| return SLURM_SUCCESS; |
| |
| 	/* |
| 	 * Job id 0 means this is not a real job but a script being run |
| 	 * instead; we do not need to handle this request. |
| 	 */ |
| if (step_id->job_id == 0) |
| return SLURM_SUCCESS; |
| |
| _create_paths(step_id->job_id, &job_mount, &ns_holder, NULL); |
| |
| 	/* This is called on the slurmd so we can't use the cached step_ns_fd. */ |
| fd = open(ns_holder, O_RDONLY); |
| if (fd == -1) { |
| error("%s: open failed for %s: %m", __func__, ns_holder); |
| xfree(job_mount); |
| xfree(ns_holder); |
| return SLURM_ERROR; |
| } |
| |
| rc = setns(fd, CLONE_NEWNS); |
| if (rc) { |
| error("%s: setns failed for %s: %m", __func__, ns_holder); |
| /* closed after error() */ |
| close(fd); |
| xfree(job_mount); |
| xfree(ns_holder); |
| return SLURM_ERROR; |
| } else { |
| log_flag(JOB_CONT, "job %u entered namespace", step_id->job_id); |
| } |
| |
| close(fd); |
| xfree(job_mount); |
| xfree(ns_holder); |
| |
| return SLURM_SUCCESS; |
| } |
| |
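| /* |
|  * Tear down a job's container: run the CloneNSEpilog script if configured, |
|  * detach the pinned namespace and remove the job directory under BasePath. |
|  */ |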
| static int _delete_ns(uint32_t job_id) |
| { |
| char *job_mount = NULL, *ns_holder = NULL; |
| int rc = 0, failures = 0; |
| char *result = NULL; |
| |
| _create_paths(job_id, &job_mount, &ns_holder, NULL); |
| |
| 	/* |
| 	 * Run the post-clone epilog script, if any. Its environment includes |
| 	 * the job id and the namespace file. |
| 	 */ |
| if (jc_conf->clonensepilog) { |
| run_command_args_t run_command_args = { |
| .max_wait = jc_conf->clonensepilog_wait * MSEC_IN_SEC, |
| .script_path = jc_conf->clonensepilog, |
| .script_type = "clonensepilog", |
| .status = &rc, |
| }; |
| run_command_args.env = _setup_script_env(job_id, NULL, |
| NULL, ns_holder); |
| log_flag(JOB_CONT, "Running CloneNSEpilog"); |
| result = run_command(&run_command_args); |
| env_array_free(run_command_args.env); |
| log_flag(JOB_CONT, "CloneNSEpilog rc: %d, stdout: %s", |
| rc, result); |
| xfree(result); |
| |
| if (rc) { |
| error("%s: CloneNSEpilog script %s failed with rc=%d", |
| __func__, jc_conf->clonensepilog, rc); |
| } |
| } |
| |
| errno = 0; |
| |
| /* |
| * Close the step_ns_fd if it was opened. If close fails here, it |
| * should be safe to continue since ns_holder is lazy unmounted later |
| * and will get cleaned up when the slurmstepd process ends. |
| */ |
| if (step_ns_fd != -1) { |
| if (close(step_ns_fd)) |
| log_flag(JOB_CONT, "job %u close step_ns_fd(%d) failed: %m", |
| job_id, step_ns_fd); |
| else |
| step_ns_fd = -1; |
| } |
| |
| /* |
| * umount2() sets errno to EINVAL if the target is not a mount point |
| * but also if called with invalid flags. Consider this if changing the |
| * flags to umount2(). |
| */ |
| rc = umount2(ns_holder, MNT_DETACH); |
| if (rc) { |
| if ((errno == EINVAL) || (errno == ENOENT)) { |
| log_flag(JOB_CONT, "%s: umount2 %s failed: %m", |
| __func__, ns_holder); |
| } else { |
| error("%s: umount2 %s failed: %m", |
| __func__, ns_holder); |
| xfree(job_mount); |
| xfree(ns_holder); |
| return SLURM_ERROR; |
| } |
| } |
| |
| if ((failures = rmdir_recursive(job_mount, false))) |
| error("%s: failed to remove %d files from %s", |
| __func__, failures, job_mount); |
| if (umount2(job_mount, MNT_DETACH)) |
| log_flag(JOB_CONT, "umount2: %s failed: %m", job_mount); |
| if (rmdir(job_mount)) |
| error("rmdir %s failed: %m", job_mount); |
| |
| xfree(job_mount); |
| xfree(ns_holder); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int container_p_stepd_create(uint32_t job_id, stepd_step_rec_t *step) |
| { |
| if (plugin_disabled) |
| return SLURM_SUCCESS; |
| |
| return _create_ns(job_id, step); |
| } |
| |
| extern int container_p_stepd_delete(uint32_t job_id) |
| { |
| if (plugin_disabled) |
| return SLURM_SUCCESS; |
| |
| return _delete_ns(job_id); |
| } |
| |
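| /* |
|  * Pass the parsed job_container configuration from the slurmd to the |
|  * slurmstepd over the supplied file descriptor as a length-prefixed |
|  * packed buffer. |
|  */ |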
| extern int container_p_send_stepd(int fd) |
| { |
| int len; |
| buf_t *buf; |
| |
| buf = get_slurm_jc_conf_buf(); |
| |
| 	/* The config should have been initialized by now */ |
| xassert(buf); |
| |
| len = get_buf_offset(buf); |
| safe_write(fd, &len, sizeof(len)); |
| safe_write(fd, get_buf_data(buf), len); |
| |
| return SLURM_SUCCESS; |
| rwfail: |
| error("%s: failed", __func__); |
| return SLURM_ERROR; |
| } |
| |
| extern int container_p_recv_stepd(int fd) |
| { |
| int len; |
| buf_t *buf; |
| |
| safe_read(fd, &len, sizeof(len)); |
| |
| buf = init_buf(len); |
| safe_read(fd, buf->head, len); |
| |
| if (!(jc_conf = set_slurm_jc_conf(buf))) |
| goto rwfail; |
| |
| plugin_disabled = _is_plugin_disabled(jc_conf->basepath); |
| |
| return SLURM_SUCCESS; |
| rwfail: |
| error("%s: failed", __func__); |
| return SLURM_ERROR; |
| } |