/*****************************************************************************\
 * namespace_linux.c - Define namespace plugin for creating temporary Linux
 *	namespaces for the job to provide some isolation between
 *	jobs on the same node.
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <semaphore.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/wait.h>
#include "src/common/slurm_xlator.h"
#include "src/common/env.h"
#include "src/common/fd.h"
#include "src/common/log.h"
#include "src/common/read_config.h"
#include "src/common/run_command.h"
#include "src/common/stepd_api.h"
#include "src/common/uid.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/interfaces/cgroup.h"
#include "src/interfaces/proctrack.h"
#include "src/interfaces/switch.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
#include "read_nsconf.h"
static int _create_ns(stepd_step_rec_t *step);
static int _delete_ns(uint32_t job_id);
#if defined(__APPLE__)
extern slurmd_conf_t *conf __attribute__((weak_import));
#else
slurmd_conf_t *conf = NULL;
#endif
/* Required Slurm plugin symbols: */
const char plugin_name[] = "namespace linux plugin";
const char plugin_type[] = "namespace/linux";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
static ns_conf_t *ns_conf = NULL;
static bool plugin_disabled = false;
static pid_t ns_pid = -1;
/*
 * NS_L_NS must be last: loops walk the table in enum order and the mount
 * namespace has to be joined after the other namespace types.
 */
enum ns_l_types {
NS_L_PID = 0,
NS_L_USER,
NS_L_NS,
NS_L_END
};
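/*
 * Per-namespace bookkeeping, one slot per ns_l_types entry: whether the
 * namespace is enabled by the config, a cached fd usable with setns(),
 * the matching CLONE_* flag, the bind-mount target that pins the
 * namespace, and the name of its /proc/<pid>/ns/ entry.
 */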
typedef struct {
bool enabled;
int fd;
int flag;
char *path;
char *proc_name;
} ns_l_t;
static ns_l_t ns_l_enabled[NS_L_END] = { { false, -1, 0, NULL, NULL },
{ false, -1, 0, NULL, NULL },
{ false, -1, 0, NULL, NULL } };
static void _create_paths(uint32_t job_id, char **job_mount, char **ns_base,
char **src_bind)
{
xassert(job_mount);
xstrfmtcat(*job_mount, "%s/%u", ns_conf->basepath, job_id);
if (ns_base) {
xstrfmtcat(*ns_base, "%s/.ns", *job_mount);
if (ns_conf->clonensflags & CLONE_NEWNS) {
ns_l_enabled[NS_L_NS].enabled = true;
ns_l_enabled[NS_L_NS].flag = CLONE_NEWNS;
xfree(ns_l_enabled[NS_L_NS].path);
xstrfmtcat(ns_l_enabled[NS_L_NS].path, "%s/mnt",
*ns_base);
ns_l_enabled[NS_L_NS].proc_name = "mnt";
}
if (ns_conf->clonensflags & CLONE_NEWPID) {
ns_l_enabled[NS_L_PID].enabled = true;
ns_l_enabled[NS_L_PID].flag = CLONE_NEWPID;
xfree(ns_l_enabled[NS_L_PID].path);
xstrfmtcat(ns_l_enabled[NS_L_PID].path, "%s/pid",
*ns_base);
ns_l_enabled[NS_L_PID].proc_name = "pid";
}
if (ns_conf->clonensflags & CLONE_NEWUSER) {
ns_l_enabled[NS_L_USER].enabled = true;
ns_l_enabled[NS_L_USER].flag = CLONE_NEWUSER;
xfree(ns_l_enabled[NS_L_USER].path);
xstrfmtcat(ns_l_enabled[NS_L_USER].path, "%s/user",
*ns_base);
ns_l_enabled[NS_L_USER].proc_name = "user";
}
}
if (src_bind)
xstrfmtcat(*src_bind, "%s/.%u", *job_mount, job_id);
}
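/*
 * The paths built by _create_paths() give each job this layout under the
 * configured basepath:
 *   <basepath>/<jobid>            - job_mount, the job's private base mount
 *   <basepath>/<jobid>/.ns/<type> - files bind-mounted over
 *                                   /proc/<pid>/ns/<type> to pin each
 *                                   enabled namespace (mnt, pid, user)
 *   <basepath>/<jobid>/.<jobid>   - src_bind, holding the job's private
 *                                   directories
 */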
static int _find_step_in_list(step_loc_t *stepd, uint32_t *job_id)
{
return (stepd->step_id.job_id == *job_id);
}
static bool _is_plugin_disabled(char *basepath)
{
return ((!basepath) || (!xstrncasecmp(basepath, "none", 4)));
}
static int _restore_ns(list_t *steps, const char *d_name)
{
char *endptr;
int fd;
unsigned long job_id;
step_loc_t *stepd;
errno = 0;
job_id = strtoul(d_name, &endptr, 10);
if ((errno != 0) || (job_id >= NO_VAL) || (*endptr != '\0')) {
debug3("ignoring %s, could not convert to jobid.", d_name);
return SLURM_SUCCESS;
}
	/* At this point the name parsed as a job id, so assume a job namespace */
log_flag(NAMESPACE, "determine if job %lu is still running", job_id);
stepd = list_find_first(steps, (ListFindF) _find_step_in_list, &job_id);
if (!stepd) {
debug("%s: Job %lu not found, deleting the namespace",
__func__, job_id);
return _delete_ns(job_id);
}
fd = stepd_connect(stepd->directory, stepd->nodename, &stepd->step_id,
&stepd->protocol_version);
if (fd == -1) {
error("%s: failed to connect to stepd for %lu.",
__func__, job_id);
return _delete_ns(job_id);
}
fd_close(&fd);
return SLURM_SUCCESS;
}
extern int init(void)
{
if (running_in_slurmd()) {
/*
* Only init the config here for the slurmd. It will be sent by
* the slurmd to the slurmstepd at launch time.
*/
if (!(ns_conf = init_slurm_ns_conf())) {
error("%s: Configuration not read correctly: Does '%s' not exist?",
plugin_type, ns_conf_file);
return SLURM_ERROR;
}
plugin_disabled = _is_plugin_disabled(ns_conf->basepath);
debug("namespace.yaml read successfully");
}
debug("%s loaded", plugin_name);
return SLURM_SUCCESS;
}
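/*
 * For orientation only: init_slurm_ns_conf() parses the plugin config
 * (namespace.yaml). The authoritative schema lives in read_nsconf.c; the
 * key names below are a hypothetical sketch of how a minimal config maps
 * onto the ns_conf_t fields this file consumes:
 *
 *   basepath: /var/run/slurm/ns    # ns_conf->basepath ("none" disables)
 *   dirs: /tmp,/var/tmp,/dev/shm   # ns_conf->dirs, comma-separated
 *   shared: false                  # ns_conf->shared
 *
 * plus optional InitScript/CloneNSScript/CloneNSEpilog/UserNSScript hooks
 * and the namespace types that populate ns_conf->clonensflags.
 */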
extern void fini(void)
{
#ifdef MEMORY_LEAK_DEBUG
for (int i = 0; i < NS_L_END; i++) {
xfree(ns_l_enabled[i].path);
fd_close(&ns_l_enabled[i].fd);
}
free_ns_conf();
#endif
debug("%s unloaded", plugin_name);
}
extern int namespace_p_restore(char *dir_name, bool recover)
{
DIR *dp;
struct dirent *ep;
list_t *steps;
int rc = SLURM_SUCCESS;
if (plugin_disabled)
return SLURM_SUCCESS;
if (ns_conf->auto_basepath) {
int fstatus;
mode_t omask = umask(S_IWGRP | S_IWOTH);
if (ns_conf->basepath[0] != '/') {
debug("%s: unable to create ns directory '%s' : does not start with '/'",
__func__, ns_conf->basepath);
umask(omask);
return SLURM_ERROR;
}
if ((fstatus = mkdirpath(ns_conf->basepath, 0755, true))) {
debug("%s: unable to create ns directory '%s' : %s",
__func__, ns_conf->basepath,
slurm_strerror(fstatus));
umask(omask);
return SLURM_ERROR;
}
umask(omask);
}
steps = stepd_available(conf->spooldir, conf->node_name);
	/*
	 * Iterate over basepath and restore only the folders that appear
	 * bound to real jobs (i.e. have a .ns file). NOTE: Restoring the
	 * state means either deleting the folder, if the job has died and
	 * its resources are free, or mounting it otherwise.
	 */
if (!(dp = opendir(ns_conf->basepath))) {
error("%s: Unable to open %s", __func__, ns_conf->basepath);
return SLURM_ERROR;
}
while ((ep = readdir(dp))) {
/* If possible, only check directories */
if ((ep->d_type == DT_DIR) || (ep->d_type == DT_UNKNOWN)) {
if (_restore_ns(steps, ep->d_name))
rc = SLURM_ERROR;
}
}
closedir(dp);
FREE_NULL_LIST(steps);
if (rc)
error("Encountered an error while restoring job namespaces.");
return rc;
}
static int _mount_private_dirs(char *path, uid_t uid)
{
char *buffer = NULL, *mount_path = NULL, *save_ptr = NULL, *token;
int rc = 0;
if (!path) {
error("%s: no path to private directories specified.",
__func__);
return -1;
}
buffer = xstrdup(ns_conf->dirs);
token = strtok_r(buffer, ",", &save_ptr);
while (token) {
/* skip /dev/shm, this is handled elsewhere */
if (!xstrcmp(token, "/dev/shm")) {
token = strtok_r(NULL, ",", &save_ptr);
continue;
}
xstrfmtcat(mount_path, "%s/%s", path, token);
		/* flatten the dir path into one component: /var/tmp -> _var_tmp */
		for (char *t = mount_path + strlen(path) + 1; *t; t++) {
			if (*t == '/')
				*t = '_';
		}
rc = mkdir(mount_path, 0700);
if (rc && errno != EEXIST) {
error("%s: Failed to create %s, %m",
__func__, mount_path);
goto private_mounts_exit;
}
if (mount(mount_path, token, NULL, MS_BIND, NULL)) {
error("%s: %s mount failed, %m", __func__, token);
rc = -1;
goto private_mounts_exit;
}
token = strtok_r(NULL, ",", &save_ptr);
xfree(mount_path);
}
private_mounts_exit:
xfree(buffer);
xfree(mount_path);
return rc;
}
static int _chown_private_dirs(char *path, uid_t uid)
{
char *buffer = NULL, *mount_path = NULL, *save_ptr = NULL, *token;
int rc = 0;
if (!path) {
error("%s: no path to private directories specified.",
__func__);
return -1;
}
buffer = xstrdup(ns_conf->dirs);
token = strtok_r(buffer, ",", &save_ptr);
while (token) {
/* skip /dev/shm, this is handled elsewhere */
if (!xstrcmp(token, "/dev/shm")) {
token = strtok_r(NULL, ",", &save_ptr);
continue;
}
xstrfmtcat(mount_path, "%s/%s", path, token);
		/* flatten the dir path into one component: /var/tmp -> _var_tmp */
		for (char *t = mount_path + strlen(path) + 1; *t; t++) {
			if (*t == '/')
				*t = '_';
		}
rc = lchown(mount_path, uid, -1);
if (rc) {
error("%s: lchown failed for %s: %m",
__func__, mount_path);
goto private_mounts_exit;
}
token = strtok_r(NULL, ",", &save_ptr);
xfree(mount_path);
}
private_mounts_exit:
xfree(buffer);
xfree(mount_path);
return rc;
}
static int _mount_private_shm(void)
{
char *loc = NULL;
int rc = 0;
/* return early if "/dev/shm" is not in the mount list */
if (!(loc = xstrcasestr(ns_conf->dirs, "/dev/shm")))
return rc;
	/* make sure the match is the whole "/dev/shm" token, not a prefix */
	if (!((loc[8] == ',') || (loc[8] == 0)))
		return rc;
/* handle mounting a new /dev/shm */
if (!ns_conf->shared) {
/*
* only unmount old /dev/shm if private, otherwise this can
* impact the root namespace
*/
rc = umount("/dev/shm");
if (rc && errno != EINVAL) {
error("%s: umount /dev/shm failed: %m", __func__);
return rc;
}
}
rc = mount("tmpfs", "/dev/shm", "tmpfs", 0, NULL);
if (rc) {
error("%s: /dev/shm mount failed: %m", __func__);
return -1;
}
return rc;
}
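/*
 * Mount a fresh procfs over /proc so the job sees only processes in its
 * new PID namespace. This runs in the cloned child after the mount
 * namespace has been made private (or slave), so the host's /proc is
 * left untouched.
 */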
static int _mount_private_proc(void)
{
if (!ns_l_enabled[NS_L_PID].enabled)
return SLURM_SUCCESS;
if (mount("proc", "/proc", "proc", 0, NULL)) {
error("%s: /proc mount failed: %m", __func__);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
static char **_setup_script_env(uint32_t job_id, stepd_step_rec_t *step,
char *src_bind, char *ns_base)
{
char **env = env_array_create();
env_array_overwrite_fmt(&env, "SLURM_JOB_ID", "%u", job_id);
env_array_overwrite_fmt(&env, "SLURM_CONF", "%s", conf->conffile);
env_array_overwrite_fmt(&env, "SLURMD_NODENAME", "%s", conf->node_name);
if (src_bind)
env_array_overwrite_fmt(&env, "SLURM_JOB_MOUNTPOINT_SRC", "%s",
src_bind);
if (step) {
if (step->het_job_id && (step->het_job_id != NO_VAL))
env_array_overwrite_fmt(&env, "SLURM_HET_JOB_ID", "%u",
step->het_job_id);
env_array_overwrite_fmt(&env, "SLURM_JOB_GID", "%u", step->gid);
env_array_overwrite_fmt(&env, "SLURM_JOB_UID", "%u", step->uid);
env_array_overwrite_fmt(&env, "SLURM_JOB_USER", "%s",
step->user_name);
if (step->alias_list)
env_array_overwrite_fmt(&env, "SLURM_NODE_ALIASES",
"%s", step->alias_list);
if (step->cwd)
env_array_overwrite_fmt(&env, "SLURM_JOB_WORK_DIR",
"%s", step->cwd);
if (step->job_mem)
env_array_overwrite_fmt(&env, "SLURM_JOB_MEM",
"%" PRIu64, step->job_mem);
}
if (ns_base)
env_array_overwrite_fmt(&env, "SLURM_NS", "%s", ns_base);
return env;
}
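/*
 * Thin wrapper over the raw clone(2) syscall. The glibc clone() wrapper
 * demands a separate child stack plus an entry function; calling the
 * syscall directly with a NULL stack lets the child continue from this
 * point, fork()-style, while still honoring the CLONE_NEW* flags. The
 * order of the child_tid and tls arguments is architecture dependent
 * (swapped on CLONE_BACKWARDS architectures, see clone(2)), hence the
 * #ifdef below.
 */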
static pid_t sys_clone(unsigned long flags, int *parent_tid, int *child_tid,
unsigned long tls)
{
#ifdef __x86_64__
return syscall(__NR_clone, flags, NULL, parent_tid, child_tid, tls);
#else
return syscall(__NR_clone, flags, NULL, parent_tid, tls, child_tid);
#endif
}
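/*
 * Body of the child cloned from _create_ns(). Two anonymous shared
 * semaphores sequence the setup: the child blocks on sem1 until the
 * parent has finished the user-namespace setup (uid/gid maps or
 * UserNSScript), performs its mount setup inside the new namespaces,
 * then posts sem2 so the parent may continue. Finally it re-execs
 * slurmstepd in "ns_infinity" mode so one process keeps the namespaces
 * alive for the life of the job.
 */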
static void _create_ns_child(stepd_step_rec_t *step, char *src_bind,
char *job_mount, sem_t *sem1, sem_t *sem2)
{
char *argv[4] = { (char *) conf->stepd_loc, "ns_infinity", NULL, NULL };
int rc = 0;
if (sem_wait(sem1) < 0) {
error("%s: sem_wait failed %m", __func__);
rc = -1;
goto child_exit;
}
if (!ns_conf->shared) {
/* Set root filesystem to private */
if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) {
error("%s: Failed to make root private: %m",
__func__);
rc = -1;
goto child_exit;
}
} else {
/* Set root filesystem to shared */
if (mount(NULL, "/", NULL, MS_SHARED | MS_REC, NULL)) {
error("%s: Failed to make root shared: %m",
__func__);
rc = -1;
goto child_exit;
}
/* Set root filesystem to slave */
if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) {
error("%s: Failed to make root slave: %m",
__func__);
rc = -1;
goto child_exit;
}
}
if (_mount_private_proc() == SLURM_ERROR) {
rc = -1;
goto child_exit;
}
/*
* Now we have a persistent mount namespace.
* Mount private directories inside the namespace.
*/
if (_mount_private_dirs(src_bind, step->uid) == -1) {
rc = -1;
goto child_exit;
}
/*
* switch/nvidia_imex needs to create an ephemeral device
* node under /dev in this new namespace.
*/
if ((rc = switch_g_fs_init(step))) {
error("%s: switch_g_fs_init failed", __func__);
rc = -1;
goto child_exit;
}
if ((rc = _mount_private_shm())) {
error("%s: could not mount private shm", __func__);
goto child_exit;
}
if (sem_post(sem2) < 0) {
error("%s: sem_post failed: %m", __func__);
goto child_exit;
}
sem_destroy(sem1);
munmap(sem1, sizeof(*sem1));
sem_destroy(sem2);
munmap(sem2, sizeof(*sem2));
/* become an infinity process */
xstrfmtcat(argv[2], "%u", step->step_id.job_id);
execvp(argv[0], argv);
error("execvp of slurmstepd infinity failed: %m");
_exit(127);
child_exit:
/* Do a final post to prevent from waiting on errors */
sem_post(sem2);
sem_destroy(sem1);
munmap(sem1, sizeof(*sem1));
sem_destroy(sem2);
munmap(sem2, sizeof(*sem2));
exit(rc);
}
static int _clonens_user_setup(stepd_step_rec_t *step, pid_t pid)
{
int fd = -1, rc = SLURM_SUCCESS;
char *tmpstr = NULL;
if (!ns_l_enabled[NS_L_USER].enabled)
return rc;
	/* If the script is specified, it takes precedence */
if (ns_conf->usernsscript) {
char *result = NULL;
run_command_args_t run_command_args = {
.max_wait = 10 * MSEC_IN_SEC,
.script_path = ns_conf->usernsscript,
.script_type = "UserNSScript",
.status = &rc,
};
run_command_args.env = _setup_script_env(step->step_id.job_id,
step, NULL, NULL);
		env_array_overwrite_fmt(&run_command_args.env, "SLURM_NS_PID",
					"%d", pid);
log_flag(NAMESPACE, "Running UserNSScript");
result = run_command(&run_command_args);
log_flag(NAMESPACE, "UserNSScript rc: %d, stdout: %s",
rc, result);
env_array_free(run_command_args.env);
xfree(result);
if (rc)
error("%s: UserNSScript: %s failed with rc: %d",
__func__, ns_conf->usernsscript, rc);
goto end_it;
}
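	/*
	 * No script configured: write an identity mapping spanning the
	 * full 32-bit range into the child's uid_map and gid_map so ids
	 * inside the user namespace match the host. A count of 4294967295
	 * maps ids 0 through 4294967294; the id (uid_t) -1 itself cannot
	 * be mapped.
	 */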
xstrfmtcat(tmpstr, "/proc/%d/uid_map", pid);
	if ((fd = open(tmpstr, O_WRONLY)) == -1) {
error("%s: open uid_map %s failed: %m", __func__, tmpstr);
rc = SLURM_ERROR;
goto end_it;
}
	if (dprintf(fd, "0 0 4294967295\n") < 1) {
error("%s: write 0 0 4294967295 uid_map %s failed: %m",
__func__, tmpstr);
rc = SLURM_ERROR;
goto end_it;
}
fd_close(&fd);
xfree(tmpstr);
xstrfmtcat(tmpstr, "/proc/%d/gid_map", pid);
	if ((fd = open(tmpstr, O_WRONLY)) == -1) {
error("%s: open gid_map failed: %m", __func__);
rc = SLURM_ERROR;
goto end_it;
}
	if (dprintf(fd, "0 0 4294967295\n") < 1) {
error("%s: write 0 0 4294967295 failed: %m",
__func__ );
rc = SLURM_ERROR;
goto end_it;
}
end_it:
fd_close(&fd);
xfree(tmpstr);
return rc;
}
static int _create_ns(stepd_step_rec_t *step)
{
int child_tid = 0, parent_tid = 0;
char *job_mount = NULL, *ns_base = NULL, *src_bind = NULL;
char *result = NULL;
int fd;
int rc = 0;
unsigned long tls = 0;
sem_t *sem1 = NULL;
sem_t *sem2 = NULL;
_create_paths(step->step_id.job_id, &job_mount, &ns_base, &src_bind);
if (mkdir(job_mount, 0700)) {
error("%s: mkdir %s failed: %m", __func__, job_mount);
rc = SLURM_ERROR;
goto end_it;
}
/*
* MS_BIND mountflag would make mount() ignore all other mountflags
* except MS_REC. We need MS_PRIVATE mountflag as well to make the
* mount (as well as all mounts inside it) private, which needs to be
* done by calling mount() a second time with MS_PRIVATE and MS_REC
* flags.
*/
if (mount(job_mount, job_mount, NULL, MS_BIND, NULL)) {
error("%s: Initial base mount failed: %m", __func__);
rc = SLURM_ERROR;
goto end_it;
}
if (mount(job_mount, job_mount, NULL, MS_PRIVATE | MS_REC, NULL)) {
error("%s: Initial base mount failed: %m", __func__);
rc = SLURM_ERROR;
goto end_it;
}
if (mkdir(ns_base, 0700)) {
error("%s: mkdir %s failed: %m", __func__, ns_base);
rc = SLURM_ERROR;
goto end_it;
}
/* Create locations for all enabled namespaces */
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
fd = open(ns_l_enabled[i].path, O_CREAT | O_RDWR, S_IRWXU);
if (fd == -1) {
error("%s: open failed %s: %m",
__func__, ns_l_enabled[i].path);
rc = -1;
goto exit2;
}
fd_close(&fd);
}
/* Create location for bind mounts to go */
	rc = mkdir(src_bind, 0700);
	if (rc && (errno != EEXIST)) {
		error("%s: mkdir failed %s, %m", __func__, src_bind);
		goto exit2;
	}
	/* tolerate EEXIST, but do not leak its rc into the checks below */
	rc = 0;
if (chown(src_bind, step->uid, -1)) {
error("%s: chown failed for %s: %m",
__func__, src_bind);
rc = -1;
goto exit2;
}
	/* run the initialization script, if any */
if (ns_conf->initscript) {
run_command_args_t run_command_args = {
.max_wait = 10 * MSEC_IN_SEC,
.script_path = ns_conf->initscript,
.script_type = "initscript",
.status = &rc,
};
run_command_args.env = _setup_script_env(step->step_id.job_id,
step, src_bind, NULL);
log_flag(NAMESPACE, "Running InitScript");
result = run_command(&run_command_args);
log_flag(NAMESPACE, "InitScript rc: %d, stdout: %s", rc, result);
env_array_free(run_command_args.env);
xfree(result);
if (rc) {
error("%s: InitScript: %s failed with rc: %d",
__func__, ns_conf->initscript, rc);
goto exit2;
}
}
sem1 = mmap(NULL, sizeof(*sem1), PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (sem1 == MAP_FAILED) {
error("%s: mmap failed: %m", __func__);
rc = -1;
goto exit2;
}
sem2 = mmap(NULL, sizeof(*sem2), PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (sem2 == MAP_FAILED) {
error("%s: mmap failed: %m", __func__);
sem_destroy(sem1);
munmap(sem1, sizeof(*sem1));
rc = -1;
goto exit2;
}
rc = sem_init(sem1, 1, 0);
if (rc) {
error("%s: sem_init: %m", __func__);
goto exit1;
}
rc = sem_init(sem2, 1, 0);
if (rc) {
error("%s: sem_init: %m", __func__);
goto exit1;
}
ns_pid = sys_clone(ns_conf->clonensflags | SIGCHLD, &parent_tid,
&child_tid, tls);
if (ns_pid == -1) {
error("%s: sys_clone failed: %m", __func__);
rc = -1;
goto exit1;
} else if (ns_pid == 0) {
_create_ns_child(step, src_bind, job_mount, sem1, sem2);
} else {
char *proc_path = NULL;
		/*
		 * Bind mount /proc/<pid>/ns/<type> onto the pinned .ns files
		 * to keep each namespace active without a process attached
		 * to it.
		 */
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
			xstrfmtcat(proc_path, "/proc/%d/ns/%s", ns_pid,
				   ns_l_enabled[i].proc_name);
rc = mount(proc_path, ns_l_enabled[i].path, NULL,
MS_BIND, NULL);
if (rc) {
error("%s: ns %s mount failed: %m",
__func__, ns_l_enabled[i].proc_name);
if (sem_post(sem1) < 0)
error("%s: Could not release semaphore: %m",
__func__);
xfree(proc_path);
goto exit1;
}
xfree(proc_path);
}
/* setup users before setting up the rest of the container */
if ((rc = _clonens_user_setup(step, ns_pid))) {
error("%s: Unable to prepare user namespace.",
__func__);
			/* error needs to fall through here */
}
/* Setup remainder of the container */
if (sem_post(sem1) < 0) {
error("%s: sem_post failed: %m", __func__);
goto exit1;
}
/* Wait for container to be setup */
if (sem_wait(sem2) < 0) {
error("%s: sem_Wait failed: %m", __func__);
rc = -1;
goto exit1;
}
if (proctrack_g_add(step, ns_pid) != SLURM_SUCCESS) {
error("%s: Job %u can't add pid %d to proctrack plugin in the extern_step.",
__func__, step->step_id.job_id, ns_pid);
rc = SLURM_ERROR;
goto exit1;
}
if (_chown_private_dirs(src_bind, step->uid) == -1) {
rc = -1;
goto exit1;
}
/* Any error that remains here should skip further setup */
if (rc)
goto exit1;
}
/* run any post clone initialization script */
if (ns_conf->clonensscript) {
run_command_args_t run_command_args = {
.max_wait = ns_conf->clonensscript_wait * MSEC_IN_SEC,
.script_path = ns_conf->clonensscript,
.script_type = "clonensscript",
.status = &rc,
};
run_command_args.env =
_setup_script_env(step->step_id.job_id, step, src_bind,
ns_l_enabled[NS_L_NS].path);
log_flag(NAMESPACE, "Running CloneNSScript");
result = run_command(&run_command_args);
log_flag(NAMESPACE, "CloneNSScript rc: %d, stdout: %s",
rc, result);
xfree(result);
env_array_free(run_command_args.env);
if (rc) {
error("%s: CloneNSScript %s failed with rc=%d",
__func__, ns_conf->clonensscript, rc);
goto exit2;
}
}
exit1:
sem_destroy(sem1);
munmap(sem1, sizeof(*sem1));
sem_destroy(sem2);
munmap(sem2, sizeof(*sem2));
exit2:
if (rc) {
int failures;
/* cleanup the job mount */
if ((failures = rmdir_recursive(job_mount, false))) {
error("%s: failed to remove %d files from %s",
__func__, failures, job_mount);
rc = SLURM_ERROR;
goto end_it;
}
if (umount2(job_mount, MNT_DETACH))
error("%s: umount2 %s failed: %m",
__func__, job_mount);
if (rmdir(job_mount))
error("rmdir %s failed: %m", job_mount);
}
end_it:
xfree(job_mount);
xfree(src_bind);
xfree(ns_base);
return rc;
}
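/*
 * Hand the caller a list of (CLONE_* flag, namespace fd) pairs, one per
 * enabled namespace, so it can setns() into them itself. The fds are
 * opened from the pinned .ns files and cached for later calls; the
 * return value is the number of entries in ns_map.
 */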
extern int namespace_p_join_external(slurm_step_id_t *step_id, list_t *ns_map)
{
char *job_mount = NULL, *ns_base = NULL;
ns_fd_map_t *tmp_map = NULL;
xassert(ns_map);
if (plugin_disabled)
return 0;
_create_paths(step_id->job_id, &job_mount, &ns_base, NULL);
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
if (ns_l_enabled[i].fd == -1) {
ns_l_enabled[i].fd =
open(ns_l_enabled[i].path, O_RDONLY);
if (ns_l_enabled[i].fd == -1) {
error("%s: %m", __func__);
goto end_it;
}
}
tmp_map = xmalloc(sizeof(*tmp_map));
tmp_map->type = ns_l_enabled[i].flag;
tmp_map->fd = ns_l_enabled[i].fd;
list_append(ns_map, tmp_map);
tmp_map = NULL;
}
end_it:
xfree(job_mount);
xfree(ns_base);
return list_count(ns_map);
}
extern int namespace_p_join(slurm_step_id_t *step_id, uid_t uid,
bool step_create)
{
char *job_mount = NULL, *ns_base = NULL;
int rc = SLURM_SUCCESS;
if (plugin_disabled)
return SLURM_SUCCESS;
/* Formerly EntireStepInNS handling, this is now the normal process */
if ((running_in_slurmstepd() && step_id->step_id != SLURM_EXTERN_CONT))
return SLURM_SUCCESS;
	/*
	 * Job id 0 means we are not a real job but a script running instead,
	 * so we do not need to handle this request.
	 */
if (step_id->job_id == 0)
return SLURM_SUCCESS;
_create_paths(step_id->job_id, &job_mount, &ns_base, NULL);
	/* Open all namespaces first; however, we cannot assume this is shared */
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
		/* This is called on the slurmd, so we can't reuse a cached fd. */
ns_l_enabled[i].fd = open(ns_l_enabled[i].path, O_RDONLY);
if (ns_l_enabled[i].fd == -1) {
error("%s: open failed for %s: %m",
__func__, ns_l_enabled[i].path);
xfree(job_mount);
xfree(ns_base);
return SLURM_ERROR;
}
}
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
rc = setns(ns_l_enabled[i].fd, 0);
fd_close(&ns_l_enabled[i].fd);
if (rc) {
error("%s: setns failed for %s: %m",
__func__, ns_l_enabled[i].path);
/* closed after error() */
xfree(job_mount);
xfree(ns_base);
return SLURM_ERROR;
}
log_flag(NAMESPACE, "%ps entered %s namespace", step_id,
ns_l_enabled[i].path);
}
log_flag(NAMESPACE, "%ps entered namespace", step_id);
xfree(job_mount);
xfree(ns_base);
return SLURM_SUCCESS;
}
static int _delete_ns(uint32_t job_id)
{
char *job_mount = NULL, *ns_base = NULL;
int rc = 0, failures = 0;
char *result = NULL;
_create_paths(job_id, &job_mount, &ns_base, NULL);
	/*
	 * Run the post-clone epilog script, if any; its environment includes
	 * the job id and the namespace base path.
	 */
if (ns_conf->clonensepilog) {
run_command_args_t run_command_args = {
.max_wait = ns_conf->clonensepilog_wait * MSEC_IN_SEC,
.script_path = ns_conf->clonensepilog,
.script_type = "clonensepilog",
.status = &rc,
};
run_command_args.env =
_setup_script_env(job_id, NULL, NULL, ns_base);
log_flag(NAMESPACE, "Running CloneNSEpilog");
result = run_command(&run_command_args);
env_array_free(run_command_args.env);
log_flag(NAMESPACE, "CloneNSEpilog rc: %d, stdout: %s",
rc, result);
xfree(result);
if (rc) {
error("%s: CloneNSEpilog script %s failed with rc=%d",
__func__, ns_conf->clonensepilog, rc);
}
}
errno = 0;
/*
* umount2() sets errno to EINVAL if the target is not a mount point
* but also if called with invalid flags. Consider this if changing the
* flags to umount2().
*/
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
rc = umount2(ns_l_enabled[i].path, MNT_DETACH);
if (rc) {
if ((errno == EINVAL) || (errno == ENOENT)) {
log_flag(NAMESPACE, "%s: umount2 %s failed: %m",
__func__, ns_l_enabled[i].path);
} else {
error("%s: umount2 %s failed: %m",
__func__, ns_l_enabled[i].path);
failures = 1;
}
}
}
/* If any of the unmounts failed above, bail out here */
if (failures) {
xfree(job_mount);
xfree(ns_base);
return SLURM_ERROR;
}
if ((failures = rmdir_recursive(job_mount, false)))
error("%s: failed to remove %d files from %s",
__func__, failures, job_mount);
if (umount2(job_mount, MNT_DETACH))
log_flag(NAMESPACE, "umount2: %s failed: %m", job_mount);
if (rmdir(job_mount))
error("rmdir %s failed: %m", job_mount);
xfree(job_mount);
xfree(ns_base);
return SLURM_SUCCESS;
}
extern int namespace_p_stepd_create(stepd_step_rec_t *step)
{
if (plugin_disabled)
return SLURM_SUCCESS;
return _create_ns(step);
}
extern int namespace_p_stepd_delete(slurm_step_id_t *step_id)
{
if (plugin_disabled)
return SLURM_SUCCESS;
if (ns_pid) {
int wstatus;
/*
* The namespace process may have been signaled already, but
* kill it to be sure.
*/
kill(ns_pid, SIGKILL);
waitpid(ns_pid, &wstatus, 0);
ns_pid = -1;
}
return _delete_ns(step_id->job_id);
}
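/*
 * The slurmd forwards its parsed config to each slurmstepd as a
 * length-prefixed buffer over a pipe: namespace_p_send_stepd() writes
 * the serialized ns_conf and namespace_p_recv_stepd() reads it back,
 * re-deriving the disabled flag on the stepd side.
 */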
extern int namespace_p_send_stepd(int fd)
{
int len;
buf_t *buf;
buf = get_slurm_ns_conf_buf();
	/* The config should have been initialized by now */
xassert(buf);
len = get_buf_offset(buf);
safe_write(fd, &len, sizeof(len));
safe_write(fd, get_buf_data(buf), len);
return SLURM_SUCCESS;
rwfail:
error("%s: failed", __func__);
return SLURM_ERROR;
}
extern int namespace_p_recv_stepd(int fd)
{
int len;
buf_t *buf;
safe_read(fd, &len, sizeof(len));
buf = init_buf(len);
safe_read(fd, buf->head, len);
if (!(ns_conf = set_slurm_ns_conf(buf)))
goto rwfail;
plugin_disabled = _is_plugin_disabled(ns_conf->basepath);
return SLURM_SUCCESS;
rwfail:
error("%s: failed", __func__);
return SLURM_ERROR;
}
extern bool namespace_p_can_bpf(stepd_step_rec_t *step)
{
if (plugin_disabled)
return true;
	/*
	 * Only specific parts of the extern step run inside the namespace;
	 * the eBPF calls made from the extern step are not in the namespace.
	 */
if (step->step_id.step_id == SLURM_EXTERN_CONT)
return true;
/*
* bpf programs cannot be directly loaded from inside the user namespace
* unless a token is created.
*/
if (ns_conf->clonensflags & CLONE_NEWUSER)
return false;
return true;
}
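/*
 * When user namespaces are in use, a stepd inside one cannot load eBPF
 * programs directly (see namespace_p_can_bpf() above). Instead it asks
 * the job's extern-step slurmstepd, which can still create one, for a
 * BPF token fd over the stepd API and hands it to the cgroup plugin for
 * later bpf() calls.
 */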
extern int namespace_p_setup_bpf_token(stepd_step_rec_t *step)
{
int rc = SLURM_ERROR;
int fd = -1;
int token_fd = SLURM_ERROR;
uint16_t prot_ver;
slurm_step_id_t con = step->step_id;
	/*
	 * This indicates either that this is an extern step or that the
	 * plugin is not configured to use user namespaces. In both cases we
	 * do not need to get a bpf token. Also, if we already have one, do
	 * not set up another.
	 */
if (namespace_p_can_bpf(step) || cgroup_g_bpf_get_token() != -1)
return SLURM_SUCCESS;
#ifndef HAVE_BPF_TOKENS
error("Slurm is not compiled with BPF token support");
return SLURM_ERROR;
#endif
con.step_id = SLURM_EXTERN_CONT;
con.step_het_comp = NO_VAL;
if ((fd = stepd_connect(conf->spooldir, conf->node_name, &con,
&prot_ver)) == -1) {
error("%s: Connect to %ps external failed: %m",
__func__, &con.job_id);
goto end;
}
token_fd = stepd_get_bpf_token(fd, prot_ver);
if (token_fd != SLURM_ERROR) {
cgroup_g_bpf_set_token(token_fd);
rc = SLURM_SUCCESS;
}
end:
fd_close(&fd);
return rc;
}