/*****************************************************************************\
 * namespace_linux.c - Define namespace plugin for creating temporary Linux
 *	namespaces for the job to provide some isolation between
 *	jobs on the same node.
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <semaphore.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/wait.h>
#include "src/common/slurm_xlator.h"
#include "src/common/env.h"
#include "src/common/fd.h"
#include "src/common/log.h"
#include "src/common/read_config.h"
#include "src/common/run_command.h"
#include "src/common/stepd_api.h"
#include "src/common/uid.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/interfaces/cgroup.h"
#include "src/interfaces/proctrack.h"
#include "src/interfaces/switch.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
#include "read_nsconf.h"
static int _create_ns(stepd_step_rec_t *step);
static int _delete_ns(uint32_t job_id);
#if defined(__APPLE__)
extern slurmd_conf_t *conf __attribute__((weak_import));
#else
slurmd_conf_t *conf = NULL;
#endif
/* Required Slurm plugin symbols: */
const char plugin_name[] = "namespace linux plugin";
const char plugin_type[] = "namespace/linux";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
static ns_conf_t *ns_conf = NULL;
static bool plugin_disabled = false;
static pid_t ns_pid = -1;
/*
 * NS_L_NS must be last: loops walk the table in enum order and the mount
 * namespace has to be joined after the other namespace types.
 */
enum ns_l_types {
NS_L_PID = 0,
NS_L_USER,
NS_L_NS,
NS_L_END
};
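/*
 * Per-namespace bookkeeping, one slot per ns_l_types entry: whether the
 * namespace is enabled by the config, a cached fd usable with setns(),
 * the matching CLONE_* flag, the bind-mount target that pins the
 * namespace, and the name of its /proc/<pid>/ns/ entry.
 */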
typedef struct {
bool enabled;
int fd;
int flag;
char *path;
char *proc_name;
} ns_l_t;
static ns_l_t ns_l_enabled[NS_L_END] = { { false, -1, 0, NULL, NULL },
{ false, -1, 0, NULL, NULL },
{ false, -1, 0, NULL, NULL } };
static void _create_paths(uint32_t job_id, char **job_mount, char **ns_base,
char **src_bind)
{
xassert(job_mount);
xstrfmtcat(*job_mount, "%s/%u", ns_conf->basepath, job_id);
if (ns_base) {
xstrfmtcat(*ns_base, "%s/.ns", *job_mount);
if (ns_conf->clonensflags & CLONE_NEWNS) {
ns_l_enabled[NS_L_NS].enabled = true;
ns_l_enabled[NS_L_NS].flag = CLONE_NEWNS;
xfree(ns_l_enabled[NS_L_NS].path);
xstrfmtcat(ns_l_enabled[NS_L_NS].path, "%s/mnt",
*ns_base);
ns_l_enabled[NS_L_NS].proc_name = "mnt";
}
if (ns_conf->clonensflags & CLONE_NEWPID) {
ns_l_enabled[NS_L_PID].enabled = true;
ns_l_enabled[NS_L_PID].flag = CLONE_NEWPID;
xfree(ns_l_enabled[NS_L_PID].path);
xstrfmtcat(ns_l_enabled[NS_L_PID].path, "%s/pid",
*ns_base);
ns_l_enabled[NS_L_PID].proc_name = "pid";
}
if (ns_conf->clonensflags & CLONE_NEWUSER) {
ns_l_enabled[NS_L_USER].enabled = true;
ns_l_enabled[NS_L_USER].flag = CLONE_NEWUSER;
xfree(ns_l_enabled[NS_L_USER].path);
xstrfmtcat(ns_l_enabled[NS_L_USER].path, "%s/user",
*ns_base);
ns_l_enabled[NS_L_USER].proc_name = "user";
}
}
if (src_bind)
xstrfmtcat(*src_bind, "%s/.%u", *job_mount, job_id);
}
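/*
 * The paths built by _create_paths() give each job this layout under the
 * configured basepath:
 *   <basepath>/<jobid>            - job_mount, the job's private base mount
 *   <basepath>/<jobid>/.ns/<type> - files bind-mounted over
 *                                   /proc/<pid>/ns/<type> to pin each
 *                                   enabled namespace (mnt, pid, user)
 *   <basepath>/<jobid>/.<jobid>   - src_bind, holding the job's private
 *                                   directories
 */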
static int _find_step_in_list(step_loc_t *stepd, uint32_t *job_id)
{
return (stepd->step_id.job_id == *job_id);
}
static bool _is_plugin_disabled(char *basepath)
{
return ((!basepath) || (!xstrncasecmp(basepath, "none", 4)));
}
static int _restore_ns(list_t *steps, const char *d_name)
{
char *endptr;
int fd;
unsigned long job_id;
step_loc_t *stepd;
errno = 0;
job_id = strtoul(d_name, &endptr, 10);
if ((errno != 0) || (job_id >= NO_VAL) || (*endptr != '\0')) {
debug3("ignoring %s, could not convert to jobid.", d_name);
return SLURM_SUCCESS;
}
	/* At this point the name parsed as a job id, so assume a job namespace */
log_flag(NAMESPACE, "determine if job %lu is still running", job_id);
stepd = list_find_first(steps, (ListFindF) _find_step_in_list, &job_id);
if (!stepd) {
debug("%s: Job %lu not found, deleting the namespace",
__func__, job_id);
return _delete_ns(job_id);
}
fd = stepd_connect(stepd->directory, stepd->nodename, &stepd->step_id,
&stepd->protocol_version);
if (fd == -1) {
error("%s: failed to connect to stepd for %lu.",
__func__, job_id);
return _delete_ns(job_id);
}
fd_close(&fd);
return SLURM_SUCCESS;
}
extern int init(void)
{
if (running_in_slurmd()) {
/*
* Only init the config here for the slurmd. It will be sent by
* the slurmd to the slurmstepd at launch time.
*/
if (!(ns_conf = init_slurm_ns_conf())) {
error("%s: Configuration not read correctly: Does '%s' not exist?",
plugin_type, ns_conf_file);
return SLURM_ERROR;
}
plugin_disabled = _is_plugin_disabled(ns_conf->basepath);
debug("namespace.yaml read successfully");
}
debug("%s loaded", plugin_name);
return SLURM_SUCCESS;
}
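/*
 * For orientation only: init_slurm_ns_conf() parses the plugin config
 * (namespace.yaml). The authoritative schema lives in read_nsconf.c; the
 * key names below are a hypothetical sketch of how a minimal config maps
 * onto the ns_conf_t fields this file consumes:
 *
 *   basepath: /var/run/slurm/ns    # ns_conf->basepath ("none" disables)
 *   dirs: /tmp,/var/tmp,/dev/shm   # ns_conf->dirs, comma-separated
 *   shared: false                  # ns_conf->shared
 *
 * plus optional InitScript/CloneNSScript/CloneNSEpilog/UserNSScript hooks
 * and the namespace types that populate ns_conf->clonensflags.
 */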
extern void fini(void)
{
#ifdef MEMORY_LEAK_DEBUG
for (int i = 0; i < NS_L_END; i++) {
xfree(ns_l_enabled[i].path);
fd_close(&ns_l_enabled[i].fd);
}
free_ns_conf();
#endif
debug("%s unloaded", plugin_name);
}
extern int namespace_p_restore(char *dir_name, bool recover)
{
DIR *dp;
struct dirent *ep;
list_t *steps;
int rc = SLURM_SUCCESS;
if (plugin_disabled)
return SLURM_SUCCESS;
if (ns_conf->auto_basepath) {
int fstatus;
mode_t omask = umask(S_IWGRP | S_IWOTH);
if (ns_conf->basepath[0] != '/') {
debug("%s: unable to create ns directory '%s' : does not start with '/'",
__func__, ns_conf->basepath);
umask(omask);
return SLURM_ERROR;
}
if ((fstatus = mkdirpath(ns_conf->basepath, 0755, true))) {
debug("%s: unable to create ns directory '%s' : %s",
__func__, ns_conf->basepath,
slurm_strerror(fstatus));
umask(omask);
return SLURM_ERROR;
}
umask(omask);
}
steps = stepd_available(conf->spooldir, conf->node_name);
	/*
	 * Iterate over basepath and restore only the folders that appear
	 * bound to real jobs (i.e. have a .ns file). NOTE: Restoring the
	 * state means either deleting the folder, if the job has died and
	 * its resources are free, or mounting it otherwise.
	 */
if (!(dp = opendir(ns_conf->basepath))) {
error("%s: Unable to open %s", __func__, ns_conf->basepath);
return SLURM_ERROR;
}
while ((ep = readdir(dp))) {
/* If possible, only check directories */
if ((ep->d_type == DT_DIR) || (ep->d_type == DT_UNKNOWN)) {
if (_restore_ns(steps, ep->d_name))
rc = SLURM_ERROR;
}
}
closedir(dp);
FREE_NULL_LIST(steps);
if (rc)
error("Encountered an error while restoring job namespaces.");
return rc;
}
static int _mount_private_dirs(char *path, uid_t uid)
{
char *buffer = NULL, *mount_path = NULL, *save_ptr = NULL, *token;
int rc = 0;
if (!path) {
error("%s: no path to private directories specified.",
__func__);
return -1;
}
buffer = xstrdup(ns_conf->dirs);
token = strtok_r(buffer, ",", &save_ptr);
while (token) {
/* skip /dev/shm, this is handled elsewhere */
if (!xstrcmp(token, "/dev/shm")) {
token = strtok_r(NULL, ",", &save_ptr);
continue;
}
xstrfmtcat(mount_path, "%s/%s", path, token);
		/* flatten the dir path into one component: /var/tmp -> _var_tmp */
		for (char *t = mount_path + strlen(path) + 1; *t; t++) {
			if (*t == '/')
				*t = '_';
		}
rc = mkdir(mount_path, 0700);
if (rc && errno != EEXIST) {
error("%s: Failed to create %s, %m",
__func__, mount_path);
goto private_mounts_exit;
}
if (mount(mount_path, token, NULL, MS_BIND, NULL)) {
error("%s: %s mount failed, %m", __func__, token);
rc = -1;
goto private_mounts_exit;
}
token = strtok_r(NULL, ",", &save_ptr);
xfree(mount_path);
}
private_mounts_exit:
xfree(buffer);
xfree(mount_path);
return rc;
}
static int _chown_private_dirs(char *path, uid_t uid)
{
char *buffer = NULL, *mount_path = NULL, *save_ptr = NULL, *token;
int rc = 0;
if (!path) {
error("%s: no path to private directories specified.",
__func__);
return -1;
}
buffer = xstrdup(ns_conf->dirs);
token = strtok_r(buffer, ",", &save_ptr);
while (token) {
/* skip /dev/shm, this is handled elsewhere */
if (!xstrcmp(token, "/dev/shm")) {
token = strtok_r(NULL, ",", &save_ptr);
continue;
}
xstrfmtcat(mount_path, "%s/%s", path, token);
		/* flatten the dir path into one component: /var/tmp -> _var_tmp */
		for (char *t = mount_path + strlen(path) + 1; *t; t++) {
			if (*t == '/')
				*t = '_';
		}
rc = lchown(mount_path, uid, -1);
if (rc) {
error("%s: lchown failed for %s: %m",
__func__, mount_path);
goto private_mounts_exit;
}
token = strtok_r(NULL, ",", &save_ptr);
xfree(mount_path);
}
private_mounts_exit:
xfree(buffer);
xfree(mount_path);
return rc;
}
static int _mount_private_shm(void)
{
char *loc = NULL;
int rc = 0;
/* return early if "/dev/shm" is not in the mount list */
if (!(loc = xstrcasestr(ns_conf->dirs, "/dev/shm")))
return rc;
	/* make sure the match is the whole "/dev/shm" token, not a prefix */
	if (!((loc[8] == ',') || (loc[8] == 0)))
		return rc;
/* handle mounting a new /dev/shm */
if (!ns_conf->shared) {
/*
* only unmount old /dev/shm if private, otherwise this can
* impact the root namespace
*/
rc = umount("/dev/shm");
if (rc && errno != EINVAL) {
error("%s: umount /dev/shm failed: %m", __func__);
return rc;
}
}
rc = mount("tmpfs", "/dev/shm", "tmpfs", 0, NULL);
if (rc) {
error("%s: /dev/shm mount failed: %m", __func__);
return -1;
}
return rc;
}
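/*
 * Mount a fresh procfs over /proc so the job sees only processes in its
 * new PID namespace. This runs in the cloned child after the mount
 * namespace has been made private (or slave), so the host's /proc is
 * left untouched.
 */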
static int _mount_private_proc(void)
{
if (!ns_l_enabled[NS_L_PID].enabled)
return SLURM_SUCCESS;
if (mount("proc", "/proc", "proc", 0, NULL)) {
error("%s: /proc mount failed: %m", __func__);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
static char **_setup_script_env(uint32_t job_id, stepd_step_rec_t *step,
char *src_bind, char *ns_base)
{
char **env = env_array_create();
env_array_overwrite_fmt(&env, "SLURM_JOB_ID", "%u", job_id);
env_array_overwrite_fmt(&env, "SLURM_CONF", "%s", conf->conffile);
env_array_overwrite_fmt(&env, "SLURMD_NODENAME", "%s", conf->node_name);
if (src_bind)
env_array_overwrite_fmt(&env, "SLURM_JOB_MOUNTPOINT_SRC", "%s",
src_bind);
if (step) {
if (step->het_job_id && (step->het_job_id != NO_VAL))
env_array_overwrite_fmt(&env, "SLURM_HET_JOB_ID", "%u",
step->het_job_id);
env_array_overwrite_fmt(&env, "SLURM_JOB_GID", "%u", step->gid);
env_array_overwrite_fmt(&env, "SLURM_JOB_UID", "%u", step->uid);
env_array_overwrite_fmt(&env, "SLURM_JOB_USER", "%s",
step->user_name);
if (step->alias_list)
env_array_overwrite_fmt(&env, "SLURM_NODE_ALIASES",
"%s", step->alias_list);
if (step->cwd)
env_array_overwrite_fmt(&env, "SLURM_JOB_WORK_DIR",
"%s", step->cwd);
if (step->job_mem)
env_array_overwrite_fmt(&env, "SLURM_JOB_MEM",
"%" PRIu64, step->job_mem);
}
if (ns_base)
env_array_overwrite_fmt(&env, "SLURM_NS", "%s", ns_base);
return env;
}
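/*
 * Thin wrapper over the raw clone(2) syscall. The glibc clone() wrapper
 * demands a separate child stack plus an entry function; calling the
 * syscall directly with a NULL stack lets the child continue from this
 * point, fork()-style, while still honoring the CLONE_NEW* flags. The
 * order of the child_tid and tls arguments is architecture dependent
 * (swapped on CLONE_BACKWARDS architectures, see clone(2)), hence the
 * #ifdef below.
 */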
static pid_t sys_clone(unsigned long flags, int *parent_tid, int *child_tid,
unsigned long tls)
{
#ifdef __x86_64__
return syscall(__NR_clone, flags, NULL, parent_tid, child_tid, tls);
#else
return syscall(__NR_clone, flags, NULL, parent_tid, tls, child_tid);
#endif
}
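/*
 * Body of the child cloned from _create_ns(). Two anonymous shared
 * semaphores sequence the setup: the child blocks on sem1 until the
 * parent has finished the user-namespace setup (uid/gid maps or
 * UserNSScript), performs its mount setup inside the new namespaces,
 * then posts sem2 so the parent may continue. Finally it re-execs
 * slurmstepd in "ns_infinity" mode so one process keeps the namespaces
 * alive for the life of the job.
 */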
static void _create_ns_child(stepd_step_rec_t *step, char *src_bind,
char *job_mount, sem_t *sem1, sem_t *sem2)
{
char *argv[4] = { (char *) conf->stepd_loc, "ns_infinity", NULL, NULL };
int rc = 0;
if (sem_wait(sem1) < 0) {
error("%s: sem_wait failed %m", __func__);
rc = -1;
goto child_exit;
}
if (!ns_conf->shared) {
/* Set root filesystem to private */
if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) {
error("%s: Failed to make root private: %m",
__func__);
rc = -1;
goto child_exit;
}
} else {
/* Set root filesystem to shared */
if (mount(NULL, "/", NULL, MS_SHARED | MS_REC, NULL)) {
error("%s: Failed to make root shared: %m",
__func__);
rc = -1;
goto child_exit;
}
/* Set root filesystem to slave */
if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) {
error("%s: Failed to make root slave: %m",
__func__);
rc = -1;
goto child_exit;
}
}
if (_mount_private_proc() == SLURM_ERROR) {
rc = -1;
goto child_exit;
}
/*
* Now we have a persistent mount namespace.
* Mount private directories inside the namespace.
*/
if (_mount_private_dirs(src_bind, step->uid) == -1) {
rc = -1;
goto child_exit;
}
/*
* switch/nvidia_imex needs to create an ephemeral device
* node under /dev in this new namespace.
*/
if ((rc = switch_g_fs_init(step))) {
error("%s: switch_g_fs_init failed", __func__);
rc = -1;
goto child_exit;
}
if ((rc = _mount_private_shm())) {
error("%s: could not mount private shm", __func__);
goto child_exit;
}
if (sem_post(sem2) < 0) {
error("%s: sem_post failed: %m", __func__);
goto child_exit;
}
sem_destroy(sem1);
munmap(sem1, sizeof(*sem1));
sem_destroy(sem2);
munmap(sem2, sizeof(*sem2));
/* become an infinity process */
xstrfmtcat(argv[2], "%u", step->step_id.job_id);
execvp(argv[0], argv);
error("execvp of slurmstepd infinity failed: %m");
_exit(127);
child_exit:
/* Do a final post to prevent from waiting on errors */
sem_post(sem2);
sem_destroy(sem1);
munmap(sem1, sizeof(*sem1));
sem_destroy(sem2);
munmap(sem2, sizeof(*sem2));
exit(rc);
}
static int _clonens_user_setup(stepd_step_rec_t *step, pid_t pid)
{
int fd = -1, rc = SLURM_SUCCESS;
char *tmpstr = NULL;
if (!ns_l_enabled[NS_L_USER].enabled)
return rc;
	/* If the script is specified, it takes precedence */
if (ns_conf->usernsscript) {
char *result = NULL;
run_command_args_t run_command_args = {
.max_wait = 10 * MSEC_IN_SEC,
.script_path = ns_conf->usernsscript,
.script_type = "UserNSScript",
.status = &rc,
};
run_command_args.env = _setup_script_env(step->step_id.job_id,
step, NULL, NULL);
		env_array_overwrite_fmt(&run_command_args.env, "SLURM_NS_PID",
					"%d", pid);
log_flag(NAMESPACE, "Running UserNSScript");
result = run_command(&run_command_args);
log_flag(NAMESPACE, "UserNSScript rc: %d, stdout: %s",
rc, result);
env_array_free(run_command_args.env);
xfree(result);
if (rc)
error("%s: UserNSScript: %s failed with rc: %d",
__func__, ns_conf->usernsscript, rc);
goto end_it;
}
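	/*
	 * No script configured: write an identity mapping spanning the
	 * full 32-bit range into the child's uid_map and gid_map so ids
	 * inside the user namespace match the host. A count of 4294967295
	 * maps ids 0 through 4294967294; the id (uid_t) -1 itself cannot
	 * be mapped.
	 */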
xstrfmtcat(tmpstr, "/proc/%d/uid_map", pid);
	if ((fd = open(tmpstr, O_WRONLY)) == -1) {
error("%s: open uid_map %s failed: %m", __func__, tmpstr);
rc = SLURM_ERROR;
goto end_it;
}
	if (dprintf(fd, "0 0 4294967295\n") < 1) {
error("%s: write 0 0 4294967295 uid_map %s failed: %m",
__func__, tmpstr);
rc = SLURM_ERROR;
goto end_it;
}
fd_close(&fd);
xfree(tmpstr);
xstrfmtcat(tmpstr, "/proc/%d/gid_map", pid);
	if ((fd = open(tmpstr, O_WRONLY)) == -1) {
error("%s: open gid_map failed: %m", __func__);
rc = SLURM_ERROR;
goto end_it;
}
	if (dprintf(fd, "0 0 4294967295\n") < 1) {
error("%s: write 0 0 4294967295 failed: %m",
__func__ );
rc = SLURM_ERROR;
goto end_it;
}
end_it:
fd_close(&fd);
xfree(tmpstr);
return rc;
}
static int _create_ns(stepd_step_rec_t *step)
{
int child_tid = 0, parent_tid = 0;
char *job_mount = NULL, *ns_base = NULL, *src_bind = NULL;
char *result = NULL;
int fd;
int rc = 0;
unsigned long tls = 0;
sem_t *sem1 = NULL;
sem_t *sem2 = NULL;
_create_paths(step->step_id.job_id, &job_mount, &ns_base, &src_bind);
if (mkdir(job_mount, 0700)) {
error("%s: mkdir %s failed: %m", __func__, job_mount);
rc = SLURM_ERROR;
goto end_it;
}
/*
* MS_BIND mountflag would make mount() ignore all other mountflags
* except MS_REC. We need MS_PRIVATE mountflag as well to make the
* mount (as well as all mounts inside it) private, which needs to be
* done by calling mount() a second time with MS_PRIVATE and MS_REC
* flags.
*/
if (mount(job_mount, job_mount, NULL, MS_BIND, NULL)) {
error("%s: Initial base mount failed: %m", __func__);
rc = SLURM_ERROR;
goto end_it;
}
if (mount(job_mount, job_mount, NULL, MS_PRIVATE | MS_REC, NULL)) {
error("%s: Initial base mount failed: %m", __func__);
rc = SLURM_ERROR;
goto end_it;
}
if (mkdir(ns_base, 0700)) {
error("%s: mkdir %s failed: %m", __func__, ns_base);
rc = SLURM_ERROR;
goto end_it;
}
/* Create locations for all enabled namespaces */
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
fd = open(ns_l_enabled[i].path, O_CREAT | O_RDWR, S_IRWXU);
if (fd == -1) {
error("%s: open failed %s: %m",
__func__, ns_l_enabled[i].path);
rc = -1;
goto exit2;
}
fd_close(&fd);
}
/* Create location for bind mounts to go */
	rc = mkdir(src_bind, 0700);
	if (rc && (errno != EEXIST)) {
		error("%s: mkdir failed %s, %m", __func__, src_bind);
		goto exit2;
	}
	/* tolerate EEXIST, but do not leak its rc into the checks below */
	rc = 0;
if (chown(src_bind, step->uid, -1)) {
error("%s: chown failed for %s: %m",
__func__, src_bind);
rc = -1;
goto exit2;
}
	/* run the initialization script, if any */
if (ns_conf->initscript) {
run_command_args_t run_command_args = {
.max_wait = 10 * MSEC_IN_SEC,
.script_path = ns_conf->initscript,
.script_type = "initscript",
.status = &rc,
};
run_command_args.env = _setup_script_env(step->step_id.job_id,
step, src_bind, NULL);
log_flag(NAMESPACE, "Running InitScript");
result = run_command(&run_command_args);
log_flag(NAMESPACE, "InitScript rc: %d, stdout: %s", rc, result);
env_array_free(run_command_args.env);
xfree(result);
if (rc) {
error("%s: InitScript: %s failed with rc: %d",
__func__, ns_conf->initscript, rc);
goto exit2;
}
}
sem1 = mmap(NULL, sizeof(*sem1), PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (sem1 == MAP_FAILED) {
error("%s: mmap failed: %m", __func__);
rc = -1;
goto exit2;
}
sem2 = mmap(NULL, sizeof(*sem2), PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
if (sem2 == MAP_FAILED) {
error("%s: mmap failed: %m", __func__);
sem_destroy(sem1);
munmap(sem1, sizeof(*sem1));
rc = -1;
goto exit2;
}
rc = sem_init(sem1, 1, 0);
if (rc) {
error("%s: sem_init: %m", __func__);
goto exit1;
}
rc = sem_init(sem2, 1, 0);
if (rc) {
error("%s: sem_init: %m", __func__);
goto exit1;
}
ns_pid = sys_clone(ns_conf->clonensflags | SIGCHLD, &parent_tid,
&child_tid, tls);
if (ns_pid == -1) {
error("%s: sys_clone failed: %m", __func__);
rc = -1;
goto exit1;
} else if (ns_pid == 0) {
_create_ns_child(step, src_bind, job_mount, sem1, sem2);
} else {
char *proc_path = NULL;
		/*
		 * Bind mount /proc/<pid>/ns/<type> onto the pinned .ns files
		 * to keep each namespace active without a process attached
		 * to it.
		 */
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
			xstrfmtcat(proc_path, "/proc/%d/ns/%s", ns_pid,
				   ns_l_enabled[i].proc_name);
rc = mount(proc_path, ns_l_enabled[i].path, NULL,
MS_BIND, NULL);
if (rc) {
error("%s: ns %s mount failed: %m",
__func__, ns_l_enabled[i].proc_name);
if (sem_post(sem1) < 0)
error("%s: Could not release semaphore: %m",
__func__);
xfree(proc_path);
goto exit1;
}
xfree(proc_path);
}
/* setup users before setting up the rest of the container */
if ((rc = _clonens_user_setup(step, ns_pid))) {
error("%s: Unable to prepare user namespace.",
__func__);
			/* error needs to fall through here */
}
/* Setup remainder of the container */
if (sem_post(sem1) < 0) {
error("%s: sem_post failed: %m", __func__);
goto exit1;
}
/* Wait for container to be setup */
if (sem_wait(sem2) < 0) {
error("%s: sem_Wait failed: %m", __func__);
rc = -1;
goto exit1;
}
if (proctrack_g_add(step, ns_pid) != SLURM_SUCCESS) {
error("%s: Job %u can't add pid %d to proctrack plugin in the extern_step.",
__func__, step->step_id.job_id, ns_pid);
rc = SLURM_ERROR;
goto exit1;
}
if (_chown_private_dirs(src_bind, step->uid) == -1) {
rc = -1;
goto exit1;
}
/* Any error that remains here should skip further setup */
if (rc)
goto exit1;
}
/* run any post clone initialization script */
if (ns_conf->clonensscript) {
run_command_args_t run_command_args = {
.max_wait = ns_conf->clonensscript_wait * MSEC_IN_SEC,
.script_path = ns_conf->clonensscript,
.script_type = "clonensscript",
.status = &rc,
};
run_command_args.env =
_setup_script_env(step->step_id.job_id, step, src_bind,
ns_l_enabled[NS_L_NS].path);
log_flag(NAMESPACE, "Running CloneNSScript");
result = run_command(&run_command_args);
log_flag(NAMESPACE, "CloneNSScript rc: %d, stdout: %s",
rc, result);
xfree(result);
env_array_free(run_command_args.env);
if (rc) {
error("%s: CloneNSScript %s failed with rc=%d",
__func__, ns_conf->clonensscript, rc);
goto exit2;
}
}
exit1:
sem_destroy(sem1);
munmap(sem1, sizeof(*sem1));
sem_destroy(sem2);
munmap(sem2, sizeof(*sem2));
exit2:
if (rc) {
int failures;
/* cleanup the job mount */
if ((failures = rmdir_recursive(job_mount, false))) {
error("%s: failed to remove %d files from %s",
__func__, failures, job_mount);
rc = SLURM_ERROR;
goto end_it;
}
if (umount2(job_mount, MNT_DETACH))
error("%s: umount2 %s failed: %m",
__func__, job_mount);
if (rmdir(job_mount))
error("rmdir %s failed: %m", job_mount);
}
end_it:
xfree(job_mount);
xfree(src_bind);
xfree(ns_base);
return rc;
}
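/*
 * Hand the caller a list of (CLONE_* flag, namespace fd) pairs, one per
 * enabled namespace, so it can setns() into them itself. The fds are
 * opened from the pinned .ns files and cached for later calls; the
 * return value is the number of entries in ns_map.
 */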
extern int namespace_p_join_external(slurm_step_id_t *step_id, list_t *ns_map)
{
char *job_mount = NULL, *ns_base = NULL;
ns_fd_map_t *tmp_map = NULL;
xassert(ns_map);
if (plugin_disabled)
return 0;
_create_paths(step_id->job_id, &job_mount, &ns_base, NULL);
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
if (ns_l_enabled[i].fd == -1) {
ns_l_enabled[i].fd =
open(ns_l_enabled[i].path, O_RDONLY);
if (ns_l_enabled[i].fd == -1) {
error("%s: %m", __func__);
goto end_it;
}
}
tmp_map = xmalloc(sizeof(*tmp_map));
tmp_map->type = ns_l_enabled[i].flag;
tmp_map->fd = ns_l_enabled[i].fd;
list_append(ns_map, tmp_map);
tmp_map = NULL;
}
end_it:
xfree(job_mount);
xfree(ns_base);
return list_count(ns_map);
}
extern int namespace_p_join(slurm_step_id_t *step_id, uid_t uid,
bool step_create)
{
char *job_mount = NULL, *ns_base = NULL;
int rc = SLURM_SUCCESS;
if (plugin_disabled)
return SLURM_SUCCESS;
/* Formerly EntireStepInNS handling, this is now the normal process */
if ((running_in_slurmstepd() && step_id->step_id != SLURM_EXTERN_CONT))
return SLURM_SUCCESS;
	/*
	 * Job id 0 means we are not a real job but a script running instead,
	 * so we do not need to handle this request.
	 */
if (step_id->job_id == 0)
return SLURM_SUCCESS;
_create_paths(step_id->job_id, &job_mount, &ns_base, NULL);
	/* Open all namespaces first; however, we cannot assume this is shared */
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
		/* This is called on the slurmd, so we can't reuse a cached fd. */
ns_l_enabled[i].fd = open(ns_l_enabled[i].path, O_RDONLY);
if (ns_l_enabled[i].fd == -1) {
error("%s: open failed for %s: %m",
__func__, ns_l_enabled[i].path);
xfree(job_mount);
xfree(ns_base);
return SLURM_ERROR;
}
}
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
rc = setns(ns_l_enabled[i].fd, 0);
fd_close(&ns_l_enabled[i].fd);
if (rc) {
error("%s: setns failed for %s: %m",
__func__, ns_l_enabled[i].path);
/* closed after error() */
xfree(job_mount);
xfree(ns_base);
return SLURM_ERROR;
}
log_flag(NAMESPACE, "%ps entered %s namespace", step_id,
ns_l_enabled[i].path);
}
log_flag(NAMESPACE, "%ps entered namespace", step_id);
xfree(job_mount);
xfree(ns_base);
return SLURM_SUCCESS;
}
static int _delete_ns(uint32_t job_id)
{
char *job_mount = NULL, *ns_base = NULL;
int rc = 0, failures = 0;
char *result = NULL;
_create_paths(job_id, &job_mount, &ns_base, NULL);
	/*
	 * Run the post-clone epilog script, if any; its environment includes
	 * the job id and the namespace base path.
	 */
if (ns_conf->clonensepilog) {
run_command_args_t run_command_args = {
.max_wait = ns_conf->clonensepilog_wait * MSEC_IN_SEC,
.script_path = ns_conf->clonensepilog,
.script_type = "clonensepilog",
.status = &rc,
};
run_command_args.env =
_setup_script_env(job_id, NULL, NULL, ns_base);
log_flag(NAMESPACE, "Running CloneNSEpilog");
result = run_command(&run_command_args);
env_array_free(run_command_args.env);
log_flag(NAMESPACE, "CloneNSEpilog rc: %d, stdout: %s",
rc, result);
xfree(result);
if (rc) {
error("%s: CloneNSEpilog script %s failed with rc=%d",
__func__, ns_conf->clonensepilog, rc);
}
}
errno = 0;
/*
* umount2() sets errno to EINVAL if the target is not a mount point
* but also if called with invalid flags. Consider this if changing the
* flags to umount2().
*/
for (int i = 0; i < NS_L_END; i++) {
if (!ns_l_enabled[i].enabled)
continue;
rc = umount2(ns_l_enabled[i].path, MNT_DETACH);
if (rc) {
if ((errno == EINVAL) || (errno == ENOENT)) {
log_flag(NAMESPACE, "%s: umount2 %s failed: %m",
__func__, ns_l_enabled[i].path);
} else {
error("%s: umount2 %s failed: %m",
__func__, ns_l_enabled[i].path);
failures = 1;
}
}
}
/* If any of the unmounts failed above, bail out here */
if (failures) {
xfree(job_mount);
xfree(ns_base);
return SLURM_ERROR;
}
if ((failures = rmdir_recursive(job_mount, false)))
error("%s: failed to remove %d files from %s",
__func__, failures, job_mount);
if (umount2(job_mount, MNT_DETACH))
log_flag(NAMESPACE, "umount2: %s failed: %m", job_mount);
if (rmdir(job_mount))
error("rmdir %s failed: %m", job_mount);
xfree(job_mount);
xfree(ns_base);
return SLURM_SUCCESS;
}
extern int namespace_p_stepd_create(stepd_step_rec_t *step)
{
if (plugin_disabled)
return SLURM_SUCCESS;
return _create_ns(step);
}
extern int namespace_p_stepd_delete(slurm_step_id_t *step_id)
{
if (plugin_disabled)
return SLURM_SUCCESS;
if (ns_pid) {
int wstatus;
/*
* The namespace process may have been signaled already, but
* kill it to be sure.
*/
kill(ns_pid, SIGKILL);
waitpid(ns_pid, &wstatus, 0);
ns_pid = -1;
}
return _delete_ns(step_id->job_id);
}
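/*
 * The slurmd forwards its parsed config to each slurmstepd as a
 * length-prefixed buffer over a pipe: namespace_p_send_stepd() writes
 * the serialized ns_conf and namespace_p_recv_stepd() reads it back,
 * re-deriving the disabled flag on the stepd side.
 */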
extern int namespace_p_send_stepd(int fd)
{
int len;
buf_t *buf;
buf = get_slurm_ns_conf_buf();
	/* The config should have been initialized by now */
xassert(buf);
len = get_buf_offset(buf);
safe_write(fd, &len, sizeof(len));
safe_write(fd, get_buf_data(buf), len);
return SLURM_SUCCESS;
rwfail:
error("%s: failed", __func__);
return SLURM_ERROR;
}
extern int namespace_p_recv_stepd(int fd)
{
int len;
buf_t *buf;
safe_read(fd, &len, sizeof(len));
buf = init_buf(len);
safe_read(fd, buf->head, len);
if (!(ns_conf = set_slurm_ns_conf(buf)))
goto rwfail;
plugin_disabled = _is_plugin_disabled(ns_conf->basepath);
return SLURM_SUCCESS;
rwfail:
error("%s: failed", __func__);
return SLURM_ERROR;
}
extern bool namespace_p_can_bpf(stepd_step_rec_t *step)
{
if (plugin_disabled)
return true;
	/*
	 * Only specific parts of the extern step run inside the namespace;
	 * the eBPF calls made from the extern step are not in the namespace.
	 */
if (step->step_id.step_id == SLURM_EXTERN_CONT)
return true;
/*
* bpf programs cannot be directly loaded from inside the user namespace
* unless a token is created.
*/
if (ns_conf->clonensflags & CLONE_NEWUSER)
return false;
return true;
}
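/*
 * When user namespaces are in use, a stepd inside one cannot load eBPF
 * programs directly (see namespace_p_can_bpf() above). Instead it asks
 * the job's extern-step slurmstepd, which can still create one, for a
 * BPF token fd over the stepd API and hands it to the cgroup plugin for
 * later bpf() calls.
 */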
extern int namespace_p_setup_bpf_token(stepd_step_rec_t *step)
{
int rc = SLURM_ERROR;
int fd = -1;
int token_fd = SLURM_ERROR;
uint16_t prot_ver;
slurm_step_id_t con = step->step_id;
	/*
	 * This indicates either that this is an extern step or that the
	 * plugin is not configured to use user namespaces. In both cases we
	 * do not need to get a bpf token. Also, if we already have one, do
	 * not set up another.
	 */
if (namespace_p_can_bpf(step) || cgroup_g_bpf_get_token() != -1)
return SLURM_SUCCESS;
#ifndef HAVE_BPF_TOKENS
error("Slurm is not compiled with BPF token support");
return SLURM_ERROR;
#endif
con.step_id = SLURM_EXTERN_CONT;
con.step_het_comp = NO_VAL;
if ((fd = stepd_connect(conf->spooldir, conf->node_name, &con,
&prot_ver)) == -1) {
error("%s: Connect to %ps external failed: %m",
__func__, &con.job_id);
goto end;
}
token_fd = stepd_get_bpf_token(fd, prot_ver);
if (token_fd != SLURM_ERROR) {
cgroup_g_bpf_set_token(token_fd);
rc = SLURM_SUCCESS;
}
end:
fd_close(&fd);
return rc;
}