| /*****************************************************************************\ |
| * cgroup_v1.c - Cgroup v1 plugin |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include "cgroup_v1.h" |
| |
| /* |
| * These variables are required by the generic plugin interface. If they |
| * are not found in the plugin, the plugin loader will ignore it. |
| * |
| * plugin_name - a string giving a human-readable description of the |
| * plugin. There is no maximum length, but the symbol must refer to |
| * a valid string. |
| * |
| * plugin_type - a string suggesting the type of the plugin or its |
| * applicability to a particular form of data or method of data handling. |
| * If the low-level plugin API is used, the contents of this string are |
| * unimportant and may be anything. Slurm uses the higher-level plugin |
| * interface which requires this string to be of the form |
| * |
| * <application>/<method> |
| * |
| * where <application> is a description of the intended application of |
| * the plugin (e.g., "select" for Slurm node selection) and <method> |
| * is a description of how this plugin satisfies that application. Slurm will |
| * only load select plugins if the plugin_type string has a |
| * prefix of "select/". |
| * |
| * plugin_version - an unsigned 32-bit integer containing the Slurm version |
| * (major.minor.micro combined into a single number). |
| */ |
| const char plugin_name[] = "Cgroup v1 plugin"; |
| const char plugin_type[] = "cgroup/v1"; |
| const uint32_t plugin_version = SLURM_VERSION_NUMBER; |
| |
/*
 * Per-controller relative cgroup paths (user/job/step levels) inside each
 * controller hierarchy. An empty string means "not created yet".
 */
static char g_user_cgpath[CG_CTL_CNT][PATH_MAX];
static char g_job_cgpath[CG_CTL_CNT][PATH_MAX];
static char g_step_cgpath[CG_CTL_CNT][PATH_MAX];
/* Number of plugins currently using the step directories, per controller. */
static uint16_t g_step_active_cnt[CG_CTL_CNT];

/* One cgroup namespace (mount point etc.) per controller. */
static xcgroup_ns_t g_cg_ns[CG_CTL_CNT];

/* Internal cgroup structs, indexed by controller and by hierarchy level. */
static xcgroup_t int_cg[CG_CTL_CNT][CG_LEVEL_CNT];

/* Controller names, indexed by cgroup_ctl_type_t. */
const char *g_cg_name[CG_CTL_CNT] = {
	"freezer",
	"cpuset",
	"memory",
	"devices",
	"cpuacct"
};

/* Cgroup v1 control items for the oom monitor */
#define STOP_OOM 1	/* Sentinel telling the oom monitor thread to stop. */

typedef enum {
	OOM_KILL_NONE,    /* Don't account for oom_kill events. */
	OOM_KILL_COUNTER, /* Use memory.oom_control's oom_kill field. */
	OOM_KILL_MON      /* Spawn a monitoring thread and use eventfd. */
} oom_kill_type_t;

typedef struct {
	int cfd;      /* control file fd. */
	int efd;      /* event file fd. */
	int event_fd; /* eventfd fd. */
} oom_event_args_t;

static oom_kill_type_t oom_kill_type = OOM_KILL_NONE;
/* Accumulated OOM kill events; presumably guarded by oom_mutex — confirm. */
static uint64_t oom_kill_count = 0;
static int oom_pipe[2] = { -1, -1 };
static pthread_t oom_thread;
static pthread_mutex_t oom_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Task tracking artifacts */
list_t *g_task_list[CG_CTL_CNT]; /* Lists of task_cg_info_t per controller. */
static uint32_t g_max_task_id = 0;
/*
 * There are potentially multiple tasks on a node, so we want to
 * track every task cgroup and which taskid it belongs to.
 */
typedef struct {
	xcgroup_t task_cg;
	uint32_t taskid;
} task_cg_info_t;

extern bool cgroup_p_has_feature(cgroup_ctl_feature_t f);

/* Forward declarations (defined later in this file). */
static int _step_destroy_internal(cgroup_ctl_type_t sub, bool root_locked);
static int _get_oom_kill_from_file(xcgroup_t *cg);
| |
| static int _cgroup_init(cgroup_ctl_type_t sub) |
| { |
| if (sub >= CG_CTL_CNT) |
| return SLURM_ERROR; |
| |
| if (xcgroup_ns_create(&g_cg_ns[sub], "", g_cg_name[sub]) |
| != SLURM_SUCCESS) { |
| error("unable to create %s cgroup namespace", g_cg_name[sub]); |
| return SLURM_ERROR; |
| } |
| |
| if (common_cgroup_create(&g_cg_ns[sub], &int_cg[sub][CG_LEVEL_ROOT], |
| "", 0, 0) != SLURM_SUCCESS) { |
| error("unable to create root %s xcgroup", g_cg_name[sub]); |
| common_cgroup_ns_destroy(&g_cg_ns[sub]); |
| return SLURM_ERROR; |
| } |
| |
| if (xcgroup_create_slurm_cg( |
| &g_cg_ns[sub], &int_cg[sub][CG_LEVEL_SLURM]) != |
| SLURM_SUCCESS) { |
| error("unable to create slurm %s xcgroup", g_cg_name[sub]); |
| common_cgroup_ns_destroy(&g_cg_ns[sub]); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _cpuset_create(stepd_step_rec_t *step) |
| { |
| int rc; |
| char *sys_cgpath = NULL; |
| char *value; |
| size_t cpus_size; |
| |
| rc = common_cgroup_get_param(&int_cg[CG_CPUS][CG_LEVEL_SLURM], |
| "cpuset.cpus", &value, &cpus_size); |
| |
| if ((rc != SLURM_SUCCESS) || (cpus_size == 1)) { |
| /* initialize the cpusets as it was non-existent */ |
| if (xcgroup_cpuset_init(&int_cg[CG_CPUS][CG_LEVEL_SLURM]) != |
| SLURM_SUCCESS) { |
| xfree(value); |
| return SLURM_ERROR; |
| } |
| } |
| |
| /* Do not inherit this setting in children, let plugins set it. */ |
| common_cgroup_set_param(&int_cg[CG_CPUS][CG_LEVEL_SLURM], |
| "cgroup.clone_children", "0"); |
| |
| if (step == NULL) { |
| /* This is a request to create a cpuset for slurmd daemon */ |
| xstrfmtcat(sys_cgpath, "%s/system", |
| int_cg[CG_CPUS][CG_LEVEL_SLURM].name); |
| |
| /* create system cgroup in the cpuset ns */ |
| if ((rc = common_cgroup_create( |
| &g_cg_ns[CG_CPUS], |
| &int_cg[CG_CPUS][CG_LEVEL_SYSTEM], |
| sys_cgpath, getuid(), getgid())) |
| != SLURM_SUCCESS) { |
| goto end; |
| } |
| |
| if (running_in_slurmd()) { |
| /* |
| * The slurmd is the only place we need to set up the |
| * system cgroup and the slurmstepd should not overwrite |
| * these. If slurmstepd does overwrite these values |
| * (such as cpuset.cpus) then slurmd will not be |
| * properly constrained anymore. |
| */ |
| if ((rc = common_cgroup_instantiate( |
| &int_cg[CG_CPUS][CG_LEVEL_SYSTEM])) |
| != SLURM_SUCCESS) |
| goto end; |
| |
| /* set notify on release flag */ |
| common_cgroup_set_param( |
| &int_cg[CG_CPUS][CG_LEVEL_SYSTEM], |
| "notify_on_release", "0"); |
| |
| if ((rc = xcgroup_cpuset_init( |
| &int_cg[CG_CPUS][CG_LEVEL_SYSTEM])) |
| != SLURM_SUCCESS) |
| goto end; |
| } |
| |
| log_flag(CGROUP, |
| "system cgroup: system cpuset cgroup initialized"); |
| } else { |
| /* |
| * We don't lock here the g_root cg[CG_CPUS] because it is |
| * locked from the caller. |
| */ |
| rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[CG_CPUS], |
| int_cg[CG_CPUS], |
| g_job_cgpath[CG_CPUS], |
| g_step_cgpath[CG_CPUS], |
| g_user_cgpath[CG_CPUS]); |
| } |
| |
| end: |
| xfree(value); |
| xfree(sys_cgpath); |
| |
| return rc; |
| } |
| |
/*
 * Tear down the step/job/user hierarchy of one controller.
 *
 * int_cg      - IN - per-level cgroup array for this controller (the
 *               parameter intentionally shadows the file-level int_cg;
 *               callers pass int_cg[sub]).
 * log_str     - IN - controller name used in log messages.
 * root_locked - IN - true if the caller already holds the root cgroup lock.
 *
 * Returns SLURM_ERROR only if the step directory itself could not be
 * removed; failure to remove the job/user directories is tolerated (other
 * steps/jobs may still be using them).
 */
static int _remove_cg_subsystem(xcgroup_t int_cg[], const char *log_str,
				bool root_locked)
{
	xcgroup_t *root_cg = &int_cg[CG_LEVEL_ROOT];
	xcgroup_t *job_cg = &int_cg[CG_LEVEL_JOB];
	xcgroup_t *step_cg = &int_cg[CG_LEVEL_STEP];
	xcgroup_t *user_cg = &int_cg[CG_LEVEL_USER];
	xcgroup_t *slurm_cg = &int_cg[CG_LEVEL_SLURM];
	int rc = SLURM_SUCCESS;

	/*
	 * Lock the root cgroup so we don't race with other steps that are being
	 * started.
	 */
	if (!root_locked && (common_cgroup_lock(root_cg) != SLURM_SUCCESS)) {
		error("common_cgroup_lock error (%s)", log_str);
		return SLURM_ERROR;
	}

	/*
	 * Always try to move slurmstepd process to the root cgroup, otherwise
	 * the rmdir(2) triggered by the calls below will always fail if the pid
	 * of stepd is in the cgroup. We don't know what other plugins will do
	 * and whether they will attach the stepd pid to the cg.
	 */
	rc = common_cgroup_move_process(root_cg, getpid());
	if (rc != SLURM_SUCCESS) {
		error("Unable to move pid %d to root cgroup", getpid());
		goto end;
	}
	/* Wait for the kernel to actually show the pid out of the step cg. */
	common_cgroup_wait_pid_moved(step_cg, getpid(), log_str);

	/* Delete step cgroup. */
	if ((rc = common_cgroup_delete(step_cg)) != SLURM_SUCCESS)
		goto end;

	/*
	 * At this point we'll do a best effort for the job and user cgroup,
	 * since other jobs or steps may still be alive and not let us complete
	 * the cleanup. The last job/step in the hierarchy will be the one which
	 * will finally remove these two directories
	 */
	/* Delete job cgroup (best effort: failure is not an error). */
	if ((rc = common_cgroup_delete(job_cg)) != SLURM_SUCCESS) {
		rc = SLURM_SUCCESS;
		goto end;
	}
	/* Delete user cgroup (best effort: failure is not an error). */
	if ((rc = common_cgroup_delete(user_cg)) != SLURM_SUCCESS) {
		rc = SLURM_SUCCESS;
		goto end;
	}

	/*
	 * Invalidate the cgroup structs.
	 */
	common_cgroup_destroy(user_cg);
	common_cgroup_destroy(job_cg);
	common_cgroup_destroy(step_cg);
	common_cgroup_destroy(slurm_cg);

end:
	if (!root_locked)
		common_cgroup_unlock(root_cg);
	return rc;
}
| |
| static int _acct_task(void *x, void *arg) |
| { |
| task_cg_info_t *t = (task_cg_info_t *) x; |
| cgroup_ctl_type_t *ctl = (cgroup_ctl_type_t *) arg; |
| |
| /* Before deleting the task we account for its oom_kill if needed. */ |
| if ((oom_kill_type == OOM_KILL_COUNTER) && |
| (ctl && (*ctl == CG_MEMORY))) |
| _get_oom_kill_from_file(&t->task_cg); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _rmdir_task(void *x, void *arg) |
| { |
| task_cg_info_t *t = (task_cg_info_t *) x; |
| |
| if (common_cgroup_delete(&t->task_cg) != SLURM_SUCCESS) |
| log_flag(CGROUP, "taskid: %d, failed to delete %s %m", |
| t->taskid, t->task_cg.path); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _find_task_cg_info(void *x, void *key) |
| { |
| task_cg_info_t *task_cg = (task_cg_info_t*)x; |
| uint32_t taskid = *(uint32_t*)key; |
| |
| if (task_cg->taskid == taskid) |
| return 1; |
| |
| return 0; |
| } |
| |
| static void _free_task_cg_info(void *object) |
| { |
| task_cg_info_t *task_cg = (task_cg_info_t *)object; |
| |
| if (task_cg) { |
| common_cgroup_destroy(&task_cg->task_cg); |
| xfree(task_cg); |
| } |
| } |
| |
| static int _handle_task_cgroup(cgroup_ctl_type_t sub, stepd_step_rec_t *step, |
| pid_t pid, uint32_t taskid) |
| { |
| int rc = SLURM_SUCCESS; |
| bool need_to_add = false; |
| task_cg_info_t *task_cg_info; |
| uid_t uid = step->uid; |
| gid_t gid = step->gid; |
| char *task_cgroup_path = NULL; |
| |
| /* build task cgroup relative path */ |
| xstrfmtcat(task_cgroup_path, "%s/task_%u", g_step_cgpath[sub], taskid); |
| if (!task_cgroup_path) { |
| error("unable to build task_%u cg relative path for %s: %m", |
| taskid, g_step_cgpath[sub]); |
| return SLURM_ERROR; |
| } |
| |
| if (!(task_cg_info = list_find_first(g_task_list[sub], |
| _find_task_cg_info, |
| &taskid))) { |
| task_cg_info = xmalloc(sizeof(*task_cg_info)); |
| task_cg_info->taskid = taskid; |
| need_to_add = true; |
| } |
| |
| /* |
| * Create task cgroup in the cg ns |
| */ |
| if (common_cgroup_create(&g_cg_ns[sub], &task_cg_info->task_cg, |
| task_cgroup_path, uid, gid) != SLURM_SUCCESS) { |
| error("unable to create task %u cgroup", taskid); |
| xfree(task_cg_info); |
| xfree(task_cgroup_path); |
| return SLURM_ERROR; |
| } |
| |
| if (common_cgroup_instantiate(&task_cg_info->task_cg) != SLURM_SUCCESS) |
| { |
| _free_task_cg_info(task_cg_info); |
| error("unable to instantiate task %u cgroup", taskid); |
| xfree(task_cgroup_path); |
| return SLURM_ERROR; |
| } |
| |
| /* set notify on release flag */ |
| common_cgroup_set_param(&task_cg_info->task_cg, "notify_on_release", |
| "0"); |
| |
| /* Initialize the cpuset cgroup before moving processes into it */ |
| if (sub == CG_CPUS) { |
| rc = xcgroup_cpuset_init(&task_cg_info->task_cg); |
| if (rc != SLURM_SUCCESS) { |
| error("Unable to initialize the cpuset cgroup %s", |
| task_cg_info->task_cg.path); |
| goto end; |
| } |
| } |
| |
| /* Attach the pid to the corresponding step_x/task_y cgroup */ |
| rc = common_cgroup_move_process(&task_cg_info->task_cg, pid); |
| if (rc != SLURM_SUCCESS) |
| error("Unable to move pid %d to %s cg", pid, task_cgroup_path); |
| |
| /* Add the cgroup to the list now that it is initialized. */ |
| if (need_to_add) |
| list_append(g_task_list[sub], task_cg_info); |
| |
| end: |
| xfree(task_cgroup_path); |
| return rc; |
| } |
| |
| static int _all_tasks_destroy(cgroup_ctl_type_t sub) |
| { |
| int rc; |
| |
| /* Empty the lists of accounted tasks, do a best effort in rmdir */ |
| rc = list_for_each(g_task_list[sub], _rmdir_task, NULL); |
| list_flush(g_task_list[sub]); |
| |
| return rc; |
| } |
| |
| static bool _is_root_path(char *path) |
| { |
| bool rc = false; |
| char *parent_path = NULL, file_path[PATH_MAX]; |
| parent_path = xdirname(path); |
| |
| if (snprintf(file_path, PATH_MAX, "%s/cgroup.procs", parent_path) >= |
| PATH_MAX) { |
| error("Could not generate cgroup path: %s", file_path); |
| goto end; |
| } |
| |
| /* If cgroup.procs is not found one level up, we are in the root */ |
| if (access(file_path, F_OK)) |
| rc = true; |
| |
| end: |
| xfree(parent_path); |
| return rc; |
| } |
| |
| static void _remove_process_cg_limits(pid_t pid) |
| { |
| xcgroup_t cg_cpu = { 0 }; |
| xcgroup_t cg_mem = { 0 }; |
| xcgroup_ns_t cpu_ns = { 0 }; |
| xcgroup_ns_t mem_ns = { 0 }; |
| |
| /* Try to reset cpuset limits */ |
| if (xcgroup_ns_create(&cpu_ns, "", g_cg_name[CG_CPUS])) { |
| log_flag(CGROUP,"Not resetting cpuset, controller not found"); |
| } else if (xcgroup_ns_find_by_pid(&cpu_ns, &cg_cpu, pid)) { |
| error("Cannot find slurmd cpu cgroup"); |
| } else if (!_is_root_path(cg_cpu.path)) { |
| if (xcgroup_cpuset_init(&cg_cpu)) { |
| error("Cannot reset slurmd cpuset limits"); |
| } else { |
| log_flag(CGROUP, "Reset slurmd cpuset limits"); |
| } |
| } |
| common_cgroup_destroy(&cg_cpu); |
| common_cgroup_ns_destroy(&cpu_ns); |
| |
| /* Try to reset memory limits */ |
| if (xcgroup_ns_create(&mem_ns, "", g_cg_name[CG_MEMORY])) { |
| log_flag(CGROUP,"Not resetting memory, controller not found"); |
| } else if (xcgroup_ns_find_by_pid(&mem_ns, &cg_mem, pid)) { |
| error("Cannot find slurmd memory cgroup"); |
| } else if (!_is_root_path(cg_mem.path)) { |
| if (common_cgroup_set_param(&cg_mem, "memory.limit_in_bytes", |
| "-1")) { |
| error("Cannot reset slurmd memory limits"); |
| } else { |
| log_flag(CGROUP, "Reset slurmd memory limits"); |
| } |
| } |
| common_cgroup_destroy(&cg_mem); |
| common_cgroup_ns_destroy(&mem_ns); |
| } |
| |
| extern int init(void) |
| { |
| int i; |
| |
| for (i = 0; i < CG_CTL_CNT; i++) { |
| g_user_cgpath[i][0] = '\0'; |
| g_job_cgpath[i][0] = '\0'; |
| g_step_cgpath[i][0] = '\0'; |
| g_step_active_cnt[i] = 0; |
| FREE_NULL_LIST(g_task_list[i]); |
| g_task_list[i] = list_create(_free_task_cg_info); |
| } |
| |
| debug("%s loaded", plugin_name); |
| return SLURM_SUCCESS; |
| } |
| |
| extern void fini(void) |
| { |
| for (int sub = 0; sub < CG_CTL_CNT; sub++) { |
| FREE_NULL_LIST(g_task_list[sub]); |
| common_cgroup_ns_destroy(&g_cg_ns[sub]); |
| common_cgroup_destroy(&int_cg[sub][CG_LEVEL_ROOT]); |
| } |
| |
| debug("unloading %s", plugin_name); |
| } |
| |
| extern int cgroup_p_setup_scope(char *scope_path) |
| { |
| if (running_in_slurmd()) |
| _remove_process_cg_limits(getpid()); |
| return SLURM_SUCCESS; |
| } |
| |
/* Cgroup v1 exposes no scope path; always returns NULL. */
extern char *cgroup_p_get_scope_path(void)
{
	return NULL;
}
| |
| extern int cgroup_p_initialize(cgroup_ctl_type_t sub) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* Only initialize if not inited */ |
| if (g_cg_ns[sub].mnt_point) |
| return rc; |
| |
| if ((rc = _cgroup_init(sub))) |
| return rc; |
| |
| switch (sub) { |
| case CG_TRACK: |
| case CG_CPUS: |
| break; |
| case CG_MEMORY: |
| common_cgroup_set_param(&int_cg[sub][CG_LEVEL_ROOT], |
| "memory.use_hierarchy", "1"); |
| break; |
| case CG_DEVICES: |
| case CG_CPUACCT: |
| break; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| rc = SLURM_ERROR; |
| break; |
| } |
| |
| return rc; |
| } |
| |
| extern int cgroup_p_system_create(cgroup_ctl_type_t sub) |
| { |
| char *sys_cgpath = NULL; |
| int rc = SLURM_SUCCESS; |
| |
| switch (sub) { |
| case CG_CPUS: |
| rc = _cpuset_create(NULL); |
| break; |
| case CG_MEMORY: |
| xstrfmtcat(sys_cgpath, "%s/system", |
| int_cg[sub][CG_LEVEL_SLURM].name); |
| |
| if ((rc = common_cgroup_create(&g_cg_ns[sub], |
| &int_cg[sub][CG_LEVEL_SYSTEM], |
| sys_cgpath, getuid(), getgid())) |
| != SLURM_SUCCESS) |
| goto end; |
| |
| if ((rc = common_cgroup_instantiate( |
| &int_cg[sub][CG_LEVEL_SYSTEM])) |
| != SLURM_SUCCESS) |
| goto end; |
| |
| /* set notify on release flag */ |
| common_cgroup_set_param(&int_cg[sub][CG_LEVEL_SYSTEM], |
| "notify_on_release", "0"); |
| |
| if ((rc = common_cgroup_set_param(&int_cg[sub][CG_LEVEL_SYSTEM], |
| "memory.use_hierarchy", "1")) |
| != SLURM_SUCCESS) { |
| error("system cgroup: unable to ask for hierarchical accounting of system memcg '%s'", |
| int_cg[sub][CG_LEVEL_SYSTEM].path); |
| goto end; |
| } |
| |
| if ((rc = common_cgroup_set_uint64_param( |
| &int_cg[sub][CG_LEVEL_SYSTEM], |
| "memory.oom_control", 1)) |
| != SLURM_SUCCESS) { |
| error("Resource spec: unable to disable OOM Killer in system memory cgroup: %s", |
| int_cg[sub][CG_LEVEL_SYSTEM].path); |
| goto end; |
| } |
| break; |
| case CG_TRACK: |
| case CG_DEVICES: |
| case CG_CPUACCT: |
| error("This operation is not supported for %s", g_cg_name[sub]); |
| return SLURM_ERROR; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| return SLURM_ERROR; |
| break; |
| } |
| |
| end: |
| xfree(sys_cgpath); |
| return rc; |
| } |
| |
| extern int cgroup_p_system_addto(cgroup_ctl_type_t sub, pid_t *pids, int npids) |
| { |
| switch (sub) { |
| case CG_TRACK: |
| break; |
| case CG_CPUS: |
| return common_cgroup_add_pids(&int_cg[sub][CG_LEVEL_SYSTEM], |
| pids, npids); |
| case CG_MEMORY: |
| return common_cgroup_add_pids(&int_cg[sub][CG_LEVEL_SYSTEM], |
| pids, npids); |
| case CG_DEVICES: |
| break; |
| case CG_CPUACCT: |
| break; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| return SLURM_ERROR; |
| } |
| |
| error("This operation is not supported for %s", g_cg_name[sub]); |
| return SLURM_ERROR; |
| } |
| |
/*
 * Destroy the "system" cgroup of a controller (cpuset/memory only): move
 * our pid back to the root, delete the system directory, and — only when
 * everything succeeded — release the slurm/root handles and the namespace.
 */
extern int cgroup_p_system_destroy(cgroup_ctl_type_t sub)
{
	int rc = SLURM_SUCCESS;

	/*
	 * Note: we do not need to lock the root cgroup because the only user
	 * of this function is a single thread of slurmd.
	 */

	/* Another plugin may have already destroyed this subsystem. */
	if (!int_cg[sub][CG_LEVEL_SYSTEM].path)
		return SLURM_SUCCESS;

	/* Custom actions for every cgroup subsystem */
	switch (sub) {
	case CG_CPUS:
	case CG_MEMORY:
		break;
	case CG_TRACK:
	case CG_DEVICES:
	case CG_CPUACCT:
		/* Unsupported controllers are a no-op, not an error. */
		error("This operation is not supported for %s", g_cg_name[sub]);
		return SLURM_SUCCESS;
	default:
		error("cgroup subsystem %u not supported", sub);
		return SLURM_ERROR;
		break;
	}

	/* The rmdir below would fail while our own pid sits in the cg. */
	rc = common_cgroup_move_process(&int_cg[sub][CG_LEVEL_ROOT], getpid());
	if (rc != SLURM_SUCCESS) {
		error("Unable to move pid %d to root cgroup", getpid());
		goto end;
	}
	common_cgroup_wait_pid_moved(&int_cg[sub][CG_LEVEL_SYSTEM], getpid(),
				     g_cg_name[sub]);

	if ((rc = common_cgroup_delete(&int_cg[sub][CG_LEVEL_SYSTEM]))
	    != SLURM_SUCCESS) {
		log_flag(CGROUP, "not removing system cg (%s), there may be attached stepds: %m",
			 g_cg_name[sub]);
		goto end;
	}
	common_cgroup_destroy(&int_cg[sub][CG_LEVEL_SYSTEM]);
end:
	/* Only tear down the shared handles if everything above succeeded. */
	if (rc == SLURM_SUCCESS) {
		common_cgroup_destroy(&int_cg[sub][CG_LEVEL_SLURM]);
		common_cgroup_destroy(&int_cg[sub][CG_LEVEL_ROOT]);
		common_cgroup_ns_destroy(&g_cg_ns[sub]);
	}
	return rc;
}
| |
/*
 * Each call to this function counts as one active user of the step directories,
 * so the number of calls to this function must match the number of calls to
 * cgroup_p_step_destroy in each plugin.
 */
| extern int cgroup_p_step_create(cgroup_ctl_type_t sub, stepd_step_rec_t *step) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* |
| * Lock the root cgroup so we don't race with other steps that are being |
| * terminated, they could remove the directories while we're creating |
| * them. |
| */ |
| if (common_cgroup_lock(&int_cg[sub][CG_LEVEL_ROOT]) != SLURM_SUCCESS) { |
| error("common_cgroup_lock error"); |
| return SLURM_ERROR; |
| } |
| |
| /* Don't let other plugins destroy our structs. */ |
| g_step_active_cnt[sub]++; |
| |
| switch (sub) { |
| case CG_TRACK: |
| /* create a new cgroup for that container */ |
| if ((rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[sub], |
| int_cg[sub], |
| g_job_cgpath[sub], |
| g_step_cgpath[sub], |
| g_user_cgpath[sub])) |
| != SLURM_SUCCESS) |
| goto step_c_err; |
| break; |
| case CG_CPUS: |
| if ((rc = _cpuset_create(step))!= SLURM_SUCCESS) |
| goto step_c_err; |
| break; |
| case CG_MEMORY: |
| if ((rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[sub], |
| int_cg[sub], |
| g_job_cgpath[sub], |
| g_step_cgpath[sub], |
| g_user_cgpath[sub])) |
| != SLURM_SUCCESS) { |
| goto step_c_err; |
| } |
| if ((rc = common_cgroup_set_param(&int_cg[sub][CG_LEVEL_USER], |
| "memory.use_hierarchy", |
| "1")) != SLURM_SUCCESS) { |
| error("unable to set hierarchical accounting for %s", |
| g_user_cgpath[sub]); |
| _step_destroy_internal(sub, true); |
| break; |
| } |
| if ((rc = common_cgroup_set_param(&int_cg[sub][CG_LEVEL_JOB], |
| "memory.use_hierarchy", |
| "1")) != SLURM_SUCCESS) { |
| error("unable to set hierarchical accounting for %s", |
| g_job_cgpath[sub]); |
| _step_destroy_internal(sub, true); |
| break; |
| } |
| if ((rc = common_cgroup_set_param(&int_cg[sub][CG_LEVEL_STEP], |
| "memory.use_hierarchy", |
| "1") != SLURM_SUCCESS)) { |
| error("unable to set hierarchical accounting for %s", |
| int_cg[sub][CG_LEVEL_STEP].path); |
| _step_destroy_internal(sub, true); |
| break; |
| } |
| break; |
| case CG_DEVICES: |
| /* create a new cgroup for that container */ |
| if ((rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[sub], |
| int_cg[sub], |
| g_job_cgpath[sub], |
| g_step_cgpath[sub], |
| g_user_cgpath[sub])) |
| != SLURM_SUCCESS) |
| goto step_c_err; |
| break; |
| case CG_CPUACCT: |
| if ((rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[sub], |
| int_cg[sub], |
| g_job_cgpath[sub], |
| g_step_cgpath[sub], |
| g_user_cgpath[sub])) |
| != SLURM_SUCCESS) |
| goto step_c_err; |
| break; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| rc = SLURM_ERROR; |
| goto step_c_err; |
| } |
| common_cgroup_unlock(&int_cg[sub][CG_LEVEL_ROOT]); |
| return rc; |
| |
| step_c_err: |
| /* step cgroup is not created */ |
| common_cgroup_unlock(&int_cg[sub][CG_LEVEL_ROOT]); |
| g_step_active_cnt[sub]--; |
| return rc; |
| } |
| |
| extern int cgroup_p_step_addto(cgroup_ctl_type_t sub, pid_t *pids, int npids) |
| { |
| if (*g_step_cgpath[sub] == '\0') |
| return SLURM_ERROR; |
| |
| switch (sub) { |
| case CG_TRACK: |
| /* |
| * Stick slurmstepd pid to the newly created job container |
| * (Note: we do not put it in the step container because this |
| * container could be used to suspend/resume tasks using freezer |
| * properties so we need to let the slurmstepd outside of |
| * this one). |
| */ |
| if ((npids == 1) && (*pids == getpid())) { |
| return common_cgroup_add_pids( |
| &int_cg[sub][CG_LEVEL_JOB], pids, npids); |
| } |
| break; |
| case CG_CPUS: |
| case CG_MEMORY: |
| case CG_DEVICES: |
| break; |
| case CG_CPUACCT: |
| error("This operation is not supported for %s", g_cg_name[sub]); |
| return SLURM_ERROR; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| return SLURM_ERROR; |
| } |
| |
| return common_cgroup_add_pids(&int_cg[sub][CG_LEVEL_STEP], pids, npids); |
| } |
| |
| extern int cgroup_p_step_get_pids(pid_t **pids, int *npids) |
| { |
| if (*g_step_cgpath[CG_TRACK] == '\0') |
| return SLURM_ERROR; |
| |
| return common_cgroup_get_pids(&int_cg[CG_TRACK][CG_LEVEL_STEP], pids, |
| npids); |
| } |
| |
| extern int cgroup_p_step_suspend(void) |
| { |
| if (*g_step_cgpath[CG_TRACK] == '\0') |
| return SLURM_ERROR; |
| |
| return common_cgroup_set_param(&int_cg[CG_TRACK][CG_LEVEL_STEP], |
| "freezer.state", "FROZEN"); |
| } |
| |
| extern int cgroup_p_step_resume(void) |
| { |
| if (*g_step_cgpath[CG_TRACK] == '\0') |
| return SLURM_ERROR; |
| |
| return common_cgroup_set_param(&int_cg[CG_TRACK][CG_LEVEL_STEP], |
| "freezer.state", "THAWED"); |
| } |
| |
| static int _step_destroy_internal(cgroup_ctl_type_t sub, bool root_locked) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* |
| * Only destroy the step if we're the only ones using it. Log it unless |
| * loaded from slurmd, where we will not create any step but call fini. |
| */ |
| if (g_step_active_cnt[sub] == 0) { |
| error("called without a previous init. This shouldn't happen!"); |
| return SLURM_SUCCESS; |
| } |
| /* Only destroy the step if we're the only ones using it. */ |
| if (g_step_active_cnt[sub] > 1) { |
| g_step_active_cnt[sub]--; |
| log_flag(CGROUP, "Not destroying %s step dir, resource busy by %d other plugin", |
| g_cg_name[sub], g_step_active_cnt[sub]); |
| return SLURM_SUCCESS; |
| } |
| |
| /* Remove any possible task directories first */ |
| _all_tasks_destroy(sub); |
| |
| /* Custom actions for every cgroup subsystem */ |
| switch (sub) { |
| case CG_TRACK: |
| break; |
| case CG_CPUS: |
| break; |
| case CG_MEMORY: |
| break; |
| case CG_DEVICES: |
| break; |
| case CG_CPUACCT: |
| break; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| return SLURM_ERROR; |
| break; |
| } |
| |
| rc = _remove_cg_subsystem(int_cg[sub], g_cg_name[sub], root_locked); |
| |
| if (rc == SLURM_SUCCESS) { |
| g_step_active_cnt[sub] = 0; |
| g_step_cgpath[sub][0] = '\0'; |
| } |
| |
| return rc; |
| } |
| |
/*
 * Public entry point for step destruction; takes the root cgroup lock
 * internally (root_locked = false).
 */
extern int cgroup_p_step_destroy(cgroup_ctl_type_t sub)
{
	return _step_destroy_internal(sub, false);
}
| |
| /* |
| * Is the specified pid in our cgroup g_cg_ns[CG_TRACK]? |
| * In the future we may want to replace this with a get pids and a search. |
| */ |
| extern bool cgroup_p_has_pid(pid_t pid) |
| { |
| bool rc; |
| int rc2; |
| xcgroup_t cg; |
| |
| rc2 = xcgroup_ns_find_by_pid(&g_cg_ns[CG_TRACK], &cg, pid); |
| if (rc2 != SLURM_SUCCESS) |
| return false; |
| |
| rc = true; |
| if (xstrcmp(cg.path, int_cg[CG_TRACK][CG_LEVEL_STEP].path)) |
| rc = false; |
| |
| common_cgroup_destroy(&cg); |
| return rc; |
| } |
| |
/*
 * Walk up the cgroup hierarchy starting at cg->path and record, in
 * limits->limit_in_bytes, the first memory.limit_in_bytes that is a real
 * limit (i.e. not the "unlimited" sentinel). Recursion stops at the
 * hard-coded mount root "/sys/fs/cgroup" or when the memory controller
 * files disappear. NOTE: mutates cg->path while ascending — callers pass a
 * throwaway copy.
 */
static void _get_mem_recursive(xcgroup_t *cg, cgroup_limits_t *limits)
{
	char *mem_max = NULL, *tmp_str = NULL;
	size_t mem_sz;
	unsigned long mem_lim;
	/*
	 * Value memory.limit_in_bytes reports when unlimited; presumably
	 * matches the kernel's page-counter maximum — TODO confirm.
	 */
	unsigned long page_counter_max = LONG_MAX - sysconf(_SC_PAGE_SIZE) + 1;

	if (!xstrcmp(cg->path, "/sys/fs/cgroup"))
		goto end;

	/* Break when there is no memory controller anymore */
	if (common_cgroup_get_param(cg, "memory.limit_in_bytes",
				    &mem_max, &mem_sz) != SLURM_SUCCESS)
		goto end;

	/* Check ancestor */
	mem_lim = slurm_atoul(mem_max);
	if (mem_lim == page_counter_max) {
		/* Unlimited at this level: ascend one directory and retry. */
		tmp_str = xdirname(cg->path);
		xfree(cg->path);
		cg->path = tmp_str;
		_get_mem_recursive(cg, limits);
		if (limits->limit_in_bytes != NO_VAL64)
			goto end;
	} else {
		/* found it! */
		limits->limit_in_bytes = mem_lim;
	}
end:
	xfree(mem_max);
}
| |
/*
 * Read the current constraints of a controller at the given level into a
 * freshly allocated cgroup_limits_t. Returns NULL if the controller cannot
 * be initialized or (for CG_CPUS) if reading the cpuset files fails; the
 * caller owns the returned struct (free with cgroup_free_limits).
 */
extern cgroup_limits_t *cgroup_p_constrain_get(cgroup_ctl_type_t sub,
					       cgroup_level_t level)
{
	int rc = SLURM_SUCCESS;
	cgroup_limits_t *limits;
	xcgroup_t tmp_cg = { 0 };

	/* Only initialize if not inited */
	if (!g_cg_ns[sub].mnt_point && (rc = _cgroup_init(sub)))
		return NULL;

	limits = xmalloc(sizeof(*limits));
	cgroup_init_limits(limits);

	switch (sub) {
	case CG_TRACK:
		break;
	case CG_CPUS:
		if (common_cgroup_get_param(&int_cg[sub][level],
					    "cpuset.cpus",
					    &limits->allow_cores,
					    &limits->cores_size)
		    != SLURM_SUCCESS)
			rc = SLURM_ERROR;

		if (common_cgroup_get_param(&int_cg[sub][level],
					    "cpuset.mems",
					    &limits->allow_mems,
					    &limits->mems_size)
		    != SLURM_SUCCESS)
			rc = SLURM_ERROR;

		/* Strip the trailing newline read from the cgroup files. */
		if (limits->cores_size > 0)
			limits->allow_cores[(limits->cores_size)-1] = '\0';

		if (limits->mems_size > 0)
			limits->allow_mems[(limits->mems_size)-1] = '\0';

		if (rc != SLURM_SUCCESS)
			goto fail;
		break;
	case CG_MEMORY:
		/* Use a throwaway path copy: _get_mem_recursive mutates it. */
		tmp_cg.path = xstrdup(int_cg[sub][level].path);
		_get_mem_recursive(&tmp_cg, limits);
		xfree(tmp_cg.path);
		break;
	case CG_DEVICES:
		break;
	default:
		/*
		 * NOTE(review): rc is set here but the (default-initialized)
		 * limits struct is still returned non-NULL — confirm callers
		 * expect that.
		 */
		error("cgroup subsystem %u not supported", sub);
		rc = SLURM_ERROR;
		break;
	}

	return limits;
fail:
	cgroup_free_limits(limits);
	return NULL;
}
| |
/*
 * Apply the given constraints to a controller at the given level (for
 * CG_DEVICES at CG_LEVEL_TASK, to the tracked task cgroup identified by
 * limits->taskid). Returns SLURM_SUCCESS, or SLURM_ERROR if any write
 * failed; later writes are still attempted after an earlier failure.
 */
extern int cgroup_p_constrain_set(cgroup_ctl_type_t sub, cgroup_level_t level,
				  cgroup_limits_t *limits)
{
	int rc = SLURM_SUCCESS;
	task_cg_info_t *task_cg_info;
	char *dev_str = NULL;

	if (!limits)
		return SLURM_ERROR;

	switch (sub) {
	case CG_TRACK:
		break;
	case CG_CPUS:
		/* Do not try to set the cpuset limits of slurmd in this case */
		if ((level == CG_LEVEL_SYSTEM) &&
		    (slurm_conf.task_plugin_param & SLURMD_SPEC_OVERRIDE))
			break;

		if (level == CG_LEVEL_SYSTEM ||
		    level == CG_LEVEL_USER ||
		    level == CG_LEVEL_JOB ||
		    level == CG_LEVEL_STEP) {
			if (common_cgroup_set_param(&int_cg[sub][level],
						    "cpuset.cpus",
						    limits->allow_cores)
			    != SLURM_SUCCESS)
				rc = SLURM_ERROR;
		}

		/* cpuset.mems is not written at the system level. */
		if (level == CG_LEVEL_USER ||
		    level == CG_LEVEL_JOB ||
		    level == CG_LEVEL_STEP) {
			if (common_cgroup_set_param(&int_cg[sub][level],
						    "cpuset.mems",
						    limits->allow_mems)
			    != SLURM_SUCCESS)
				rc = SLURM_ERROR;
		}
		break;
	case CG_MEMORY:
		/* Do not try to set the memory limits of slurmd in this case */
		if ((level == CG_LEVEL_SYSTEM) &&
		    (slurm_conf.task_plugin_param & SLURMD_SPEC_OVERRIDE))
			break;

		/* Swappiness is only applied at the job level, if provided. */
		if ((level == CG_LEVEL_JOB) &&
		    (limits->swappiness != NO_VAL64)) {
			rc = common_cgroup_set_uint64_param(&int_cg[sub][level],
							"memory.swappiness",
							limits->swappiness);
		}

		if (level == CG_LEVEL_JOB ||
		    level == CG_LEVEL_STEP ||
		    level == CG_LEVEL_SYSTEM) {
			if (common_cgroup_set_uint64_param(
				    &int_cg[sub][level],
				    "memory.limit_in_bytes",
				    limits->limit_in_bytes)
			    != SLURM_SUCCESS)
				rc = SLURM_ERROR;
		}

		if (level == CG_LEVEL_JOB ||
		    level == CG_LEVEL_STEP) {
			if (common_cgroup_set_uint64_param(
				    &int_cg[sub][level],
				    "memory.soft_limit_in_bytes",
				    limits->soft_limit_in_bytes)
			    != SLURM_SUCCESS)
				rc = SLURM_ERROR;

			/* mem+swap limit is optional (NO_VAL64 = not set). */
			if (limits->memsw_limit_in_bytes != NO_VAL64)
				if (common_cgroup_set_uint64_param(
					    &int_cg[sub][level],
					    "memory.memsw.limit_in_bytes",
					    limits->memsw_limit_in_bytes)
				    != SLURM_SUCCESS)
					rc = SLURM_ERROR;
		}
		break;
	case CG_DEVICES:
		/* Same device string feeds both the allow and deny files. */
		dev_str = gres_device_id2str(&limits->device);
		if (level == CG_LEVEL_STEP ||
		    level == CG_LEVEL_JOB) {
			if (limits->allow_device) {
				if (common_cgroup_set_param(
					    &int_cg[sub][level],
					    "devices.allow",
					    dev_str)
				    != SLURM_SUCCESS)
					rc = SLURM_ERROR;
			} else {
				if (common_cgroup_set_param(
					    &int_cg[sub][level],
					    "devices.deny",
					    dev_str)
				    != SLURM_SUCCESS)
					rc = SLURM_ERROR;
			}
		}

		if (level == CG_LEVEL_TASK) {
			/* Task-level writes go to the tracked task cgroup. */
			task_cg_info = list_find_first(g_task_list[sub],
						       _find_task_cg_info,
						       &(limits->taskid));
			if (!task_cg_info) {
				error("Task %d is not being tracked in %s controller, cannot set constrain.",
				      limits->taskid, g_cg_name[sub]);
				rc = SLURM_ERROR;
				break;
			}

			if (limits->allow_device) {
				rc = common_cgroup_set_param(
					&task_cg_info->task_cg,
					"devices.allow",
					dev_str);
			} else {
				rc = common_cgroup_set_param(
					&task_cg_info->task_cg,
					"devices.deny",
					dev_str);
			}
		}
		break;
	default:
		error("cgroup subsystem %u not supported", sub);
		rc = SLURM_ERROR;
		break;
	}

	xfree(dev_str);
	return rc;
}
| |
/*
 * No-op in this plugin: nothing needs to happen after
 * cgroup_p_constrain_set() has written the limits, so always succeed.
 */
extern int cgroup_p_constrain_apply(cgroup_ctl_type_t sub, cgroup_level_t level,
				    uint32_t task_id)
{
	return SLURM_SUCCESS;
}
| |
| /* |
| * Code based on linux tools/cgroup/cgroup_event_listener.c with adapted |
| * modifications for Slurm logic and needs. |
| */ |
| static int _read_fd(int fd, uint64_t *buf) |
| { |
| int rc = SLURM_ERROR; |
| size_t len = sizeof(uint64_t); |
| uint64_t *buf_ptr = buf; |
| ssize_t nread; |
| |
| while (len > 0 && (nread = read(fd, buf_ptr, len)) != 0) { |
| if (nread == -1) { |
| if (errno == EINTR) |
| continue; |
| error("read(): %m"); |
| break; |
| } |
| len -= nread; |
| buf_ptr += nread; |
| } |
| |
| if (len == 0) |
| rc = SLURM_SUCCESS; |
| |
| return rc; |
| } |
| |
/*
 * OOM monitoring thread body.
 *
 * Polls the eventfd registered against memory.oom_control and accumulates
 * event counts into the global oom_kill_count (under oom_mutex), until a
 * STOP_OOM message arrives on oom_pipe[0] or an fd error occurs.
 *
 * x - xmalloc'd oom_event_args_t; this thread closes the fds it holds and
 *     frees the struct before exiting.
 */
static void *_oom_event_monitor(void *x)
{
	oom_event_args_t *args = (oom_event_args_t *) x;
	int ret = -1;
	uint64_t res;
	struct pollfd fds[2];

	debug("started.");

	/*
	 * POLLPRI should only be meaningful for event_fd, since according to
	 * the poll() man page it may indicate "cgroup.events" file modified.
	 *
	 * POLLRDHUP should only be meaningful for oom_pipe[0], since it refers
	 * to stream socket peer closed connection.
	 *
	 * POLLHUP is ignored in events member, and should be set by the Kernel
	 * in revents even if not defined in events.
	 *
	 */
	fds[0].fd = args->event_fd;
	fds[0].events = POLLIN | POLLPRI;

	fds[1].fd = oom_pipe[0];
	fds[1].events = POLLIN | POLLRDHUP;

	/*
	 * Poll event_fd for oom_kill events plus oom_pipe[0] for stop msg.
	 * Specifying a negative value in timeout means an infinite timeout.
	 */
	while (1) {
		ret = poll(fds, 2, -1);

		if (ret == -1) {
			/* Error. */
			if (errno == EINTR)
				continue;

			error("poll(): %m");
			break;
		} else if (ret == 0) {
			/* Should not happen since infinite timeout. */
			error("poll() timeout.");
			break;
		} else if (ret > 0) {
			if (fds[0].revents & (POLLIN | POLLPRI)) {
				/* event_fd readable: accumulate the count. */
				res = 0;
				ret = _read_fd(args->event_fd, &res);
				if (ret == SLURM_SUCCESS) {
					slurm_mutex_lock(&oom_mutex);
					debug3("res: %"PRIu64"", res);
					oom_kill_count += res;
					debug("oom-kill event count: %"PRIu64"",
					      oom_kill_count);
					slurm_mutex_unlock(&oom_mutex);
				} else
					error("cannot read oom-kill counts.");
			} else if (fds[0].revents & (POLLRDHUP | POLLERR |
						     POLLHUP | POLLNVAL)) {
				error("problem with event_fd");
				break;
			}

			if (fds[1].revents & POLLIN) {
				/* oom_pipe[0] readable. */
				res = 0;
				ret = _read_fd(oom_pipe[0], &res);
				if (ret == SLURM_SUCCESS && res == STOP_OOM) {
					/* Read stop msg. */
					log_flag(CGROUP, "stop msg read.");
					break;
				}
			} else if (fds[1].revents &
				   (POLLRDHUP | POLLERR | POLLHUP | POLLNVAL)) {
				error("problem with oom_pipe[0]");
				break;
			}
		}
	}

	/* Only log the "no events" case; the count itself is read later. */
	slurm_mutex_lock(&oom_mutex);
	if (!oom_kill_count)
		debug("No oom events detected.");
	slurm_mutex_unlock(&oom_mutex);

	/* This thread owns these fds and the args struct. */
	close(args->event_fd);
	close(args->efd);
	close(args->cfd);
	close(oom_pipe[0]);
	xfree(args);

	debug("stopping.");

	return NULL;
}
| |
/*
 * Start out-of-memory event accounting for the step's memory cgroup.
 *
 * If memory.oom_control already exposes an "oom_kill" counter, record that
 * it can be read directly later (OOM_KILL_COUNTER) and return. Otherwise,
 * register an eventfd through cgroup.event_control and spawn
 * _oom_event_monitor() to count events (OOM_KILL_MON).
 *
 * RET SLURM_SUCCESS if monitoring is in place, SLURM_ERROR otherwise.
 */
extern int cgroup_p_step_start_oom_mgr(stepd_step_rec_t *step)
{
	char *control_file = NULL, *event_file = NULL, *line = NULL;
	int rc = SLURM_SUCCESS, event_fd = -1, cfd = -1, efd = -1;
	oom_event_args_t *event_args;
	size_t sz;

	rc = common_cgroup_get_param(&int_cg[CG_MEMORY][CG_LEVEL_STEP],
				     "memory.oom_control",
				     &event_file,
				     &sz);

	if (rc != SLURM_SUCCESS) {
		error("Not monitoring OOM events, memory.oom_control could not be read.");
		return rc;
	}

	/*
	 * If oom_kill field is found we will read it from the cgroup interface,
	 * so don't start the oom thread.
	 */
	if (event_file) {
		line = xstrstr(event_file, "oom_kill ");
		xfree(event_file);
		if (line) {
			oom_kill_type = OOM_KILL_COUNTER;
			return SLURM_SUCCESS;
		}
	}

	/*
	 * Start a new OOM monitor thread, used in kernels which do not support
	 * memory.oom_control's oom_kill field (<=3.x).
	 */
	xstrfmtcat(control_file, "%s/%s", int_cg[CG_MEMORY][CG_LEVEL_STEP].path,
		   "memory.oom_control");

	if ((cfd = open(control_file, O_RDONLY | O_CLOEXEC)) == -1) {
		error("Cannot open %s: %m", control_file);
		rc = SLURM_ERROR;
		goto fini;
	}

	xstrfmtcat(event_file, "%s/%s", int_cg[CG_MEMORY][CG_LEVEL_STEP].path,
		   "cgroup.event_control");

	if ((efd = open(event_file, O_WRONLY | O_CLOEXEC)) == -1) {
		error("Cannot open %s: %m", event_file);
		rc = SLURM_ERROR;
		goto fini;
	}

	if ((event_fd = eventfd(0, EFD_CLOEXEC)) == -1) {
		error("eventfd: %m");
		rc = SLURM_ERROR;
		goto fini;
	}

	/* "<event_fd> <control_fd>" is the registration format expected by
	 * cgroup.event_control. */
	xstrfmtcat(line, "%d %d", event_fd, cfd);

	oom_kill_count = 0;

	safe_write(efd, line, strlen(line) + 1);

	if (pipe2(oom_pipe, O_CLOEXEC) == -1) {
		error("pipe(): %m");
		rc = SLURM_ERROR;
		goto fini;
	}

	/*
	 * Monitoring thread should be responsible for closing the fd's and
	 * freeing the oom_event_args_t struct and members.
	 */
	event_args = xmalloc(sizeof(oom_event_args_t));
	event_args->cfd = cfd;
	event_args->efd = efd;
	event_args->event_fd = event_fd;

	slurm_mutex_init(&oom_mutex);
	slurm_thread_create(&oom_thread, _oom_event_monitor, event_args);
	oom_kill_type = OOM_KILL_MON;

fini:
	xfree(line);
	if (oom_kill_type != OOM_KILL_MON) {
		/*
		 * Error cleanup: fds never opened are still -1 and close(-1)
		 * just fails with EBADF. NOTE(review): oom_pipe is closed
		 * even when pipe2() failed or was not reached — presumably
		 * the file-scope oom_pipe holds invalid fds then; confirm.
		 */
		close(event_fd);
		close(efd);
		close(cfd);
		close(oom_pipe[0]);
		close(oom_pipe[1]);
	}
	xfree(event_file);
	xfree(control_file);

	if (rc != SLURM_SUCCESS)
		error("Unable to register OOM notifications for %s",
		      int_cg[CG_MEMORY][CG_LEVEL_STEP].path);
	return rc;

rwfail:
	/* safe_write() jumps here on failure. */
	error("Cannot write to %s", event_file);
	rc = SLURM_ERROR;
	goto fini;
}
| |
| static uint64_t _failcnt(xcgroup_t *cg, char *param) |
| { |
| uint64_t value = 0; |
| |
| if (xcgroup_get_uint64_param(cg, param, &value) != SLURM_SUCCESS) { |
| log_flag(CGROUP, "unable to read '%s' from '%s'", |
| param, cg->path); |
| value = 0; |
| } |
| |
| return value; |
| } |
| |
| static int _get_oom_kill_from_file(xcgroup_t *cg) |
| { |
| char *oom_control = NULL, *ptr; |
| size_t sz; |
| uint64_t local_oom_kill_cnt = 0; |
| |
| if (common_cgroup_get_param(cg, "memory.oom_control", |
| &oom_control, &sz) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| if (oom_control) { |
| if ((ptr = xstrstr(oom_control, "oom_kill "))) { |
| if (sscanf(ptr, "oom_kill %"PRIu64, |
| &local_oom_kill_cnt) != 1) |
| error("Cannot parse oom_kill counter from %s memory.oom_control.", |
| cg->path); |
| } |
| xfree(oom_control); |
| log_flag(CGROUP, "Detected %"PRIu64" out-of-memory events in %s", |
| local_oom_kill_cnt, cg->path); |
| oom_kill_count += local_oom_kill_cnt; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
/*
 * Stop OOM accounting for the step and collect the results.
 *
 * Gathers memory.failcnt (plus memory.memsw.failcnt when swap accounting
 * is available) for the step and job cgroups, then, depending on how
 * monitoring was started:
 *  - OOM_KILL_COUNTER: read oom_kill directly from the task and step
 *    cgroup interfaces under the cgroup lock.
 *  - OOM_KILL_MON: send STOP_OOM to the monitor thread, join it, and read
 *    the accumulated counter under oom_mutex.
 *
 * RET an xmalloc'd cgroup_oom_t (caller frees), or NULL when monitoring
 * never started or the step cgroup could not be locked.
 */
extern cgroup_oom_t *cgroup_p_step_stop_oom_mgr(stepd_step_rec_t *step)
{
	cgroup_oom_t *results = NULL;
	uint64_t stop_msg;

	if (oom_kill_type == OOM_KILL_NONE) {
		error("OOM events were not monitored for %ps: couldn't read memory.oom_control or subscribe to its events.",
		      &step->step_id);
		return results;
	}

	if (common_cgroup_lock(&int_cg[CG_MEMORY][CG_LEVEL_STEP]) !=
	    SLURM_SUCCESS) {
		error("common_cgroup_lock error: %m");
		goto fail_oom_results;
	}

	results = xmalloc(sizeof(*results));

	if (cgroup_p_has_feature(CG_MEMCG_SWAP)) {
		results->step_memsw_failcnt = _failcnt(
			&int_cg[CG_MEMORY][CG_LEVEL_STEP],
			"memory.memsw.failcnt");
		results->job_memsw_failcnt = _failcnt(
			&int_cg[CG_MEMORY][CG_LEVEL_JOB],
			"memory.memsw.failcnt");
	}
	results->step_mem_failcnt = _failcnt(&int_cg[CG_MEMORY][CG_LEVEL_STEP],
					     "memory.failcnt");
	results->job_mem_failcnt = _failcnt(&int_cg[CG_MEMORY][CG_LEVEL_JOB],
					    "memory.failcnt");

	/*
	 * If there's no OOM Thread, try to read oom_kill from the interface and
	 * accumulate the kills of the step into the global counter which should
	 * already contain all the tasks kills.
	 */
	if (oom_kill_type == OOM_KILL_COUNTER) {
		cgroup_ctl_type_t ctl = CG_MEMORY;

		list_for_each(g_task_list[ctl], _acct_task, &ctl);
		if (_get_oom_kill_from_file(
			    &int_cg[CG_MEMORY][CG_LEVEL_STEP]) !=
		    SLURM_SUCCESS) {
			log_flag(CGROUP,
				 "OOM events were not monitored for %ps",
				 &step->step_id);
		}
		results->oom_kill_cnt = oom_kill_count;
		/* Counter mode never opened oom_pipe nor oom_mutex. */
		common_cgroup_unlock(&int_cg[CG_MEMORY][CG_LEVEL_STEP]);
		return results;
	}
	common_cgroup_unlock(&int_cg[CG_MEMORY][CG_LEVEL_STEP]);

	/*
	 * oom_thread created, but could have finished before we attempt
	 * to send the stop msg. If it finished, oom_thread should had
	 * closed the read endpoint of oom_pipe.
	 */
	stop_msg = STOP_OOM;
	safe_write(oom_pipe[1], &stop_msg, sizeof(stop_msg));

rwfail: /* Ignore safe_write issues. */
	log_flag(CGROUP, "attempt to join oom_thread.");
	slurm_thread_join(oom_thread);

	/* Thread has exited; take the final count under the mutex anyway. */
	slurm_mutex_lock(&oom_mutex);
	results->oom_kill_cnt = oom_kill_count;
	slurm_mutex_unlock(&oom_mutex);

fail_oom_results:
	/*
	 * NOTE(review): on the lock-failure path this closes oom_pipe[1] and
	 * destroys oom_mutex even in OOM_KILL_COUNTER mode, where neither
	 * was created — presumably harmless (EBADF / no-op); confirm.
	 */
	close(oom_pipe[1]);
	slurm_mutex_destroy(&oom_mutex);

	return results;
}
| |
| /*************************************** |
| ***** CGROUP TASK FUNCTIONS ***** |
| **************************************/ |
| extern int cgroup_p_task_addto(cgroup_ctl_type_t sub, stepd_step_rec_t *step, |
| pid_t pid, uint32_t task_id) |
| { |
| if (task_id > g_max_task_id) |
| g_max_task_id = task_id; |
| |
| log_flag(CGROUP, "%ps taskid %u max_task_id %u", |
| &step->step_id, task_id, g_max_task_id); |
| |
| return _handle_task_cgroup(sub, step, pid, task_id); |
| } |
| |
/*
 * Collect accounting statistics for one task from its cpuacct and memory
 * task cgroups.
 *
 * taskid - task to look up in g_task_list.
 *
 * RET an xmalloc'd cgroup_acct_t (caller frees), or NULL if the task's
 * cgroups are not being tracked. Fields still set to NO_VAL64 signal to
 * the caller that the corresponding interface could not be read.
 */
extern cgroup_acct_t *cgroup_p_task_get_acct_data(uint32_t taskid)
{
	char *cpu_time = NULL, *memory_stat = NULL, *ptr;
	char *memory_peak = NULL;
	size_t cpu_time_sz = 0, memory_stat_sz = 0, tmp_sz = 0;
	cgroup_acct_t *stats = NULL;
	xcgroup_t *task_cpuacct_cg = NULL;
	xcgroup_t *task_memory_cg = NULL;

	/* Find which task cgroup to use */
	task_memory_cg = list_find_first(g_task_list[CG_MEMORY],
					 _find_task_cg_info,
					 &taskid);
	task_cpuacct_cg = list_find_first(g_task_list[CG_CPUACCT],
					  _find_task_cg_info,
					  &taskid);

	/*
	 * We should always find the task cgroup; if we don't for some reason,
	 * just print an error and return.
	 */
	if (!task_cpuacct_cg) {
		error("Could not find task_cpuacct_cg, this should never happen");
		return NULL;
	}

	if (!task_memory_cg) {
		error("Could not find task_memory_cg, this should never happen");
		return NULL;
	}

	/*
	 * Initialize values, a NO_VAL64 will indicate to the caller that
	 * something happened here.
	 */
	stats = xmalloc(sizeof(*stats));
	stats->usec = NO_VAL64;
	stats->ssec = NO_VAL64;
	stats->total_rss = NO_VAL64;
	stats->total_pgmajfault = NO_VAL64;
	stats->total_vmem = NO_VAL64;
	stats->memory_peak = INFINITE64; /* As required in common_jag.c */

	/* cpuacct.stat: "user <ticks>\nsystem <ticks>" (USER_HZ units). */
	if (common_cgroup_get_param(task_cpuacct_cg, "cpuacct.stat", &cpu_time,
				    &cpu_time_sz) == SLURM_SUCCESS) {
		sscanf(cpu_time, "%*s %"PRIu64" %*s %"PRIu64,
		       &stats->usec, &stats->ssec);
	}

	/* memory.stat: pick the hierarchical total_* counters. */
	if (common_cgroup_get_param(task_memory_cg, "memory.stat", &memory_stat,
				    &memory_stat_sz) == SLURM_SUCCESS) {
		if ((ptr = xstrstr(memory_stat, "total_rss")))
			sscanf(ptr, "total_rss %"PRIu64, &stats->total_rss);
		if ((ptr = xstrstr(memory_stat, "total_pgmajfault")))
			sscanf(ptr, "total_pgmajfault %"PRIu64,
			       &stats->total_pgmajfault);
	}

	/*
	 * total_rss != NO_VAL64 implies memory_stat was read successfully,
	 * so it is safe to keep parsing it for the vmem estimate.
	 */
	if (stats->total_rss != NO_VAL64) {
		uint64_t total_cache = NO_VAL64, total_swap = NO_VAL64;

		if ((ptr = xstrstr(memory_stat, "total_cache")))
			sscanf(ptr, "total_cache %"PRIu64, &total_cache);
		if ((ptr = xstrstr(memory_stat, "total_swap")))
			sscanf(ptr, "total_swap %"PRIu64, &total_swap);

		/* vmem = rss + cache + swap, skipping unreadable parts. */
		stats->total_vmem = stats->total_rss;
		if (total_cache != NO_VAL64)
			stats->total_vmem += total_cache;
		if (total_swap != NO_VAL64)
			stats->total_vmem += total_swap;
	}

	/* In cgroup/v1, memory.peak is provided by memory.max_usage_in_bytes */
	if (common_cgroup_get_param(task_memory_cg,
				    "memory.max_usage_in_bytes",
				    &memory_peak,
				    &tmp_sz) != SLURM_SUCCESS) {
		log_flag(CGROUP, "Cannot read task %d memory.max_usage_in_bytes interface",
			 taskid);
	}
	if (memory_peak) {
		if (sscanf(memory_peak, "%"PRIu64, &stats->memory_peak) != 1)
			error("Cannot parse memory.max_usage_in_bytes interface");
	}

	xfree(cpu_time);
	xfree(memory_stat);
	xfree(memory_peak);

	return stats;
}
| |
/*
 * cgroup/v1 usec and ssec are provided in USER_HZ, so return the system
 * clock ticks per second as the unit of the accounting values.
 */
extern long int cgroup_p_get_acct_units(void)
{
	return jobacct_gather_get_clk_tck();
}
| |
| extern bool cgroup_p_has_feature(cgroup_ctl_feature_t f) |
| { |
| struct stat st; |
| int rc; |
| char *memsw_filepath = NULL; |
| static int swap_enabled = -1; |
| |
| /* Check if swap constrain capability is enabled in this system. */ |
| switch (f) { |
| case CG_MEMCG_SWAP: |
| if (swap_enabled == -1) { |
| xstrfmtcat(memsw_filepath, |
| "%s/memory/memory.memsw.limit_in_bytes", |
| slurm_cgroup_conf.cgroup_mountpoint); |
| rc = stat(memsw_filepath, &st); |
| xfree(memsw_filepath); |
| return (swap_enabled = (rc == 0)); |
| } else |
| return swap_enabled; |
| default: |
| break; |
| } |
| |
| return false; |
| } |
| |
/* Signaling cgroup members is not implemented by this plugin. */
extern int cgroup_p_signal(int signal)
{
	error("%s not implemented in %s", __func__, plugin_name);
	return SLURM_ERROR;
}
| |
/*
 * Not supported by this plugin: there is no event file to watch for a task
 * cgroup becoming empty, so always return NULL.
 */
extern char *cgroup_p_get_task_empty_event_path(uint32_t taskid,
						bool *on_modify)
{
	return NULL;
}
| |
/* Not supported by this plugin. */
extern int cgroup_p_is_task_empty(uint32_t taskid)
{
	return ESLURM_NOT_SUPPORTED;
}