/*****************************************************************************\
* cgroup_v2.c - Cgroup v2 plugin
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#define _GNU_SOURCE
#include <fcntl.h>
#include <mntent.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/inotify.h>
#include <poll.h>
#include <unistd.h>
#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"
#include "src/common/bitstring.h"
#include "src/common/fd.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/timers.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/common/daemonize.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmd/slurmd/slurmd.h"
#include "src/plugins/cgroup/common/cgroup_common.h"
#include "src/plugins/cgroup/v2/cgroup_dbus.h"
#include "src/plugins/cgroup/v2/ebpf.h"
#define SYSTEM_CGSLICE "system.slice"
#define SYSTEM_CGSCOPE "slurmstepd"
#define SYSTEM_CGDIR "system"
const char plugin_name[] = "Cgroup v2 plugin";
const char plugin_type[] = "cgroup/v2";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/* Internal cgroup structs */
static list_t *task_list;
static uint16_t step_active_cnt;
static xcgroup_ns_t int_cg_ns = { 0 };
static xcgroup_t int_cg[CG_LEVEL_CNT];
static bpf_program_t p[CG_LEVEL_CNT];
static char *stepd_scope_path = NULL;
static uint32_t task_special_id = NO_VAL;
static char *invoc_id;
static char *ctl_names[] = {
[CG_TRACK] = "freezer",
[CG_CPUS] = "cpuset",
[CG_MEMORY] = "memory",
[CG_CPUACCT] = "cpu",
[CG_DEVICES] = "devices",
/* Below are extra controllers not explicitly tracked by Slurm. */
[CG_IO] = "io",
[CG_HUGETLB] = "hugetlb",
[CG_PIDS] = "pids",
[CG_RDMA] = "rdma",
[CG_MISC] = "misc"
};
typedef struct {
xcgroup_t task_cg;
uint32_t taskid;
bpf_program_t p;
} task_cg_info_t;
typedef struct {
int npids;
pid_t *pids;
} foreach_pid_array_t;
extern bool cgroup_p_has_feature(cgroup_ctl_feature_t f);
extern int cgroup_p_task_addto(cgroup_ctl_type_t ctl, stepd_step_rec_t *step,
pid_t pid, uint32_t task_id);
/* Hierarchy will take this form:
* [int_cg_ns] [int_cg_ns]
* "slurmd service" "slurmtepds scope"
* root(delegated) root(delegated) [CG_LEVEL_ROOT]
* | / \
* | / |
* slurmd | job_x ... job_n [CG_LEVEL_JOB]
* system |
* (waiting area |
* for new stepds) |
* step_0 ... step_n [CG_LEVEL_STEP]
* / \
* [CG_LEVEL_STEP_USER] user_processes slurm_processes [CG_LEVEL_STEP_SLURM]
* / (slurmstepds)
* /
* |
* task_special...task_0...task_n [CG_LEVEL_TASK] (user pids)
* (task_id = NO_VAL)
*/
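/*
 * An illustrative on-disk layout of the above (assuming default paths,
 * a hypothetical job 123 and step 0):
 *
 *   /sys/fs/cgroup/system.slice/slurmstepd.scope             [CG_LEVEL_ROOT]
 *   /sys/fs/cgroup/system.slice/slurmstepd.scope/system      (stepd parking)
 *   /sys/fs/cgroup/system.slice/slurmstepd.scope/job_123     [CG_LEVEL_JOB]
 *   .../job_123/step_0                                       [CG_LEVEL_STEP]
 *   .../job_123/step_0/slurm                                 [CG_LEVEL_STEP_SLURM]
 *   .../job_123/step_0/user                                  [CG_LEVEL_STEP_USER]
 *   .../job_123/step_0/user/task_0                           [CG_LEVEL_TASK]
 */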
/*
* Get the cgroup root mountpoint for a given mount path and pid.
*
 * This function parses /proc/<pid>/mountinfo and returns the 4th field of
 * the line whose 5th field equals the mount parameter.
*
* From man proc_pid_mountinfo about 4th and 5th fields of mountinfo:
* (4) root: the pathname of the directory in the filesystem which forms the
* root of this mount.
* (5) mount point: the pathname of the mount point relative to the process's
* root directory.
*
* This is used primarily to get the real mount for a cgroup filesystem as in
* some specific containerized environments the real root of the cgroup
 * filesystem may not coincide with what we get in /proc/1/cgroup.
*
 * This only checks the first occurrence of the mount, as it will always be
 * the proper one: this file is written sequentially, so the "real"
 * /sys/fs/cgroup will appear first. Any bind mounts to it will appear later,
 * and those bind mounts do not affect the /proc/<pid>/cgroup data.
*
* Example:
* - For mount = "/sys/fs/cgroup" and pid 123, we find the following line in
* /proc/123/mountinfo, so as the 5th field matches mount, we will return the
* 4th field "/":
* 475 337 0:28 / /sys/fs/cgroup rw(...) - cgroup2 cgroup2 rw,nsdelegate(...)
*
* - If we get a line like this, we will return "/../../../../../..".
* 379 377 0:28 /../../../../../.. /sys/fs/cgroup rw(...) - cgroup2(...)
*
* IN mount - Path to match with the 5th field of mountinfo string.
* IN pid_str - Pid to look for the mountinfo.
* OUT data - NULL if not found, or a xmalloc'ed string with a copy of the
* 4th field of the line which matches mount with the 5th field.
*/
static char *_get_root_mount_mountinfo(char *mount, char *pid_str)
{
char *path = NULL, *line = NULL, *word, *data = NULL, *save_ptr = NULL;
size_t len = 0;
int count = 0;
FILE *f;
bool found = false;
path = xstrdup_printf("/proc/%s/mountinfo", pid_str);
f = fopen(path, "r");
xfree(path);
	/* fatal() never returns, so no cleanup is needed here. */
	if (f == NULL)
		fatal("cannot read /proc/%s/mountinfo contents: %m", pid_str);
while (!found && getline(&line, &len, f) != -1) {
if (xstrstr(line, mount)) {
count = 0;
word = strtok_r(line, " ", &save_ptr);
while (word) {
/*
* The 4th value is the root of the mount, and
* the 5th is the mount, so we want to get
* the 4th and ensure that the 5th is exactly
* equal to mount, so that we are not looking
* into a subdirectory.
*/
if (count == 3) {
data = word;
word = strtok_r(NULL, " ", &save_ptr);
if (!xstrcmp(word, mount)) {
data = xstrdup(data);
found = true;
break;
}
}
count++;
word = strtok_r(NULL, " ", &save_ptr);
}
}
}
free(line);
fclose(f);
if (!data) {
error("Could not parse '%s' root mount for %s", mount, pid_str);
}
return data;
}
/*
* Check whether path is a valid cgroup2 mountpoint. This also checks that the
* cgroup mount passed is usable in the current cgroup2 namespace.
*
* IN path - Path to cgroup2 mountpoint.
*/
static bool _is_cgroup2_mount(char *path)
{
FILE *fp = setmntent("/proc/mounts", "r");
struct mntent *mnt;
char *minfo = NULL;
bool rc = false;
if (!fp) {
error("Failed to open /proc/mounts");
return rc;
}
while ((mnt = getmntent(fp))) {
if (!xstrcmp(mnt->mnt_dir, path) &&
!xstrcmp(mnt->mnt_type, "cgroup2")) {
rc = true;
break;
}
}
if (!rc) {
error("The cgroup mountpoint %s is not mounted", path);
goto end;
}
minfo = _get_root_mount_mountinfo(path, "self");
if (xstrcmp(minfo, "/"))
error("The cgroup mountpoint does not align with the current namespace. Please, ensure all namespaces are correctly mounted. Refer to the slurm cgroup_v2 documentation.");
end:
xfree(minfo);
endmntent(fp);
return rc;
}
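/*
 * For reference, a /proc/mounts line that satisfies the check above would
 * look like this (illustrative; mount flags may vary by system):
 *
 *   cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate 0 0
 */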
/*
* Read /proc/<pid>/cgroup and return the absolute cgroup path of the given pid.
*
* We will deal with different cases. For example:
*
* In regular systems we expect one single line like this:
* "0::/init.scope\n"
*
* In some containerized environments it could look like:
* "0::/docker.slice/docker-<some UUID>.scope/init.scope"
*
* Or in a cgroup namespace:
* "0::/"
*
 * This function just strips the initial "0::" portion and the trailing
 * newline, then adds the cgroup mountpoint prefix. (Stripping the last path
 * component, e.g. "init.scope", is done by the caller when needed.)
*
* In Unified hierarchies this must contain only one line. If there are more
* lines this would mean we are in Hybrid or in Legacy cgroup. We do not support
* hybrid mode, so if we find more than one line we fatal.
*
* The Cgroup v2 documented way to know which is the cgroup root for a
* process in the cgroup hierarchy is just to read /proc/<pid>/cgroup.
*
* The parameter pid_str is a string representing a numeric pid or the
* keyword 'self'. (Note: if we are in a cgroup namespace without a proper proc
* mount, using 'self' will possibly return a different value than using
* getpid()).
*
* IN pid_str - pid to read the path for
* OUT ret - xmalloc'ed string containing the cgroup path for the passed pid
* read from /proc/<pid>/cgroup.
*/
static char *_get_proc_cg_path(char *pid_str)
{
char *buf, *start = NULL, *p, *ret = NULL;
char *path = NULL, *minfo = NULL;
size_t sz;
path = xstrdup_printf("/proc/%s/cgroup", pid_str);
if (common_file_read_content(path, &buf, &sz) != SLURM_SUCCESS) {
xfree(path);
fatal("cannot read /proc/%s/cgroup contents: %m", pid_str);
}
xfree(path);
/*
	 * In Unified mode there will be just one line containing the path
	 * of the cgroup and starting with 0. If there is more than one line,
	 * some v1 cgroups are mounted, which we do not support.
*/
if (buf && (buf[0] != '0'))
fatal("Hybrid mode is not supported. Mounted cgroups are: %s",
buf);
/*
* Skip until past the :: from the file ensuring that we are not past
* the buffer size.
*/
if ((p = xstrchr(buf, ':')) != NULL) {
if ((p + 2) < (buf + sz - 1))
start = p + 2;
/* Remove everything after the first newline found. */
if ((p = xstrchr(start, '\n')))
*p = '\0';
}
if (!start || (*start == '\0'))
fatal("Unexpected format found in /proc/%s/cgroup file: %s",
pid_str, buf);
/* Start the return string with the mount point of the cgroup. */
ret = xstrdup(slurm_cgroup_conf.cgroup_mountpoint);
/*
* Only check mountinfo in case that the cgroup file points to a
* location that is not the root of the cgroup mountpoint (/).
*/
if (xstrcmp(start, "/")) {
/*
* Check for correct /proc and cgroup mounts when we are in a
* cgroup namespace by checking mountinfo.
*/
minfo = _get_root_mount_mountinfo(
slurm_cgroup_conf.cgroup_mountpoint,
pid_str);
/*
* If minfo is "/" our root is
* slurm_cgroup_conf.cgroup_mountpoint.
*
* If minfo contains something different than "/":
* For containers with remounted cgroups, mountinfo would've
* returned a string different than "/", so we first need to
* ensure that the minfo is a substring of what we've read in
* /proc/pid/cgroup.
*
* If minfo content is not a substring of our /proc/pid/cgroup
* (e.g. minfo is "../../.." and /proc/pid/cgroup is
* 0::/something), we're in a wrong situation.
*/
if (xstrcmp(minfo, "/")) {
/*
* If the information of /proc/pid/mountinfo is not a
* substring of the one in /proc/pid/cgroup, it means
* that something is wrong. For example we are in a pid
* and a cgroup namespace without /proc properly mounted.
*/
if (xstrstr(start, minfo))
start = start + strlen(minfo);
else
fatal("mismatch found in /proc/%s/mountinfo: \"%s\" vs /proc/%s/cgroup: \"%s\". Please check that procfs and cgroupfs are correctly mounted in the namespace.",
pid_str, minfo, pid_str, start);
}
/* Append the sanitized path to the cgroup mountpoint. */
xstrcat(ret, start);
xfree(minfo);
}
xfree(buf);
return ret;
}
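/*
 * Example transformation done by _get_proc_cg_path(), assuming
 * CgroupMountpoint=/sys/fs/cgroup:
 *
 *   /proc/self/cgroup contains: "0::/system.slice/slurmd.service\n"
 *   returned value:             "/sys/fs/cgroup/system.slice/slurmd.service"
 */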
/*
* Get the absolute OS's cgroup root directory by reading /proc/1/cgroup path.
*
* In normal systems the final path will look like this:
* /sys/fs/cgroup[/]
*
* In containerized environments it will look like:
* /sys/fs/cgroup[/docker.slice/docker-<some UUID>.scope]
*
*/
static char *_get_init_cg_path()
{
char *cg_path, *ret = NULL;
cg_path = _get_proc_cg_path("1");
if (xstrcmp(cg_path, slurm_cgroup_conf.cgroup_mountpoint)) {
ret = xdirname(cg_path);
xfree(cg_path);
} else {
ret = cg_path;
}
return ret;
}
/*
* Fill up the internal cgroup namespace object. This mainly contains the path
* to what will be our root cgroup.
* E.g. /sys/fs/cgroup/system.slice/node1_slurmstepd.scope/ for slurmstepd.
*/
static void _set_int_cg_ns()
{
int_cg_ns.init_cg_path = _get_init_cg_path();
/*
* When started manually in a container and reconfiguring, if we are pid
* 1 we can directly get the cgroup as it has been configured in our
* previous instance.
*/
if (slurm_cgroup_conf.ignore_systemd && getenv("SLURMD_RECONF") &&
(getpid() == 1)) {
stepd_scope_path = xdirname(int_cg_ns.init_cg_path);
int_cg_ns.mnt_point = xstrdup(int_cg_ns.init_cg_path);
return;
}
#ifdef MULTIPLE_SLURMD
xstrfmtcat(stepd_scope_path, "%s/%s/%s_%s.scope",
int_cg_ns.init_cg_path, SYSTEM_CGSLICE, conf->node_name,
SYSTEM_CGSCOPE);
#else
xstrfmtcat(stepd_scope_path, "%s/%s/%s.scope", int_cg_ns.init_cg_path,
SYSTEM_CGSLICE, SYSTEM_CGSCOPE);
#endif
int_cg_ns.mnt_point = _get_proc_cg_path("self");
}
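/*
 * Illustrative result of _set_int_cg_ns() for a slurmd started by systemd on
 * a hypothetical node "node1" (MULTIPLE_SLURMD build):
 *
 *   int_cg_ns.init_cg_path = /sys/fs/cgroup
 *   stepd_scope_path       = /sys/fs/cgroup/system.slice/node1_slurmstepd.scope
 *   int_cg_ns.mnt_point    = /sys/fs/cgroup/system.slice/slurmd.service
 */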
/*
* For each available controller, enable it in this path. This operation is
* only intended to be done in the Domain controllers, never in a leaf where
* processes reside. If it is done in a leaf it *won't be possible* to add any
* pid to it. Enabling the controllers will make their interfaces available
* (e.g. the memory.*, cpu.*, cpuset.* ... files) to control the cgroup.
*/
static int _enable_subtree_control(char *path, bitstr_t *ctl_bitmap)
{
int i, rc = SLURM_SUCCESS, rc2;
char *content = NULL, *file_path = NULL;
xassert(ctl_bitmap);
xstrfmtcat(file_path, "%s/cgroup.subtree_control", path);
for (i = 0; i < CG_CTL_CNT; i++) {
if (!bit_test(ctl_bitmap, i))
continue;
xstrfmtcat(content, "+%s", ctl_names[i]);
rc2 = common_file_write_content(file_path, content,
strlen(content));
if (rc2 != SLURM_SUCCESS) {
/*
* In a container it is possible that part of the
* cgroup tree is mounted in read-only mode, so skip
* the parts that we cannot touch.
*/
if (errno == EROFS) {
log_flag(CGROUP,
"Cannot enable %s in %s, skipping: %m",
ctl_names[i], file_path);
} else {
/* Controller won't be available. */
error("Cannot enable %s in %s: %m",
ctl_names[i], file_path);
bit_clear(ctl_bitmap, i);
rc = SLURM_ERROR;
}
} else {
log_flag(CGROUP, "Enabled %s controller in %s",
ctl_names[i], file_path);
}
xfree(content);
}
xfree(file_path);
return rc;
}
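/*
 * For reference, each write above is roughly equivalent to this illustrative
 * shell command, with one "+<controller>" token per write(2):
 *
 *   echo "+memory" > /sys/fs/cgroup/system.slice/cgroup.subtree_control
 */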
static int _get_controllers(char *path, bitstr_t *ctl_bitmap)
{
char *buf = NULL, *ptr, *save_ptr, *ctl_filepath = NULL, *extra;
size_t sz;
xassert(ctl_bitmap);
	/* Remove the extra controllers if not explicitly requested. */
extra = slurm_cgroup_conf.enable_extra_controllers;
if (!xstrstr(extra, "all")) {
if (extra) {
for (int i = CG_IO; i < CG_CTL_CNT; i++) {
if (!xstrstr(extra, ctl_names[i])) {
ctl_names[i] = "";
}
}
} else {
for (int i = CG_IO; i < CG_CTL_CNT; i++)
ctl_names[i] = "";
}
}
xstrfmtcat(ctl_filepath, "%s/cgroup.controllers", path);
if (common_file_read_content(ctl_filepath, &buf, &sz) !=
SLURM_SUCCESS || !buf) {
error("cannot read %s: %m", ctl_filepath);
xfree(ctl_filepath);
return SLURM_ERROR;
}
xfree(ctl_filepath);
if (buf[sz - 1] == '\n')
buf[sz - 1] = '\0';
ptr = strtok_r(buf, " ", &save_ptr);
while (ptr) {
for (int i = 0; i < CG_CTL_CNT; i++) {
if (!xstrcmp(ctl_names[i], ""))
continue;
if (!xstrcasecmp(ctl_names[i], ptr)) {
bit_set(ctl_bitmap, i);
break;
}
}
ptr = strtok_r(NULL, " ", &save_ptr);
}
xfree(buf);
for (int i = 0; i < CG_CTL_CNT; i++) {
if ((i == CG_DEVICES) || (i == CG_TRACK))
continue;
if (invoc_id && !bit_test(ctl_bitmap, i) &&
xstrcmp(ctl_names[i], ""))
error("Controller %s is not enabled!", ctl_names[i]);
}
return SLURM_SUCCESS;
}
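/*
 * Example cgroup.controllers content matched against ctl_names[] above (the
 * exact set depends on the kernel config and what the parent delegated):
 *
 *   cpuset cpu io memory hugetlb pids rdma misc
 */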
/*
 * Enable the controllers set in system_ctrls at every level from the cgroup
 * mountpoint down to the parent of cg_path. system_ctrls must not be NULL;
 * callers typically fill it from <mountpoint>/cgroup.controllers.
*/
static int _enable_controllers(char *cg_path, bitstr_t *system_ctrls)
{
int rc = SLURM_SUCCESS;
char *p, *dst;
xassert(system_ctrls);
if (!(xstrstr(cg_path, slurm_cgroup_conf.cgroup_mountpoint))) {
error("%s is not under the cgroup mountpoint %s.",
cg_path, slurm_cgroup_conf.cgroup_mountpoint);
return SLURM_ERROR;
}
p = dst = xstrdup(cg_path);
p += strlen(slurm_cgroup_conf.cgroup_mountpoint);
do {
*p = '\0';
if ((rc = _enable_subtree_control(dst, system_ctrls)))
goto cleanup;
*p = '/';
} while ((p = xstrchr(p + 1, '/')));
cleanup:
xfree(dst);
return rc;
}
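/*
 * Example walk done by _enable_controllers() for
 * cg_path=/sys/fs/cgroup/system.slice/node1_slurmstepd.scope: subtree control
 * is enabled in /sys/fs/cgroup and /sys/fs/cgroup/system.slice, but not in
 * the scope itself, which may be a leaf holding pids.
 */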
/*
 * Enabling the subtree from the top mountpoint down to the slice where we
 * will reside is needed to get all the controllers we want to support.
 * Nevertheless, note
* that if systemd is reloaded, reset, or does any operation that implies
* traversing the cgroup tree matching its internal database, and there's no
* service started with Delegate=yes (like running this slurmd manually), the
* controllers can eventually be deactivated without warning by systemd.
*
 * Also note that starting any service or scope with Delegate=yes in the
 * slice where we want to live will usually make systemd automatically
 * activate the controllers in the tree, making this operation redundant.
*/
static int _enable_system_controllers()
{
char *slice_path = NULL;
bitstr_t *system_ctrls = bit_alloc(CG_CTL_CNT);
int rc = SLURM_ERROR;
if (_get_controllers(slurm_cgroup_conf.cgroup_mountpoint,
system_ctrls) != SLURM_SUCCESS) {
error("Could not obtain system controllers from %s",
slurm_cgroup_conf.cgroup_mountpoint);
goto end;
}
if (_enable_controllers(int_cg_ns.mnt_point, system_ctrls) !=
SLURM_SUCCESS) {
error("Could not enable controllers for cgroup path %s",
int_cg_ns.mnt_point);
goto end;
}
/*
	 * Enable it for system.slice, where the stepd scope will reside when
	 * it is created later. Do not do it when IgnoreSystemd is set, as it
	 * will be done when the stepd_scope_path is created.
*/
if (!slurm_cgroup_conf.ignore_systemd) {
slice_path = xdirname(stepd_scope_path);
if (_enable_subtree_control(slice_path, system_ctrls) !=
SLURM_SUCCESS) {
error("Could not enable subtree control at %s",
slice_path);
goto end;
}
}
rc = SLURM_SUCCESS;
end:
xfree(slice_path);
FREE_NULL_BITMAP(system_ctrls);
return rc;
}
/*
* Read the cgroup.controllers file of the root to detect which are the
* available controllers in this system.
*/
static int _setup_controllers()
{
/* Field not used in v2 */
int_cg_ns.subsystems = NULL;
/*
* Check all the available controllers in this system and enable them in
* every level of the cgroup tree if EnableControllers=yes.
* Normally, if the unit we're starting up has a Delegate=yes, systemd
* will set the cgroup.subtree_controllers of the parent with all the
* available controllers on that level, making all of them available on
* our unit automatically. In some situations, like if the parent cgroup
* doesn't have write permissions or if it started with fewer
* controllers available than the ones on the system (when the
* grandfather doesn't have subtree_control set), that won't happen and
* we may need Enablecontrollers. This may happen in containers.
*/
if (running_in_slurmd() && slurm_cgroup_conf.enable_controllers)
_enable_system_controllers();
/* Get the controllers on our namespace. */
return _get_controllers(int_cg_ns.mnt_point,
int_cg_ns.avail_controllers);
}
static int _rmdir_task(void *x, void *arg)
{
task_cg_info_t *t = (task_cg_info_t *) x;
if (common_cgroup_delete(&t->task_cg) != SLURM_SUCCESS)
log_flag(CGROUP, "Failed to delete %s: %m", t->task_cg.path);
return SLURM_SUCCESS;
}
static int _find_task_cg_info(void *x, void *key)
{
task_cg_info_t *task_cg = (task_cg_info_t *)x;
uint32_t taskid = *(uint32_t*)key;
if (task_cg->taskid == taskid)
return 1;
return 0;
}
static void _free_task_cg_info(void *x)
{
task_cg_info_t *task_cg = (task_cg_info_t *)x;
if (task_cg) {
common_cgroup_destroy(&task_cg->task_cg);
free_ebpf_prog(&task_cg->p);
xfree(task_cg);
}
}
static void _all_tasks_destroy()
{
	/* Empty the list of accounted tasks, doing a best effort on rmdir */
(void) list_delete_all(task_list, _rmdir_task, NULL);
}
static int _get_task_pids(void *x, void *key)
{
task_cg_info_t *task_cg_info = (task_cg_info_t *)x;
foreach_pid_array_t *pid_array = key;
pid_t *pids = NULL;
int npids = 0;
xassert(pid_array);
common_cgroup_get_pids(&task_cg_info->task_cg, &pids, &npids);
if (pid_array->pids) {
xrecalloc(pid_array->pids, (pid_array->npids + npids),
sizeof(*pid_array->pids));
memcpy((pid_array->pids + pid_array->npids), pids,
sizeof(*pid_array->pids) * npids);
pid_array->npids += npids;
} else {
pid_array->pids = pids;
pids = NULL;
pid_array->npids = npids;
}
xfree(pids);
return SLURM_SUCCESS;
}
static int _find_pid_task(void *x, void *key)
{
task_cg_info_t *task_cg_info = (task_cg_info_t *)x;
pid_t pid = *(pid_t *) key;
pid_t *pids = NULL;
int npids = 0;
bool found = false;
if (common_cgroup_get_pids(&task_cg_info->task_cg, &pids, &npids) !=
SLURM_SUCCESS)
return false;
for (int i = 0; i < npids; i++) {
if (pids[i] == pid) {
found = true;
break;
}
}
xfree(pids);
return found;
}
/*
* Check the "populated" key in the cgroup.events file
* Returns CGROUP_EMPTY, CGROUP_POPULATED, or SLURM_ERROR.
*/
static int _is_cgroup_empty(xcgroup_t *cg)
{
char *events_content = NULL, *ptr;
int rc;
int populated = -1;
size_t size;
/* Check if cgroup is empty in the first place. */
if (common_cgroup_get_param(cg, "cgroup.events", &events_content,
&size) != SLURM_SUCCESS) {
error("Cannot read %s/cgroup.events", cg->path);
return SLURM_ERROR;
}
if (!events_content) {
error("%s/cgroup.events is empty", cg->path);
return SLURM_ERROR;
}
if (!(ptr = xstrstr(events_content, "populated"))) {
error("Could not find \"populated\" field in %s/cgroup.events: \"%s\"",
cg->path, events_content);
xfree(events_content);
return SLURM_ERROR;
}
	if (sscanf(ptr, "populated %d", &populated) != 1) {
		error("Could not parse value for \"populated\" field in %s/cgroup.events (\"%s\")",
		      cg->path, events_content);
		xfree(events_content);
		return SLURM_ERROR;
	}
xfree(events_content);
switch (populated) {
case 0:
return CGROUP_EMPTY;
case 1:
return CGROUP_POPULATED;
default:
error("Cannot determine if %s is empty.", cg->path);
break;
}
return SLURM_ERROR;
}
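/*
 * Example cgroup.events content parsed above (a populated cgroup):
 *
 *   populated 1
 *   frozen 0
 */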
static void _wait_cgroup_empty(xcgroup_t *cg, int timeout_ms)
{
char *cgroup_events = NULL;
int rc, fd, wd, populated = -1;
struct pollfd pfd[1];
populated = _is_cgroup_empty(cg);
if (populated == SLURM_ERROR) {
error("Cannot determine if %s is empty.", cg->path);
return;
	} else if (populated == CGROUP_EMPTY) /* We're done */
		return;
/*
	 * Cgroup is not empty, so wait for a while just monitoring any change
	 * on cgroup.events. Changing populated from 1 to 0 is what we expect.
*/
xstrfmtcat(cgroup_events, "%s/cgroup.events", cg->path);
/* Initialize an inotify monitor */
fd = inotify_init();
if (fd < 0) {
error("Cannot initialize inotify for checking cgroup events: %m");
return;
}
/* Set the file and events we want to monitor. */
wd = inotify_add_watch(fd, cgroup_events, IN_MODIFY);
if (wd < 0) {
error("Cannot add watch events to %s: %m", cgroup_events);
goto end_inotify;
}
/* Wait for new events. */
pfd[0].fd = fd;
pfd[0].events = POLLIN;
rc = poll(pfd, 1, timeout_ms);
/*
* We don't really care about the event details, just check now if the
* cg event file contains what we're looking for.
*/
if (rc < 0)
error("Error polling for event in %s: %m", cgroup_events);
else if (rc == 0)
error("Timeout waiting for %s to become empty.", cgroup_events);
/* Check if cgroup is empty again. */
populated = _is_cgroup_empty(cg);
if (populated == SLURM_ERROR)
error("Cannot determine if %s is empty.", cg->path);
else if (populated == CGROUP_POPULATED)
log_flag(CGROUP, "Cgroup %s is not empty.", cg->path);
end_inotify:
close(fd);
xfree(cgroup_events);
}
/*
 * dbus requests are asynchronous and batched, so we cannot know when the
 * scope will be ready unless we wait for the cgroup directories to be created
 * and for the pid to show up in cgroup.procs.
 *
 * The waiting time depends entirely on the time systemd takes to complete
 * such operations.
*/
static int _wait_scope_ready(xcgroup_t scope_root, pid_t pid, uint32_t t)
{
DEF_TIMERS;
bool found = false;
int rc, npids, retries = 0;
pid_t *pids;
	uint32_t timeout = t * 1000; /* msec to usec */
struct stat sb;
struct timeval start_tv;
START_TIMER;
gettimeofday(&start_tv, NULL);
/* Wait for the scope directory to show up. */
do {
rc = stat(scope_root.path, &sb);
if (!rc)
break;
if ((rc < 0) && (errno != ENOENT)) {
error("stat() error checking for %s after dbus call: %m",
scope_root.path);
return SLURM_ERROR;
}
retries++;
if (slurm_delta_tv(&start_tv) > timeout)
goto dbus_timeout;
poll(NULL, 0, 10);
} while (true);
END_TIMER;
log_flag(CGROUP, "Took %s and %d retries for scope dir %s to show up.",
TIME_STR, retries, scope_root.path);
/* Wait for the pid to show up in cgroup.procs */
START_TIMER;
retries = 0;
do {
common_cgroup_get_pids(&scope_root, &pids, &npids);
for (int i = 0; i < npids; i++) {
if (pids[i] == pid) {
found = true;
break;
}
}
xfree(pids);
retries++;
if (!found) {
if (slurm_delta_tv(&start_tv) > timeout)
goto dbus_timeout;
poll(NULL, 0, 10);
}
} while (!found);
END_TIMER;
log_flag(CGROUP, "Took %s and %d retries for pid %d to show up in %s/cgroup.procs.",
TIME_STR, retries, pid, scope_root.path);
log_flag(CGROUP, "Scope initialization complete after %d msec",
(slurm_delta_tv(&start_tv)/1000));
return SLURM_SUCCESS;
dbus_timeout:
END_TIMER;
error("Scope initialization timeout after %s", TIME_STR);
return SLURM_ERROR;
}
static int _init_stepd_system_scope(pid_t pid)
{
char *system_dir = "/" SYSTEM_CGDIR;
char *self_cg_path;
if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_SYSTEM],
system_dir, (uid_t) 0, (gid_t) 0) !=
SLURM_SUCCESS) {
error("unable to create system cgroup %s", system_dir);
return SLURM_ERROR;
}
if (common_cgroup_instantiate(&int_cg[CG_LEVEL_SYSTEM]) !=
SLURM_SUCCESS) {
error("Unable to instantiate system %s cgroup", system_dir);
return SLURM_ERROR;
}
if (common_cgroup_move_process(&int_cg[CG_LEVEL_SYSTEM], pid) !=
SLURM_SUCCESS) {
error("Unable to attach pid %d to %s cgroup.", pid, system_dir);
return SLURM_ERROR;
}
	/* Now check we're really where we belong. */
self_cg_path = _get_proc_cg_path("self");
if (xstrcmp(self_cg_path, int_cg[CG_LEVEL_SYSTEM].path)) {
error("Could not move slurmstepd pid %d to a Slurm's delegated cgroup. Should be in %s, we are in %s.",
pid, int_cg[CG_LEVEL_SYSTEM].path, self_cg_path);
xfree(self_cg_path);
return SLURM_ERROR;
}
xfree(self_cg_path);
if (_enable_subtree_control(int_cg[CG_LEVEL_ROOT].path,
int_cg_ns.avail_controllers) !=
SLURM_SUCCESS) {
error("Cannot enable subtree_control at the top level %s",
int_cg_ns.mnt_point);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
static int _init_new_scope(char *scope_path)
{
int rc;
rc = mkdirpath(scope_path, 0755, true);
if (rc && (errno != EEXIST)) {
error("Could not create scope directory %s: %m", scope_path);
return SLURM_ERROR;
}
_enable_controllers(scope_path, int_cg_ns.avail_controllers);
log_flag(CGROUP, "Created %s", scope_path);
return SLURM_SUCCESS;
}
/*
* Talk to systemd through dbus to move the slurmstepd pid into the reserved
* scope for stepds and user processes.
*/
static int _init_new_scope_dbus(char *scope_path)
{
int status, pipe_fd[2];
pid_t pid;
xcgroup_t sys_root, scope_root;
char *const argv[3] = {
(char *)conf->stepd_loc,
"infinity",
NULL };
if (pipe(pipe_fd))
fatal("pipe() failed: %m");
xassert(pipe_fd[0] > STDERR_FILENO);
xassert(pipe_fd[1] > STDERR_FILENO);
pid = fork();
if (pid < 0)
fatal("%s: cannot start slurmstepd infinity process", __func__);
else if (pid == 0) {
/* wait for signal from parent */
if (close(pipe_fd[1]))
fatal("close(%u) failed: %m", pipe_fd[1]);
safe_read(pipe_fd[0], &pid, sizeof(pid));
if (close(pipe_fd[0]))
fatal("close(%u) failed: %m", pipe_fd[0]);
/*
* Uncouple ourselves from slurmd, so a signal sent to the
* slurmd process group won't kill slurmstepd infinity. This way
* the scope will remain forever and no further calls to
* dbus/systemd will be needed until the scope is manually
* stopped.
*
* This minimizes the interaction with systemd becoming less
* dependent on possible malfunctions it might have.
*/
if (xdaemon())
_exit(127);
/* Become slurmstepd infinity */
execvp(argv[0], argv);
error("execvp of slurmstepd wait failed: %m");
_exit(127);
}
if (close(pipe_fd[0]))
fatal("close(%u) failed: %m", pipe_fd[0]);
if (cgroup_dbus_attach_to_scope(pid, scope_path) != SLURM_SUCCESS) {
/*
		 * The systemd scope unit may already exist or be stuck, and
		 * the directory is not there.
*/
kill(pid, SIGKILL);
waitpid(pid, &status, WNOHANG);
fatal("systemd scope for slurmstepd could not be set.");
}
/*
* We need to wait for the scope to be created, and the child pid
* moved to the root, so we do not race with systemd.
*
	 * Experiments have shown that, depending on systemd load, launching
	 * and executing the 'systemd job' can be slow (>500ms). The 'job'
	 * consists of internally creating the scope, mkdir'ing the cgroup
	 * directories and finally moving the pid.
*
* After *all* this work is done, then we can continue.
*/
scope_root.path = scope_path;
if (_wait_scope_ready(scope_root, pid,
slurm_cgroup_conf.systemd_timeout)
!= SLURM_SUCCESS) {
kill(pid, SIGKILL);
waitpid(pid, &status, WNOHANG);
fatal("Scope init timed out, systemd might need cleanup with 'systemctl reset-failed', please consider increasing SystemdTimeout in cgroup.conf (SystemdTimeout=%"PRIu64").",
slurm_cgroup_conf.systemd_timeout);
}
/*
	 * Assuming the scope is created, mkdir the /system dir which will
	 * host the slurmstepd infinity pid. This way the slurmstepd scope
	 * won't be a leaf anymore and we'll be able to create more
	 * directories. mkdirpath() here is simply used as a mkdir.
*/
memset(&sys_root, 0, sizeof(sys_root));
xstrfmtcat(sys_root.path, "%s/%s", scope_path, SYSTEM_CGDIR);
if (mkdirpath(sys_root.path, 0755, true) != SLURM_SUCCESS) {
xfree(sys_root.path);
kill(pid, SIGKILL);
waitpid(pid, &status, WNOHANG);
fatal("slurmstepd scope could not be set.");
}
	/* Success! We got the system/ cg directory, move the child there. */
if (common_cgroup_move_process(&sys_root, pid)) {
xfree(sys_root.path);
kill(pid, SIGKILL);
waitpid(pid, &status, WNOHANG);
fatal("Unable to move pid %d to system cgroup %s", pid,
sys_root.path);
}
common_cgroup_destroy(&sys_root);
/*
* Wait for the infinity pid to be in the correct cgroup or further
	 * cgroup configuration will fail as we're at this point violating the
	 * no internal process constraint.
*
* To control resource distribution of a cgroup, the cgroup must create
* children directories and transfer all its processes to these
* children before enabling controllers in its cgroup.subtree_control
* file.
*
	 * As cgroupfs is sometimes slow, we cannot continue setting up this
	 * cgroup unless we guarantee the child has been moved.
*/
if (!common_cgroup_wait_pid_moved(&scope_root, pid, scope_path)) {
kill(pid, SIGKILL);
waitpid(pid, &status, WNOHANG);
fatal("Timeout waiting for pid %d to leave %s", pid,
scope_path);
}
/* Tell the child it can continue daemonizing itself. */
safe_write(pipe_fd[1], &pid, sizeof(pid));
if ((waitpid(pid, &status, 0) != pid) || WEXITSTATUS(status)) {
/*
* If we receive an error it means xdaemon() or execv() has
* failed.
*/
fatal("%s: slurmstepd infinity could not be executed.",
__func__);
}
if (close(pipe_fd[1]))
fatal("close(%u) failed: %m", pipe_fd[1]);
return SLURM_SUCCESS;
rwfail:
fatal("Unable to contact with child: %m");
}
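/*
 * Conceptually, the dbus call above does something similar to starting a
 * transient scope by hand (illustrative only, not how the plugin does it):
 *
 *   systemd-run --scope --unit=node1_slurmstepd.scope slurmstepd infinity
 *
 * except that here the slurmstepd infinity process is forked first and then
 * attached to the new scope through dbus, so the scope outlives slurmd.
 */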
/*
 * If IgnoreSystemd=yes in cgroup.conf we do a mkdir in
 * /sys/fs/cgroup/system.slice/<nodename>_slurmstepd.scope, or in
 * .../slurmstepd.scope if built without MULTIPLE_SLURMD.
*
* Otherwise call dbus to talk to systemd and create a 'scope' which will in
* turn create the same cgroup directory.
*
* This directory will be used to place future slurmstepds.
*/
static int _init_slurmd_system_scope()
{
struct stat sb;
/* Do only if the cgroup associated to the scope is not created yet. */
if (!stat(stepd_scope_path, &sb))
return SLURM_SUCCESS;
/*
* If we don't want to use systemd at all just create the cgroup
* directories manually and return.
*/
if (slurm_cgroup_conf.ignore_systemd)
return _init_new_scope(stepd_scope_path);
/* Call systemd through dbus to create a new scope. */
if ((_init_new_scope_dbus(stepd_scope_path) != SLURM_SUCCESS)) {
if (slurm_cgroup_conf.ignore_systemd_on_failure) {
log_flag(CGROUP, "Could not create scope through systemd, doing it manually as IgnoreSystemdOnFailure is set in cgroup.conf");
return _init_new_scope(stepd_scope_path);
} else {
error("cannot initialize cgroup directory for stepds: if the scope %s already exists it means the associated cgroup directories disappeared and the scope entered in a failed state. You should investigate why the scope lost its cgroup directories and possibly use the 'systemd reset-failed' command to fix this inconsistent systemd state.",
stepd_scope_path);
return SLURM_ERROR;
}
}
return SLURM_SUCCESS;
}
static void _get_parent_effective_cpus_mems(char **cpus_effective,
char **mems_effective,
xcgroup_t *cg)
{
size_t sz;
xcgroup_t parent_cg = { 0 };
/* Copy the settings from one level up on the hierarchy. */
parent_cg.path = xdirname(cg->path);
*cpus_effective = NULL;
*mems_effective = NULL;
if (common_cgroup_get_param(&parent_cg, "cpuset.cpus.effective",
cpus_effective, &sz) != SLURM_SUCCESS) {
error("Cannot read scope %s/cpuset.cpus.effective",
parent_cg.path);
}
if (common_cgroup_get_param(&parent_cg, "cpuset.mems.effective",
mems_effective, &sz) != SLURM_SUCCESS) {
error("Cannot read scope %s/cpuset.mems.effective",
parent_cg.path);
}
common_cgroup_destroy(&parent_cg);
}
/*
* Unset the limits applied to slurmd from _resource_spec_init(), namely
* cpuset.cpus, cpuset.mems and memory.max. If others are applied in the future
* this function can be extended to reset other limits.
*
* IN: cg - slurmd cgroup to reset the limits.
* RET: SLURM_SUCCESS or SLURM_ERROR if any limit could not be reset.
*/
static int _unset_cpu_mem_limits(xcgroup_t *cg)
{
int rc = SLURM_SUCCESS;
if (!bit_test(cg->ns->avail_controllers, CG_CPUS)) {
log_flag(CGROUP, "Not resetting cpuset limits in %s as %s controller is not enabled",
cg->path, ctl_names[CG_CPUS]);
} else if (!xstrcmp(cg->path, int_cg_ns.init_cg_path)) {
log_flag(CGROUP, "Not resetting cpuset limits in %s as we are already in the top cgroup",
cg->path);
} else {
/*
* Normally it should suffice to write a "" into cpuset.cpus to
* reset the allowed cpus, but for some reason this seems to be
* interpreted as an "empty" cpuset by the kernel and it does
		 * not allow us to do it when there are processes in it (e.g. in
		 * a reconfigure when slurmd is started manually). Instead, the
* kernel allows us to specify the full range of cpus so we
* will grab here the parent cpuset.cpus and apply it to our
* cgroup. The same is done for cpuset.mems, as this interface
* suffers from the same problem.
*/
char *parent_cpus, *parent_mems;
int i;
_get_parent_effective_cpus_mems(&parent_cpus, &parent_mems, cg);
rc += common_cgroup_set_param(cg, "cpuset.cpus", parent_cpus);
rc += common_cgroup_set_param(cg, "cpuset.mems", parent_mems);
if ((i = strlen(parent_cpus)))
parent_cpus[i - 1] = '\0';
if ((i = strlen(parent_mems)))
parent_mems[i - 1] = '\0';
log_flag(CGROUP, "%s reset cpuset.cpus=%s cpuset.mems=%s",
cg->path, parent_cpus, parent_mems);
xfree(parent_cpus);
xfree(parent_mems);
}
if (!bit_test(cg->ns->avail_controllers, CG_MEMORY)) {
log_flag(CGROUP, "Not resetting limits in %s as %s controller is not enabled",
cg->path, ctl_names[CG_MEMORY]);
} else {
rc += common_cgroup_set_param(cg, "memory.max", "max");
log_flag(CGROUP, "%s reset memory.max=max", cg->path);
}
return (rc) ? SLURM_ERROR : SLURM_SUCCESS;
}
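/*
 * For reference, the resets above amount to writes like the following
 * (illustrative values; the cpu/mem ranges are copied from the parent's
 * cpuset.*.effective files):
 *
 *   echo "0-63" > <slurmd cgroup>/cpuset.cpus
 *   echo "0"    > <slurmd cgroup>/cpuset.mems
 *   echo "max"  > <slurmd cgroup>/memory.max
 */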
/*
 * Slurmd started manually must not remain in its current cgroup. Normally
 * there are other pids there, like the terminal from which it was launched,
 * and slurmd would affect these pids. For example a CoreSpecCount of 1 would leave
* the bash terminal with only one core.
*
* Get out of there and put ourselves into a new home. This shouldn't happen on
* production systems.
*/
static int _migrate_to_stepd_scope()
{
char *new_home = NULL;
pid_t slurmd_pid = getpid();
bit_clear_all(int_cg_ns.avail_controllers);
xfree(int_cg_ns.mnt_point);
common_cgroup_destroy(&int_cg[CG_LEVEL_ROOT]);
xstrfmtcat(new_home, "%s/slurmd", stepd_scope_path);
int_cg_ns.mnt_point = new_home;
if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_ROOT], "",
(uid_t) 0, (gid_t) 0) != SLURM_SUCCESS) {
error("unable to create root cgroup");
return SLURM_ERROR;
}
if (common_cgroup_instantiate(&int_cg[CG_LEVEL_ROOT]) !=
SLURM_SUCCESS) {
error("Unable to instantiate slurmd %s cgroup", new_home);
return SLURM_ERROR;
}
log_flag(CGROUP, "Created %s", new_home);
/*
* Set invoc_id to empty string to indicate that from now on we should
* behave as if we were spawned by systemd.
*/
invoc_id = "";
if (_get_controllers(stepd_scope_path, int_cg_ns.avail_controllers) !=
SLURM_SUCCESS)
return SLURM_ERROR;
if (_enable_subtree_control(stepd_scope_path,
int_cg_ns.avail_controllers) !=
SLURM_SUCCESS) {
error("Cannot enable subtree_control at the top level %s",
int_cg_ns.mnt_point);
return SLURM_ERROR;
}
if (common_cgroup_move_process(&int_cg[CG_LEVEL_ROOT], slurmd_pid) !=
SLURM_SUCCESS) {
error("Unable to attach slurmd pid %d to %s cgroup.",
slurmd_pid, new_home);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
static void _get_memory_events(uint64_t *job_kills, uint64_t *step_kills)
{
size_t sz;
char *mem_events = NULL, *ptr;
/*
* memory.events:
* all fields in this file are hierarchical and the file modified event
* can be generated due to an event down the hierarchy. For the local
* events at the cgroup level we can check memory.events.local instead.
*/
/* Get latest stats for the step */
if (common_cgroup_get_param(&int_cg[CG_LEVEL_STEP_USER],
"memory.events",
&mem_events, &sz) != SLURM_SUCCESS)
error("Cannot read %s/memory.events",
int_cg[CG_LEVEL_STEP_USER].path);
if (mem_events) {
if ((ptr = xstrstr(mem_events, "oom_kill "))) {
if (sscanf(ptr, "oom_kill %"PRIu64, step_kills) != 1)
error("Cannot read step's oom_kill counter from memory.events file.");
}
xfree(mem_events);
}
/* Get stats for the job */
if (common_cgroup_get_param(&int_cg[CG_LEVEL_JOB],
"memory.events",
&mem_events, &sz) != SLURM_SUCCESS)
error("Cannot read %s/memory.events",
int_cg[CG_LEVEL_STEP_USER].path);
if (mem_events) {
if ((ptr = xstrstr(mem_events, "oom_kill "))) {
if (sscanf(ptr, "oom_kill %"PRIu64, job_kills) != 1)
error("Cannot read job's oom_kill counter from memory.events file.");
}
xfree(mem_events);
}
}
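/*
 * Example memory.events content parsed above, where oom_kill is the number
 * of processes killed by the OOM killer:
 *
 *   low 0
 *   high 0
 *   max 0
 *   oom 1
 *   oom_kill 1
 */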
static void _get_swap_events(uint64_t *job_swkills, uint64_t *step_swkills)
{
size_t sz;
char *mem_swap_events = NULL, *ptr;
/* Get latest swap stats for the step */
if (common_cgroup_get_param(&int_cg[CG_LEVEL_STEP_USER],
"memory.swap.events",
&mem_swap_events, &sz) != SLURM_SUCCESS)
error("Cannot read %s/memory.swap.events",
int_cg[CG_LEVEL_STEP_USER].path);
if (mem_swap_events) {
if ((ptr = xstrstr(mem_swap_events, "fail "))) {
if (sscanf(ptr, "fail %"PRIu64, step_swkills) != 1)
error("Cannot read step's fail counter from memory.swap.events file.");
}
xfree(mem_swap_events);
}
/* Get swap stats for the job */
if (common_cgroup_get_param(&int_cg[CG_LEVEL_JOB], "memory.swap.events",
&mem_swap_events, &sz) != SLURM_SUCCESS)
error("Cannot read %s/memory.swap.events",
int_cg[CG_LEVEL_STEP_USER].path);
if (mem_swap_events) {
if ((ptr = xstrstr(mem_swap_events, "fail "))) {
if (sscanf(ptr, "fail %"PRIu64, job_swkills) != 1)
error("Cannot read job's fail counter from memory.swap.events file.");
}
xfree(mem_swap_events);
}
}
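/*
 * Example memory.swap.events content parsed above, where "fail" counts swap
 * allocations that failed because the limit was hit or swap was exhausted:
 *
 *   high 0
 *   max 0
 *   fail 2
 */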
/*
* This function checks that all the processes contained in the cgroup cg
* belong to our namespace.
*
* That is checked by ensuring none of the pids contained in the cgroup.procs
* interface are 0, which would indicate that we cannot see the pid of that
* process, meaning this process belongs to another namespace.
*
 * In cgroups, writing a 0 into cgroup.procs moves the writing process itself.
*
* IN cg - the cgroup we want to check for cgroup.procs not containing 0's
* RET - SLURM_ERROR if cgroup.procs could not be read or there are 0's.
* SLURM_SUCCESS otherwise.
*/
static int _check_cg_pids_correct_ns(xcgroup_t *cg)
{
pid_t *pids = NULL;
int npids = 0, rc = SLURM_SUCCESS;
if (common_cgroup_get_pids(cg, &pids, &npids) != SLURM_SUCCESS) {
error("unable to get processes from %s cgroup", cg->path);
return SLURM_ERROR;
}
for (int i = 0; i < npids; i++) {
if (pids[i] == 0) {
error("We detected a pid 0 which means you are in a cgroup namespace and a mounted cgroup but with pids from the host that we're not allowed to manage.");
rc = SLURM_ERROR;
break;
}
}
xfree(pids);
return rc;
}
/*
* Move the pids from 'from' cgroup to 'to' cgroup and enable the controllers.
*
 * Create a new cgroup in the path resulting from the concatenation of
 * int_cg_ns.mnt_point (normally /sys/fs/cgroup) and the "to" parameter.
*
* Then get all the processes in the "from" cgroup.procs and move them to the
* new cgroup.
*
 * Finally, enable the subtree control on the "from" cgroup to ensure that no
 * new processes will be put there, converting it into a cgroup "domain
 * controller".
*
* On failure retry by waiting for the processes to show up in the new cgroup,
 * then try again to enable subtree control. If that last attempt fails, return
 * an error. It is important to note that this function does not guarantee
 * that all the processes can be successfully moved, as it is inherently racy.
* It might happen that in between the common_cgroup_get_pids() and the movement
* of those to the new cgroup, new processes are spawned there, thus making the
* enable_subtree fail. We don't want to freeze the cgroup either as we might
* be freezing ourselves.
*
* IN from - origin cgroup where to move pids from.
* IN to - destination cgroup path to be created, set, and pids moved.
* RET rc - SLURM_SUCCESS if all pids could be read and moved into a new
* configured cgroup, error otherwise.
*/
static int _empty_pids(xcgroup_t *from, char *to)
{
pid_t *pids = NULL;
int npids = 0;
xcgroup_t dest;
bitstr_t *system_ctrls = bit_alloc(CG_CTL_CNT);
int rc = SLURM_ERROR;
if (_get_controllers(slurm_cgroup_conf.cgroup_mountpoint,
system_ctrls) != SLURM_SUCCESS) {
error("Unable to get cgroup root controllers.");
goto fail;
}
if (common_cgroup_create(&int_cg_ns, &dest, to, (uid_t) 0, (gid_t) 0) !=
SLURM_SUCCESS) {
error("Unable to create cgroup structure for %s", to);
goto fail;
}
if (common_cgroup_instantiate(&dest) != SLURM_SUCCESS) {
error("Unable to create cgroup %s", dest.path);
goto fail;
}
if (common_cgroup_get_pids(from, &pids, &npids) != SLURM_SUCCESS) {
error("Unable to get pids from origin cgroup %s", from->path);
goto fail;
}
for (int i = 0; i < npids; i++) {
if (common_cgroup_move_process(&dest, pids[i]) !=
SLURM_SUCCESS) {
error("Unable to move process %d from %s to %s cgroup.",
pids[i], from->path, dest.path);
goto fail;
}
}
if (_enable_subtree_control(from->path, system_ctrls)) {
error("Cannot enable subtree control in %s cgroup. Trying to wait for process movement: %m",
from->path);
for (int i = 0; i < npids; i++) {
if (!common_cgroup_wait_pid_moved(from, pids[i],
from->path)) {
error("Move pid %d from %s to %s failed.",
pids[i], from->path, dest.path);
goto fail;
}
}
if (_enable_subtree_control(from->path, system_ctrls)) {
error("Cannot enable subtree control for cgroup %s: %m",
from->path);
goto fail;
}
}
rc = SLURM_SUCCESS;
fail:
common_cgroup_destroy(&dest);
FREE_NULL_BITMAP(system_ctrls);
xfree(pids);
return rc;
}
/*
* Initialize the cgroup plugin. Slurmd MUST be started by systemd and the
* option Delegate set to 'Yes' or equal to a string with the desired
* controllers we want to support in this system. If we are slurmd we're going
* to create a systemd scope for further slurmstepds. The scope is associated
* to a cgroup directory, and it will be delegated to us too. We need to
* separate it from slurmd because if we restart slurmd and there are living
* steps in the same directory, then slurmd could not be put in a non-leaf
* cgroup, and systemd will fail (no internal process constraint).
 * Keep in mind also that we should not touch anything higher in the hierarchy
 * because of the single-writer architecture systemd imposes on us. The upper
 * tree is completely under systemd control.
*
* We need to play the cgroup v2 game rules:
*
* - No Internal Process Constraint
* - Top-down Constraint
*
* And try to be compliant with systemd, or they will complain:
*
* - Single writer rule.
*
* Read cgroup v2 documentation for more info.
*/
extern int init(void)
{
int_cg_ns.avail_controllers = bit_alloc(CG_CTL_CNT);
step_active_cnt = 0;
FREE_NULL_LIST(task_list);
task_list = list_create(_free_task_cg_info);
debug("%s loaded", plugin_name);
return SLURM_SUCCESS;
}
static bool _pid_in_root(char *pid_str)
{
char *cg_path, *tmp_str, file_path[PATH_MAX];
bool rc = false;
cg_path = _get_proc_cg_path(pid_str);
tmp_str = xdirname(cg_path);
xfree(cg_path);
cg_path = tmp_str;
tmp_str = NULL;
if (snprintf(file_path, PATH_MAX, "%s/cgroup.procs", cg_path) >=
PATH_MAX) {
error("Could not generate cgroup path: %s", file_path);
goto end;
}
/* If cgroup.procs is not found one level up, we are in the root */
if (access(file_path, F_OK))
rc = true;
end:
xfree(cg_path);
return rc;
}
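/*
 * Example: for a pid whose cgroup resolves to /sys/fs/cgroup itself, the
 * check above looks for /sys/fs/cgroup.procs (one level up), which does not
 * exist, so access() fails and the pid is considered to be in the root.
 */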
extern int cgroup_p_setup_scope(char *scope_path)
{
/*
	 * Detect if we were started by systemd. Another way could be to check
	 * if our PPID=1, but we cannot rely on that: when starting slurmd
	 * with -D over an sshd session, slurmd will be reparented to pid 1,
	 * and doing this in a graphical session, it will be reparented to
	 * "systemd --user". So it is not a reliable check. Instead, use
* the existence of INVOCATION_ID to know if the pid has been forked by
* systemd.
*/
invoc_id = getenv("INVOCATION_ID");
if (!_is_cgroup2_mount(slurm_cgroup_conf.cgroup_mountpoint)) {
fatal("%s is not a valid cgroup2 mountpoint",
slurm_cgroup_conf.cgroup_mountpoint);
}
/*
* Set our current root dir in our "internal cgroup namespace".
* We will create our tree and all directories from this root.
* In slurmstepd, we got it from slurmd at startup so no need to guess.
*/
if (running_in_slurmstepd()) {
		stepd_scope_path = xstrdup(scope_path);
		/* Keep a separate copy to avoid a double xfree() in fini(). */
		int_cg_ns.mnt_point = xstrdup(stepd_scope_path);
} else
_set_int_cg_ns();
if (!int_cg_ns.mnt_point) {
error("Cannot setup the cgroup namespace.");
return SLURM_ERROR;
}
/* Setup the root cgroup object. */
if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_ROOT], "",
(uid_t) 0, (gid_t) 0) != SLURM_SUCCESS) {
error("unable to create root cgroup (%s)",
int_cg[CG_LEVEL_ROOT].path);
return SLURM_ERROR;
}
/*
* Check whether there are pids in the root cgroup that do not belong to
* this namespace, and exit if so, as we cannot handle processes from
* another namespace.
*/
if (running_in_slurmd() &&
(_check_cg_pids_correct_ns(&int_cg[CG_LEVEL_ROOT]) !=
SLURM_SUCCESS)) {
error("cgroup %s contains pids from outside of our pid namespace, so we cannot manage this cgroup.",
int_cg[CG_LEVEL_ROOT].path);
return SLURM_ERROR;
}
/*
* Convert our false root into a workable root - best effort.
*
* Slurmd will detect when the root cgroup is not a real one. This can
* happen when we have been started in a cgroup namespaced container and
* our /sys/fs/cgroup is mapped to a non-root cgroup directory in the
* host, meaning it cannot have pids in cgroup.procs if there are
* subdirectories.
*
* As we're going to create a hierarchy, we need to move out the pids
* to a child directory, we've chosen /system for that.
*
* So move the pids away from the "false root" cgroup to /system.
*
* Only do that if IgnoreSystemd is set.
*/
if (running_in_slurmd() && cgroup_p_has_feature(CG_FALSE_ROOT) &&
slurm_cgroup_conf.ignore_systemd && _pid_in_root("self")) {
if (_empty_pids(&int_cg[CG_LEVEL_ROOT], "/system") !=
SLURM_SUCCESS){
error("cannot empty the false root cgroup (%s) of pids.",
int_cg[CG_LEVEL_ROOT].path);
return SLURM_ERROR;
}
}
/*
* Check available controllers in cgroup.controller, record them in our
* bitmap and enable them if EnableControllers option is set.
	 * We enable them manually just because we support the IgnoreSystemd
	 * option. Theoretically, when starting a unit with Delegate=yes, you
	 * will get all controllers available at your level.
*/
if (_setup_controllers() != SLURM_SUCCESS)
return SLURM_ERROR;
/*
	 * slurmd will set up a new home for future slurmstepds. Every stepd
	 * will migrate to this new place.
*/
if (running_in_slurmd()) {
if (_init_slurmd_system_scope() != SLURM_SUCCESS)
return SLURM_ERROR;
/*
* If we are not started by systemd we need to move out to not
* mess with the pids that may be in our actual cgroup.
*/
if (!invoc_id) {
log_flag(CGROUP, "assuming slurmd has been started manually.");
if (_migrate_to_stepd_scope() != SLURM_SUCCESS)
return SLURM_ERROR;
} else {
log_flag(CGROUP, "INVOCATION_ID env var found. Assuming slurmd has been started by systemd.");
}
/*
* We need to unset any cpu and memory limits as we do not want
* to inherit previous limits. We cannot reset them later
* because _load_gres needs to see all the cpus. The CoreSpec
* initialization will happen afterwards and set whatever
* is needed.
*/
if (_unset_cpu_mem_limits(&int_cg[CG_LEVEL_ROOT]) !=
SLURM_SUCCESS) {
error("Cannot reset %s cgroup limits.",
int_cg[CG_LEVEL_ROOT].path);
return SLURM_ERROR;
}
}
if (running_in_slurmstepd()) {
/*
		 * We expect slurmd to already have set our scope directory.
		 * Move ourselves into the system subdirectory, which is a
		 * temporary 'parking' area until we have created the job
		 * hierarchy.
*/
if (_init_stepd_system_scope(getpid()) != SLURM_SUCCESS)
return SLURM_ERROR;
}
/*
* If we're slurmd we're all set and able to constrain things, i.e.
* CoreSpec* and MemSpec*.
*
* If we are a new slurmstepd we are ready now to create job steps. In
* that case, since we're still in the temporary "system" directory,
* we will need move ourselves out to a new job directory and then
* create int_cg[CG_LEVEL_ROOT].path/job_x/step_x.
*/
return SLURM_SUCCESS;
}
extern void fini(void)
{
/*
* Clear up the namespace and cgroups memory. Don't rmdir anything since
* we may not be stopping yet. When the process terminates systemd will
* remove the remaining directories.
*/
FREE_NULL_BITMAP(int_cg_ns.avail_controllers);
common_cgroup_destroy(&int_cg[CG_LEVEL_SYSTEM]);
common_cgroup_destroy(&int_cg[CG_LEVEL_ROOT]);
common_cgroup_ns_destroy(&int_cg_ns);
FREE_NULL_LIST(task_list);
free_ebpf_prog(&p[CG_LEVEL_JOB]);
free_ebpf_prog(&p[CG_LEVEL_STEP_USER]);
xfree(stepd_scope_path);
debug("unloading %s", plugin_name);
}
/*
* Unlike in Legacy mode (v1) where we needed to create a directory for each
* controller, in Unified mode this function will do almost nothing except for
 * some sanity checks. That's because the hierarchy is unified into the same
 * path, and the controllers will be enabled when we create the hierarchy. The only
* controller that may need a real init is the 'devices', which in Unified is
* not a real controller, but instead we need to register an eBPF program.
*/
extern int cgroup_p_initialize(cgroup_ctl_type_t ctl)
{
switch (ctl) {
case CG_DEVICES:
init_ebpf_prog(&p[CG_LEVEL_JOB]);
init_ebpf_prog(&p[CG_LEVEL_STEP_USER]);
break;
case CG_TRACK:
/* This is not a controller in Cgroup v2.*/
break;
default:
if (!bit_test(int_cg_ns.avail_controllers, ctl)) {
error("%s cgroup controller is not available.",
ctl_names[ctl]);
return SLURM_ERROR;
}
if (running_in_slurmd()) {
bitstr_t *scope_ctrls = bit_alloc(CG_CTL_CNT);
_get_controllers(stepd_scope_path, scope_ctrls);
if (!bit_test(scope_ctrls, ctl)) {
error("%s cgroup controller is not available for %s.",
ctl_names[ctl], stepd_scope_path);
FREE_NULL_BITMAP(scope_ctrls);
return SLURM_ERROR;
}
FREE_NULL_BITMAP(scope_ctrls);
}
break;
}
return SLURM_SUCCESS;
}
/*
* As part of the initialization, the slurmd directory is already created, so
* this function will remain empty.
*/
extern int cgroup_p_system_create(cgroup_ctl_type_t ctl)
{
return SLURM_SUCCESS;
}
/*
* Slurmd will live in its own cgroup, not sharing anything with slurmstepd.
* This means there's no reason to implement this function in v2.
* Also slurmstepd is put into the user's hierarchy (see graph) and is not
* affected by CoreSpec or MemSpec.
*/
extern int cgroup_p_system_addto(cgroup_ctl_type_t ctl, pid_t *pids, int npids)
{
return SLURM_SUCCESS;
}
/*
* There's no need to do any cleanup, when systemd terminates the cgroup is
* automatically removed by systemd.
*/
extern int cgroup_p_system_destroy(cgroup_ctl_type_t ctl)
{
return SLURM_SUCCESS;
}
/*
* Create the step hierarchy and move the stepd process into it. Further forked
 * processes will be created in the step directory as children. We need to respect
* the cgroup v2 Top-Down constraint to not add pids to non-leaf cgroups.
*
* We create two directories per step because we need to put the stepd into its
* specific slurm/ dir, otherwise suspending/constraining the user cgroup would
* also suspend or constrain the stepd.
*
* step_x/slurm (for slurm processes, slurmstepd)
* step_x/user (for users processes, tasks)
*
* No need to cleanup the directories on error because when a job ends
* systemd does the cleanup automatically.
*
* Note that CoreSpec and/or MemSpec does not affect slurmstepd.
*/
extern int cgroup_p_step_create(cgroup_ctl_type_t ctl, stepd_step_rec_t *step)
{
int rc = SLURM_SUCCESS;
char *new_path = NULL;
char tmp_char[64];
/*
* Lock the root cgroup so we don't race with other steps that are being
* terminated and trying to destroy the job_x directory.
*/
if (common_cgroup_lock(&int_cg[CG_LEVEL_ROOT]) != SLURM_SUCCESS) {
error("common_cgroup_lock error (%s)", ctl_names[ctl]);
return SLURM_ERROR;
}
/* Don't let other plugins destroy our structs. */
step_active_cnt++;
/* Job cgroup */
xstrfmtcat(new_path, "/job_%u", step->step_id.job_id);
if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_JOB],
new_path, 0, 0) != SLURM_SUCCESS) {
error("unable to create job %u cgroup", step->step_id.job_id);
rc = SLURM_ERROR;
goto endit;
}
if (common_cgroup_instantiate(&int_cg[CG_LEVEL_JOB]) != SLURM_SUCCESS) {
common_cgroup_destroy(&int_cg[CG_LEVEL_JOB]);
error("unable to instantiate job %u cgroup",
step->step_id.job_id);
rc = SLURM_ERROR;
goto endit;
}
xfree(new_path);
_enable_subtree_control(int_cg[CG_LEVEL_JOB].path,
int_cg_ns.avail_controllers);
/* Step cgroup */
xstrfmtcat(new_path, "%s/step_%s", int_cg[CG_LEVEL_JOB].name,
log_build_step_id_str(&step->step_id, tmp_char,
sizeof(tmp_char),
STEP_ID_FLAG_NO_PREFIX |
STEP_ID_FLAG_NO_JOB));
if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_STEP],
new_path, 0, 0) != SLURM_SUCCESS) {
error("unable to create step %ps cgroup", &step->step_id);
rc = SLURM_ERROR;
goto endit;
}
if (common_cgroup_instantiate(&int_cg[CG_LEVEL_STEP]) !=
SLURM_SUCCESS) {
common_cgroup_destroy(&int_cg[CG_LEVEL_STEP]);
error("unable to instantiate step %ps cgroup", &step->step_id);
rc = SLURM_ERROR;
goto endit;
}
xfree(new_path);
_enable_subtree_control(int_cg[CG_LEVEL_STEP].path,
int_cg_ns.avail_controllers);
/*
	 * We have our stepd directory already inside job_x; from now on nobody
	 * can destroy this job directory. We're safe.
*/
common_cgroup_unlock(&int_cg[CG_LEVEL_ROOT]);
/* Step User processes cgroup */
xstrfmtcat(new_path, "%s/user", int_cg[CG_LEVEL_STEP].name);
if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_STEP_USER],
new_path, 0, 0) != SLURM_SUCCESS) {
error("unable to create step %ps user procs cgroup",
&step->step_id);
rc = SLURM_ERROR;
goto endit;
}
if (common_cgroup_instantiate(&int_cg[CG_LEVEL_STEP_USER]) !=
SLURM_SUCCESS) {
common_cgroup_destroy(&int_cg[CG_LEVEL_STEP_USER]);
error("unable to instantiate step %ps user procs cgroup",
&step->step_id);
rc = SLURM_ERROR;
goto endit;
}
xfree(new_path);
_enable_subtree_control(int_cg[CG_LEVEL_STEP_USER].path,
int_cg_ns.avail_controllers);
/*
* Step Slurm processes cgroup
* Do not enable subtree control at this level since this is a leaf.
*/
xstrfmtcat(new_path, "%s/slurm", int_cg[CG_LEVEL_STEP].name);
if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_STEP_SLURM],
new_path, 0, 0) != SLURM_SUCCESS) {
error("unable to create step %ps slurm procs cgroup",
&step->step_id);
rc = SLURM_ERROR;
goto endit;
}
if (common_cgroup_instantiate(&int_cg[CG_LEVEL_STEP_SLURM]) !=
SLURM_SUCCESS) {
common_cgroup_destroy(&int_cg[CG_LEVEL_STEP_SLURM]);
error("unable to instantiate step %ps slurm procs cgroup",
&step->step_id);
rc = SLURM_ERROR;
goto endit;
}
xfree(new_path);
	/* Place this stepd in its correct cgroup. */
if (common_cgroup_move_process(&int_cg[CG_LEVEL_STEP_SLURM],
step->jmgr_pid) != SLURM_SUCCESS) {
error("unable to move stepd pid to its dedicated cgroup");
rc = SLURM_ERROR;
}
endit:
xfree(new_path);
if (rc != SLURM_SUCCESS)
step_active_cnt--;
return rc;
}
/*
* Move a pid to a specific cgroup. It needs to be a leaf; we cannot move
* a pid to an intermediate directory in the cgroup hierarchy. Since we
* always work at the task level, we will add this pid to the special
* task task_4294967293.
*
* Future: If in cgroup v2 we want to be able to enable/disable
* controllers for the slurmstepd pid, we need to add the logic here for
* when the stepd pid is detected. By default, all controllers are
* enabled for the slurmstepd cgroup.
*
* - Top-down Constraint
* - No Internal Process Constraint
*
* Read the cgroup v2 documentation for more info.
*/
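/*
 * Illustrative note on the constraints above (typical layout, not code):
 * because subtree control is enabled on the intermediate levels, pids can
 * only live in leaves, e.g. .../user/task_0 or .../slurm, never in
 * .../user itself.
 */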
extern int cgroup_p_step_addto(cgroup_ctl_type_t ctl, pid_t *pids, int npids)
{
int rc = SLURM_SUCCESS;
pid_t stepd_pid = getpid();
for (int i = 0; i < npids; i++) {
/* Ignore any possible movement of slurmstepd */
if (pids[i] == stepd_pid)
continue;
if (cgroup_p_task_addto(ctl, NULL, pids[i],
task_special_id) != SLURM_SUCCESS)
rc = SLURM_ERROR;
}
return rc;
}
/*
* Read the cgroup.procs of the leaves of this step.
*
* - count the pids of the slurm/ directory
* - for every task_x directory:
*   read task_x/cgroup.procs and add them into **pids
*/
extern int cgroup_p_step_get_pids(pid_t **pids, int *npids)
{
foreach_pid_array_t pid_array;
memset(&pid_array, 0, sizeof(pid_array));
/* Include the slurm processes (stepd) pids too. */
common_cgroup_get_pids(&int_cg[CG_LEVEL_STEP_SLURM],
&pid_array.pids, &pid_array.npids);
list_for_each(task_list, _get_task_pids, &pid_array);
*npids = pid_array.npids;
*pids = pid_array.pids;
return SLURM_SUCCESS;
}
/* Freeze the user processes of this step */
extern int cgroup_p_step_suspend(void)
{
/* This plugin is unloaded. */
if (!int_cg[CG_LEVEL_STEP_USER].path)
return SLURM_SUCCESS;
/*
* Freezing of the cgroup may take some time; when this action is
* completed, the "frozen" value in the cgroup.events control file will
* be updated to "1" and the corresponding notification will be issued.
*/
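/*
 * For reference (illustrative contents, not parsed here), cgroup.events
 * of a frozen cgroup typically reads:
 *
 *   populated 1
 *   frozen 1
 */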
return common_cgroup_set_param(&int_cg[CG_LEVEL_STEP_USER],
"cgroup.freeze", "1");
}
/* Resume the user processes of this step */
extern int cgroup_p_step_resume(void)
{
/* This plugin is unloaded. */
if (!int_cg[CG_LEVEL_STEP_USER].path)
return SLURM_SUCCESS;
return common_cgroup_set_param(&int_cg[CG_LEVEL_STEP_USER],
"cgroup.freeze", "0");
}
/*
* Destroy the step cgroup. We need to move out ourselves to the root of
* the cgroup filesystem first.
*/
extern int cgroup_p_step_destroy(cgroup_ctl_type_t ctl)
{
int rc = SLURM_SUCCESS;
xcgroup_t init_root;
/*
* Only destroy the step if we're the only ones using it. Log it unless
* we were loaded from slurmd, which will not create any step but will
* still call fini.
*/
if (step_active_cnt == 0) {
error("called without a previous step create. This shouldn't happen!");
return SLURM_SUCCESS;
}
if (step_active_cnt > 1) {
step_active_cnt--;
log_flag(CGROUP, "Not destroying %s step dir, resource busy by %d other plugin",
ctl_names[ctl], step_active_cnt);
return SLURM_SUCCESS;
}
/*
* Lock the root cgroup so we don't race with other steps that are being
* started and trying to create things inside job_x directory.
*/
if (common_cgroup_lock(&int_cg[CG_LEVEL_ROOT]) != SLURM_SUCCESS) {
error("common_cgroup_lock error (%s)", ctl_names[ctl]);
return SLURM_ERROR;
}
/*
* FUTURE:
* Here we can implement a recursive kill of all pids in the step.
*/
/*
* Move ourselves to the CGROUP SYSTEM level. This is the waiting area
* for new slurmstepd processes which do not have job directories yet,
* or for jobs that are finishing execution. This directory also
* contains the "stepd infinity" process that keeps the scope alive.
*
* This level is a leaf, so we are not violating the
* no-internal-processes constraint.
*
* Moving the process here instead of to the cgroup root
* (typically /sys/fs/cgroup) prevents problems when running in
* containerized environments, where the cgroupfs root might not be
* writable.
*/
memset(&init_root, 0, sizeof(init_root));
init_root.path = xstrdup(int_cg[CG_LEVEL_SYSTEM].path);
rc = common_cgroup_move_process(&init_root, getpid());
if (rc != SLURM_SUCCESS) {
error("Unable to move pid %d to system cgroup %s", getpid(),
init_root.path);
goto end;
}
/* Wait up to 1 second for this cgroup to become empty. */
_wait_cgroup_empty(&int_cg[CG_LEVEL_STEP_SLURM], 1000);
/* Remove any possible task directories first */
_all_tasks_destroy();
/* Rmdir this step's slurm processes cgroup */
if ((rc = common_cgroup_delete(&int_cg[CG_LEVEL_STEP_SLURM])) !=
SLURM_SUCCESS) {
debug2("unable to remove slurm's step cgroup (%s): %m",
int_cg[CG_LEVEL_STEP_SLURM].path);
goto end;
}
common_cgroup_destroy(&int_cg[CG_LEVEL_STEP_SLURM]);
/* Rmdir this step's user processes cgroup */
if ((rc = common_cgroup_delete(&int_cg[CG_LEVEL_STEP_USER])) !=
SLURM_SUCCESS) {
debug2("unable to remove user's step cgroup (%s): %m",
int_cg[CG_LEVEL_STEP_USER].path);
goto end;
}
common_cgroup_destroy(&int_cg[CG_LEVEL_STEP_USER]);
/* Rmdir the step cgroup itself */
if ((rc = common_cgroup_delete(&int_cg[CG_LEVEL_STEP])) !=
SLURM_SUCCESS) {
debug2("unable to remove step cgroup (%s): %m",
int_cg[CG_LEVEL_STEP].path);
goto end;
}
common_cgroup_destroy(&int_cg[CG_LEVEL_STEP]);
/*
* This is a best-effort rmdir in case no more steps remain in this job;
* it must not fail on error because other steps may still be alive.
*/
if (common_cgroup_delete(&int_cg[CG_LEVEL_JOB]) != SLURM_SUCCESS) {
debug2("still unable to remove job's step cgroup (%s): %m",
int_cg[CG_LEVEL_JOB].path);
goto end;
}
common_cgroup_destroy(&int_cg[CG_LEVEL_JOB]);
step_active_cnt = 0;
end:
common_cgroup_unlock(&int_cg[CG_LEVEL_ROOT]);
common_cgroup_destroy(&init_root);
return rc;
}
/*
* Return true if the user pid is in this step/task cgroup.
*
* We just need to get the pids from the task_x directories and from the
* slurm processes cgroup, since these will be the only leaves we'll have.
*/
extern bool cgroup_p_has_pid(pid_t pid)
{
task_cg_info_t *task_cg_info;
pid_t *pids_slurm = NULL;
int npids_slurm = 0, i;
task_cg_info = list_find_first(task_list, _find_pid_task, &pid);
if (task_cg_info)
return true;
/* Look in the slurm processes cgroup too. */
if (common_cgroup_get_pids(&int_cg[CG_LEVEL_STEP_SLURM],
&pids_slurm, &npids_slurm) !=
SLURM_SUCCESS)
return false;
for (i = 0; i < npids_slurm; i++) {
if (pids_slurm[i] == pid) {
xfree(pids_slurm);
return true;
}
}
xfree(pids_slurm);
return false;
}
extern int cgroup_p_constrain_set(cgroup_ctl_type_t ctl, cgroup_level_t level,
cgroup_limits_t *limits)
{
int rc = SLURM_SUCCESS;
bpf_program_t *program = NULL;
task_cg_info_t *task_cg_info;
char *dev_id_str = NULL;
uint32_t bpf_dev_type = NO_VAL;
/*
* cgroup/v1 legacy compatibility: We have no such levels in cgroup/v2
* but we may still get calls for them.
*/
if (level == CG_LEVEL_USER)
return SLURM_SUCCESS;
if (level == CG_LEVEL_SLURM)
level = CG_LEVEL_ROOT;
/* This is for CoreSpec* and MemSpec* for slurmd */
if (level == CG_LEVEL_SYSTEM)
level = CG_LEVEL_ROOT;
/*
* Our real step level is the level for user processes. This ensures
* that slurmstepd is never constrained in its own cgroup, which is
* what we want; instead, slurmstepd is subject to the job limit. Note
* that a step which initializes PMI could cause slurmstepd to grow,
* and we want that growth accounted to the job, not to the step.
*/
if (level == CG_LEVEL_STEP)
level = CG_LEVEL_STEP_USER;
if (!limits)
return SLURM_ERROR;
switch (ctl) {
case CG_TRACK:
/* Not implemented. */
break;
case CG_CPUS:
if (limits->allow_cores &&
common_cgroup_set_param(
&int_cg[level],
"cpuset.cpus",
limits->allow_cores) != SLURM_SUCCESS) {
rc = SLURM_ERROR;
}
if (limits->allow_mems &&
common_cgroup_set_param(
&int_cg[level],
"cpuset.mems",
limits->allow_mems) != SLURM_SUCCESS) {
rc = SLURM_ERROR;
}
break;
case CG_MEMORY:
if ((limits->limit_in_bytes != NO_VAL64) &&
common_cgroup_set_uint64_param(
&int_cg[level],
"memory.max",
limits->limit_in_bytes) != SLURM_SUCCESS) {
rc = SLURM_ERROR;
}
if ((limits->soft_limit_in_bytes != NO_VAL64) &&
common_cgroup_set_uint64_param(
&int_cg[level],
"memory.high",
limits->soft_limit_in_bytes) != SLURM_SUCCESS) {
rc = SLURM_ERROR;
}
if ((limits->memsw_limit_in_bytes != NO_VAL64) &&
common_cgroup_set_uint64_param(
&int_cg[level],
"memory.swap.max",
(limits->memsw_limit_in_bytes -
limits->limit_in_bytes)) != SLURM_SUCCESS) {
rc = SLURM_ERROR;
}
break;
case CG_DEVICES:
/*
* Set program to point to the needed bpf_program_t depending on
* the hierarchy level.
*/
switch (level) {
case CG_LEVEL_JOB:
case CG_LEVEL_STEP_USER:
program = &(p[level]);
break;
case CG_LEVEL_TASK:
if (!(task_cg_info = list_find_first(
task_list,
_find_task_cg_info,
&limits->taskid))) {
error("No task found with id %u, this should never happen",
limits->taskid);
return SLURM_ERROR;
}
program = &(task_cg_info->p);
break;
default:
error("unknown hierarchy level %d", level);
break;
}
if (!program) {
error("Could not find a bpf program to use at level %d",
level);
return SLURM_ERROR;
}
dev_id_str = gres_device_id2str(&limits->device);
if (limits->allow_device)
log_flag(CGROUP, "Allowing access to device (%s)",
dev_id_str);
else
log_flag(CGROUP, "Denying access to device (%s)",
dev_id_str);
xfree(dev_id_str);
/* Determine the correct BPF device type. */
if (limits->device.type == DEV_TYPE_BLOCK)
bpf_dev_type = BPF_DEVCG_DEV_BLOCK;
else if (limits->device.type == DEV_TYPE_CHAR)
bpf_dev_type = BPF_DEVCG_DEV_CHAR;
rc = add_device_ebpf_prog(program, bpf_dev_type,
limits->device.major,
limits->device.minor,
limits->allow_device);
break;
default:
error("cgroup controller %u not supported", ctl);
rc = SLURM_ERROR;
break;
}
return rc;
}
/*
* Apply the device constraint limits. This is only used with cgroup/v2,
* since it requires loading and attaching the eBPF program to the
* cgroup. It closes, loads and attaches the bpf_program to the
* corresponding cgroup using level and task_id; task_id is only used at
* the CG_LEVEL_TASK level.
*/
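/*
 * Minimal usage sketch (hypothetical limits/task_id values):
 * cgroup_p_constrain_set() only appends allow/deny rules to the eBPF
 * program; this function is what actually closes and attaches it:
 *
 *   cgroup_p_constrain_set(CG_DEVICES, CG_LEVEL_TASK, limits);
 *   cgroup_p_constrain_apply(CG_DEVICES, CG_LEVEL_TASK, limits->taskid);
 */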
extern int cgroup_p_constrain_apply(cgroup_ctl_type_t ctl, cgroup_level_t level,
uint32_t task_id)
{
bpf_program_t *program = NULL;
task_cg_info_t *task_cg_info;
char *cgroup_path = NULL;
/*
* cgroup/v1 legacy compatibility: We have no such levels in cgroup/v2
* but we may still get calls for them.
*/
if (level == CG_LEVEL_USER)
return SLURM_SUCCESS;
if (level == CG_LEVEL_SLURM)
level = CG_LEVEL_ROOT;
/*
* Our real step level is the level for user processes. This ensures
* that slurmstepd is never constrained in its own cgroup, which is
* what we want; instead, slurmstepd is subject to the job limit. Note
* that a step which initializes PMI could cause slurmstepd to grow,
* and we want that growth accounted to the job, not to the step.
*/
if (level == CG_LEVEL_STEP)
level = CG_LEVEL_STEP_USER;
/* Only used in devices cgroup restriction */
switch (ctl) {
case CG_DEVICES:
/*
* Set program to point to the needed bpf_program_t depending on
* the level and the task_id.
*/
if (level == CG_LEVEL_STEP_USER || level == CG_LEVEL_JOB) {
program = &(p[level]);
cgroup_path = int_cg[level].path;
}
if (level == CG_LEVEL_TASK) {
if (!(task_cg_info = list_find_first(task_list,
_find_task_cg_info,
&task_id))) {
error("No task found with id %u, this should never happen",
task_id);
return SLURM_ERROR;
}
program = &(task_cg_info->p);
cgroup_path = task_cg_info->task_cg.path;
}
if (!program) {
error("EBPF program with task_id %u does not exist",
task_id);
return SLURM_ERROR;
}
/*
* Only load the program if it has more instructions than the
* initial ones.
*/
if (program->n_inst > INIT_INST) {
log_flag(CGROUP,"EBPF Closing and loading bpf program into %s",
cgroup_path);
/* Set the default action*/
close_ebpf_prog(program, EBPF_ACCEPT);
/*
* Load the ebpf program into the cgroup without the
* override flag if we are at TASK level, as this is the
* last cgroup in the hierarchy.
*/
return load_ebpf_prog(program, cgroup_path,
(level != CG_LEVEL_TASK));
} else {
log_flag(CGROUP, "EBPF Not loading the program into %s because it is a noop",
cgroup_path);
}
break;
default:
error("cgroup controller %u not supported", ctl);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
extern char *cgroup_p_get_scope_path(void)
{
return stepd_scope_path;
}
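/*
 * Walk up the cgroup tree until a concrete memory.max value is found.
 * Illustrative example (hypothetical paths and values): if
 * job_42/step_0/user contains "max" but job_42 contains "2147483648",
 * limits->limit_in_bytes ends up as 2147483648.
 */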
static void _get_mem_recursive(xcgroup_t *cg, cgroup_limits_t *limits)
{
char *mem_max = NULL, *tmp_str = NULL, file_path[PATH_MAX];
size_t mem_sz;
if (!xstrcmp(cg->path, "/"))
goto end;
/*
* Break when there is no memory controller anymore.
*
* We check whether the file exists before reading its value because at
* the moment we do not have proper error propagation, and
* common_cgroup_get_param() would emit an error() for what is, in our
* case, just a verification and not a real error.
*/
snprintf(file_path, PATH_MAX, "%s/memory.max", cg->path);
if (access(file_path, F_OK)) {
log_flag(CGROUP, "Reached %s cgroup without memory controller",
cg->path);
goto end;
}
if (common_cgroup_get_param(cg, "memory.max", &mem_max, &mem_sz) !=
SLURM_SUCCESS)
goto end;
/* Check ancestor */
if (xstrstr(mem_max, "max")) {
tmp_str = xdirname(cg->path);
xfree(cg->path);
cg->path = tmp_str;
_get_mem_recursive(cg, limits);
if (limits->limit_in_bytes != NO_VAL64)
goto end;
} else {
/* found it! */
mem_max[mem_sz - 1] = '\0';
limits->limit_in_bytes = slurm_atoull(mem_max);
}
end:
xfree(mem_max);
}
extern cgroup_limits_t *cgroup_p_constrain_get(cgroup_ctl_type_t ctl,
cgroup_level_t level)
{
cgroup_limits_t *limits;
xcgroup_t tmp_cg = { 0 };
/*
* cgroup/v1 legacy compatibility: We have no such levels in cgroup/v2
* but we may still get calls for them.
*/
if (level == CG_LEVEL_USER) {
error("Incorrect cgroup level: %d", level);
return NULL;
}
if (level == CG_LEVEL_SLURM)
level = CG_LEVEL_ROOT;
/*
* Our real step level is the level for user processes. This ensures
* that slurmstepd is never constrained in its own cgroup, which is
* what we want; instead, slurmstepd is subject to the job limit. Note
* that a step which initializes PMI could cause slurmstepd to grow,
* and we want that growth accounted to the job, not to the step.
*/
if (level == CG_LEVEL_STEP)
level = CG_LEVEL_STEP_USER;
/* This is for CoreSpec* and MemSpec* for slurmd */
if (level == CG_LEVEL_SYSTEM)
level = CG_LEVEL_ROOT;
limits = xmalloc(sizeof(*limits));
cgroup_init_limits(limits);
switch (ctl) {
case CG_TRACK:
/* Not implemented. */
goto fail;
case CG_CPUS:
/*
* cpuset.cpus:
* ------------
* It lists the *requested* CPUs to be used by tasks within this
* cgroup. The actual list of CPUs to be granted, however, is
* subject to constraints imposed by its parent and can differ
* from the requested CPUs.
*
* An empty value in cpuset.cpus indicates that the cgroup is
* using the same setting as the nearest cgroup ancestor with a
* non-empty cpuset.cpus, or all the available CPUs if none is
* found.
*
* cpuset.cpus.effective:
* ----------------------
* It lists the onlined CPUs that are actually granted to this
* cgroup by its parent. These CPUs are allowed to be used by
* tasks within the current cgroup.
*
* If cpuset.cpus is empty, the cpuset.cpus.effective file shows
* all the CPUs from the parent cgroup that can be available to
* be used by this cgroup.
*
* If cpuset.cpus is not empty, the cpuset.cpus.effective file
* should be a subset of cpuset.cpus unless none of the CPUs
* listed in cpuset.cpus can be granted. In this case, it will
* be treated just like an empty cpuset.cpus.
*/
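/*
 * Illustrative example (hypothetical values): cpuset.cpus may contain
 * just "\n" while cpuset.cpus.effective reads "0-63\n"; in that case
 * we fall back to the effective interface below.
 */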
if (common_cgroup_get_param(
&int_cg[level],
"cpuset.cpus",
&limits->allow_cores,
&limits->cores_size) != SLURM_SUCCESS)
goto fail;
if ((limits->cores_size == 1) &&
!xstrcmp(limits->allow_cores, "\n")) {
xfree(limits->allow_cores);
if (common_cgroup_get_param(
&int_cg[level],
"cpuset.cpus.effective",
&limits->allow_cores,
&limits->cores_size) != SLURM_SUCCESS)
goto fail;
}
/*
* The same concepts from cpuset.cpus and cpuset.cpus.effective
* apply to cpuset.mems and cpuset.mems.effective, so follow
* the same logic here.
*/
if (common_cgroup_get_param(
&int_cg[level],
"cpuset.mems",
&limits->allow_mems,
&limits->mems_size) != SLURM_SUCCESS)
goto fail;
if ((limits->mems_size == 1) &&
!xstrcmp(limits->allow_mems, "\n")) {
xfree(limits->allow_mems);
if (common_cgroup_get_param(
&int_cg[level],
"cpuset.mems.effective",
&limits->allow_mems,
&limits->mems_size) != SLURM_SUCCESS)
goto fail;
}
/*
* Replace the last \n with \0. We lose one byte but we don't care
* since typically this object will be freed soon and we still
* keep the correct array size.
*/
if (limits->cores_size > 0)
limits->allow_cores[(limits->cores_size)-1] = '\0';
if (limits->mems_size > 0)
limits->allow_mems[(limits->mems_size)-1] = '\0';
break;
case CG_MEMORY:
tmp_cg.path = xstrdup(int_cg[level].path);
_get_mem_recursive(&tmp_cg, limits);
xfree(tmp_cg.path);
break;
case CG_DEVICES:
/* Not implemented. */
goto fail;
default:
error("cgroup controller %u not supported", ctl);
goto fail;
}
return limits;
fail:
log_flag(CGROUP, "Returning empty limits, this should not happen.");
cgroup_free_limits(limits);
return NULL;
}
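/*
 * Note: writing "1" to memory.oom.group makes the kernel OOM killer treat
 * the cgroup as an indivisible unit, killing all of its tasks together
 * instead of a single process.
 */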
extern int cgroup_p_step_start_oom_mgr(stepd_step_rec_t *step)
{
/* Only set the memory.oom.group if needed. */
if (step->oom_kill_step) {
if (!cgroup_p_has_feature(CG_MEMCG_OOMGROUP))
log_flag(CGROUP, "OOMKillStep was requested but memory.oom.group interface is not available.");
else {
if (common_cgroup_set_param(&int_cg[CG_LEVEL_STEP_USER],
"memory.oom.group", "1")) {
error("Cannot set memory.oom.group");
return SLURM_ERROR;
}
}
}
return SLURM_SUCCESS;
}
extern cgroup_oom_t *cgroup_p_step_stop_oom_mgr(stepd_step_rec_t *step)
{
cgroup_oom_t *oom_step_results = NULL;
uint64_t job_kills = 0, step_kills = 0;
uint64_t job_swkills = 0, step_swkills = 0;
if (!bit_test(int_cg_ns.avail_controllers, CG_MEMORY))
return NULL;
_get_memory_events(&job_kills, &step_kills);
if (cgroup_p_has_feature(CG_MEMCG_SWAP))
_get_swap_events(&job_swkills, &step_swkills);
/* Return stats */
log_flag(CGROUP, "OOM detected %"PRIu64" job and %"PRIu64" step kills",
job_kills, step_kills);
oom_step_results = xmalloc(sizeof(*oom_step_results));
oom_step_results->job_mem_failcnt = job_kills;
oom_step_results->job_memsw_failcnt = job_swkills;
oom_step_results->oom_kill_cnt = step_kills;
oom_step_results->step_mem_failcnt = step_kills;
oom_step_results->step_memsw_failcnt = step_swkills;
return oom_step_results;
}
extern int cgroup_p_task_addto(cgroup_ctl_type_t ctl, stepd_step_rec_t *step,
pid_t pid, uint32_t task_id)
{
task_cg_info_t *task_cg_info;
char *task_cg_path = NULL;
bool need_to_add = false;
/* Ignore any possible movement of slurmstepd */
if (pid == getpid())
return SLURM_SUCCESS;
if (task_id == task_special_id)
log_flag(CGROUP, "Starting task_special cgroup accounting");
else
log_flag(CGROUP, "Starting task %u cgroup accounting", task_id);
/* Let's be sure this task is not already created. */
if (!(task_cg_info = list_find_first(task_list, _find_task_cg_info,
&task_id))) {
task_cg_info = xmalloc(sizeof(*task_cg_info));
task_cg_info->taskid = task_id;
need_to_add = true;
}
if (need_to_add) {
/* Create task hierarchy in this step. */
if (task_id == task_special_id)
xstrfmtcat(task_cg_path, "%s/task_special",
int_cg[CG_LEVEL_STEP_USER].name);
else
xstrfmtcat(task_cg_path, "%s/task_%u",
int_cg[CG_LEVEL_STEP_USER].name, task_id);
if (common_cgroup_create(&int_cg_ns, &task_cg_info->task_cg,
task_cg_path, 0, 0) != SLURM_SUCCESS) {
if (task_id == task_special_id)
error("unable to create task_special cgroup");
else
error("unable to create task %u cgroup",
task_id);
xfree(task_cg_info);
xfree(task_cg_path);
return SLURM_ERROR;
}
xfree(task_cg_path);
if (common_cgroup_instantiate(&task_cg_info->task_cg) !=
SLURM_SUCCESS) {
if (task_id == task_special_id)
error("unable to instantiate task_special cgroup");
else
error("unable to instantiate task %u cgroup",
task_id);
common_cgroup_destroy(&task_cg_info->task_cg);
xfree(task_cg_info);
return SLURM_ERROR;
}
/* Initialize the bpf_program before appending to the list. */
init_ebpf_prog(&task_cg_info->p);
/* Add the cgroup to the list now that it is initialized. */
list_append(task_list, task_cg_info);
}
/* Attach the pid to the corresponding step_x/task_y cgroup */
if (common_cgroup_move_process(&task_cg_info->task_cg, pid) !=
SLURM_SUCCESS)
error("Unable to move pid %d to %s cg",
pid, (task_cg_info->task_cg).path);
return SLURM_SUCCESS;
}
extern cgroup_acct_t *cgroup_p_task_get_acct_data(uint32_t task_id)
{
char *cpu_stat = NULL, *memory_stat = NULL, *memory_current = NULL;
char *memory_peak = NULL;
char *ptr;
size_t tmp_sz = 0;
cgroup_acct_t *stats = NULL;
task_cg_info_t *task_cg_info;
static bool interfaces_checked = false, memory_peak_interface = false;
if (!(task_cg_info = list_find_first(task_list, _find_task_cg_info,
&task_id))) {
if (task_id == task_special_id)
error("No task found with id %u (task_special), this should never happen",
task_id);
else
error("No task found with id %u, this should never happen",
task_id);
return NULL;
}
/*
* Check the existence and permissions of optional interfaces. This
* check helps avoid querying nonexistent cgroup interfaces every time,
* as can happen on kernel versions that do not provide all of them.
*/
if (!interfaces_checked) {
/*
* Check for memory.peak support as RHEL8 and other OSes with
* old kernels might not provide it.
*/
memory_peak_interface = cgroup_p_has_feature(CG_MEMCG_PEAK);
interfaces_checked = true;
}
if (common_cgroup_get_param(&task_cg_info->task_cg,
"cpu.stat",
&cpu_stat,
&tmp_sz) != SLURM_SUCCESS) {
if (task_id == task_special_id)
log_flag(CGROUP, "Cannot read task_special cpu.stat file");
else
log_flag(CGROUP, "Cannot read task %d cpu.stat file",
task_id);
}
if (common_cgroup_get_param(&task_cg_info->task_cg,
"memory.current",
&memory_current,
&tmp_sz) != SLURM_SUCCESS) {
if (task_id == task_special_id)
log_flag(CGROUP, "Cannot read task_special memory.current file");
else
log_flag(CGROUP, "Cannot read task %d memory.current file",
task_id);
}
if (common_cgroup_get_param(&task_cg_info->task_cg,
"memory.stat",
&memory_stat,
&tmp_sz) != SLURM_SUCCESS) {
if (task_id == task_special_id)
log_flag(CGROUP, "Cannot read task_special memory.stat file");
else
log_flag(CGROUP, "Cannot read task %d memory.stat file",
task_id);
}
if (memory_peak_interface) {
if (common_cgroup_get_param(&task_cg_info->task_cg,
"memory.peak",
&memory_peak,
&tmp_sz) != SLURM_SUCCESS) {
if (task_id == task_special_id)
log_flag(CGROUP, "Cannot read task_special memory.peak interface, does your OS support it?");
else
log_flag(CGROUP, "Cannot read task %d memory.peak interface, does your OS support it?",
task_id);
}
}
/*
* Initialize values. A NO_VAL64 will indicate to the caller that
* something went wrong here. Fields that aren't set here are returned
* as 0.
*/
stats = xmalloc(sizeof(*stats));
stats->usec = NO_VAL64;
stats->ssec = NO_VAL64;
stats->total_rss = NO_VAL64;
stats->total_pgmajfault = NO_VAL64;
stats->memory_peak = INFINITE64; /* As required in common_jag.c */
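/*
 * For reference (illustrative values), cpu.stat typically looks like:
 *
 *   usage_usec 8667264
 *   user_usec 5012439
 *   system_usec 3654825
 */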
if (cpu_stat) {
ptr = xstrstr(cpu_stat, "user_usec");
if (ptr &&
(sscanf(ptr, "user_usec %"PRIu64, &stats->usec) != 1))
error("Cannot parse user_sec field in cpu.stat file");
ptr = xstrstr(cpu_stat, "system_usec");
if (ptr &&
(sscanf(ptr, "system_usec %"PRIu64, &stats->ssec) != 1))
error("Cannot parse system_usec field in cpu.stat file");
xfree(cpu_stat);
}
/*
* In cgroup/v1, total_rss was the hierarchical sum of the number of
* bytes of anonymous and swap cache memory (including transparent huge
* pages).
*
* In cgroup/v2 we use memory.current, which includes all the memory
* the application has touched. Using this value keeps it consistent
* with the OOM killer limit.
*/
if (memory_current) {
if (sscanf(memory_current, "%"PRIu64, &stats->total_rss) != 1)
error("Cannot parse memory.current file");
xfree(memory_current);
}
if (memory_stat) {
ptr = xstrstr(memory_stat, "pgmajfault");
if (ptr && (sscanf(ptr, "pgmajfault %"PRIu64,
&stats->total_pgmajfault) != 1))
log_flag(CGROUP, "Cannot parse pgmajfault field in memory.stat file");
xfree(memory_stat);
}
if (memory_peak) {
if (sscanf(memory_peak, "%"PRIu64, &stats->memory_peak) != 1)
error("Cannot parse memory.peak file");
xfree(memory_peak);
}
return stats;
}
/*
* Return conversion units used for stats gathered from cpuacct.
* Dividing the provided data by this number will give seconds.
*/
extern long int cgroup_p_get_acct_units(void)
{
/* usec and ssec from cpu.stat are provided in microseconds. */
return (long int)USEC_IN_SEC;
}
extern bool cgroup_p_has_feature(cgroup_ctl_feature_t f)
{
char file_path[PATH_MAX];
switch (f) {
case CG_MEMCG_OOMGROUP:
if (!bit_test(int_cg_ns.avail_controllers, CG_MEMORY))
break;
if (snprintf(file_path, PATH_MAX, "%s/memory.oom.group",
int_cg[CG_LEVEL_ROOT].path) >= PATH_MAX)
break;
if (!access(file_path, F_OK))
return true;
break;
case CG_MEMCG_PEAK:
if (!bit_test(int_cg_ns.avail_controllers, CG_MEMORY))
break;
if (snprintf(file_path, PATH_MAX, "%s/memory.peak",
int_cg[CG_LEVEL_ROOT].path) >= PATH_MAX)
break;
if (!access(file_path, F_OK))
return true;
break;
case CG_MEMCG_SWAP:
if (!bit_test(int_cg_ns.avail_controllers, CG_MEMORY))
break;
if (snprintf(file_path, PATH_MAX, "%s/memory.swap.max",
int_cg[CG_LEVEL_ROOT].path) >= PATH_MAX)
break;
if (!access(file_path, F_OK))
return true;
break;
case CG_FALSE_ROOT:
/*
* The cgroup.type file is only present in non-root cgroups. This
* check ensures that we do not have a non-root cgroup mounted at
* /sys/fs/cgroup.
*/
if (snprintf(file_path, PATH_MAX, "%s/cgroup.type",
slurm_cgroup_conf.cgroup_mountpoint) >= PATH_MAX)
break;
if (!access(file_path, F_OK))
return true;
break;
case CG_KILL_BUTTON:
if (snprintf(file_path, PATH_MAX, "%s/cgroup.kill",
int_cg[CG_LEVEL_ROOT].path) >= PATH_MAX)
break;
if (!access(file_path, F_OK))
return true;
break;
default:
break;
}
return false;
}
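/*
 * Note: cgroup.kill (available since Linux 5.14) sends SIGKILL to every
 * process in the cgroup subtree when "1" is written to it, which is why
 * only SIGKILL is supported here.
 */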
extern int cgroup_p_signal(int signal)
{
if (signal != SIGKILL) {
error("cgroup/v2 cgroup.kill only supports SIGKILL");
return SLURM_ERROR;
}
if (common_cgroup_set_param(&int_cg[CG_LEVEL_STEP_USER],
"cgroup.kill", "1")) {
error("Writing 1 to %s/cgroup.kill failed",
int_cg[CG_LEVEL_STEP_USER].path);
return SLURM_ERROR;
}
log_flag(CGROUP, "Sent signal %d to %s", signal,
int_cg[CG_LEVEL_STEP_USER].path);
return SLURM_SUCCESS;
}
extern char *cgroup_p_get_task_empty_event_path(uint32_t taskid,
bool *on_modify)
{
task_cg_info_t *task_cg_info;
xassert(on_modify);
if (!(task_cg_info = list_find_first(task_list, _find_task_cg_info,
&taskid))) {
return NULL;
}
/* We want to watch when cgroup.events is modified. */
*on_modify = true;
return xstrdup_printf("%s/cgroup.events", task_cg_info->task_cg.path);
}
extern int cgroup_p_is_task_empty(uint32_t taskid)
{
task_cg_info_t *task_cg_info;
xcgroup_t cg;
if (!(task_cg_info = list_find_first(task_list, _find_task_cg_info,
&taskid))) {
return SLURM_ERROR;
}
cg = task_cg_info->task_cg;
return _is_cgroup_empty(&cg);
}