| /*****************************************************************************\ |
| * cgroup_v2.c - Cgroup v2 plugin |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include <fcntl.h> |
| #include <mntent.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <sys/inotify.h> |
| #include <poll.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm.h" |
| #include "slurm/slurm_errno.h" |
| |
| #include "src/common/bitstring.h" |
| #include "src/common/fd.h" |
| #include "src/common/list.h" |
| #include "src/common/log.h" |
| #include "src/common/timers.h" |
| #include "src/common/xassert.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| #include "src/common/daemonize.h" |
| #include "src/slurmctld/slurmctld.h" |
| #include "src/slurmd/slurmd/slurmd.h" |
| #include "src/plugins/cgroup/common/cgroup_common.h" |
| #include "src/plugins/cgroup/v2/cgroup_dbus.h" |
| #include "src/plugins/cgroup/v2/ebpf.h" |
| |
| #define SYSTEM_CGSLICE "system.slice" |
| #define SYSTEM_CGSCOPE "slurmstepd" |
| #define SYSTEM_CGDIR "system" |
| |
| const char plugin_name[] = "Cgroup v2 plugin"; |
| const char plugin_type[] = "cgroup/v2"; |
| const uint32_t plugin_version = SLURM_VERSION_NUMBER; |
| |
| /* Internal cgroup structs */ |
| static list_t *task_list; |
| static uint16_t step_active_cnt; |
| static xcgroup_ns_t int_cg_ns = { 0 }; |
| static xcgroup_t int_cg[CG_LEVEL_CNT]; |
| static bpf_program_t p[CG_LEVEL_CNT]; |
| static char *stepd_scope_path = NULL; |
| static uint32_t task_special_id = NO_VAL; |
| static char *invoc_id; |
| static char *ctl_names[] = { |
| [CG_TRACK] = "freezer", |
| [CG_CPUS] = "cpuset", |
| [CG_MEMORY] = "memory", |
| [CG_CPUACCT] = "cpu", |
| [CG_DEVICES] = "devices", |
| /* Below are extra controllers not explicitly tracked by Slurm. */ |
| [CG_IO] = "io", |
| [CG_HUGETLB] = "hugetlb", |
| [CG_PIDS] = "pids", |
| [CG_RDMA] = "rdma", |
| [CG_MISC] = "misc" |
| }; |
| |
| typedef struct { |
| xcgroup_t task_cg; |
| uint32_t taskid; |
| bpf_program_t p; |
| } task_cg_info_t; |
| |
| typedef struct { |
| int npids; |
| pid_t *pids; |
| } foreach_pid_array_t; |
| |
| extern bool cgroup_p_has_feature(cgroup_ctl_feature_t f); |
| extern int cgroup_p_task_addto(cgroup_ctl_type_t ctl, stepd_step_rec_t *step, |
| pid_t pid, uint32_t task_id); |
| |
| /* Hierarchy will take this form: |
| * [int_cg_ns] [int_cg_ns] |
 *   "slurmd service"              "slurmstepds scope"
| * root(delegated) root(delegated) [CG_LEVEL_ROOT] |
| * | / \ |
| * | / | |
| * slurmd | job_x ... job_n [CG_LEVEL_JOB] |
| * system | |
| * (waiting area | |
| * for new stepds) | |
| * step_0 ... step_n [CG_LEVEL_STEP] |
| * / \ |
| * [CG_LEVEL_STEP_USER] user_processes slurm_processes [CG_LEVEL_STEP_SLURM] |
| * / (slurmstepds) |
| * / |
| * | |
| * task_special...task_0...task_n [CG_LEVEL_TASK] (user pids) |
| * (task_id = NO_VAL) |
| */ |
| |
| /* |
| * Get the cgroup root mountpoint for a given mount path and pid. |
| * |
| * This function parses the /proc/pid/mountinfo, and gets the 4th element of |
| * the line which 5th element equals to mount parameter. |
| * |
| * From man proc_pid_mountinfo about 4th and 5th fields of mountinfo: |
| * (4) root: the pathname of the directory in the filesystem which forms the |
| * root of this mount. |
| * (5) mount point: the pathname of the mount point relative to the process's |
| * root directory. |
| * |
| * This is used primarily to get the real mount for a cgroup filesystem as in |
| * some specific containerized environments the real root of the cgroup |
| * filesystem may not be coincide with what we get in /proc/1/cgroup. |
| * |
| * This only checks the first occurrence of the mount as it will always be the |
| * proper one, as this file gets written sequentially, meaning that the "real" |
| * /sys/fs/cgroup will appear first. If it happens to be any bind mount to it |
| * it will appear later, those bind mounts do not affect the /proc/<pid>/cgroup |
| * data. |
| * |
| * Example: |
| * - For mount = "/sys/fs/cgroup" and pid 123, we find the following line in |
| * /proc/123/mountinfo, so as the 5th field matches mount, we will return the |
| * 4th field "/": |
| * 475 337 0:28 / /sys/fs/cgroup rw(...) - cgroup2 cgroup2 rw,nsdelegate(...) |
| * |
| * - If we get a line like this, we will return "/../../../../../..". |
| * 379 377 0:28 /../../../../../.. /sys/fs/cgroup rw(...) - cgroup2(...) |
| * |
| * IN mount - Path to match with the 5th field of mountinfo string. |
| * IN pid_str - Pid to look for the mountinfo. |
| * OUT data - NULL if not found, or a xmalloc'ed string with a copy of the |
| * 4th field of the line which matches mount with the 5th field. |
| */ |
static char *_get_root_mount_mountinfo(char *mount, char *pid_str)
{
	char *path = NULL, *line = NULL, *word, *data = NULL, *save_ptr = NULL;
	size_t len = 0;
	int count = 0;
	FILE *f;
	bool found = false;

	path = xstrdup_printf("/proc/%s/mountinfo", pid_str);
	f = fopen(path, "r");
	xfree(path);
	if (f == NULL) {
		/* fatal() never returns; the return below is defensive. */
		fatal("cannot read /proc/%s/mountinfo contents: %m", pid_str);
		return NULL;
	}

	while (!found && getline(&line, &len, f) != -1) {
		/* Cheap substring filter before tokenizing the line. */
		if (xstrstr(line, mount)) {
			count = 0;
			word = strtok_r(line, " ", &save_ptr);
			while (word) {
				/*
				 * The 4th value is the root of the mount, and
				 * the 5th is the mount, so we want to get
				 * the 4th and ensure that the 5th is exactly
				 * equal to mount, so that we are not looking
				 * into a subdirectory.
				 */
				if (count == 3) {
					data = word;
					word = strtok_r(NULL, " ", &save_ptr);
					if (!xstrcmp(word, mount)) {
						/* Copy before line is freed */
						data = xstrdup(data);
						found = true;
					} else {
						/*
						 * Fix: the 5th field was only
						 * a partial match. Reset data,
						 * otherwise if no later line
						 * matches either, a dangling
						 * pointer into the freed
						 * 'line' buffer would be
						 * returned to the caller.
						 */
						data = NULL;
					}
					/* Done with this line either way. */
					break;
				}
				count++;
				word = strtok_r(NULL, " ", &save_ptr);
			}
		}
	}
	/* getline() allocates with malloc(), so free(), not xfree(). */
	free(line);

	fclose(f);
	if (!data) {
		error("Could not parse '%s' root mount for %s", mount, pid_str);
	}
	return data;
}
| |
| /* |
| * Check whether path is a valid cgroup2 mountpoint. This also checks that the |
| * cgroup mount passed is usable in the current cgroup2 namespace. |
| * |
| * IN path - Path to cgroup2 mountpoint. |
| */ |
static bool _is_cgroup2_mount(char *path)
{
	struct mntent *entry;
	char *root_info = NULL;
	bool mounted = false;
	FILE *mounts_fp;

	/* Scan the mount table for a cgroup2 filesystem mounted at 'path'. */
	mounts_fp = setmntent("/proc/mounts", "r");
	if (!mounts_fp) {
		error("Failed to open /proc/mounts");
		return false;
	}

	for (entry = getmntent(mounts_fp); entry;
	     entry = getmntent(mounts_fp)) {
		if (!xstrcmp(entry->mnt_type, "cgroup2") &&
		    !xstrcmp(entry->mnt_dir, path)) {
			mounted = true;
			break;
		}
	}

	if (!mounted) {
		error("The cgroup mountpoint %s is not mounted", path);
	} else {
		/*
		 * Verify the mount root aligns with our current cgroup
		 * namespace: the mountinfo root field must be exactly "/".
		 */
		root_info = _get_root_mount_mountinfo(path, "self");
		if (xstrcmp(root_info, "/"))
			error("The cgroup mountpoint does not align with the current namespace. Please, ensure all namespaces are correctly mounted. Refer to the slurm cgroup_v2 documentation.");
		xfree(root_info);
	}

	endmntent(mounts_fp);
	return mounted;
}
| |
| /* |
| * Read /proc/<pid>/cgroup and return the absolute cgroup path of the given pid. |
| * |
| * We will deal with different cases. For example: |
| * |
| * In regular systems we expect one single line like this: |
| * "0::/init.scope\n" |
| * |
| * In some containerized environments it could look like: |
| * "0::/docker.slice/docker-<some UUID>.scope/init.scope" |
| * |
| * Or in a cgroup namespace: |
| * "0::/" |
| * |
| * This function just strips the initial "0::" and the last part of the path |
| * (e.g "init.scope") portions. Then it adds the cgroup mountpoint prefix. |
| * |
| * In Unified hierarchies this must contain only one line. If there are more |
| * lines this would mean we are in Hybrid or in Legacy cgroup. We do not support |
| * hybrid mode, so if we find more than one line we fatal. |
| * |
| * The Cgroup v2 documented way to know which is the cgroup root for a |
| * process in the cgroup hierarchy is just to read /proc/<pid>/cgroup. |
| * |
| * The parameter pid_str is a string representing a numeric pid or the |
| * keyword 'self'. (Note: if we are in a cgroup namespace without a proper proc |
| * mount, using 'self' will possibly return a different value than using |
| * getpid()). |
| * |
| * IN pid_str - pid to read the path for |
| * OUT ret - xmalloc'ed string containing the cgroup path for the passed pid |
| * read from /proc/<pid>/cgroup. |
| */ |
static char *_get_proc_cg_path(char *pid_str)
{
	char *buf, *start = NULL, *p, *ret = NULL;
	char *path = NULL, *minfo = NULL;
	size_t sz;

	/* Read the whole /proc/<pid>/cgroup file into buf (sz bytes). */
	path = xstrdup_printf("/proc/%s/cgroup", pid_str);
	if (common_file_read_content(path, &buf, &sz) != SLURM_SUCCESS) {
		xfree(path);
		fatal("cannot read /proc/%s/cgroup contents: %m", pid_str);
	}
	xfree(path);

	/*
	 * In Unified mode there will be just one line containing the path
	 * of the cgroup and starting by 0. If there are more than one then
	 * some v1 cgroups are mounted, we do not support it.
	 */
	if (buf && (buf[0] != '0'))
		fatal("Hybrid mode is not supported. Mounted cgroups are: %s",
		      buf);

	/*
	 * Skip until past the :: from the file ensuring that we are not past
	 * the buffer size.
	 */
	if ((p = xstrchr(buf, ':')) != NULL) {
		/* start points just past "0::", i.e. at the leading '/'. */
		if ((p + 2) < (buf + sz - 1))
			start = p + 2;
		/* Remove everything after the first newline found. */
		if ((p = xstrchr(start, '\n')))
			*p = '\0';
	}

	/* start still NULL or empty means a malformed/truncated file. */
	if (!start || (*start == '\0'))
		fatal("Unexpected format found in /proc/%s/cgroup file: %s",
		      pid_str, buf);

	/* Start the return string with the mount point of the cgroup. */
	ret = xstrdup(slurm_cgroup_conf.cgroup_mountpoint);

	/*
	 * Only check mountinfo in case that the cgroup file points to a
	 * location that is not the root of the cgroup mountpoint (/).
	 */
	if (xstrcmp(start, "/")) {
		/*
		 * Check for correct /proc and cgroup mounts when we are in a
		 * cgroup namespace by checking mountinfo.
		 */
		minfo = _get_root_mount_mountinfo(
			slurm_cgroup_conf.cgroup_mountpoint,
			pid_str);
		/*
		 * If minfo is "/" our root is
		 * slurm_cgroup_conf.cgroup_mountpoint.
		 *
		 * If minfo contains something different than "/":
		 * For containers with remounted cgroups, mountinfo would've
		 * returned a string different than "/", so we first need to
		 * ensure that the minfo is a substring of what we've read in
		 * /proc/pid/cgroup.
		 *
		 * If minfo content is not a substring of our /proc/pid/cgroup
		 * (e.g. minfo is "../../.." and /proc/pid/cgroup is
		 * 0::/something), we're in a wrong situation.
		 */
		if (xstrcmp(minfo, "/")) {
			/*
			 * If the information of /proc/pid/mountinfo is not a
			 * substring of the one in /proc/pid/cgroup, it means
			 * that something is wrong. For example we are in a pid
			 * and a cgroup namespace without /proc properly mounted.
			 */
			if (xstrstr(start, minfo))
				/* Drop the container prefix from the path. */
				start = start + strlen(minfo);
			else
				fatal("mismatch found in /proc/%s/mountinfo: \"%s\" vs /proc/%s/cgroup: \"%s\". Please check that procfs and cgroupfs are correctly mounted in the namespace.",
				      pid_str, minfo, pid_str, start);
		}

		/* Append the sanitized path to the cgroup mountpoint. */
		xstrcat(ret, start);

		xfree(minfo);
	}

	xfree(buf);
	return ret;
}
| |
| /* |
| * Get the absolute OS's cgroup root directory by reading /proc/1/cgroup path. |
| * |
| * In normal systems the final path will look like this: |
| * /sys/fs/cgroup[/] |
| * |
| * In containerized environments it will look like: |
| * /sys/fs/cgroup[/docker.slice/docker-<some UUID>.scope] |
| * |
| */ |
| static char *_get_init_cg_path() |
| { |
| char *cg_path, *ret = NULL; |
| |
| cg_path = _get_proc_cg_path("1"); |
| |
| if (xstrcmp(cg_path, slurm_cgroup_conf.cgroup_mountpoint)) { |
| ret = xdirname(cg_path); |
| xfree(cg_path); |
| } else { |
| ret = cg_path; |
| } |
| |
| return ret; |
| } |
| |
| /* |
| * Fill up the internal cgroup namespace object. This mainly contains the path |
| * to what will be our root cgroup. |
| * E.g. /sys/fs/cgroup/system.slice/node1_slurmstepd.scope/ for slurmstepd. |
| */ |
static void _set_int_cg_ns()
{
	/* Absolute OS cgroup root, derived from /proc/1/cgroup. */
	int_cg_ns.init_cg_path = _get_init_cg_path();

	/*
	 * When started manually in a container and reconfiguring, if we are pid
	 * 1 we can directly get the cgroup as it has been configured in our
	 * previous instance.
	 */
	if (slurm_cgroup_conf.ignore_systemd && getenv("SLURMD_RECONF") &&
	    (getpid() == 1)) {
		stepd_scope_path = xdirname(int_cg_ns.init_cg_path);
		int_cg_ns.mnt_point = xstrdup(int_cg_ns.init_cg_path);
		return;
	}

#ifdef MULTIPLE_SLURMD
	/* Embed the node name so multiple slurmd scopes do not collide. */
	xstrfmtcat(stepd_scope_path, "%s/%s/%s_%s.scope",
		   int_cg_ns.init_cg_path, SYSTEM_CGSLICE, conf->node_name,
		   SYSTEM_CGSCOPE);
#else
	xstrfmtcat(stepd_scope_path, "%s/%s/%s.scope", int_cg_ns.init_cg_path,
		   SYSTEM_CGSLICE, SYSTEM_CGSCOPE);
#endif
	/* Our own current cgroup becomes the namespace mount point. */
	int_cg_ns.mnt_point = _get_proc_cg_path("self");
}
| |
| /* |
| * For each available controller, enable it in this path. This operation is |
| * only intended to be done in the Domain controllers, never in a leaf where |
| * processes reside. If it is done in a leaf it *won't be possible* to add any |
| * pid to it. Enabling the controllers will make their interfaces available |
| * (e.g. the memory.*, cpu.*, cpuset.* ... files) to control the cgroup. |
| */ |
| static int _enable_subtree_control(char *path, bitstr_t *ctl_bitmap) |
| { |
| int i, rc = SLURM_SUCCESS, rc2; |
| char *content = NULL, *file_path = NULL; |
| |
| xassert(ctl_bitmap); |
| |
| xstrfmtcat(file_path, "%s/cgroup.subtree_control", path); |
| for (i = 0; i < CG_CTL_CNT; i++) { |
| if (!bit_test(ctl_bitmap, i)) |
| continue; |
| |
| xstrfmtcat(content, "+%s", ctl_names[i]); |
| rc2 = common_file_write_content(file_path, content, |
| strlen(content)); |
| if (rc2 != SLURM_SUCCESS) { |
| /* |
| * In a container it is possible that part of the |
| * cgroup tree is mounted in read-only mode, so skip |
| * the parts that we cannot touch. |
| */ |
| if (errno == EROFS) { |
| log_flag(CGROUP, |
| "Cannot enable %s in %s, skipping: %m", |
| ctl_names[i], file_path); |
| } else { |
| /* Controller won't be available. */ |
| error("Cannot enable %s in %s: %m", |
| ctl_names[i], file_path); |
| bit_clear(ctl_bitmap, i); |
| rc = SLURM_ERROR; |
| } |
| } else { |
| log_flag(CGROUP, "Enabled %s controller in %s", |
| ctl_names[i], file_path); |
| } |
| xfree(content); |
| } |
| xfree(file_path); |
| return rc; |
| } |
| |
static int _get_controllers(char *path, bitstr_t *ctl_bitmap)
{
	char *buf = NULL, *ptr, *save_ptr, *ctl_filepath = NULL, *extra;
	size_t sz;

	xassert(ctl_bitmap);

	/*
	 * Remove the extra controllers if not explicitly asked.
	 * NOTE: this blanks entries of the *global* ctl_names[] array, so
	 * disabled extra controllers stay hidden for every later call of this
	 * function in this process.
	 */
	extra = slurm_cgroup_conf.enable_extra_controllers;
	if (!xstrstr(extra, "all")) {
		if (extra) {
			for (int i = CG_IO; i < CG_CTL_CNT; i++) {
				if (!xstrstr(extra, ctl_names[i])) {
					ctl_names[i] = "";
				}
			}
		} else {
			/* No extras configured: blank them all. */
			for (int i = CG_IO; i < CG_CTL_CNT; i++)
				ctl_names[i] = "";
		}
	}

	xstrfmtcat(ctl_filepath, "%s/cgroup.controllers", path);
	if (common_file_read_content(ctl_filepath, &buf, &sz) !=
	    SLURM_SUCCESS || !buf) {
		error("cannot read %s: %m", ctl_filepath);
		xfree(ctl_filepath);
		return SLURM_ERROR;
	}
	xfree(ctl_filepath);

	/* Strip the trailing newline (assumes sz > 0 on success -- verify). */
	if (buf[sz - 1] == '\n')
		buf[sz - 1] = '\0';

	/* Space-separated controller names: set a bit for each known one. */
	ptr = strtok_r(buf, " ", &save_ptr);
	while (ptr) {
		for (int i = 0; i < CG_CTL_CNT; i++) {
			if (!xstrcmp(ctl_names[i], ""))
				continue;
			if (!xstrcasecmp(ctl_names[i], ptr)) {
				bit_set(ctl_bitmap, i);
				break;
			}
		}
		ptr = strtok_r(NULL, " ", &save_ptr);
	}
	xfree(buf);

	/*
	 * Warn about expected-but-missing controllers. CG_DEVICES and
	 * CG_TRACK are skipped (they have no cgroup.controllers entry in v2).
	 * The warning is gated on invoc_id being set -- presumably only when
	 * launched under systemd; TODO confirm the intent of that check.
	 */
	for (int i = 0; i < CG_CTL_CNT; i++) {
		if ((i == CG_DEVICES) || (i == CG_TRACK))
			continue;
		if (invoc_id && !bit_test(ctl_bitmap, i) &&
		    xstrcmp(ctl_names[i], ""))
			error("Controller %s is not enabled!", ctl_names[i]);
	}
	return SLURM_SUCCESS;
}
| |
| /* |
| * Enables the cgroup controllers system_ctrls from /sys/fs/cgroup to the one |
| * specified in cg_path. If system_ctrls is null it reads it from |
| * /sys/fs/cgroup/cgroup.controllers |
| */ |
| static int _enable_controllers(char *cg_path, bitstr_t *system_ctrls) |
| { |
| int rc = SLURM_SUCCESS; |
| char *p, *dst; |
| |
| xassert(system_ctrls); |
| if (!(xstrstr(cg_path, slurm_cgroup_conf.cgroup_mountpoint))) { |
| error("%s is not under the cgroup mountpoint %s.", |
| cg_path, slurm_cgroup_conf.cgroup_mountpoint); |
| return SLURM_ERROR; |
| } |
| |
| p = dst = xstrdup(cg_path); |
| p += strlen(slurm_cgroup_conf.cgroup_mountpoint); |
| do { |
| *p = '\0'; |
| if ((rc = _enable_subtree_control(dst, system_ctrls))) |
| goto cleanup; |
| *p = '/'; |
| } while ((p = xstrchr(p + 1, '/'))); |
| |
| cleanup: |
| xfree(dst); |
| return rc; |
| } |
| |
| /* |
| * Enabling the subtree from the top mountpoint to the slice we will reside |
| * is needed to get all the controllers we want to support. Nevertheless note |
| * that if systemd is reloaded, reset, or does any operation that implies |
| * traversing the cgroup tree matching its internal database, and there's no |
| * service started with Delegate=yes (like running this slurmd manually), the |
| * controllers can eventually be deactivated without warning by systemd. |
| * |
| * Also note that usually starting any service or scope with Delegate=yes in the |
| * slice we want to live, will make systemd to automatically activate the |
| * controllers in the tree, so this operation here would be redundant. |
| */ |
static int _enable_system_controllers()
{
	char *slice_path = NULL;
	bitstr_t *system_ctrls = bit_alloc(CG_CTL_CNT);
	int rc = SLURM_ERROR;

	/* Detect which controllers the kernel offers at the cgroup root. */
	if (_get_controllers(slurm_cgroup_conf.cgroup_mountpoint,
			     system_ctrls) != SLURM_SUCCESS) {
		error("Could not obtain system controllers from %s",
		      slurm_cgroup_conf.cgroup_mountpoint);
		goto end;
	}

	/* Enable them on every level from the mountpoint down to our cgroup */
	if (_enable_controllers(int_cg_ns.mnt_point, system_ctrls) !=
	    SLURM_SUCCESS) {
		error("Could not enable controllers for cgroup path %s",
		      int_cg_ns.mnt_point);
		goto end;
	}

	/*
	 * Enable it for system.slice, where the stepd scope will reside when
	 * it is created later. Do not do it when ignoresystemd is true as it
	 * will be done when the stepd_scope_path is created.
	 */
	if (!slurm_cgroup_conf.ignore_systemd) {
		slice_path = xdirname(stepd_scope_path);
		if (_enable_subtree_control(slice_path, system_ctrls) !=
		    SLURM_SUCCESS) {
			error("Could not enable subtree control at %s",
			      slice_path);
			goto end;
		}
	}
	rc = SLURM_SUCCESS;
end:
	/* Common exit: release the slice path and the controllers bitmap. */
	xfree(slice_path);
	FREE_NULL_BITMAP(system_ctrls);
	return rc;
}
| |
| /* |
| * Read the cgroup.controllers file of the root to detect which are the |
| * available controllers in this system. |
| */ |
| static int _setup_controllers() |
| { |
| /* Field not used in v2 */ |
| int_cg_ns.subsystems = NULL; |
| |
| /* |
| * Check all the available controllers in this system and enable them in |
| * every level of the cgroup tree if EnableControllers=yes. |
| * Normally, if the unit we're starting up has a Delegate=yes, systemd |
| * will set the cgroup.subtree_controllers of the parent with all the |
| * available controllers on that level, making all of them available on |
| * our unit automatically. In some situations, like if the parent cgroup |
| * doesn't have write permissions or if it started with fewer |
| * controllers available than the ones on the system (when the |
| * grandfather doesn't have subtree_control set), that won't happen and |
| * we may need Enablecontrollers. This may happen in containers. |
| */ |
| if (running_in_slurmd() && slurm_cgroup_conf.enable_controllers) |
| _enable_system_controllers(); |
| |
| /* Get the controllers on our namespace. */ |
| return _get_controllers(int_cg_ns.mnt_point, |
| int_cg_ns.avail_controllers); |
| } |
| |
| static int _rmdir_task(void *x, void *arg) |
| { |
| task_cg_info_t *t = (task_cg_info_t *) x; |
| |
| if (common_cgroup_delete(&t->task_cg) != SLURM_SUCCESS) |
| log_flag(CGROUP, "Failed to delete %s: %m", t->task_cg.path); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _find_task_cg_info(void *x, void *key) |
| { |
| task_cg_info_t *task_cg = (task_cg_info_t *)x; |
| uint32_t taskid = *(uint32_t*)key; |
| |
| if (task_cg->taskid == taskid) |
| return 1; |
| |
| return 0; |
| } |
| |
| static void _free_task_cg_info(void *x) |
| { |
| task_cg_info_t *task_cg = (task_cg_info_t *)x; |
| |
| if (task_cg) { |
| common_cgroup_destroy(&task_cg->task_cg); |
| free_ebpf_prog(&task_cg->p); |
| xfree(task_cg); |
| } |
| } |
| |
| static void _all_tasks_destroy() |
| { |
| /* Empty the lists of accounted tasks, do a best effort in rmdir */ |
| (void) list_delete_all(task_list, _rmdir_task, NULL); |
| } |
| |
| static int _get_task_pids(void *x, void *key) |
| { |
| task_cg_info_t *task_cg_info = (task_cg_info_t *)x; |
| foreach_pid_array_t *pid_array = key; |
| pid_t *pids = NULL; |
| int npids = 0; |
| |
| xassert(pid_array); |
| common_cgroup_get_pids(&task_cg_info->task_cg, &pids, &npids); |
| |
| if (pid_array->pids) { |
| xrecalloc(pid_array->pids, (pid_array->npids + npids), |
| sizeof(*pid_array->pids)); |
| memcpy((pid_array->pids + pid_array->npids), pids, |
| sizeof(*pid_array->pids) * npids); |
| pid_array->npids += npids; |
| } else { |
| pid_array->pids = pids; |
| pids = NULL; |
| pid_array->npids = npids; |
| } |
| xfree(pids); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _find_pid_task(void *x, void *key) |
| { |
| task_cg_info_t *task_cg_info = (task_cg_info_t *)x; |
| pid_t pid = *(pid_t *) key; |
| pid_t *pids = NULL; |
| int npids = 0; |
| bool found = false; |
| |
| if (common_cgroup_get_pids(&task_cg_info->task_cg, &pids, &npids) != |
| SLURM_SUCCESS) |
| return false; |
| |
| for (int i = 0; i < npids; i++) { |
| if (pids[i] == pid) { |
| found = true; |
| break; |
| } |
| } |
| |
| xfree(pids); |
| return found; |
| } |
| |
| /* |
| * Check the "populated" key in the cgroup.events file |
| * Returns CGROUP_EMPTY, CGROUP_POPULATED, or SLURM_ERROR. |
| */ |
| static int _is_cgroup_empty(xcgroup_t *cg) |
| { |
| char *events_content = NULL, *ptr; |
| int rc; |
| int populated = -1; |
| size_t size; |
| |
| /* Check if cgroup is empty in the first place. */ |
| if (common_cgroup_get_param(cg, "cgroup.events", &events_content, |
| &size) != SLURM_SUCCESS) { |
| error("Cannot read %s/cgroup.events", cg->path); |
| return SLURM_ERROR; |
| } |
| |
| if (!events_content) { |
| error("%s/cgroup.events is empty", cg->path); |
| return SLURM_ERROR; |
| } |
| |
| if (!(ptr = xstrstr(events_content, "populated"))) { |
| error("Could not find \"populated\" field in %s/cgroup.events: \"%s\"", |
| cg->path, events_content); |
| xfree(events_content); |
| return SLURM_ERROR; |
| } |
| |
| if ((rc = sscanf(ptr, "populated %u", &populated) != 1)) { |
| error("Could not find value for \"populated\" field in %s/cgroup.events (\"%s\"): %s", |
| cg->path, events_content, strerror(rc)); |
| xfree(events_content); |
| return SLURM_ERROR; |
| } |
| |
| xfree(events_content); |
| |
| switch (populated) { |
| case 0: |
| return CGROUP_EMPTY; |
| case 1: |
| return CGROUP_POPULATED; |
| default: |
| error("Cannot determine if %s is empty.", cg->path); |
| break; |
| } |
| return SLURM_ERROR; |
| } |
| |
| static void _wait_cgroup_empty(xcgroup_t *cg, int timeout_ms) |
| { |
| char *cgroup_events = NULL; |
| int rc, fd, wd, populated = -1; |
| struct pollfd pfd[1]; |
| |
| populated = _is_cgroup_empty(cg); |
| |
| if (populated == SLURM_ERROR) { |
| error("Cannot determine if %s is empty.", cg->path); |
| return; |
| } else if (populated == CGROUP_EMPTY) //We're done |
| return; |
| |
| /* |
| * Cgroup is not empty, so wait for a while just monitoring any change |
| * on cgroup.events. Changing populate from 1 to 0 is what we expect. |
| */ |
| |
| xstrfmtcat(cgroup_events, "%s/cgroup.events", cg->path); |
| |
| /* Initialize an inotify monitor */ |
| fd = inotify_init(); |
| if (fd < 0) { |
| error("Cannot initialize inotify for checking cgroup events: %m"); |
| return; |
| } |
| |
| /* Set the file and events we want to monitor. */ |
| wd = inotify_add_watch(fd, cgroup_events, IN_MODIFY); |
| if (wd < 0) { |
| error("Cannot add watch events to %s: %m", cgroup_events); |
| goto end_inotify; |
| } |
| |
| /* Wait for new events. */ |
| pfd[0].fd = fd; |
| pfd[0].events = POLLIN; |
| rc = poll(pfd, 1, timeout_ms); |
| |
| /* |
| * We don't really care about the event details, just check now if the |
| * cg event file contains what we're looking for. |
| */ |
| if (rc < 0) |
| error("Error polling for event in %s: %m", cgroup_events); |
| else if (rc == 0) |
| error("Timeout waiting for %s to become empty.", cgroup_events); |
| |
| /* Check if cgroup is empty again. */ |
| populated = _is_cgroup_empty(cg); |
| |
| if (populated == SLURM_ERROR) |
| error("Cannot determine if %s is empty.", cg->path); |
| else if (populated == CGROUP_POPULATED) |
| log_flag(CGROUP, "Cgroup %s is not empty.", cg->path); |
| |
| end_inotify: |
| close(fd); |
| xfree(cgroup_events); |
| } |
| |
| /* |
| * dbus is a batch system and asynchronous, so we cannot know when the scope |
| * will be ready unless we wait for the cgroup directories to be created and |
| * for the pid to show up in cgroup.procs. |
| * |
| * The waiting time will depend completely on the time systemd takes to complete |
| * such operations. |
| */ |
static int _wait_scope_ready(xcgroup_t scope_root, pid_t pid, uint32_t t)
{
	DEF_TIMERS;
	bool found = false;
	int rc, npids, retries = 0;
	pid_t *pids;
	uint32_t timeout = t * 1000; //msec to usec
	struct stat sb;
	struct timeval start_tv;

	START_TIMER;
	gettimeofday(&start_tv, NULL);

	/* Wait for the scope directory to show up. */
	do {
		rc = stat(scope_root.path, &sb);
		if (!rc)
			break;
		/* ENOENT just means systemd has not created it yet. */
		if ((rc < 0) && (errno != ENOENT)) {
			error("stat() error checking for %s after dbus call: %m",
			      scope_root.path);
			return SLURM_ERROR;
		}
		retries++;
		if (slurm_delta_tv(&start_tv) > timeout)
			goto dbus_timeout;
		/* Sleep ~10 ms between retries (poll as a portable sleep). */
		poll(NULL, 0, 10);
	} while (true);

	END_TIMER;
	log_flag(CGROUP, "Took %s and %d retries for scope dir %s to show up.",
		 TIME_STR, retries, scope_root.path);

	/*
	 * Wait for the pid to show up in cgroup.procs.
	 * Note: start_tv is not reset here, so 'timeout' bounds the total
	 * elapsed time of both waits combined.
	 */
	START_TIMER;
	retries = 0;
	do {
		common_cgroup_get_pids(&scope_root, &pids, &npids);
		for (int i = 0; i < npids; i++) {
			if (pids[i] == pid) {
				found = true;
				break;
			}
		}
		xfree(pids);
		retries++;
		if (!found) {
			if (slurm_delta_tv(&start_tv) > timeout)
				goto dbus_timeout;
			poll(NULL, 0, 10);
		}
	} while (!found);

	END_TIMER;
	log_flag(CGROUP, "Took %s and %d retries for pid %d to show up in %s/cgroup.procs.",
		 TIME_STR, retries, pid, scope_root.path);

	log_flag(CGROUP, "Scope initialization complete after %d msec",
		 (slurm_delta_tv(&start_tv)/1000));

	return SLURM_SUCCESS;
dbus_timeout:
	END_TIMER;
	error("Scope initialization timeout after %s", TIME_STR);
	return SLURM_ERROR;
}
| |
static int _init_stepd_system_scope(pid_t pid)
{
	/* "Waiting area" cgroup where new slurmstepds sit (see diagram). */
	char *system_dir = "/" SYSTEM_CGDIR;
	char *self_cg_path;

	if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_SYSTEM],
				 system_dir, (uid_t) 0, (gid_t) 0) !=
	    SLURM_SUCCESS) {
		error("unable to create system cgroup %s", system_dir);
		return SLURM_ERROR;
	}

	/* Actually mkdir the /system directory under our delegated root. */
	if (common_cgroup_instantiate(&int_cg[CG_LEVEL_SYSTEM]) !=
	    SLURM_SUCCESS) {
		error("Unable to instantiate system %s cgroup", system_dir);
		return SLURM_ERROR;
	}

	/* Move ourselves out of the root into the system leaf. */
	if (common_cgroup_move_process(&int_cg[CG_LEVEL_SYSTEM], pid) !=
	    SLURM_SUCCESS) {
		error("Unable to attach pid %d to %s cgroup.", pid, system_dir);
		return SLURM_ERROR;
	}

	/* Now check we're really where we belong to. */
	self_cg_path = _get_proc_cg_path("self");
	if (xstrcmp(self_cg_path, int_cg[CG_LEVEL_SYSTEM].path)) {
		error("Could not move slurmstepd pid %d to a Slurm's delegated cgroup. Should be in %s, we are in %s.",
		      pid, int_cg[CG_LEVEL_SYSTEM].path, self_cg_path);
		xfree(self_cg_path);
		return SLURM_ERROR;
	}
	xfree(self_cg_path);

	/*
	 * With our pid out of the root, subtree control can be enabled there
	 * (a cgroup with subtree control enabled cannot hold processes, see
	 * the note above _enable_subtree_control()).
	 */
	if (_enable_subtree_control(int_cg[CG_LEVEL_ROOT].path,
				    int_cg_ns.avail_controllers) !=
	    SLURM_SUCCESS) {
		error("Cannot enable subtree_control at the top level %s",
		      int_cg_ns.mnt_point);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
| |
| static int _init_new_scope(char *scope_path) |
| { |
| int rc; |
| |
| rc = mkdirpath(scope_path, 0755, true); |
| if (rc && (errno != EEXIST)) { |
| error("Could not create scope directory %s: %m", scope_path); |
| return SLURM_ERROR; |
| } |
| _enable_controllers(scope_path, int_cg_ns.avail_controllers); |
| log_flag(CGROUP, "Created %s", scope_path); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Talk to systemd through dbus to move the slurmstepd pid into the reserved |
| * scope for stepds and user processes. |
| */ |
| static int _init_new_scope_dbus(char *scope_path) |
| { |
| int status, pipe_fd[2]; |
| pid_t pid; |
| xcgroup_t sys_root, scope_root; |
| char *const argv[3] = { |
| (char *)conf->stepd_loc, |
| "infinity", |
| NULL }; |
| |
| if (pipe(pipe_fd)) |
| fatal("pipe() failed: %m"); |
| xassert(pipe_fd[0] > STDERR_FILENO); |
| xassert(pipe_fd[1] > STDERR_FILENO); |
| |
| pid = fork(); |
| if (pid < 0) |
| fatal("%s: cannot start slurmstepd infinity process", __func__); |
| else if (pid == 0) { |
| /* wait for signal from parent */ |
| if (close(pipe_fd[1])) |
| fatal("close(%u) failed: %m", pipe_fd[1]); |
| |
| safe_read(pipe_fd[0], &pid, sizeof(pid)); |
| |
| if (close(pipe_fd[0])) |
| fatal("close(%u) failed: %m", pipe_fd[0]); |
| |
| /* |
| * Uncouple ourselves from slurmd, so a signal sent to the |
| * slurmd process group won't kill slurmstepd infinity. This way |
| * the scope will remain forever and no further calls to |
| * dbus/systemd will be needed until the scope is manually |
| * stopped. |
| * |
| * This minimizes the interaction with systemd becoming less |
| * dependent on possible malfunctions it might have. |
| */ |
| if (xdaemon()) |
| _exit(127); |
| |
| /* Become slurmstepd infinity */ |
| execvp(argv[0], argv); |
| error("execvp of slurmstepd wait failed: %m"); |
| _exit(127); |
| } |
| |
| if (close(pipe_fd[0])) |
| fatal("close(%u) failed: %m", pipe_fd[0]); |
| |
| if (cgroup_dbus_attach_to_scope(pid, scope_path) != SLURM_SUCCESS) { |
| /* |
| * Systemd scope unit may already exist or is stuck, and |
| * the directory is not there!. |
| */ |
| kill(pid, SIGKILL); |
| waitpid(pid, &status, WNOHANG); |
| fatal("systemd scope for slurmstepd could not be set."); |
| } |
| |
| /* |
| * We need to wait for the scope to be created, and the child pid |
| * moved to the root, so we do not race with systemd. |
| * |
| * Experiments shown that depending on systemd load, it can be slow |
| * (>500ms) launching and executing the 'systemd job'. The 'job' will |
| * consist in internally creating the scope, mkdir the cgroup |
| * directories and finally move the pid. |
| * |
| * After *all* this work is done, then we can continue. |
| */ |
| scope_root.path = scope_path; |
| if (_wait_scope_ready(scope_root, pid, |
| slurm_cgroup_conf.systemd_timeout) |
| != SLURM_SUCCESS) { |
| kill(pid, SIGKILL); |
| waitpid(pid, &status, WNOHANG); |
| fatal("Scope init timed out, systemd might need cleanup with 'systemctl reset-failed', please consider increasing SystemdTimeout in cgroup.conf (SystemdTimeout=%"PRIu64").", |
| slurm_cgroup_conf.systemd_timeout); |
| } |
| |
| /* |
| * Assuming the scope is created, let's mkdir the /system dir which will |
| * allocate the sleep infinity pid. This way the slurmstepd scope won't |
| * be a leaf anymore and we'll be able to create more directories. |
| * _init_new_scope here is simply used as a mkdir. |
| */ |
| memset(&sys_root, 0, sizeof(sys_root)); |
| xstrfmtcat(sys_root.path, "%s/%s", scope_path, SYSTEM_CGDIR); |
| if (mkdirpath(sys_root.path, 0755, true) != SLURM_SUCCESS) { |
| xfree(sys_root.path); |
| kill(pid, SIGKILL); |
| waitpid(pid, &status, WNOHANG); |
| fatal("slurmstepd scope could not be set."); |
| } |
| |
| /* Success!, we got the system/ cg directory, move the child there. */ |
| if (common_cgroup_move_process(&sys_root, pid)) { |
| xfree(sys_root.path); |
| kill(pid, SIGKILL); |
| waitpid(pid, &status, WNOHANG); |
| fatal("Unable to move pid %d to system cgroup %s", pid, |
| sys_root.path); |
| } |
| common_cgroup_destroy(&sys_root); |
| |
| /* |
| * Wait for the infinity pid to be in the correct cgroup or further |
| * cgroup configuration will fail as we're at this point violating the |
| * no internal process constrain. |
| * |
| * To control resource distribution of a cgroup, the cgroup must create |
| * children directories and transfer all its processes to these |
| * children before enabling controllers in its cgroup.subtree_control |
| * file. |
| * |
| * As cgroupfs is sometimes slow, we cannot continue setting up this |
| * cgroup unless we guarantee the child are moved. |
| */ |
| if (!common_cgroup_wait_pid_moved(&scope_root, pid, scope_path)) { |
| kill(pid, SIGKILL); |
| waitpid(pid, &status, WNOHANG); |
| fatal("Timeout waiting for pid %d to leave %s", pid, |
| scope_path); |
| } |
| |
| /* Tell the child it can continue daemonizing itself. */ |
| safe_write(pipe_fd[1], &pid, sizeof(pid)); |
| if ((waitpid(pid, &status, 0) != pid) || WEXITSTATUS(status)) { |
| /* |
| * If we receive an error it means xdaemon() or execv() has |
| * failed. |
| */ |
| fatal("%s: slurmstepd infinity could not be executed.", |
| __func__); |
| } |
| |
| if (close(pipe_fd[1])) |
| fatal("close(%u) failed: %m", pipe_fd[1]); |
| |
| return SLURM_SUCCESS; |
| rwfail: |
| fatal("Unable to contact with child: %m"); |
| } |
| |
| /* |
| * If IgnoreSystemd=yes in cgroup.conf we do a mkdir in |
| * /sys/fs/cgroup/system.slice/<nodename>_slurmstepd or /slurmstepd if no |
| * MULTIPLE_SLURMD. |
| * |
| * Otherwise call dbus to talk to systemd and create a 'scope' which will in |
| * turn create the same cgroup directory. |
| * |
| * This directory will be used to place future slurmstepds. |
| */ |
| static int _init_slurmd_system_scope() |
| { |
| struct stat sb; |
| |
| /* Do only if the cgroup associated to the scope is not created yet. */ |
| if (!stat(stepd_scope_path, &sb)) |
| return SLURM_SUCCESS; |
| |
| /* |
| * If we don't want to use systemd at all just create the cgroup |
| * directories manually and return. |
| */ |
| if (slurm_cgroup_conf.ignore_systemd) |
| return _init_new_scope(stepd_scope_path); |
| |
| /* Call systemd through dbus to create a new scope. */ |
| if ((_init_new_scope_dbus(stepd_scope_path) != SLURM_SUCCESS)) { |
| if (slurm_cgroup_conf.ignore_systemd_on_failure) { |
| log_flag(CGROUP, "Could not create scope through systemd, doing it manually as IgnoreSystemdOnFailure is set in cgroup.conf"); |
| return _init_new_scope(stepd_scope_path); |
| } else { |
| error("cannot initialize cgroup directory for stepds: if the scope %s already exists it means the associated cgroup directories disappeared and the scope entered in a failed state. You should investigate why the scope lost its cgroup directories and possibly use the 'systemd reset-failed' command to fix this inconsistent systemd state.", |
| stepd_scope_path); |
| return SLURM_ERROR; |
| } |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| static void _get_parent_effective_cpus_mems(char **cpus_effective, |
| char **mems_effective, |
| xcgroup_t *cg) |
| { |
| size_t sz; |
| xcgroup_t parent_cg = { 0 }; |
| |
| /* Copy the settings from one level up on the hierarchy. */ |
| parent_cg.path = xdirname(cg->path); |
| |
| *cpus_effective = NULL; |
| *mems_effective = NULL; |
| |
| if (common_cgroup_get_param(&parent_cg, "cpuset.cpus.effective", |
| cpus_effective, &sz) != SLURM_SUCCESS) { |
| error("Cannot read scope %s/cpuset.cpus.effective", |
| parent_cg.path); |
| } |
| |
| if (common_cgroup_get_param(&parent_cg, "cpuset.mems.effective", |
| mems_effective, &sz) != SLURM_SUCCESS) { |
| error("Cannot read scope %s/cpuset.mems.effective", |
| parent_cg.path); |
| } |
| |
| common_cgroup_destroy(&parent_cg); |
| } |
| |
| /* |
| * Unset the limits applied to slurmd from _resource_spec_init(), namely |
| * cpuset.cpus, cpuset.mems and memory.max. If others are applied in the future |
| * this function can be extended to reset other limits. |
| * |
| * IN: cg - slurmd cgroup to reset the limits. |
| * RET: SLURM_SUCCESS or SLURM_ERROR if any limit could not be reset. |
| */ |
| static int _unset_cpu_mem_limits(xcgroup_t *cg) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| if (!bit_test(cg->ns->avail_controllers, CG_CPUS)) { |
| log_flag(CGROUP, "Not resetting cpuset limits in %s as %s controller is not enabled", |
| cg->path, ctl_names[CG_CPUS]); |
| } else if (!xstrcmp(cg->path, int_cg_ns.init_cg_path)) { |
| log_flag(CGROUP, "Not resetting cpuset limits in %s as we are already in the top cgroup", |
| cg->path); |
| } else { |
| /* |
| * Normally it should suffice to write a "" into cpuset.cpus to |
| * reset the allowed cpus, but for some reason this seems to be |
| * interpreted as an "empty" cpuset by the kernel and it does |
| * not allow us to do it when there are process in it (e.g. in |
| * a reconfigure when slurmd is started manually). Instead, the |
| * kernel allows us to specify the full range of cpus so we |
| * will grab here the parent cpuset.cpus and apply it to our |
| * cgroup. The same is done for cpuset.mems, as this interface |
| * suffers from the same problem. |
| */ |
| char *parent_cpus, *parent_mems; |
| int i; |
| _get_parent_effective_cpus_mems(&parent_cpus, &parent_mems, cg); |
| rc += common_cgroup_set_param(cg, "cpuset.cpus", parent_cpus); |
| rc += common_cgroup_set_param(cg, "cpuset.mems", parent_mems); |
| if ((i = strlen(parent_cpus))) |
| parent_cpus[i - 1] = '\0'; |
| if ((i = strlen(parent_mems))) |
| parent_mems[i - 1] = '\0'; |
| log_flag(CGROUP, "%s reset cpuset.cpus=%s cpuset.mems=%s", |
| cg->path, parent_cpus, parent_mems); |
| xfree(parent_cpus); |
| xfree(parent_mems); |
| } |
| |
| if (!bit_test(cg->ns->avail_controllers, CG_MEMORY)) { |
| log_flag(CGROUP, "Not resetting limits in %s as %s controller is not enabled", |
| cg->path, ctl_names[CG_MEMORY]); |
| } else { |
| rc += common_cgroup_set_param(cg, "memory.max", "max"); |
| log_flag(CGROUP, "%s reset memory.max=max", cg->path); |
| } |
| |
| return (rc) ? SLURM_ERROR : SLURM_SUCCESS; |
| } |
| |
| /* |
| * Slurmd started manually may not remain in the actual scope. Normally there |
| * are other pids there, like the terminal from where it's been launched, so |
| * slurmd would affect these pids. For example a CoreSpecCount of 1 would leave |
| * the bash terminal with only one core. |
| * |
| * Get out of there and put ourselves into a new home. This shouldn't happen on |
| * production systems. |
| */ |
| static int _migrate_to_stepd_scope() |
| { |
| char *new_home = NULL; |
| pid_t slurmd_pid = getpid(); |
| |
| bit_clear_all(int_cg_ns.avail_controllers); |
| xfree(int_cg_ns.mnt_point); |
| common_cgroup_destroy(&int_cg[CG_LEVEL_ROOT]); |
| |
| xstrfmtcat(new_home, "%s/slurmd", stepd_scope_path); |
| int_cg_ns.mnt_point = new_home; |
| |
| if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_ROOT], "", |
| (uid_t) 0, (gid_t) 0) != SLURM_SUCCESS) { |
| error("unable to create root cgroup"); |
| return SLURM_ERROR; |
| } |
| |
| if (common_cgroup_instantiate(&int_cg[CG_LEVEL_ROOT]) != |
| SLURM_SUCCESS) { |
| error("Unable to instantiate slurmd %s cgroup", new_home); |
| return SLURM_ERROR; |
| } |
| log_flag(CGROUP, "Created %s", new_home); |
| |
| /* |
| * Set invoc_id to empty string to indicate that from now on we should |
| * behave as if we were spawned by systemd. |
| */ |
| invoc_id = ""; |
| |
| if (_get_controllers(stepd_scope_path, int_cg_ns.avail_controllers) != |
| SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| if (_enable_subtree_control(stepd_scope_path, |
| int_cg_ns.avail_controllers) != |
| SLURM_SUCCESS) { |
| error("Cannot enable subtree_control at the top level %s", |
| int_cg_ns.mnt_point); |
| return SLURM_ERROR; |
| } |
| |
| if (common_cgroup_move_process(&int_cg[CG_LEVEL_ROOT], slurmd_pid) != |
| SLURM_SUCCESS) { |
| error("Unable to attach slurmd pid %d to %s cgroup.", |
| slurmd_pid, new_home); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static void _get_memory_events(uint64_t *job_kills, uint64_t *step_kills) |
| { |
| size_t sz; |
| char *mem_events = NULL, *ptr; |
| |
| /* |
| * memory.events: |
| * all fields in this file are hierarchical and the file modified event |
| * can be generated due to an event down the hierarchy. For the local |
| * events at the cgroup level we can check memory.events.local instead. |
| */ |
| |
| /* Get latest stats for the step */ |
| if (common_cgroup_get_param(&int_cg[CG_LEVEL_STEP_USER], |
| "memory.events", |
| &mem_events, &sz) != SLURM_SUCCESS) |
| error("Cannot read %s/memory.events", |
| int_cg[CG_LEVEL_STEP_USER].path); |
| |
| if (mem_events) { |
| if ((ptr = xstrstr(mem_events, "oom_kill "))) { |
| if (sscanf(ptr, "oom_kill %"PRIu64, step_kills) != 1) |
| error("Cannot read step's oom_kill counter from memory.events file."); |
| } |
| xfree(mem_events); |
| } |
| |
| /* Get stats for the job */ |
| if (common_cgroup_get_param(&int_cg[CG_LEVEL_JOB], |
| "memory.events", |
| &mem_events, &sz) != SLURM_SUCCESS) |
| error("Cannot read %s/memory.events", |
| int_cg[CG_LEVEL_STEP_USER].path); |
| |
| if (mem_events) { |
| if ((ptr = xstrstr(mem_events, "oom_kill "))) { |
| if (sscanf(ptr, "oom_kill %"PRIu64, job_kills) != 1) |
| error("Cannot read job's oom_kill counter from memory.events file."); |
| } |
| xfree(mem_events); |
| } |
| } |
| |
| static void _get_swap_events(uint64_t *job_swkills, uint64_t *step_swkills) |
| { |
| size_t sz; |
| char *mem_swap_events = NULL, *ptr; |
| |
| /* Get latest swap stats for the step */ |
| if (common_cgroup_get_param(&int_cg[CG_LEVEL_STEP_USER], |
| "memory.swap.events", |
| &mem_swap_events, &sz) != SLURM_SUCCESS) |
| error("Cannot read %s/memory.swap.events", |
| int_cg[CG_LEVEL_STEP_USER].path); |
| |
| if (mem_swap_events) { |
| if ((ptr = xstrstr(mem_swap_events, "fail "))) { |
| if (sscanf(ptr, "fail %"PRIu64, step_swkills) != 1) |
| error("Cannot read step's fail counter from memory.swap.events file."); |
| } |
| xfree(mem_swap_events); |
| } |
| |
| /* Get swap stats for the job */ |
| if (common_cgroup_get_param(&int_cg[CG_LEVEL_JOB], "memory.swap.events", |
| &mem_swap_events, &sz) != SLURM_SUCCESS) |
| error("Cannot read %s/memory.swap.events", |
| int_cg[CG_LEVEL_STEP_USER].path); |
| |
| if (mem_swap_events) { |
| if ((ptr = xstrstr(mem_swap_events, "fail "))) { |
| if (sscanf(ptr, "fail %"PRIu64, job_swkills) != 1) |
| error("Cannot read job's fail counter from memory.swap.events file."); |
| } |
| xfree(mem_swap_events); |
| } |
| } |
| |
| /* |
| * This function checks that all the processes contained in the cgroup cg |
| * belong to our namespace. |
| * |
| * That is checked by ensuring none of the pids contained in the cgroup.procs |
| * interface are 0, which would indicate that we cannot see the pid of that |
| * process, meaning this process belongs to another namespace. |
| * |
| * Trying to move a 0 in Cgroups moves yourself. |
| * |
| * IN cg - the cgroup we want to check for cgroup.procs not containing 0's |
| * RET - SLURM_ERROR if cgroup.procs could not be read or there are 0's. |
| * SLURM_SUCCESS otherwise. |
| */ |
| static int _check_cg_pids_correct_ns(xcgroup_t *cg) |
| { |
| pid_t *pids = NULL; |
| int npids = 0, rc = SLURM_SUCCESS; |
| |
| if (common_cgroup_get_pids(cg, &pids, &npids) != SLURM_SUCCESS) { |
| error("unable to get processes from %s cgroup", cg->path); |
| return SLURM_ERROR; |
| } |
| |
| for (int i = 0; i < npids; i++) { |
| if (pids[i] == 0) { |
| error("We detected a pid 0 which means you are in a cgroup namespace and a mounted cgroup but with pids from the host that we're not allowed to manage."); |
| rc = SLURM_ERROR; |
| break; |
| } |
| } |
| xfree(pids); |
| return rc; |
| } |
| |
| /* |
| * Move the pids from 'from' cgroup to 'to' cgroup and enable the controllers. |
| * |
| * Create a new cgroup in the path resulting of the concenation of |
| * int_cg_ns.mnt_point (normally /sys/fs/cgroup ) and the "to" parameter. |
| * |
| * Then get all the processes in the "from" cgroup.procs and move them to the |
| * new cgroup. |
| * |
| * Finally enable the subtree control on the "from" cgroup to ensure that no new |
| * processes will be put there, convert it to a cgroup "domain controller". |
| * |
| * On failure retry by waiting for the processes to show up in the new cgroup, |
| * then try again to enable subtree control. If that last one fails it returns |
| * an error. Is important to note that this function does not guarantee |
| * that all the process can be successfully moved, as it is inherently racy. |
| * It might happen that in between the common_cgroup_get_pids() and the movement |
| * of those to the new cgroup, new processes are spawned there, thus making the |
| * enable_subtree fail. We don't want to freeze the cgroup either as we might |
| * be freezing ourselves. |
| * |
| * IN from - origin cgroup where to move pids from. |
| * IN to - destination cgroup path to be created, set, and pids moved. |
| * RET rc - SLURM_SUCCESS if all pids could be read and moved into a new |
| * configured cgroup, error otherwise. |
| */ |
| static int _empty_pids(xcgroup_t *from, char *to) |
| { |
| pid_t *pids = NULL; |
| int npids = 0; |
| xcgroup_t dest; |
| bitstr_t *system_ctrls = bit_alloc(CG_CTL_CNT); |
| int rc = SLURM_ERROR; |
| |
| if (_get_controllers(slurm_cgroup_conf.cgroup_mountpoint, |
| system_ctrls) != SLURM_SUCCESS) { |
| error("Unable to get cgroup root controllers."); |
| goto fail; |
| } |
| |
| if (common_cgroup_create(&int_cg_ns, &dest, to, (uid_t) 0, (gid_t) 0) != |
| SLURM_SUCCESS) { |
| error("Unable to create cgroup structure for %s", to); |
| goto fail; |
| } |
| |
| if (common_cgroup_instantiate(&dest) != SLURM_SUCCESS) { |
| error("Unable to create cgroup %s", dest.path); |
| goto fail; |
| } |
| |
| if (common_cgroup_get_pids(from, &pids, &npids) != SLURM_SUCCESS) { |
| error("Unable to get pids from origin cgroup %s", from->path); |
| goto fail; |
| } |
| |
| for (int i = 0; i < npids; i++) { |
| if (common_cgroup_move_process(&dest, pids[i]) != |
| SLURM_SUCCESS) { |
| error("Unable to move process %d from %s to %s cgroup.", |
| pids[i], from->path, dest.path); |
| goto fail; |
| } |
| } |
| |
| if (_enable_subtree_control(from->path, system_ctrls)) { |
| error("Cannot enable subtree control in %s cgroup. Trying to wait for process movement: %m", |
| from->path); |
| for (int i = 0; i < npids; i++) { |
| if (!common_cgroup_wait_pid_moved(from, pids[i], |
| from->path)) { |
| error("Move pid %d from %s to %s failed.", |
| pids[i], from->path, dest.path); |
| goto fail; |
| } |
| } |
| if (_enable_subtree_control(from->path, system_ctrls)) { |
| error("Cannot enable subtree control for cgroup %s: %m", |
| from->path); |
| goto fail; |
| } |
| } |
| rc = SLURM_SUCCESS; |
| fail: |
| common_cgroup_destroy(&dest); |
| FREE_NULL_BITMAP(system_ctrls); |
| xfree(pids); |
| return rc; |
| } |
| |
| /* |
| * Initialize the cgroup plugin. Slurmd MUST be started by systemd and the |
| * option Delegate set to 'Yes' or equal to a string with the desired |
| * controllers we want to support in this system. If we are slurmd we're going |
| * to create a systemd scope for further slurmstepds. The scope is associated |
| * to a cgroup directory, and it will be delegated to us too. We need to |
| * separate it from slurmd because if we restart slurmd and there are living |
| * steps in the same directory, then slurmd could not be put in a non-leaf |
| * cgroup, and systemd will fail (no internal process constraint). |
| * Take in mind also we should not do anything upper in the hierarchy because of |
| * the single-writer architecture systemd imposes to us. The upper tree is |
| * completely under systemd control. |
| * |
| * We need to play the cgroup v2 game rules: |
| * |
| * - No Internal Process Constraint |
| * - Top-down Constraint |
| * |
| * And try to be compliant with systemd, or they will complain: |
| * |
| * - Single writer rule. |
| * |
| * Read cgroup v2 documentation for more info. |
| */ |
| extern int init(void) |
| { |
| int_cg_ns.avail_controllers = bit_alloc(CG_CTL_CNT); |
| step_active_cnt = 0; |
| FREE_NULL_LIST(task_list); |
| task_list = list_create(_free_task_cg_info); |
| debug("%s loaded", plugin_name); |
| return SLURM_SUCCESS; |
| } |
| |
| static bool _pid_in_root(char *pid_str) |
| { |
| char *cg_path, *tmp_str, file_path[PATH_MAX]; |
| bool rc = false; |
| |
| cg_path = _get_proc_cg_path(pid_str); |
| tmp_str = xdirname(cg_path); |
| xfree(cg_path); |
| cg_path = tmp_str; |
| tmp_str = NULL; |
| |
| if (snprintf(file_path, PATH_MAX, "%s/cgroup.procs", cg_path) >= |
| PATH_MAX) { |
| error("Could not generate cgroup path: %s", file_path); |
| goto end; |
| } |
| |
| /* If cgroup.procs is not found one level up, we are in the root */ |
| if (access(file_path, F_OK)) |
| rc = true; |
| |
| end: |
| xfree(cg_path); |
| return rc; |
| } |
| |
/*
 * Set up the root of our cgroup v2 hierarchy for this daemon.
 *
 * slurmd: validates the cgroup2 mount, prepares the stepd scope and resets
 * inherited cpu/memory limits. slurmstepd: adopts the scope path passed in by
 * slurmd and parks itself in the scope's system directory.
 *
 * IN scope_path - path of the scope cgroup (used directly by slurmstepd).
 * RET SLURM_SUCCESS or SLURM_ERROR.
 */
extern int cgroup_p_setup_scope(char *scope_path)
{
	/*
	 * Detect if we are started by systemd. Another way could be to check
	 * if our PPID=1, but we cannot rely on it because when starting slurmd
	 * with -D over a sshd session, slurmd will be reparented by 1, and
	 * doing this on a graphical session, it will be reparented by
	 * "systemd --user". So it is not a reliable check. Instead use
	 * the existence of INVOCATION_ID to know if the pid has been forked by
	 * systemd.
	 */
	invoc_id = getenv("INVOCATION_ID");

	if (!_is_cgroup2_mount(slurm_cgroup_conf.cgroup_mountpoint)) {
		fatal("%s is not a valid cgroup2 mountpoint",
		      slurm_cgroup_conf.cgroup_mountpoint);
	}

	/*
	 * Set our current root dir in our "internal cgroup namespace".
	 * We will create our tree and all directories from this root.
	 * In slurmstepd, we got it from slurmd at startup so no need to guess.
	 */
	if (running_in_slurmstepd()) {
		stepd_scope_path = xstrdup(scope_path);
		int_cg_ns.mnt_point = stepd_scope_path;
	} else
		_set_int_cg_ns();

	if (!int_cg_ns.mnt_point) {
		error("Cannot setup the cgroup namespace.");
		return SLURM_ERROR;
	}

	/* Setup the root cgroup object. */
	if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_ROOT], "",
				 (uid_t) 0, (gid_t) 0) != SLURM_SUCCESS) {
		error("unable to create root cgroup (%s)",
		      int_cg[CG_LEVEL_ROOT].path);
		return SLURM_ERROR;
	}

	/*
	 * Check whether there are pids in the root cgroup that do not belong to
	 * this namespace, and exit if so, as we cannot handle processes from
	 * another namespace.
	 */
	if (running_in_slurmd() &&
	    (_check_cg_pids_correct_ns(&int_cg[CG_LEVEL_ROOT]) !=
	     SLURM_SUCCESS)) {
		error("cgroup %s contains pids from outside of our pid namespace, so we cannot manage this cgroup.",
		      int_cg[CG_LEVEL_ROOT].path);
		return SLURM_ERROR;
	}

	/*
	 * Convert our false root into a workable root - best effort.
	 *
	 * Slurmd will detect when the root cgroup is not a real one. This can
	 * happen when we have been started in a cgroup namespaced container and
	 * our /sys/fs/cgroup is mapped to a non-root cgroup directory in the
	 * host, meaning it cannot have pids in cgroup.procs if there are
	 * subdirectories.
	 *
	 * As we're going to create a hierarchy, we need to move out the pids
	 * to a child directory, we've chosen /system for that.
	 *
	 * So move the pids away from the "false root" cgroup to /system.
	 *
	 * Only do that if IgnoreSystemd is set.
	 */
	if (running_in_slurmd() && cgroup_p_has_feature(CG_FALSE_ROOT) &&
	    slurm_cgroup_conf.ignore_systemd && _pid_in_root("self")) {
		if (_empty_pids(&int_cg[CG_LEVEL_ROOT], "/system") !=
		    SLURM_SUCCESS){
			error("cannot empty the false root cgroup (%s) of pids.",
			      int_cg[CG_LEVEL_ROOT].path);
			return SLURM_ERROR;
		}
	}
	/*
	 * Check available controllers in cgroup.controller, record them in our
	 * bitmap and enable them if EnableControllers option is set.
	 * We enable them manually just because we support CgroupIgnoreSystemd
	 * option. Theoretically when starting a unit with Delegate=yes, you will
	 * get all controllers available at your level.
	 */
	if (_setup_controllers() != SLURM_SUCCESS)
		return SLURM_ERROR;

	/*
	 * slurmd will setup a new home for future slurmstepds. Every stepd
	 * will emigrate to this new place.
	 */
	if (running_in_slurmd()) {
		if (_init_slurmd_system_scope() != SLURM_SUCCESS)
			return SLURM_ERROR;

		/*
		 * If we are not started by systemd we need to move out to not
		 * mess with the pids that may be in our actual cgroup.
		 */
		if (!invoc_id) {
			log_flag(CGROUP, "assuming slurmd has been started manually.");
			if (_migrate_to_stepd_scope() != SLURM_SUCCESS)
				return SLURM_ERROR;
		} else {
			log_flag(CGROUP, "INVOCATION_ID env var found. Assuming slurmd has been started by systemd.");
		}

		/*
		 * We need to unset any cpu and memory limits as we do not want
		 * to inherit previous limits. We cannot reset them later
		 * because _load_gres needs to see all the cpus. The CoreSpec
		 * initialization will happen afterwards and set whatever
		 * is needed.
		 */
		if (_unset_cpu_mem_limits(&int_cg[CG_LEVEL_ROOT]) !=
		    SLURM_SUCCESS) {
			error("Cannot reset %s cgroup limits.",
			      int_cg[CG_LEVEL_ROOT].path);
			return SLURM_ERROR;
		}
	}

	if (running_in_slurmstepd()) {
		/*
		 * We expect slurmd to already have set our scope directory.
		 * Move ourselves in the system subdirectory, which is a
		 * temporary 'parking' until we have not created the job
		 * hierarchy.
		 */
		if (_init_stepd_system_scope(getpid()) != SLURM_SUCCESS)
			return SLURM_ERROR;
	}

	/*
	 * If we're slurmd we're all set and able to constrain things, i.e.
	 * CoreSpec* and MemSpec*.
	 *
	 * If we are a new slurmstepd we are ready now to create job steps. In
	 * that case, since we're still in the temporary "system" directory,
	 * we will need move ourselves out to a new job directory and then
	 * create int_cg[CG_LEVEL_ROOT].path/job_x/step_x.
	 */
	return SLURM_SUCCESS;
}
| |
extern void fini(void)
{
	/*
	 * Clear up the namespace and cgroups memory. Don't rmdir anything since
	 * we may not be stopping yet. When the process terminates systemd will
	 * remove the remaining directories.
	 */
	FREE_NULL_BITMAP(int_cg_ns.avail_controllers);
	/* Free the cgroup objects before tearing down their namespace. */
	common_cgroup_destroy(&int_cg[CG_LEVEL_SYSTEM]);
	common_cgroup_destroy(&int_cg[CG_LEVEL_ROOT]);
	common_cgroup_ns_destroy(&int_cg_ns);
	FREE_NULL_LIST(task_list);
	/* Release the eBPF device-constraint programs (cgroup v2 'devices'). */
	free_ebpf_prog(&p[CG_LEVEL_JOB]);
	free_ebpf_prog(&p[CG_LEVEL_STEP_USER]);
	xfree(stepd_scope_path);

	debug("unloading %s", plugin_name);
}
| |
| /* |
| * Unlike in Legacy mode (v1) where we needed to create a directory for each |
| * controller, in Unified mode this function will do almost nothing except for |
| * some sanity checks. That's because hierarchy is unified into the same path. |
| * and the controllers will be enabled when we create the hierarchy. The only |
| * controller that may need a real init is the 'devices', which in Unified is |
| * not a real controller, but instead we need to register an eBPF program. |
| */ |
| extern int cgroup_p_initialize(cgroup_ctl_type_t ctl) |
| { |
| switch (ctl) { |
| case CG_DEVICES: |
| init_ebpf_prog(&p[CG_LEVEL_JOB]); |
| init_ebpf_prog(&p[CG_LEVEL_STEP_USER]); |
| break; |
| case CG_TRACK: |
| /* This is not a controller in Cgroup v2.*/ |
| break; |
| default: |
| if (!bit_test(int_cg_ns.avail_controllers, ctl)) { |
| error("%s cgroup controller is not available.", |
| ctl_names[ctl]); |
| return SLURM_ERROR; |
| } |
| |
| if (running_in_slurmd()) { |
| bitstr_t *scope_ctrls = bit_alloc(CG_CTL_CNT); |
| _get_controllers(stepd_scope_path, scope_ctrls); |
| if (!bit_test(scope_ctrls, ctl)) { |
| error("%s cgroup controller is not available for %s.", |
| ctl_names[ctl], stepd_scope_path); |
| FREE_NULL_BITMAP(scope_ctrls); |
| return SLURM_ERROR; |
| } |
| FREE_NULL_BITMAP(scope_ctrls); |
| } |
| break; |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * As part of the initialization, the slurmd directory is already created, so |
| * this function will remain empty. |
| */ |
| extern int cgroup_p_system_create(cgroup_ctl_type_t ctl) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Slurmd will live in its own cgroup, not sharing anything with slurmstepd. |
| * This means there's no reason to implement this function in v2. |
| * Also slurmstepd is put into the user's hierarchy (see graph) and is not |
| * affected by CoreSpec or MemSpec. |
| */ |
| extern int cgroup_p_system_addto(cgroup_ctl_type_t ctl, pid_t *pids, int npids) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * There's no need to do any cleanup, when systemd terminates the cgroup is |
| * automatically removed by systemd. |
| */ |
| extern int cgroup_p_system_destroy(cgroup_ctl_type_t ctl) |
| { |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Create the step hierarchy and move the stepd process into it. Further forked |
| * processes will be created in the step directory as child. We need to respect |
| * the cgroup v2 Top-Down constraint to not add pids to non-leaf cgroups. |
| * |
| * We create two directories per step because we need to put the stepd into its |
| * specific slurm/ dir, otherwise suspending/constraining the user cgroup would |
| * also suspend or constrain the stepd. |
| * |
| * step_x/slurm (for slurm processes, slurmstepd) |
| * step_x/user (for users processes, tasks) |
| * |
| * No need to cleanup the directories on error because when a job ends |
| * systemd does the cleanup automatically. |
| * |
| * Note that CoreSpec and/or MemSpec does not affect slurmstepd. |
| */ |
| extern int cgroup_p_step_create(cgroup_ctl_type_t ctl, stepd_step_rec_t *step) |
| { |
| int rc = SLURM_SUCCESS; |
| char *new_path = NULL; |
| char tmp_char[64]; |
| |
| /* |
| * Lock the root cgroup so we don't race with other steps that are being |
| * terminated and trying to destroy the job_x directory. |
| */ |
| if (common_cgroup_lock(&int_cg[CG_LEVEL_ROOT]) != SLURM_SUCCESS) { |
| error("common_cgroup_lock error (%s)", ctl_names[ctl]); |
| return SLURM_ERROR; |
| } |
| |
| /* Don't let other plugins destroy our structs. */ |
| step_active_cnt++; |
| |
| /* Job cgroup */ |
| xstrfmtcat(new_path, "/job_%u", step->step_id.job_id); |
| if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_JOB], |
| new_path, 0, 0) != SLURM_SUCCESS) { |
| error("unable to create job %u cgroup", step->step_id.job_id); |
| rc = SLURM_ERROR; |
| goto endit; |
| } |
| if (common_cgroup_instantiate(&int_cg[CG_LEVEL_JOB]) != SLURM_SUCCESS) { |
| common_cgroup_destroy(&int_cg[CG_LEVEL_JOB]); |
| error("unable to instantiate job %u cgroup", |
| step->step_id.job_id); |
| rc = SLURM_ERROR; |
| goto endit; |
| } |
| xfree(new_path); |
| _enable_subtree_control(int_cg[CG_LEVEL_JOB].path, |
| int_cg_ns.avail_controllers); |
| |
| /* Step cgroup */ |
| xstrfmtcat(new_path, "%s/step_%s", int_cg[CG_LEVEL_JOB].name, |
| log_build_step_id_str(&step->step_id, tmp_char, |
| sizeof(tmp_char), |
| STEP_ID_FLAG_NO_PREFIX | |
| STEP_ID_FLAG_NO_JOB)); |
| |
| if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_STEP], |
| new_path, 0, 0) != SLURM_SUCCESS) { |
| error("unable to create step %ps cgroup", &step->step_id); |
| rc = SLURM_ERROR; |
| goto endit; |
| } |
| if (common_cgroup_instantiate(&int_cg[CG_LEVEL_STEP]) != |
| SLURM_SUCCESS) { |
| common_cgroup_destroy(&int_cg[CG_LEVEL_STEP]); |
| error("unable to instantiate step %ps cgroup", &step->step_id); |
| rc = SLURM_ERROR; |
| goto endit; |
| } |
| xfree(new_path); |
| _enable_subtree_control(int_cg[CG_LEVEL_STEP].path, |
| int_cg_ns.avail_controllers); |
| |
| /* |
| * We have our stepd directory already into job_x, from now one nobody |
| * can destroy this job directory. We're safe. |
| */ |
| common_cgroup_unlock(&int_cg[CG_LEVEL_ROOT]); |
| |
| /* Step User processes cgroup */ |
| xstrfmtcat(new_path, "%s/user", int_cg[CG_LEVEL_STEP].name); |
| if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_STEP_USER], |
| new_path, 0, 0) != SLURM_SUCCESS) { |
| error("unable to create step %ps user procs cgroup", |
| &step->step_id); |
| rc = SLURM_ERROR; |
| goto endit; |
| } |
| if (common_cgroup_instantiate(&int_cg[CG_LEVEL_STEP_USER]) != |
| SLURM_SUCCESS) { |
| common_cgroup_destroy(&int_cg[CG_LEVEL_STEP_USER]); |
| error("unable to instantiate step %ps user procs cgroup", |
| &step->step_id); |
| rc = SLURM_ERROR; |
| goto endit; |
| } |
| xfree(new_path); |
| _enable_subtree_control(int_cg[CG_LEVEL_STEP_USER].path, |
| int_cg_ns.avail_controllers); |
| |
| /* |
| * Step Slurm processes cgroup |
| * Do not enable subtree control at this level since this is a leaf. |
| */ |
| xstrfmtcat(new_path, "%s/slurm", int_cg[CG_LEVEL_STEP].name); |
| if (common_cgroup_create(&int_cg_ns, &int_cg[CG_LEVEL_STEP_SLURM], |
| new_path, 0, 0) != SLURM_SUCCESS) { |
| error("unable to create step %ps slurm procs cgroup", |
| &step->step_id); |
| rc = SLURM_ERROR; |
| goto endit; |
| } |
| if (common_cgroup_instantiate(&int_cg[CG_LEVEL_STEP_SLURM]) != |
| SLURM_SUCCESS) { |
| common_cgroup_destroy(&int_cg[CG_LEVEL_STEP_SLURM]); |
| error("unable to instantiate step %ps slurm procs cgroup", |
| &step->step_id); |
| rc = SLURM_ERROR; |
| goto endit; |
| } |
| xfree(new_path); |
| |
	/* Place this stepd in its correct cgroup. */
| if (common_cgroup_move_process(&int_cg[CG_LEVEL_STEP_SLURM], |
| step->jmgr_pid) != SLURM_SUCCESS) { |
| error("unable to move stepd pid to its dedicated cgroup"); |
| rc = SLURM_ERROR; |
| } |
| |
| endit: |
| xfree(new_path); |
| if (rc != SLURM_SUCCESS) |
| step_active_cnt--; |
| return rc; |
| } |
| |
| /* |
| * Move a pid to a specific cgroup. It needs to be a leaf, we cannot move |
| * a pid to an intermediate directory in the cgroup hierarchy. Since we always |
| * work at task level, we will add this pid to the special task task_4294967293. |
| * |
| * Future: If in cgroup v2 we want to be able to enable/disable controllers for |
| * the slurmstepd pid, we need to add here the logic when stepd pid is detected. |
| * By default, all controllers are enabled for slurmstepd cgroup. |
| * |
| * - Top-down Constraint |
| * - No Internal Process Constraint |
| * |
| * Read cgroup v2 documentation for more info. |
| */ |
| extern int cgroup_p_step_addto(cgroup_ctl_type_t ctl, pid_t *pids, int npids) |
| { |
| int rc = SLURM_SUCCESS; |
| pid_t stepd_pid = getpid(); |
| |
| for (int i = 0; i < npids; i++) { |
| /* Ignore any possible movement of slurmstepd */ |
| if (pids[i] == stepd_pid) |
| continue; |
| if (cgroup_p_task_addto(ctl, NULL, pids[i], |
| task_special_id) != SLURM_SUCCESS) |
| rc = SLURM_ERROR; |
| } |
| return rc; |
| } |
| |
| /* |
| * Read the cgroup.procs of the leafs of this step. |
| * |
| * - count the pids of slurm/ directory |
| * - for all task_x dir: |
| * read task_x/cgroup.procs and add them into **pids |
| */ |
| extern int cgroup_p_step_get_pids(pid_t **pids, int *npids) |
| { |
| foreach_pid_array_t pid_array; |
| |
| memset(&pid_array, 0, sizeof(pid_array)); |
| |
| /* Include the slurm processes (stepd) pids too. */ |
| common_cgroup_get_pids(&int_cg[CG_LEVEL_STEP_SLURM], |
| &pid_array.pids, &pid_array.npids); |
| |
| list_for_each(task_list, _get_task_pids, &pid_array); |
| *npids = pid_array.npids; |
| *pids = pid_array.pids; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* Freeze the user processes of this step */ |
| extern int cgroup_p_step_suspend(void) |
| { |
| /* This plugin is unloaded. */ |
| if (!int_cg[CG_LEVEL_STEP_USER].path) |
| return SLURM_SUCCESS; |
| |
| /* |
| * Freezing of the cgroup may take some time; when this action is |
| * completed, the "frozen" value in the cgroup.events control file will |
| * be updated to "1" and the corresponding notification will be issued. |
| */ |
| return common_cgroup_set_param(&int_cg[CG_LEVEL_STEP_USER], |
| "cgroup.freeze", "1"); |
| } |
| |
| /* Resume the user processes of this step */ |
| extern int cgroup_p_step_resume(void) |
| { |
| /* This plugin is unloaded. */ |
| if (!int_cg[CG_LEVEL_STEP_USER].path) |
| return SLURM_SUCCESS; |
| |
| return common_cgroup_set_param(&int_cg[CG_LEVEL_STEP_USER], |
| "cgroup.freeze", "0"); |
| } |
| |
| /* |
| * Destroy the step cgroup. We need to move out ourselves to the root of |
| * the cgroup filesystem first. |
| */ |
| extern int cgroup_p_step_destroy(cgroup_ctl_type_t ctl) |
| { |
| int rc = SLURM_SUCCESS; |
| xcgroup_t init_root; |
| |
| /* |
| * Only destroy the step if we're the only ones using it. Log it unless |
| * loaded from slurmd, where we will not create any step but call fini. |
| */ |
| if (step_active_cnt == 0) { |
| error("called without a previous step create. This shouldn't happen!"); |
| return SLURM_SUCCESS; |
| } |
| |
| if (step_active_cnt > 1) { |
| step_active_cnt--; |
| log_flag(CGROUP, "Not destroying %s step dir, resource busy by %d other plugin", |
| ctl_names[ctl], step_active_cnt); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Lock the root cgroup so we don't race with other steps that are being |
| * started and trying to create things inside job_x directory. |
| */ |
| if (common_cgroup_lock(&int_cg[CG_LEVEL_ROOT]) != SLURM_SUCCESS) { |
| error("common_cgroup_lock error (%s)", ctl_names[ctl]); |
| return SLURM_ERROR; |
| } |
| |
| /* |
| * FUTURE: |
| * Here we can implement a recursive kill of all pids in the step. |
| */ |
| |
| /* |
| * Move ourselves to the CGROUP SYETEM level. This is the waiting area |
| * for new Slurmstepd process which do not have job folders yet, or for |
| * jobs that are ending execution. This directory also contains the |
| * "stepd infinity" process to keep the scope alive. |
| * |
| * This level is a leaf. We are not violating the no-internal-processes |
| * constrain. |
| * |
| * Moving the process here instead of to the cgroup root |
| * (typically /sys/fs/cgroup) will prevent problems when running into |
| * containerized environments, where cgroupfs root might not be |
| * writeable. |
| */ |
| memset(&init_root, 0, sizeof(init_root)); |
| init_root.path = xstrdup(int_cg[CG_LEVEL_SYSTEM].path); |
| rc = common_cgroup_move_process(&init_root, getpid()); |
| if (rc != SLURM_SUCCESS) { |
| error("Unable to move pid %d to system cgroup %s", getpid(), |
| init_root.path); |
| goto end; |
| } |
| /* Wait for this cgroup to be empty, 1 second */ |
| _wait_cgroup_empty(&int_cg[CG_LEVEL_STEP_SLURM], 1000); |
| |
| /* Remove any possible task directories first */ |
| _all_tasks_destroy(); |
| |
| /* Rmdir this job's stepd cgroup */ |
| if ((rc = common_cgroup_delete(&int_cg[CG_LEVEL_STEP_SLURM])) != |
| SLURM_SUCCESS) { |
| debug2("unable to remove slurm's step cgroup (%s): %m", |
| int_cg[CG_LEVEL_STEP_SLURM].path); |
| goto end; |
| } |
| common_cgroup_destroy(&int_cg[CG_LEVEL_STEP_SLURM]); |
| |
| /* Rmdir this job's user processes cgroup */ |
| if ((rc = common_cgroup_delete(&int_cg[CG_LEVEL_STEP_USER])) != |
| SLURM_SUCCESS) { |
| debug2("unable to remove user's step cgroup (%s): %m", |
| int_cg[CG_LEVEL_STEP_USER].path); |
| goto end; |
| } |
| common_cgroup_destroy(&int_cg[CG_LEVEL_STEP_USER]); |
| |
| /* Rmdir this step's processes cgroup */ |
| if ((rc = common_cgroup_delete(&int_cg[CG_LEVEL_STEP])) != |
| SLURM_SUCCESS) { |
| debug2("unable to remove step cgroup (%s): %m", |
| int_cg[CG_LEVEL_STEP].path); |
| goto end; |
| } |
| common_cgroup_destroy(&int_cg[CG_LEVEL_STEP]); |
| |
| /* |
| * That's a best try to rmdir if no more steps are in this job, |
| * it must not fail on error because other steps can still be alive. |
| */ |
| if (common_cgroup_delete(&int_cg[CG_LEVEL_JOB]) != SLURM_SUCCESS) { |
| debug2("still unable to remove job's step cgroup (%s): %m", |
| int_cg[CG_LEVEL_JOB].path); |
| goto end; |
| } |
| common_cgroup_destroy(&int_cg[CG_LEVEL_JOB]); |
| step_active_cnt = 0; |
| end: |
| common_cgroup_unlock(&int_cg[CG_LEVEL_ROOT]); |
| common_cgroup_destroy(&init_root); |
| return rc; |
| } |
| |
| /* |
| * Return true if the user pid is in this step/task cgroup. |
| * |
| * We just need to get the pids from the task_X directories and from the slurm |
| * processes cgroup, since these will be the only leafs we'll have. |
| */ |
| extern bool cgroup_p_has_pid(pid_t pid) |
| { |
| task_cg_info_t *task_cg_info; |
| pid_t *pids_slurm = NULL; |
| int npids_slurm = 0, i; |
| |
| task_cg_info = list_find_first(task_list, _find_pid_task, &pid); |
| |
| if (task_cg_info) |
| return true; |
| |
| /* Look for in the slurm processes cgroup too. */ |
| if (common_cgroup_get_pids(&int_cg[CG_LEVEL_STEP_SLURM], |
| &pids_slurm, &npids_slurm) != |
| SLURM_SUCCESS) |
| return false; |
| |
| for (i = 0; i < npids_slurm; i++) { |
| if (pids_slurm[i] == pid) { |
| xfree(pids_slurm); |
| return true; |
| } |
| } |
| |
| xfree(pids_slurm); |
| return false; |
| } |
| |
/*
 * Write a resource limit into the cgroup directory of the given hierarchy
 * level. CG_CPUS and CG_MEMORY limits are written immediately to the cgroup
 * interface files; CG_DEVICES limits are only accumulated into the level's
 * eBPF program and take effect when cgroup_p_constrain_apply() is called.
 */
extern int cgroup_p_constrain_set(cgroup_ctl_type_t ctl, cgroup_level_t level,
				  cgroup_limits_t *limits)
{
	int rc = SLURM_SUCCESS;
	bpf_program_t *program = NULL;
	task_cg_info_t *task_cg_info;
	char *dev_id_str = NULL;
	uint32_t bpf_dev_type = NO_VAL;

	/*
	 * cgroup/v1 legacy compatibility: We have no such levels in cgroup/v2
	 * but we may still get calls for them.
	 */
	if (level == CG_LEVEL_USER)
		return SLURM_SUCCESS;

	if (level == CG_LEVEL_SLURM)
		level = CG_LEVEL_ROOT;

	/* This is for CoreSpec* and MemSpec* for slurmd */
	if (level == CG_LEVEL_SYSTEM)
		level = CG_LEVEL_ROOT;

	/*
	 * Our real step level is the level for user processes. This will make
	 * that the slurmstepd is never constrained in its own cgroup, which is
	 * something we want. Instead, slurmstepd will be part of the job limit.
	 * Note that a step which initializes pmi, could cause slurmstepd to
	 * grow, and we don't want this to be part of the step, but be part of
	 * the job.
	 */
	if (level == CG_LEVEL_STEP)
		level = CG_LEVEL_STEP_USER;

	if (!limits)
		return SLURM_ERROR;

	switch (ctl) {
	case CG_TRACK:
		/* Not implemented. */
		break;
	case CG_CPUS:
		/* NULL fields are simply skipped, not treated as errors. */
		if (limits->allow_cores &&
		    common_cgroup_set_param(
			    &int_cg[level],
			    "cpuset.cpus",
			    limits->allow_cores) != SLURM_SUCCESS) {
			rc = SLURM_ERROR;
		}
		if (limits->allow_mems &&
		    common_cgroup_set_param(
			    &int_cg[level],
			    "cpuset.mems",
			    limits->allow_mems) != SLURM_SUCCESS) {
			rc = SLURM_ERROR;
		}
		break;
	case CG_MEMORY:
		/* NO_VAL64 means "limit not requested" and is skipped. */
		if ((limits->limit_in_bytes != NO_VAL64) &&
		    common_cgroup_set_uint64_param(
			    &int_cg[level],
			    "memory.max",
			    limits->limit_in_bytes) != SLURM_SUCCESS) {
			rc = SLURM_ERROR;
		}
		if ((limits->soft_limit_in_bytes != NO_VAL64) &&
		    common_cgroup_set_uint64_param(
			    &int_cg[level],
			    "memory.high",
			    limits->soft_limit_in_bytes) != SLURM_SUCCESS) {
			rc = SLURM_ERROR;
		}
		/*
		 * cgroup/v2 accounts swap separately from memory, so the
		 * mem+swap limit is translated into a swap-only limit.
		 * NOTE(review): this assumes limit_in_bytes is also set
		 * whenever memsw_limit_in_bytes is set; otherwise the
		 * subtraction would underflow — confirm with callers.
		 */
		if ((limits->memsw_limit_in_bytes != NO_VAL64) &&
		    common_cgroup_set_uint64_param(
			    &int_cg[level],
			    "memory.swap.max",
			    (limits->memsw_limit_in_bytes -
			     limits->limit_in_bytes)) != SLURM_SUCCESS) {
			rc = SLURM_ERROR;
		}
		break;
	case CG_DEVICES:
		/*
		 * Set program to point to the needed bpf_program_t depending on
		 * the hierarchy level.
		 */
		switch (level) {
		case CG_LEVEL_JOB:
		case CG_LEVEL_STEP_USER:
			program = &(p[level]);
			break;
		case CG_LEVEL_TASK:
			if (!(task_cg_info = list_find_first(
				      task_list,
				      _find_task_cg_info,
				      &limits->taskid))) {
				error("No task found with id %u, this should never happen",
				      limits->taskid);
				return SLURM_ERROR;
			}
			program = &(task_cg_info->p);
			break;
		default:
			/* program stays NULL and is rejected just below. */
			error("unknown hierarchy level %d", level);
			break;
		}
		if (!program) {
			error("Could not find a bpf program to use at level %d",
			      level);
			return SLURM_ERROR;
		}

		dev_id_str = gres_device_id2str(&limits->device);
		if (limits->allow_device)
			log_flag(CGROUP, "Allowing access to device (%s)",
				 dev_id_str);
		else
			log_flag(CGROUP, "Denying access to device (%s)",
				 dev_id_str);
		xfree(dev_id_str);

		/*
		 * Determine the correct BPF device type. If the device is
		 * neither block nor char, bpf_dev_type remains NO_VAL.
		 */
		if (limits->device.type == DEV_TYPE_BLOCK)
			bpf_dev_type = BPF_DEVCG_DEV_BLOCK;
		else if (limits->device.type == DEV_TYPE_CHAR)
			bpf_dev_type = BPF_DEVCG_DEV_CHAR;

		/* Accumulate the rule; it is loaded later by constrain_apply. */
		rc = add_device_ebpf_prog(program, bpf_dev_type,
					  limits->device.major,
					  limits->device.minor,
					  limits->allow_device);
		break;
	default:
		error("cgroup controller %u not supported", ctl);
		rc = SLURM_ERROR;
		break;
	}

	return rc;
}
| |
| /* |
| * Apply the device constrain limits, this is only used with cgroupv2 as there |
| * is the need of loading and attaching the eBPF program to the cgroup. |
| * It closes, loads and attach the bpf_program to the corresponding cgroup using |
| * level and task_id, task_id is only used in CG_LEVEL_TASK level. |
| */ |
extern int cgroup_p_constrain_apply(cgroup_ctl_type_t ctl, cgroup_level_t level,
				    uint32_t task_id)
{
	bpf_program_t *program = NULL;
	task_cg_info_t *task_cg_info;
	char *cgroup_path = NULL;

	/*
	 * cgroup/v1 legacy compatibility: We have no such levels in cgroup/v2
	 * but we may still get calls for them.
	 */
	if (level == CG_LEVEL_USER)
		return SLURM_SUCCESS;

	if (level == CG_LEVEL_SLURM)
		level = CG_LEVEL_ROOT;
	/*
	 * Our real step level is the level for user processes. This will make
	 * that the slurmstepd is never constrained in its own cgroup, which is
	 * something we want. Instead, slurmstepd will be part of the job limit.
	 * Note that a step which initializes pmi, could cause slurmstepd to
	 * grow, and we don't want this to be part of the step, but be part of
	 * the job.
	 */
	if (level == CG_LEVEL_STEP)
		level = CG_LEVEL_STEP_USER;

	/* Only used in devices cgroup restriction */
	switch (ctl) {
	case CG_DEVICES:
		/*
		 * Set program to point to the needed bpf_program_t depending on
		 * the level and the task_id.
		 */
		if (level == CG_LEVEL_STEP_USER || level == CG_LEVEL_JOB) {
			program = &(p[level]);
			cgroup_path = int_cg[level].path;
		}

		if (level == CG_LEVEL_TASK) {
			if (!(task_cg_info = list_find_first(task_list,
							     _find_task_cg_info,
							     &task_id))) {
				error("No task found with id %u, this should never happen",
				      task_id);
				return SLURM_ERROR;
			}
			program = &(task_cg_info->p);
			cgroup_path = task_cg_info->task_cg.path;
		}

		/* program is still NULL for any other (unexpected) level. */
		if (!program) {
			error("EBPF program with task_id %u does not exist",
			      task_id);
			return SLURM_ERROR;
		}

		/*
		 * Only load the program if it has more instructions than the
		 * initial ones.
		 */
		if (program->n_inst > INIT_INST) {
			log_flag(CGROUP,"EBPF Closing and loading bpf program into %s",
				 cgroup_path);
			/* Set the default action. */
			close_ebpf_prog(program, EBPF_ACCEPT);
			/*
			 * Load the ebpf program into the cgroup without the
			 * override flag if we are at TASK level, as this is the
			 * last cgroup in the hierarchy.
			 */
			return load_ebpf_prog(program, cgroup_path,
					      (level != CG_LEVEL_TASK));
		} else {
			log_flag(CGROUP, "EBPF Not loading the program into %s because it is a noop",
				 cgroup_path);
		}
		break;
	default:
		error("cgroup controller %u not supported", ctl);
		return SLURM_ERROR;
		break;
	}

	return SLURM_SUCCESS;
}
| |
/*
 * Return the path of the stepd scope. The pointer to the module's internal
 * string is returned directly (no copy), so callers must not free it.
 */
extern char *cgroup_p_get_scope_path(void)
{
	return stepd_scope_path;
}
| |
/*
 * Walk up the cgroup hierarchy starting at cg->path and fill
 * limits->limit_in_bytes with the first effective memory.max value found.
 * The walk stops at "/" or at the first ancestor without a memory
 * controller. If no ancestor sets a concrete limit ("max" everywhere),
 * limits->limit_in_bytes is left untouched.
 *
 * NOTE: cg->path is modified in place (replaced by the ancestor's path)
 * while recursing; callers pass a disposable copy.
 */
static void _get_mem_recursive(xcgroup_t *cg, cgroup_limits_t *limits)
{
	char *mem_max = NULL, *tmp_str = NULL, file_path[PATH_MAX];
	size_t mem_sz;

	/* Reached the filesystem root: stop. */
	if (!xstrcmp(cg->path, "/"))
		goto end;

	/*
	 * Break when there is no memory controller anymore.
	 *
	 * We check if the file exists before getting its value because at the
	 * moment we do not have proper error propagation and common_get_param
	 * will emit an error(), which in our case it would just be a
	 * verification and not an error.
	 */
	snprintf(file_path, PATH_MAX, "%s/memory.max", cg->path);
	if (access(file_path, F_OK)) {
		log_flag(CGROUP, "Reached %s cgroup without memory controller",
			 cg->path);
		goto end;
	}

	if (common_cgroup_get_param(cg, "memory.max", &mem_max, &mem_sz) !=
	    SLURM_SUCCESS)
		goto end;

	/* "max" means unlimited here, so check the ancestor instead. */
	if (xstrstr(mem_max, "max")) {
		/* Replace cg->path with its parent directory and recurse. */
		tmp_str = xdirname(cg->path);
		xfree(cg->path);
		cg->path = tmp_str;
		_get_mem_recursive(cg, limits);
		if (limits->limit_in_bytes != NO_VAL64)
			goto end;
	} else {
		/* found it! Strip the trailing newline and parse the value. */
		mem_max[mem_sz - 1] = '\0';
		limits->limit_in_bytes = slurm_atoull(mem_max);
	}
end:
	xfree(mem_max);
}
| |
/*
 * Read back the effective limits of the given controller at the given
 * hierarchy level. Returns a freshly-allocated cgroup_limits_t the caller
 * must release with cgroup_free_limits(), or NULL on error or for
 * unsupported controllers.
 */
extern cgroup_limits_t *cgroup_p_constrain_get(cgroup_ctl_type_t ctl,
					       cgroup_level_t level)
{
	cgroup_limits_t *limits;
	xcgroup_t tmp_cg = { 0 };

	/*
	 * cgroup/v1 legacy compatibility: We have no such levels in cgroup/v2
	 * but we may still get calls for them.
	 */
	if (level == CG_LEVEL_USER) {
		error("Incorrect cgroup level: %d", level);
		return NULL;
	}

	if (level == CG_LEVEL_SLURM)
		level = CG_LEVEL_ROOT;
	/*
	 * Our real step level is the level for user processes. This will make
	 * that the slurmstepd is never constrained in its own cgroup, which is
	 * something we want. Instead, slurmstepd will be part of the job limit.
	 * Note that a step which initializes pmi, could cause slurmstepd to
	 * grow, and we don't want this to be part of the step, but be part of
	 * the job.
	 */
	if (level == CG_LEVEL_STEP)
		level = CG_LEVEL_STEP_USER;

	/* This is for CoreSpec* and MemSpec* for slurmd */
	if (level == CG_LEVEL_SYSTEM)
		level = CG_LEVEL_ROOT;

	limits = xmalloc(sizeof(*limits));
	cgroup_init_limits(limits);

	switch (ctl) {
	case CG_TRACK:
		/* Not implemented. */
		goto fail;
	case CG_CPUS:
		/*
		 * cpuset.cpus:
		 * ------------
		 * It lists the *requested* CPUs to be used by tasks within this
		 * cgroup. The actual list of CPUs to be granted, however, is
		 * subjected to constraints imposed by its parent and can differ
		 * from the requested CPUs.
		 *
		 * An empty value in cpuset.cpus indicates that the cgroup is
		 * using the same setting as the nearest cgroup ancestor with a
		 * non-empty cpuset.cpus, or all the available CPUs if none is
		 * found.
		 *
		 * cpuset.cpus.effective:
		 * ----------------------
		 * It lists the onlined CPUs that are actually granted to this
		 * cgroup by its parent. These CPUs are allowed to be used by
		 * tasks within the current cgroup.
		 *
		 * If cpuset.cpus is empty, the cpuset.cpus.effective file shows
		 * all the CPUs from the parent cgroup that can be available to
		 * be used by this cgroup.
		 *
		 * If cpuset.cpus is not empty, the cpuset.cpus.effective file
		 * should be a subset of cpuset.cpus unless none of the CPUs
		 * listed in cpuset.cpus can be granted. In this case, it will
		 * be treated just like an empty cpuset.cpus.
		 */
		if (common_cgroup_get_param(
			    &int_cg[level],
			    "cpuset.cpus",
			    &limits->allow_cores,
			    &limits->cores_size) != SLURM_SUCCESS)
			goto fail;

		/* A bare "\n" means unset: fall back to the effective file. */
		if ((limits->cores_size == 1) &&
		    !xstrcmp(limits->allow_cores, "\n")) {
			xfree(limits->allow_cores);
			if (common_cgroup_get_param(
				    &int_cg[level],
				    "cpuset.cpus.effective",
				    &limits->allow_cores,
				    &limits->cores_size) != SLURM_SUCCESS)
				goto fail;
		}

		/*
		 * The same concepts from cpuset.cpus and cpuset.cpus.effective
		 * applies for cpuset.mems and cpuset.mems.effective, so follow
		 * the same logic here.
		 */
		if (common_cgroup_get_param(
			    &int_cg[level],
			    "cpuset.mems",
			    &limits->allow_mems,
			    &limits->mems_size) != SLURM_SUCCESS)
			goto fail;

		if ((limits->mems_size == 1) &&
		    !xstrcmp(limits->allow_mems, "\n")) {
			xfree(limits->allow_mems);
			if (common_cgroup_get_param(
				    &int_cg[level],
				    "cpuset.mems.effective",
				    &limits->allow_mems,
				    &limits->mems_size) != SLURM_SUCCESS)
				goto fail;
		}

		/*
		 * Replace the last \n by \0. We lose one byte but we don't care
		 * since typically this object will be freed soon and we still
		 * keep the correct array size.
		 */
		if (limits->cores_size > 0)
			limits->allow_cores[(limits->cores_size)-1] = '\0';

		if (limits->mems_size > 0)
			limits->allow_mems[(limits->mems_size)-1] = '\0';
		break;
	case CG_MEMORY:
		/* Use a disposable copy: _get_mem_recursive mutates the path. */
		tmp_cg.path = xstrdup(int_cg[level].path);
		_get_mem_recursive(&tmp_cg, limits);
		xfree(tmp_cg.path);
		break;
	case CG_DEVICES:
		/* Not implemented. */
		goto fail;
	default:
		error("cgroup controller %u not supported", ctl);
		goto fail;
	}

	return limits;
fail:
	log_flag(CGROUP, "Returning empty limits, this should not happen.");
	cgroup_free_limits(limits);
	return NULL;
}
| |
| extern int cgroup_p_step_start_oom_mgr(stepd_step_rec_t *step) |
| { |
| /* Only set the memory.oom.group if needed. */ |
| if (step->oom_kill_step) { |
| if (!cgroup_p_has_feature(CG_MEMCG_OOMGROUP)) |
| log_flag(CGROUP, "OOMKillStep was requested but memory.oom.group interface is not available."); |
| else { |
| if (common_cgroup_set_param(&int_cg[CG_LEVEL_STEP_USER], |
| "memory.oom.group", "1")) { |
| error("Cannot set memory.oom.group"); |
| return SLURM_ERROR; |
| } |
| } |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| extern cgroup_oom_t *cgroup_p_step_stop_oom_mgr(stepd_step_rec_t *step) |
| { |
| cgroup_oom_t *oom_step_results = NULL; |
| uint64_t job_kills = 0, step_kills = 0; |
| uint64_t job_swkills = 0, step_swkills = 0; |
| |
| if (!bit_test(int_cg_ns.avail_controllers, CG_MEMORY)) |
| return NULL; |
| |
| _get_memory_events(&job_kills, &step_kills); |
| |
| if (cgroup_p_has_feature(CG_MEMCG_SWAP)) |
| _get_swap_events(&job_swkills, &step_swkills); |
| |
| /* Return stats */ |
| log_flag(CGROUP, "OOM detected %"PRIu64" job and %"PRIu64" step kills", |
| job_kills, step_kills); |
| |
| oom_step_results = xmalloc(sizeof(*oom_step_results)); |
| oom_step_results->job_mem_failcnt = job_kills; |
| oom_step_results->job_memsw_failcnt = job_swkills; |
| oom_step_results->oom_kill_cnt = step_kills; |
| oom_step_results->step_mem_failcnt = step_kills; |
| oom_step_results->step_memsw_failcnt = step_swkills; |
| |
| return oom_step_results; |
| } |
| |
| extern int cgroup_p_task_addto(cgroup_ctl_type_t ctl, stepd_step_rec_t *step, |
| pid_t pid, uint32_t task_id) |
| { |
| task_cg_info_t *task_cg_info; |
| char *task_cg_path = NULL; |
| bool need_to_add = false; |
| |
| /* Ignore any possible movement of slurmstepd */ |
| if (pid == getpid()) |
| return SLURM_SUCCESS; |
| |
| if (task_id == task_special_id) |
| log_flag(CGROUP, "Starting task_special cgroup accounting"); |
| else |
| log_flag(CGROUP, "Starting task %u cgroup accounting", task_id); |
| |
| /* Let's be sure this task is not already created. */ |
| if (!(task_cg_info = list_find_first(task_list, _find_task_cg_info, |
| &task_id))) { |
| task_cg_info = xmalloc(sizeof(*task_cg_info)); |
| task_cg_info->taskid = task_id; |
| need_to_add = true; |
| } |
| |
| if (need_to_add) { |
| /* Create task hierarchy in this step. */ |
| if (task_id == task_special_id) |
| xstrfmtcat(task_cg_path, "%s/task_special", |
| int_cg[CG_LEVEL_STEP_USER].name); |
| else |
| xstrfmtcat(task_cg_path, "%s/task_%u", |
| int_cg[CG_LEVEL_STEP_USER].name, task_id); |
| |
| if (common_cgroup_create(&int_cg_ns, &task_cg_info->task_cg, |
| task_cg_path, 0, 0) != SLURM_SUCCESS) { |
| if (task_id == task_special_id) |
| error("unable to create task_special cgroup"); |
| else |
| error("unable to create task %u cgroup", |
| task_id); |
| xfree(task_cg_info); |
| xfree(task_cg_path); |
| return SLURM_ERROR; |
| } |
| xfree(task_cg_path); |
| |
| if (common_cgroup_instantiate(&task_cg_info->task_cg) != |
| SLURM_SUCCESS) { |
| if (task_id == task_special_id) |
| error("unable to instantiate task_special cgroup"); |
| else |
| error("unable to instantiate task %u cgroup", |
| task_id); |
| common_cgroup_destroy(&task_cg_info->task_cg); |
| xfree(task_cg_info); |
| return SLURM_ERROR; |
| } |
| /* Initialize the bpf_program before appending to the list. */ |
| init_ebpf_prog(&task_cg_info->p); |
| |
| /* Add the cgroup to the list now that it is initialized. */ |
| list_append(task_list, task_cg_info); |
| } |
| |
| /* Attach the pid to the corresponding step_x/task_y cgroup */ |
| if (common_cgroup_move_process(&task_cg_info->task_cg, pid) != |
| SLURM_SUCCESS) |
| error("Unable to move pid %d to %s cg", |
| pid, (task_cg_info->task_cg).path); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern cgroup_acct_t *cgroup_p_task_get_acct_data(uint32_t task_id) |
| { |
| char *cpu_stat = NULL, *memory_stat = NULL, *memory_current = NULL; |
| char *memory_peak = NULL; |
| char *ptr; |
| size_t tmp_sz = 0; |
| cgroup_acct_t *stats = NULL; |
| task_cg_info_t *task_cg_info; |
| static bool interfaces_checked = false, memory_peak_interface = false; |
| |
| if (!(task_cg_info = list_find_first(task_list, _find_task_cg_info, |
| &task_id))) { |
| if (task_id == task_special_id) |
| error("No task found with id %u (task_special), this should never happen", |
| task_id); |
| else |
| error("No task found with id %u, this should never happen", |
| task_id); |
| return NULL; |
| } |
| |
| /* |
| * Check optional interfaces existence and permissions. This check |
| * will help to avoid querying unexistent cgroup interfaces every time, |
| * as might happen in kernel versions that do not provide all of them |
| */ |
| if (!interfaces_checked) { |
| /* |
| * Check for memory.peak support as RHEL8 and other OSes with |
| * old kernels might not provide it. |
| */ |
| memory_peak_interface = cgroup_p_has_feature(CG_MEMCG_PEAK); |
| interfaces_checked = true; |
| } |
| |
| if (common_cgroup_get_param(&task_cg_info->task_cg, |
| "cpu.stat", |
| &cpu_stat, |
| &tmp_sz) != SLURM_SUCCESS) { |
| if (task_id == task_special_id) |
| log_flag(CGROUP, "Cannot read task_special cpu.stat file"); |
| else |
| log_flag(CGROUP, "Cannot read task %d cpu.stat file", |
| task_id); |
| } |
| |
| if (common_cgroup_get_param(&task_cg_info->task_cg, |
| "memory.current", |
| &memory_current, |
| &tmp_sz) != SLURM_SUCCESS) { |
| if (task_id == task_special_id) |
| log_flag(CGROUP, "Cannot read task_special memory.current file"); |
| else |
| log_flag(CGROUP, "Cannot read task %d memory.current file", |
| task_id); |
| } |
| |
| if (common_cgroup_get_param(&task_cg_info->task_cg, |
| "memory.stat", |
| &memory_stat, |
| &tmp_sz) != SLURM_SUCCESS) { |
| if (task_id == task_special_id) |
| log_flag(CGROUP, "Cannot read task_special memory.stat file"); |
| else |
| log_flag(CGROUP, "Cannot read task %d memory.stat file", |
| task_id); |
| } |
| |
| if (memory_peak_interface) { |
| if (common_cgroup_get_param(&task_cg_info->task_cg, |
| "memory.peak", |
| &memory_peak, |
| &tmp_sz) != SLURM_SUCCESS) { |
| if (task_id == task_special_id) |
| log_flag(CGROUP, "Cannot read task_special memory.peak interface, does your OS support it?"); |
| else |
| log_flag(CGROUP, "Cannot read task %d memory.peak interface, does your OS support it?", |
| task_id); |
| } |
| } |
| |
| /* |
| * Initialize values. A NO_VAL64 will indicate the caller that something |
| * happened here. Values that aren't set here are returned as 0. |
| */ |
| stats = xmalloc(sizeof(*stats)); |
| stats->usec = NO_VAL64; |
| stats->ssec = NO_VAL64; |
| stats->total_rss = NO_VAL64; |
| stats->total_pgmajfault = NO_VAL64; |
| stats->memory_peak = INFINITE64; /* As required in common_jag.c */ |
| |
| if (cpu_stat) { |
| ptr = xstrstr(cpu_stat, "user_usec"); |
| if (ptr && |
| (sscanf(ptr, "user_usec %"PRIu64, &stats->usec) != 1)) |
| error("Cannot parse user_sec field in cpu.stat file"); |
| |
| ptr = xstrstr(cpu_stat, "system_usec"); |
| if (ptr && |
| (sscanf(ptr, "system_usec %"PRIu64, &stats->ssec) != 1)) |
| error("Cannot parse system_usec field in cpu.stat file"); |
| xfree(cpu_stat); |
| } |
| |
| /* |
| * In cgroup/v1, total_rss was the hierarchical sum of # of bytes of |
| * anonymous and swap cache memory (including transparent huge pages). |
| * |
| * In cgroup/v2 we use memory.current which includes all the |
| * memory the app has touched. Using this value makes it consistent with |
| * the OOM killer limit. |
| */ |
| if (memory_current) { |
| if (sscanf(memory_current, "%"PRIu64, &stats->total_rss) != 1) |
| error("Cannot parse memory.current file"); |
| xfree(memory_current); |
| } |
| |
| if (memory_stat) { |
| ptr = xstrstr(memory_stat, "pgmajfault"); |
| if (ptr && (sscanf(ptr, "pgmajfault %"PRIu64, |
| &stats->total_pgmajfault) != 1)) |
| log_flag(CGROUP, "Cannot parse pgmajfault field in memory.stat file"); |
| xfree(memory_stat); |
| } |
| |
| if (memory_peak) { |
| if (sscanf(memory_peak, "%"PRIu64, &stats->memory_peak) != 1) |
| error("Cannot parse memory.peak file"); |
| xfree(memory_peak); |
| } |
| |
| return stats; |
| } |
| |
| /* |
| * Return conversion units used for stats gathered from cpuacct. |
| * Dividing the provided data by this number will give seconds. |
| */ |
| extern long int cgroup_p_get_acct_units(void) |
| { |
| /* usec and ssec from cpuacct.stat are provided in micro-seconds. */ |
| return (long int)USEC_IN_SEC; |
| } |
| |
| extern bool cgroup_p_has_feature(cgroup_ctl_feature_t f) |
| { |
| char file_path[PATH_MAX]; |
| |
| switch (f) { |
| case CG_MEMCG_OOMGROUP: |
| if (!bit_test(int_cg_ns.avail_controllers, CG_MEMORY)) |
| break; |
| if (snprintf(file_path, PATH_MAX, "%s/memory.oom.group", |
| int_cg[CG_LEVEL_ROOT].path) >= PATH_MAX) |
| break; |
| if (!access(file_path, F_OK)) |
| return true; |
| break; |
| case CG_MEMCG_PEAK: |
| if (!bit_test(int_cg_ns.avail_controllers, CG_MEMORY)) |
| break; |
| if (snprintf(file_path, PATH_MAX, "%s/memory.peak", |
| int_cg[CG_LEVEL_ROOT].path) >= PATH_MAX) |
| break; |
| if (!access(file_path, F_OK)) |
| return true; |
| break; |
| case CG_MEMCG_SWAP: |
| if (!bit_test(int_cg_ns.avail_controllers, CG_MEMORY)) |
| break; |
| if (snprintf(file_path, PATH_MAX, "%s/memory.swap.max", |
| int_cg[CG_LEVEL_ROOT].path) >= PATH_MAX) |
| break; |
| if (!access(file_path, F_OK)) |
| return true; |
| break; |
| case CG_FALSE_ROOT: |
| /* |
| * The cgroup.type file is only present on non-root cgroups. |
| * This is done to ensure that we do not have a cgroup non-root |
| * mounted into /sys/fs/cgroup. |
| */ |
| if (snprintf(file_path, PATH_MAX, "%s/cgroup.type", |
| slurm_cgroup_conf.cgroup_mountpoint) >= PATH_MAX) |
| break; |
| if (!access(file_path, F_OK)) |
| return true; |
| break; |
| case CG_KILL_BUTTON: |
| if (snprintf(file_path, PATH_MAX, "%s/cgroup.kill", |
| int_cg[CG_LEVEL_ROOT].path) >= PATH_MAX) |
| break; |
| if (!access(file_path, F_OK)) |
| return true; |
| break; |
| default: |
| break; |
| } |
| |
| return false; |
| } |
| |
| extern int cgroup_p_signal(int signal) |
| { |
| if (signal != SIGKILL) { |
| error("cgroup/v2 cgroup.kill only supports SIGKILL"); |
| return SLURM_ERROR; |
| } |
| |
| if (common_cgroup_set_param(&int_cg[CG_LEVEL_STEP_USER], |
| "cgroup.kill", "1")) { |
| error("Writing 1 to %s/cgroup.kill failed", |
| int_cg[CG_LEVEL_STEP_USER].path); |
| return SLURM_ERROR; |
| } |
| |
| log_flag(CGROUP, "Sent signal %d to %s", signal, |
| int_cg[CG_LEVEL_STEP_USER].path); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern char *cgroup_p_get_task_empty_event_path(uint32_t taskid, |
| bool *on_modify) |
| { |
| task_cg_info_t *task_cg_info; |
| |
| xassert(on_modify); |
| |
| if (!(task_cg_info = list_find_first(task_list, _find_task_cg_info, |
| &taskid))) { |
| return NULL; |
| } |
| |
| /* We want to watch when cgroups.events is modified */ |
| *on_modify = true; |
| |
| return xstrdup_printf("%s/cgroup.events", task_cg_info->task_cg.path); |
| } |
| |
| extern int cgroup_p_is_task_empty(uint32_t taskid) |
| { |
| task_cg_info_t *task_cg_info; |
| xcgroup_t cg; |
| |
| if (!(task_cg_info = list_find_first(task_list, _find_task_cg_info, |
| &taskid))) { |
| return SLURM_ERROR; |
| } |
| |
| cg = task_cg_info->task_cg; |
| |
| return _is_cgroup_empty(&cg); |
| } |