| /*****************************************************************************\ |
| * cgroup_v1.c - Cgroup v1 plugin |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include "cgroup_v1.h" |
| |
| /* |
| * These variables are required by the generic plugin interface. If they |
| * are not found in the plugin, the plugin loader will ignore it. |
| * |
| * plugin_name - a string giving a human-readable description of the |
| * plugin. There is no maximum length, but the symbol must refer to |
| * a valid string. |
| * |
| * plugin_type - a string suggesting the type of the plugin or its |
| * applicability to a particular form of data or method of data handling. |
| * If the low-level plugin API is used, the contents of this string are |
| * unimportant and may be anything. Slurm uses the higher-level plugin |
| * interface which requires this string to be of the form |
| * |
| * <application>/<method> |
| * |
| * where <application> is a description of the intended application of |
| * the plugin (e.g., "select" for Slurm node selection) and <method> |
| * is a description of how this plugin satisfies that application. Slurm will |
| * only load select plugins if the plugin_type string has a |
| * prefix of "select/". |
| * |
| * plugin_version - an unsigned 32-bit integer containing the Slurm version |
| * (major.minor.micro combined into a single number). |
| */ |
| const char plugin_name[] = "Cgroup v1 plugin"; |
| const char plugin_type[] = "cgroup/v1"; |
| const uint32_t plugin_version = SLURM_VERSION_NUMBER; |
| |
/*
 * Per-controller relative cgroup paths (user/job/step levels) inside each
 * controller hierarchy. An empty string means "not created yet".
 */
static char g_user_cgpath[CG_CTL_CNT][PATH_MAX];
static char g_job_cgpath[CG_CTL_CNT][PATH_MAX];
static char g_step_cgpath[CG_CTL_CNT][PATH_MAX];
/* Number of plugins currently using the step directories, per controller. */
static uint16_t g_step_active_cnt[CG_CTL_CNT];

/* One cgroup namespace (mount point etc.) per controller. */
static xcgroup_ns_t g_cg_ns[CG_CTL_CNT];

/* Internal cgroup structs, indexed by controller and by hierarchy level. */
static xcgroup_t int_cg[CG_CTL_CNT][CG_LEVEL_CNT];

/* Controller names, indexed by cgroup_ctl_type_t. */
const char *g_cg_name[CG_CTL_CNT] = {
	"freezer",
	"cpuset",
	"memory",
	"devices",
	"cpuacct"
};

/* Cgroup v1 control items for the oom monitor */
#define STOP_OOM 1	/* Sentinel telling the oom monitor thread to stop. */

typedef enum {
	OOM_KILL_NONE,    /* Don't account for oom_kill events. */
	OOM_KILL_COUNTER, /* Use memory.oom_control's oom_kill field. */
	OOM_KILL_MON      /* Spawn a monitoring thread and use eventfd. */
} oom_kill_type_t;

typedef struct {
	int cfd;      /* control file fd. */
	int efd;      /* event file fd. */
	int event_fd; /* eventfd fd. */
} oom_event_args_t;

static oom_kill_type_t oom_kill_type = OOM_KILL_NONE;
/* Accumulated OOM kill events; presumably guarded by oom_mutex — confirm. */
static uint64_t oom_kill_count = 0;
static int oom_pipe[2] = { -1, -1 };
static pthread_t oom_thread;
static pthread_mutex_t oom_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Task tracking artifacts */
list_t *g_task_list[CG_CTL_CNT]; /* Lists of task_cg_info_t per controller. */
static uint32_t g_max_task_id = 0;
/*
 * There are potentially multiple tasks on a node, so we want to
 * track every task cgroup and which taskid it belongs to.
 */
typedef struct {
	xcgroup_t task_cg;
	uint32_t taskid;
} task_cg_info_t;

extern bool cgroup_p_has_feature(cgroup_ctl_feature_t f);

/* Forward declarations (defined later in this file). */
static int _step_destroy_internal(cgroup_ctl_type_t sub, bool root_locked);
static int _get_oom_kill_from_file(xcgroup_t *cg);
| |
| static int _cgroup_init(cgroup_ctl_type_t sub) |
| { |
| if (sub >= CG_CTL_CNT) |
| return SLURM_ERROR; |
| |
| if (xcgroup_ns_create(&g_cg_ns[sub], "", g_cg_name[sub]) |
| != SLURM_SUCCESS) { |
| error("unable to create %s cgroup namespace", g_cg_name[sub]); |
| return SLURM_ERROR; |
| } |
| |
| if (common_cgroup_create(&g_cg_ns[sub], &int_cg[sub][CG_LEVEL_ROOT], |
| "", 0, 0) != SLURM_SUCCESS) { |
| error("unable to create root %s xcgroup", g_cg_name[sub]); |
| common_cgroup_ns_destroy(&g_cg_ns[sub]); |
| return SLURM_ERROR; |
| } |
| |
| if (xcgroup_create_slurm_cg( |
| &g_cg_ns[sub], &int_cg[sub][CG_LEVEL_SLURM]) != |
| SLURM_SUCCESS) { |
| error("unable to create slurm %s xcgroup", g_cg_name[sub]); |
| common_cgroup_ns_destroy(&g_cg_ns[sub]); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _cpuset_create(stepd_step_rec_t *step) |
| { |
| int rc; |
| char *sys_cgpath = NULL; |
| char *value; |
| size_t cpus_size; |
| |
| rc = common_cgroup_get_param(&int_cg[CG_CPUS][CG_LEVEL_SLURM], |
| "cpuset.cpus", &value, &cpus_size); |
| |
| if ((rc != SLURM_SUCCESS) || (cpus_size == 1)) { |
| /* initialize the cpusets as it was non-existent */ |
| if (xcgroup_cpuset_init(&int_cg[CG_CPUS][CG_LEVEL_SLURM]) != |
| SLURM_SUCCESS) { |
| xfree(value); |
| return SLURM_ERROR; |
| } |
| } |
| |
| /* Do not inherit this setting in children, let plugins set it. */ |
| common_cgroup_set_param(&int_cg[CG_CPUS][CG_LEVEL_SLURM], |
| "cgroup.clone_children", "0"); |
| |
| if (step == NULL) { |
| /* This is a request to create a cpuset for slurmd daemon */ |
| xstrfmtcat(sys_cgpath, "%s/system", |
| int_cg[CG_CPUS][CG_LEVEL_SLURM].name); |
| |
| /* create system cgroup in the cpuset ns */ |
| if ((rc = common_cgroup_create( |
| &g_cg_ns[CG_CPUS], |
| &int_cg[CG_CPUS][CG_LEVEL_SYSTEM], |
| sys_cgpath, getuid(), getgid())) |
| != SLURM_SUCCESS) { |
| goto end; |
| } |
| |
| if (running_in_slurmd()) { |
| /* |
| * The slurmd is the only place we need to set up the |
| * system cgroup and the slurmstepd should not overwrite |
| * these. If slurmstepd does overwrite these values |
| * (such as cpuset.cpus) then slurmd will not be |
| * properly constrained anymore. |
| */ |
| if ((rc = common_cgroup_instantiate( |
| &int_cg[CG_CPUS][CG_LEVEL_SYSTEM])) |
| != SLURM_SUCCESS) |
| goto end; |
| |
| /* set notify on release flag */ |
| common_cgroup_set_param( |
| &int_cg[CG_CPUS][CG_LEVEL_SYSTEM], |
| "notify_on_release", "0"); |
| |
| if ((rc = xcgroup_cpuset_init( |
| &int_cg[CG_CPUS][CG_LEVEL_SYSTEM])) |
| != SLURM_SUCCESS) |
| goto end; |
| } |
| |
| log_flag(CGROUP, |
| "system cgroup: system cpuset cgroup initialized"); |
| } else { |
| /* |
| * We don't lock here the g_root cg[CG_CPUS] because it is |
| * locked from the caller. |
| */ |
| rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[CG_CPUS], |
| int_cg[CG_CPUS], |
| g_job_cgpath[CG_CPUS], |
| g_step_cgpath[CG_CPUS], |
| g_user_cgpath[CG_CPUS]); |
| } |
| |
| end: |
| xfree(value); |
| xfree(sys_cgpath); |
| |
| return rc; |
| } |
| |
/*
 * Tear down the step/job/user hierarchy of one controller.
 *
 * int_cg      - IN - per-level cgroup array for this controller (the
 *               parameter intentionally shadows the file-level int_cg;
 *               callers pass int_cg[sub]).
 * log_str     - IN - controller name used in log messages.
 * root_locked - IN - true if the caller already holds the root cgroup lock.
 *
 * Returns SLURM_ERROR only if the step directory itself could not be
 * removed; failure to remove the job/user directories is tolerated (other
 * steps/jobs may still be using them).
 */
static int _remove_cg_subsystem(xcgroup_t int_cg[], const char *log_str,
				bool root_locked)
{
	xcgroup_t *root_cg = &int_cg[CG_LEVEL_ROOT];
	xcgroup_t *job_cg = &int_cg[CG_LEVEL_JOB];
	xcgroup_t *step_cg = &int_cg[CG_LEVEL_STEP];
	xcgroup_t *user_cg = &int_cg[CG_LEVEL_USER];
	xcgroup_t *slurm_cg = &int_cg[CG_LEVEL_SLURM];
	int rc = SLURM_SUCCESS;

	/*
	 * Lock the root cgroup so we don't race with other steps that are being
	 * started.
	 */
	if (!root_locked && (common_cgroup_lock(root_cg) != SLURM_SUCCESS)) {
		error("common_cgroup_lock error (%s)", log_str);
		return SLURM_ERROR;
	}

	/*
	 * Always try to move slurmstepd process to the root cgroup, otherwise
	 * the rmdir(2) triggered by the calls below will always fail if the pid
	 * of stepd is in the cgroup. We don't know what other plugins will do
	 * and whether they will attach the stepd pid to the cg.
	 */
	rc = common_cgroup_move_process(root_cg, getpid());
	if (rc != SLURM_SUCCESS) {
		error("Unable to move pid %d to root cgroup", getpid());
		goto end;
	}
	/* Wait for the kernel to actually show the pid out of the step cg. */
	common_cgroup_wait_pid_moved(step_cg, getpid(), log_str);

	/* Delete step cgroup. */
	if ((rc = common_cgroup_delete(step_cg)) != SLURM_SUCCESS)
		goto end;

	/*
	 * At this point we'll do a best effort for the job and user cgroup,
	 * since other jobs or steps may still be alive and not let us complete
	 * the cleanup. The last job/step in the hierarchy will be the one which
	 * will finally remove these two directories
	 */
	/* Delete job cgroup (best effort: failure is not an error). */
	if ((rc = common_cgroup_delete(job_cg)) != SLURM_SUCCESS) {
		rc = SLURM_SUCCESS;
		goto end;
	}
	/* Delete user cgroup (best effort: failure is not an error). */
	if ((rc = common_cgroup_delete(user_cg)) != SLURM_SUCCESS) {
		rc = SLURM_SUCCESS;
		goto end;
	}

	/*
	 * Invalidate the cgroup structs.
	 */
	common_cgroup_destroy(user_cg);
	common_cgroup_destroy(job_cg);
	common_cgroup_destroy(step_cg);
	common_cgroup_destroy(slurm_cg);

end:
	if (!root_locked)
		common_cgroup_unlock(root_cg);
	return rc;
}
| |
| static int _acct_task(void *x, void *arg) |
| { |
| task_cg_info_t *t = (task_cg_info_t *) x; |
| cgroup_ctl_type_t *ctl = (cgroup_ctl_type_t *) arg; |
| |
| /* Before deleting the task we account for its oom_kill if needed. */ |
| if ((oom_kill_type == OOM_KILL_COUNTER) && |
| (ctl && (*ctl == CG_MEMORY))) |
| _get_oom_kill_from_file(&t->task_cg); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _rmdir_task(void *x, void *arg) |
| { |
| task_cg_info_t *t = (task_cg_info_t *) x; |
| |
| if (common_cgroup_delete(&t->task_cg) != SLURM_SUCCESS) |
| log_flag(CGROUP, "taskid: %d, failed to delete %s %m", |
| t->taskid, t->task_cg.path); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _find_task_cg_info(void *x, void *key) |
| { |
| task_cg_info_t *task_cg = (task_cg_info_t*)x; |
| uint32_t taskid = *(uint32_t*)key; |
| |
| if (task_cg->taskid == taskid) |
| return 1; |
| |
| return 0; |
| } |
| |
| static void _free_task_cg_info(void *object) |
| { |
| task_cg_info_t *task_cg = (task_cg_info_t *)object; |
| |
| if (task_cg) { |
| common_cgroup_destroy(&task_cg->task_cg); |
| xfree(task_cg); |
| } |
| } |
| |
| static int _handle_task_cgroup(cgroup_ctl_type_t sub, stepd_step_rec_t *step, |
| pid_t pid, uint32_t taskid) |
| { |
| int rc = SLURM_SUCCESS; |
| bool need_to_add = false; |
| task_cg_info_t *task_cg_info; |
| uid_t uid = step->uid; |
| gid_t gid = step->gid; |
| char *task_cgroup_path = NULL; |
| |
| /* build task cgroup relative path */ |
| xstrfmtcat(task_cgroup_path, "%s/task_%u", g_step_cgpath[sub], taskid); |
| if (!task_cgroup_path) { |
| error("unable to build task_%u cg relative path for %s: %m", |
| taskid, g_step_cgpath[sub]); |
| return SLURM_ERROR; |
| } |
| |
| if (!(task_cg_info = list_find_first(g_task_list[sub], |
| _find_task_cg_info, |
| &taskid))) { |
| task_cg_info = xmalloc(sizeof(*task_cg_info)); |
| task_cg_info->taskid = taskid; |
| need_to_add = true; |
| } |
| |
| /* |
| * Create task cgroup in the cg ns |
| */ |
| if (common_cgroup_create(&g_cg_ns[sub], &task_cg_info->task_cg, |
| task_cgroup_path, uid, gid) != SLURM_SUCCESS) { |
| error("unable to create task %u cgroup", taskid); |
| xfree(task_cg_info); |
| xfree(task_cgroup_path); |
| return SLURM_ERROR; |
| } |
| |
| if (common_cgroup_instantiate(&task_cg_info->task_cg) != SLURM_SUCCESS) |
| { |
| _free_task_cg_info(task_cg_info); |
| error("unable to instantiate task %u cgroup", taskid); |
| xfree(task_cgroup_path); |
| return SLURM_ERROR; |
| } |
| |
| /* set notify on release flag */ |
| common_cgroup_set_param(&task_cg_info->task_cg, "notify_on_release", |
| "0"); |
| |
| /* Initialize the cpuset cgroup before moving processes into it */ |
| if (sub == CG_CPUS) { |
| rc = xcgroup_cpuset_init(&task_cg_info->task_cg); |
| if (rc != SLURM_SUCCESS) { |
| error("Unable to initialize the cpuset cgroup %s", |
| task_cg_info->task_cg.path); |
| goto end; |
| } |
| } |
| |
| /* Attach the pid to the corresponding step_x/task_y cgroup */ |
| rc = common_cgroup_move_process(&task_cg_info->task_cg, pid); |
| if (rc != SLURM_SUCCESS) |
| error("Unable to move pid %d to %s cg", pid, task_cgroup_path); |
| |
| /* Add the cgroup to the list now that it is initialized. */ |
| if (need_to_add) |
| list_append(g_task_list[sub], task_cg_info); |
| |
| end: |
| xfree(task_cgroup_path); |
| return rc; |
| } |
| |
| static int _all_tasks_destroy(cgroup_ctl_type_t sub) |
| { |
| int rc; |
| |
| /* Empty the lists of accounted tasks, do a best effort in rmdir */ |
| rc = list_for_each(g_task_list[sub], _rmdir_task, NULL); |
| list_flush(g_task_list[sub]); |
| |
| return rc; |
| } |
| |
| static bool _is_root_path(char *path) |
| { |
| bool rc = false; |
| char *parent_path = NULL, file_path[PATH_MAX]; |
| parent_path = xdirname(path); |
| |
| if (snprintf(file_path, PATH_MAX, "%s/cgroup.procs", parent_path) >= |
| PATH_MAX) { |
| error("Could not generate cgroup path: %s", file_path); |
| goto end; |
| } |
| |
| /* If cgroup.procs is not found one level up, we are in the root */ |
| if (access(file_path, F_OK)) |
| rc = true; |
| |
| end: |
| xfree(parent_path); |
| return rc; |
| } |
| |
| static void _remove_process_cg_limits(pid_t pid) |
| { |
| xcgroup_t cg_cpu = { 0 }; |
| xcgroup_t cg_mem = { 0 }; |
| xcgroup_ns_t cpu_ns = { 0 }; |
| xcgroup_ns_t mem_ns = { 0 }; |
| |
| /* Try to reset cpuset limits */ |
| if (xcgroup_ns_create(&cpu_ns, "", g_cg_name[CG_CPUS])) { |
| log_flag(CGROUP,"Not resetting cpuset, controller not found"); |
| } else if (xcgroup_ns_find_by_pid(&cpu_ns, &cg_cpu, pid)) { |
| error("Cannot find slurmd cpu cgroup"); |
| } else if (!_is_root_path(cg_cpu.path)) { |
| if (xcgroup_cpuset_init(&cg_cpu)) { |
| error("Cannot reset slurmd cpuset limits"); |
| } else { |
| log_flag(CGROUP, "Reset slurmd cpuset limits"); |
| } |
| } |
| common_cgroup_destroy(&cg_cpu); |
| common_cgroup_ns_destroy(&cpu_ns); |
| |
| /* Try to reset memory limits */ |
| if (xcgroup_ns_create(&mem_ns, "", g_cg_name[CG_MEMORY])) { |
| log_flag(CGROUP,"Not resetting memory, controller not found"); |
| } else if (xcgroup_ns_find_by_pid(&mem_ns, &cg_mem, pid)) { |
| error("Cannot find slurmd memory cgroup"); |
| } else if (!_is_root_path(cg_mem.path)) { |
| if (common_cgroup_set_param(&cg_mem, "memory.limit_in_bytes", |
| "-1")) { |
| error("Cannot reset slurmd memory limits"); |
| } else { |
| log_flag(CGROUP, "Reset slurmd memory limits"); |
| } |
| } |
| common_cgroup_destroy(&cg_mem); |
| common_cgroup_ns_destroy(&mem_ns); |
| } |
| |
| extern int init(void) |
| { |
| int i; |
| |
| for (i = 0; i < CG_CTL_CNT; i++) { |
| g_user_cgpath[i][0] = '\0'; |
| g_job_cgpath[i][0] = '\0'; |
| g_step_cgpath[i][0] = '\0'; |
| g_step_active_cnt[i] = 0; |
| FREE_NULL_LIST(g_task_list[i]); |
| g_task_list[i] = list_create(_free_task_cg_info); |
| } |
| |
| debug("%s loaded", plugin_name); |
| return SLURM_SUCCESS; |
| } |
| |
| extern void fini(void) |
| { |
| for (int sub = 0; sub < CG_CTL_CNT; sub++) { |
| FREE_NULL_LIST(g_task_list[sub]); |
| common_cgroup_ns_destroy(&g_cg_ns[sub]); |
| common_cgroup_destroy(&int_cg[sub][CG_LEVEL_ROOT]); |
| } |
| |
| debug("unloading %s", plugin_name); |
| } |
| |
| extern int cgroup_p_setup_scope(char *scope_path) |
| { |
| if (running_in_slurmd()) |
| _remove_process_cg_limits(getpid()); |
| return SLURM_SUCCESS; |
| } |
| |
/* Cgroup v1 exposes no scope path; always returns NULL. */
extern char *cgroup_p_get_scope_path(void)
{
	return NULL;
}
| |
| extern int cgroup_p_initialize(cgroup_ctl_type_t sub) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* Only initialize if not inited */ |
| if (g_cg_ns[sub].mnt_point) |
| return rc; |
| |
| if ((rc = _cgroup_init(sub))) |
| return rc; |
| |
| switch (sub) { |
| case CG_TRACK: |
| case CG_CPUS: |
| break; |
| case CG_MEMORY: |
| common_cgroup_set_param(&int_cg[sub][CG_LEVEL_ROOT], |
| "memory.use_hierarchy", "1"); |
| break; |
| case CG_DEVICES: |
| case CG_CPUACCT: |
| break; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| rc = SLURM_ERROR; |
| break; |
| } |
| |
| return rc; |
| } |
| |
| extern int cgroup_p_system_create(cgroup_ctl_type_t sub) |
| { |
| char *sys_cgpath = NULL; |
| int rc = SLURM_SUCCESS; |
| |
| switch (sub) { |
| case CG_CPUS: |
| rc = _cpuset_create(NULL); |
| break; |
| case CG_MEMORY: |
| xstrfmtcat(sys_cgpath, "%s/system", |
| int_cg[sub][CG_LEVEL_SLURM].name); |
| |
| if ((rc = common_cgroup_create(&g_cg_ns[sub], |
| &int_cg[sub][CG_LEVEL_SYSTEM], |
| sys_cgpath, getuid(), getgid())) |
| != SLURM_SUCCESS) |
| goto end; |
| |
| if ((rc = common_cgroup_instantiate( |
| &int_cg[sub][CG_LEVEL_SYSTEM])) |
| != SLURM_SUCCESS) |
| goto end; |
| |
| /* set notify on release flag */ |
| common_cgroup_set_param(&int_cg[sub][CG_LEVEL_SYSTEM], |
| "notify_on_release", "0"); |
| |
| if ((rc = common_cgroup_set_param(&int_cg[sub][CG_LEVEL_SYSTEM], |
| "memory.use_hierarchy", "1")) |
| != SLURM_SUCCESS) { |
| error("system cgroup: unable to ask for hierarchical accounting of system memcg '%s'", |
| int_cg[sub][CG_LEVEL_SYSTEM].path); |
| goto end; |
| } |
| |
| if ((rc = common_cgroup_set_uint64_param( |
| &int_cg[sub][CG_LEVEL_SYSTEM], |
| "memory.oom_control", 1)) |
| != SLURM_SUCCESS) { |
| error("Resource spec: unable to disable OOM Killer in system memory cgroup: %s", |
| int_cg[sub][CG_LEVEL_SYSTEM].path); |
| goto end; |
| } |
| break; |
| case CG_TRACK: |
| case CG_DEVICES: |
| case CG_CPUACCT: |
| error("This operation is not supported for %s", g_cg_name[sub]); |
| return SLURM_ERROR; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| return SLURM_ERROR; |
| break; |
| } |
| |
| end: |
| xfree(sys_cgpath); |
| return rc; |
| } |
| |
| extern int cgroup_p_system_addto(cgroup_ctl_type_t sub, pid_t *pids, int npids) |
| { |
| switch (sub) { |
| case CG_TRACK: |
| break; |
| case CG_CPUS: |
| return common_cgroup_add_pids(&int_cg[sub][CG_LEVEL_SYSTEM], |
| pids, npids); |
| case CG_MEMORY: |
| return common_cgroup_add_pids(&int_cg[sub][CG_LEVEL_SYSTEM], |
| pids, npids); |
| case CG_DEVICES: |
| break; |
| case CG_CPUACCT: |
| break; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| return SLURM_ERROR; |
| } |
| |
| error("This operation is not supported for %s", g_cg_name[sub]); |
| return SLURM_ERROR; |
| } |
| |
/*
 * Destroy the "system" cgroup of a controller (cpuset/memory only): move
 * our pid back to the root, delete the system directory, and — only when
 * everything succeeded — release the slurm/root handles and the namespace.
 */
extern int cgroup_p_system_destroy(cgroup_ctl_type_t sub)
{
	int rc = SLURM_SUCCESS;

	/*
	 * Note: we do not need to lock the root cgroup because the only user
	 * of this function is a single thread of slurmd.
	 */

	/* Another plugin may have already destroyed this subsystem. */
	if (!int_cg[sub][CG_LEVEL_SYSTEM].path)
		return SLURM_SUCCESS;

	/* Custom actions for every cgroup subsystem */
	switch (sub) {
	case CG_CPUS:
	case CG_MEMORY:
		break;
	case CG_TRACK:
	case CG_DEVICES:
	case CG_CPUACCT:
		/* Unsupported controllers are a no-op, not an error. */
		error("This operation is not supported for %s", g_cg_name[sub]);
		return SLURM_SUCCESS;
	default:
		error("cgroup subsystem %u not supported", sub);
		return SLURM_ERROR;
		break;
	}

	/* The rmdir below would fail while our own pid sits in the cg. */
	rc = common_cgroup_move_process(&int_cg[sub][CG_LEVEL_ROOT], getpid());
	if (rc != SLURM_SUCCESS) {
		error("Unable to move pid %d to root cgroup", getpid());
		goto end;
	}
	common_cgroup_wait_pid_moved(&int_cg[sub][CG_LEVEL_SYSTEM], getpid(),
				     g_cg_name[sub]);

	if ((rc = common_cgroup_delete(&int_cg[sub][CG_LEVEL_SYSTEM]))
	    != SLURM_SUCCESS) {
		log_flag(CGROUP, "not removing system cg (%s), there may be attached stepds: %m",
			 g_cg_name[sub]);
		goto end;
	}
	common_cgroup_destroy(&int_cg[sub][CG_LEVEL_SYSTEM]);
end:
	/* Only tear down the shared handles if everything above succeeded. */
	if (rc == SLURM_SUCCESS) {
		common_cgroup_destroy(&int_cg[sub][CG_LEVEL_SLURM]);
		common_cgroup_destroy(&int_cg[sub][CG_LEVEL_ROOT]);
		common_cgroup_ns_destroy(&g_cg_ns[sub]);
	}
	return rc;
}
| |
/*
 * Each call to this function counts as one active user of the step directories,
 * so the number of calls to this function must match the number of calls to
 * cgroup_p_step_destroy in each plugin.
 */
| extern int cgroup_p_step_create(cgroup_ctl_type_t sub, stepd_step_rec_t *step) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* |
| * Lock the root cgroup so we don't race with other steps that are being |
| * terminated, they could remove the directories while we're creating |
| * them. |
| */ |
| if (common_cgroup_lock(&int_cg[sub][CG_LEVEL_ROOT]) != SLURM_SUCCESS) { |
| error("common_cgroup_lock error"); |
| return SLURM_ERROR; |
| } |
| |
| /* Don't let other plugins destroy our structs. */ |
| g_step_active_cnt[sub]++; |
| |
| switch (sub) { |
| case CG_TRACK: |
| /* create a new cgroup for that container */ |
| if ((rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[sub], |
| int_cg[sub], |
| g_job_cgpath[sub], |
| g_step_cgpath[sub], |
| g_user_cgpath[sub])) |
| != SLURM_SUCCESS) |
| goto step_c_err; |
| break; |
| case CG_CPUS: |
| if ((rc = _cpuset_create(step))!= SLURM_SUCCESS) |
| goto step_c_err; |
| break; |
| case CG_MEMORY: |
| if ((rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[sub], |
| int_cg[sub], |
| g_job_cgpath[sub], |
| g_step_cgpath[sub], |
| g_user_cgpath[sub])) |
| != SLURM_SUCCESS) { |
| goto step_c_err; |
| } |
| if ((rc = common_cgroup_set_param(&int_cg[sub][CG_LEVEL_USER], |
| "memory.use_hierarchy", |
| "1")) != SLURM_SUCCESS) { |
| error("unable to set hierarchical accounting for %s", |
| g_user_cgpath[sub]); |
| _step_destroy_internal(sub, true); |
| break; |
| } |
| if ((rc = common_cgroup_set_param(&int_cg[sub][CG_LEVEL_JOB], |
| "memory.use_hierarchy", |
| "1")) != SLURM_SUCCESS) { |
| error("unable to set hierarchical accounting for %s", |
| g_job_cgpath[sub]); |
| _step_destroy_internal(sub, true); |
| break; |
| } |
| if ((rc = common_cgroup_set_param(&int_cg[sub][CG_LEVEL_STEP], |
| "memory.use_hierarchy", |
| "1") != SLURM_SUCCESS)) { |
| error("unable to set hierarchical accounting for %s", |
| int_cg[sub][CG_LEVEL_STEP].path); |
| _step_destroy_internal(sub, true); |
| break; |
| } |
| break; |
| case CG_DEVICES: |
| /* create a new cgroup for that container */ |
| if ((rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[sub], |
| int_cg[sub], |
| g_job_cgpath[sub], |
| g_step_cgpath[sub], |
| g_user_cgpath[sub])) |
| != SLURM_SUCCESS) |
| goto step_c_err; |
| break; |
| case CG_CPUACCT: |
| if ((rc = xcgroup_create_hierarchy(__func__, |
| step, |
| &g_cg_ns[sub], |
| int_cg[sub], |
| g_job_cgpath[sub], |
| g_step_cgpath[sub], |
| g_user_cgpath[sub])) |
| != SLURM_SUCCESS) |
| goto step_c_err; |
| break; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| rc = SLURM_ERROR; |
| goto step_c_err; |
| } |
| common_cgroup_unlock(&int_cg[sub][CG_LEVEL_ROOT]); |
| return rc; |
| |
| step_c_err: |
| /* step cgroup is not created */ |
| common_cgroup_unlock(&int_cg[sub][CG_LEVEL_ROOT]); |
| g_step_active_cnt[sub]--; |
| return rc; |
| } |
| |
| extern int cgroup_p_step_addto(cgroup_ctl_type_t sub, pid_t *pids, int npids) |
| { |
| if (*g_step_cgpath[sub] == '\0') |
| return SLURM_ERROR; |
| |
| switch (sub) { |
| case CG_TRACK: |
| /* |
| * Stick slurmstepd pid to the newly created job container |
| * (Note: we do not put it in the step container because this |
| * container could be used to suspend/resume tasks using freezer |
| * properties so we need to let the slurmstepd outside of |
| * this one). |
| */ |
| if ((npids == 1) && (*pids == getpid())) { |
| return common_cgroup_add_pids( |
| &int_cg[sub][CG_LEVEL_JOB], pids, npids); |
| } |
| break; |
| case CG_CPUS: |
| case CG_MEMORY: |
| case CG_DEVICES: |
| break; |
| case CG_CPUACCT: |
| error("This operation is not supported for %s", g_cg_name[sub]); |
| return SLURM_ERROR; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| return SLURM_ERROR; |
| } |
| |
| return common_cgroup_add_pids(&int_cg[sub][CG_LEVEL_STEP], pids, npids); |
| } |
| |
| extern int cgroup_p_step_get_pids(pid_t **pids, int *npids) |
| { |
| if (*g_step_cgpath[CG_TRACK] == '\0') |
| return SLURM_ERROR; |
| |
| return common_cgroup_get_pids(&int_cg[CG_TRACK][CG_LEVEL_STEP], pids, |
| npids); |
| } |
| |
| extern int cgroup_p_step_suspend(void) |
| { |
| if (*g_step_cgpath[CG_TRACK] == '\0') |
| return SLURM_ERROR; |
| |
| return common_cgroup_set_param(&int_cg[CG_TRACK][CG_LEVEL_STEP], |
| "freezer.state", "FROZEN"); |
| } |
| |
| extern int cgroup_p_step_resume(void) |
| { |
| if (*g_step_cgpath[CG_TRACK] == '\0') |
| return SLURM_ERROR; |
| |
| return common_cgroup_set_param(&int_cg[CG_TRACK][CG_LEVEL_STEP], |
| "freezer.state", "THAWED"); |
| } |
| |
| static int _step_destroy_internal(cgroup_ctl_type_t sub, bool root_locked) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* |
| * Only destroy the step if we're the only ones using it. Log it unless |
| * loaded from slurmd, where we will not create any step but call fini. |
| */ |
| if (g_step_active_cnt[sub] == 0) { |
| error("called without a previous init. This shouldn't happen!"); |
| return SLURM_SUCCESS; |
| } |
| /* Only destroy the step if we're the only ones using it. */ |
| if (g_step_active_cnt[sub] > 1) { |
| g_step_active_cnt[sub]--; |
| log_flag(CGROUP, "Not destroying %s step dir, resource busy by %d other plugin", |
| g_cg_name[sub], g_step_active_cnt[sub]); |
| return SLURM_SUCCESS; |
| } |
| |
| /* Remove any possible task directories first */ |
| _all_tasks_destroy(sub); |
| |
| /* Custom actions for every cgroup subsystem */ |
| switch (sub) { |
| case CG_TRACK: |
| break; |
| case CG_CPUS: |
| break; |
| case CG_MEMORY: |
| break; |
| case CG_DEVICES: |
| break; |
| case CG_CPUACCT: |
| break; |
| default: |
| error("cgroup subsystem %u not supported", sub); |
| return SLURM_ERROR; |
| break; |
| } |
| |
| rc = _remove_cg_subsystem(int_cg[sub], g_cg_name[sub], root_locked); |
| |
| if (rc == SLURM_SUCCESS) { |
| g_step_active_cnt[sub] = 0; |
| g_step_cgpath[sub][0] = '\0'; |
| } |
| |
| return rc; |
| } |
| |
/*
 * Public entry point for step destruction; takes the root cgroup lock
 * internally (root_locked = false).
 */
extern int cgroup_p_step_destroy(cgroup_ctl_type_t sub)
{
	return _step_destroy_internal(sub, false);
}
| |
| /* |
| * Is the specified pid in our cgroup g_cg_ns[CG_TRACK]? |
| * In the future we may want to replace this with a get pids and a search. |
| */ |
| extern bool cgroup_p_has_pid(pid_t pid) |
| { |
| bool rc; |
| int rc2; |
| xcgroup_t cg; |
| |
| rc2 = xcgroup_ns_find_by_pid(&g_cg_ns[CG_TRACK], &cg, pid); |
| if (rc2 != SLURM_SUCCESS) |
| return false; |
| |
| rc = true; |
| if (xstrcmp(cg.path, int_cg[CG_TRACK][CG_LEVEL_STEP].path)) |
| rc = false; |
| |
| common_cgroup_destroy(&cg); |
| return rc; |
| } |
| |
/*
 * Walk up the cgroup hierarchy starting at cg->path and record, in
 * limits->limit_in_bytes, the first memory.limit_in_bytes that is a real
 * limit (i.e. not the "unlimited" sentinel). Recursion stops at the
 * hard-coded mount root "/sys/fs/cgroup" or when the memory controller
 * files disappear. NOTE: mutates cg->path while ascending — callers pass a
 * throwaway copy.
 */
static void _get_mem_recursive(xcgroup_t *cg, cgroup_limits_t *limits)
{
	char *mem_max = NULL, *tmp_str = NULL;
	size_t mem_sz;
	unsigned long mem_lim;
	/*
	 * Value memory.limit_in_bytes reports when unlimited; presumably
	 * matches the kernel's page-counter maximum — TODO confirm.
	 */
	unsigned long page_counter_max = LONG_MAX - sysconf(_SC_PAGE_SIZE) + 1;

	if (!xstrcmp(cg->path, "/sys/fs/cgroup"))
		goto end;

	/* Break when there is no memory controller anymore */
	if (common_cgroup_get_param(cg, "memory.limit_in_bytes",
				    &mem_max, &mem_sz) != SLURM_SUCCESS)
		goto end;

	/* Check ancestor */
	mem_lim = slurm_atoul(mem_max);
	if (mem_lim == page_counter_max) {
		/* Unlimited at this level: ascend one directory and retry. */
		tmp_str = xdirname(cg->path);
		xfree(cg->path);
		cg->path = tmp_str;
		_get_mem_recursive(cg, limits);
		if (limits->limit_in_bytes != NO_VAL64)
			goto end;
	} else {
		/* found it! */
		limits->limit_in_bytes = mem_lim;
	}
end:
	xfree(mem_max);
}
| |
/*
 * Read the current constraints of a controller at the given level into a
 * freshly allocated cgroup_limits_t. Returns NULL if the controller cannot
 * be initialized or (for CG_CPUS) if reading the cpuset files fails; the
 * caller owns the returned struct (free with cgroup_free_limits).
 */
extern cgroup_limits_t *cgroup_p_constrain_get(cgroup_ctl_type_t sub,
					       cgroup_level_t level)
{
	int rc = SLURM_SUCCESS;
	cgroup_limits_t *limits;
	xcgroup_t tmp_cg = { 0 };

	/* Only initialize if not inited */
	if (!g_cg_ns[sub].mnt_point && (rc = _cgroup_init(sub)))
		return NULL;

	limits = xmalloc(sizeof(*limits));
	cgroup_init_limits(limits);

	switch (sub) {
	case CG_TRACK:
		break;
	case CG_CPUS:
		if (common_cgroup_get_param(&int_cg[sub][level],
					    "cpuset.cpus",
					    &limits->allow_cores,
					    &limits->cores_size)
		    != SLURM_SUCCESS)
			rc = SLURM_ERROR;

		if (common_cgroup_get_param(&int_cg[sub][level],
					    "cpuset.mems",
					    &limits->allow_mems,
					    &limits->mems_size)
		    != SLURM_SUCCESS)
			rc = SLURM_ERROR;

		/* Strip the trailing newline read from the cgroup files. */
		if (limits->cores_size > 0)
			limits->allow_cores[(limits->cores_size)-1] = '\0';

		if (limits->mems_size > 0)
			limits->allow_mems[(limits->mems_size)-1] = '\0';

		if (rc != SLURM_SUCCESS)
			goto fail;
		break;
	case CG_MEMORY:
		/* Use a throwaway path copy: _get_mem_recursive mutates it. */
		tmp_cg.path = xstrdup(int_cg[sub][level].path);
		_get_mem_recursive(&tmp_cg, limits);
		xfree(tmp_cg.path);
		break;
	case CG_DEVICES:
		break;
	default:
		/*
		 * NOTE(review): rc is set here but the (default-initialized)
		 * limits struct is still returned non-NULL — confirm callers
		 * expect that.
		 */
		error("cgroup subsystem %u not supported", sub);
		rc = SLURM_ERROR;
		break;
	}

	return limits;
fail:
	cgroup_free_limits(limits);
	return NULL;
}
| |
/*
 * Apply the given constraints to a controller at the given level (for
 * CG_DEVICES at CG_LEVEL_TASK, to the tracked task cgroup identified by
 * limits->taskid). Returns SLURM_SUCCESS, or SLURM_ERROR if any write
 * failed; later writes are still attempted after an earlier failure.
 */
extern int cgroup_p_constrain_set(cgroup_ctl_type_t sub, cgroup_level_t level,
				  cgroup_limits_t *limits)
{
	int rc = SLURM_SUCCESS;
	task_cg_info_t *task_cg_info;
	char *dev_str = NULL;

	if (!limits)
		return SLURM_ERROR;

	switch (sub) {
	case CG_TRACK:
		break;
	case CG_CPUS:
		/* Do not try to set the cpuset limits of slurmd in this case */
		if ((level == CG_LEVEL_SYSTEM) &&
		    (slurm_conf.task_plugin_param & SLURMD_SPEC_OVERRIDE))
			break;

		if (level == CG_LEVEL_SYSTEM ||
		    level == CG_LEVEL_USER ||
		    level == CG_LEVEL_JOB ||
		    level == CG_LEVEL_STEP) {
			if (common_cgroup_set_param(&int_cg[sub][level],
						    "cpuset.cpus",
						    limits->allow_cores)
			    != SLURM_SUCCESS)
				rc = SLURM_ERROR;
		}

		/* cpuset.mems is not written at the system level. */
		if (level == CG_LEVEL_USER ||
		    level == CG_LEVEL_JOB ||
		    level == CG_LEVEL_STEP) {
			if (common_cgroup_set_param(&int_cg[sub][level],
						    "cpuset.mems",
						    limits->allow_mems)
			    != SLURM_SUCCESS)
				rc = SLURM_ERROR;
		}
		break;
	case CG_MEMORY:
		/* Do not try to set the memory limits of slurmd in this case */
		if ((level == CG_LEVEL_SYSTEM) &&
		    (slurm_conf.task_plugin_param & SLURMD_SPEC_OVERRIDE))
			break;

		/* Swappiness is only applied at the job level, if provided. */
		if ((level == CG_LEVEL_JOB) &&
		    (limits->swappiness != NO_VAL64)) {
			rc = common_cgroup_set_uint64_param(&int_cg[sub][level],
							"memory.swappiness",
							limits->swappiness);
		}

		if (level == CG_LEVEL_JOB ||
		    level == CG_LEVEL_STEP ||
		    level == CG_LEVEL_SYSTEM) {
			if (common_cgroup_set_uint64_param(
				    &int_cg[sub][level],
				    "memory.limit_in_bytes",
				    limits->limit_in_bytes)
			    != SLURM_SUCCESS)
				rc = SLURM_ERROR;
		}

		if (level == CG_LEVEL_JOB ||
		    level == CG_LEVEL_STEP) {
			if (common_cgroup_set_uint64_param(
				    &int_cg[sub][level],
				    "memory.soft_limit_in_bytes",
				    limits->soft_limit_in_bytes)
			    != SLURM_SUCCESS)
				rc = SLURM_ERROR;

			/* mem+swap limit is optional (NO_VAL64 = not set). */
			if (limits->memsw_limit_in_bytes != NO_VAL64)
				if (common_cgroup_set_uint64_param(
					    &int_cg[sub][level],
					    "memory.memsw.limit_in_bytes",
					    limits->memsw_limit_in_bytes)
				    != SLURM_SUCCESS)
					rc = SLURM_ERROR;
		}
		break;
	case CG_DEVICES:
		/* Same device string feeds both the allow and deny files. */
		dev_str = gres_device_id2str(&limits->device);
		if (level == CG_LEVEL_STEP ||
		    level == CG_LEVEL_JOB) {
			if (limits->allow_device) {
				if (common_cgroup_set_param(
					    &int_cg[sub][level],
					    "devices.allow",
					    dev_str)
				    != SLURM_SUCCESS)
					rc = SLURM_ERROR;
			} else {
				if (common_cgroup_set_param(
					    &int_cg[sub][level],
					    "devices.deny",
					    dev_str)
				    != SLURM_SUCCESS)
					rc = SLURM_ERROR;
			}
		}

		if (level == CG_LEVEL_TASK) {
			/* Task-level writes go to the tracked task cgroup. */
			task_cg_info = list_find_first(g_task_list[sub],
						       _find_task_cg_info,
						       &(limits->taskid));
			if (!task_cg_info) {
				error("Task %d is not being tracked in %s controller, cannot set constrain.",
				      limits->taskid, g_cg_name[sub]);
				rc = SLURM_ERROR;
				break;
			}

			if (limits->allow_device) {
				rc = common_cgroup_set_param(
					&task_cg_info->task_cg,
					"devices.allow",
					dev_str);
			} else {
				rc = common_cgroup_set_param(
					&task_cg_info->task_cg,
					"devices.deny",
					dev_str);
			}
		}
		break;
	default:
		error("cgroup subsystem %u not supported", sub);
		rc = SLURM_ERROR;
		break;
	}

	xfree(dev_str);
	return rc;
}
| |
/*
 * No-op in this plugin: nothing needs to happen after
 * cgroup_p_constrain_set() has written the limits, so always succeed.
 */
extern int cgroup_p_constrain_apply(cgroup_ctl_type_t sub, cgroup_level_t level,
				    uint32_t task_id)
{
	return SLURM_SUCCESS;
}
| |
| /* |
| * Code based on linux tools/cgroup/cgroup_event_listener.c with adapted |
| * modifications for Slurm logic and needs. |
| */ |
| static int _read_fd(int fd, uint64_t *buf) |
| { |
| int rc = SLURM_ERROR; |
| size_t len = sizeof(uint64_t); |
| uint64_t *buf_ptr = buf; |
| ssize_t nread; |
| |
| while (len > 0 && (nread = read(fd, buf_ptr, len)) != 0) { |
| if (nread == -1) { |
| if (errno == EINTR) |
| continue; |
| error("read(): %m"); |
| break; |
| } |
| len -= nread; |
| buf_ptr += nread; |
| } |
| |
| if (len == 0) |
| rc = SLURM_SUCCESS; |
| |
| return rc; |
| } |
| |
/*
 * OOM monitoring thread body.
 *
 * Polls the eventfd registered against memory.oom_control and accumulates
 * event counts into the global oom_kill_count (under oom_mutex), until a
 * STOP_OOM message arrives on oom_pipe[0] or an fd error occurs.
 *
 * x - xmalloc'd oom_event_args_t; this thread closes the fds it holds and
 *     frees the struct before exiting.
 */
static void *_oom_event_monitor(void *x)
{
	oom_event_args_t *args = (oom_event_args_t *) x;
	int ret = -1;
	uint64_t res;
	struct pollfd fds[2];

	debug("started.");

	/*
	 * POLLPRI should only be meaningful for event_fd, since according to
	 * the poll() man page it may indicate "cgroup.events" file modified.
	 *
	 * POLLRDHUP should only be meaningful for oom_pipe[0], since it refers
	 * to stream socket peer closed connection.
	 *
	 * POLLHUP is ignored in events member, and should be set by the Kernel
	 * in revents even if not defined in events.
	 *
	 */
	fds[0].fd = args->event_fd;
	fds[0].events = POLLIN | POLLPRI;

	fds[1].fd = oom_pipe[0];
	fds[1].events = POLLIN | POLLRDHUP;

	/*
	 * Poll event_fd for oom_kill events plus oom_pipe[0] for stop msg.
	 * Specifying a negative value in timeout means an infinite timeout.
	 */
	while (1) {
		ret = poll(fds, 2, -1);

		if (ret == -1) {
			/* Error. */
			if (errno == EINTR)
				continue;

			error("poll(): %m");
			break;
		} else if (ret == 0) {
			/* Should not happen since infinite timeout. */
			error("poll() timeout.");
			break;
		} else if (ret > 0) {
			if (fds[0].revents & (POLLIN | POLLPRI)) {
				/* event_fd readable: accumulate the count. */
				res = 0;
				ret = _read_fd(args->event_fd, &res);
				if (ret == SLURM_SUCCESS) {
					slurm_mutex_lock(&oom_mutex);
					debug3("res: %"PRIu64"", res);
					oom_kill_count += res;
					debug("oom-kill event count: %"PRIu64"",
					      oom_kill_count);
					slurm_mutex_unlock(&oom_mutex);
				} else
					error("cannot read oom-kill counts.");
			} else if (fds[0].revents & (POLLRDHUP | POLLERR |
						     POLLHUP | POLLNVAL)) {
				error("problem with event_fd");
				break;
			}

			if (fds[1].revents & POLLIN) {
				/* oom_pipe[0] readable. */
				res = 0;
				ret = _read_fd(oom_pipe[0], &res);
				if (ret == SLURM_SUCCESS && res == STOP_OOM) {
					/* Read stop msg. */
					log_flag(CGROUP, "stop msg read.");
					break;
				}
			} else if (fds[1].revents &
				   (POLLRDHUP | POLLERR | POLLHUP | POLLNVAL)) {
				error("problem with oom_pipe[0]");
				break;
			}
		}
	}

	/* Only log the "no events" case; the count itself is read later. */
	slurm_mutex_lock(&oom_mutex);
	if (!oom_kill_count)
		debug("No oom events detected.");
	slurm_mutex_unlock(&oom_mutex);

	/* This thread owns these fds and the args struct. */
	close(args->event_fd);
	close(args->efd);
	close(args->cfd);
	close(oom_pipe[0]);
	xfree(args);

	debug("stopping.");

	return NULL;
}
| |
/*
 * Start out-of-memory event accounting for the step's memory cgroup.
 *
 * If memory.oom_control already exposes an "oom_kill" counter, record that
 * it can be read directly later (OOM_KILL_COUNTER) and return. Otherwise,
 * register an eventfd through cgroup.event_control and spawn
 * _oom_event_monitor() to count events (OOM_KILL_MON).
 *
 * RET SLURM_SUCCESS if monitoring is in place, SLURM_ERROR otherwise.
 */
extern int cgroup_p_step_start_oom_mgr(stepd_step_rec_t *step)
{
	char *control_file = NULL, *event_file = NULL, *line = NULL;
	int rc = SLURM_SUCCESS, event_fd = -1, cfd = -1, efd = -1;
	oom_event_args_t *event_args;
	size_t sz;

	rc = common_cgroup_get_param(&int_cg[CG_MEMORY][CG_LEVEL_STEP],
				     "memory.oom_control",
				     &event_file,
				     &sz);

	if (rc != SLURM_SUCCESS) {
		error("Not monitoring OOM events, memory.oom_control could not be read.");
		return rc;
	}

	/*
	 * If oom_kill field is found we will read it from the cgroup interface,
	 * so don't start the oom thread.
	 */
	if (event_file) {
		line = xstrstr(event_file, "oom_kill ");
		xfree(event_file);
		if (line) {
			oom_kill_type = OOM_KILL_COUNTER;
			return SLURM_SUCCESS;
		}
	}

	/*
	 * Start a new OOM monitor thread, used in kernels which do not support
	 * memory.oom_control's oom_kill field (<=3.x).
	 */
	xstrfmtcat(control_file, "%s/%s", int_cg[CG_MEMORY][CG_LEVEL_STEP].path,
		   "memory.oom_control");

	if ((cfd = open(control_file, O_RDONLY | O_CLOEXEC)) == -1) {
		error("Cannot open %s: %m", control_file);
		rc = SLURM_ERROR;
		goto fini;
	}

	xstrfmtcat(event_file, "%s/%s", int_cg[CG_MEMORY][CG_LEVEL_STEP].path,
		   "cgroup.event_control");

	if ((efd = open(event_file, O_WRONLY | O_CLOEXEC)) == -1) {
		error("Cannot open %s: %m", event_file);
		rc = SLURM_ERROR;
		goto fini;
	}

	if ((event_fd = eventfd(0, EFD_CLOEXEC)) == -1) {
		error("eventfd: %m");
		rc = SLURM_ERROR;
		goto fini;
	}

	/* "<event_fd> <control_fd>" is the registration format expected by
	 * cgroup.event_control. */
	xstrfmtcat(line, "%d %d", event_fd, cfd);

	oom_kill_count = 0;

	safe_write(efd, line, strlen(line) + 1);

	if (pipe2(oom_pipe, O_CLOEXEC) == -1) {
		error("pipe(): %m");
		rc = SLURM_ERROR;
		goto fini;
	}

	/*
	 * Monitoring thread should be responsible for closing the fd's and
	 * freeing the oom_event_args_t struct and members.
	 */
	event_args = xmalloc(sizeof(oom_event_args_t));
	event_args->cfd = cfd;
	event_args->efd = efd;
	event_args->event_fd = event_fd;

	slurm_mutex_init(&oom_mutex);
	slurm_thread_create(&oom_thread, _oom_event_monitor, event_args);
	oom_kill_type = OOM_KILL_MON;

fini:
	xfree(line);
	if (oom_kill_type != OOM_KILL_MON) {
		/*
		 * Error cleanup: fds never opened are still -1 and close(-1)
		 * just fails with EBADF. NOTE(review): oom_pipe is closed
		 * even when pipe2() failed or was not reached — presumably
		 * the file-scope oom_pipe holds invalid fds then; confirm.
		 */
		close(event_fd);
		close(efd);
		close(cfd);
		close(oom_pipe[0]);
		close(oom_pipe[1]);
	}
	xfree(event_file);
	xfree(control_file);

	if (rc != SLURM_SUCCESS)
		error("Unable to register OOM notifications for %s",
		      int_cg[CG_MEMORY][CG_LEVEL_STEP].path);
	return rc;

rwfail:
	/* safe_write() jumps here on failure. */
	error("Cannot write to %s", event_file);
	rc = SLURM_ERROR;
	goto fini;
}
| |
| static uint64_t _failcnt(xcgroup_t *cg, char *param) |
| { |
| uint64_t value = 0; |
| |
| if (xcgroup_get_uint64_param(cg, param, &value) != SLURM_SUCCESS) { |
| log_flag(CGROUP, "unable to read '%s' from '%s'", |
| param, cg->path); |
| value = 0; |
| } |
| |
| return value; |
| } |
| |
| static int _get_oom_kill_from_file(xcgroup_t *cg) |
| { |
| char *oom_control = NULL, *ptr; |
| size_t sz; |
| uint64_t local_oom_kill_cnt = 0; |
| |
| if (common_cgroup_get_param(cg, "memory.oom_control", |
| &oom_control, &sz) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| if (oom_control) { |
| if ((ptr = xstrstr(oom_control, "oom_kill "))) { |
| if (sscanf(ptr, "oom_kill %"PRIu64, |
| &local_oom_kill_cnt) != 1) |
| error("Cannot parse oom_kill counter from %s memory.oom_control.", |
| cg->path); |
| } |
| xfree(oom_control); |
| log_flag(CGROUP, "Detected %"PRIu64" out-of-memory events in %s", |
| local_oom_kill_cnt, cg->path); |
| oom_kill_count += local_oom_kill_cnt; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
/*
 * Stop OOM accounting for the step and collect the results.
 *
 * Gathers memory.failcnt (plus memory.memsw.failcnt when swap accounting
 * is available) for the step and job cgroups, then, depending on how
 * monitoring was started:
 *  - OOM_KILL_COUNTER: read oom_kill directly from the task and step
 *    cgroup interfaces under the cgroup lock.
 *  - OOM_KILL_MON: send STOP_OOM to the monitor thread, join it, and read
 *    the accumulated counter under oom_mutex.
 *
 * RET an xmalloc'd cgroup_oom_t (caller frees), or NULL when monitoring
 * never started or the step cgroup could not be locked.
 */
extern cgroup_oom_t *cgroup_p_step_stop_oom_mgr(stepd_step_rec_t *step)
{
	cgroup_oom_t *results = NULL;
	uint64_t stop_msg;

	if (oom_kill_type == OOM_KILL_NONE) {
		error("OOM events were not monitored for %ps: couldn't read memory.oom_control or subscribe to its events.",
		      &step->step_id);
		return results;
	}

	if (common_cgroup_lock(&int_cg[CG_MEMORY][CG_LEVEL_STEP]) !=
	    SLURM_SUCCESS) {
		error("common_cgroup_lock error: %m");
		goto fail_oom_results;
	}

	results = xmalloc(sizeof(*results));

	if (cgroup_p_has_feature(CG_MEMCG_SWAP)) {
		results->step_memsw_failcnt = _failcnt(
			&int_cg[CG_MEMORY][CG_LEVEL_STEP],
			"memory.memsw.failcnt");
		results->job_memsw_failcnt = _failcnt(
			&int_cg[CG_MEMORY][CG_LEVEL_JOB],
			"memory.memsw.failcnt");
	}
	results->step_mem_failcnt = _failcnt(&int_cg[CG_MEMORY][CG_LEVEL_STEP],
					     "memory.failcnt");
	results->job_mem_failcnt = _failcnt(&int_cg[CG_MEMORY][CG_LEVEL_JOB],
					    "memory.failcnt");

	/*
	 * If there's no OOM Thread, try to read oom_kill from the interface and
	 * accumulate the kills of the step into the global counter which should
	 * already contain all the tasks kills.
	 */
	if (oom_kill_type == OOM_KILL_COUNTER) {
		cgroup_ctl_type_t ctl = CG_MEMORY;

		list_for_each(g_task_list[ctl], _acct_task, &ctl);
		if (_get_oom_kill_from_file(
			    &int_cg[CG_MEMORY][CG_LEVEL_STEP]) !=
		    SLURM_SUCCESS) {
			log_flag(CGROUP,
				 "OOM events were not monitored for %ps",
				 &step->step_id);
		}
		results->oom_kill_cnt = oom_kill_count;
		/* Counter mode never opened oom_pipe nor oom_mutex. */
		common_cgroup_unlock(&int_cg[CG_MEMORY][CG_LEVEL_STEP]);
		return results;
	}
	common_cgroup_unlock(&int_cg[CG_MEMORY][CG_LEVEL_STEP]);

	/*
	 * oom_thread created, but could have finished before we attempt
	 * to send the stop msg. If it finished, oom_thread should had
	 * closed the read endpoint of oom_pipe.
	 */
	stop_msg = STOP_OOM;
	safe_write(oom_pipe[1], &stop_msg, sizeof(stop_msg));

rwfail: /* Ignore safe_write issues. */
	log_flag(CGROUP, "attempt to join oom_thread.");
	slurm_thread_join(oom_thread);

	/* Thread has exited; take the final count under the mutex anyway. */
	slurm_mutex_lock(&oom_mutex);
	results->oom_kill_cnt = oom_kill_count;
	slurm_mutex_unlock(&oom_mutex);

fail_oom_results:
	/*
	 * NOTE(review): on the lock-failure path this closes oom_pipe[1] and
	 * destroys oom_mutex even in OOM_KILL_COUNTER mode, where neither
	 * was created — presumably harmless (EBADF / no-op); confirm.
	 */
	close(oom_pipe[1]);
	slurm_mutex_destroy(&oom_mutex);

	return results;
}
| |
| /*************************************** |
| ***** CGROUP TASK FUNCTIONS ***** |
| **************************************/ |
| extern int cgroup_p_task_addto(cgroup_ctl_type_t sub, stepd_step_rec_t *step, |
| pid_t pid, uint32_t task_id) |
| { |
| if (task_id > g_max_task_id) |
| g_max_task_id = task_id; |
| |
| log_flag(CGROUP, "%ps taskid %u max_task_id %u", |
| &step->step_id, task_id, g_max_task_id); |
| |
| return _handle_task_cgroup(sub, step, pid, task_id); |
| } |
| |
/*
 * Collect accounting statistics for one task from its cpuacct and memory
 * task cgroups.
 *
 * taskid - task to look up in g_task_list.
 *
 * RET an xmalloc'd cgroup_acct_t (caller frees), or NULL if the task's
 * cgroups are not being tracked. Fields still set to NO_VAL64 signal to
 * the caller that the corresponding interface could not be read.
 */
extern cgroup_acct_t *cgroup_p_task_get_acct_data(uint32_t taskid)
{
	char *cpu_time = NULL, *memory_stat = NULL, *ptr;
	char *memory_peak = NULL;
	size_t cpu_time_sz = 0, memory_stat_sz = 0, tmp_sz = 0;
	cgroup_acct_t *stats = NULL;
	xcgroup_t *task_cpuacct_cg = NULL;
	xcgroup_t *task_memory_cg = NULL;

	/* Find which task cgroup to use */
	task_memory_cg = list_find_first(g_task_list[CG_MEMORY],
					 _find_task_cg_info,
					 &taskid);
	task_cpuacct_cg = list_find_first(g_task_list[CG_CPUACCT],
					  _find_task_cg_info,
					  &taskid);

	/*
	 * We should always find the task cgroup; if we don't for some reason,
	 * just print an error and return.
	 */
	if (!task_cpuacct_cg) {
		error("Could not find task_cpuacct_cg, this should never happen");
		return NULL;
	}

	if (!task_memory_cg) {
		error("Could not find task_memory_cg, this should never happen");
		return NULL;
	}

	/*
	 * Initialize values, a NO_VAL64 will indicate to the caller that
	 * something happened here.
	 */
	stats = xmalloc(sizeof(*stats));
	stats->usec = NO_VAL64;
	stats->ssec = NO_VAL64;
	stats->total_rss = NO_VAL64;
	stats->total_pgmajfault = NO_VAL64;
	stats->total_vmem = NO_VAL64;
	stats->memory_peak = INFINITE64; /* As required in common_jag.c */

	/* cpuacct.stat: "user <ticks>\nsystem <ticks>" (USER_HZ units). */
	if (common_cgroup_get_param(task_cpuacct_cg, "cpuacct.stat", &cpu_time,
				    &cpu_time_sz) == SLURM_SUCCESS) {
		sscanf(cpu_time, "%*s %"PRIu64" %*s %"PRIu64,
		       &stats->usec, &stats->ssec);
	}

	/* memory.stat: pick the hierarchical total_* counters. */
	if (common_cgroup_get_param(task_memory_cg, "memory.stat", &memory_stat,
				    &memory_stat_sz) == SLURM_SUCCESS) {
		if ((ptr = xstrstr(memory_stat, "total_rss")))
			sscanf(ptr, "total_rss %"PRIu64, &stats->total_rss);
		if ((ptr = xstrstr(memory_stat, "total_pgmajfault")))
			sscanf(ptr, "total_pgmajfault %"PRIu64,
			       &stats->total_pgmajfault);
	}

	/*
	 * total_rss != NO_VAL64 implies memory_stat was read successfully,
	 * so it is safe to keep parsing it for the vmem estimate.
	 */
	if (stats->total_rss != NO_VAL64) {
		uint64_t total_cache = NO_VAL64, total_swap = NO_VAL64;

		if ((ptr = xstrstr(memory_stat, "total_cache")))
			sscanf(ptr, "total_cache %"PRIu64, &total_cache);
		if ((ptr = xstrstr(memory_stat, "total_swap")))
			sscanf(ptr, "total_swap %"PRIu64, &total_swap);

		/* vmem = rss + cache + swap, skipping unreadable parts. */
		stats->total_vmem = stats->total_rss;
		if (total_cache != NO_VAL64)
			stats->total_vmem += total_cache;
		if (total_swap != NO_VAL64)
			stats->total_vmem += total_swap;
	}

	/* In cgroup/v1, memory.peak is provided by memory.max_usage_in_bytes */
	if (common_cgroup_get_param(task_memory_cg,
				    "memory.max_usage_in_bytes",
				    &memory_peak,
				    &tmp_sz) != SLURM_SUCCESS) {
		log_flag(CGROUP, "Cannot read task %d memory.max_usage_in_bytes interface",
			 taskid);
	}
	if (memory_peak) {
		if (sscanf(memory_peak, "%"PRIu64, &stats->memory_peak) != 1)
			error("Cannot parse memory.max_usage_in_bytes interface");
	}

	xfree(cpu_time);
	xfree(memory_stat);
	xfree(memory_peak);

	return stats;
}
| |
/*
 * cgroup/v1 usec and ssec are provided in USER_HZ, so return the system
 * clock ticks per second as the unit of the accounting values.
 */
extern long int cgroup_p_get_acct_units(void)
{
	return jobacct_gather_get_clk_tck();
}
| |
| extern bool cgroup_p_has_feature(cgroup_ctl_feature_t f) |
| { |
| struct stat st; |
| int rc; |
| char *memsw_filepath = NULL; |
| static int swap_enabled = -1; |
| |
| /* Check if swap constrain capability is enabled in this system. */ |
| switch (f) { |
| case CG_MEMCG_SWAP: |
| if (swap_enabled == -1) { |
| xstrfmtcat(memsw_filepath, |
| "%s/memory/memory.memsw.limit_in_bytes", |
| slurm_cgroup_conf.cgroup_mountpoint); |
| rc = stat(memsw_filepath, &st); |
| xfree(memsw_filepath); |
| return (swap_enabled = (rc == 0)); |
| } else |
| return swap_enabled; |
| default: |
| break; |
| } |
| |
| return false; |
| } |
| |
/* Signaling cgroup members is not implemented by this plugin. */
extern int cgroup_p_signal(int signal)
{
	error("%s not implemented in %s", __func__, plugin_name);
	return SLURM_ERROR;
}
| |
/*
 * Not supported by this plugin: there is no event file to watch for a task
 * cgroup becoming empty, so always return NULL.
 */
extern char *cgroup_p_get_task_empty_event_path(uint32_t taskid,
						bool *on_modify)
{
	return NULL;
}
| |
/* Not supported by this plugin. */
extern int cgroup_p_is_task_empty(uint32_t taskid)
{
	return ESLURM_NOT_SUPPORTED;
}