| /***************************************************************************** \ |
| * task_cgroup_memory.c - memory cgroup subsystem for task/cgroup |
| ***************************************************************************** |
| * Copyright (C) 2009 CEA/DAM/DIF |
| * Written by Matthieu Hautreux <matthieu.hautreux@cea.fr> |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "slurm/slurm_errno.h" |
| #include "slurm/slurm.h" |
| #include "src/slurmd/common/set_oomadj.h" |
| #include "src/slurmd/slurmstepd/slurmstepd_job.h" |
| #include "src/slurmd/slurmd/slurmd.h" |
| |
| #include "src/common/xstring.h" |
| #include "src/interfaces/cgroup.h" |
| |
/* Enforcement switches, copied from slurm_cgroup_conf at init time. */
static bool constrain_ram_space;
static bool constrain_swap_space;

/* Percentages applied to the requested memory when computing limits. */
static float allowed_ram_space;		/* Allowed RAM in percent */
static float allowed_swap_space;	/* Allowed Swap percent */

/* Node size and the bounds derived from it. */
static uint64_t totalram;	/* Total RealMemory of node from slurm.conf (MB) */
static uint64_t max_ram;	/* Upper bound for memory.limit_in_bytes */
static uint64_t max_swap;	/* Upper bound for RAM + swap (bytes) */
static uint64_t min_ram_space;	/* Don't constrain RAM below this value */

/* Set once the OOM manager has been started successfully for the step. */
static bool oom_mgr_started;
| |
/*
 * Convert a size in megabytes to bytes and scale it by a percentage.
 *
 * IN mb - size in megabytes
 * IN percent - percentage to apply (100.0 keeps the full size)
 * RET the scaled size in bytes, truncated toward zero. Returns 0 when mb is 0
 *     or percent is not a positive number, and UINT64_MAX when the result does
 *     not fit in 64 bits (instead of wrapping around, or relying on an
 *     out-of-range floating-point to integer conversion, which is undefined).
 */
static uint64_t percent_in_bytes(uint64_t mb, float percent)
{
	const uint64_t mib = 1024 * 1024;
	uint64_t bytes;
	double scaled;

	/* Negative and NaN percentages cannot be represented as uint64_t. */
	if (!mb || !(percent > 0.0f))
		return 0;

	/* Saturate rather than wrap when mb is too large to express in bytes. */
	bytes = (mb > (UINT64_MAX / mib)) ? UINT64_MAX : (mb * mib);

	scaled = bytes * (percent / 100.0);

	/* 2^64 and above is out of range for uint64_t. */
	if (scaled >= 18446744073709551616.0)
		return UINT64_MAX;

	return (uint64_t) scaled;
}
| |
| extern int task_cgroup_memory_init(void) |
| { |
| if (cgroup_g_initialize(CG_MEMORY) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| constrain_ram_space = slurm_cgroup_conf.constrain_ram_space; |
| constrain_swap_space = slurm_cgroup_conf.constrain_swap_space; |
| |
| /* |
| * as the swap space threshold will be configured with a |
| * mem+swp parameter value, if RAM space is not monitored, |
| * set allowed RAM space to 100% of the job requested memory. |
| * It will help to construct the mem+swp value that will be |
| * used for both mem and mem+swp limit during memcg creation. |
| */ |
| if (constrain_ram_space) |
| allowed_ram_space = slurm_cgroup_conf.allowed_ram_space; |
| else |
| allowed_ram_space = 100.0; |
| |
| allowed_swap_space = slurm_cgroup_conf.allowed_swap_space; |
| |
| if ((totalram = (uint64_t) conf->conf_memory_size) == 0) |
| error ("Unable to get RealMemory size"); |
| |
| max_ram = percent_in_bytes(totalram, |
| slurm_cgroup_conf.max_ram_percent); |
| max_swap = percent_in_bytes(totalram, |
| slurm_cgroup_conf.max_swap_percent); |
| max_swap += max_ram; |
| min_ram_space = slurm_cgroup_conf.min_ram_space * 1024 * 1024; |
| |
| debug("task/cgroup/memory: TotCfgRealMem:%"PRIu64"M allowed:%.4g%%(%s), " |
| "swap:%.4g%%(%s), max:%.4g%%(%"PRIu64"M) " |
| "max+swap:%.4g%%(%"PRIu64"M) min:%"PRIu64"M ", |
| totalram, allowed_ram_space, |
| constrain_ram_space ? "enforced" : "permissive", |
| allowed_swap_space, |
| constrain_swap_space ? "enforced" : "permissive", |
| slurm_cgroup_conf.max_ram_percent, |
| (uint64_t) (max_ram / (1024 * 1024)), |
| slurm_cgroup_conf.max_swap_percent, |
| (uint64_t) (max_swap / (1024 * 1024)), |
| slurm_cgroup_conf.min_ram_space); |
| |
| set_oom_adj_env(STEPD_OOM_ADJ); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int task_cgroup_memory_fini(void) |
| { |
| return cgroup_g_step_destroy(CG_MEMORY); |
| } |
| |
| /* Return configured memory limit in bytes given a memory limit in MB. */ |
| static uint64_t mem_limit_in_bytes(uint64_t mem, bool with_allowed) |
| { |
| /* |
| * If mem == 0 then assume there was no Slurm limit imposed |
| * on the amount of memory for job or step. Use the total |
| * amount of available RAM instead. |
| */ |
| if (mem == 0) |
| mem = totalram * 1024 * 1024; |
| else { |
| if (with_allowed) |
| mem = percent_in_bytes(mem, allowed_ram_space); |
| else |
| mem = percent_in_bytes(mem, 100.0); |
| } |
| |
| if (mem < min_ram_space) |
| return min_ram_space; |
| |
| if (mem > max_ram) |
| return max_ram; |
| |
| return mem; |
| } |
| |
| /* |
| * Return configured swap limit in bytes given a memory limit in MB. |
| * |
| * Swap limit is calculated as: |
| * mem_limit_in_bytes + (configured_swap_percent * allocated_mem_in_bytes) |
| */ |
| static uint64_t swap_limit_in_bytes(uint64_t mem) |
| { |
| uint64_t swap; |
| |
| /* If mem == 0 assume "unlimited" and use totalram. */ |
| swap = percent_in_bytes(mem ? mem : totalram, allowed_swap_space); |
| mem = mem_limit_in_bytes(mem, true) + swap; |
| |
| if (mem < min_ram_space) |
| return min_ram_space; |
| |
| if (mem > max_swap) |
| return max_swap; |
| |
| return mem; |
| } |
| |
| static int _memcg_initialize(stepd_step_rec_t *step, uint64_t mem_limit, |
| bool is_step) |
| { |
| uint64_t mlb = mem_limit_in_bytes(mem_limit, true); |
| uint64_t mlb_soft = mem_limit_in_bytes(mem_limit, false); |
| uint64_t mls = swap_limit_in_bytes(mem_limit); |
| cgroup_limits_t limits; |
| |
| if (mlb_soft > mlb) { |
| /* |
| * NOTE: It is recommended to set the soft limit always below |
| * the hard limit, otherwise the hard one will take precedence. |
| */ |
| debug2("Setting memory soft limit (%"PRIu64" bytes) to the same value as memory limit (%"PRIu64" bytes) for %s", |
| mlb_soft, mlb, is_step ? "step" : "job"); |
| mlb_soft = mlb; |
| } |
| |
| cgroup_init_limits(&limits); |
| |
| /* |
| * When RAM space has not to be constrained and we are here, it means |
| * that only Swap space has to be constrained. Thus set RAM space limit |
| * to the mem+swap limit too. |
| */ |
| if (!constrain_ram_space) |
| mlb = mls; |
| |
| limits.limit_in_bytes = mlb; |
| limits.soft_limit_in_bytes = mlb_soft; |
| limits.memsw_limit_in_bytes = NO_VAL64; |
| limits.swappiness = NO_VAL64; |
| |
| /* This limit has to be set only if ConstrainSwapSpace is set to yes. */ |
| if (constrain_swap_space) { |
| limits.swappiness = slurm_cgroup_conf.memory_swappiness; |
| limits.memsw_limit_in_bytes = mls; |
| info("%s: alloc=%"PRIu64"MB mem.limit=%"PRIu64"MB " |
| "memsw.limit=%"PRIu64"MB job_swappiness=%"PRIu64, |
| is_step ? "step" : "job", |
| mem_limit, |
| mlb/(1024*1024), |
| mls/(1024*1024), |
| limits.swappiness); |
| } else { |
| info("%s: alloc=%"PRIu64"MB mem.limit=%"PRIu64"MB " |
| "memsw.limit=unlimited", is_step ? "step" : "job", |
| mem_limit, |
| mlb/(1024*1024)); |
| } |
| |
| if (!is_step) { |
| if (cgroup_g_constrain_set(CG_MEMORY, CG_LEVEL_JOB, &limits) |
| != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| } else { |
| if (cgroup_g_constrain_set(CG_MEMORY, CG_LEVEL_STEP, &limits) |
| != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| extern int task_cgroup_memory_create(stepd_step_rec_t *step) |
| { |
| pid_t pid; |
| |
| if (cgroup_g_step_create(CG_MEMORY, step) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| /* Set the associated memory limits for the job and for the step. */ |
| if (_memcg_initialize(step, step->job_mem, false) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| if (_memcg_initialize(step, step->step_mem, true) != SLURM_SUCCESS) |
| return SLURM_ERROR; |
| |
| if (cgroup_g_step_start_oom_mgr(step) == SLURM_SUCCESS) |
| oom_mgr_started = true; |
| |
| /* Attach the slurmstepd to the step memory cgroup. */ |
| pid = getpid(); |
| return cgroup_g_step_addto(CG_MEMORY, &pid, 1); |
| } |
| |
| extern int task_cgroup_memory_check_oom(stepd_step_rec_t *step) |
| { |
| cgroup_oom_t *results; |
| int rc = SLURM_SUCCESS; |
| |
| if (!oom_mgr_started) |
| return SLURM_SUCCESS; |
| |
| results = cgroup_g_step_stop_oom_mgr(step); |
| |
| if (results == NULL) |
| return SLURM_ERROR; |
| |
| if (results->step_memsw_failcnt > 0) { |
| /* |
| * reports the number of times that the memory plus swap space |
| * limit has reached the value in memory.memsw.limit_in_bytes. |
| */ |
| info("%ps hit memory+swap limit at least once during execution. This may or may not result in some failure.", |
| &step->step_id); |
| } else if (results->step_mem_failcnt > 0) { |
| /* |
| * reports the number of times that the memory limit has reached |
| * the value set in memory.limit_in_bytes. |
| */ |
| info("%ps hit memory limit at least once during execution. This may or may not result in some failure.", |
| &step->step_id); |
| } |
| |
| if (results->job_memsw_failcnt > 0) { |
| info("%ps hit memory+swap limit at least once during execution. This may or may not result in some failure.", |
| &step->step_id); |
| } else if (results->job_mem_failcnt > 0) { |
| info("%ps hit memory limit at least once during execution. This may or may not result in some failure.", |
| &step->step_id); |
| } |
| |
| if (results->oom_kill_cnt) { |
| error("Detected %"PRIu64" oom_kill event%s in %ps. Some of the step tasks have been OOM Killed.", |
| results->oom_kill_cnt, |
| (results->oom_kill_cnt == 1) ? "" : "s" , |
| &step->step_id); |
| /* |
| * If OOMKillStep is set send a message to terminate this |
| * step, this is done to ensure that if this is a multinode |
| * step, the step gets terminated in all other nodes. |
| */ |
| if (step->oom_kill_step) { |
| slurm_terminate_job_step(step->step_id.job_id, |
| step->step_id.step_id); |
| } |
| rc = ENOMEM; |
| } |
| |
| xfree(results); |
| |
| return rc; |
| } |
| |
| extern int task_cgroup_memory_add_pid(stepd_step_rec_t *step, pid_t pid, |
| uint32_t taskid) |
| { |
| return cgroup_g_task_addto(CG_MEMORY, step, pid, taskid); |
| } |
| |
| extern int task_cgroup_memory_add_extern_pid(pid_t pid) |
| { |
| /* Only in the extern step we will not create specific tasks */ |
| return cgroup_g_step_addto(CG_MEMORY, &pid, 1); |
| } |