| /*****************************************************************************\ |
| * cgroup.h - driver for cgroup plugin |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #ifndef _INTERFACES_CGROUP_H |
| #define _INTERFACES_CGROUP_H |
| |
| /* Check filesystem type */ |
| #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) |
| #include <magic.h> |
| #include <sys/mount.h> |
| #include <sys/param.h> |
| #else |
| #include <linux/magic.h> |
| #include <sys/vfs.h> |
| #endif |
| |
| #include <pwd.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <time.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| #include <inttypes.h> |
| |
| #include "config.h" |
| |
| #include "slurm/slurm.h" |
| #include "src/slurmd/slurmd/slurmd.h" |
| #include "src/slurmd/slurmstepd/slurmstepd_job.h" |
| |
| #include "src/interfaces/gres.h" |
| #include "src/common/log.h" |
| #include "src/common/list.h" |
| #include "src/common/macros.h" |
| #include "src/common/pack.h" |
| #include "src/common/parse_config.h" |
| #include "src/common/parse_time.h" |
| #include "src/common/read_config.h" |
| #include "src/common/plugin.h" |
| #include "src/common/slurm_opt.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xstring.h" |
| #include "src/plugins/cgroup/common/cgroup_common.h" |
| |
| #ifdef __GNUC__ |
| #define F_TYPE_EQUAL(a, b) (a == (__typeof__(a)) b) |
| #else |
| #define F_TYPE_EQUAL(a, b) (a == (__SWORD_TYPE) b) |
| #endif |
| |
| /* Not defined in non-supported v2 linux versions -- e.g centos7 */ |
| #ifndef CGROUP2_SUPER_MAGIC |
| #define CGROUP2_SUPER_MAGIC 0x63677270 |
| #endif |
| |
| /* Default lower bound on memory limit in MB. This is required so we |
| * don't immediately kill slurmstepd on mem cgroup creation if |
| * an administrator or user sets and absurdly low mem limit. |
| */ |
| #define XCGROUP_DEFAULT_MIN_RAM 30 |
| |
| /* Current supported cgroup controller types */ |
| typedef enum { |
| CG_TRACK, |
| CG_CPUS, |
| CG_MEMORY, |
| CG_DEVICES, |
| CG_CPUACCT, |
| /* Below are extra controllers not explicitly tracked by Slurm. */ |
| CG_IO, |
| CG_HUGETLB, |
| CG_PIDS, |
| CG_RDMA, |
| CG_MISC, |
| CG_CTL_CNT |
| } cgroup_ctl_type_t; |
| |
| /* Current supported cgroup controller features */ |
| typedef enum { |
| CG_FALSE_ROOT, |
| CG_MEMCG_OOMGROUP, |
| CG_MEMCG_PEAK, |
| CG_MEMCG_SWAP, |
| CG_KILL_BUTTON |
| } cgroup_ctl_feature_t; |
| |
| typedef enum { |
| CG_LEVEL_ROOT, |
| CG_LEVEL_SLURM, |
| CG_LEVEL_USER, |
| CG_LEVEL_JOB, |
| CG_LEVEL_STEP, |
| CG_LEVEL_STEP_SLURM, |
| CG_LEVEL_STEP_USER, |
| CG_LEVEL_TASK, |
| CG_LEVEL_SYSTEM, |
| CG_LEVEL_CNT |
| } cgroup_level_t; |
| |
| typedef enum { |
| CGROUP_EMPTY, |
| CGROUP_POPULATED, |
| } cgroup_empty_t; |
| |
| /* This data type is used to get/set various parameters in cgroup hierarchy */ |
| typedef struct { |
| /* extra info */ |
| stepd_step_rec_t *step; |
| uint32_t taskid; |
| /* task cpuset */ |
| char *allow_cores; |
| char *allow_mems; |
| size_t cores_size; |
| size_t mems_size; |
| /* task devices */ |
| bool allow_device; |
| gres_device_id_t device; |
| /* jobacct memory */ |
| uint64_t limit_in_bytes; |
| uint64_t soft_limit_in_bytes; |
| uint64_t memsw_limit_in_bytes; |
| uint64_t swappiness; |
| } cgroup_limits_t; |
| |
| typedef struct { |
| uint64_t step_mem_failcnt; |
| uint64_t step_memsw_failcnt; |
| uint64_t job_mem_failcnt; |
| uint64_t job_memsw_failcnt; |
| uint64_t oom_kill_cnt; |
| } cgroup_oom_t; |
| |
| typedef struct { |
| uint64_t memory_peak; |
| uint64_t usec; |
| uint64_t ssec; |
| uint64_t total_rss; |
| uint64_t total_pgmajfault; |
| uint64_t total_vmem; |
| } cgroup_acct_t; |
| |
| /* Slurm cgroup plugins configuration parameters */ |
| typedef struct { |
| char *cgroup_mountpoint; |
| |
| char *cgroup_prepend; |
| |
| bool constrain_cores; |
| |
| bool constrain_ram_space; |
| float allowed_ram_space; |
| float max_ram_percent; /* Upper bound on memory as % of RAM */ |
| |
| uint64_t min_ram_space; /* Lower bound on memory limit (MB) */ |
| |
| bool constrain_swap_space; |
| float allowed_swap_space; |
| float max_swap_percent; /* Upper bound on swap as % of RAM */ |
| uint64_t memory_swappiness; |
| |
| bool constrain_devices; |
| char *cgroup_plugin; |
| |
| bool ignore_systemd; |
| bool ignore_systemd_on_failure; |
| |
| bool enable_controllers; |
| char *enable_extra_controllers; |
| |
| bool signal_children_processes; |
| uint64_t systemd_timeout; /* How much time to wait on systemd operations (msec)*/ |
| } cgroup_conf_t; |
| |
| |
| extern cgroup_conf_t slurm_cgroup_conf; |
| |
| /* global functions */ |
| extern int cgroup_conf_init(void); |
| extern void cgroup_conf_destroy(void); |
| extern void cgroup_free_limits(cgroup_limits_t *limits); |
| extern void cgroup_init_limits(cgroup_limits_t *limits); |
| extern list_t *cgroup_get_conf_list(void); |
| extern int cgroup_write_conf(int fd); |
| extern int cgroup_read_conf(int fd); |
| extern int cgroup_write_state(int fd); |
| extern int cgroup_read_state(int fd); |
| |
| extern bool cgroup_memcg_job_confinement(void); |
| extern char *autodetect_cgroup_version(void); |
| |
| /* global plugin functions */ |
| extern int cgroup_g_init(void); |
| extern int cgroup_g_fini(void); |
| |
| /* |
| * Create the cgroup namespace and the root cgroup objects. This two entities |
| * are the basic ones used by any other function and contain information about |
| * the cg paths, mount points, name, ownership, and so on. Set also any specific |
| * required parameter on the root cgroup depending on the controller. |
| * |
| * In cgroup/v1 a subsystem is a synonym for cgroup controller. |
| * |
| * IN sub - Controller to initialize. |
| * RET SLURM_SUCCESS or error |
| */ |
| extern int cgroup_g_initialize(cgroup_ctl_type_t sub); |
| |
| /* |
| * Create the system directories for the specified controller and set any |
| * required parameters. These directories are the ones where slurmd will |
| * be put if CoreSpecLimit, MemSpecLimit or CoreSpecCnt are set in slurm.conf. |
| * Current supported controllers are only cpuset and memory. |
| * |
| * IN sub - Controller to initialize. |
| * RET SLURM_SUCCESS or error |
| */ |
| extern int cgroup_g_system_create(cgroup_ctl_type_t sub); |
| |
| /* |
| * Add pids to the system cgroups. Typically these pids will be slurmstepd pids. |
| * |
| * IN sub - To which controller will the pids be added. |
| * IN pids - Array of pids to add. |
| * IN npids - Count of pids in the array. |
| * RET SLURM_SUCCESS if pids were correctly added or SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_system_addto(cgroup_ctl_type_t sub, pid_t *pids, int npids); |
| |
| /* |
| * rmdir the system cgroup controller and destroy the cgroup global objects. |
| * In v1 it will move our pid first to the root cgroup, otherwise removal would |
| * return EBUSY. |
| * |
| * IN sub - Which controller will be destroyed. |
| * RET SLURM_SUCCESS if destroy was successful, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_system_destroy(cgroup_ctl_type_t sub); |
| |
| /* |
| * Create the directories for a job step in the given controller, set also any |
| * needed default parameters. Initialize also the step cgroup objects. |
| * Every controller may have its own specific settings. This function is called |
| * from a slurmstepd only once. Record also that we're using this step object. |
| * |
| * IN sub - Under which controller will the directory hierarchy be created. |
| * IN job - Step record which is used to create the path in the hierarchy. |
| * RET SLURM_SUCCESS if creation was successful, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_step_create(cgroup_ctl_type_t sub, stepd_step_rec_t *step); |
| |
| /* |
| * Given a controller, add the specified pids to cgroup.procs of the step. Note |
| * that this function will always be called from slurmstepd, which will already |
| * have created the step hierarchy and will have the step cgroup objects |
| * initialized. |
| * |
| * IN sub - Under which controller will the directory hierarchy be created. |
| * IN pids - Array of pids to add. |
| * IN npids - Count of pids in the array. |
| * RET SLURM_SUCCESS if addition was possible, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_step_addto(cgroup_ctl_type_t sub, pid_t *pids, int npids); |
| |
| /* |
| * Get the pids under the freezer controller for this step. |
| * |
| * OUT pids - Array of pids containing the pids in this step. |
| * OUT npids - Count of pids in the array. |
| * RET SLURM_SUCCESS if pids were correctly obtained, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_step_get_pids(pid_t **pids, int *npids); |
| |
| /* |
| * Suspend the step using the freezer controller. |
| * |
| * RET SLURM_SUCCESS if operation was successful, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_step_suspend(void); |
| |
| /* |
| * Resume the step using the freezer controller. |
| * |
| * RET SLURM_SUCCESS if operation was successful, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_step_resume(void); |
| |
| /* |
| * If the caller (typically from a plugin) is the only one using this step |
| * object, rmdir the controller's step directories and destroy the associated |
| * cgroup objects. Decrement the step object's active usage count. |
| * |
| * IN sub - Which controller will be destroyed for this step. |
| * RET SLURM_SUCCESS if operation was successful, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_step_destroy(cgroup_ctl_type_t sub); |
| |
| /* |
| * Given a pid, determine if this pid is being tracked by the freezer container. |
| * |
| * RET true if pid was found, false in any other case. |
| */ |
| extern bool cgroup_g_has_pid(pid_t pid); |
| |
| /* |
| * Obtain the constrains set to the cgroup of the specified controller. |
| * |
| * IN sub - From which controller we want the limits. |
| * IN level - Directory level to get the info from. |
| * RET cgroup_limits_t object if limits could be obtained, NULL otherwise. |
| */ |
| extern cgroup_limits_t *cgroup_g_constrain_get(cgroup_ctl_type_t sub, |
| cgroup_level_t type); |
| |
| /* |
| * Set constrains to the root cgroup of the specified controller. |
| * |
| * IN sub - To which controller we want the limits be applied to. |
| * IN level - Directory level to apply the limits to. |
| * IN limits - Struct containing the the limits to be applied. |
| * RET SLURM_SUCCESS if limits were applied successfully, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_constrain_set(cgroup_ctl_type_t sub, cgroup_level_t level, |
| cgroup_limits_t *limits); |
| |
| /* |
| * This function is only needed in v2, in v1 will always return SLURM_SUCCESS |
| */ |
| extern int cgroup_g_constrain_apply(cgroup_ctl_type_t sub, cgroup_level_t level, |
| uint32_t task_id); |
| |
| /* |
| * Function to detect OOM conditions. |
| * |
| * In v2 it will just read memory.oom_control. |
| * |
| * In v1, use memory.oom_control and cgroup.event_control, see: |
| * https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt |
| * |
| * In v1, Start a monitoring thread which will read the event files with a |
| * polling mechanism and wait for a stop signal. When the stop signal is |
| * received this thread will communicate the detected OOMs. This is not a 100% |
| * reliable method since events can be triggered with more than just OOMs, e.g. |
| * rmdirs. |
| * |
| * IN job - Step record. |
| * RET SLURM_SUCCESS if monitoring thread is started, SLURM_ERROR otherwise. |
| */ |
| extern int cgroup_g_step_start_oom_mgr(stepd_step_rec_t *step); |
| |
| /* |
| * Signal the monitoring thread with a stop message and get the results. |
| * |
| * IN job - Step record. |
| * RET cgroup_oom_t - Struct containing the oom information for this step. |
| */ |
| extern cgroup_oom_t *cgroup_g_step_stop_oom_mgr(stepd_step_rec_t *step); |
| |
| /* |
| * Add a task_X directories to the specified controllers of this step and |
| * record we're tracking this task. Add the task pid to the controller. |
| * |
| * IN sub - controller we're managing |
| * IN job - step record to create the task directories and add the pid to. |
| * IN task_id - task number to form the path and create the task_x directory. |
| * IN pid - pid to add to. Note, the task_id may not coincide with job->task[i] |
| * so we may not know where the pid is stored in the job struct. |
| * RET SLURM_SUCCESS if the task was successfully created and the pid added to |
| * all accounting controllers. |
| */ |
| extern int cgroup_g_task_addto(cgroup_ctl_type_t sub, stepd_step_rec_t *step, |
| pid_t pid, uint32_t task_id); |
| |
| /* |
| * Given a task id return the accounting data reading the accounting controller |
| * files for this step. |
| * |
| * IN task_id - task number we want the data from, for the current step. |
| * RET cgroup_acct_t - struct containing the required data. |
| */ |
| extern cgroup_acct_t *cgroup_g_task_get_acct_data(uint32_t taskid); |
| |
| /* |
| * Return conversion units used for stats gathered from cpuacct. |
| * Dividing the provided data by this number will give seconds. |
| * |
| * RET hertz - USER_HZ of the system. |
| */ |
| extern long int cgroup_g_get_acct_units(void); |
| |
| /* |
| * Check if Cgroup has this feature available. |
| * Usually this will depend on the kernel config settings or the boot flags, |
| * and since checks can be done by slurmd before init, we are checking it |
| * directly from the root. |
| */ |
| extern bool cgroup_g_has_feature(cgroup_ctl_feature_t f); |
| |
| /* |
| * Send KILL signal to the user processes cgroup of this step atomically. |
| * |
| * IN signal - Signal to send. Actually only SIGKILL is supported. |
| * OUT - SLURM_ERROR if signal could not be sent, SLURM_SUCCESS otherwise. |
| */ |
| extern int cgroup_g_signal(int signal); |
| |
| extern char *cgroup_g_get_task_empty_event_path(uint32_t taskid, |
| bool *on_modify); |
| |
| extern int cgroup_g_is_task_empty(uint32_t taskid); |
| |
| #endif |