/* blob: c94a5cb748a66a81e743802a118fef2b2b318d39 [file] [log] [blame] */
/*****************************************************************************\
* cgroup.h - driver for cgroup plugin
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef _INTERFACES_CGROUP_H
#define _INTERFACES_CGROUP_H
/* Check filesystem type */
#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__)
#include <magic.h>
#include <sys/mount.h>
#include <sys/param.h>
#else
#include <linux/magic.h>
#include <sys/vfs.h>
#endif
#include <pwd.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <inttypes.h>
#include "config.h"
#include "slurm/slurm.h"
#include "src/slurmd/slurmd/slurmd.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
#include "src/interfaces/gres.h"
#include "src/common/log.h"
#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/pack.h"
#include "src/common/parse_config.h"
#include "src/common/parse_time.h"
#include "src/common/read_config.h"
#include "src/common/plugin.h"
#include "src/common/slurm_opt.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/plugins/cgroup/common/cgroup_common.h"
/*
 * F_TYPE_EQUAL(a, b) - compare a with b after converting b to a's type.
 *
 * With GNU-compatible compilers b is cast to __typeof__(a); otherwise it is
 * cast to __SWORD_TYPE. Both arguments and the cast operand are parenthesized
 * so that compound expressions (e.g. "x | y") are cast and compared as a
 * whole instead of being split by operator precedence.
 *
 * NOTE(review): presumably used to compare a statfs f_type field against a
 * filesystem magic number such as CGROUP2_SUPER_MAGIC - confirm with callers.
 * Note that "a" is expanded twice in the GNU variant, but __typeof__ does not
 * evaluate its operand, so "a" is still only evaluated once.
 */
#ifdef __GNUC__
#define F_TYPE_EQUAL(a, b) ((a) == (__typeof__(a)) (b))
#else
#define F_TYPE_EQUAL(a, b) ((a) == (__SWORD_TYPE) (b))
#endif
/*
 * Linux versions without cgroup v2 support (e.g. CentOS 7) do not define
 * CGROUP2_SUPER_MAGIC, so provide the value here when it is missing.
 */
#if !defined(CGROUP2_SUPER_MAGIC)
#define CGROUP2_SUPER_MAGIC 0x63677270
#endif
/*
 * Default lower bound on memory limit in MB. This is required so we
 * don't immediately kill slurmstepd on mem cgroup creation if
 * an administrator or user sets an absurdly low mem limit.
 */
#define XCGROUP_DEFAULT_MIN_RAM 30
/*
 * Currently supported cgroup controller types.
 *
 * CG_CTL_CNT is not a controller; being the last enumerator it evaluates to
 * the number of controller entries that precede it.
 */
typedef enum {
	CG_TRACK = 0,
	CG_CPUS,
	CG_MEMORY,
	CG_DEVICES,
	CG_CPUACCT,

	/* Extra controllers, not explicitly tracked by Slurm. */
	CG_IO,
	CG_HUGETLB,
	CG_PIDS,
	CG_RDMA,
	CG_MISC,

	CG_CTL_CNT
} cgroup_ctl_type_t;
/*
 * Currently supported cgroup controller features. Availability of a given
 * feature can be queried with cgroup_g_has_feature().
 */
typedef enum {
	CG_FALSE_ROOT = 0,
	CG_MEMCG_OOMGROUP,
	CG_MEMCG_PEAK,
	CG_MEMCG_SWAP,
	CG_KILL_BUTTON
} cgroup_ctl_feature_t;
/*
 * Directory levels of the cgroup hierarchy that limits can be read from or
 * applied to (see cgroup_g_constrain_get()/cgroup_g_constrain_set()).
 *
 * CG_LEVEL_CNT is not a level; being the last enumerator it evaluates to the
 * number of levels that precede it.
 */
typedef enum {
	CG_LEVEL_ROOT = 0,
	CG_LEVEL_SLURM,
	CG_LEVEL_USER,
	CG_LEVEL_JOB,
	CG_LEVEL_STEP,
	CG_LEVEL_STEP_SLURM,
	CG_LEVEL_STEP_USER,
	CG_LEVEL_TASK,
	CG_LEVEL_SYSTEM,

	CG_LEVEL_CNT
} cgroup_level_t;
/*
 * Whether a cgroup is empty or populated.
 * NOTE(review): presumably the values reported by cgroup_g_is_task_empty() -
 * confirm against the plugin implementations.
 */
typedef enum {
	CGROUP_EMPTY = 0,
	CGROUP_POPULATED
} cgroup_empty_t;
/*
 * Container for the parameters that can be read from, or written to, the
 * cgroup hierarchy (see cgroup_g_constrain_get()/cgroup_g_constrain_set()).
 * Objects are set up with cgroup_init_limits() and released with
 * cgroup_free_limits().
 */
typedef struct {
	/* Extra info */
	stepd_step_rec_t *step;
	uint32_t taskid;

	/*
	 * Task cpuset.
	 * NOTE(review): cores_size/mems_size presumably describe the
	 * allow_cores/allow_mems strings - confirm.
	 */
	char *allow_cores;
	char *allow_mems;
	size_t cores_size;
	size_t mems_size;

	/* Task devices */
	bool allow_device;
	gres_device_id_t device;

	/* Jobacct memory */
	uint64_t limit_in_bytes;
	uint64_t soft_limit_in_bytes;
	uint64_t memsw_limit_in_bytes;
	uint64_t swappiness;
} cgroup_limits_t;
/*
 * OOM information for a step, as returned by cgroup_g_step_stop_oom_mgr().
 * NOTE(review): the *_failcnt members presumably hold memory / memory+swap
 * limit failure counts at step and job level - confirm with the plugins.
 */
typedef struct {
	/* Step level counters */
	uint64_t step_mem_failcnt;
	uint64_t step_memsw_failcnt;

	/* Job level counters */
	uint64_t job_mem_failcnt;
	uint64_t job_memsw_failcnt;

	/* OOM kill counter */
	uint64_t oom_kill_cnt;
} cgroup_oom_t;
/*
 * Accounting data of a task, as returned by cgroup_g_task_get_acct_data().
 * NOTE(review): usec/ssec presumably are user/system cpu time expressed in
 * the units reported by cgroup_g_get_acct_units() - confirm.
 */
typedef struct {
	/* Memory */
	uint64_t memory_peak;

	/* Cpu time */
	uint64_t usec;
	uint64_t ssec;

	/* Totals */
	uint64_t total_rss;
	uint64_t total_pgmajfault;
	uint64_t total_vmem;
} cgroup_acct_t;
/*
 * Configuration parameters of the Slurm cgroup plugins.
 * The global instance is slurm_cgroup_conf.
 */
typedef struct {
	char *cgroup_mountpoint;
	char *cgroup_prepend;

	bool constrain_cores;

	/* RAM */
	bool constrain_ram_space;
	float allowed_ram_space;
	/* Upper bound on memory as % of RAM */
	float max_ram_percent;
	/* Lower bound on memory limit (MB) */
	uint64_t min_ram_space;

	/* Swap */
	bool constrain_swap_space;
	float allowed_swap_space;
	/* Upper bound on swap as % of RAM */
	float max_swap_percent;
	uint64_t memory_swappiness;

	bool constrain_devices;

	char *cgroup_plugin;

	bool ignore_systemd;
	bool ignore_systemd_on_failure;

	bool enable_controllers;
	char *enable_extra_controllers;

	bool signal_children_processes;

	/* How much time to wait on systemd operations (msec) */
	uint64_t systemd_timeout;
} cgroup_conf_t;
/*
 * Global cgroup configuration object. Only declared here; the definition
 * lives in another translation unit.
 */
extern cgroup_conf_t slurm_cgroup_conf;
/*
 * global functions
 *
 * NOTE(review): these prototypes carry no documentation in this header. The
 * notes below are inferred from names and signatures only - confirm them
 * against the definitions before relying on them.
 */
/* Presumably sets up slurm_cgroup_conf; returns an int status code. */
extern int cgroup_conf_init(void);
/* Presumably the counterpart of cgroup_conf_init(). */
extern void cgroup_conf_destroy(void);
/* Release / initialize a cgroup_limits_t object passed by pointer. */
extern void cgroup_free_limits(cgroup_limits_t *limits);
extern void cgroup_init_limits(cgroup_limits_t *limits);
/* Returns a list_t; presumably describes the current configuration. */
extern list_t *cgroup_get_conf_list(void);
/*
 * Write/read the cgroup configuration or state through the file descriptor
 * fd; each returns an int status code.
 */
extern int cgroup_write_conf(int fd);
extern int cgroup_read_conf(int fd);
extern int cgroup_write_state(int fd);
extern int cgroup_read_state(int fd);
/* Presumably tells whether jobs are confined with the memory cgroup. */
extern bool cgroup_memcg_job_confinement(void);
/*
 * Presumably returns a string naming the detected cgroup version.
 * NOTE(review): ownership of the returned string is not visible here.
 */
extern char *autodetect_cgroup_version(void);
/*
 * global plugin functions
 *
 * NOTE(review): cgroup_g_init()/cgroup_g_fini() presumably set up and tear
 * down the cgroup plugin interface; both return an int status code. Confirm
 * against the definitions.
 */
extern int cgroup_g_init(void);
extern int cgroup_g_fini(void);
/*
 * Create the cgroup namespace and the root cgroup objects. These two entities
 * are the basic ones used by any other function and contain information about
 * the cg paths, mount points, name, ownership, and so on. Also set any
 * specific required parameter on the root cgroup depending on the controller.
 *
 * In cgroup/v1 a subsystem is a synonym for cgroup controller.
 *
 * IN sub - Controller to initialize.
 * RET SLURM_SUCCESS or error
 */
extern int cgroup_g_initialize(cgroup_ctl_type_t sub);
/*
 * Create the system directories for the specified controller and set any
 * required parameters. These directories are the ones where slurmd will
 * be put if CoreSpecLimit, MemSpecLimit or CoreSpecCnt are set in slurm.conf.
 * Currently the only supported controllers are cpuset and memory.
 *
 * IN sub - Controller to initialize.
 * RET SLURM_SUCCESS or error
 */
extern int cgroup_g_system_create(cgroup_ctl_type_t sub);
/*
 * Add pids to the system cgroups. Typically these pids will be slurmstepd pids.
 *
 * IN sub - Controller to which the pids will be added.
 * IN pids - Array of pids to add.
 * IN npids - Count of pids in the array.
 * RET SLURM_SUCCESS if pids were correctly added or SLURM_ERROR otherwise.
 */
extern int cgroup_g_system_addto(cgroup_ctl_type_t sub, pid_t *pids, int npids);
/*
 * rmdir the system cgroup controller and destroy the cgroup global objects.
 * In v1 it will first move our pid to the root cgroup, otherwise removal would
 * return EBUSY.
 *
 * IN sub - Which controller will be destroyed.
 * RET SLURM_SUCCESS if destroy was successful, SLURM_ERROR otherwise.
 */
extern int cgroup_g_system_destroy(cgroup_ctl_type_t sub);
/*
 * Create the directories for a job step in the given controller and set any
 * needed default parameters. Also initialize the step cgroup objects.
 * Every controller may have its own specific settings. This function is called
 * from a slurmstepd only once. Also record that we're using this step object.
 *
 * IN sub - Under which controller will the directory hierarchy be created.
 * IN step - Step record which is used to create the path in the hierarchy.
 * RET SLURM_SUCCESS if creation was successful, SLURM_ERROR otherwise.
 */
extern int cgroup_g_step_create(cgroup_ctl_type_t sub, stepd_step_rec_t *step);
/*
 * Given a controller, add the specified pids to cgroup.procs of the step. Note
 * that this function will always be called from slurmstepd, which will already
 * have created the step hierarchy and will have the step cgroup objects
 * initialized.
 *
 * IN sub - Controller whose step cgroup the pids will be added to.
 * IN pids - Array of pids to add.
 * IN npids - Count of pids in the array.
 * RET SLURM_SUCCESS if addition was possible, SLURM_ERROR otherwise.
 */
extern int cgroup_g_step_addto(cgroup_ctl_type_t sub, pid_t *pids, int npids);
/*
 * Get the pids under the freezer controller for this step.
 *
 * OUT pids - Array containing the pids in this step.
 * OUT npids - Count of pids in the array.
 * RET SLURM_SUCCESS if pids were correctly obtained, SLURM_ERROR otherwise.
 *
 * NOTE(review): the array is returned through a pid_t ** so it is presumably
 * allocated by the callee and owned by the caller afterwards - confirm.
 */
extern int cgroup_g_step_get_pids(pid_t **pids, int *npids);
/*
 * Suspend the step using the freezer controller.
 * The counterpart operation is cgroup_g_step_resume().
 *
 * RET SLURM_SUCCESS if operation was successful, SLURM_ERROR otherwise.
 */
extern int cgroup_g_step_suspend(void);
/*
 * Resume the step using the freezer controller.
 * The counterpart operation is cgroup_g_step_suspend().
 *
 * RET SLURM_SUCCESS if operation was successful, SLURM_ERROR otherwise.
 */
extern int cgroup_g_step_resume(void);
/*
 * If the caller (typically a plugin) is the only one using this step
 * object, rmdir the controller's step directories and destroy the associated
 * cgroup objects. Decrement the step object's active usage count.
 *
 * IN sub - Which controller will be destroyed for this step.
 * RET SLURM_SUCCESS if operation was successful, SLURM_ERROR otherwise.
 */
extern int cgroup_g_step_destroy(cgroup_ctl_type_t sub);
/*
 * Given a pid, determine if this pid is being tracked by the freezer container.
 *
 * IN pid - Pid to look for.
 * RET true if pid was found, false in any other case.
 */
extern bool cgroup_g_has_pid(pid_t pid);
/*
 * Obtain the constraints set on the cgroup of the specified controller.
 *
 * IN sub - Controller to get the limits from.
 * IN level - Directory level to get the info from.
 * RET cgroup_limits_t object if limits could be obtained, NULL otherwise.
 *
 * NOTE(review): the returned object is presumably to be released by the
 * caller with cgroup_free_limits() - confirm against the implementations.
 */
extern cgroup_limits_t *cgroup_g_constrain_get(cgroup_ctl_type_t sub,
					       cgroup_level_t level);
/*
 * Set constraints on the cgroup of the specified controller at the given
 * directory level.
 *
 * IN sub - Controller the limits will be applied to.
 * IN level - Directory level to apply the limits to.
 * IN limits - Struct containing the limits to be applied.
 * RET SLURM_SUCCESS if limits were applied successfully, SLURM_ERROR otherwise.
 */
extern int cgroup_g_constrain_set(cgroup_ctl_type_t sub, cgroup_level_t level,
cgroup_limits_t *limits);
/*
 * This function is only needed in v2; in v1 it will always return
 * SLURM_SUCCESS.
 *
 * NOTE(review): parameters are not documented in this header. Going by the
 * sibling cgroup_g_constrain_set(), sub is the controller and level the
 * directory level; task_id presumably selects the task when a task level is
 * given - confirm against the v2 implementation.
 */
extern int cgroup_g_constrain_apply(cgroup_ctl_type_t sub, cgroup_level_t level,
uint32_t task_id);
/*
 * Function to detect OOM conditions.
 *
 * In v2 it will just read memory.oom_control.
 *
 * In v1, use memory.oom_control and cgroup.event_control, see:
 * https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
 *
 * In v1, start a monitoring thread which will read the event files with a
 * polling mechanism and wait for a stop signal. When the stop signal is
 * received this thread will communicate the detected OOMs. This is not a 100%
 * reliable method since events can be triggered by more than just OOMs, e.g.
 * rmdirs.
 *
 * IN step - Step record.
 * RET SLURM_SUCCESS if monitoring thread is started, SLURM_ERROR otherwise.
 */
extern int cgroup_g_step_start_oom_mgr(stepd_step_rec_t *step);
/*
 * Signal the monitoring thread with a stop message and get the results.
 *
 * IN step - Step record.
 * RET cgroup_oom_t - Struct containing the oom information for this step.
 *
 * NOTE(review): ownership of the returned struct is not stated here -
 * confirm who frees it.
 */
extern cgroup_oom_t *cgroup_g_step_stop_oom_mgr(stepd_step_rec_t *step);
/*
 * Add a task_X directory to the specified controller of this step and
 * record that we're tracking this task. Add the task pid to the controller.
 *
 * IN sub - controller we're managing
 * IN step - step record to create the task directories and add the pid to.
 * IN pid - pid to add. Note, the task_id may not coincide with step->task[i]
 *	    so we may not know where the pid is stored in the step struct.
 * IN task_id - task number to form the path and create the task_x directory.
 * RET SLURM_SUCCESS if the task was successfully created and the pid added to
 *     all accounting controllers.
 */
extern int cgroup_g_task_addto(cgroup_ctl_type_t sub, stepd_step_rec_t *step,
pid_t pid, uint32_t task_id);
/*
 * Given a task id, return the accounting data obtained by reading the
 * accounting controller files for this step.
 *
 * IN taskid - task number we want the data from, for the current step.
 * RET cgroup_acct_t - struct containing the required data.
 *
 * NOTE(review): ownership of the returned struct is not stated here -
 * confirm who frees it.
 */
extern cgroup_acct_t *cgroup_g_task_get_acct_data(uint32_t taskid);
/*
 * Return the conversion units used for stats gathered from cpuacct.
 * Dividing the provided data by this number will give seconds.
 *
 * RET hertz - USER_HZ of the system.
 */
extern long int cgroup_g_get_acct_units(void);
/*
 * Check if cgroup has this feature available.
 * Usually this will depend on the kernel config settings or the boot flags,
 * and since checks can be done by slurmd before init, we are checking it
 * directly from the root.
 *
 * IN f - Feature to check for.
 * RET true if the feature is available, false otherwise.
 */
extern bool cgroup_g_has_feature(cgroup_ctl_feature_t f);
/*
 * Send KILL signal to the user processes cgroup of this step atomically.
 *
 * IN signal - Signal to send. Currently only SIGKILL is supported.
 * RET SLURM_ERROR if signal could not be sent, SLURM_SUCCESS otherwise.
 */
extern int cgroup_g_signal(int signal);
/*
 * NOTE(review): undocumented in this header; the following is inferred from
 * the name and signature only. Presumably returns the path of a file that can
 * be watched to learn when the cgroup of task taskid becomes empty, with
 * on_modify being an output flag describing how that file must be watched.
 * Confirm the semantics and the ownership of the returned string against the
 * plugin implementations.
 */
extern char *cgroup_g_get_task_empty_event_path(uint32_t taskid,
bool *on_modify);
/*
 * NOTE(review): undocumented in this header. Presumably reports whether the
 * cgroup of task taskid is empty, possibly as a cgroup_empty_t value
 * (CGROUP_EMPTY / CGROUP_POPULATED) or an error code - confirm against the
 * plugin implementations.
 */
extern int cgroup_g_is_task_empty(uint32_t taskid);
#endif