blob: ebdb668d773e9475e408312d2c561e8fb6b6cc26 [file] [log] [blame]
/*****************************************************************************\
* task_cgroup.c - Library for task pre-launch and post_termination functions
* for containment using linux cgroup subsystems
*****************************************************************************
* Copyright (C) 2009 CEA/DAM/DIF
* Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
* Modified by Felip Moll <felip.moll@schedmd.com>
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <signal.h>
#include <sys/types.h>
#include "slurm/slurm_errno.h"
#include "src/common/slurm_xlator.h"
#include "src/common/xstring.h"
#include "src/interfaces/cgroup.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
#include "src/slurmd/slurmd/slurmd.h"
#include "task_cgroup_cpuset.h"
#include "task_cgroup_memory.h"
#include "task_cgroup_devices.h"
/*
 * Identification of this plugin: human readable name, type string and the
 * Slurm version it was built against.
 * NOTE(review): presumably these symbols are looked up by Slurm's plugin
 * loader - confirm against the plugin API documentation.
 */
const char plugin_name[] = "Tasks containment cgroup plugin";
const char plugin_type[] = "task/cgroup";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/*
 * Per-subsystem switches. They start out false and are only turned on by
 * init(), and only when running inside slurmstepd. The task hooks below
 * consult them before calling into the corresponding helper.
 */
/* Set when cgroup.conf requests core constraining. */
static bool use_cpuset = false;
/*
 * Set when SELECT_MEMORY is present in slurm_conf.select_type_param and
 * cgroup.conf requests RAM or swap constraining.
 */
static bool use_memory = false;
/* Set when cgroup.conf requests device constraining. */
static bool use_devices = false;
/*
 * init() - set up the plugin when it is loaded.
 *
 * Validates the cgroup configuration and, when running inside slurmstepd,
 * records which constraints are requested and then initializes the matching
 * cpuset, memory and devices helpers, in that order.
 *
 * RET SLURM_SUCCESS on success, SLURM_ERROR when ConstrainSwapSpace is set but
 *     the cgroup layer reports no swap support, otherwise the code returned
 *     by the first helper whose initialization failed.
 */
extern int init(void)
{
	int rc = SLURM_SUCCESS;
	bool limit_memory;

	/* Sanity check for incompatible plugins */
	if (xstrcmp(slurm_cgroup_conf.cgroup_plugin, "disabled") == 0) {
		fatal("%s: CgroupPlugin=disabled in cgroup.conf is not compatible with task/cgroup.",
		      __func__);
	}

	if (slurm_cgroup_conf.constrain_swap_space) {
		if (!cgroup_g_has_feature(CG_MEMCG_SWAP)) {
			error("ConstrainSwapSpace is enabled but there is no support for swap in the memory cgroup controller.");
			return SLURM_ERROR;
		}
	}

	/* Outside of slurmstepd no subsystem is selected or initialized. */
	if (!running_in_slurmstepd())
		goto loaded;

	/*
	 * Select every subsystem before initializing any of them: the flags
	 * are all in place even if one of the initializations below bails out.
	 */
	if (slurm_cgroup_conf.constrain_cores)
		use_cpuset = true;

	limit_memory = slurm_cgroup_conf.constrain_ram_space ||
		       slurm_cgroup_conf.constrain_swap_space;
	if ((slurm_conf.select_type_param & SELECT_MEMORY) && limit_memory)
		use_memory = true;

	if (slurm_cgroup_conf.constrain_devices)
		use_devices = true;

	if (use_cpuset) {
		rc = task_cgroup_cpuset_init();
		if (rc) {
			error("failure enabling core enforcement: %s",
			      slurm_strerror(rc));
			return rc;
		}
		debug("core enforcement enabled");
	}

	if (use_memory) {
		rc = task_cgroup_memory_init();
		if (rc) {
			error("failure enabling memory enforcement: %s",
			      slurm_strerror(rc));
			return rc;
		}
		debug("memory enforcement enabled");
	}

	if (use_devices) {
		rc = task_cgroup_devices_init();
		if (rc) {
			error("failure enabling device enforcement: %s",
			      slurm_strerror(rc));
			return rc;
		}
		debug("device enforcement enabled");
	}

loaded:
	debug("%s loaded", plugin_name);
	return rc;
}
/*
 * fini() - shut down the helpers whose use_* flag is set.
 *
 * Every enabled helper is finalized, even if an earlier one reported a
 * failure.
 *
 * RET SLURM_SUCCESS if all enabled helpers finished cleanly, SLURM_ERROR if
 *     at least one of them did not.
 */
extern int fini(void)
{
	bool failed = false;

	if (use_cpuset) {
		if (task_cgroup_cpuset_fini() != SLURM_SUCCESS)
			failed = true;
	}

	if (use_memory) {
		if (task_cgroup_memory_fini() != SLURM_SUCCESS)
			failed = true;
	}

	if (use_devices) {
		if (task_cgroup_devices_fini() != SLURM_SUCCESS)
			failed = true;
	}

	debug("%s unloaded", plugin_name);

	return failed ? SLURM_ERROR : SLURM_SUCCESS;
}
/*
 * task_p_slurmd_batch_request() - task/cgroup has nothing to do in this hook.
 *
 * IN req - ignored
 * RET always SLURM_SUCCESS
 */
extern int task_p_slurmd_batch_request(batch_job_launch_msg_t *req)
{
	(void) req;	/* intentionally unused */

	return SLURM_SUCCESS;
}
/*
 * task_p_slurmd_launch_request() - task/cgroup has nothing to do in this hook.
 *
 * IN req - ignored
 * IN node_id - ignored
 * OUT err_msg - never written
 * RET always SLURM_SUCCESS
 */
extern int task_p_slurmd_launch_request(launch_tasks_request_msg_t *req,
					uint32_t node_id, char **err_msg)
{
	/* None of the arguments are needed by this plugin. */
	(void) req;
	(void) node_id;
	(void) err_msg;

	return SLURM_SUCCESS;
}
/*
 * task_p_pre_setuid() runs as root, before the UID is switched to the user
 * whose job is being launched. This is the place where the cgroup hierarchy
 * is created and handed the appropriate owner.
 *
 * Each enabled subsystem is asked to create its part; a failure in one does
 * not prevent the others from being attempted.
 *
 * RET SLURM_SUCCESS if every enabled subsystem succeeded, else SLURM_ERROR.
 */
extern int task_p_pre_setuid(stepd_step_rec_t *step)
{
	int rc = SLURM_SUCCESS;

	if (use_cpuset) {
		if (task_cgroup_cpuset_create(step) != SLURM_SUCCESS)
			rc = SLURM_ERROR;
	}

	if (use_memory) {
		if (task_cgroup_memory_create(step) != SLURM_SUCCESS)
			rc = SLURM_ERROR;
	}

	if (use_devices) {
		if (task_cgroup_devices_create(step) != SLURM_SUCCESS)
			rc = SLURM_ERROR;
	}

	return rc;
}
/*
 * task_p_pre_launch_priv() runs in privileged mode, before the application
 * task is exec'ed.
 *
 * The pid of local task node_tid is added to every enabled subsystem and,
 * when devices are constrained, the device restrictions are applied as well.
 * All enabled steps are attempted regardless of earlier failures.
 *
 * IN step - step record holding the task array
 * IN node_tid - index of the task in step->task[]
 * IN global_tid - task id passed through to the subsystem helpers
 * RET SLURM_SUCCESS if every attempted operation succeeded, else SLURM_ERROR.
 */
extern int task_p_pre_launch_priv(stepd_step_rec_t *step, uint32_t node_tid,
				  uint32_t global_tid)
{
	bool failed = false;

	/*
	 * The pid is looked up inside each branch on purpose: the step record
	 * is only dereferenced when at least one subsystem is enabled.
	 */
	if (use_cpuset) {
		if (task_cgroup_cpuset_add_pid(step,
					       step->task[node_tid]->pid,
					       global_tid) != SLURM_SUCCESS)
			failed = true;
	}

	if (use_memory) {
		if (task_cgroup_memory_add_pid(step,
					       step->task[node_tid]->pid,
					       global_tid) != SLURM_SUCCESS)
			failed = true;
	}

	if (use_devices) {
		if (task_cgroup_devices_add_pid(step,
						step->task[node_tid]->pid,
						global_tid) != SLURM_SUCCESS)
			failed = true;

		/* Attempted even if adding the pid failed just above. */
		if (task_cgroup_devices_constrain(step, node_tid,
						  global_tid) != SLURM_SUCCESS)
			failed = true;
	}

	return failed ? SLURM_ERROR : SLURM_SUCCESS;
}
/*
 * task_p_pre_launch() is called before the application task is exec'ed. The
 * TaskProlog program (from slurm.conf) and --task-prolog (from the srun
 * command line) run after it.
 *
 * task/cgroup has nothing to do at this point.
 *
 * IN step - ignored
 * RET always SLURM_SUCCESS
 */
extern int task_p_pre_launch(stepd_step_rec_t *step)
{
	(void) step;	/* intentionally unused */

	return SLURM_SUCCESS;
}
/*
 * task_p_post_term() is called after termination of an application task.
 * It is preceded by --task-epilog (from the srun command line), followed by
 * the TaskEpilog program (from slurm.conf).
 *
 * When memory is constrained, the OOM check is performed on the first call
 * only; later calls return SLURM_SUCCESS without doing anything.
 *
 * IN step - step record handed to the OOM check
 * IN task - unused
 * RET result of the OOM check on the call that performs it, SLURM_SUCCESS
 *     otherwise.
 */
extern int task_p_post_term(stepd_step_rec_t *step, stepd_step_task_info_t *task)
{
	/* Remembers across calls that the OOM check was already done. */
	static bool oom_checked = false;
	int rc;

	/*
	 * This hook runs once for every task on the node, but the check only
	 * needs to happen a single time.
	 */
	if (!use_memory || oom_checked)
		return SLURM_SUCCESS;

	rc = task_cgroup_memory_check_oom(step);
	oom_checked = true;

	return rc;
}
/*
 * task_p_post_step() is called after termination of the step, i.e. once all
 * of its tasks are done. It finalizes the plugin through fini().
 *
 * IN step - not used here
 * RET whatever fini() returns
 */
extern int task_p_post_step(stepd_step_rec_t *step)
{
	int rc = fini();

	return rc;
}
/*
 * task_p_add_pid() - hand pid to the add_extern_pid helper of every enabled
 * subsystem.
 *
 * All enabled subsystems are attempted even if one of them fails.
 *
 * IN pid - process to add
 * RET SLURM_SUCCESS if every enabled subsystem accepted the pid, else
 *     SLURM_ERROR.
 */
extern int task_p_add_pid(pid_t pid)
{
	bool failed = false;

	if (use_cpuset) {
		if (task_cgroup_cpuset_add_extern_pid(pid) != SLURM_SUCCESS)
			failed = true;
	}

	if (use_memory) {
		if (task_cgroup_memory_add_extern_pid(pid) != SLURM_SUCCESS)
			failed = true;
	}

	if (use_devices) {
		if (task_cgroup_devices_add_extern_pid(pid) != SLURM_SUCCESS)
			failed = true;
	}

	return failed ? SLURM_ERROR : SLURM_SUCCESS;
}