blob: b9cb5b8d161b0f68ab7f8c0fc3fda597c437dff8 [file] [log] [blame] [edit]
/*****************************************************************************\
* proctrack_cgroup.c - process tracking via linux cgroup containers
*****************************************************************************
* Copyright (C) 2009 CEA/DAM/DIF
* Written by Matthieu Hautreux <matthieu.hautreux@cea.fr>
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"
#include "src/common/log.h"
#include "src/common/xstring.h"
#include "src/common/cgroup.h"
#include "src/common/read_config.h"
#include "src/slurmd/common/xcpuinfo.h"
#include "src/slurmd/slurmd/slurmd.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore it.
*
* plugin_name - a string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - a string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. Slurm uses the higher-level plugin
* interface which requires this string to be of the form
*
* <application>/<method>
*
* where <application> is a description of the intended application of
* the plugin (e.g., "jobcomp" for Slurm job completion logging) and <method>
* is a description of how this plugin satisfies that application. Slurm will
* only load job completion logging plugins if the plugin_type string has a
* prefix of "jobcomp/".
*
* plugin_version - an unsigned 32-bit integer containing the Slurm version
* (major.minor.micro combined into a single number).
*/
/* Human-readable description of this plugin. */
const char plugin_name[] = "Process tracking via linux cgroup freezer subsystem";
/* Plugin type string, written in the <application>/<method> form. */
const char plugin_type[] = "proctrack/cgroup";
/* Version number exported by this plugin, taken from SLURM_VERSION_NUMBER. */
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/*
 * Tell whether a process is a direct child of the step's slurmstepd.
 *
 * The parent pid of `pid` is read from /proc/<pid>/stat and compared with
 * `id`, which is interpreted as a pid.
 *
 * IN id  - container id, cast to pid_t and compared with the parent pid
 * IN pid - process to inspect
 * RET 1 if the parent pid of `pid` equals `id`, 0 if it does not, and
 *     -1 if the stat file could not be named, opened, read or parsed
 */
int
_slurm_cgroup_is_pid_a_slurm_task(uint64_t id, pid_t pid)
{
	int fd;
	int ppid;	/* plain int so that it matches the %d conversion */
	ssize_t len;
	char *comm_end;
	char file_path[PATH_MAX], buf[2048];

	if (snprintf(file_path, PATH_MAX, "/proc/%ld/stat",
		     (long)pid) >= PATH_MAX) {
		debug2("unable to build pid '%d' stat file: %m ", pid);
		return -1;
	}

	if ((fd = open(file_path, O_RDONLY)) < 0) {
		debug2("unable to open '%s' : %m ", file_path);
		return -1;
	}

	/* keep one byte free so the data can be NUL-terminated for parsing */
	len = read(fd, buf, sizeof(buf) - 1);
	if (len <= 0) {
		/* log before close() so that %m still reports read()'s errno */
		debug2("unable to read '%s' : %m ", file_path);
		close(fd);
		return -1;
	}
	close(fd);
	buf[len] = '\0';

	/*
	 * The line looks like "pid (comm) state ppid ...". The command name
	 * may itself contain spaces or parentheses, so resume parsing after
	 * the last ')' rather than counting whitespace-separated fields from
	 * the beginning of the line.
	 */
	comm_end = strrchr(buf, ')');
	if (!comm_end || (sscanf(comm_end + 1, " %*c %d", &ppid) != 1)) {
		debug2("unable to get ppid of pid '%d', %m", pid);
		return -1;
	}

	/*
	 * assume that any child of slurmstepd is a slurm task
	 * they will get all signals, inherited processes will
	 * only get SIGKILL
	 */
	return (ppid == (pid_t) id) ? 1 : 0;
}
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called.
 *
 * Sets up the cpuinfo data first, then the cgroup data for CG_TRACK. If the
 * cgroup step fails, the cpuinfo setup is undone before reporting the error.
 *
 * RET SLURM_SUCCESS when both steps succeed, SLURM_ERROR otherwise
 */
extern int init (void)
{
	int rc = SLURM_ERROR;

	if (xcpuinfo_init() == XCPUINFO_SUCCESS) {
		if (cgroup_g_initialize(CG_TRACK) == SLURM_SUCCESS)
			rc = SLURM_SUCCESS;
		else
			xcpuinfo_fini();	/* roll back the first step */
	}

	return rc;
}
/*
 * fini() calls xcpuinfo_fini(), the counterpart of the xcpuinfo_init() call
 * made in init(), and always returns SLURM_SUCCESS.
 * NOTE(review): presumably invoked when the plugin is unloaded -- confirm
 * against the plugin loader.
 */
extern int fini (void)
{
xcpuinfo_fini();
return SLURM_SUCCESS;
}
/*
 * Create the tracking container for a step by delegating to
 * cgroup_g_step_create() with CG_TRACK; its return value is passed back
 * unchanged.
 *
 * Uses slurmd job-step manager's pid as the unique container id.
 * NOTE(review): the id assignment is not visible in this function;
 * presumably it happens inside the cgroup layer -- confirm.
 *
 * IN job - step record handed to cgroup_g_step_create()
 */
extern int proctrack_p_create (stepd_step_rec_t *job)
{
return cgroup_g_step_create(CG_TRACK, job);
}
/*
 * Add one process to the step's tracking container by passing it, as a
 * single-element pid list, to cgroup_g_step_addto() with CG_TRACK.
 *
 * IN job - step record; not used by this implementation
 * IN pid - process to add
 * RET the value returned by cgroup_g_step_addto()
 */
extern int proctrack_p_add (stepd_step_rec_t *job, pid_t pid)
{
return cgroup_g_step_addto(CG_TRACK, &pid, 1);
}
/*
 * Deliver a signal to the processes of the step.
 *
 * SIGSTOP is never sent with kill(): the step is suspended through
 * cgroup_g_step_suspend() instead. SIGKILL is sent to every listed pid after
 * a resume request; any other signal is only sent to the pids that
 * _slurm_cgroup_is_pid_a_slurm_task() reports as slurm tasks. The pid equal
 * to `id` is always skipped.
 *
 * IN id     - container id, also compared (as a pid) with each listed pid
 * IN signal - signal number to deliver
 * RET SLURM_SUCCESS, except for SIGSTOP and SIGCONT where the result of the
 *     suspend or resume call is returned
 */
extern int proctrack_p_signal (uint64_t id, int signal)
{
	pid_t *step_pids = NULL;
	int pid_cnt = 0;
	int idx;
	const pid_t stepd_pid = (pid_t) id;

	/* Fetch every pid currently associated with the step. */
	if (cgroup_g_step_get_pids(&step_pids, &pid_cnt) != SLURM_SUCCESS) {
		debug3("unable to get pids list for cont_id=%"PRIu64"", id);
		/*
		 * All the processes may already have left the container,
		 * so this is not reported as a failure.
		 */
		return SLURM_SUCCESS;
	}

	switch (signal) {
	case SIGSTOP:
		/* Suspension is handled by the cgroup layer, not by kill(). */
		xfree(step_pids);
		return cgroup_g_step_suspend();
	case SIGKILL:
		/* Resume first so that the kill is sent to a resumed step. */
		cgroup_g_step_resume();
		break;
	default:
		break;
	}

	for (idx = 0; idx < pid_cnt; idx++) {
		pid_t target = step_pids[idx];
		int is_task;

		/*
		 * Never signal slurmstepd itself. It should not be in the
		 * list, but skip it explicitly in case it is.
		 */
		if (target == stepd_pid)
			continue;

		is_task = (_slurm_cgroup_is_pid_a_slurm_task(id, target) == 1);

		/* Pids that are not slurm tasks only ever receive SIGKILL. */
		if (!is_task && (signal != SIGKILL))
			continue;

		debug2("killing process %d (%s) with signal %d", target,
		       is_task ? "slurm_task" : "inherited_task",
		       signal);
		kill(target, signal);
	}
	xfree(step_pids);

	/*
	 * For SIGCONT, resume the step once the tasks have been signaled, to
	 * be sure that SIGTSTP received at suspend time is removed.
	 */
	return (signal == SIGCONT) ? cgroup_g_step_resume() : SLURM_SUCCESS;
}
/*
 * Request destruction of the step's tracking container through
 * cgroup_g_step_destroy() with CG_TRACK.
 *
 * IN id - container id; not used by this implementation
 * RET the value returned by cgroup_g_step_destroy()
 */
extern int proctrack_p_destroy (uint64_t id)
{
return cgroup_g_step_destroy(CG_TRACK);
}
/*
 * Look up the container id owning a process. This plugin does not implement
 * the lookup: `pid` is ignored and 0 is always returned.
 */
extern uint64_t proctrack_p_find(pid_t pid)
{
/* not provided for now */
return 0;
}
/*
 * Report whether a process is known to the cgroup layer by forwarding the
 * question to cgroup_g_has_pid().
 *
 * IN cont_id - container id; not used by this implementation
 * IN pid     - process to look for
 * RET the value returned by cgroup_g_has_pid()
 */
extern bool proctrack_p_has_pid(uint64_t cont_id, pid_t pid)
{
return cgroup_g_has_pid(pid);
}
/*
 * Wait until the step's container can be destroyed, killing what remains.
 *
 * proctrack_p_destroy() is retried until it succeeds. Between attempts
 * SIGKILL is sent through proctrack_p_signal() and the caller sleeps; the
 * sleep starts at 1 second and doubles after each attempt until it reaches
 * 32 seconds. The loop is abandoned, with an error logged, once more than
 * slurm_conf.unkillable_timeout seconds have elapsed since entry.
 *
 * IN cont_id - container id; the values 0 and 1 are rejected
 * RET SLURM_ERROR with errno set to EINVAL when cont_id is 0 or 1,
 *     SLURM_SUCCESS otherwise, including when the timeout expires
 */
extern int proctrack_p_wait(uint64_t cont_id)
{
	int delay = 1;
	time_t start = time(NULL);

	if (cont_id == 0 || cont_id == 1) {
		errno = EINVAL;
		return SLURM_ERROR;
	}

	/* Spin until the container is successfully destroyed */
	/* This indicates that all tasks have exited the container */
	while (proctrack_p_destroy(cont_id) != SLURM_SUCCESS) {
		time_t now = time(NULL);

		if (now > (start + slurm_conf.unkillable_timeout)) {
			/*
			 * time_t has no printf conversion of its own: cast
			 * the elapsed time so that it matches %lu exactly.
			 */
			error("Unable to destroy container %"PRIu64" in cgroup plugin, giving up after %lu sec",
			      cont_id, (unsigned long)(now - start));
			break;
		}

		proctrack_p_signal(cont_id, SIGKILL);
		sleep(delay);
		if (delay < 32)
			delay *= 2;
	}

	return SLURM_SUCCESS;
}
/*
 * Retrieve the pids of the step by forwarding both output arguments to
 * cgroup_g_step_get_pids().
 *
 * IN  cont_id - container id; not used by this implementation
 * OUT pids    - passed through to cgroup_g_step_get_pids()
 * OUT npids   - passed through to cgroup_g_step_get_pids()
 * RET the value returned by cgroup_g_step_get_pids()
 * NOTE(review): other code in this file releases the returned list with
 * xfree(); presumably callers of this function must do the same -- confirm.
 */
extern int proctrack_p_get_pids(uint64_t cont_id, pid_t **pids, int *npids)
{
return cgroup_g_step_get_pids(pids, npids);
}