blob: b95097f53d98022a11c111cf729ee38ac530bfdf [file] [log] [blame]
/*****************************************************************************\
* proctrack.c - Process tracking plugin stub.
*****************************************************************************
* Copyright (C) 2005 The Regents of the University of California.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#ifdef HAVE_LINUX_SCHED_H
# include <linux/sched.h>
#endif
/* This is supposed to be defined in linux/sched.h, but in practice it
 * very rarely is, so we define it here.
 */
#ifndef PF_DUMPCORE
#define PF_DUMPCORE 0x00000200 /* dumped core */
#endif
#include "src/common/log.h"
#include "src/common/plugrack.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/interfaces/proctrack.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
/*
 * Exported flag, initially false. Nothing in this file reads or writes it.
 * NOTE(review): presumably set by code that forks while using the proctrack
 * interface -- confirm against its users.
 */
bool proctrack_forked = false;
/*
 * Table of plugin entry points. The address of the single instance of this
 * struct is handed to plugin_context_create() together with the syms[] name
 * list, so the members must stay in the same order as the names in syms[].
 */
typedef struct slurm_proctrack_ops {
	/* invoked by proctrack_g_create() */
	int (*create) (stepd_step_rec_t *step);
	/* invoked (with retries) by proctrack_g_add() */
	int (*add) (stepd_step_rec_t *step, pid_t pid);
	/* invoked by proctrack_g_signal() and by the deferred signal thread */
	int (*signal) (uint64_t id, int signal);
	/* invoked by proctrack_g_destroy() */
	int (*destroy) (uint64_t id);
	/* invoked by proctrack_g_find() */
	uint64_t (*find_cont) (pid_t pid);
	/* invoked by proctrack_g_has_pid() */
	bool (*has_pid) (uint64_t id, pid_t pid);
	/* invoked by proctrack_g_wait() */
	int (*wait) (uint64_t id);
	/*
	 * invoked by proctrack_g_wait_for_any_task() only when the step has
	 * LAUNCH_WAIT_FOR_CHILDREN set
	 */
	int (*wait_for_any_task)(stepd_step_rec_t *step,
				 stepd_step_task_info_t **task, bool block);
	/* invoked by proctrack_g_get_pids() */
	int (*get_pids) (uint64_t id, pid_t ** pids, int *npids);
} slurm_proctrack_ops_t;
/*
 * Plugin symbol names passed to plugin_context_create(), one per member of
 * slurm_proctrack_ops_t and listed in the same order.
 *
 * Must be synchronized with slurm_proctrack_ops_t above.
 */
static const char *syms[] = {
	"proctrack_p_create",
	"proctrack_p_add",
	"proctrack_p_signal",
	"proctrack_p_destroy",
	"proctrack_p_find",
	"proctrack_p_has_pid",
	"proctrack_p_wait",
	"proctrack_p_wait_for_any_task",
	"proctrack_p_get_pids"
};
/* Plugin entry points; passed to plugin_context_create() by proctrack_g_init() */
static slurm_proctrack_ops_t ops;
/*
 * Plugin context: NULL until proctrack_g_init() succeeds, and reset to NULL
 * by proctrack_g_fini().
 */
static plugin_context_t *g_context = NULL;
/* Held by proctrack_g_init() while it tests and sets g_context */
static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Create the proctrack plugin context for slurm_conf.proctrack_type, handing
 * the ops table and syms[] to plugin_context_create(). If a context already
 * exists this is a no-op. The check and the creation are done while holding
 * g_context_lock.
 *
 * The proctrack plugin can only be changed by restarting slurmd
 * without preserving state (-c option).
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR if the context could not be created.
 */
extern int proctrack_g_init(void)
{
	char *type_name = "proctrack";
	int rc = SLURM_SUCCESS;

	slurm_mutex_lock(&g_context_lock);

	if (!g_context) {
		g_context = plugin_context_create(type_name,
						  slurm_conf.proctrack_type,
						  (void **) &ops, syms,
						  sizeof(syms));
		if (!g_context) {
			error("cannot create %s context for %s",
			      type_name, slurm_conf.proctrack_type);
			rc = SLURM_ERROR;
		}
	}

	slurm_mutex_unlock(&g_context_lock);

	return rc;
}
/*
 * Destroy the plugin context, if there is one, and reset g_context to NULL.
 *
 * NOTE(review): unlike proctrack_g_init(), g_context_lock is not taken here;
 * confirm callers never race this against init.
 *
 * Returns SLURM_SUCCESS when no context exists, otherwise the value returned
 * by plugin_context_destroy().
 */
extern int proctrack_g_fini(void)
{
	int rc = SLURM_SUCCESS;

	if (g_context) {
		rc = plugin_context_destroy(g_context);
		g_context = NULL;
	}

	return rc;
}
/*
 * Create a container
 * step IN - stepd_step_rec_t structure
 * step->cont_id OUT - Plugin must fill in step->cont_id either here
 *	or in proctrack_g_add()
 *
 * Forwards to the plugin's create operation.
 *
 * Returns a Slurm errno.
 */
extern int proctrack_g_create(stepd_step_rec_t *step)
{
	int rc;

	xassert(g_context);

	rc = ops.create(step);

	return rc;
}
/*
 * Add a process to the specified container
 * step IN - stepd_step_rec_t structure
 * pid IN - process ID to be added to the container
 * step->cont_id OUT - Plugin must fill in step->cont_id either here
 *	or in proctrack_g_create()
 *
 * A failing plugin call is retried after a one second sleep. With
 * max_retry = 3 the plugin is called at most five times (four sleeps)
 * before the last failure code is returned.
 * NOTE(review): that is one retry more than the name max_retry suggests;
 * the count is kept as is so existing timing is unchanged.
 *
 * Returns a Slurm errno.
 */
extern int proctrack_g_add(stepd_step_rec_t *step, pid_t pid)
{
	const int max_retry = 3;
	int attempt, rc;

	xassert(g_context);

	/*
	 * Sometimes a plugin is transient in adding a pid, so lets
	 * try a few times before we call it quits.
	 */
	for (attempt = 0; ; attempt++) {
		rc = (*(ops.add))(step, pid);
		if ((rc == SLURM_SUCCESS) || (attempt > max_retry))
			break;

		/* pid_t is signed: print it as int, like the rest of the file */
		debug("%s: %u.%u couldn't add pid %d, sleeping and trying again",
		      __func__, step->step_id.job_id,
		      step->step_id.step_id, (int) pid);
		sleep(1);
	}

	return rc;
}
/*
 * Determine if core dump in progress
 * stat_fname - Pathname of the form /proc/<PID>/stat
 * RET - True if core dump in progress, otherwise false
 *
 * The whole stat file is read into a buffer that grows in BUF_SIZE steps.
 * Everything up to the last ')' (the pid and the command name, which may
 * itself contain spaces or brackets) is cut off, and the remaining
 * space-separated fields are scanned. The process is reported as dumping
 * when the "flags" field (7th field after the command name) has PF_DUMPCORE
 * set.
 *
 * False is also returned if the file cannot be opened, is empty, or cannot
 * be parsed.
 */
static bool _test_core_dumping(char *stat_fname)
{
	long unsigned flags = 0;
	long unsigned stime = 0; /* parsed only to prove 13 fields exist */
	int num;
	char *str_ptr, *proc_stat;
	int proc_fd, proc_stat_size = BUF_SIZE;
	bool dumping_results = false;

	proc_fd = open(stat_fname, O_RDONLY, 0);
	if (proc_fd == -1)
		return false;  /* process is now gone */

	proc_stat = xmalloc_nz(proc_stat_size + 1);
	while (1) {
		num = read(proc_fd, proc_stat, proc_stat_size);
		if (num <= 0) {
			proc_stat[0] = '\0';
			break;
		}
		proc_stat[num] = '\0';
		/* A short read means we have the whole file */
		if (num < proc_stat_size)
			break;
		/* Buffer was filled: enlarge it and read again from the start */
		proc_stat_size += BUF_SIZE;
		xrealloc_nz(proc_stat, proc_stat_size + 1);
		if (lseek(proc_fd, (off_t) 0, SEEK_SET) != 0)
			break;
	}
	close(proc_fd);

	/* race condition at process termination */
	if (proc_stat[0] == '\0') {
		debug("%s: %s is empty", __func__, stat_fname);
		xfree(proc_stat);
		return false;
	}

	/* split into "PID (cmd" and "<rest>" */
	str_ptr = strrchr(proc_stat, ')');
	if (str_ptr == NULL) {
		error("%s: unexpected format of %s (%s) bracket missing?",
		      __func__, stat_fname, proc_stat);
		xfree(proc_stat);
		return false;
	}
	*str_ptr = '\0';		/* replace trailing ')' with NUL */

	/*
	 * Scan the fields after "PID (cmd) ". Only "flags" is needed, so every
	 * other field is matched with assignment suppression. The 13th field
	 * (stime) is captured solely so that the return value tells us whether
	 * the entry has at least 13 well-formed fields.
	 *
	 * If nothing follows the ')', str_ptr + 2 would lie beyond the string
	 * terminator, so treat that case as a too-short entry instead of
	 * scanning whatever happens to be in the buffer.
	 */
	num = 0;
	if (str_ptr[1] != '\0') {
		num = sscanf(str_ptr + 2,	/* skip space after ')' too */
			     "%*c "
			     "%*d %*d %*d %*d %*d "
			     "%lu %*lu %*lu %*lu %*lu %*lu %lu",
			     &flags, &stime);
	}

	if (num < 2)
		error("/proc entry too short (%s)", proc_stat);
	else if (flags & PF_DUMPCORE)
		dumping_results = true;

	xfree(proc_stat);
	return dumping_results;
}
/*
 * Argument block for the _sig_agent() thread. It is allocated by
 * _spawn_signal_thread() and freed by _sig_agent() before it returns.
 */
typedef struct {
	uint64_t cont_id;	/* container to signal */
	int signal;		/* signal number to deliver */
} sig_agent_arg_t;
/*
 * Thread body started by _spawn_signal_thread() to deliver a deferred signal.
 *
 * args IN - xmalloc'ed sig_agent_arg_t; freed here before returning
 *
 * Each pass fetches the container's pids. While any of them (other than this
 * process) is still writing a core file, nothing is signalled and the scan is
 * repeated after a 5 second sleep. Once no core dump is in progress, every
 * pid except our own is sent the signal with kill(). In all cases the
 * plugin's signal operation is then called for the container.
 *
 * Always returns NULL.
 */
static void *_sig_agent(void *args)
{
	sig_agent_arg_t *agent_arg = args;
	pid_t own_pid = getpid();
	bool dumping = false;

	do {
		pid_t *pids = NULL;
		char *stat_fname = NULL;
		int npids = 0, inx;

		/* Only reached with dumping set on a repeat pass */
		if (dumping)
			sleep(5);
		dumping = false;

		if (proctrack_g_get_pids(agent_arg->cont_id, &pids,
					 &npids) != SLURM_SUCCESS)
			break;

		/* Nothing left but (at most) ourselves */
		if ((npids == 0) ||
		    ((npids == 1) && (pids[0] == own_pid))) {
			xfree(pids);
			break;
		}

		/*
		 * Check if any processes are core dumping.
		 * If so, do not signal any of them, instead
		 * go back to the sleep and wait for the core
		 * dump to finish.
		 *
		 * This works around an issue with OpenMP
		 * applications failing to write a full core
		 * file out - only one of the processes will
		 * be marked as core dumping, but killing any
		 * of them will terminate the application.
		 */
		for (inx = 0; (inx < npids) && !dumping; inx++) {
			if (pids[inx] == own_pid)
				continue;
			xstrfmtcat(stat_fname, "/proc/%d/stat",
				   (int) pids[inx]);
			if (_test_core_dumping(stat_fname)) {
				debug("Process %d continuing core dump",
				      (int) pids[inx]);
				dumping = true;
			}
			xfree(stat_fname);
		}

		if (!dumping) {
			for (inx = 0; inx < npids; inx++) {
				/* Avoid killing our own (stepd) process. */
				if (pids[inx] != own_pid)
					kill(pids[inx], agent_arg->signal);
			}
		}

		xfree(pids);
	} while (dumping);

	(void) (*(ops.signal)) (agent_arg->cont_id, agent_arg->signal);
	xfree(args);

	return NULL;
}
/*
 * Start a detached _sig_agent() thread that will deliver "signal" to the
 * container "cont_id". The argument block allocated here is handed to the
 * thread, which frees it.
 */
static void _spawn_signal_thread(uint64_t cont_id, int signal)
{
	sig_agent_arg_t *thread_arg = xmalloc(sizeof(*thread_arg));

	thread_arg->signal = signal;
	thread_arg->cont_id = cont_id;

	slurm_thread_create_detached(_sig_agent, thread_arg);
}
/*
 * Signal all processes within a container
 * cont_id IN - container ID as returned by proctrack_g_create()
 * signal IN - signal to send, if zero then perform error checking
 *	but do not send signal
 *
 * SIGKILL is special-cased: the container's processes are checked for an
 * in-progress core dump up to twice, 2 seconds apart. If any is still
 * dumping after the second check, the signal is handed to a background
 * thread and SLURM_SUCCESS is returned immediately. Otherwise, and for
 * every other signal, the plugin's signal operation is called directly.
 *
 * Returns a Slurm errno.
 */
extern int proctrack_g_signal(uint64_t cont_id, int signal)
{
	pid_t *pids = NULL, own_pid;
	char *stat_fname = NULL;
	int npids = 0, still_dumping = 0;
	int pass, inx;

	xassert(g_context);

	if ((signal != SIGKILL) ||
	    (proctrack_g_get_pids(cont_id, &pids, &npids) != SLURM_SUCCESS))
		return (*(ops.signal)) (cont_id, signal);

	own_pid = getpid();

	for (pass = 0; pass < 2; pass++) {
		/* Give dumping processes a moment before the second look */
		if (pass > 0)
			sleep(2);

		still_dumping = 0;
		for (inx = 0; inx < npids; inx++) {
			if ((pids[inx] == 0) || (pids[inx] == own_pid))
				continue;

			xstrfmtcat(stat_fname, "/proc/%d/stat",
				   (int) pids[inx]);
			if (_test_core_dumping(stat_fname)) {
				debug("Process %d continuing core dump",
				      (int) pids[inx]);
				still_dumping++;
			} else {
				/* Don't test this PID again */
				pids[inx] = 0;
			}
			xfree(stat_fname);
		}

		if (!still_dumping)
			break;
	}
	xfree(pids);

	if (!still_dumping)
		return (*(ops.signal)) (cont_id, signal);

	info("Deferring sending signal, processes in job are currently core dumping");
	_spawn_signal_thread(cont_id, signal);

	return SLURM_SUCCESS;
}
/*
 * Destroy a container, any processes within the container are not affected
 * cont_id IN - container ID as returned by proctrack_g_create()
 *
 * Forwards to the plugin's destroy operation.
 *
 * Returns a Slurm errno.
 */
extern int proctrack_g_destroy(uint64_t cont_id)
{
	int rc;

	xassert(g_context);

	rc = ops.destroy(cont_id);

	return rc;
}
/*
 * Get container ID for given process ID
 * pid IN - process ID to look up
 *
 * Forwards to the plugin's find operation.
 *
 * Returns zero if no container found for the given pid.
 */
extern uint64_t proctrack_g_find(pid_t pid)
{
	uint64_t cont_id;

	xassert(g_context);

	cont_id = ops.find_cont(pid);

	return cont_id;
}
/*
 * Return "true" if the container "cont_id" contains the process with
 * ID "pid".
 *
 * Forwards to the plugin's has_pid operation.
 */
extern bool proctrack_g_has_pid(uint64_t cont_id, pid_t pid)
{
	bool found;

	xassert(g_context);

	found = ops.has_pid(cont_id, pid);

	return found;
}
/*
 * Wait for all processes within a container to exit.
 * cont_id IN - container ID as returned by proctrack_g_create()
 *
 * Forwards to the plugin's wait operation.
 *
 * Return SLURM_SUCCESS or SLURM_ERROR.
 */
extern int proctrack_g_wait(uint64_t cont_id)
{
	int rc;

	xassert(g_context);

	rc = ops.wait(cont_id);

	return rc;
}
/*
 * Wait for any task to end
 *
 * IN step - wait for any task in this step
 * OUT ended_task - pointer to task that ended. NULL if no tasks ended
 * IN block - If true, wait until any task ends, or return immediately if all
 *	tasks have already ended. If false, check for any ended tasks and then
 *	immediately return.
 *
 * If the step has LAUNCH_WAIT_FOR_CHILDREN set, the call is delegated to the
 * plugin and its return value is passed through. Otherwise wait3() is used
 * and, when the reaped pid belongs to a task of this step, that task's
 * estatus and rusage are filled in.
 *
 * RET - In the wait3() case, the value returned by wait3(): the pid (> 0) of
 *	the child that was reaped, 0 (SLURM_SUCCESS) if block is false and no
 *	child has ended, or -1 (SLURM_ERROR) on failure. -1 with errno set to
 *	ECHILD means all tasks have already ended.
 */
extern int proctrack_g_wait_for_any_task(stepd_step_rec_t *step,
					 stepd_step_task_info_t **ended_task,
					 bool block)
{
	int status;
	struct rusage rusage;
	pid_t pid;

	xassert(g_context);
	xassert(ended_task);

	if (step->flags & LAUNCH_WAIT_FOR_CHILDREN)
		return (*(ops.wait_for_any_task))(step, ended_task, block);

	/*
	 * Honor the OUT contract: if wait3() reaps nothing the caller must
	 * see NULL, not whatever the pointer held on entry.
	 */
	*ended_task = NULL;

	pid = wait3(&status, block ? 0 : WNOHANG, &rusage);
	if ((pid > 0) && (*ended_task = job_task_info_by_pid(step, pid))) {
		(*ended_task)->estatus = status;
		(*ended_task)->rusage = rusage;
	}

	return pid;
}
/*
 * Get all process IDs within a container.
 *
 * IN cont_id - Container ID.
 * OUT pids - a pointer to an xmalloc'ed array of process ids, of
 *	length "npids". Caller must free array with xfree().
 * OUT npids - number of process IDs in the returned "pids" array.
 *
 * Forwards to the plugin's get_pids operation.
 *
 * Return SLURM_SUCCESS if container exists (npids may be zero, and
 * pids NULL), return SLURM_ERROR if container does not exist, or
 * plugin does not implement the call.
 */
extern int proctrack_g_get_pids(uint64_t cont_id, pid_t **pids, int *npids)
{
	int rc;

	xassert(g_context);

	rc = ops.get_pids(cont_id, pids, npids);

	return rc;
}