blob: eccba057ddedb91807824c42ff77e396343c8993 [file] [log] [blame]
/*****************************************************************************\
* slurmd/slurmstepd/task.c - task launching functions for slurmstepd
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2009 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark A. Grondona <mgrondona@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#define _GNU_SOURCE
#include <ctype.h>
#include <fcntl.h>
#include <grp.h>
#include <pwd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#ifdef WITH_SELINUX
#include <selinux/selinux.h>
#endif
/*
 * NetBSD compatibility shim: map eaccess() onto access() and advertise
 * HAVE_EACCESS so the code below that tests HAVE_EACCESS uses it.
 *
 * FIXME: Come up with a real solution for EUID instead of substituting RUID.
 * access() performs its check with the real user/group IDs, whereas
 * eaccess() is meant to use the effective IDs, so results can differ when
 * the two sets of IDs are not the same.
 */
#if defined(__NetBSD__)
#define eaccess(p,m) (access((p),(m)))
#define HAVE_EACCESS 1
#endif
#include "slurm/slurm_errno.h"
#include "src/common/env.h"
#include "src/common/fd.h"
#include "src/common/log.h"
#include "src/common/run_command.h"
#include "src/common/spank.h"
#include "src/common/strlcpy.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/interfaces/auth.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/mpi.h"
#include "src/interfaces/proctrack.h"
#include "src/interfaces/switch.h"
#include "src/interfaces/task.h"
#include "src/slurmd/common/fname.h"
#include "src/slurmd/slurmd/slurmd.h"
#include "src/slurmd/slurmstepd/container.h"
#include "src/slurmd/slurmstepd/pdebug.h"
#include "src/slurmd/slurmstepd/task.h"
#include "src/slurmd/slurmstepd/ulimits.h"
/*
 * Static prototype definitions.
 */
/* Make sure TMPDIR in the step's environment is set and usable */
static void _make_tmpdir(stepd_step_rec_t *step);
/* Run a task prolog script and apply its stdout directives to the step */
static int _run_script_and_set_env(const char *name, const char *path,
stepd_step_rec_t *step);
/* Parse "export"/"unset"/"print" lines from task prolog output */
static void _proc_stdout(char *buf, stepd_step_rec_t *step);
/*
 * Process TaskProlog output, one line at a time:
 *	"export NAME=value"	adds NAME=value to the step's environment
 *	"unset NAME"		clears NAME from the step's environment
 *	"print <whatever>"	writes <whatever> to file descriptor 1
 *				(the step's stdout)
 * Lines that match none of the above, or that carry an empty NAME, are
 * skipped.
 *
 * buf  IN: NUL-terminated script output. It is modified temporarily while a
 *	    line is parsed and every modified byte is put back afterwards.
 * step IN/OUT: step whose env array is updated
 */
static void _proc_stdout(char *buf, stepd_step_rec_t *step)
{
	bool end_buf = false;
	int len;
	char saved;
	char *buf_ptr, *name_ptr, *name_end, *val_ptr;
	char *end_line, *equal_ptr;
	char ***env = &step->env;

	buf_ptr = buf;
	while (buf_ptr[0]) {
		/* end_line is the '\n' ending this line, or the final NUL */
		end_line = strchr(buf_ptr, '\n');
		if (!end_line) {
			end_line = buf_ptr + strlen(buf_ptr);
			end_buf = true;
		}

		if (!xstrncmp(buf_ptr, "print ", 6)) {
			buf_ptr += 6;
			/*
			 * '\n' is whitespace too, so the scan is bounded by
			 * end_line to stay inside the current line.
			 */
			while ((buf_ptr < end_line) &&
			       isspace((unsigned char) buf_ptr[0]))
				buf_ptr++;
			len = end_line - buf_ptr;
			if (!end_buf)
				len++;	/* include the newline, never the NUL */
			/*
			 * NOTE(review): safe_write() is presumed to jump to
			 * the rwfail label on a write error - confirm.
			 */
			if (len > 0)
				safe_write(1, buf_ptr, len);
		} else if (!xstrncmp(buf_ptr, "export ", 7)) {
			name_ptr = buf_ptr + 7;
			while ((name_ptr < end_line) &&
			       isspace((unsigned char) name_ptr[0]))
				name_ptr++;
			equal_ptr = strchr(name_ptr, '=');
			/* no '=' at all, or the '=' belongs to a later line */
			if (!equal_ptr || (equal_ptr > end_line))
				goto rwfail;
			val_ptr = equal_ptr + 1;

			/* trim whitespace between NAME and '=' */
			name_end = equal_ptr;
			while ((name_end > name_ptr) &&
			       isspace((unsigned char) name_end[-1]))
				name_end--;
			if (name_end == name_ptr)
				goto rwfail;	/* empty variable name */

			/* terminate NAME and value in place for setenvf() */
			saved = name_end[0];
			name_end[0] = '\0';
			end_line[0] = '\0';
			debug("export name:%s:val:%s:", name_ptr, val_ptr);
			if (setenvf(env, name_ptr, "%s", val_ptr)) {
				error("Unable to set %s environment variable",
				      name_ptr);
			}
			/* put back exactly the bytes that were overwritten */
			name_end[0] = saved;
			end_line[0] = end_buf ? '\0' : '\n';
		} else if (!xstrncmp(buf_ptr, "unset ", 6)) {
			name_ptr = buf_ptr + 6;
			while ((name_ptr < end_line) &&
			       isspace((unsigned char) name_ptr[0]))
				name_ptr++;
			if (name_ptr == end_line)
				goto rwfail;	/* no variable name given */

			/*
			 * Trim trailing whitespace using a separate pointer
			 * so end_line keeps marking the real end of line.
			 */
			name_end = end_line;
			while ((name_end > name_ptr) &&
			       isspace((unsigned char) name_end[-1]))
				name_end--;

			saved = name_end[0];
			name_end[0] = '\0';
			debug(" unset name:%s:", name_ptr);
			unsetenvp(*env, name_ptr);
			name_end[0] = saved;
		}

rwfail:		/* process rest of script output */
		if (end_buf)
			break;
		buf_ptr = end_line + 1;
	}
	return;
}
/*
 * Execute a task prolog script and apply the directives it prints on
 * stdout (see _proc_stdout()) to the step.
 *
 * SLURM_SCRIPT_CONTEXT=prolog_task is added to step->env before the script
 * is started, and the script is given step->env as its environment.
 *
 * name IN: class of program ("system prolog", "user prolog", etc.)
 * path IN: pathname of program to run; NULL or "" means nothing to run
 * step IN/OUT: pointer to associated step, step->env can be updated
 * RET 0 if there was nothing to run, the script's exit status if it exited,
 *     or 1 if it did not exit normally
 */
static int
_run_script_and_set_env(const char *name, const char *path,
			stepd_step_rec_t *step)
{
	int wait_status = 0, exit_code;
	char *script_argv[2] = { NULL, NULL };
	char *output = NULL;
	run_command_args_t cmd = {
		.direct_exec = true,
		.job_id = step->step_id.job_id,
		.max_wait = -1,
		.script_path = path,
		.script_type = name,
		.status = &wait_status
	};

	if (!path || (path[0] == '\0'))
		return 0;

	xassert(step->env);
	setenvf(&step->env, "SLURM_SCRIPT_CONTEXT", "prolog_task");
	/* read step->env only after setenvf(), which is handed its address */
	cmd.env = step->env;

	script_argv[0] = xstrdup(path);
	cmd.script_argv = script_argv;

	debug("[job %u] attempting to run %s [%s]",
	      step->step_id.job_id, name, path);
	output = run_command(&cmd);

	if (!WIFEXITED(wait_status)) {
		error("%s did not exit normally. reason: %s", name, output);
		exit_code = 1;
	} else {
		/* output is parsed whatever exit code the script returned */
		if (output)
			_proc_stdout(output, step);
		exit_code = WEXITSTATUS(wait_status);
	}

	xfree(script_argv[0]);
	xfree(output);
	return exit_code;
}
/*
 * Given a program name, translate it to a fully qualified pathname as needed
 * based upon the PATH environment variable and current working directory.
 *
 * fname    IN: program name as given by the user
 * prog_env IN: environment array searched for PATH
 * RET NULL if fname is NULL, otherwise an xmalloc()'d string that must be
 *     xfree()'d:
 *	- fname starting with '/' is copied as is
 *	- fname starting with '.' is prefixed with the current working
 *	  directory, or copied as is if getcwd() fails
 *	- anything else is looked up in each PATH directory; the first entry
 *	  that exists and is not a directory wins, and if none is found
 *	  fname is copied as is
 */
static char *_build_path(char *fname, char **prog_env)
{
	char *path_env = NULL, *dir = NULL;
	char *file_name, *last = NULL;
	struct stat stat_buf;
	int len = PATH_MAX;
	int n;

	if (!fname)
		return NULL;

	file_name = xmalloc(len);

	/* check if already absolute path */
	if (fname[0] == '/') {
		/* copy and ensure null termination */
		strlcpy(file_name, fname, len);
		return file_name;
	}

	if (fname[0] == '.') {
		dir = xmalloc(len);
		if (getcwd(dir, len)) {
			snprintf(file_name, len, "%s/%s", dir, fname);
		} else {
			/*
			 * The buffer contents are not usable after a failed
			 * getcwd(), so keep the name exactly as given.
			 */
			error("getcwd failed: %m");
			strlcpy(file_name, fname, len);
		}
		xfree(dir);
		return file_name;
	}

	/* search for the file using PATH environment variable */
	path_env = xstrdup(getenvp(prog_env, "PATH"));
	if (path_env)
		dir = strtok_r(path_env, ":", &last);
	for ( ; dir; dir = strtok_r(NULL, ":", &last)) {
		n = snprintf(file_name, len, "%s/%s", dir, fname);
		if ((n < 0) || (n >= len))
			continue;	/* candidate would be truncated */
		if ((stat(file_name, &stat_buf) == 0) &&
		    !S_ISDIR(stat_buf.st_mode))
			break;
	}

	if (dir == NULL)	/* not found */
		strlcpy(file_name, fname, len);

	xfree(path_env);
	return file_name;
}
/*
 * Describe one local task to the MPI plugin and let the plugin adjust the
 * step's environment.
 *
 * step    IN/OUT: step the task belongs to; &step->env is handed to the
 *		   plugin
 * ltaskid IN: index of the task in step->task[]
 * RET whatever mpi_g_slurmstepd_task() returns
 */
static int
_setup_mpi(stepd_step_rec_t *step, int ltaskid)
{
	/*
	 * mpi_task_info_t is declared elsewhere; zero it so that any member
	 * not assigned below reaches the plugin with a defined value.
	 */
	mpi_task_info_t info = { 0 };

	if (step->het_job_id && (step->het_job_id != NO_VAL))
		info.step_id.job_id = step->het_job_id;
	else
		info.step_id.job_id = step->step_id.job_id;

	/* members that do not depend on het job offsets */
	info.step_id.step_id = step->step_id.step_id;
	info.step_id.step_het_comp = step->step_id.step_het_comp;
	info.ltasks = step->node_tasks;
	info.ltaskid = step->task[ltaskid]->id;
	info.client = step->envtp->cli;

	if (step->het_job_offset != NO_VAL) {
		/* use het job wide counts and offset the node/task ids */
		info.nnodes = step->het_job_nnodes;
		info.nodeid = step->het_job_node_offset + step->nodeid;
		info.ntasks = step->het_job_ntasks;
		info.gtaskid = step->het_job_task_offset +
			       step->task[ltaskid]->gtid;
	} else {
		info.nnodes = step->nnodes;
		info.nodeid = step->nodeid;
		info.ntasks = step->ntasks;
		info.gtaskid = step->task[ltaskid]->gtid;
	}

	return mpi_g_slurmstepd_task(&info, &step->env);
}
/*
 * Finish setting up one task of the step and exec its program.
 *
 * In order: build the task's environment, attach to the interconnect and
 * MPI plugin (regular steps only), run the task and SPANK plugin hooks,
 * apply the SELinux context, run the configured and user task prologs,
 * prepare TMPDIR, resolve argv[0], apply user limits and finally execve().
 * Every failure path ends in _exit(), so this function does not return.
 *
 * Current process is running as the user when this is called.
 *
 * step          IN/OUT: step the task belongs to; step->env is rebuilt
 * local_proc_id IN: index of the task in step->task[]
 */
extern void exec_task(stepd_step_rec_t *step, int local_proc_id)
{
	int fd, j;
	stepd_step_task_info_t *task = step->task[local_proc_id];
	char **tmp_env;
	int saved_errno, status;
	uint32_t node_offset = 0, task_offset = 0;
	size_t arg0_len;

	if (step->container)
		container_task_init(step, task);

	if (step->het_job_node_offset != NO_VAL)
		node_offset = step->het_job_node_offset;
	if (step->het_job_task_offset != NO_VAL)
		task_offset = step->het_job_task_offset;

	/* comma separated list of the global ids of all tasks on this node */
	for (j = 0; j < step->node_tasks; j++)
		xstrfmtcat(step->envtp->sgtids, "%s%u", j ? "," : "",
			   step->task[j]->gtid + task_offset);

	if (step->het_job_id != NO_VAL)
		step->envtp->jobid = step->het_job_id;
	else
		step->envtp->jobid = step->step_id.job_id;
	step->envtp->stepid = step->step_id.step_id;
	step->envtp->nodeid = step->nodeid + node_offset;
	step->envtp->cpus_on_node = step->cpus;
	step->envtp->procid = task->gtid + task_offset;
	step->envtp->localid = task->id;
	step->envtp->task_pid = getpid();
	step->envtp->distribution = step->task_dist;
	step->envtp->cpu_bind = xstrdup(step->cpu_bind);
	step->envtp->cpu_bind_type = step->cpu_bind_type;
	step->envtp->cpu_freq_min = step->cpu_freq_min;
	step->envtp->cpu_freq_max = step->cpu_freq_max;
	step->envtp->cpu_freq_gov = step->cpu_freq_gov;
	step->envtp->mem_bind = xstrdup(step->mem_bind);
	step->envtp->mem_bind_type = step->mem_bind_type;
	/* overrides the task_dist value assigned to distribution above */
	step->envtp->distribution = -1;
	step->envtp->batch_flag = step->batch;
	step->envtp->uid = step->uid;
	step->envtp->job_end_time = step->job_end_time;
	step->envtp->job_licenses = xstrdup(step->job_licenses);
	step->envtp->job_start_time = step->job_start_time;
	step->envtp->user_name = xstrdup(step->user_name);
	step->envtp->oom_kill_step = step->oom_kill_step ? 1 : 0;

	/*
	 * Modify copy of step's environment. Do not alter in place or
	 * concurrent searches of the environment can generate invalid memory
	 * references.
	 */
	step->envtp->env = env_array_copy((const char **) step->env);
	setup_env(step->envtp, false);
	setenvf(&step->envtp->env, "SLURM_JOB_GID", "%u", step->gid);
	setenvf(&step->envtp->env, "SLURMD_NODENAME", "%s", conf->node_name);
	if (step->tres_bind) {
		setenvf(&step->envtp->env, "SLURMD_TRES_BIND", "%s",
			step->tres_bind);
	}
	if (step->tres_freq) {
		setenvf(&step->envtp->env, "SLURMD_TRES_FREQ", "%s",
			step->tres_freq);
	}

	/* swap the finished copy in and release the old array */
	tmp_env = step->env;
	step->env = step->envtp->env;
	env_array_free(tmp_env);
	step->envtp->env = NULL;

	xfree(step->envtp->task_count);

	if (!step->batch && (step->step_id.step_id != SLURM_EXTERN_CONT) &&
	    (step->step_id.step_id != SLURM_INTERACTIVE_STEP)) {
		if (switch_g_job_attach(step->switch_step, &step->env,
					step->nodeid, (uint32_t) local_proc_id,
					step->nnodes, step->ntasks,
					task->gtid + task_offset) < 0) {
			error("Unable to attach to interconnect: %m");
			log_fini();
			_exit(1);
		}

		if (_setup_mpi(step, local_proc_id) != SLURM_SUCCESS) {
			error("Unable to configure MPI plugin: %m");
			log_fini();
			_exit(1);
		}
	}

	/* task-specific pre-launch activities */

	/* task plugin hook */
	if (task_g_pre_launch(step)) {
		error("Failed to invoke task plugins: task_p_pre_launch error");
		_exit(1);
	}

	if (!step->batch && (step->step_id.step_id != SLURM_INTERACTIVE_STEP) &&
	    (step->accel_bind_type || step->tres_bind)) {
		/*
		 * Modify copy of step's environment as needed for GRES. Do not
		 * alter in place or concurrent searches of the environment can
		 * generate invalid memory references.
		 */
		step->envtp->env = env_array_copy((const char **) step->env);
		gres_g_task_set_env(step, local_proc_id);
		tmp_env = step->env;
		step->env = step->envtp->env;
		env_array_free(tmp_env);
	}

	/*
	 * test7.21 calls slurm_load_job() as an example of weird things people
	 * may do within a SPANK stack. That will deadlock if we don't drop the
	 * lock here.
	 */
	auth_setuid_unlock();
	if (spank_user_task(step, local_proc_id)) {
		error("Failed to invoke spank plugin stack");
		_exit(1);
	}
	auth_setuid_lock();

#ifdef WITH_SELINUX
	if (setexeccon(step->selinux_context)) {
		error("Failed to set SELinux context to '%s': %m",
		      step->selinux_context);
		_exit(1);
	}
#else
	if (step->selinux_context) {
		error("Built without SELinux support but context was specified");
		_exit(1);
	}
#endif

	/* configured prolog first, then the one requested for this step */
	if (slurm_conf.task_prolog) {
		status = _run_script_and_set_env("slurm task_prolog",
						 slurm_conf.task_prolog, step);
		if (status) {
			error("TaskProlog failed status=%d", status);
			_exit(status);
		}
	}
	if (step->task_prolog) {
		status = _run_script_and_set_env("user task_prolog",
						 step->task_prolog, step);
		if (status) {
			error("--task-prolog failed status=%d", status);
			_exit(status);
		}
	}

	/*
	 * Set TMPDIR after running prolog scripts, since TMPDIR
	 * might be set or changed in one of the prolog scripts.
	 */
	if (local_proc_id == 0)
		_make_tmpdir(step);

	if (!step->batch)
		pdebug_stop_current(step);

	if (step->env == NULL) {
		debug("step->env is NULL");
		step->env = (char **)xmalloc(sizeof(char *));
		step->env[0] = (char *)NULL;
	}

	if (task->argv[0] == NULL) {
		error("No executable program specified for this task");
		_exit(2);
	}

	if (*task->argv[0] != '/') {
		/*
		 * Handle PATH resolution for the command to launch.
		 * Need to handle this late so that SPANK and other plugins
		 * have a chance to manipulate the PATH and/or change the
		 * filesystem namespaces into the final arrangement, which
		 * may affect which executable we select.
		 */
		task->argv[0] = _build_path(task->argv[0], step->env);
	}

	/* Do this last so you don't worry too much about the users
	   limits including the slurmstepd in with it.
	*/
	set_user_limits(step, 0);

	/*
	 * If argv[0] ends with '/' it indicates that srun was called with
	 * --bcast with destination dir instead of file name. So match the
	 * convention used by _rpc_file_bcast().
	 * An empty argv[0] has no last character to look at, so it is left
	 * alone and reported by the execve() failure path below.
	 */
	arg0_len = strlen(task->argv[0]);
	if (arg0_len && (task->argv[0][arg0_len - 1] == '/')) {
		xstrfmtcat(task->argv[0], BCAST_FILE_FMT,
			   step->step_id.job_id, step->step_id.step_id,
			   step->node_name);
	}

	if (step->container)
		container_run(step, task);

	execve(task->argv[0], task->argv, step->env);
	saved_errno = errno;

	/*
	 * print error message and clean up if execve() returns:
	 */
	if ((saved_errno == ENOENT) &&
	    ((fd = open(task->argv[0], O_RDONLY)) >= 0)) {
		/*
		 * The file exists although execve() reported ENOENT: if it
		 * starts with "#!" report its interpreter line instead.
		 */
		char buf[256], *eol;
		ssize_t sz;

		/* leave room for the terminator added below */
		sz = read(fd, buf, sizeof(buf) - 1);
		close(fd);
		if ((sz >= 3) && (xstrncmp(buf, "#!", 2) == 0)) {
			/* terminate right after the bytes actually read */
			buf[sz] = '\0';
			eol = strchr(buf, '\n');
			if (eol)
				eol[0] = '\0';
			errno = saved_errno;
			error("execve(): bad interpreter(%s): %m", buf+2);
			_exit(saved_errno);
		}
	}
	/* open()/read()/close() may have changed errno; restore it for %m */
	errno = saved_errno;
	error("execve(): %s: %m", task->argv[0]);
	_exit(saved_errno);
}
/*
 * Make sure the step's environment carries a usable TMPDIR.
 *
 * If TMPDIR is not set it becomes "/tmp". Otherwise the directory is
 * created with mode 0700; when that fails the existing path is accepted
 * only if it is a directory that passes an X_OK|W_OK access check, else
 * the reason is logged and TMPDIR is reset to "/tmp".
 *
 * step IN/OUT: step whose env is read and possibly updated
 */
static void
_make_tmpdir(stepd_step_rec_t *step)
{
	char *tmpdir = getenvp(step->env, "TMPDIR");
	struct stat st;
	int mkdir_errno, access_rc;

	if (!tmpdir) {
		setenvf(&step->env, "TMPDIR", "/tmp"); /* task may want it set */
		return;
	}

	if (mkdir(tmpdir, 0700) >= 0)
		return;
	/* keep mkdir()'s errno before stat() gets a chance to change it */
	mkdir_errno = errno;

	if (stat(tmpdir, &st)) {		/* does the file exist ? */
		/* show why we were not able to create it */
		error("Unable to create TMPDIR [%s]: %s",
		      tmpdir, strerror(mkdir_errno));
	} else if (!S_ISDIR(st.st_mode)) {	/* is it a directory? */
		error("TMPDIR [%s] is not a directory", tmpdir);
	} else {
		/* Eaccess wasn't introduced until glibc 2.4 but euidaccess
		 * has been around for a while.  So to make sure we
		 * still work with older systems we include this check.
		 */
#if defined(HAVE_FACCESSAT)
		access_rc = faccessat(AT_FDCWD, tmpdir, X_OK|W_OK, AT_EACCESS);
#elif defined(HAVE_EACCESS)
		access_rc = eaccess(tmpdir, X_OK|W_OK); /* check permissions */
#else
		access_rc = euidaccess(tmpdir, X_OK|W_OK);
#endif
		if (!access_rc)
			return;		/* existing directory is usable */
		error("TMPDIR [%s] is not writeable", tmpdir);
	}

	error("Setting TMPDIR to /tmp");
	setenvf(&step->env, "TMPDIR", "/tmp");
	return;
}