/*****************************************************************************\
* opt.c - options processing for srun
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf. DISCLAIMER).
* Written by Mark Grondona <grondona1@llnl.gov>, et al.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#define _GNU_SOURCE
#include <ctype.h> /* isdigit() */
#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h> /* getenv */
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "slurm/slurm.h"
#include "src/interfaces/cli_filter.h"
#include "src/common/cpu_frequency.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/optz.h"
#include "src/common/parse_time.h"
#include "src/common/proc_args.h"
#include "src/interfaces/mpi.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_socket.h"
#include "src/common/slurm_rlimits_info.h"
#include "src/common/slurm_resource_info.h"
#include "src/interfaces/acct_gather_profile.h"
#include "src/common/spank.h"
#include "src/common/uid.h"
#include "src/common/x11_util.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/common/util-net.h"
#include "src/api/pmi_server.h"
#include "debugger.h"
#include "launch.h"
#include "multi_prog.h"
#include "opt.h"
static void _help(void);
static void _usage(void);
static void _autocomplete(const char *query);
/*---- global variables, defined in opt.h ----*/
int colon_cnt = 0;
int error_exit = 1;
int immediate_exit = 1;
srun_opt_t sropt;
slurm_opt_t opt = {
.srun_opt = &sropt,
.help_func = _help,
.usage_func = _usage,
.autocomplete_func = _autocomplete,
};
list_t *opt_list = NULL;
int pass_number = 0;
time_t srun_begin_time = 0;
bool local_het_step = false;
/*---- forward declarations of static variables and functions ----*/
static bool is_step = false;
static slurm_opt_t *_get_first_opt(int het_job_offset);
static slurm_opt_t *_get_next_opt(int het_job_offset, slurm_opt_t *opt_last);
static bitstr_t *_get_het_group(const int argc, char **argv,
int default_het_job_offset, bool *opt_found);
/* fill in default options */
static void _opt_default(void);
/* set options based upon env vars */
static void _opt_env(int het_job_offset);
static void _opt_args(int argc, char **argv, int het_job_offset);
/* verify options sanity */
static bool _opt_verify(void);
static void _set_options(const int argc, char **argv);
static bool _under_parallel_debugger(void);
static bool _valid_node_list(char **node_list_pptr);
/*---[ end forward declarations of static functions ]---------------------*/
/*
* Find first option structure for a given het job offset
* het_job_offset IN - Offset into hetjob or -1 if regular job
* RET - Pointer to option structure or NULL if none found
*/
static slurm_opt_t *_get_first_opt(int het_job_offset)
{
list_itr_t *opt_iter;
slurm_opt_t *opt_local;
if (!opt_list) {
if (!sropt.het_grp_bits && (het_job_offset == -1))
return &opt;
if (sropt.het_grp_bits &&
(het_job_offset >= 0) &&
(het_job_offset < bit_size(sropt.het_grp_bits)) &&
bit_test(sropt.het_grp_bits, het_job_offset))
return &opt;
return NULL;
}
opt_iter = list_iterator_create(opt_list);
while ((opt_local = list_next(opt_iter))) {
srun_opt_t *srun_opt = opt_local->srun_opt;
xassert(srun_opt);
if (srun_opt->het_grp_bits && (het_job_offset >= 0)
&& (het_job_offset < bit_size(srun_opt->het_grp_bits))
&& bit_test(srun_opt->het_grp_bits, het_job_offset))
break;
}
list_iterator_destroy(opt_iter);
return opt_local;
}
/*
* Find next option structure for a given hetjob offset
* het_job_offset IN - Offset into hetjob or -1 if regular job
* opt_last IN - last option structure returned for this het_job_offset
* RET - Pointer to option structure or NULL if none found
*/
static slurm_opt_t *_get_next_opt(int het_job_offset, slurm_opt_t *opt_last)
{
list_itr_t *opt_iter;
slurm_opt_t *opt_local;
bool found_last = false;
if (!opt_list)
return NULL;
opt_iter = list_iterator_create(opt_list);
while ((opt_local = list_next(opt_iter))) {
srun_opt_t *srun_opt = opt_local->srun_opt;
xassert(srun_opt);
if (!found_last) {
if (opt_last == opt_local)
found_last = true;
continue;
}
if (srun_opt->het_grp_bits && (het_job_offset >= 0)
&& (het_job_offset < bit_size(srun_opt->het_grp_bits))
&& bit_test(srun_opt->het_grp_bits, het_job_offset))
break;
}
list_iterator_destroy(opt_iter);
return opt_local;
}
/*
* Find option structure for a given hetjob offset
* het_job_offset IN - Offset into hetjob, -1 if regular job, -2 to reset
* RET - Pointer to next matching option structure or NULL if none found
*/
extern slurm_opt_t *get_next_opt(int het_job_offset)
{
static int offset_last = -2;
static slurm_opt_t *opt_last = NULL;
if (het_job_offset == -2) {
offset_last = -2;
opt_last = NULL;
return NULL;
}
if (offset_last != het_job_offset) {
offset_last = het_job_offset;
opt_last = _get_first_opt(het_job_offset);
} else {
opt_last = _get_next_opt(het_job_offset, opt_last);
}
return opt_last;
}
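/*
* Typical usage (a sketch; process() stands in for a hypothetical
* consumer of each option record for hetjob component 0, and passing
* -2 afterwards resets the internal cursor):
*
*	slurm_opt_t *opt_local;
*	while ((opt_local = get_next_opt(0)))
*		process(opt_local);
*	(void) get_next_opt(-2);
*/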
/*
* Return maximum het_group value for any step launch option request
*/
extern int get_max_het_group(void)
{
list_itr_t *opt_iter;
slurm_opt_t *opt_local;
int max_het_job_offset = 0, het_job_offset = 0;
if (opt_list) {
opt_iter = list_iterator_create(opt_list);
while ((opt_local = list_next(opt_iter))) {
srun_opt_t *srun_opt = opt_local->srun_opt;
xassert(srun_opt);
if (srun_opt->het_grp_bits)
het_job_offset =
bit_fls(srun_opt->het_grp_bits);
if (het_job_offset >= max_het_job_offset)
max_het_job_offset = het_job_offset;
}
list_iterator_destroy(opt_iter);
} else {
if (sropt.het_grp_bits)
max_het_job_offset = bit_fls(sropt.het_grp_bits);
}
return max_het_job_offset;
}
/*
* Copy the last option record:
* Copy strings if the original values will be preserved and
* reused for additional heterogeneous job/steps
* Otherwise clear/NULL the pointer so it does not get re-used
* and freed, which will render the copied pointer bad
*/
static slurm_opt_t *_opt_copy(void)
{
slurm_opt_t *opt_dup;
int i;
opt_dup = xmalloc(sizeof(slurm_opt_t));
memcpy(opt_dup, &opt, sizeof(slurm_opt_t));
opt_dup->srun_opt = xmalloc(sizeof(srun_opt_t));
memcpy(opt_dup->srun_opt, &sropt, sizeof(srun_opt_t));
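/*
* From here on, each string field is either duplicated with xstrdup()
* so both records own independent copies, or the original pointer is
* NULLed ("Moved by memcpy") so only the copy owns the string and a
* later reset of opt/sropt cannot free it out from under opt_dup.
*/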
opt_dup->account = xstrdup(opt.account);
opt_dup->acctg_freq = xstrdup(opt.acctg_freq);
opt_dup->srun_opt->alloc_nodelist = xstrdup(sropt.alloc_nodelist);
opt_dup->argv = xcalloc(opt.argc, sizeof(char *));
for (i = 0; i < opt.argc; i++)
opt_dup->argv[i] = xstrdup(opt.argv[i]);
sropt.bcast_file = NULL; /* Moved by memcpy */
opt.burst_buffer = NULL; /* Moved by memcpy */
opt_dup->c_constraint = xstrdup(opt.c_constraint);
opt_dup->clusters = xstrdup(opt.clusters);
opt_dup->srun_opt->cmd_name = xstrdup(sropt.cmd_name);
opt_dup->comment = xstrdup(opt.comment);
opt.constraint = NULL; /* Moved by memcpy */
opt_dup->context = xstrdup(opt.context);
opt_dup->srun_opt->cpu_bind = xstrdup(sropt.cpu_bind);
opt_dup->chdir = xstrdup(opt.chdir);
opt_dup->dependency = xstrdup(opt.dependency);
opt_dup->efname = xstrdup(opt.efname);
opt_dup->srun_opt->epilog = xstrdup(sropt.epilog);
opt_dup->exclude = xstrdup(opt.exclude);
opt_dup->export_env = xstrdup(opt.export_env);
opt_dup->extra = xstrdup(opt.extra);
opt.gres = NULL; /* Moved by memcpy */
opt.gpu_bind = NULL; /* Moved by memcpy */
opt.gpu_freq = NULL; /* Moved by memcpy */
opt.gpus = NULL; /* Moved by memcpy */
opt.gpus_per_node = NULL; /* Moved by memcpy */
opt.gpus_per_socket = NULL; /* Moved by memcpy */
opt.gpus_per_task = NULL; /* Moved by memcpy */
opt_dup->ifname = xstrdup(opt.ifname);
opt_dup->job_name = xstrdup(opt.job_name);
opt.licenses = NULL; /* Moved by memcpy */
opt.mail_user = NULL; /* Moved by memcpy */
opt_dup->mcs_label = xstrdup(opt.mcs_label);
opt.mem_bind = NULL; /* Moved by memcpy */
opt_dup->srun_opt->mpi_type = xstrdup(sropt.mpi_type);
opt.network = NULL; /* Moved by memcpy */
opt.nodelist = NULL; /* Moved by memcpy */
opt_dup->ofname = xstrdup(opt.ofname);
sropt.het_group = NULL; /* Moved by memcpy */
sropt.het_grp_bits = NULL; /* Moved by memcpy */
opt.partition = NULL; /* Moved by memcpy */
opt_dup->srun_opt->prolog = xstrdup(sropt.prolog);
opt_dup->srun_opt->propagate = xstrdup(sropt.propagate);
opt_dup->qos = xstrdup(opt.qos);
opt_dup->reservation = xstrdup(opt.reservation);
opt.spank_job_env = NULL; /* Moved by memcpy */
opt_dup->srun_opt->task_epilog = xstrdup(sropt.task_epilog);
opt_dup->srun_opt->task_prolog = xstrdup(sropt.task_prolog);
opt_dup->tres_bind = xstrdup(opt.tres_bind);
opt_dup->tres_freq = xstrdup(opt.tres_freq);
opt.tres_per_task = NULL; /* Moved by memcpy */
opt_dup->wckey = xstrdup(opt.wckey);
return opt_dup;
}
/*
* process options:
* 1. set defaults
* 2. update options with env vars
* 3. update options with commandline args
* 4. perform some verification that options are reasonable
*
* argc IN - Count of elements in argv
* argv IN - Array of elements to parse
* argc_off OUT - Offset of first non-parsable element
*/
extern int initialize_and_process_args(int argc, char **argv, int *argc_off)
{
static int default_het_job_offset = 0;
static bool pending_append = false;
bitstr_t *het_grp_bits;
int i, i_first, i_last;
bool opt_found = false;
static bool check_het_step = false;
is_step = getenv("SLURM_JOB_ID") ? true : false;
het_grp_bits = _get_het_group(argc, argv, default_het_job_offset++,
&opt_found);
/*
* Put all these bits on the global grp bits to send with the step
* requests.
*/
if (opt_found) {
if (!g_het_grp_bits)
g_het_grp_bits = bit_alloc(MAX_HET_JOB_COMPONENTS);
bit_or(g_het_grp_bits, het_grp_bits);
}
i_first = bit_ffs(het_grp_bits);
i_last = bit_fls(het_grp_bits);
for (i = i_first; i <= i_last; i++) {
if (!bit_test(het_grp_bits, i))
continue;
pass_number++;
if (pending_append) {
if (!opt_list)
opt_list = list_create(NULL);
list_append(opt_list, _opt_copy());
pending_append = false;
}
/* initialize option defaults */
_opt_default();
/* do not set/adjust defaults in an active allocation */
if (!is_step) {
bool first = (pass_number == 1);
if (cli_filter_g_setup_defaults(&opt, first)) {
error("cli_filter plugin terminated with error");
exit(error_exit);
}
}
if (opt_found || (i > 0)) {
xstrfmtcat(sropt.het_group, "%d", i);
sropt.het_grp_bits = bit_alloc(MAX_HET_JOB_COMPONENTS);
bit_set(sropt.het_grp_bits, i);
}
/* initialize options with env vars */
_opt_env(i);
/* initialize options with argv */
_set_options(argc, argv);
_opt_args(argc, argv, i);
if (argc_off)
*argc_off = optind;
if (!check_het_step) {
/*
* SLURM_HET_SIZE not defined for a normal allocation.
* SLURM_JOB_ID defined if allocation already exists.
*
* Here we are seeing if we are trying to run a het step
* in the normal allocation. If so and we didn't
* request nodes on the command line we will clear this
* env variable and figure it out later instead of
* trying to use the whole allocation.
*/
if (!getenv("SLURM_HET_SIZE") &&
is_step &&
(optind >= 0) && (optind < argc)) {
for (int i2 = optind; i2 < argc; i2++) {
if (!xstrcmp(argv[i2], ":")) {
local_het_step = true;
break;
}
}
}
check_het_step = true;
if (local_het_step) {
/*
* If we are a het step, unset SLURM_JOB_NUM_NODES so the
* whole-allocation node count is not used.
*/
unsetenv("SLURM_JOB_NUM_NODES");
/*
* If the node count was already set from this
* env var, reset it.
*/
if (slurm_option_set_by_env(&opt, 'N')) {
opt.nodes_set = false;
opt.min_nodes = 1;
opt.max_nodes = 0;
}
}
}
if (cli_filter_g_pre_submit(&opt, i)) {
error("cli_filter plugin terminated with error");
exit(error_exit);
}
if (!_opt_verify())
exit(error_exit);
if (opt.verbose)
slurm_print_set_options(&opt);
if (spank_init_post_opt()) {
error("Plugin stack post-option processing failed.");
exit(error_exit);
}
pending_append = true;
}
FREE_NULL_BITMAP(het_grp_bits);
if (opt_list && pending_append) { /* Last record */
list_append(opt_list, _opt_copy());
pending_append = false;
}
return 1;
}
/*
* If the node list supplied is a file name, translate that into
* a list of nodes; the data originally pointed to is orphaned
* RET true if the node list is a valid one
*/
static bool _valid_node_list(char **node_list_pptr)
{
int count = NO_VAL;
/* If we are using Arbitrary distribution and the task count was
specified, then the hostfile must supply exactly that many
entries, since we are saying: lay it out this way. The same
holds for the max and min node counts. Otherwise, just read in
as many entries as the hostfile contains. */
if (opt.ntasks_set)
count = opt.ntasks;
else if (opt.nodes_set) {
if (opt.max_nodes)
count = opt.max_nodes;
else if (opt.min_nodes)
count = opt.min_nodes;
}
return verify_node_list(node_list_pptr, opt.distribution, count);
}
/*
* _opt_default(): used by initialize_and_process_args to set defaults
*/
static void _opt_default(void)
{
if (pass_number == 1) {
xfree(sropt.cmd_name);
sropt.test_exec = false;
}
/*
* All other options must be specified individually for each component
* of the job/step. Do not use xfree() as the pointers have been copied.
* See initialize_and_process_args() above.
*/
sropt.exclusive = true;
opt.job_flags = 0;
sropt.multi_prog_cmds = 0;
sropt.het_group = NULL;
sropt.het_grp_bits = NULL;
opt.spank_job_env_size = 0;
opt.spank_job_env = NULL;
slurm_reset_all_options(&opt, (pass_number == 1));
}
/*---[ env var processing ]-----------------------------------------------*/
/*
* try to use a similar scheme as popt.
*
* in order to add a new env var (to be processed like an option):
*
* define a new entry into env_vars[], if the option is a simple int
* or string you may be able to get away with adding a pointer to the
* option to set. Otherwise, process var based on "type" in _opt_env.
*/
typedef struct {
const char *var;
int type;
} env_vars_t;
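/*
* Example (hypothetical option): to have srun honor a new variable
* SLURM_FOO as if the matching --foo option had been given, add an
* entry such as
*
*	{ "SLURM_FOO", LONG_OPT_FOO },
*
* where LONG_OPT_FOO is the identifier that
* slurm_process_option_or_exit() already recognizes for that option.
*/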
env_vars_t env_vars[] = {
{ "SLURM_ACCOUNT", 'A' },
{ "SLURM_ACCTG_FREQ", LONG_OPT_ACCTG_FREQ },
{ "SLURM_BCAST", LONG_OPT_BCAST },
{ "SLURM_BCAST_EXCLUDE", LONG_OPT_BCAST_EXCLUDE },
{ "SLURM_BURST_BUFFER", LONG_OPT_BURST_BUFFER_SPEC },
{ "SLURM_CLUSTERS", 'M' },
{ "SLURM_CLUSTER_CONSTRAINT", LONG_OPT_CLUSTER_CONSTRAINT },
{ "SLURM_COMPRESS", LONG_OPT_COMPRESS },
{ "SLURM_CONSTRAINT", 'C' },
{ "SLURM_CORE_SPEC", 'S' },
{ "SLURM_CPUS_PER_TASK", 'c' },
{ "SLURM_CPU_BIND", LONG_OPT_CPU_BIND },
{ "SLURM_CPU_FREQ_REQ", LONG_OPT_CPU_FREQ },
{ "SLURM_CPUS_PER_GPU", LONG_OPT_CPUS_PER_GPU },
{ "SLURM_DELAY_BOOT", LONG_OPT_DELAY_BOOT },
{ "SLURM_DEPENDENCY", 'd' },
{ "SLURM_DISABLE_STATUS", 'X' },
{ "SLURM_DISTRIBUTION", 'm' },
{ "SLURM_EPILOG", LONG_OPT_EPILOG },
{ "SLURM_EXACT", LONG_OPT_EXACT },
{ "SLURM_EXCLUSIVE", LONG_OPT_EXCLUSIVE },
{ "SLURM_EXPORT_ENV", LONG_OPT_EXPORT },
{ "SRUN_EXPORT_ENV", LONG_OPT_EXPORT }, /* overrides SLURM_EXPORT_ENV */
{ "SLURM_EXTERNAL_LAUNCHER", LONG_OPT_EXTERNAL_LAUNCHER },
{ "SLURM_GPUS", 'G' },
{ "SLURM_GPU_BIND", LONG_OPT_GPU_BIND },
{ "SLURM_GPU_FREQ", LONG_OPT_GPU_FREQ },
{ "SLURM_GPUS_PER_NODE", LONG_OPT_GPUS_PER_NODE },
{ "SLURM_GPUS_PER_SOCKET", LONG_OPT_GPUS_PER_SOCKET },
{ "SLURM_GPUS_PER_TASK", LONG_OPT_GPUS_PER_TASK },
{ "SLURM_GRES", LONG_OPT_GRES },
{ "SLURM_GRES_FLAGS", LONG_OPT_GRES_FLAGS },
{ "SLURM_HINT", LONG_OPT_HINT },
{ "SLURM_JOB_ID", LONG_OPT_JOBID },
{ "SLURM_JOB_NAME", 'J' },
{ "SLURM_JOB_NODELIST", LONG_OPT_ALLOC_NODELIST },
{ "SLURM_JOB_NUM_NODES", 'N' },
{ "SLURM_KILL_BAD_EXIT", 'K' },
{ "SLURM_LABELIO", 'l' },
{ "SLURM_MEM_BIND", LONG_OPT_MEM_BIND },
{ "SLURM_MEM_PER_CPU", LONG_OPT_MEM_PER_CPU },
{ "SLURM_MEM_PER_GPU", LONG_OPT_MEM_PER_GPU },
{ "SLURM_MEM_PER_NODE", LONG_OPT_MEM },
{ "SLURM_MPI_TYPE", LONG_OPT_MPI },
{ "SLURM_NCORES_PER_SOCKET", LONG_OPT_CORESPERSOCKET },
{ "SLURM_NETWORK", LONG_OPT_NETWORK },
{ "SLURM_NO_KILL", 'k' },
{ "SLURM_NPROCS", 'n' }, /* deprecated, should be removed */
/* listed first so SLURM_NTASKS overrides */
{ "SLURM_NTASKS", 'n' },
{ "SLURM_NSOCKETS_PER_NODE", LONG_OPT_SOCKETSPERNODE },
{ "SLURM_NTASKS_PER_CORE", LONG_OPT_NTASKSPERCORE },
{ "SLURM_NTASKS_PER_NODE", LONG_OPT_NTASKSPERNODE },
{ "SLURM_NTASKS_PER_GPU", LONG_OPT_NTASKSPERGPU },
{ "SLURM_NTASKS_PER_TRES", LONG_OPT_NTASKSPERTRES },
{ "SLURM_OOM_KILL_STEP", LONG_OPT_OOMKILLSTEP },
{ "SLURM_OPEN_MODE", LONG_OPT_OPEN_MODE },
{ "SLURM_OVERCOMMIT", 'O' },
{ "SLURM_OVERLAP", LONG_OPT_OVERLAP },
{ "SLURM_PARTITION", 'p' },
{ "SLURM_POWER", LONG_OPT_POWER },
{ "SLURM_PROFILE", LONG_OPT_PROFILE },
{ "SLURM_PROLOG", LONG_OPT_PROLOG },
{ "SLURM_QOS", 'q' },
{ "SLURM_REMOTE_CWD", 'D' },
{ "SLURM_REQ_SWITCH", LONG_OPT_SWITCH_REQ },
{ "SLURM_RESERVATION", LONG_OPT_RESERVATION },
{ "SLURM_RESV_PORTS", LONG_OPT_RESV_PORTS },
{ "SLURM_SEND_LIBS", LONG_OPT_SEND_LIBS },
{ "SLURM_SIGNAL", LONG_OPT_SIGNAL },
{ "SLURM_SPREAD_JOB", LONG_OPT_SPREAD_JOB },
{ "SLURM_SRUN_MULTI", LONG_OPT_MULTI },
{ "SLURM_STDERRMODE", 'e' }, /* Left for backward compatibility */
{ "SLURM_STDINMODE", 'i' }, /* Left for backward compatibility */
{ "SLURM_STDOUTMODE", 'o' }, /* Left for backward compatibility */
{ "SLURM_TASK_EPILOG", LONG_OPT_TASK_EPILOG },
{ "SLURM_TASK_PROLOG", LONG_OPT_TASK_PROLOG },
{ "SLURM_THREAD_SPEC", LONG_OPT_THREAD_SPEC },
{ "SLURM_THREADS", 'T' },
{ "SLURM_THREADS_PER_CORE", LONG_OPT_THREADSPERCORE },
{ "SLURM_TIMELIMIT", 't' },
{ "SLURM_TRES_BIND", LONG_OPT_TRES_BIND },
{ "SLURM_TRES_PER_TASK", LONG_OPT_TRES_PER_TASK },
{ "SLURM_UNBUFFEREDIO", 'u' },
{ "SLURM_USE_MIN_NODES", LONG_OPT_USE_MIN_NODES },
{ "SLURM_WAIT", 'W' },
{ "SLURM_WAIT4SWITCH", LONG_OPT_SWITCH_WAIT },
{ "SLURM_WCKEY", LONG_OPT_WCKEY },
{ "SLURM_WORKING_DIR", 'D' },
{ "SLURMD_DEBUG", LONG_OPT_SLURMD_DEBUG },
{ "SRUN_CONTAINER", LONG_OPT_CONTAINER },
{ "SRUN_CONTAINER_ID", LONG_OPT_CONTAINER_ID },
{ "SLURM_DEBUG", 'v'},
{ "SRUN_ERROR", 'e' },
{ "SRUN_INPUT", 'i' },
{ "SRUN_OUTPUT", 'o' },
{ "SRUN_SEGMENT_SIZE", LONG_OPT_SEGMENT_SIZE },
{ NULL }
};
/*
* _opt_env(): used by initialize_and_process_args to set options via
* environment variables. See comments above for how to
* extend srun to process different vars
*/
static void _opt_env(int het_job_offset)
{
char key[64], *val = NULL;
env_vars_t *e = env_vars;
while (e->var) {
if ((val = getenv(e->var)))
slurm_process_option_or_exit(&opt, e->type, val, true,
false);
if ((het_job_offset >= 0) &&
strcmp(e->var, "SLURM_JOBID") &&
strcmp(e->var, "SLURM_JOB_ID")) {
/* Continue supporting old hetjob terminology. */
snprintf(key, sizeof(key), "%s_PACK_GROUP_%d",
e->var, het_job_offset);
if ((val = getenv(key)))
slurm_process_option_or_exit(&opt, e->type, val,
true, false);
snprintf(key, sizeof(key), "%s_HET_GROUP_%d",
e->var, het_job_offset);
if ((val = getenv(key)))
slurm_process_option_or_exit(&opt, e->type, val,
true, false);
}
e++;
}
/* Process spank env options */
if (spank_process_env_options())
exit(error_exit);
}
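/*
* Example (hypothetical values): with
*
*	SLURM_NTASKS_HET_GROUP_0=4 SLURM_NTASKS_HET_GROUP_1=8 srun ...
*
* the loop above applies SLURM_NTASKS_HET_GROUP_0 while options for
* component 0 are processed and SLURM_NTASKS_HET_GROUP_1 for component
* 1, as if -n had been supplied separately for each component.
*/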
/*
* If --het-group option found, return a bitmap representing their IDs
* argc IN - Argument count
* argv IN - Arguments
* default_het_job_offset IN - Default offset
* opt_found OUT - Set to true if --het-group option found
* RET bitmap of het groups to run
*/
static bitstr_t *_get_het_group(const int argc, char **argv,
int default_het_job_offset, bool *opt_found)
{
int i, opt_char, option_index = 0;
char *tmp = NULL;
bitstr_t *het_grp_bits = bit_alloc(MAX_HET_JOB_COMPONENTS);
hostlist_t *hl;
char *opt_string = NULL;
struct option *optz = slurm_option_table_create(&opt, &opt_string);
*opt_found = false;
optind = 0;
opterr = 0; /* disable error messages about unrecognized options */
while ((opt_char = getopt_long(argc, argv, opt_string,
optz, &option_index)) != -1) {
slurm_process_option_or_exit(&opt, opt_char, optarg, false,
true);
}
slurm_option_table_destroy(optz);
xfree(opt_string);
*opt_found = (sropt.het_group != NULL);
if (!*opt_found) {
bit_set(het_grp_bits, default_het_job_offset);
return het_grp_bits;
}
if (sropt.het_group[0] == '[')
tmp = xstrdup(sropt.het_group);
else
xstrfmtcat(tmp, "[%s]", sropt.het_group);
hl = hostlist_create(tmp);
if (!hl) {
error("Invalid --het-group value: %s", sropt.het_group);
exit(error_exit);
}
xfree(tmp);
while ((tmp = hostlist_shift(hl))) {
char *end_ptr = NULL;
i = strtol(tmp, &end_ptr, 10);
if ((i < 0) || (i >= MAX_HET_JOB_COMPONENTS) ||
(end_ptr[0] != '\0')) {
error("Invalid --het-group value: %s",
sropt.het_group);
exit(error_exit);
}
bit_set(het_grp_bits, i);
free(tmp);
}
hostlist_destroy(hl);
if (bit_ffs(het_grp_bits) == -1) { /* No bits set */
error("Invalid --het-group value: %s", sropt.het_group);
exit(error_exit);
}
return het_grp_bits;
}
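/*
* Example: "--het-group=1,3" is wrapped as "[1,3]" and expanded via
* hostlist_create(), setting bits 1 and 3 in the returned bitmap;
* a range such as "--het-group=0-2" sets bits 0, 1 and 2.
*/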
static void _set_options(const int argc, char **argv)
{
int opt_char, option_index = 0;
char *opt_string = NULL;
struct option *optz = slurm_option_table_create(&opt, &opt_string);
optind = 0;
opterr = 1; /* re-enable error messages for unrecognized options */
while ((opt_char = getopt_long(argc, argv, opt_string,
optz, &option_index)) != -1) {
slurm_process_option_or_exit(&opt, opt_char, optarg, false,
false);
}
slurm_option_table_destroy(optz);
xfree(opt_string);
}
static void _mpi_print_list(void)
{
plugrack_t *mpi_rack = plugrack_create("mpi");
plugrack_read_dir(mpi_rack, slurm_conf.plugindir);
plugrack_print_mpi_plugins(mpi_rack);
plugrack_destroy(mpi_rack);
}
/*
* _opt_args() : set options via commandline args and popt
*/
static void _opt_args(int argc, char **argv, int het_job_offset)
{
int i, command_pos = 0, command_args = 0;
char **rest = NULL;
char *fullpath;
static char *prev_mpi = NULL;
static int het_comp_number = -1;
het_comp_number++;
sropt.het_grp_bits = bit_alloc(MAX_HET_JOB_COMPONENTS);
bit_set(sropt.het_grp_bits, het_job_offset);
if (opt.container && !getenv("SLURM_CONTAINER"))
setenvf(NULL, "SLURM_CONTAINER", "%s", opt.container);
if (opt.container_id && !getenv("SLURM_CONTAINER_ID"))
setenvf(NULL, "SLURM_CONTAINER_ID", "%s", opt.container_id);
if (opt.network)
setenvf(NULL, "SLURM_NETWORK", "%s", opt.network);
if (opt.dependency)
setenvfs("SLURM_JOB_DEPENDENCY=%s", opt.dependency);
opt.argc = 0;
if (optind < argc) {
rest = argv + optind;
while ((rest[opt.argc] != NULL) && strcmp(rest[opt.argc], ":"))
opt.argc++;
}
command_args = opt.argc;
if (!prev_mpi && het_comp_number &&
xstrcmp(sropt.mpi_type, slurm_conf.mpi_default)) {
error("--mpi is only supported in the first heterogeneous component");
exit(error_exit);
}
prev_mpi = sropt.mpi_type;
if (!xstrcmp(sropt.mpi_type, "list")) {
_mpi_print_list();
exit(0);
}
if (!rest && !sropt.test_only)
fatal("No command given to execute.");
command_pos = launch_g_setup_srun_opt(rest, &opt);
/* make sure we have allocated things correctly */
if (command_args)
xassert((command_pos + command_args) <= opt.argc);
for (i = command_pos; i < opt.argc; i++) {
if (!rest || !rest[i-command_pos])
break;
// info("argv[%d]='%s'", i, opt.argv[i]);
opt.argv[i] = xstrdup(rest[i-command_pos]);
}
opt.argv[i] = NULL; /* End of argv's (for possible execv) */
if (getenv("SLURM_TEST_EXEC") ||
xstrstr(slurm_conf.launch_params, "test_exec"))
sropt.test_exec = true;
if (sropt.test_exec) {
/* Validate command's existence */
if (sropt.prolog && xstrcasecmp(sropt.prolog, "none")) {
if ((fullpath = search_path(opt.chdir, sropt.prolog,
true, R_OK|X_OK, true)))
sropt.prolog = fullpath;
else
error("prolog '%s' not found in PATH or CWD (%s), or wrong permissions",
sropt.prolog, opt.chdir);
}
if (sropt.epilog && xstrcasecmp(sropt.epilog, "none")) {
if ((fullpath = search_path(opt.chdir, sropt.epilog,
true, R_OK|X_OK, true)))
sropt.epilog = fullpath;
else
error("epilog '%s' not found in PATH or CWD (%s), or wrong permissions",
sropt.epilog, opt.chdir);
}
if (sropt.task_prolog) {
if ((fullpath = search_path(opt.chdir, sropt.task_prolog,
true, R_OK|X_OK, true)))
sropt.task_prolog = fullpath;
else
error("task-prolog '%s' not found in PATH or CWD (%s), or wrong permissions",
sropt.task_prolog, opt.chdir);
}
if (sropt.task_epilog) {
if ((fullpath = search_path(opt.chdir, sropt.task_epilog,
true, R_OK|X_OK, true)))
sropt.task_epilog = fullpath;
else
error("task-epilog '%s' not found in PATH or CWD (%s), or wrong permissions",
sropt.task_epilog, opt.chdir);
}
}
/* may exit() if an error occurs with the multi_prog script */
(void) launch_g_handle_multi_prog_verify(command_pos, &opt);
if (!sropt.multi_prog && (sropt.test_exec || sropt.bcast_flag) &&
opt.argv && opt.argv[command_pos]) {
if ((fullpath = search_path(opt.chdir, opt.argv[command_pos],
true, X_OK, true))) {
xfree(opt.argv[command_pos]);
opt.argv[command_pos] = fullpath;
} else {
fatal("Can not execute %s", opt.argv[command_pos]);
}
}
}
/*
* _opt_verify : perform some post option processing verification
*
*/
static bool _opt_verify(void)
{
bool verified = true;
hostlist_t *hl = NULL;
int hl_cnt = 0;
bool mpack_reset_nodes = false;
if (opt.srun_opt->interactive) {
if (((opt.distribution & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY)) {
opt.distribution &= ~SLURM_DIST_ARBITRARY;
}
}
/*
* This means --ntasks was read from the environment.
* We will override it with what the user specified in the hostlist.
*/
if (((opt.distribution & SLURM_DIST_STATE_BASE) == SLURM_DIST_ARBITRARY)) {
if (slurm_option_set_by_env(&opt, 'n'))
opt.ntasks_set = false;
if (slurm_option_set_by_env(&opt, 'N'))
opt.nodes_set = false;
}
/*
* Specifying --gpus should override SLURM_GPUS_PER_NODE env if present
* in step request.
*/
if (slurm_option_set_by_env(&opt, LONG_OPT_GPUS_PER_NODE) &&
slurm_option_set_by_cli(&opt, 'G') && is_step)
slurm_option_reset(&opt, "gpus-per-node");
validate_options_salloc_sbatch_srun(&opt);
/*
* If block distribution was requested without 'nopack' and the
* system is set up to pack nodes, set pack mode here.
*/
if ((slurm_conf.select_type_param & SELECT_PACK_NODES) &&
!(opt.distribution & SLURM_DIST_NO_PACK_NODES) &&
((opt.distribution & SLURM_DIST_BLOCK) ||
(opt.distribution == SLURM_DIST_UNKNOWN)))
opt.distribution |= SLURM_DIST_PACK_NODES;
/*
* If we are packing the nodes in an allocation set min_nodes to
* 1. The slurmctld will adjust the max_nodes to the appropriate
* number if the allocation is homogeneous.
*/
if ((opt.distribution & SLURM_DIST_PACK_NODES) &&
slurm_option_set_by_env(&opt, 'N')) {
opt.min_nodes = 1;
if (opt.verbose)
info("Resetting -N set by environment variable because of -mpack");
mpack_reset_nodes = true;
}
/*
* Do not set slurmd debug level higher than DEBUG2,
* as DEBUG3 is used for slurmd IO operations, which
* are not appropriate to be sent back to srun. (because
* these debug messages cause the generation of more
* debug messages ad infinitum)
*/
if (sropt.slurmd_debug > LOG_LEVEL_DEBUG2) {
sropt.slurmd_debug = LOG_LEVEL_DEBUG2;
info("Using max slurmd-debug value of '%s'",
log_num2string(sropt.slurmd_debug));
} else if (sropt.slurmd_debug < LOG_LEVEL_ERROR) {
sropt.slurmd_debug = LOG_LEVEL_ERROR;
info("Using min slurmd-debug level of %s",
log_num2string(sropt.slurmd_debug));
}
if (opt.quiet && opt.verbose) {
error ("don't specify both --verbose (-v) and --quiet (-Q)");
verified = false;
}
if (opt.burst_buffer && opt.burst_buffer_file) {
error("Cannot specify both --burst-buffer and --bbf");
exit(error_exit);
} else if (opt.burst_buffer_file) {
buf_t *buf = create_mmap_buf(opt.burst_buffer_file);
if (!buf) {
error("Invalid --bbf specification");
exit(error_exit);
}
opt.burst_buffer = xstrdup(get_buf_data(buf));
FREE_NULL_BUFFER(buf);
xfree(opt.burst_buffer_file);
}
if (sropt.exact && sropt.whole) {
error("--exact and --whole are mutually exclusive.");
verified = false;
}
if (sropt.no_alloc && !opt.nodelist) {
error("must specify a node list with -Z, --no-allocate.");
verified = false;
}
if (sropt.no_alloc && opt.exclude) {
error("can not specify --exclude list with -Z, --no-allocate.");
verified = false;
}
if (sropt.no_alloc && (sropt.relative != NO_VAL)) {
error("do not specify -r,--relative with -Z,--no-allocate.");
verified = false;
}
if ((sropt.relative != NO_VAL) && (opt.exclude || opt.nodelist)) {
error("-r,--relative not allowed with "
"-w,--nodelist or -x,--exclude.");
verified = false;
}
if (!sropt.epilog)
sropt.epilog = xstrdup(slurm_conf.srun_epilog);
if (!sropt.prolog)
sropt.prolog = xstrdup(slurm_conf.srun_prolog);
/* slurm_verify_cpu_bind has to be called before validate_hint_option */
if (opt.srun_opt->cpu_bind) {
if (slurm_verify_cpu_bind(opt.srun_opt->cpu_bind,
&opt.srun_opt->cpu_bind,
&opt.srun_opt->cpu_bind_type))
verified = false;
}
if (opt.hint &&
!validate_hint_option(&opt)) {
xassert(opt.ntasks_per_core == NO_VAL);
xassert(opt.threads_per_core == NO_VAL);
if (verify_hint(opt.hint,
&opt.sockets_per_node,
&opt.cores_per_socket,
&opt.threads_per_core,
&opt.ntasks_per_core,
&sropt.cpu_bind_type)) {
exit(error_exit);
}
}
if (opt.cpus_set && (opt.pn_min_cpus < opt.cpus_per_task))
opt.pn_min_cpus = opt.cpus_per_task;
if ((opt.argc > 0) && xstrcmp(opt.argv[0], ":")) {
xfree(sropt.cmd_name);
sropt.cmd_name = base_name(opt.argv[0]);
}
if (opt.exclude && !_valid_node_list(&opt.exclude))
exit(error_exit);
if (slurm_option_set_by_cli(&opt, LONG_OPT_EXCLUSIVE) &&
slurm_option_set_by_cli(&opt, LONG_OPT_OVERLAP)) {
error("--exclusive and --overlap are mutually exclusive");
verified = false;
}
/* set proc and node counts based on the arbitrary list of nodes */
if (((opt.distribution & SLURM_DIST_STATE_BASE) == SLURM_DIST_ARBITRARY)
&& (!opt.nodes_set || !opt.ntasks_set)
&& !xstrchr(opt.nodelist, '{')) {
hostlist_t *hl = hostlist_create(opt.nodelist);
if (!hl)
fatal("Invalid node list specified");
if (!opt.ntasks_set) {
opt.ntasks_set = true;
opt.ntasks = hostlist_count(hl);
}
if (!opt.nodes_set) {
opt.nodes_set = true;
hostlist_uniq(hl);
opt.min_nodes = opt.max_nodes = hostlist_count(hl);
}
hostlist_destroy(hl);
}
/*
* Handle special settings for parallel debugging.
*/
if (sropt.debugger_test || _under_parallel_debugger())
sropt.parallel_debug = true;
if (sropt.parallel_debug) {
/* Set --threads 1 */
slurm_process_option_or_exit(&opt, 'T', "1", false, false);
/* Set --msg-timeout 15 */
slurm_process_option_or_exit(&opt, LONG_OPT_MSG_TIMEOUT, "15",
false, false);
}
pmi_server_max_threads(sropt.max_threads);
/* check for realistic arguments */
if (opt.ntasks <= 0) {
error("invalid number of tasks (-n %d)", opt.ntasks);
verified = false;
}
if (opt.cpus_set && (opt.cpus_per_task <= 0)) {
error("invalid number of cpus per task (-c %d)",
opt.cpus_per_task);
verified = false;
}
if ((opt.min_nodes < 0) || (opt.max_nodes < 0) ||
(opt.max_nodes && (opt.min_nodes > opt.max_nodes))) {
error("invalid number of nodes (-N %d-%d)",
opt.min_nodes, opt.max_nodes);
verified = false;
}
if (!opt.ntasks_per_node) {
error("ntasks-per-node is 0");
verified = false;
}
/* bound max_threads/cores from ntasks_cores/sockets */
if (opt.ntasks_per_core > 0) {
/* if cpu_bind_type doesn't already have an auto pref,
* choose the binding level based on ntasks_per_core
*/
if (!(sropt.cpu_bind_type & (CPU_BIND_TO_SOCKETS |
CPU_BIND_TO_CORES |
CPU_BIND_TO_THREADS |
CPU_BIND_TO_LDOMS))) {
if (opt.ntasks_per_core == 1)
sropt.cpu_bind_type |= CPU_BIND_TO_CORES;
else
sropt.cpu_bind_type |= CPU_BIND_TO_THREADS;
}
if ((opt.threads_per_core != NO_VAL) &&
(opt.threads_per_core < opt.ntasks_per_core)) {
error("--ntasks-per-core (%d) can not be bigger than --threads-per-core (%d)",
opt.ntasks_per_core, opt.threads_per_core);
verified = false;
}
}
if (opt.ntasks_per_socket > 0) {
/* if cpu_bind_type doesn't already have an auto pref,
* default to socket-level binding for ntasks_per_socket
*/
if (!(sropt.cpu_bind_type & (CPU_BIND_TO_SOCKETS |
CPU_BIND_TO_CORES |
CPU_BIND_TO_THREADS |
CPU_BIND_TO_LDOMS))) {
sropt.cpu_bind_type |= CPU_BIND_TO_SOCKETS;
}
}
/* massage the numbers */
if (opt.nodelist && !opt.nodes_set && !xstrchr(opt.nodelist, '{')) {
hl = hostlist_create(opt.nodelist);
if (!hl)
fatal("Invalid node list specified");
hostlist_uniq(hl);
hl_cnt = hostlist_count(hl);
opt.min_nodes = hl_cnt;
opt.nodes_set = true;
}
if ((opt.nodes_set || opt.extra_set) &&
((opt.min_nodes == opt.max_nodes) || (opt.max_nodes == 0)) &&
(opt.ntasks_per_node == NO_VAL) &&
!opt.ntasks_set) {
/* 1 proc / node default */
opt.ntasks = opt.min_nodes;
/* 1 proc / min_[socket * core * thread] default */
if ((opt.sockets_per_node != NO_VAL) &&
(opt.cores_per_socket != NO_VAL) &&
(opt.threads_per_core != NO_VAL)) {
opt.ntasks *= opt.sockets_per_node;
opt.ntasks *= opt.cores_per_socket;
opt.ntasks *= opt.threads_per_core;
opt.ntasks_set = true;
if (opt.verbose)
info("Number of tasks implicitly set to %d",
opt.ntasks);
}
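/*
* Worked example (hypothetical request): "-N 2 --sockets-per-node=2
* --cores-per-socket=4 --threads-per-core=1" starts from one task per
* node (2) and multiplies out to 2 * 2 * 4 * 1 = 16 tasks.
*/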
/* massage the numbers */
if (opt.nodelist) {
FREE_NULL_HOSTLIST(hl);
hl = hostlist_create(opt.nodelist);
if (!hl)
fatal("Invalid node list specified");
if (((opt.distribution & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY) && !opt.ntasks_set) {
opt.ntasks = hostlist_count(hl);
opt.ntasks_set = true;
}
hostlist_uniq(hl);
hl_cnt = hostlist_count(hl);
/* Don't destroy hl here since it may be used later */
}
} else if (opt.nodes_set && opt.ntasks_set) {
/*
* Make sure that the number of
* max_nodes is <= number of tasks
*/
if (opt.ntasks < opt.max_nodes)
opt.max_nodes = opt.ntasks;
/*
* make sure # of procs >= min_nodes
*/
if ((opt.ntasks < opt.min_nodes) && (opt.ntasks > 0)) {
char *tmp = NULL;
warning("can't run %d processes on %d nodes, setting nnodes to %d",
opt.ntasks, opt.min_nodes, opt.ntasks);
opt.min_nodes = opt.ntasks;
if (opt.max_nodes
&& (opt.min_nodes > opt.max_nodes) )
opt.max_nodes = opt.min_nodes;
/*
* This will force the set_by_env flag to false,
* which influences future decisions.
*/
xstrfmtcat(tmp, "%d", opt.min_nodes);
slurm_process_option_or_exit(&opt, 'N', tmp, false,
false);
xfree(tmp);
if (hl_cnt > opt.min_nodes) {
int del_cnt, i;
char *host;
del_cnt = hl_cnt - opt.min_nodes;
for (i=0; i<del_cnt; i++) {
host = hostlist_pop(hl);
free(host);
}
xfree(opt.nodelist);
opt.nodelist =
hostlist_ranged_string_xmalloc(hl);
}
}
if ((opt.ntasks_per_node != NO_VAL) &&
slurm_option_set_by_env(&opt, 'n') &&
!slurm_option_set_by_env(&opt, 'N')) {
slurm_option_reset(&opt, "ntasks");
} else if (opt.ntasks_per_node != NO_VAL) {
bool ntasks_per_node_reset = false;
int min_ntasks, max_ntasks;
min_ntasks = opt.min_nodes * opt.ntasks_per_node;
max_ntasks = opt.max_nodes * opt.ntasks_per_node;
/*
* We only want to warn about incoherent combinations of
* -n/-N/--ntasks-per-node for steps, since job
* allocations will already have been rejected.
*/
if (opt.max_nodes &&
(opt.ntasks > max_ntasks) &&
!mpack_reset_nodes &&
is_step) {
warning("can't honor --ntasks-per-node set to %u which doesn't match the requested tasks %u with the maximum number of requested nodes %u. Ignoring --ntasks-per-node.",
opt.ntasks_per_node, opt.ntasks,
opt.max_nodes);
ntasks_per_node_reset = true;
}
else if (opt.min_nodes &&
(opt.ntasks != min_ntasks) &&
(opt.ntasks > opt.ntasks_per_node) &&
mpack_reset_nodes) {
warning("can't honor --ntasks-per-node set to %u which doesn't match the requested tasks %u and -mpack, which forces min number of nodes to 1",
opt.ntasks_per_node, opt.ntasks);
ntasks_per_node_reset = true;
}
if (ntasks_per_node_reset)
slurm_option_reset(&opt, "ntasks-per-node");
}
} /* else if (opt.ntasks_set && !opt.nodes_set) */
if ((opt.ntasks_per_node != NO_VAL) && (!opt.ntasks_set)) {
opt.ntasks = opt.min_nodes * opt.ntasks_per_node;
opt.ntasks_set = 1;
}
FREE_NULL_HOSTLIST(hl);
if ((opt.deadline) && (opt.begin) && (opt.deadline < opt.begin)) {
error("Incompatible begin and deadline time specification");
exit(error_exit);
}
if (!sropt.mpi_type)
sropt.mpi_type = xstrdup(slurm_conf.mpi_default);
if (!opt.job_name)
opt.job_name = xstrdup(sropt.cmd_name);
if (sropt.pty) {
#ifdef HAVE_PTY_H
sropt.unbuffered = true; /* implicit */
/* --error/--input/--output conflicts are rejected below */
#else
error("--pty not currently supported on this system type, ignoring option");
sropt.pty = NULL; /* sropt.pty is a string pointer */
#endif
}
if (opt.x11) {
x11_get_display(&opt.x11_target_port, &opt.x11_target);
opt.x11_magic_cookie = x11_get_xauth();
}
if (sropt.pty) {
if (opt.efname || opt.ifname || opt.ofname) {
error("--error/--input/--output are incompatible with --pty%s%s",
(sropt.pty[0] ? "=" : ""), sropt.pty);
exit(error_exit);
}
}
return verified;
}
/* Initialize the spank_job_env based upon environment variables set
* via salloc or sbatch commands */
extern void init_spank_env(void)
{
extern char **environ;
if (environ == NULL) {
debug3("%s: environ is NULL", __func__);
return;
}
for (int i = 0; environ[i]; i++) {
char *name, *eq, *value;
if (xstrncmp(environ[i], "SLURM_SPANK_", 12)) {
debug3("%s: skipping environ[%d]: %s",
__func__, i, environ[i]);
continue;
}
name = xstrdup(environ[i] + 12);
eq = strchr(name, (int)'=');
if (eq == NULL) {
fatal("Malformed SPANK environment entry: %s",
environ[i]);
}
eq[0] = '\0';
value = eq + 1;
spank_set_job_env(name, value, 1);
xfree(name);
debug3("%s: adding SPANK environ[%d]: %s",
__func__, i, environ[i]);
}
}
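/*
* Example (hypothetical variable): if salloc exported
* SLURM_SPANK_FOO=bar, the loop above strips the "SLURM_SPANK_" prefix
* and records FOO=bar via spank_set_job_env().
*/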
/* Functions used by SPANK plugins to read and write job environment
* variables for use within job's Prolog and/or Epilog */
extern char *spank_get_job_env(const char *name)
{
int i, len;
char *tmp_str = NULL;
if ((name == NULL) || (name[0] == '\0') ||
(strchr(name, (int)'=') != NULL)) {
errno = EINVAL;
return NULL;
}
xstrcat(tmp_str, name);
xstrcat(tmp_str, "=");
len = strlen(tmp_str);
for (i = 0; i < opt.spank_job_env_size; i++) {
if (xstrncmp(opt.spank_job_env[i], tmp_str, len))
continue;
xfree(tmp_str);
return (opt.spank_job_env[i] + len);
}
return NULL;
}
extern int spank_set_job_env(const char *name, const char *value,
int overwrite)
{
int i, len;
char *tmp_str = NULL;
if ((name == NULL) || (name[0] == '\0') ||
(strchr(name, (int)'=') != NULL)) {
errno = EINVAL;
return -1;
}
xstrcat(tmp_str, name);
xstrcat(tmp_str, "=");
len = strlen(tmp_str);
xstrcat(tmp_str, value);
for (i = 0; i < opt.spank_job_env_size; i++) {
if (xstrncmp(opt.spank_job_env[i], tmp_str, len))
continue;
if (overwrite) {
xfree(opt.spank_job_env[i]);
opt.spank_job_env[i] = tmp_str;
} else
xfree(tmp_str);
return 0;
}
/* Need to add an entry */
opt.spank_job_env_size++;
xrealloc(opt.spank_job_env, sizeof(char *) * opt.spank_job_env_size);
opt.spank_job_env[i] = tmp_str;
return 0;
}
extern int spank_unset_job_env(const char *name)
{
int i, j, len;
char *tmp_str = NULL;
if ((name == NULL) || (name[0] == '\0') ||
(strchr(name, (int)'=') != NULL)) {
errno = EINVAL;
return -1;
}
xstrcat(tmp_str, name);
xstrcat(tmp_str, "=");
len = strlen(tmp_str);
for (i = 0; i < opt.spank_job_env_size; i++) {
if (xstrncmp(opt.spank_job_env[i], tmp_str, len))
continue;
xfree(opt.spank_job_env[i]);
for (j = (i+1); j < opt.spank_job_env_size; i++, j++)
opt.spank_job_env[i] = opt.spank_job_env[j];
opt.spank_job_env_size--;
if (opt.spank_job_env_size == 0)
xfree(opt.spank_job_env);
return 0;
}
return 0; /* not found */
}
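/*
* Usage sketch (hypothetical values) for the three calls above:
*
*	spank_set_job_env("FOO", "bar", 1);	stores "FOO=bar"
*	spank_get_job_env("FOO");		returns "bar"
*	spank_unset_job_env("FOO");		removes the entry
*
* Entries live in opt.spank_job_env as "name=value" strings and are
* made available within the job's Prolog and Epilog.
*/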
/* Determine if srun is under the control of a parallel debugger or not */
static bool _under_parallel_debugger (void)
{
return (MPIR_being_debugged != 0);
}
static void _autocomplete(const char *query)
{
char *opt_string = NULL;
struct option *optz = slurm_option_table_create(&opt, &opt_string);
suggest_completion(optz, query);
xfree(opt_string);
slurm_option_table_destroy(optz);
}
static void _usage(void)
{
printf(
"Usage: srun [-N nnodes] [-n ntasks] [-i in] [-o out] [-e err]\n"
" [-c ncpus] [-r n] [-p partition] [--hold] [-t minutes]\n"
" [-D path] [--immediate[=secs]] [--overcommit] [--overlap] [--no-kill]\n"
" [--oversubscribe] [--label] [--unbuffered] [-m dist] [-J jobname]\n"
" [--jobid=id] [--verbose] [--slurmd_debug=#] [--gres=list]\n"
" [-T threads] [-W sec] [--gres-flags=opts]\n"
" [--licenses=names] [--clusters=cluster_names]\n"
" [--qos=qos] [--time-min=minutes]\n"
" [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n"
" [--mpi=type] [--account=name] [--dependency=type:jobid[+time]]\n"
" [--kill-on-bad-exit] [--propagate[=rlimits]] [--comment=name]\n"
" [--cpu-bind=...] [--mem-bind=...] [--network=type]\n"
" [--ntasks-per-node=n] [--ntasks-per-socket=n] [--reservation=name]\n"
" [--ntasks-per-core=n] [--mem-per-cpu=MB] [--preserve-env]\n"
" [--profile=...] [--exact]\n"
" [--mail-type=type] [--mail-user=user] [--nice[=value]]\n"
" [--prolog=fname] [--epilog=fname]\n"
" [--task-prolog=fname] [--task-epilog=fname]\n"
" [--ctrl-comm-ifhn=addr] [--multi-prog] [--mcs-label=mcs]\n"
" [--cpu-freq=min[-max[:gov]]] [--power=flags] [--spread-job]\n"
" [--switches=max-switches{@max-time-to-wait}] [--reboot]\n"
" [--core-spec=cores] [--thread-spec=threads]\n"
" [--bb=burst_buffer_spec] [--bbf=burst_buffer_file]\n"
" [--bcast=<dest_path>] [--bcast-exclude=<NONE|path1,...,pathN>]\n"
" [--send-libs[=y|n]] [--compress[=library]]\n"
" [--acctg-freq=<datatype>=<interval>] [--delay-boot=mins]\n"
" [-w hosts...] [-x hosts...] [--use-min-nodes]\n"
" [--mpi-combine=yes|no] [--het-group=value]\n"
" [--cpus-per-gpu=n] [--gpus=n] [--gpu-bind=...] [--gpu-freq=...]\n"
" [--gpus-per-node=n] [--gpus-per-socket=n] [--gpus-per-task=n]\n"
" [--mem-per-gpu=MB] [--tres-bind=...] [--tres-per-task=list]\n"
" [--oom-kill-step[=0|1]]\n"
" executable [args...]\n");
}
static void _help(void)
{
slurm_conf_t *conf = slurm_conf_lock();
printf (
"Usage: srun [OPTIONS(0)... [executable(0) [args(0)...]]] [ : [OPTIONS(N)...]] executable(N) [args(N)...]\n"
"\n"
"Parallel run options:\n"
" -A, --account=name charge job to specified account\n"
" --acctg-freq=<datatype>=<interval> accounting and profiling sampling\n"
" intervals. Supported datatypes:\n"
" task=<interval> energy=<interval>\n"
" network=<interval> filesystem=<interval>\n"
" --bb=<spec> burst buffer specifications\n"
" --bbf=<file_name> burst buffer specification file\n"
" --bcast=<dest_path> Copy executable file to compute nodes\n"
" --bcast-exclude=<paths> Shared object directory paths to exclude\n"
" -b, --begin=time defer job until HH:MM MM/DD/YY\n"
" -c, --cpus-per-task=ncpus number of cpus required per task\n"
" --comment=name arbitrary comment\n"
" --compress[=library] data compression library used with --bcast\n"
" --container Path to OCI container bundle\n"
" --container-id OCI container ID\n"
" --cpu-freq=min[-max[:gov]] requested cpu frequency (and governor)\n"
" -d, --dependency=type:jobid[:time] defer job until condition on jobid is satisfied\n"
" --deadline=time remove the job if no ending possible before\n"
" this deadline (start > (deadline - time[-min]))\n"
" --delay-boot=mins delay boot for desired node features\n"
" -D, --chdir=path change remote current working directory\n"
" --export=env_vars|NONE environment variables passed to launcher with\n"
" optional values or NONE (pass no variables)\n"
" -e, --error=err location of stderr redirection\n"
" --epilog=program run \"program\" after launching job step\n"
" -E, --preserve-env env vars for node and task counts override\n"
" command-line flags\n"
" --gres=list required generic resources per node\n"
" --gres-flags=opts flags related to GRES management\n"
" -H, --hold submit job in held state\n"
" -i, --input=in location of stdin redirection\n"
" -I, --immediate[=secs] exit if resources not available in \"secs\"\n"
" --jobid=id run under already allocated job\n"
" -J, --job-name=jobname name of job\n"
" -k, --no-kill do not kill job on node failure\n"
" -K, --kill-on-bad-exit kill the job if any task terminates with a\n"
" non-zero exit code\n"
" -l, --label prepend task number to lines of stdout/err\n"
" -L, --licenses=names required license, comma separated\n"
" -M, --clusters=names Comma separated list of clusters to issue\n"
" commands to. Default is current cluster.\n"
" Name of 'all' will submit to run on all clusters.\n"
" NOTE: SlurmDBD must be up.\n"
" -m, --distribution=type distribution method for processes to nodes\n"
" (type = block|cyclic|arbitrary)\n"
" --mail-type=type notify on state change: BEGIN, END, FAIL or ALL\n"
" --mail-user=user who to send email notification for job state\n"
" changes\n"
" --mcs-label=mcs mcs label if mcs plugin mcs/group is used\n"
" --mpi=type type of MPI being used\n"
" --multi-prog if set the program name specified is the\n"
" configuration specification for multiple programs\n"
" -n, --ntasks=ntasks number of tasks to run\n"
" --nice[=value] decrease scheduling priority by value\n"
" --ntasks-per-node=n number of tasks to invoke on each node\n"
" -N, --nodes=N number of nodes on which to run (N = min[-max])\n"
" --oom-kill-step[=0|1] set the OOMKillStep behaviour\n"
" -o, --output=out location of stdout redirection\n"
" -O, --overcommit overcommit resources\n"
" --overlap Allow other steps to overlap this step\n"
" --het-group=value hetjob component allocation(s) in which to launch\n"
" application\n"
" -p, --partition=partition partition requested\n"
" --power=flags power management options\n"
" --priority=value set the priority of the job to value\n"
" --prolog=program run \"program\" before launching job step\n"
" --profile=value enable acct_gather_profile for detailed data\n"
" value is all or none or any combination of\n"
" energy, lustre, network or task\n"
" --propagate[=rlimits] propagate all [or specific list of] rlimits\n"
#ifdef HAVE_PTY_H
" --pty[=fd] run task zero in pseudo terminal [or in requested terminal given by fd]\n"
#endif
" --quit-on-interrupt quit on single Ctrl-C\n"
" -q, --qos=qos quality of service\n"
" -Q, --quiet quiet mode (suppress informational messages)\n"
" --reboot reboot block before starting job\n"
" -r, --relative=n run job step relative to node n of allocation\n"
" -s, --oversubscribe over-subscribe resources with other jobs\n"
" -S, --core-spec=cores count of reserved cores\n"
" --send-libs[=yes|no] autodetect and broadcast shared objects\n"
" --signal=[R:]num[@time] send signal when time limit within time seconds\n"
" --slurmd-debug=level slurmd debug level\n"
" --spread-job spread job across as many nodes as possible\n"
" --switches=max-switches{@max-time-to-wait}\n"
" Optimum switches and max time to wait for optimum\n"
" --task-epilog=program run \"program\" after launching task\n"
" --task-prolog=program run \"program\" before launching task\n"
" --thread-spec=threads count of reserved threads\n"
" -T, --threads=threads set srun launch fanout\n"
" -t, --time=minutes time limit\n"
" --time-min=minutes minimum time limit (if distinct)\n"
" --tres-bind=... task to tres binding options\n"
" --tres-per-task=list list of tres required per task\n"
" -u, --unbuffered do not line-buffer stdout/err\n"
" --use-min-nodes if a range of node counts is given, prefer the\n"
" smaller count\n"
" -v, --verbose verbose mode (multiple -v's increase verbosity)\n"
" --wait-for-children wait for all child processes in a task to\n"
" close before considering the task ended.\n"
" -W, --wait=sec seconds to wait after first task exits\n"
" before killing job\n"
" --wckey=wckey wckey to run job under\n"
" -X, --disable-status Disable Ctrl-C status feature\n"
"\n"
"Constraint options:\n"
" --cluster-constraint=list specify a list of cluster-constraints\n"
" --contiguous demand a contiguous range of nodes\n"
" -C, --constraint=list specify a list of constraints\n"
" --mem=MB minimum amount of real memory\n"
" --mincpus=n minimum number of logical processors (threads)\n"
" per node\n"
" --reservation=name allocate resources from named reservation\n"
" --tmp=MB minimum amount of temporary disk\n"
" -w, --nodelist=hosts... request a specific list of hosts\n"
" -x, --exclude=hosts... exclude a specific list of hosts\n"
" -Z, --no-allocate don't allocate nodes (must supply -w)\n"
"\n"
"Consumable resources related options:\n"
" --exact use only the resources requested for the step\n"
" (by default, all non-gres resources on each node\n"
" in the allocation will be used in the step)\n"
" --exclusive[=user] for job allocation, this allocates nodes\n"
" in exclusive mode\n"
" for job steps, this is equivalent to --exact\n"
" --exclusive[=mcs] allocate nodes in exclusive mode when\n"
" cpu consumable resource is enabled\n"
" and mcs plugin is enabled (--exact implied)\n"
" or don't share CPUs for job steps\n"
" --mem-per-cpu=MB maximum amount of real memory per allocated\n"
" cpu required by the job.\n"
" --mem >= --mem-per-cpu if --mem is specified.\n"
" --resv-ports reserve communication ports\n"
"\n"
"Affinity/Multi-core options: (when the task/affinity plugin is enabled)\n"
" For the following 4 options, you are\n"
" specifying the minimum resources available for\n"
" the node(s) allocated to the job.\n"
" --sockets-per-node=S number of sockets per node to allocate\n"
" --cores-per-socket=C number of cores per socket to allocate\n"
" --threads-per-core=T number of threads per core to allocate\n"
" -B, --extra-node-info=S[:C[:T]] combine request of sockets per node,\n"
" cores per socket and threads per core.\n"
" Specify an asterisk (*) as a placeholder,\n"
" a minimum value, or a min-max range.\n"
"\n"
" --ntasks-per-core=n number of tasks to invoke on each core\n"
" --ntasks-per-socket=n number of tasks to invoke on each socket\n");
if (xstrstr(conf->task_plugin, "affinity") ||
xstrstr(conf->task_plugin, "cgroup")) {
printf(
" --cpu-bind= Bind tasks to CPUs\n"
" (see \"--cpu-bind=help\" for options)\n"
" --hint= Bind tasks according to application hints\n"
" (see \"--hint=help\" for options)\n");
}
if (xstrstr(conf->task_plugin, "affinity")) {
printf(
" --mem-bind= Bind memory to locality domains (ldom)\n"
" (see \"--mem-bind=help\" for options)\n");
}
slurm_conf_unlock();
spank_print_options(stdout, 6, 30);
printf("\n"
"GPU scheduling options:\n"
" --cpus-per-gpu=n number of CPUs required per allocated GPU\n"
" -G, --gpus=n count of GPUs required for the job\n"
" --gpu-bind=... task to gpu binding options\n"
" --gpu-freq=... frequency and voltage of GPUs\n"
" --gpus-per-node=n number of GPUs required per allocated node\n"
" --gpus-per-socket=n number of GPUs required per allocated socket\n"
" --gpus-per-task=n number of GPUs required per spawned task\n"
" --mem-per-gpu=n real memory required per allocated GPU\n"
);
printf("\n"
"Help options:\n"
" -h, --help show this help message\n"
" --usage display brief usage message\n"
"\n"
"Other options:\n"
" -V, --version output version information and exit\n"
"\n"
);
}