| /****************************************************************************\ |
| * srun_job.c - job data structure creation functions |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Mark Grondona <grondona@llnl.gov>. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <fcntl.h> |
| #include <grp.h> |
| #include <limits.h> |
| #include <netdb.h> |
| #include <signal.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/resource.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <unistd.h> |
| |
| #include "src/common/bitstring.h" |
| #include "src/interfaces/cli_filter.h" |
| #include "src/common/cbuf.h" |
| #include "src/common/fd.h" |
| #include "src/common/forward.h" |
| #include "src/common/hostlist.h" |
| #include "src/common/log.h" |
| #include "src/common/macros.h" |
| #include "src/common/proc_args.h" |
| #include "src/common/read_config.h" |
| #include "src/common/slurm_opt.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/slurm_rlimits_info.h" |
| #include "src/common/spank.h" |
| #include "src/common/uid.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xsignal.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/api/step_launch.h" |
| |
| #include "src/srun/allocate.h" |
| #include "src/srun/debugger.h" |
| #include "src/srun/fname.h" |
| #include "src/srun/launch.h" |
| #include "src/srun/opt.h" |
| #include "src/srun/multi_prog.h" |
| #include "src/srun/srun_job.h" |
| |
| /* |
| * allocation information structure used to store general information |
| * about node allocation to be passed to _job_create_structure() |
| */ |
| typedef struct allocation_info { |
| uint16_t *cpus_per_node; |
| uint32_t *cpu_count_reps; |
| uint32_t nnodes; |
| char *nodelist; |
| uint16_t ntasks_per_board;/* number of tasks to invoke on each board */ |
| uint16_t ntasks_per_core; /* number of tasks to invoke on each core */ |
| uint16_t ntasks_per_tres; /* number of tasks that can access each gpu */ |
| uint16_t ntasks_per_socket;/* number of tasks to invoke on |
| * each socket */ |
| uint32_t num_cpu_groups; |
| char *partition; |
| slurm_step_id_t step_id; |
| uid_t uid; /* resolved user id of job */ |
| char *user_name; /* resolved user name of job */ |
| gid_t gid; /* resolved group id of job */ |
| char *group_name; /* resolved group name of job */ |
| } allocation_info_t; |
| |
| typedef struct het_job_resp_struct { |
| uint16_t *cpu_cnt; |
| hostlist_t *host_list; |
| uint32_t node_cnt; |
| } het_job_resp_struct_t; |
| |
| |
| static int shepherd_fd = -1; |
| static pthread_t signal_thread = (pthread_t) 0; |
| static int pty_sigarray[] = { SIGWINCH, 0 }; |
| |
| extern char **environ; |
| |
| /* |
| * Prototypes: |
| */ |
| |
| static void _call_spank_fini(void); |
| static int _call_spank_local_user(srun_job_t *job, slurm_opt_t *opt_local); |
| static long _diff_tv_str(struct timeval *tv1, struct timeval *tv2); |
| static void _handle_intr(srun_job_t *job); |
| static void _handle_pipe(void); |
| static srun_job_t *_job_create_structure(allocation_info_t *ainfo, |
| slurm_opt_t *opt_local); |
| static char *_normalize_hostlist(const char *hostlist); |
| static void _print_job_information(resource_allocation_response_msg_t *resp); |
| static void _run_srun_epilog (srun_job_t *job); |
| static void _run_srun_prolog (srun_job_t *job); |
| static int _run_srun_script (srun_job_t *job, char *script); |
| static void _set_env_vars(resource_allocation_response_msg_t *resp, |
| int het_job_offset); |
| static void _set_env_vars2(resource_allocation_response_msg_t *resp, |
| int het_job_offset); |
| static void _set_ntasks(allocation_info_t *ai, slurm_opt_t *opt_local); |
| static int _set_rlimit_env(void); |
| static void _set_submit_dir_env(void); |
| static int _set_umask_env(void); |
| static void _shepherd_notify(int shepherd_fd); |
| static int _shepherd_spawn(srun_job_t *job, list_t *srun_job_list, |
| bool got_alloc); |
| static void *_srun_signal_mgr(void *no_data); |
| static void _srun_cli_filter_post_submit(uint32_t jobid, uint32_t stepid); |
| static int _validate_relative(resource_allocation_response_msg_t *resp, |
| slurm_opt_t *opt_local); |
| |
| |
| /* |
| * Create an srun job structure w/out an allocation response msg. |
| * (i.e. use the command line options) |
| */ |
| srun_job_t * |
| job_create_noalloc(void) |
| { |
| srun_job_t *job = NULL; |
| allocation_info_t *ai = xmalloc(sizeof(allocation_info_t)); |
| uint16_t cpn[1]; |
| uint32_t cpu_count_reps[1]; |
| slurm_opt_t *opt_local = &opt; |
| hostlist_t *hl = hostlist_create(opt_local->nodelist); |
| |
| if (!hl) { |
| error("Invalid node list `%s' specified", opt_local->nodelist); |
| goto error; |
| } |
| srand48(getpid()); |
| ai->step_id.job_id = MIN_NOALLOC_JOBID + |
| ((uint32_t) lrand48() % |
| (MAX_NOALLOC_JOBID - MIN_NOALLOC_JOBID + 1)); |
| ai->step_id.step_id = (uint32_t) (lrand48()); |
| ai->step_id.step_het_comp = NO_VAL; |
| ai->nodelist = opt_local->nodelist; |
| ai->nnodes = hostlist_count(hl); |
| ai->uid = getuid(); |
| ai->user_name = uid_to_string_or_null(ai->uid); |
| ai->gid = getgid(); |
| ai->group_name = gid_to_string_or_null(ai->gid); |
| |
| hostlist_destroy(hl); |
| |
| cpn[0] = ROUNDUP(opt_local->ntasks, ai->nnodes); |
| ai->cpus_per_node = cpn; |
| cpu_count_reps[0] = ai->nnodes; |
| ai->cpu_count_reps = cpu_count_reps; |
| ai->num_cpu_groups = 1; |
| |
| /* |
| * Create job, then fill in host addresses |
| */ |
| job = _job_create_structure(ai, opt_local); |
| |
| if (job != NULL) |
| job_update_io_fnames(job, opt_local); |
| if (job && (job->ntasks == NO_VAL)) { |
| job->ntasks = ai->nnodes; |
| job->cpu_count = opt_local->cpus_per_task * job->ntasks; |
| } |
| |
| error: |
| xfree(ai); |
| return (job); |
| |
| } |
| |
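| /* |
| * If the node count was not explicitly set, set opt_local->min_nodes |
| * to the minimum number of nodes needed to run the requested tasks |
| * within the allocation described by resp. |
| */ |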
| static void _set_min_node_count(allocation_info_t *ai, |
| resource_allocation_response_msg_t *resp, |
| slurm_opt_t *opt_local) |
| { |
| int num_tasks; |
| |
| if (opt_local->nodes_set) |
| return; |
| |
| opt_local->nodes_set = true; |
| |
| if (!local_het_step) { |
| /* |
| * Don't set the number of nodes equal to the number of |
| * requested processes unless we know it is less than the |
| * number of nodes in the allocation. |
| */ |
| if (opt_local->ntasks_set && |
| (opt_local->ntasks < ai->nnodes)) |
| opt_local->min_nodes = opt_local->ntasks; |
| else |
| opt_local->min_nodes = ai->nnodes; |
| return; |
| } |
| |
| /* |
| * Figure out the minimum number of nodes needed to fit this step |
| * into the allocation. |
| */ |
| num_tasks = 0; |
| opt_local->min_nodes = 0; |
| for (int i = 0; ((i < resp->num_cpu_groups) && |
| (opt_local->min_nodes < resp->node_cnt)); |
| i++) { |
| for (int j = 0; j < resp->cpu_count_reps[i]; j++) { |
| /* |
| * Given this node, figure out how many tasks could fit |
| * on it. |
| */ |
| int ntasks_per_node = resp->cpus_per_node[i]; |
| |
| if (opt_local->cpus_per_task) |
| ntasks_per_node /= |
| opt_local->cpus_per_task; |
| |
| if ((opt_local->ntasks_per_node != NO_VAL) && |
| (ntasks_per_node >= opt_local->ntasks_per_node)) |
| ntasks_per_node = opt_local->ntasks_per_node; |
| |
| /* Then add it to the total task count */ |
| num_tasks += ntasks_per_node; |
| |
| opt_local->min_nodes++; |
| if (num_tasks >= opt_local->ntasks) |
| return; |
| } |
| } |
| } |
| |
| /* |
| * Create an srun job structure for a step w/out an allocation response msg. |
| * (i.e. inside an allocation) |
| */ |
| extern srun_job_t *job_step_create_allocation( |
| resource_allocation_response_msg_t *resp, |
| slurm_opt_t *opt_local) |
| { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| uint32_t job_id = resp->job_id; |
| srun_job_t *job = NULL; |
| allocation_info_t *ai = xmalloc(sizeof(allocation_info_t)); |
| hostlist_t *hl = NULL; |
| char *buf = NULL; |
| int count = 0; |
| uint32_t alloc_count = 0; |
| char *step_nodelist = NULL; |
| xassert(srun_opt); |
| |
| ai->step_id.job_id = job_id; |
| ai->step_id.step_id = NO_VAL; |
| ai->step_id.step_het_comp = NO_VAL; |
| if (srun_opt->alloc_nodelist) |
| ai->nodelist = xstrdup(srun_opt->alloc_nodelist); |
| else |
| ai->nodelist = xstrdup(resp->node_list); |
| |
| hl = hostlist_create(ai->nodelist); |
| hostlist_uniq(hl); |
| alloc_count = hostlist_count(hl); |
| ai->nnodes = alloc_count; |
| hostlist_destroy(hl); |
| |
| /* exclude is handled elsewhere for het steps */ |
| if (!local_het_step && opt_local->exclude) { |
| hostlist_t *exc_hl = hostlist_create(opt_local->exclude); |
| hostlist_t *inc_hl = NULL; |
| char *node_name = NULL; |
| |
| hl = hostlist_create(ai->nodelist); |
| if (opt_local->nodelist) |
| inc_hl = hostlist_create(opt_local->nodelist); |
| hostlist_uniq(hl); |
| //info("using %s or %s", opt_local->nodelist, ai->nodelist); |
| while ((node_name = hostlist_shift(exc_hl))) { |
| int inx = hostlist_find(hl, node_name); |
| if (inx >= 0) { |
| debug("excluding node %s", node_name); |
| hostlist_delete_nth(hl, inx); |
| ai->nnodes--; /* decrement node count */ |
| } |
| if (inc_hl) { |
| inx = hostlist_find(inc_hl, node_name); |
| if (inx >= 0) { |
| error("Requested node %s is also " |
| "in the excluded list.", |
| node_name); |
| error("Job not submitted."); |
| hostlist_destroy(exc_hl); |
| hostlist_destroy(inc_hl); |
| goto error; |
| } |
| } |
| free(node_name); |
| } |
| hostlist_destroy(exc_hl); |
| |
| /* We need to set the node counts here so that, if more nodes are |
| * available than were requested, we can correct them. If there is |
| * no exclude list, these variables are set in the else branch below. |
| */ |
| if (!opt_local->nodes_set) { |
| /* Don't set the number of nodes equal to the number of |
| * requested processes unless we know it is less than the |
| * number of nodes in the allocation. |
| */ |
| if (opt_local->ntasks_set && |
| (opt_local->ntasks < ai->nnodes)) |
| opt_local->min_nodes = opt_local->ntasks; |
| else |
| opt_local->min_nodes = ai->nnodes; |
| opt_local->nodes_set = true; |
| } |
| if (!opt_local->max_nodes) |
| opt_local->max_nodes = opt_local->min_nodes; |
| if ((opt_local->max_nodes > 0) && |
| (opt_local->max_nodes < ai->nnodes)) |
| ai->nnodes = opt_local->max_nodes; |
| |
| count = hostlist_count(hl); |
| if (!count) { |
| error("Hostlist is empty! Can't run job."); |
| hostlist_destroy(hl); |
| goto error; |
| } |
| if (inc_hl) { |
| count = hostlist_count(inc_hl); |
| if (count < ai->nnodes) { |
| /* add more nodes to get correct number for |
| allocation */ |
| hostlist_t *tmp_hl = hostlist_copy(hl); |
| int i = 0; |
| int diff = ai->nnodes - count; |
| buf = hostlist_ranged_string_xmalloc(inc_hl); |
| hostlist_delete(tmp_hl, buf); |
| xfree(buf); |
| while ((i < diff) && |
| (node_name = hostlist_shift(tmp_hl))) { |
| hostlist_push_host(inc_hl, node_name); |
| free(node_name); |
| i++; |
| } |
| hostlist_destroy(tmp_hl); |
| } |
| buf = hostlist_ranged_string_xmalloc(inc_hl); |
| hostlist_destroy(inc_hl); |
| xfree(opt_local->nodelist); |
| opt_local->nodelist = buf; |
| } else { |
| /* remove excess nodes beyond what is needed for the allocation */ |
| for (int i = count; i > ai->nnodes; i--) { |
| hostlist_delete_nth(hl, i - 1); |
| } |
| xfree(opt_local->nodelist); |
| opt_local->nodelist = hostlist_ranged_string_xmalloc(hl); |
| } |
| |
| hostlist_destroy(hl); |
| } else { |
| _set_min_node_count(ai, resp, opt_local); |
| |
| if (!opt_local->max_nodes) |
| opt_local->max_nodes = opt_local->min_nodes; |
| if ((opt_local->max_nodes > 0) && |
| (opt_local->max_nodes < ai->nnodes)) |
| ai->nnodes = opt_local->max_nodes; |
| /* Don't reset ai->nodelist because that is the nodelist we |
| * want to say the allocation is under; opt_local->nodelist is |
| * what is used for the allocation. |
| */ |
| /* xfree(ai->nodelist); */ |
| /* ai->nodelist = xstrdup(buf); */ |
| } |
| |
| /* get the correct number of hosts to run tasks on */ |
| if (opt_local->nodelist) |
| step_nodelist = opt_local->nodelist; |
| else if (((opt_local->distribution & SLURM_DIST_STATE_BASE) == |
| SLURM_DIST_ARBITRARY) && (count == 0)) |
| step_nodelist = getenv("SLURM_ARBITRARY_NODELIST"); |
| if (step_nodelist) { |
| hl = hostlist_create(step_nodelist); |
| if ((opt_local->distribution & SLURM_DIST_STATE_BASE) != |
| SLURM_DIST_ARBITRARY) |
| hostlist_uniq(hl); |
| if (!hostlist_count(hl)) { |
| error("Hostlist is empty! Can not run job."); |
| hostlist_destroy(hl); |
| goto error; |
| } |
| |
| buf = hostlist_ranged_string_xmalloc(hl); |
| count = hostlist_count(hl); |
| hostlist_destroy(hl); |
| /* |
| * Don't reset ai->nodelist because that is the nodelist we |
| * want to say the allocation is under; opt_local->nodelist is |
| * what is used for the allocation. |
| */ |
| /* xfree(ai->nodelist); */ |
| /* ai->nodelist = xstrdup(buf); */ |
| xfree(opt_local->nodelist); |
| opt_local->nodelist = buf; |
| } |
| |
| if (((opt_local->distribution & SLURM_DIST_STATE_BASE) == |
| SLURM_DIST_ARBITRARY) && (count != opt_local->ntasks)) { |
| error("You asked for %d tasks but hostlist specified %d nodes", |
| opt_local->ntasks, count); |
| goto error; |
| } |
| |
| if (ai->nnodes == 0) { |
| error("No nodes in allocation, can't run job"); |
| goto error; |
| } |
| |
| ai->num_cpu_groups = resp->num_cpu_groups; |
| ai->cpus_per_node = resp->cpus_per_node; |
| ai->cpu_count_reps = resp->cpu_count_reps; |
| ai->ntasks_per_board = resp->ntasks_per_board; |
| |
| /* Here let the srun options override the allocation resp */ |
| ai->ntasks_per_core = (opt_local->ntasks_per_core != NO_VAL) ? |
| opt_local->ntasks_per_core : resp->ntasks_per_core; |
| ai->ntasks_per_socket = (opt_local->ntasks_per_socket != NO_VAL) ? |
| opt_local->ntasks_per_socket : resp->ntasks_per_socket; |
| |
| ai->partition = resp->partition; |
| |
| /* info("looking for %d nodes out of %s with a must list of %s", */ |
| /* ai->nnodes, ai->nodelist, opt_local->nodelist); */ |
| /* |
| * Create job |
| */ |
| job = _job_create_structure(ai, opt_local); |
| error: |
| xfree(ai->nodelist); |
| xfree(ai); |
| return (job); |
| |
| } |
| |
| /* |
| * Create an srun job structure from a resource allocation response msg |
| */ |
| extern srun_job_t *job_create_allocation( |
| resource_allocation_response_msg_t *resp, |
| slurm_opt_t *opt_local) |
| { |
| srun_job_t *job; |
| allocation_info_t *i = xmalloc(sizeof(allocation_info_t)); |
| |
| i->nodelist = _normalize_hostlist(resp->node_list); |
| i->nnodes = resp->node_cnt; |
| i->partition = resp->partition; |
| i->step_id.job_id = resp->job_id; |
| i->step_id.step_id = NO_VAL; |
| i->step_id.step_het_comp = NO_VAL; |
| i->num_cpu_groups = resp->num_cpu_groups; |
| i->cpus_per_node = resp->cpus_per_node; |
| i->cpu_count_reps = resp->cpu_count_reps; |
| i->ntasks_per_board = resp->ntasks_per_board; |
| i->ntasks_per_core = resp->ntasks_per_core; |
| i->ntasks_per_socket = resp->ntasks_per_socket; |
| |
| i->uid = resp->uid; |
| i->user_name = xstrdup(resp->user_name); |
| i->gid = resp->gid; |
| i->group_name = xstrdup(resp->group_name); |
| |
| job = _job_create_structure(i, opt_local); |
| if (job) { |
| job->account = xstrdup(resp->account); |
| job->qos = xstrdup(resp->qos); |
| job->resv_name = xstrdup(resp->resv_name); |
| } |
| |
| xfree(i->nodelist); |
| xfree(i); |
| |
| return (job); |
| } |
| |
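| /* |
| * Copy the command (argc/argv) from opt_master into every option |
| * structure queued on missing_argc_list (components given without |
| * their own command), then remove them from the list. |
| */ |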
| static void _copy_args(list_t *missing_argc_list, slurm_opt_t *opt_master) |
| { |
| list_itr_t *iter; |
| slurm_opt_t *opt_local; |
| int i; |
| |
| iter = list_iterator_create(missing_argc_list); |
| while ((opt_local = list_next(iter))) { |
| opt_local->argc = opt_master->argc; |
| opt_local->argv = xcalloc(sizeof(char *), |
| (opt_local->argc + 1)); |
| for (i = 0; i < opt_local->argc; i++) |
| opt_local->argv[i] = xstrdup(opt_master->argv[i]); |
| list_remove(iter); |
| } |
| list_iterator_destroy(iter); |
| } |
| |
| /* |
| * Build "het_group" string. If set on execute line, it may need to be |
| * rebuilt for multiple option structures ("--het-group=1,2" becomes two |
| * opt structures). Clear "het_grp_bits" if this is determined not to be a hetjob. |
| */ |
| static void _het_grp_test(list_t *opt_list) |
| { |
| list_itr_t *iter; |
| int het_job_offset; |
| bitstr_t *master_map = NULL; |
| list_t *missing_argv_list = NULL; |
| bool multi_comp = false, multi_prog = false; |
| |
| if (opt_list) { |
| slurm_opt_t *opt_local; |
| missing_argv_list = list_create(NULL); |
| iter = list_iterator_create(opt_list); |
| while ((opt_local = list_next(iter))) { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| xassert(srun_opt); |
| if (opt_local->argc == 0) |
| list_append(missing_argv_list, opt_local); |
| else |
| _copy_args(missing_argv_list, opt_local); |
| xfree(srun_opt->het_group); |
| if (srun_opt->het_grp_bits && |
| ((het_job_offset = |
| bit_ffs(srun_opt->het_grp_bits)) >= 0)) { |
| xstrfmtcat(srun_opt->het_group, "%d", |
| het_job_offset); |
| } |
| if (!srun_opt->het_grp_bits) { |
| error("%s: het_grp_bits is NULL", __func__); |
| } else if (!master_map) { |
| master_map |
| = bit_copy(srun_opt->het_grp_bits); |
| } else { |
| if (bit_overlap_any(master_map, |
| srun_opt->het_grp_bits)) { |
| fatal("Duplicate het groups in single srun not supported"); |
| } |
| bit_or(master_map, srun_opt->het_grp_bits); |
| } |
| if (srun_opt->multi_prog) |
| multi_prog = true; |
| } |
| if (master_map && (bit_set_count(master_map) > 1)) |
| multi_comp = true; |
| FREE_NULL_BITMAP(master_map); |
| list_iterator_destroy(iter); |
| FREE_NULL_LIST(missing_argv_list); |
| } else if (!sropt.het_group && !getenv("SLURM_HET_SIZE")) { |
| FREE_NULL_BITMAP(sropt.het_grp_bits); |
| /* het_group is already NULL */ |
| } else if (!sropt.het_group && sropt.het_grp_bits) { |
| if ((het_job_offset = bit_ffs(sropt.het_grp_bits)) < 0) |
| het_job_offset = 0; |
| else if (bit_set_count(sropt.het_grp_bits) > 1) |
| multi_comp = true; |
| if (sropt.multi_prog) |
| multi_prog = true; |
| xstrfmtcat(sropt.het_group, "%d", het_job_offset); |
| } |
| |
| if (multi_comp && multi_prog) |
| fatal("--multi-prog option not supported with multiple het groups"); |
| } |
| |
| /* |
| * Copy job name from last component to all hetjob components unless |
| * explicitly set. |
| */ |
| static void _match_job_name(list_t *opt_list) |
| { |
| int cnt; |
| list_itr_t *iter; |
| slurm_opt_t *opt_local; |
| |
| if (!opt_list) |
| return; |
| |
| cnt = list_count(opt_list); |
| if (cnt < 2) |
| return; |
| |
| iter = list_iterator_create(opt_list); |
| while ((opt_local = list_next(iter))) { |
| if (!opt_local->job_name) |
| opt_local->job_name = xstrdup(opt.job_name); |
| if (opt_local->open_mode == 0) { |
| opt_local->open_mode = OPEN_MODE_APPEND; |
| } |
| } |
| list_iterator_destroy(iter); |
| } |
| |
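| /* List sort callback: order option structures by the lowest het group |
| * offset set in their het_grp_bits */ |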
| static int _sort_by_offset(void *x, void *y) |
| { |
| slurm_opt_t *opt_local1 = *(slurm_opt_t **) x; |
| slurm_opt_t *opt_local2 = *(slurm_opt_t **) y; |
| int offset1 = -1, offset2 = -1; |
| |
| if (opt_local1->srun_opt->het_grp_bits) |
| offset1 = bit_ffs(opt_local1->srun_opt->het_grp_bits); |
| if (opt_local2->srun_opt->het_grp_bits) |
| offset2 = bit_ffs(opt_local2->srun_opt->het_grp_bits); |
| if (offset1 < offset2) |
| return -1; |
| if (offset1 > offset2) |
| return 1; |
| return 0; |
| } |
| |
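| /* Final option post-processing: validate het group settings, propagate |
| * the job name, and sort components by het group offset */ |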
| static void _post_opts(list_t *opt_list) |
| { |
| _het_grp_test(opt_list); |
| _match_job_name(opt_list); |
| if (opt_list) |
| list_sort(opt_list, _sort_by_offset); |
| } |
| |
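| /* |
| * Initialize srun: block signals, initialize the SPANK plugin stack, |
| * parse the command line (splitting hetjob components at ":"), and |
| * set up logging plus assorted environment variables. |
| */ |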
| extern void init_srun(int argc, char **argv, log_options_t *logopt, |
| bool handle_signals) |
| { |
| bool het_job_fini = false; |
| int i, het_job_argc, het_job_argc_off; |
| char **het_job_argv; |
| |
| /* |
| * This must happen before we spawn any threads |
| * which are not designed to handle arbitrary signals |
| */ |
| if (handle_signals) { |
| if (xsignal_block(sig_array) < 0) |
| error("Unable to block signals"); |
| } |
| xsignal_block(pty_sigarray); |
| |
| /* |
| * Initialize plugin stack, read options from plugins, etc. |
| */ |
| init_spank_env(); |
| if (spank_init(NULL)) { |
| error("Plug-in initialization failed"); |
| exit(error_exit); |
| } |
| |
| /* |
| * Be sure to call spank_fini when srun exits. |
| */ |
| if (atexit(_call_spank_fini) < 0) |
| error("Failed to register atexit handler for plugins: %m"); |
| |
| opt.submit_line = slurm_option_get_argv_str(argc, argv); |
| |
| het_job_argc = argc; |
| het_job_argv = argv; |
| while (!het_job_fini) { |
| het_job_argc_off = -1; |
| if (initialize_and_process_args(het_job_argc, het_job_argv, |
| &het_job_argc_off) < 0) { |
| error("srun parameter parsing"); |
| exit(1); |
| } |
| if ((het_job_argc_off >= 0) && |
| (het_job_argc_off < het_job_argc)) { |
| for (i = het_job_argc_off; i < het_job_argc; i++) { |
| if (!xstrcmp(het_job_argv[i], ":")) { |
| het_job_argc_off = i; |
| break; |
| } |
| } |
| } |
| if ((het_job_argc_off >= 0) && |
| (het_job_argc_off < het_job_argc) && |
| !xstrcmp(het_job_argv[het_job_argc_off], ":")) { |
| /* |
| * move het_job_argv[0] from "srun" to ":" |
| */ |
| het_job_argc -= het_job_argc_off; |
| het_job_argv += het_job_argc_off; |
| colon_cnt++; |
| } else { |
| het_job_fini = true; |
| } |
| } |
| |
| if (!mpi_g_client_init(&sropt.mpi_type)) { |
| error("Invalid MPI type '%s', --mpi=list for acceptable types", |
| sropt.mpi_type); |
| exit(error_exit); |
| } |
| |
| _post_opts(opt_list); |
| |
| /* |
| * reinit log with new verbosity (if changed by command line) |
| */ |
| if (logopt && (opt.verbose || opt.quiet)) { |
| /* |
| * If the log level is already increased, only increment the |
| * level by the difference between opt.verbose and LOG_LEVEL_INFO |
| */ |
| if ((opt.verbose -= (logopt->stderr_level - LOG_LEVEL_INFO)) > 0) |
| logopt->stderr_level += opt.verbose; |
| logopt->stderr_level -= opt.quiet; |
| logopt->prefix_level = 1; |
| log_alter(*logopt, 0, NULL); |
| } |
| |
| (void) _set_rlimit_env(); |
| set_prio_process_env(); |
| (void) _set_umask_env(); |
| _set_submit_dir_env(); |
| |
| /* |
| * save process startup time to be used with -I<timeout> |
| */ |
| srun_begin_time = time(NULL); |
| } |
| |
| /* |
| * Modify options for a job step (after job allocation is complete). |
| */ |
| static void _set_step_opts(slurm_opt_t *opt_local, |
| resource_allocation_response_msg_t *resp) |
| { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| int new_cpt; |
| |
| xassert(srun_opt); |
| |
| opt_local->time_limit = NO_VAL;/* not applicable for step, only job */ |
| xfree(opt_local->constraint); /* not applicable for this step */ |
| if ((srun_opt->core_spec_set || srun_opt->exclusive) |
| && opt_local->cpus_set) { |
| /* Step gets the specified CPU count, which may be only part |
| * of the job allocation. */ |
| srun_opt->exclusive = true; |
| } else { |
| /* Step gets all CPUs in the job allocation. */ |
| srun_opt->exclusive = false; |
| } |
| |
| new_cpt = slurm_opt_get_tres_per_task_cpu_cnt(resp->tres_per_task); |
| if (new_cpt) |
| opt_local->cpus_per_task = new_cpt; |
| |
| if (resp->tres_per_task) { |
| xfree(opt_local->tres_per_task); |
| SWAP(opt_local->tres_per_task, resp->tres_per_task); |
| } |
| } |
| |
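| /* |
| * For a het step, merge the nodes used by previous step components |
| * (exclude_hl_in) with any user-specified exclusions, then verify |
| * that enough nodes remain in the allocation and that no explicitly |
| * requested node has been excluded. |
| */ |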
| static int _handle_het_step_exclude(srun_job_t *job, slurm_opt_t *opt_local, |
| hostlist_t *exclude_hl_in) |
| { |
| hostlist_t *exclude_hl, *allocation_hl; |
| int rc = SLURM_SUCCESS; |
| |
| if (!exclude_hl_in || !hostlist_count(exclude_hl_in)) |
| return rc; |
| |
| allocation_hl = hostlist_create(job->nodelist); |
| hostlist_uniq(allocation_hl); |
| |
| exclude_hl = hostlist_copy(exclude_hl_in); |
| hostlist_push(exclude_hl, opt_local->exclude); |
| hostlist_uniq(exclude_hl); |
| hostlist_sort(exclude_hl); |
| |
| xfree(opt_local->exclude); |
| opt_local->exclude = hostlist_ranged_string_xmalloc(exclude_hl); |
| |
| if ((hostlist_count(allocation_hl) - hostlist_count(exclude_hl)) < |
| opt_local->min_nodes) { |
| error("Allocation failure of %d nodes: job size of %d, already allocated %d nodes to previous components.", |
| opt_local->min_nodes, hostlist_count(allocation_hl), |
| hostlist_count(exclude_hl)); |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| |
| if (opt_local->nodelist) { |
| char *node_name = NULL; |
| hostlist_t *inc_hl = hostlist_create(opt_local->nodelist); |
| while ((node_name = hostlist_shift(exclude_hl))) { |
| if (hostlist_find(inc_hl, node_name) >= 0) { |
| error("Requested nodelist %s overlaps with excluded %s.", |
| opt_local->nodelist, |
| opt_local->exclude); |
| error("Job not submitted."); |
| rc = SLURM_ERROR; |
| break; |
| } |
| free(node_name); |
| } |
| FREE_NULL_HOSTLIST(inc_hl); |
| } |
| end_it: |
| FREE_NULL_HOSTLIST(allocation_hl); |
| FREE_NULL_HOSTLIST(exclude_hl); |
| |
| return rc; |
| } |
| |
| /* |
| * Create the job step(s). For a heterogeneous job, each step is requested in |
| * a separate RPC. create_job_step() references "opt", so we need to match up |
| * the job allocation request with its requested options. |
| */ |
| static int _create_job_step(srun_job_t *job, bool use_all_cpus, |
| list_t *srun_job_list, uint32_t het_job_id, |
| char *het_job_nodelist) |
| { |
| list_itr_t *opt_iter = NULL, *job_iter; |
| slurm_opt_t *opt_local = &opt; |
| uint32_t node_offset = 0, het_job_nnodes = 0, step_id = NO_VAL; |
| uint32_t het_job_ntasks = 0, task_offset = 0; |
| bool update_het_nnodes = false; |
| uint32_t updated_het_nnodes; |
| uint32_t updated_het_ntasks = 0; |
| |
| job_step_create_response_msg_t *step_resp; |
| char *resv_ports = NULL; |
| int rc = 0; |
| |
| if (srun_job_list) { |
| hostlist_t *exclude_hl = NULL; |
| |
| if (local_het_step) |
| exclude_hl = hostlist_create(NULL); |
| |
| if (opt_list) |
| opt_iter = list_iterator_create(opt_list); |
| job_iter = list_iterator_create(srun_job_list); |
| while ((job = list_next(job_iter))) { |
| if (het_job_id) |
| job->het_job_id = het_job_id; |
| job->step_id.step_id = NO_VAL; |
| |
| /* |
| * Only set the step_het_comp if we are in a het step |
| * from a single allocation |
| */ |
| if (local_het_step) |
| job->step_id.step_het_comp = |
| job->het_job_offset; |
| else |
| job->step_id.step_het_comp = NO_VAL; |
| |
| het_job_nnodes += job->nhosts; |
| if (job->ntasks == NO_VAL) |
| het_job_ntasks = NO_VAL; |
| else if (het_job_ntasks != NO_VAL) |
| het_job_ntasks += job->ntasks; |
| } |
| |
| updated_het_nnodes = het_job_nnodes; |
| list_iterator_reset(job_iter); |
| while ((job = list_next(job_iter))) { |
| uint32_t old_nhosts = job->nhosts; |
| if (opt_list) |
| opt_local = list_next(opt_iter); |
| if (!opt_local) |
| fatal("%s: opt_list too short", __func__); |
| job->het_job_node_offset = node_offset; |
| job->het_job_nnodes = het_job_nnodes; |
| job->het_job_ntasks = het_job_ntasks; |
| job->het_job_task_offset = task_offset; |
| if (step_id != NO_VAL) |
| job->step_id.step_id = step_id; |
| |
| if ((rc = _handle_het_step_exclude( |
| job, opt_local, exclude_hl)) != |
| SLURM_SUCCESS) |
| break; |
| |
| rc = create_job_step(job, use_all_cpus, opt_local); |
| if (rc < 0) |
| break; |
| if (step_id == NO_VAL) |
| step_id = job->step_id.step_id; |
| if (exclude_hl) { |
| slurm_step_layout_t *step_layout = |
| launch_common_get_slurm_step_layout( |
| job); |
| hostlist_push(exclude_hl, |
| step_layout->node_list); |
| } |
| step_resp = job->step_ctx->step_resp; |
| if (step_resp && step_resp->resv_ports && |
| strcmp(step_resp->resv_ports, "(null)")) { |
| if (resv_ports) |
| xstrcat(resv_ports, ","); |
| xstrcat(resv_ports, step_resp->resv_ports); |
| } |
| node_offset += job->nhosts; |
| task_offset += job->ntasks; |
| |
| /* |
| * If packing nodes (SELECT_PACK_NODES, -mpack), the step |
| * may have an updated layout. Need to update each |
| * component's het_job_nnodes with the updated counts. |
| */ |
| if (job->nhosts < old_nhosts) { |
| update_het_nnodes = true; |
| updated_het_nnodes -= old_nhosts - job->nhosts; |
| } |
| |
| if (het_job_ntasks == NO_VAL) |
| updated_het_ntasks += job->ntasks; |
| } |
| |
| if (update_het_nnodes) { |
| list_iterator_reset(job_iter); |
| while ((job = list_next(job_iter))) { |
| job->het_job_nnodes = updated_het_nnodes; |
| } |
| } |
| if (updated_het_ntasks) { |
| list_iterator_reset(job_iter); |
| while ((job = list_next(job_iter))) { |
| job->het_job_ntasks = updated_het_ntasks; |
| } |
| } |
| |
| FREE_NULL_HOSTLIST(exclude_hl); |
| |
| if (!rc && resv_ports) { |
| /* |
| * Merge numeric values into single range |
| * (e.g. "10-12,13-15,16-18" -> "10-18") |
| */ |
| hostset_t *hs; |
| char *tmp = NULL, *sep; |
| xstrfmtcat(tmp, "[%s]", resv_ports); |
| hs = hostset_create(tmp); |
| hostset_ranged_string(hs, strlen(tmp) + 1, tmp); |
| sep = strchr(tmp, ']'); |
| if (sep) |
| sep[0] = '\0'; |
| xfree(resv_ports); |
| resv_ports = xstrdup(tmp + 1); |
| xfree(tmp); |
| hostset_destroy(hs); |
| |
| list_iterator_reset(job_iter); |
| while ((job = list_next(job_iter))) { |
| if (!job->step_ctx->step_resp) |
| continue; |
| xfree(job->step_ctx->step_resp->resv_ports); |
| job->step_ctx->step_resp->resv_ports = |
| xstrdup(resv_ports); |
| } |
| } |
| xfree(resv_ports); |
| list_iterator_destroy(job_iter); |
| if (opt_iter) |
| list_iterator_destroy(opt_iter); |
| return rc; |
| } else if (job) { |
| if (het_job_id) { |
| job->het_job_id = het_job_id; |
| job->het_job_nnodes = job->nhosts; |
| job->het_job_ntasks = job->ntasks; |
| job->het_job_task_offset = 0; |
| } |
| if ((rc = create_job_step(job, use_all_cpus, &opt)) < 0) |
| return rc; |
| |
| if (het_job_id) { |
| /* |
| * If packing nodes (SELECT_PACK_NODES, -mpack), the step |
| * may have an updated layout. |
| */ |
| job->het_job_nnodes = job->nhosts; |
| /* The stepmgr logic can modify ntasks */ |
| job->het_job_ntasks = job->ntasks; |
| } |
| |
| return rc; |
| } else { |
| return -1; |
| } |
| } |
| |
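| /* |
| * Send REQUEST_STEP_COMPLETE for every step already created in |
| * srun_job_list so they are cleaned up if a later component fails. |
| */ |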
| static void _cancel_steps(list_t *srun_job_list) |
| { |
| srun_job_t *job; |
| list_itr_t *job_iter; |
| slurm_msg_t req; |
| step_complete_msg_t msg; |
| int rc = 0; |
| |
| if (!srun_job_list) |
| return; |
| |
| slurm_msg_t_init(&req); |
| req.msg_type = REQUEST_STEP_COMPLETE; |
| req.data = &msg; |
| memset(&msg, 0, sizeof(step_complete_msg_t)); |
| msg.step_rc = 0; |
| |
| job_iter = list_iterator_create(srun_job_list); |
| while ((job = list_next(job_iter))) { |
| if (job->step_id.step_id == NO_VAL) |
| continue; |
| memcpy(&msg.step_id, &job->step_id, sizeof(msg.step_id)); |
| msg.range_first = 0; |
| msg.range_last = job->nhosts - 1; |
| (void) slurm_send_recv_controller_rc_msg(&req, &rc, |
| working_cluster_rec); |
| } |
| list_iterator_destroy(job_iter); |
| } |
| |
| static void _het_job_struct_del(void *x) |
| { |
| het_job_resp_struct_t *het_job_resp = (het_job_resp_struct_t *) x; |
| |
| xfree(het_job_resp->cpu_cnt); |
| if (het_job_resp->host_list) |
| hostlist_destroy(het_job_resp->host_list); |
| xfree(het_job_resp); |
| } |
| |
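| /* |
| * Merge the node lists of all hetjob components into a single ranged |
| * nodelist and set SLURM_JOB_CPUS_PER_NODE for the combined set. |
| * Returns an xmalloc'd string which the caller must xfree(). |
| */ |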
| static char *_compress_het_job_nodelist(list_t *used_resp_list) |
| { |
| resource_allocation_response_msg_t *resp; |
| het_job_resp_struct_t *het_job_resp; |
| list_t *het_job_resp_list; |
| list_itr_t *resp_iter; |
| char *tmp; |
| char *het_job_nodelist = NULL, *node_name; |
| hostset_t *hs; |
| int cnt, i, j, k; |
| uint16_t *cpus; |
| uint32_t *reps, cpu_inx; |
| |
| if (!used_resp_list) |
| return het_job_nodelist; |
| |
| cnt = list_count(used_resp_list); |
| het_job_resp_list = list_create(_het_job_struct_del); |
| hs = hostset_create(""); |
| resp_iter = list_iterator_create(used_resp_list); |
| while ((resp = list_next(resp_iter))) { |
| if (!resp->node_list) |
| continue; |
| hostset_insert(hs, resp->node_list); |
| het_job_resp = xmalloc(sizeof(het_job_resp_struct_t)); |
| het_job_resp->node_cnt = resp->node_cnt; |
| het_job_resp->cpu_cnt = |
| xmalloc(sizeof(uint16_t) * resp->node_cnt); |
| het_job_resp->host_list = hostlist_create(resp->node_list); |
| for (i = 0, k = 0; |
| (i < resp->num_cpu_groups) && (k < resp->node_cnt); i++) { |
| for (j = 0; j < resp->cpu_count_reps[i]; j++) { |
| het_job_resp->cpu_cnt[k++] = |
| resp->cpus_per_node[i]; |
| if (k >= resp->node_cnt) |
| break; |
| } |
| if (k >= resp->node_cnt) |
| break; |
| } |
| list_append(het_job_resp_list, het_job_resp); |
| } |
| list_iterator_destroy(resp_iter); |
| |
| het_job_nodelist = hostset_ranged_string_xmalloc(hs); |
| |
| cpu_inx = 0; |
| cnt = hostset_count(hs); |
| cpus = xmalloc(sizeof(uint16_t) * (cnt + 1)); |
| reps = xmalloc(sizeof(uint32_t) * (cnt + 1)); |
| for (i = 0; i < cnt; i++) { |
| node_name = hostset_nth(hs, i); |
| resp_iter = list_iterator_create(het_job_resp_list); |
| while ((het_job_resp = list_next(resp_iter))) { |
| j = hostlist_find(het_job_resp->host_list, node_name); |
| if ((j == -1) || !het_job_resp->cpu_cnt) |
| continue; /* node not in this hetjob */ |
| if (cpus[cpu_inx] == het_job_resp->cpu_cnt[j]) { |
| reps[cpu_inx]++; |
| } else { |
| if (cpus[cpu_inx] != 0) |
| cpu_inx++; |
| cpus[cpu_inx] = het_job_resp->cpu_cnt[j]; |
| reps[cpu_inx]++; |
| } |
| break; |
| } |
| list_iterator_destroy(resp_iter); |
| free(node_name); |
| } |
| |
| cpu_inx++; |
| tmp = uint32_compressed_to_str(cpu_inx, cpus, reps); |
| if (setenv("SLURM_JOB_CPUS_PER_NODE", tmp, 1) < 0) { |
| error("%s: Unable to set SLURM_JOB_CPUS_PER_NODE in environment", |
| __func__); |
| } |
| xfree(tmp); |
| |
| xfree(reps); |
| xfree(cpus); |
| hostset_destroy(hs); |
| FREE_NULL_LIST(het_job_resp_list); |
| |
| return het_job_nodelist; |
| } |
| |
| /* |
| * Here we have a regular job allocation, but we are requesting a het step in |
| * that allocation. So here we will copy the resp_list to the number of |
| * components we care about. |
| */ |
| static void _copy_job_resp(list_t *job_resp_list, int count) |
| { |
| resource_allocation_response_msg_t *new, *orig; |
| xassert(job_resp_list); |
| xassert(list_count(job_resp_list) == 1); |
| |
| orig = list_peek(job_resp_list); |
| for (int i = 0; i < count; i++) { |
| new = slurm_copy_resource_allocation_response_msg(orig); |
| list_append(job_resp_list, new); |
| } |
| } |
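| |
| /* |
| * Warn (once) if --gpus-per-socket was requested on the command line, |
| * since it only applies at job allocation time, not to job steps. |
| */ |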
| static void _check_gpus_per_socket(slurm_opt_t *opt_local) |
| { |
| static bool checked = false; /* Only log the warning once */ |
| |
| if (!opt_local->gpus_per_socket || checked) |
| return; |
| |
| checked = true; |
| if (opt_local->gpus_per_socket && |
| !slurm_option_set_by_env(opt_local, LONG_OPT_GPUS_PER_SOCKET)) { |
| /* |
| * gpus_per_socket does not work for steps. |
| * If it is set by env, it was likely inherited from the job. |
| */ |
| warning("Ignoring --gpus-per-socket because it can only be specified at job allocation time, not during step allocation."); |
| } |
| } |
| |
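| /* |
| * Create the job and job step(s) described by the parsed options: |
| * handle --test-only and --no-allocation, build steps inside an |
| * existing allocation, or request a new (het)job allocation first. |
| * Sets *p_job to a single srun_job_t or a list of them (hetjob) and |
| * *got_alloc if an allocation was created here. |
| */ |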
| extern void create_srun_job(void **p_job, bool *got_alloc) |
| { |
| resource_allocation_response_msg_t *resp; |
| list_t *job_resp_list = NULL, *srun_job_list = NULL; |
| list_t *used_resp_list = NULL; |
| list_itr_t *opt_iter, *resp_iter; |
| srun_job_t *job = NULL; |
| int i, max_list_offset, max_het_job_offset, het_job_offset = -1, |
| het_step_offset = -1; |
| uint32_t my_job_id = 0, het_job_id = 0; |
| char *het_job_nodelist = NULL; |
| bool begin_error_logged = false; |
| bool core_spec_error_logged = false; |
| bool node_cnt_error_logged = false; |
| bool tres_license_error_logged = false; |
| bool x11_error_logged = false; |
| |
| /* |
| * now global "opt" should be filled in and available, |
| * create a job from opt |
| */ |
| if (sropt.test_only) { |
| int rc = allocate_test(); |
| if (rc) { |
| slurm_perror("allocation failure"); |
| exit (1); |
| } |
| exit (0); |
| |
| } else if (sropt.no_alloc) { |
| if (opt_list || |
| (sropt.het_grp_bits && (bit_fls(sropt.het_grp_bits) > 0))) |
| fatal("--no-allocation option not supported for heterogeneous jobs"); |
| info("do not allocate resources"); |
| job = job_create_noalloc(); |
| if (job == NULL) { |
| error("Job creation failure."); |
| exit(error_exit); |
| } |
| if (create_job_step(job, false, &opt) < 0) |
| exit(error_exit); |
| } else if ((job_resp_list = existing_allocation())) { |
| slurm_opt_t *opt_local; |
| |
| max_list_offset = 0; |
| max_het_job_offset = list_count(job_resp_list) - 1; |
| if (opt_list) { |
| opt_iter = list_iterator_create(opt_list); |
| while ((opt_local = list_next(opt_iter))) { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| xassert(srun_opt); |
| if (srun_opt->het_grp_bits) { |
| i = bit_fls(srun_opt->het_grp_bits); |
| max_list_offset = MAX(max_list_offset, |
| i); |
| } |
| } |
| list_iterator_destroy(opt_iter); |
| if (max_list_offset > max_het_job_offset) { |
| if (list_count(job_resp_list) != 1) { |
| error("Attempt to run a job step with het group value of %d, but the job allocation has maximum value of %d", |
| max_list_offset, |
| max_het_job_offset); |
| exit(1); |
| } |
| |
| /* |
| * Here we have a regular job allocation, but we |
| * are requesting a het step in that |
| * allocation. So here we will copy the |
| * resp_list to the number of components we care |
| * about. |
| */ |
| _copy_job_resp(job_resp_list, max_list_offset); |
| max_het_job_offset = max_list_offset; |
| local_het_step = true; |
| } |
| if (list_count(opt_list) > 1) |
| het_step_offset = 0; |
| } |
| srun_job_list = list_create(NULL); |
| used_resp_list = list_create(NULL); |
| if (max_het_job_offset > 0) |
| het_job_offset = 0; |
| resp_iter = list_iterator_create(job_resp_list); |
| while ((resp = list_next(resp_iter))) { |
| bool merge_nodelist = true; |
| if (my_job_id == 0) { |
| my_job_id = resp->job_id; |
| if (resp->working_cluster_rec) |
| slurm_setup_remote_working_cluster(resp); |
| } |
| _print_job_information(resp); |
| (void) get_next_opt(-2); |
| /* |
| * Check using het_job_offset here, but we use |
| * het_step_offset for the job being added. |
| */ |
| while ((opt_local = get_next_opt(het_job_offset))) { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| xassert(srun_opt); |
| |
| if (local_het_step) |
| opt_local->step_het_comp_cnt = |
| max_het_job_offset; |
| |
| if (merge_nodelist) { |
| merge_nodelist = false; |
| list_append(used_resp_list, resp); |
| } |
| if (slurm_option_set_by_env(opt_local, 'N') && |
| (opt_local->min_nodes > resp->node_cnt)) { |
| /* |
| * This signifies the job used the |
| * --no-kill option and a node went DOWN |
| * or it used a node count range |
| * specification, was checkpointed from |
| * one size and restarted at a different |
| * size |
| */ |
| if (!node_cnt_error_logged) { |
| error("SLURM_JOB_NUM_NODES environment variable conflicts with allocated node count (%u != %u).", |
| opt_local->min_nodes, |
| resp->node_cnt); |
| node_cnt_error_logged = true; |
| } |
| /* |
| * Modify options to match resource |
| * allocation. |
| * NOTE: Some options are not supported |
| */ |
| opt_local->min_nodes = resp->node_cnt; |
| xfree(srun_opt->alloc_nodelist); |
| if (!opt_local->ntasks_set) { |
| opt_local->ntasks = |
| opt_local->min_nodes; |
| } |
| } |
| _check_gpus_per_socket(opt_local); |
| if (!tres_license_error_logged && |
| !slurm_option_set_by_env( |
| opt_local, |
| LONG_OPT_TRES_PER_TASK) && |
| xstrstr(opt_local->tres_per_task, |
| "license")) { |
| warning("Ignoring --tres-per-task license specification because licenses can only be specified at job allocation time, not during step allocation."); |
| tres_license_error_logged = true; |
| } |
| if (srun_opt->core_spec_set && |
| !core_spec_error_logged) { |
| /* |
| * NOTE: Silently ignore specialized |
| * core count set with SLURM_CORE_SPEC |
| * environment variable |
| */ |
| error("Ignoring --core-spec value for a job step " |
| "within an existing job. Set specialized cores " |
| "at job allocation time."); |
| core_spec_error_logged = true; |
| } |
| |
| /* |
| * Here we send the het job groups to the |
| * slurmctld to set up the interconnect |
| * correctly. We only ever need to send it to |
| * the first component of the step. |
| * |
| * FIXME - is this still needed post-Cray? |
| */ |
| if (g_het_grp_bits) { |
| xfree(opt_local->step_het_grps); |
| opt_local->step_het_grps = |
| bit_fmt_hexmask(g_het_grp_bits); |
| } |
| |
| _set_env_vars(resp, het_step_offset); |
| if (_validate_relative(resp, opt_local)) |
| exit(error_exit); |
| if (opt_local->begin && !begin_error_logged) { |
| error("--begin is ignored because nodes are already allocated."); |
| begin_error_logged = true; |
| } |
| if (opt_local->x11 && !x11_error_logged) { |
| error("Ignoring --x11 option for a job step within an " |
| "existing job. Set x11 options at job allocation time."); |
| x11_error_logged = true; |
| } |
| job = job_step_create_allocation(resp, |
| opt_local); |
| if (!job) |
| exit(error_exit); |
| if (max_het_job_offset > 0) |
| job->het_job_offset = het_step_offset; |
| list_append(srun_job_list, job); |
| het_step_offset++; |
| } /* While more option structures */ |
| het_job_offset++; |
| } /* More hetjob components */ |
| list_iterator_destroy(resp_iter); |
| |
| max_het_job_offset = get_max_het_group(); |
| het_job_offset = list_count(job_resp_list) - 1; |
| if (max_het_job_offset > het_job_offset) { |
| error("Requested het-group offset exceeds highest hetjob index (%d > %d)", |
| max_het_job_offset, het_job_offset); |
| exit(error_exit); |
| } |
| i = list_count(srun_job_list); |
| if (i == 0) { |
| error("No directives to start application on any available hetjob components"); |
| exit(error_exit); |
| } |
| if (i == 1) |
| FREE_NULL_LIST(srun_job_list); /* Just use "job" */ |
| if (list_count(job_resp_list) > 1) { |
| /* only set if actually a hetjob */ |
| if (!local_het_step && my_job_id) |
| het_job_id = my_job_id; |
| het_job_nodelist = |
| _compress_het_job_nodelist(used_resp_list); |
| } |
| FREE_NULL_LIST(used_resp_list); |
| if (_create_job_step(job, false, srun_job_list, het_job_id, |
| het_job_nodelist) < 0) { |
| if (*got_alloc) |
| slurm_complete_job(my_job_id, 1); |
| else |
| _cancel_steps(srun_job_list); |
| exit(error_exit); |
| } |
| xfree(het_job_nodelist); |
| } else { |
| /* Combined job allocation and job step launch */ |
| if (slurm_option_set_by_cli(&opt, 'J')) |
| setenvfs("SLURM_JOB_NAME=%s", opt.job_name); |
| else if (!slurm_option_set_by_env(&opt, 'J') && opt.argc) |
| setenvfs("SLURM_JOB_NAME=%s", opt.argv[0]); |
| |
| if (opt_list) { |
| if (!colon_cnt) { |
| error("--het-group expected to be used within an HetJob allocation"); |
| exit(error_exit); |
| } |
| job_resp_list = allocate_het_job_nodes(); |
| if (!job_resp_list) |
| exit(error_exit); |
| srun_job_list = list_create(NULL); |
| opt_iter = list_iterator_create(opt_list); |
| resp_iter = list_iterator_create(job_resp_list); |
| while ((resp = list_next(resp_iter))) { |
| slurm_opt_t *opt_local; |
| |
| if (my_job_id == 0) { |
| my_job_id = resp->job_id; |
| *got_alloc = true; |
| } |
| opt_local = list_next(opt_iter); |
| if (!opt_local) |
| break; |
| _print_job_information(resp); |
| _set_env_vars(resp, ++het_job_offset); |
| _set_env_vars2(resp, het_job_offset); |
| if (_validate_relative(resp, opt_local)) { |
| slurm_complete_job(my_job_id, 1); |
| exit(error_exit); |
| } |
| job = job_create_allocation(resp, opt_local); |
| job->het_job_offset = het_job_offset; |
| list_append(srun_job_list, job); |
| _set_step_opts(opt_local, resp); |
| } |
| list_iterator_destroy(opt_iter); |
| list_iterator_destroy(resp_iter); |
| if (!local_het_step) { |
| /* Continue support for old pack terminology. */ |
| setenvfs("SLURM_PACK_SIZE=%d", |
| het_job_offset + 1); |
| setenvfs("SLURM_HET_SIZE=%d", |
| het_job_offset + 1); |
| } |
| } else { |
| if (sropt.het_grp_bits && |
| (bit_fls(sropt.het_grp_bits) != -1)) { |
| error("--het-group expected to be used within an HetJob allocation"); |
| exit(error_exit); |
| } |
| |
| if (!(resp = allocate_nodes(&opt))) |
| exit(error_exit); |
| *got_alloc = true; |
| my_job_id = resp->job_id; |
| _print_job_information(resp); |
| _set_env_vars(resp, -1); |
| if (_validate_relative(resp, &opt)) { |
| slurm_complete_job(resp->job_id, 1); |
| exit(error_exit); |
| } |
| job = job_create_allocation(resp, &opt); |
| _set_step_opts(&opt, resp); |
| } |
| if (srun_job_list && (list_count(srun_job_list) > 1) && |
| opt_list && (list_count(opt_list) > 1) && my_job_id) { |
| /* only set if actually a hetjob */ |
| if (!local_het_step) |
| het_job_id = my_job_id; |
| het_job_nodelist = |
| _compress_het_job_nodelist(job_resp_list); |
| } |
| |
| if (_create_job_step(job, true, srun_job_list, het_job_id, |
| het_job_nodelist) < 0) { |
| slurm_complete_job(my_job_id, 1); |
| exit(error_exit); |
| } |
| xfree(het_job_nodelist); |
| |
| if (opt_list) { |
| resp_iter = list_iterator_create(job_resp_list); |
| while ((resp = list_next(resp_iter))) { |
| slurm_free_resource_allocation_response_msg( |
| resp); |
| } |
| list_iterator_destroy(resp_iter); |
| } else { |
| slurm_free_resource_allocation_response_msg(resp); |
| } |
| } |
| |
| /* |
| * Spawn process to ensure clean-up of job and/or step |
| * on abnormal termination |
| */ |
| shepherd_fd = _shepherd_spawn(job, srun_job_list, *got_alloc); |
| |
| if (opt_list) |
| *p_job = (void *) srun_job_list; |
| else |
| *p_job = (void *) job; |
| |
| if (job) |
| _srun_cli_filter_post_submit(my_job_id, job->step_id.step_id); |
| } |
| |
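| /* |
| * Final per-step setup before task launch: start the signal manager |
| * thread, run the srun prolog, invoke the SPANK local user stack, and |
| * merge the current environment into the job's environment. |
| */ |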
| extern void pre_launch_srun_job(srun_job_t *job, slurm_opt_t *opt_local) |
| { |
| if (!signal_thread) |
| slurm_thread_create(&signal_thread, _srun_signal_mgr, job); |
| |
| _run_srun_prolog(job); |
| if (_call_spank_local_user(job, opt_local)) { |
| error("Failure in local plugin stack"); |
| slurm_step_launch_abort(job->step_ctx); |
| exit(error_exit); |
| } |
| |
| env_array_merge(&job->env, (const char **)environ); |
| } |
| |
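| /* |
| * Clean up after the step completes: release the allocation if we |
| * created one, notify the shepherd process, stop the signal manager |
| * thread, run the srun epilog, and convert the wait()-style status in |
| * *global_rc into an exit code. |
| */ |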
| extern void fini_srun(srun_job_t *job, bool got_alloc, uint32_t *global_rc) |
| { |
| if (got_alloc) { |
| cleanup_allocation(); |
| |
| /* Tell slurmctld that we were cancelled */ |
| if (job->state >= SRUN_JOB_CANCELLED) |
| slurm_complete_job(job->step_id.job_id, NO_VAL); |
| else |
| slurm_complete_job(job->step_id.job_id, *global_rc); |
| } |
| _shepherd_notify(shepherd_fd); |
| |
| if (signal_thread) { |
| srun_shutdown = true; |
| pthread_kill(signal_thread, SIGINT); |
| slurm_thread_join(signal_thread); |
| } |
| |
| _run_srun_epilog(job); |
| |
| step_ctx_destroy(job->step_ctx); |
| |
| if (WIFEXITED(*global_rc)) |
| *global_rc = WEXITSTATUS(*global_rc); |
| else if (WIFSIGNALED(*global_rc)) |
| *global_rc = 128 + WTERMSIG(*global_rc); |
| |
| mpir_cleanup(); |
| } |
| |
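| /* Advance the job state (states only move forward) and wake any |
| * thread waiting on state_cond */ |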
| void |
| update_job_state(srun_job_t *job, srun_job_state_t state) |
| { |
| slurm_mutex_lock(&job->state_mutex); |
| if (job->state < state) { |
| job->state = state; |
| slurm_cond_signal(&job->state_cond); |
| |
| } |
| slurm_mutex_unlock(&job->state_mutex); |
| return; |
| } |
| |
| srun_job_state_t |
| job_state(srun_job_t *job) |
| { |
| srun_job_state_t state; |
| slurm_mutex_lock(&job->state_mutex); |
| state = job->state; |
| slurm_mutex_unlock(&job->state_mutex); |
| return state; |
| } |
| |
| |
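| /* |
| * Called on repeated interrupts: first send SIGKILL directly to the |
| * tasks, then ask slurmctld to signal the step on subsequent calls. |
| */ |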
| void |
| job_force_termination(srun_job_t *job) |
| { |
| static int kill_sent = 0; |
| static time_t last_msg = 0; |
| |
| if (kill_sent == 0) { |
| info("forcing job termination"); |
| /* Send SIGKILL to tasks directly */ |
| update_job_state(job, SRUN_JOB_CANCELLED); |
| launch_g_fwd_signal(SIGKILL); |
| } else { |
| time_t now = time(NULL); |
| if (last_msg != now) { |
| info("job abort in progress"); |
| last_msg = now; |
| } |
| if (kill_sent == 1) { |
| /* Try sending SIGKILL through slurmctld */ |
| slurm_kill_job_step(job->step_id.job_id, |
| job->step_id.step_id, SIGKILL, 0); |
| } |
| } |
| kill_sent++; |
| } |
| |
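| /* |
| * If the task count was not explicitly requested, derive it from |
| * --ntasks-per-node and the node count (defaulting to one task per |
| * node). |
| */ |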
| static void _set_ntasks(allocation_info_t *ai, slurm_opt_t *opt_local) |
| { |
| int cnt = 0; |
| |
| /* Distinguish between an explicit and an implicit ntasks setting */ |
| if (opt_local->ntasks_opt_set || |
| (opt_local->ntasks_set && |
| (opt_local->ntasks_per_node == NO_VAL))) |
| return; |
| |
| if (opt_local->ntasks_per_node != NO_VAL) { |
| cnt = ai->nnodes * opt_local->ntasks_per_node; |
| opt_local->ntasks_set = true; /* implicit */ |
| } else if (opt_local->cpus_set) { |
| opt_local->ntasks = NO_VAL; |
| opt_local->ntasks_set = true; /* implicit */ |
| return; |
| } |
| |
| opt_local->ntasks = (cnt < ai->nnodes) ? ai->nnodes : cnt; |
| } |
| |
| /* |
| * Create an srun job structure from the given allocation information |
| */ |
| static srun_job_t *_job_create_structure(allocation_info_t *ainfo, |
| slurm_opt_t *opt_local) |
| { |
| srun_job_t *job = xmalloc(sizeof(srun_job_t)); |
| int i; |
| |
| _set_ntasks(ainfo, opt_local); |
| debug2("creating job with %d tasks", opt_local->ntasks); |
| |
| slurm_mutex_init(&job->state_mutex); |
| slurm_cond_init(&job->state_cond, NULL); |
| job->state = SRUN_JOB_INIT; |
| |
| job->container = xstrdup(opt_local->container); |
| job->nodelist = xstrdup(ainfo->nodelist); |
| job->partition = xstrdup(ainfo->partition); |
| memcpy(&job->step_id, &ainfo->step_id, sizeof(job->step_id)); |
| job->het_job_id = NO_VAL; |
| job->het_job_nnodes = NO_VAL; |
| job->het_job_ntasks = NO_VAL; |
| job->het_job_offset = NO_VAL; |
| job->het_job_task_offset = NO_VAL; |
| job->nhosts = ainfo->nnodes; |
| |
| if (opt_local->min_nodes > job->nhosts) { |
| error("Only allocated %d nodes asked for %d", |
| job->nhosts, opt_local->min_nodes); |
| if (opt_local->exclude) { |
| /* When resources are pre-allocated and some nodes |
| * are explicitly excluded, this error can occur. */ |
| error("Are required nodes explicitly excluded?"); |
| } |
| xfree(job); |
| return NULL; |
| } |
| if ((ainfo->cpus_per_node == NULL) || |
| (ainfo->cpu_count_reps == NULL)) { |
| error("cpus_per_node array is not set"); |
| xfree(job); |
| return NULL; |
| } |
| |
| job->ntasks = opt_local->ntasks; |
| job->ntasks_per_board = ainfo->ntasks_per_board; |
| job->ntasks_per_core = ainfo->ntasks_per_core; |
| job->ntasks_per_socket = ainfo->ntasks_per_socket; |
| |
| /* |
| * If cpus_per_task is set then get the exact count of cpus for the |
| * requested step (we might very well use less, especially if |
| * --exclusive is used). Else get the total for the allocation given. |
| */ |
| if (opt_local->cpus_set) { |
| if (opt_local->ntasks == NO_VAL) |
| job->cpu_count = NO_VAL; |
| else |
| job->cpu_count = opt_local->ntasks * |
| opt_local->cpus_per_task; |
| } else { |
| for (i = 0; i < ainfo->num_cpu_groups; i++) { |
| job->cpu_count += ainfo->cpus_per_node[i] * |
| ainfo->cpu_count_reps[i]; |
| } |
| } |
| |
| job->rc = -1; |
| |
| job_update_io_fnames(job, opt_local); |
| |
| job->uid = ainfo->uid; |
| job->user_name = xstrdup(ainfo->user_name); |
| job->gid = ainfo->gid; |
| job->group_name = xstrdup(ainfo->group_name); |
| |
| return (job); |
| } |
| |
| extern void job_update_io_fnames(srun_job_t *job, slurm_opt_t *opt_local) |
| { |
| job->ifname = fname_create(job, opt_local->ifname, opt_local->ntasks); |
| job->ofname = fname_create(job, opt_local->ofname, opt_local->ntasks); |
| job->efname = opt_local->efname ? |
| fname_create(job, opt_local->efname, opt_local->ntasks) : |
| job->ofname; |
| } |
| |
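| /* Return an xmalloc'd, collapsed (ranged) form of the given hostlist, |
| * or a copy of the original string if it cannot be parsed */ |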
| static char * |
| _normalize_hostlist(const char *hostlist) |
| { |
| char *buf = NULL; |
| hostlist_t *hl = hostlist_create(hostlist); |
| |
| if (hl) { |
| buf = hostlist_ranged_string_xmalloc(hl); |
| hostlist_destroy(hl); |
| } |
| if (!buf) |
| return xstrdup(hostlist); |
| |
| return buf; |
| } |
| |
| static int _call_spank_local_user(srun_job_t *job, slurm_opt_t *opt_local) |
| { |
| struct spank_launcher_job_info info[1]; |
| |
| info->argc = opt_local->argc; |
| info->argv = opt_local->argv; |
| info->gid = opt_local->gid; |
| info->jobid = job->step_id.job_id; |
| info->stepid = job->step_id.step_id; |
| info->step_layout = launch_common_get_slurm_step_layout(job); |
| info->uid = opt_local->uid; |
| |
| return spank_local_user(info); |
| } |
| |
| /* Return the number of microseconds between tv1 and tv2, with a |
| * maximum value of 10,000,000 to prevent overflows */ |
| static long _diff_tv_str(struct timeval *tv1, struct timeval *tv2) |
| { |
| long delta_t; |
| |
| delta_t = MIN((tv2->tv_sec - tv1->tv_sec), 10); |
| delta_t *= USEC_IN_SEC; |
| delta_t += tv2->tv_usec - tv1->tv_usec; |
| return delta_t; |
| } |
| |
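| /* |
| * Handle SIGINT: a second interrupt within one second (or |
| * --quit-on-interrupt) terminates the step; otherwise report status |
| * and wait for another interrupt. |
| */ |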
| static void _handle_intr(srun_job_t *job) |
| { |
| static struct timeval last_intr = { 0, 0 }; |
| struct timeval now; |
| |
| gettimeofday(&now, NULL); |
| if (sropt.quit_on_intr || _diff_tv_str(&last_intr, &now) < 1000000) { |
| info("sending Ctrl-C to %ps", &job->step_id); |
| launch_g_fwd_signal(SIGINT); |
| job_force_termination(job); |
| } else { |
| if (sropt.disable_status) { |
| info("sending Ctrl-C to %ps", &job->step_id); |
| launch_g_fwd_signal(SIGINT); |
| } else if (job->state < SRUN_JOB_CANCELLED) { |
| info("interrupt (one more within 1 sec to abort)"); |
| launch_g_print_status(); |
| } |
| last_intr = now; |
| } |
| } |
| |
| static void _handle_pipe(void) |
| { |
| static int ending = 0; |
| |
| if (ending) |
| return; |
| ending = 1; |
| launch_g_fwd_signal(SIGKILL); |
| } |
| |
| |
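| /* With --verbose, log the job id, node count, nodelist and per-node |
| * CPU counts of the allocation */ |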
| static void _print_job_information(resource_allocation_response_msg_t *resp) |
| { |
| int i; |
| char *str = NULL; |
| char *sep = ""; |
| |
| if (!opt.verbose) |
| return; |
| |
| xstrfmtcat(str, "jobid %u: nodes(%u):`%s', cpu counts: ", |
| resp->job_id, resp->node_cnt, resp->node_list); |
| |
| for (i = 0; i < resp->num_cpu_groups; i++) { |
| xstrfmtcat(str, "%s%u(x%u)", |
| sep, resp->cpus_per_node[i], |
| resp->cpu_count_reps[i]); |
| sep = ","; |
| } |
| verbose("%s", str); |
| xfree(str); |
| } |
| |
| /* NOTE: Executed once for entire hetjob */ |
| static void _run_srun_epilog (srun_job_t *job) |
| { |
| int rc; |
| |
| if (sropt.epilog && xstrcasecmp(sropt.epilog, "none") != 0) { |
| if (setenvf(NULL, "SLURM_SCRIPT_CONTEXT", "epilog_srun") < 0) |
| error("unable to set SLURM_SCRIPT_CONTEXT in environment"); |
| rc = _run_srun_script(job, sropt.epilog); |
| if (rc) { |
| error("srun epilog failed status=%d", rc); |
| } |
| } |
| } |
| |
| static void _run_srun_prolog (srun_job_t *job) |
| { |
| int rc; |
| |
| if (sropt.prolog && xstrcasecmp(sropt.prolog, "none") != 0) { |
| if (setenvf(NULL, "SLURM_SCRIPT_CONTEXT", "prolog_srun") < 0) |
| error("unable to set SLURM_SCRIPT_CONTEXT in environment"); |
| rc = _run_srun_script(job, sropt.prolog); |
| if (rc) { |
| error("srun prolog failed rc = %d. Aborting step.", rc); |
| slurm_step_launch_abort(job->step_ctx); |
| } |
| } |
| } |
| |
| /* |
| * Run srun prolog/epilog script. |
| * |
| * RET the exit status of the script, 1 on a generic error, or 0 on success |
| */ |
| static int _run_srun_script (srun_job_t *job, char *script) |
| { |
| int status; |
| pid_t cpid; |
| int i; |
| char **args = NULL; |
| |
| if (script == NULL || script[0] == '\0') |
| return 0; |
| |
| if (access(script, R_OK | X_OK) < 0) { |
| info("Access denied for %s: %m", script); |
| return 0; |
| } |
| |
| if ((cpid = fork()) < 0) { |
| error ("run_srun_script: fork: %m"); |
| return 1; |
| } |
| if (cpid == 0) { |
| 		/* |
| 		 * Set the prolog/epilog script's command line arguments to |
| 		 * the application arguments (for the last hetjob component), |
| 		 * shifted up by one position |
| 		 */ |
| args = xmalloc(sizeof(char *) * 1024); |
| args[0] = script; |
| for (i = 0; i < opt.argc; i++) { |
| args[i + 1] = opt.argv[i]; |
| } |
| args[i + 1] = NULL; |
| execv(script, args); |
| error("Failed to execute srun prolog/epilog script: %m"); |
| _exit(127); |
| } |
| |
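| 	/* Parent: wait for the script to finish, retrying if interrupted */ |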
| do { |
| if (waitpid(cpid, &status, 0) < 0) { |
| if (errno == EINTR) |
| continue; |
| error("waitpid: %m"); |
| return 0; |
| } else if (WIFEXITED(status)) { |
| return WEXITSTATUS(status); |
| } else { |
| error("script did not exit normally"); |
| return 1; |
| } |
| } while(1); |
| |
| /* NOTREACHED */ |
| } |
| |
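| /* Build a per-component environment variable name for hetjob components, |
| * e.g. _build_key("SLURM_JOB_ID", 1) returns "SLURM_JOB_ID_PACK_GROUP_1"; |
| * with het_job_offset == -1 (or a local het step) the base name is |
| * returned unchanged */ |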
| static char *_build_key(char *base, int het_job_offset) |
| { |
| char *key = NULL; |
| |
| /* If we are a local_het_step we treat it like a normal step */ |
| if (local_het_step || (het_job_offset == -1)) |
| key = xstrdup(base); |
| else |
| xstrfmtcat(key, "%s_PACK_GROUP_%d", base, het_job_offset); |
| |
| return key; |
| } |
| |
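| /* Export per-component environment variables from the allocation response |
| * (CPU counts, burst buffer environment, memory limits, segment size) |
| * without overriding values already present in the environment */ |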
| static void _set_env_vars(resource_allocation_response_msg_t *resp, |
| int het_job_offset) |
| { |
| char *key, *value, *tmp; |
| int i; |
| |
| key = _build_key("SLURM_JOB_CPUS_PER_NODE", het_job_offset); |
| if (!getenv(key)) { |
| tmp = uint32_compressed_to_str(resp->num_cpu_groups, |
| resp->cpus_per_node, |
| resp->cpu_count_reps); |
| if (setenvf(NULL, key, "%s", tmp) < 0) |
| error("unable to set %s in environment", key); |
| xfree(tmp); |
| } |
| xfree(key); |
| |
| if (resp->env_size) { /* Used to set Burst Buffer environment */ |
| for (i = 0; i < resp->env_size; i++) { |
| tmp = xstrdup(resp->environment[i]); |
| key = tmp; |
| value = strchr(tmp, '='); |
| if (value) { |
| value[0] = '\0'; |
| value++; |
| setenv(key, value, 0); |
| } |
| xfree(tmp); |
| } |
| } |
| |
| if (resp->pn_min_memory & MEM_PER_CPU) { |
| uint64_t tmp_mem = resp->pn_min_memory & (~MEM_PER_CPU); |
| key = _build_key("SLURM_MEM_PER_CPU", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%"PRIu64, tmp_mem) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| } else if (resp->pn_min_memory) { |
| uint64_t tmp_mem = resp->pn_min_memory; |
| key = _build_key("SLURM_MEM_PER_NODE", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%"PRIu64, tmp_mem) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| } |
| |
| if (resp->segment_size) { |
| key = _build_key("SLURM_JOB_SEGMENT_SIZE", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%u", resp->segment_size) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| } |
| |
| return; |
| } |
| |
| /* |
| * Set some hetjob environment variables for combined job & step allocation |
| */ |
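| /* e.g. for hetjob component 1 this yields SLURM_JOB_ID_PACK_GROUP_1, |
| * SLURM_JOB_NODELIST_PACK_GROUP_1, etc.; existing values are not replaced */ |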
| static void _set_env_vars2(resource_allocation_response_msg_t *resp, |
| int het_job_offset) |
| { |
| char *key; |
| |
| if (resp->account) { |
| key = _build_key("SLURM_JOB_ACCOUNT", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%s", resp->account) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| } |
| |
| key = _build_key("SLURM_JOB_ID", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%u", resp->job_id) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| |
| key = _build_key("SLURM_JOB_NODELIST", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%s", resp->node_list) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| |
| key = _build_key("SLURM_JOB_PARTITION", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%s", resp->partition) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| |
| if (resp->qos) { |
| key = _build_key("SLURM_JOB_QOS", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%s", resp->qos) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| } |
| |
| if (resp->resv_name) { |
| key = _build_key("SLURM_JOB_RESERVATION", het_job_offset); |
| if (!getenv(key) && |
| (setenvf(NULL, key, "%s", resp->resv_name) < 0)) { |
| error("unable to set %s in environment", key); |
| } |
| xfree(key); |
| } |
| } |
| |
| /* Set SLURM_RLIMIT_* environment variables from the current resource |
| * limit values and raise RLIMIT_NOFILE to the maximum possible value */ |
| static int _set_rlimit_env(void) |
| { |
| int rc = SLURM_SUCCESS; |
| struct rlimit rlim[1]; |
| unsigned long cur; |
| char name[64], *format; |
| slurm_rlimits_info_t *rli; |
| |
| /* Modify limits with any command-line options */ |
| if (sropt.propagate |
| && parse_rlimits(sropt.propagate, PROPAGATE_RLIMITS)) { |
| error( "--propagate=%s is not valid.", sropt.propagate ); |
| exit(error_exit); |
| } |
| |
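| 	/* Export each propagated soft limit as SLURM_RLIMIT_<NAME>, e.g. |
| 	 * SLURM_RLIMIT_NOFILE=1024, prefixing the value with 'U' when the |
| 	 * user explicitly requested propagation with --propagate */ |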
| for (rli = get_slurm_rlimits_info(); rli->name != NULL; rli++ ) { |
| |
| if (rli->propagate_flag != PROPAGATE_RLIMITS) |
| continue; |
| |
| if (getrlimit (rli->resource, rlim) < 0) { |
| error ("getrlimit (RLIMIT_%s): %m", rli->name); |
| rc = SLURM_ERROR; |
| continue; |
| } |
| |
| cur = (unsigned long) rlim->rlim_cur; |
| snprintf(name, sizeof(name), "SLURM_RLIMIT_%s", rli->name); |
| if (sropt.propagate && (rli->propagate_flag == PROPAGATE_RLIMITS)) |
| /* |
| * Prepend 'U' to indicate user requested propagate |
| */ |
| format = "U%lu"; |
| else |
| format = "%lu"; |
| |
| if (setenvf (NULL, name, format, cur) < 0) { |
| error ("unable to set %s in environment", name); |
| rc = SLURM_ERROR; |
| continue; |
| } |
| |
| debug ("propagating RLIMIT_%s=%lu", rli->name, cur); |
| } |
| |
| /* |
| * Now increase NOFILE to the max available for this srun |
| */ |
| rlimits_use_max_nofile(); |
| |
| return rc; |
| } |
| |
| /* Set the SLURM_CLUSTER_NAME, SLURM_SUBMIT_DIR and SLURM_SUBMIT_HOST |
| * environment variables from the current state */ |
| static void _set_submit_dir_env(void) |
| { |
| char buf[PATH_MAX], host[256]; |
| |
| /* Only set these environment variables in new allocations */ |
| if (sropt.jobid != NO_VAL) |
| return; |
| |
| if (setenvf(NULL, "SLURM_CLUSTER_NAME", "%s", |
| slurm_conf.cluster_name) < 0) |
| error("unable to set SLURM_CLUSTER_NAME in environment"); |
| |
| if ((getcwd(buf, PATH_MAX)) == NULL) |
| error("getcwd failed: %m"); |
| else if (setenvf(NULL, "SLURM_SUBMIT_DIR", "%s", buf) < 0) |
| error("unable to set SLURM_SUBMIT_DIR in environment"); |
| |
| if ((gethostname(host, sizeof(host)))) |
| 		error("gethostname failed: %m"); |
| else if (setenvf(NULL, "SLURM_SUBMIT_HOST", "%s", host) < 0) |
| error("unable to set SLURM_SUBMIT_HOST in environment"); |
| } |
| |
| /* Set the SRUN_DEBUG and SLURM_UMASK environment variables from the |
| * current state */ |
| static int _set_umask_env(void) |
| { |
| if (!getenv("SRUN_DEBUG")) { /* do not change current value */ |
| /* NOTE: Default debug level is 3 (info) */ |
| int log_level = LOG_LEVEL_INFO + opt.verbose - opt.quiet; |
| |
| if (setenvf(NULL, "SRUN_DEBUG", "%d", log_level) < 0) |
| error ("unable to set SRUN_DEBUG in environment"); |
| } |
| |
| if (!getenv("SLURM_UMASK")) { /* do not change current value */ |
| char mask_char[5]; |
| mode_t mask; |
| |
| mask = (int)umask(0); |
| umask(mask); |
| |
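| 		/* e.g. a umask of 022 is exported as SLURM_UMASK=0022 */ |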
| sprintf(mask_char, "0%d%d%d", |
| ((mask>>6)&07), ((mask>>3)&07), mask&07); |
| if (setenvf(NULL, "SLURM_UMASK", "%s", mask_char) < 0) { |
| error ("unable to set SLURM_UMASK in environment"); |
| return SLURM_ERROR; |
| } |
| debug ("propagating UMASK=%s", mask_char); |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
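| /* Tell the shepherd that srun completed normally by writing a single byte |
| * to the pipe, then close the descriptor */ |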
| static void _shepherd_notify(int shepherd_fd) |
| { |
| int rc; |
| |
| while (1) { |
| rc = write(shepherd_fd, "", 1); |
| if (rc == -1) { |
| if ((errno == EAGAIN) || (errno == EINTR)) |
| continue; |
| error("write(shepherd): %m"); |
| } |
| break; |
| } |
| close(shepherd_fd); |
| } |
| |
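| /* Fork a shepherd process that blocks reading from a pipe. If srun exits |
| * without writing to the pipe (e.g. it is killed), the shepherd cancels |
| * the step(s) and, when srun created the allocation, the job itself. |
| * RET the write end of the pipe in the parent, or -1 on error */ |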
| static int _shepherd_spawn(srun_job_t *job, list_t *srun_job_list, |
| bool got_alloc) |
| { |
| int shepherd_pipe[2], rc; |
| pid_t shepherd_pid; |
| char buf[1]; |
| |
| if (pipe(shepherd_pipe)) { |
| error("pipe: %m"); |
| return -1; |
| } |
| |
| shepherd_pid = fork(); |
| if (shepherd_pid == -1) { |
| error("fork: %m"); |
| return -1; |
| } |
| if (shepherd_pid != 0) { |
| close(shepherd_pipe[0]); |
| return shepherd_pipe[1]; |
| } |
| |
| /* Wait for parent to notify of completion or I/O error on abort */ |
| close(shepherd_pipe[1]); |
| while (1) { |
| rc = read(shepherd_pipe[0], buf, 1); |
| if (rc == 1) { |
| _exit(0); |
| } else if (rc == 0) { |
| break; /* EOF */ |
| } else if (rc == -1) { |
| if ((errno == EAGAIN) || (errno == EINTR)) |
| continue; |
| break; |
| } |
| } |
| |
| if (srun_job_list) { |
| list_itr_t *job_iter; |
| job_iter = list_iterator_create(srun_job_list); |
| while ((job = list_next(job_iter))) { |
| (void) slurm_kill_job_step(job->step_id.job_id, job->step_id.step_id, |
| SIGKILL, 0); |
| if (got_alloc) |
| slurm_complete_job(job->step_id.job_id, NO_VAL); |
| } |
| list_iterator_destroy(job_iter); |
| } else { |
| (void) slurm_kill_job_step(job->step_id.job_id, |
| job->step_id.step_id, SIGKILL, 0); |
| if (got_alloc) |
| slurm_complete_job(job->step_id.job_id, NO_VAL); |
| } |
| |
| _exit(0); |
| return -1; |
| } |
| |
| /* _srun_signal_mgr - Process daemon-wide signals */ |
| static void *_srun_signal_mgr(void *job_ptr) |
| { |
| int sig; |
| int i, rc; |
| sigset_t set; |
| srun_job_t *job = (srun_job_t *)job_ptr; |
| |
| /* Make sure no required signals are ignored (possibly inherited) */ |
| for (i = 0; sig_array[i]; i++) |
| xsignal_default(sig_array[i]); |
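| 	/* Block in sigwait() on the managed signal set and dispatch each |
| 	 * delivered signal until srun shuts down */ |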
| while (!srun_shutdown) { |
| xsignal_sigset_create(sig_array, &set); |
| rc = sigwait(&set, &sig); |
| if (rc == EINTR) |
| continue; |
| switch (sig) { |
| case SIGINT: |
| if (!srun_shutdown) |
| _handle_intr(job); |
| break; |
| case SIGQUIT: |
| info("Quit"); |
| 			/* fall through and force job termination below */ |
| case SIGTERM: |
| case SIGHUP: |
| /* No need to call job_force_termination here since we |
| * are ending the job now and we don't need to update |
| * the state. */ |
| info("forcing job termination"); |
| launch_g_fwd_signal(SIGKILL); |
| break; |
| case SIGCONT: |
| info("got SIGCONT"); |
| break; |
| case SIGPIPE: |
| _handle_pipe(); |
| break; |
| case SIGALRM: |
| if (srun_max_timer) { |
| info("First task exited %ds ago", sropt.max_wait); |
| launch_g_print_status(); |
| launch_g_step_terminate(); |
| } |
| break; |
| default: |
| launch_g_fwd_signal(sig); |
| break; |
| } |
| } |
| return NULL; |
| } |
| |
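| /* Verify that --relative plus the requested minimum node count fits within |
| * the allocated node count; RET SLURM_SUCCESS or SLURM_ERROR */ |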
| static int _validate_relative(resource_allocation_response_msg_t *resp, |
| slurm_opt_t *opt_local) |
| { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| xassert(srun_opt); |
| |
| if ((srun_opt->relative != NO_VAL) && |
| ((srun_opt->relative + opt_local->min_nodes) |
| > resp->node_cnt)) { |
| if (slurm_option_set_by_cli(opt_local, 'N')) { |
| /* -N command line option used */ |
| error("--relative and --nodes option incompatible " |
| "with count of allocated nodes (%d+%d>%d)", |
| srun_opt->relative, |
| opt_local->min_nodes, |
| resp->node_cnt); |
| } else { /* SLURM_JOB_NUM_NODES option used */ |
| error("--relative and SLURM_JOB_NUM_NODES option incompatible with count of allocated nodes (%d+%d>%d)", |
| srun_opt->relative, |
| opt_local->min_nodes, |
| resp->node_cnt); |
| } |
| return SLURM_ERROR; |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| static void _call_spank_fini(void) |
| { |
| if (-1 != shepherd_fd) |
| spank_fini(NULL); |
| } |
| |
| /* |
| * Run cli_filter_post_submit on all opt structures |
| * Convenience function since this might need to run in two spots |
| */ |
| static void _srun_cli_filter_post_submit(uint32_t jobid, uint32_t stepid) |
| { |
| static bool post_submit_ran = false; |
| int idx = 0, components = 1; |
| |
| if (post_submit_ran) |
| return; |
| |
| if (opt_list) |
| components = list_count(opt_list); |
| |
| for (idx = 0; idx < components; idx++) |
| cli_filter_g_post_submit(idx, jobid, stepid); |
| |
| post_submit_ran = true; |
| } |