| /*****************************************************************************\ |
| * salloc.c - Request a Slurm job allocation and |
| * launch a user-specified command. |
| ***************************************************************************** |
| * Copyright (C) 2006-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Christopher J. Morrone <morrone2@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <dirent.h> |
| #include <fcntl.h> |
| #include <grp.h> |
| #include <limits.h> |
| #include <pwd.h> |
| #include <stdbool.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <sys/resource.h> /* for struct rlimit */ |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <termios.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| #include "slurm/slurm.h" |
| |
| #include "src/interfaces/cli_filter.h" |
| #include "src/common/cpu_frequency.h" |
| #include "src/common/env.h" |
| #include "src/interfaces/gres.h" |
| #include "src/common/proc_args.h" |
| #include "src/common/read_config.h" |
| #include "src/interfaces/auth.h" |
| #include "src/common/slurm_rlimits_info.h" |
| #include "src/common/slurm_time.h" |
| #include "src/common/spank.h" |
| #include "src/common/uid.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xsignal.h" |
| #include "src/common/xstring.h" |
| |
| #include "src/salloc/salloc.h" |
| #include "src/salloc/opt.h" |
| |
| #ifndef __USE_XOPEN_EXTENDED |
| extern pid_t getpgid(pid_t pid); |
| #endif |
| |
| #define MAX_RETRIES 10 |
#define POLL_SLEEP 0.5 /* base poll interval in seconds; the actual sleep grows with each retry */
| |
| char *argvzero = NULL; |
| pid_t command_pid = -1; |
| char *work_dir = NULL; |
| static int is_interactive; |
| |
| enum possible_allocation_states allocation_state = NOT_GRANTED; |
| pthread_cond_t allocation_state_cond = PTHREAD_COND_INITIALIZER; |
| pthread_mutex_t allocation_state_lock = PTHREAD_MUTEX_INITIALIZER; |
| |
| static bool allocation_interrupted = false; |
| static bool allocation_revoked = false; |
| static bool exit_flag = false; |
| static bool is_het_job = false; |
| static bool suspend_flag = false; |
| static uint32_t my_job_id = 0; |
| static time_t last_timeout = 0; |
| static struct termios saved_tty_attributes; |
| static int het_job_limit = 0; |
| static bool _cli_filter_post_submit_run = false; |
| |
| static void _exit_on_signal(int signo); |
| static int _fill_job_desc_from_opts(job_desc_msg_t *desc); |
| static pid_t _fork_command(char **command); |
| static void _forward_signal(int signo); |
| static void _job_complete_handler(srun_job_complete_msg_t *msg); |
| static void _job_suspend_handler(suspend_msg_t *msg); |
| static void _match_job_name(job_desc_msg_t *desc_last, list_t *job_req_list); |
| static void _node_fail_handler(srun_node_fail_msg_t *msg); |
| static void _pending_callback(uint32_t job_id); |
| static int _proc_alloc(resource_allocation_response_msg_t *alloc); |
| static void _ring_terminal_bell(void); |
| static int _set_cluster_name(void *x, void *arg); |
| static void _set_exit_code(void); |
| static void _set_spank_env(void); |
| static void _set_submit_dir_env(void); |
| static void _signal_while_allocating(int signo); |
| static void _timeout_handler(srun_timeout_msg_t *msg); |
| static void _user_msg_handler(srun_user_msg_t *msg); |
| static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc); |
| static void _salloc_cli_filter_post_submit(uint32_t jobid, uint32_t stepid); |
| |
| bool salloc_shutdown = false; |
/* Signals that abort the request while waiting for the resource allocation. */
| int sig_array[] = { |
| SIGHUP, SIGINT, SIGQUIT, SIGPIPE, |
| SIGTERM, SIGUSR1, SIGUSR2, 0 |
| }; |
| |
| static void _reset_input_mode (void) |
| { |
| /* SIGTTOU needs to be blocked per the POSIX spec: |
| * http://pubs.opengroup.org/onlinepubs/009695399/functions/tcsetattr.html |
| */ |
| int sig_block[] = { SIGTTOU, SIGTTIN, 0 }; |
| xsignal_block (sig_block); |
| tcsetattr (STDIN_FILENO, TCSANOW, &saved_tty_attributes); |
| /* If salloc was run as interactive, with job control, reset the |
| * foreground process group of the terminal to the process group of |
| * the parent pid before exiting */ |
| if (is_interactive) |
| tcsetpgrp(STDIN_FILENO, getpgid(getppid())); |
| } |
| |
static int _set_cluster_name(void *x, void *arg)
{
	job_desc_msg_t *desc = (job_desc_msg_t *) x;
	desc->origin_cluster = xstrdup(slurm_conf.cluster_name);
	return 0;
}
| |
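/* List iterator callback: copy the message thread's listening port into a
 * hetjob component's job description. */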
| static int _copy_other_port(void *x, void *arg) |
| { |
| job_desc_msg_t *desc = x; |
| desc->other_port = *(uint16_t *)arg; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| int main(int argc, char **argv) |
| { |
| log_options_t logopt = LOG_OPTS_STDERR_ONLY; |
| job_desc_msg_t *desc = NULL, *first_job = NULL; |
| list_t *job_req_list = NULL, *job_resp_list = NULL; |
| resource_allocation_response_msg_t *alloc = NULL; |
| time_t before, after; |
| allocation_msg_thread_t *msg_thr = NULL; |
| char **env = NULL; |
| int status = 0; |
| int retries = 0; |
| pid_t pid = getpid(); |
| pid_t tpgid = 0; |
| pid_t rc_pid = -1; |
| int i, j, rc = 0; |
| uint32_t num_tasks = 0; |
| bool het_job_fini = false; |
| int het_job_argc, het_job_inx, het_job_argc_off; |
| char **het_job_argv; |
| static char *msg = "Slurm job queue full, sleeping and retrying."; |
| slurm_allocation_callbacks_t callbacks; |
| list_itr_t *iter_req, *iter_resp; |
| |
| slurm_init(NULL); |
| log_init(xbasename(argv[0]), logopt, 0, NULL); |
| |
| if (cli_filter_init() != SLURM_SUCCESS) |
| fatal("failed to initialize cli_filter plugin"); |
| |
| argvzero = argv[0]; |
| _set_exit_code(); |
| |
| if (spank_init_allocator()) { |
| error("Failed to initialize plugin stack"); |
| exit(error_exit); |
| } |
| |
	/* Be sure to call spank_fini when salloc exits. */
| if (atexit((void (*) (void)) spank_fini)) |
| error("Failed to register atexit handler for plugins: %m"); |
| |
| het_job_argc = argc; |
| het_job_argv = argv; |
| for (het_job_inx = 0; !het_job_fini; het_job_inx++) { |
| het_job_argc_off = -1; |
| if (initialize_and_process_args(het_job_argc, het_job_argv, |
| &het_job_argc_off, |
| het_job_inx) < 0) { |
| error("salloc parameter parsing"); |
| exit(error_exit); |
| } |
| if ((het_job_argc_off >= 0) && |
| (het_job_argc_off < het_job_argc) && |
| !xstrcmp(het_job_argv[het_job_argc_off], ":")) { |
| /* het_job_argv[0] moves from "salloc" to ":" */ |
| het_job_argc -= het_job_argc_off; |
| het_job_argv += het_job_argc_off; |
| } else |
| het_job_fini = true; |
| |
| /* reinit log with new verbosity (if changed by command line) */ |
| if (opt.verbose || opt.quiet) { |
| logopt.stderr_level += opt.verbose; |
| logopt.stderr_level -= opt.quiet; |
| logopt.prefix_level = 1; |
| log_alter(logopt, 0, NULL); |
| } |
| |
| if (spank_init_post_opt()) { |
| error("Plugin stack post-option processing failed"); |
| exit(error_exit); |
| } |
| |
| _set_spank_env(); |
| if (het_job_inx == 0) |
| _set_submit_dir_env(); |
| if (opt.chdir && chdir(opt.chdir)) { |
| error("chdir(%s): %m", opt.chdir); |
| exit(error_exit); |
| } else if (work_dir) |
| opt.chdir = work_dir; |
| |
| if (desc && !job_req_list) { |
| job_req_list = list_create(NULL); |
| list_append(job_req_list, desc); |
| } |
| |
| desc = slurm_opt_create_job_desc(&opt, true); |
| if (_fill_job_desc_from_opts(desc) == -1) |
| exit(error_exit); |
| |
| if (het_job_inx || !het_job_fini) |
| set_env_from_opts(&opt, &env, het_job_inx); |
| else |
| set_env_from_opts(&opt, &env, -1); |
| if (job_req_list) |
| list_append(job_req_list, desc); |
| if (!first_job) |
| first_job = desc; |
| } |
| het_job_limit = het_job_inx; |
	if (!desc) {
		fatal("%s: desc is NULL", __func__);
		exit(error_exit);	/* unreachable: fatal() never returns */
	}
| _match_job_name(desc, job_req_list); |
| |
| /* |
| * Job control for interactive salloc sessions: only if ... |
| * |
| * a) input is from a terminal (stdin has valid termios attributes), |
| * b) controlling terminal exists (non-negative tpgid), |
| * c) salloc is not run in allocation-only (--no-shell) mode, |
| * NOTE: d and e below are configuration dependent |
| * d) salloc runs in its own process group (true in interactive |
| * shells that support job control), |
| * e) salloc has been configured at compile-time to support background |
| * execution and is not currently in the background process group. |
| */ |
| if (tcgetattr(STDIN_FILENO, &saved_tty_attributes) < 0) { |
| /* |
| * Test existence of controlling terminal (tpgid > 0) |
| * after first making sure stdin is not redirected. |
| */ |
| } else if ((tpgid = tcgetpgrp(STDIN_FILENO)) < 0) { |
| if (!saopt.no_shell) { |
| error("no controlling terminal: please set --no-shell"); |
| exit(error_exit); |
| } |
| } else if ((!saopt.no_shell) && (getpgrp() == tcgetpgrp(STDIN_FILENO))) { |
| is_interactive = true; |
| } |
| /* |
| * Reset saved tty attributes at exit, in case a child |
| * process died before properly resetting terminal. |
| */ |
| if (is_interactive) |
| atexit(_reset_input_mode); |
| |
	/* If the job can run on multiple clusters, find the cluster that can
	 * start it earliest and run it there */
| if (opt.clusters) { |
| if (job_req_list) { |
| rc = slurmdb_get_first_het_job_cluster(job_req_list, |
| opt.clusters, &working_cluster_rec); |
| } else { |
| rc = slurmdb_get_first_avail_cluster(desc, |
| opt.clusters, &working_cluster_rec); |
| } |
| if (rc != SLURM_SUCCESS) { |
| print_db_notok(opt.clusters, 0); |
| exit(error_exit); |
| } |
| } |
| if (job_req_list) |
| (void) list_for_each(job_req_list, _set_cluster_name, NULL); |
| else |
| desc->origin_cluster = xstrdup(slurm_conf.cluster_name); |
| |
| callbacks.timeout = _timeout_handler; |
| callbacks.job_complete = _job_complete_handler; |
| callbacks.job_suspend = _job_suspend_handler; |
| callbacks.user_msg = _user_msg_handler; |
| callbacks.node_fail = _node_fail_handler; |
| /* |
| * Create message thread to handle pings and such from slurmctld. |
| * salloc --no-shell jobs aren't interactive, so they won't respond to |
| * srun_ping(), so we don't want to kill it after InactiveLimit seconds. |
| * Not creating this thread will leave other_port == 0, and will |
| * prevent slurmctld from killing the salloc --no-shell job. |
| */ |
| if (!saopt.no_shell) { |
| msg_thr = slurm_allocation_msg_thr_create(&first_job->other_port, |
| &callbacks); |
| if (job_req_list) |
| list_for_each(job_req_list, _copy_other_port, |
| &first_job->other_port); |
| } |
| |
| /* NOTE: Do not process signals in separate pthread. The signal will |
| * cause slurm_allocate_resources_blocking() to exit immediately. */ |
| for (i = 0; sig_array[i]; i++) |
| xsignal(sig_array[i], _signal_while_allocating); |
| |
| /* |
| * This option is a bit odd - it's not actually used as part of the |
| * allocation, but instead just needs to be propagated to an interactive |
| * step launch correctly to make it convenient for the user. |
| * Thus the seemingly out of place copy here. |
| */ |
| desc->container = xstrdup(opt.container); |
| |
| before = time(NULL); |
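	/*
	 * Request the allocation, retrying with an increasing delay while the
	 * controller reports that its job queue is temporarily full.
	 */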
| while (true) { |
| if (job_req_list) { |
| is_het_job = true; |
| job_resp_list = slurm_allocate_het_job_blocking( |
| job_req_list, opt.immediate, |
| _pending_callback); |
| if (job_resp_list) |
| break; |
| } else { |
| alloc = slurm_allocate_resources_blocking(desc, |
| opt.immediate, _pending_callback); |
| if (alloc) |
| break; |
| } |
| if (((errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) && |
| (errno != EAGAIN)) || (retries >= MAX_RETRIES)) |
| break; |
| if (retries == 0) |
| error("%s", msg); |
| else |
| debug("%s", msg); |
| sleep(++retries); |
| } |
| |
| if (!alloc && !job_resp_list) { |
| if (allocation_interrupted) { |
| /* cancelled by signal */ |
| info("Job aborted due to signal"); |
| } else if (errno == EINTR) { |
| error("Interrupted by signal. Allocation request rescinded."); |
| } else if (opt.immediate && |
| ((errno == ETIMEDOUT) || |
| (errno == ESLURM_NOT_TOP_PRIORITY) || |
| (errno == ESLURM_NODES_BUSY) || |
| (errno == ESLURM_PORTS_BUSY))) { |
| error("Unable to allocate resources: %m"); |
| error_exit = immediate_exit; |
| } else { |
| error("Job submit/allocate failed: %m"); |
| } |
| if (msg_thr) |
| slurm_allocation_msg_thr_destroy(msg_thr); |
| exit(error_exit); |
| } else if (job_resp_list && !allocation_interrupted) { |
		/* Allocation granted to heterogeneous job */
| i = 0; |
| iter_resp = list_iterator_create(job_resp_list); |
| while ((alloc = list_next(iter_resp))) { |
| if (i == 0) { |
| my_job_id = alloc->job_id; |
| info("Granted job allocation %u", my_job_id); |
| } |
| log_flag(HETJOB, "Hetjob ID %u+%u (%u) on nodes %s", |
| my_job_id, i, alloc->job_id, alloc->node_list); |
| i++; |
| if (_proc_alloc(alloc) != SLURM_SUCCESS) { |
| list_iterator_destroy(iter_resp); |
| goto relinquish; |
| } |
| } |
| list_iterator_destroy(iter_resp); |
| } else if (!allocation_interrupted) { |
| /* Allocation granted to regular job */ |
| my_job_id = alloc->job_id; |
| |
| print_multi_line_string(alloc->job_submit_user_msg, |
| -1, LOG_LEVEL_INFO); |
| info("Granted job allocation %u", my_job_id); |
| |
| if (_proc_alloc(alloc) != SLURM_SUCCESS) |
| goto relinquish; |
| } |
| |
| _salloc_cli_filter_post_submit(my_job_id, NO_VAL); |
| |
| after = time(NULL); |
| if ((saopt.bell == BELL_ALWAYS) || |
| ((saopt.bell == BELL_AFTER_DELAY) && |
| ((after - before) > DEFAULT_BELL_DELAY))) { |
| _ring_terminal_bell(); |
| } |
| if (saopt.no_shell) |
| exit(0); |
| if (allocation_interrupted) { |
| if (alloc) |
| my_job_id = alloc->job_id; |
| /* salloc process received a signal after |
| * slurm_allocate_resources_blocking returned with the |
| * allocation, but before the new signal handlers were |
| * registered. |
| */ |
| goto relinquish; |
| } |
| |
| /* |
| * Set environment variables |
| */ |
| if (job_resp_list) { |
| bool num_tasks_always_set = true; |
| |
| i = list_count(job_req_list); |
| j = list_count(job_resp_list); |
| if (i != j) { |
| error("Job component count mismatch, submit/response " |
| "count mismatch (%d != %d)", i, j); |
| goto relinquish; |
| } |
| /* Continue support for old hetjob terminology. */ |
| env_array_append_fmt(&env, "SLURM_PACK_SIZE", "%d", i); |
| env_array_append_fmt(&env, "SLURM_HET_SIZE", "%d", i); |
| |
| i = 0; |
| iter_req = list_iterator_create(job_req_list); |
| iter_resp = list_iterator_create(job_resp_list); |
| while ((desc = list_next(iter_req))) { |
| alloc = list_next(iter_resp); |
| |
| /* |
| * Set JOB_NTASKS_SET to make SLURM_NTASKS get set when |
| * --ntasks-per-node is requested. |
| */ |
| if (desc->ntasks_per_node != NO_VAL16) |
| desc->bitflags |= JOB_NTASKS_SET; |
| if (alloc && desc && |
| (desc->bitflags & JOB_NTASKS_SET)) { |
| if (desc->num_tasks == NO_VAL) |
| desc->num_tasks = |
| alloc->node_cnt * |
| desc->ntasks_per_node; |
| else if (alloc->node_cnt > desc->num_tasks) |
| desc->num_tasks = alloc->node_cnt; |
| } |
| if ((desc->num_tasks != NO_VAL) && num_tasks_always_set) |
| num_tasks += desc->num_tasks; |
| else { |
| num_tasks = 0; |
| num_tasks_always_set = false; |
| } |
| if (env_array_for_job(&env, alloc, desc, i++) != |
| SLURM_SUCCESS) |
| goto relinquish; |
| } |
| list_iterator_destroy(iter_resp); |
| list_iterator_destroy(iter_req); |
| } else { |
| /* |
| * Set JOB_NTASKS_SET to make SLURM_NTASKS get set when |
| * --ntasks-per-node is requested. |
| */ |
| if (desc->ntasks_per_node != NO_VAL16) |
| desc->bitflags |= JOB_NTASKS_SET; |
| if (alloc && desc && (desc->bitflags & JOB_NTASKS_SET)) { |
| if (desc->num_tasks == NO_VAL) |
| desc->num_tasks = |
| alloc->node_cnt * desc->ntasks_per_node; |
| else if (alloc->node_cnt > desc->num_tasks) |
| desc->num_tasks = alloc->node_cnt; |
| } |
| if (desc->num_tasks != NO_VAL) |
| num_tasks += desc->num_tasks; |
| if (env_array_for_job(&env, alloc, desc, -1) != SLURM_SUCCESS) |
| goto relinquish; |
| } |
| |
| if (num_tasks) { |
| env_array_append_fmt(&env, "SLURM_NTASKS", "%d", num_tasks); |
| env_array_append_fmt(&env, "SLURM_NPROCS", "%d", num_tasks); |
| } |
| |
| if (working_cluster_rec && working_cluster_rec->name) { |
| env_array_append_fmt(&env, "SLURM_CLUSTER_NAME", "%s", |
| working_cluster_rec->name); |
| } else |
| env_array_append_fmt(&env, "SLURM_CLUSTER_NAME", "%s", |
| slurm_conf.cluster_name); |
| |
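	/* Export the accumulated variables into salloc's own environment so
	 * the spawned command inherits them, then release the array. */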
| env_array_set_environment(env); |
| env_array_free(env); |
| slurm_mutex_lock(&allocation_state_lock); |
| if (allocation_state == REVOKED) { |
| error("Allocation was revoked for job %u before command could be run", |
| my_job_id); |
| slurm_cond_broadcast(&allocation_state_cond); |
| slurm_mutex_unlock(&allocation_state_lock); |
| if (slurm_complete_job(my_job_id, status) != 0) { |
| error("Unable to clean up allocation for job %u: %m", |
| my_job_id); |
| } |
| return 1; |
| } |
| allocation_state = GRANTED; |
| slurm_cond_broadcast(&allocation_state_cond); |
| slurm_mutex_unlock(&allocation_state_lock); |
| |
| /* Ensure that salloc has initial terminal foreground control. */ |
| if (is_interactive) { |
| /* |
| * Ignore remaining job-control signals (other than those in |
| * sig_array, which at this state act like SIG_IGN). |
| */ |
| xsignal(SIGTSTP, SIG_IGN); |
| xsignal(SIGTTIN, SIG_IGN); |
| xsignal(SIGTTOU, SIG_IGN); |
| |
| pid = getpid(); |
| setpgid(pid, pid); |
| |
| tcsetpgrp(STDIN_FILENO, pid); |
| } |
| slurm_mutex_lock(&allocation_state_lock); |
| if (suspend_flag) |
| slurm_cond_wait(&allocation_state_cond, &allocation_state_lock); |
| command_pid = _fork_command(opt.argv); |
| slurm_cond_broadcast(&allocation_state_cond); |
| slurm_mutex_unlock(&allocation_state_lock); |
| |
| /* |
| * Wait for command to exit, OR for waitpid to be interrupted by a |
| * signal. Either way, we are going to release the allocation next. |
| */ |
| if (command_pid > 0) { |
| setpgid(command_pid, command_pid); |
| if (is_interactive) |
| tcsetpgrp(STDIN_FILENO, command_pid); |
| |
| /* NOTE: Do not process signals in separate pthread. |
| * The signal will cause waitpid() to exit immediately. */ |
| xsignal(SIGHUP, _exit_on_signal); |
		/* WUNTRACED makes waitpid() also return when the child is
		 * stopped; in that case just keep waiting. */
| do { |
| rc_pid = waitpid(command_pid, &status, WUNTRACED); |
| } while (WIFSTOPPED(status) || ((rc_pid == -1) && (!exit_flag))); |
| if ((rc_pid == -1) && (errno != EINTR)) |
| error("waitpid for %s failed: %m", opt.argv[0]); |
| } |
| |
| if (is_interactive) |
| tcsetpgrp(STDIN_FILENO, pid); |
| |
| /* |
| * Relinquish the job allocation (if not already revoked). |
| */ |
| relinquish: |
| slurm_mutex_lock(&allocation_state_lock); |
| if (allocation_state != REVOKED) { |
| slurm_mutex_unlock(&allocation_state_lock); |
| |
| info("Relinquishing job allocation %u", my_job_id); |
| if ((slurm_complete_job(my_job_id, status) != 0) && |
| (errno != ESLURM_ALREADY_DONE)) |
| error("Unable to clean up job allocation %u: %m", |
| my_job_id); |
| slurm_mutex_lock(&allocation_state_lock); |
| allocation_state = REVOKED; |
| } |
| slurm_cond_broadcast(&allocation_state_cond); |
| slurm_mutex_unlock(&allocation_state_lock); |
| |
| slurm_free_resource_allocation_response_msg(alloc); |
| if (msg_thr) |
| slurm_allocation_msg_thr_destroy(msg_thr); |
| |
| /* |
| * Figure out what return code we should use. If the user's command |
| * exited normally, return the user's return code. |
| */ |
| rc = 1; |
| if (rc_pid != -1) { |
| if (WIFEXITED(status)) { |
| rc = WEXITSTATUS(status); |
| } else if (WIFSTOPPED(status)) { |
| /* Terminate stopped child process */ |
| _forward_signal(SIGKILL); |
| } else if (WIFSIGNALED(status)) { |
| verbose("Command \"%s\" was terminated by signal %d", |
| opt.argv[0], WTERMSIG(status)); |
| /* if we get these signals we return a normal |
| * exit since this was most likely sent from the |
| * user */ |
| switch (WTERMSIG(status)) { |
| case SIGHUP: |
| case SIGINT: |
| case SIGQUIT: |
| case SIGKILL: |
| rc = 0; |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| |
| #ifdef MEMORY_LEAK_DEBUG |
| cli_filter_fini(); |
| slurm_reset_all_options(&opt, false); |
| slurm_fini(); |
| log_fini(); |
| #endif /* MEMORY_LEAK_DEBUG */ |
| |
| return rc; |
| } |
| |
/* Initial processing of the resource allocation response, including waiting
 * for compute nodes to be ready for use.
 * Returns SLURM_SUCCESS on success. */
| static int _proc_alloc(resource_allocation_response_msg_t *alloc) |
| { |
| static int elem = 0; |
| |
| if ((elem++ == 0) && alloc->working_cluster_rec) { |
| slurm_setup_remote_working_cluster(alloc); |
| |
| /* set env for srun's to find the right cluster */ |
| if (xstrstr(working_cluster_rec->control_host, ":")) { |
| /* |
| * If the control_host has ':'s then it's an ipv6 |
| * address and need to be wrapped with "[]" because |
| * SLURM_WORKING_CLUSTER is ':' delimited. In 24.11+, |
| * _setup_env_working_cluster() handles this new format. |
| */ |
| setenvf(NULL, "SLURM_WORKING_CLUSTER", "%s:[%s]:%d:%d", |
| working_cluster_rec->name, |
| working_cluster_rec->control_host, |
| working_cluster_rec->control_port, |
| working_cluster_rec->rpc_version); |
| } else { |
| /* |
| * When 24.11 is no longer supported this else clause |
| * can be removed. |
| */ |
| setenvf(NULL, "SLURM_WORKING_CLUSTER", "%s:%s:%d:%d", |
| working_cluster_rec->name, |
| working_cluster_rec->control_host, |
| working_cluster_rec->control_port, |
| working_cluster_rec->rpc_version); |
| } |
| } |
| |
| if (!_wait_nodes_ready(alloc)) { |
| if (!allocation_interrupted) |
| error("Something is wrong with the boot of the nodes."); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| |
| /* Copy job name from last component to all hetjob components unless |
| * explicitly set. The default value comes from _salloc_default_command() |
| * and is "sh". */ |
| static void _match_job_name(job_desc_msg_t *desc_last, list_t *job_req_list) |
| { |
| list_itr_t *iter; |
| job_desc_msg_t *desc = NULL; |
| char *name; |
| |
| if (!desc_last) |
| return; |
| |
| if (!desc_last->name && opt.argv[0]) |
| desc_last->name = xstrdup(xbasename(opt.argv[0])); |
| name = desc_last->name; |
| |
| if (!job_req_list) |
| return; |
| |
| iter = list_iterator_create(job_req_list); |
| while ((desc = list_next(iter))) |
| if (!desc->name) |
| desc->name = xstrdup(name); |
| |
| list_iterator_destroy(iter); |
| } |
| |
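/* Allow the environment to override the exit codes used for errors
 * (SLURM_EXIT_ERROR) and for failed --immediate requests
 * (SLURM_EXIT_IMMEDIATE). */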
| static void _set_exit_code(void) |
| { |
| int i; |
| char *val; |
| |
| if ((val = getenv("SLURM_EXIT_ERROR"))) { |
| i = atoi(val); |
| if (i == 0) |
| error("SLURM_EXIT_ERROR has zero value"); |
| else |
| error_exit = i; |
| } |
| |
| if ((val = getenv("SLURM_EXIT_IMMEDIATE"))) { |
| i = atoi(val); |
| if (i == 0) |
| error("SLURM_EXIT_IMMEDIATE has zero value"); |
| else |
| immediate_exit = i; |
| } |
| } |
| |
| /* Propagate SPANK environment via SLURM_SPANK_ environment variables */ |
| static void _set_spank_env(void) |
| { |
| int i; |
| |
| for (i = 0; i < opt.spank_job_env_size; i++) { |
| if (setenvfs("SLURM_SPANK_%s", opt.spank_job_env[i]) < 0) { |
| error("unable to set %s in environment", |
| opt.spank_job_env[i]); |
| } |
| } |
| } |
| |
/* Set the SLURM_SUBMIT_DIR and SLURM_SUBMIT_HOST environment variables from
 * the current working directory and hostname */
| static void _set_submit_dir_env(void) |
| { |
| char host[256]; |
| |
| work_dir = xmalloc(PATH_MAX); |
| if ((getcwd(work_dir, PATH_MAX)) == NULL) |
| error("getcwd failed: %m"); |
| else if (setenvf(NULL, "SLURM_SUBMIT_DIR", "%s", work_dir) < 0) |
| error("unable to set SLURM_SUBMIT_DIR in environment"); |
| |
| if ((gethostname(host, sizeof(host)))) |
| error("gethostname_short failed: %m"); |
| else if (setenvf(NULL, "SLURM_SUBMIT_HOST", "%s", host) < 0) |
| error("unable to set SLURM_SUBMIT_HOST in environment"); |
| } |
| |
| /* Returns 0 on success, -1 on failure */ |
| static int _fill_job_desc_from_opts(job_desc_msg_t *desc) |
| { |
| if (!desc) |
| return -1; |
| |
| desc->wait_all_nodes = saopt.wait_all_nodes; |
| desc->argv = opt.argv; |
| desc->argc = opt.argc; |
| |
| return 0; |
| } |
| |
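/* Ring the terminal bell if stdout is a terminal (used for --bell). */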
| static void _ring_terminal_bell(void) |
| { |
| if (isatty(STDOUT_FILENO)) { |
| fprintf(stdout, "\a"); |
| fflush(stdout); |
| } |
| } |
| |
| /* returns the pid of the forked command, or <0 on error */ |
| static pid_t _fork_command(char **command) |
| { |
| pid_t pid; |
| |
| pid = fork(); |
| if (pid < 0) { |
| error("%s: fork failed: %m", |
| __func__); |
| } else if (pid == 0) { |
| /* child */ |
| char *cwd = opt.chdir ? opt.chdir : work_dir; |
| char *cpath = search_path(cwd, command[0], true, X_OK, true); |
| |
| xassert(cwd); |
| if (!cpath) { |
| error("%s: Unable to find command \"%s\"", |
| __func__, command[0]); |
| _exit(error_exit); |
| } |
| |
| setpgid(getpid(), 0); |
| |
| /* |
| * Reset job control signals. |
| * Suspend (TSTP) is not restored (ignored, as in the parent): |
| * shells with job-control override this and look after their |
| * processes. |
| * Suspending single commands is more complex and would require |
| * adding full shell-like job control to salloc. |
| */ |
| xsignal(SIGINT, SIG_DFL); |
| xsignal(SIGQUIT, SIG_DFL); |
| xsignal(SIGTTIN, SIG_DFL); |
| xsignal(SIGTTOU, SIG_DFL); |
| |
| execvp(cpath, command); |
| |
| /* should only get here if execvp failed */ |
| error("%s: Unable to exec command \"%s\": %m", |
| __func__, cpath); |
| xfree(cpath); |
| _exit(error_exit); |
| } |
| /* parent returns */ |
| return pid; |
| } |
| |
| static void _pending_callback(uint32_t job_id) |
| { |
| info("Pending job allocation %u", job_id); |
| my_job_id = job_id; |
| |
| /* call cli_filter post_submit here so it runs while allocating */ |
| _salloc_cli_filter_post_submit(my_job_id, NO_VAL); |
| } |
| |
| /* |
| * Run cli_filter_post_submit on all opt structures |
| * Convenience function since this might need to run in two spots |
| * uses a static bool to prevent multiple executions |
| */ |
| static void _salloc_cli_filter_post_submit(uint32_t jobid, uint32_t stepid) |
| { |
| int idx = 0; |
| if (_cli_filter_post_submit_run) |
| return; |
| for (idx = 0; idx < het_job_limit; idx++) |
| cli_filter_g_post_submit(idx, jobid, stepid); |
| |
| _cli_filter_post_submit_run = true; |
| } |
| |
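/* Handler for SIGHUP while waiting on the spawned command: forward the signal
 * to the command's process group and flag salloc for exit. */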
| static void _exit_on_signal(int signo) |
| { |
| _forward_signal(signo); |
| exit_flag = true; |
| } |
| |
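/* Deliver signo to the command's process group, if the command was started. */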
| static void _forward_signal(int signo) |
| { |
| if (command_pid > 0) |
| killpg(command_pid, signo); |
| } |
| |
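/* Handler for signals received while waiting for the allocation: mark the
 * request as interrupted and, if a job ID was already assigned, release it. */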
| static void _signal_while_allocating(int signo) |
| { |
| allocation_interrupted = true; |
| if (my_job_id != 0) { |
| slurm_complete_job(my_job_id, 128 + signo); |
| } |
| } |
| |
| /* This typically signifies the job was cancelled by scancel */ |
| static void _job_complete_handler(srun_job_complete_msg_t *comp) |
| { |
| if (!is_het_job && my_job_id && (my_job_id != comp->job_id)) { |
| error("Ignoring job_complete for job %u because our job ID is %u", |
| comp->job_id, my_job_id); |
| return; |
| } |
| |
| if (comp->step_id == NO_VAL) { |
| slurm_mutex_lock(&allocation_state_lock); |
| if (allocation_state != REVOKED) { |
| /* If the allocation_state is already REVOKED, then |
| * no need to print this message. We probably |
| * relinquished the allocation ourself. |
| */ |
| if (last_timeout && (last_timeout < time(NULL))) { |
| info("Job %u has exceeded its time limit and " |
| "its allocation has been revoked.", |
| comp->job_id); |
| } else { |
| info("Job allocation %u has been revoked.", |
| comp->job_id); |
| allocation_revoked = true; |
| } |
| } |
| allocation_state = REVOKED; |
| slurm_cond_broadcast(&allocation_state_cond); |
| slurm_mutex_unlock(&allocation_state_lock); |
| /* |
| * Clean up child process: only if the forked process has not |
| * yet changed state (waitpid returning 0). |
| */ |
| if ((command_pid > -1) && |
| (waitpid(command_pid, NULL, WNOHANG) == 0)) { |
| int signal = 0; |
| |
| if (is_interactive) { |
| pid_t tpgid = tcgetpgrp(STDIN_FILENO); |
| /* |
| * This happens if the command forks further |
| * subprocesses, e.g. a user shell (since we |
| * are ignoring TSTP, the process must have |
| * originated from salloc). Notify foreground |
| * process about pending termination. |
| */ |
| if (tpgid != command_pid && tpgid != getpgrp()) |
| killpg(tpgid, SIGHUP); |
| } |
| |
| if (saopt.kill_command_signal) |
| signal = saopt.kill_command_signal; |
| #ifdef SALLOC_KILL_CMD |
| else if (is_interactive) |
| signal = SIGHUP; |
| else |
| signal = SIGTERM; |
| #endif |
| if (signal) { |
| verbose("Sending signal %d to command \"%s\"," |
| " pid %d", |
| signal, opt.argv[0], command_pid); |
| if (suspend_flag) |
| _forward_signal(SIGCONT); |
| _forward_signal(signal); |
| } |
| } |
| } else { |
| verbose("%ps is finished.", comp); |
| } |
| } |
| |
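/* Report job suspend/resume notifications from slurmctld. */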
| static void _job_suspend_handler(suspend_msg_t *msg) |
| { |
| if (msg->op == SUSPEND_JOB) { |
| verbose("job has been suspended"); |
| } else if (msg->op == RESUME_JOB) { |
| verbose("job has been resumed"); |
| } |
| } |
| |
| /* |
 * Job has been notified of its approaching time limit.
| * Job will be killed shortly after timeout. |
| * This RPC can arrive multiple times with the same or updated timeouts. |
| * FIXME: We may want to signal the job or perform other action for this. |
| * FIXME: How much lead time do we want for this message? Some jobs may |
| * require tens of minutes to gracefully terminate. |
| */ |
| static void _timeout_handler(srun_timeout_msg_t *msg) |
| { |
| if (msg->timeout != last_timeout) { |
| last_timeout = msg->timeout; |
| verbose("Job allocation time limit to be reached at %s", |
| slurm_ctime2(&msg->timeout)); |
| } |
| } |
| |
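/* Print a message forwarded by slurmctld (e.g. sent with "scontrol notify"). */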
| static void _user_msg_handler(srun_user_msg_t *msg) |
| { |
| info("%s", msg->msg); |
| } |
| |
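/* Report the failure of one or more allocated nodes. */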
| static void _node_fail_handler(srun_node_fail_msg_t *msg) |
| { |
| error("Node failure on %s", msg->nodelist); |
| } |
| |
| /* returns 1 if job and nodes are ready for job to begin, 0 otherwise */ |
| static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc) |
| { |
| double cur_delay = 0; |
| double cur_sleep = 0; |
| int is_ready = 0, i = 0, rc; |
| bool job_killed = false; |
| |
| if (saopt.wait_all_nodes == NO_VAL16) |
| saopt.wait_all_nodes = 0; |
| |
| while (true) { |
| if (i) { |
| /* |
| * First sleep should be very quick to improve |
| * responsiveness. |
| * |
| * Otherwise, increment by POLL_SLEEP for every loop. |
| */ |
| if (cur_delay == 0) |
| cur_sleep = 0.1; |
| else if (cur_sleep < 300) |
| cur_sleep = POLL_SLEEP * i; |
| if (i == 2) |
| info("Waiting for resource configuration"); |
| else if (i > 2) |
| debug("Waited %f sec and still waiting: next sleep for %f sec", |
| cur_delay, cur_sleep); |
| usleep(USEC_IN_SEC * cur_sleep); |
| cur_delay += cur_sleep; |
| } |
| i += 1; |
| |
| rc = slurm_job_node_ready(alloc->job_id); |
| if (rc == READY_JOB_FATAL) |
| break; /* fatal error */ |
| if (allocation_interrupted || allocation_revoked) |
| break; |
| if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) |
| continue; /* retry */ |
| if ((rc & READY_JOB_STATE) == 0) { /* job killed */ |
| job_killed = true; |
| break; |
| } |
| if ((rc & READY_JOB_STATE) && |
| (rc & READY_PROLOG_STATE) && |
| ((rc & READY_NODE_STATE) || !saopt.wait_all_nodes)) { |
| is_ready = 1; |
| break; |
| } |
| } |
| if (is_ready) { |
| info("Nodes %s are ready for job", alloc->node_list); |
| } else if (!allocation_interrupted) { |
| if (job_killed || allocation_revoked) { |
| error("Job allocation %u has been revoked", |
| alloc->job_id); |
| allocation_interrupted = true; |
| } else |
| error("Nodes %s are still not ready", alloc->node_list); |
	} else /* allocation_interrupted or slurmctld not responding */
| is_ready = 0; |
| |
| return is_ready; |
| } |