| /*****************************************************************************\ |
| * src/srun/allocate.c - srun functions for managing node allocations |
| ***************************************************************************** |
| * Copyright (C) 2002-2007 The Regents of the University of California. |
| * Copyright (C) 2008-2010 Lawrence Livermore National Security. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Mark Grondona <mgrondona@llnl.gov>. |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #include <poll.h> |
| #include <pwd.h> |
| #include <stdlib.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| #include "src/common/env.h" |
| #include "src/common/fd.h" |
| #include "src/common/forward.h" |
| #include "src/interfaces/gres.h" |
| #include "src/common/list.h" |
| #include "src/common/log.h" |
| #include "src/common/macros.h" |
| #include "src/common/proc_args.h" |
| #include "src/interfaces/auth.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/slurm_time.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xsignal.h" |
| #include "src/common/xstring.h" |
| |
| #include "allocate.h" |
| #include "opt.h" |
| #include "launch.h" |
| |
| #define MAX_ALLOC_WAIT 60 /* seconds */ |
| #define MIN_ALLOC_WAIT 5 /* seconds */ |
| #define MAX_RETRIES 10 |
| #define POLL_SLEEP 0.5 /* retry interval in seconds */ |
| |
| pthread_mutex_t msg_lock = PTHREAD_MUTEX_INITIALIZER; |
| pthread_cond_t msg_cond = PTHREAD_COND_INITIALIZER; |
| allocation_msg_thread_t *msg_thr = NULL; |
| struct pollfd global_fds[1]; |
| |
| extern char **environ; |
| |
| static uint32_t pending_job_id = 0; |
| |
| /* |
| * Static Prototypes |
| */ |
| static job_desc_msg_t *_job_desc_msg_create_from_opts(slurm_opt_t *opt_local); |
| static void _set_pending_job_id(uint32_t job_id); |
| static void _signal_while_allocating(int signo); |
| static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc); |
| |
| static sig_atomic_t destroy_job = 0; |
| static bool is_het_job = false; |
| static bool revoke_job = false; |
| |
| static void _set_pending_job_id(uint32_t job_id) |
| { |
| debug2("Pending job allocation %u", job_id); |
| pending_job_id = job_id; |
| } |
| |
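| /* |
| * Runs in a detached thread spawned by _signal_while_allocating() so |
| * that the signal handler itself never calls logging or RPC code. |
| * Frees the heap-allocated signal number and completes any pending |
| * job allocation with an exit code of 128 + signal number. |
| */ |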
| static void *_safe_signal_while_allocating(void *in_data) |
| { |
| int signo = *(int *)in_data; |
| |
| debug("Got signal %d", signo); |
| xfree(in_data); |
| if (pending_job_id != 0) { |
| slurm_complete_job(pending_job_id, 128 + signo); |
| } |
| |
| return NULL; |
| } |
| |
| static void _signal_while_allocating(int signo) |
| { |
| int *local_signal; |
| |
| /* |
| * This function is installed directly as a signal handler, so it |
| * cannot safely use the log functions: if the signal interrupts a |
| * thread that is inside a poll (or otherwise holds something the |
| * logger needs), printing here can deadlock. Any work that might |
| * log is therefore deferred to the detached thread spawned below. |
| * |
| * SO, DON'T PRINT ANYTHING IN THIS FUNCTION. |
| */ |
| if (signo == SIGCONT) |
| return; |
| |
| destroy_job = 1; |
| |
| local_signal = xmalloc(sizeof(int)); |
| *local_signal = signo; |
| slurm_thread_create_detached(_safe_signal_while_allocating, |
| local_signal); |
| } |
| |
| /* This typically signifies the job was cancelled by scancel */ |
| static void _job_complete_handler(srun_job_complete_msg_t *msg) |
| { |
| if (!is_het_job && pending_job_id && (pending_job_id != msg->job_id)) { |
| error("Ignoring job_complete for job %u because our job ID is %u", |
| msg->job_id, pending_job_id); |
| return; |
| } |
| |
| /* Only print if we know we were signaled */ |
| if (destroy_job) |
| info("Force Terminated %ps", msg); |
| revoke_job = true; |
| } |
| |
| /* |
| * Job has been notified of its approaching time limit. |
| * Job will be killed shortly after timeout. |
| * This RPC can arrive multiple times with the same or updated timeouts. |
| * FIXME: We may want to signal the job or perform other action for this. |
| * FIXME: How much lead time do we want for this message? Some jobs may |
| * require tens of minutes to gracefully terminate. |
| */ |
| static void _timeout_handler(srun_timeout_msg_t *msg) |
| { |
| static time_t last_timeout = 0; |
| |
| if (msg->timeout != last_timeout) { |
| last_timeout = msg->timeout; |
| verbose("job time limit to be reached at %s", |
| slurm_ctime2(&msg->timeout)); |
| } |
| } |
| |
| static void _user_msg_handler(srun_user_msg_t *msg) |
| { |
| info("%s", msg->msg); |
| } |
| |
| static void _node_fail_handler(srun_node_fail_msg_t *msg) |
| { |
| error("Node failure on %s", msg->nodelist); |
| } |
| |
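| /* |
| * Decide whether a failed allocation request should be retried. |
| * Returns true for transient errors (controller not responding, |
| * interrupted syscall, or stray traffic on the srun message socket) |
| * and false once the error is fatal, --immediate cannot be |
| * satisfied, or the retry limit is reached. |
| */ |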
| static bool _retry(void) |
| { |
| static int retries = 0; |
| static char *msg = "Slurm controller not responding, " |
| "sleeping and retrying."; |
| |
| if ((errno == ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || (errno == EAGAIN)) { |
| if (retries == 0) |
| error("%s", msg); |
| else if (retries < MAX_RETRIES) |
| debug("%s", msg); |
| else |
| return false; |
| sleep (++retries); |
| } else if (errno == EINTR) { |
| /* srun may be interrupted by the BLCR checkpoint signal */ |
| /* |
| * XXX: this will cause the old job to be cancelled and a new |
| * job to be allocated |
| */ |
| debug("Syscall interrupted while allocating resources, " |
| "retrying."); |
| return true; |
| } else if (opt.immediate && |
| ((errno == ETIMEDOUT) || (errno == ESLURM_NODES_BUSY) || |
| (errno == ESLURM_PORTS_BUSY))) { |
| error("Unable to allocate resources: %s", |
| slurm_strerror(ESLURM_NODES_BUSY)); |
| error_exit = immediate_exit; |
| return false; |
| } else if ((errno == SLURM_PROTOCOL_AUTHENTICATION_ERROR) || |
| (errno == SLURM_UNEXPECTED_MSG_ERROR) || |
| (errno == SLURM_PROTOCOL_INSANE_MSG_LENGTH)) { |
| static int external_msg_count = 0; |
| error("Srun communication socket apparently being written to " |
| "by something other than Slurm"); |
| if (external_msg_count++ < 4) |
| return true; |
| error("Unable to allocate resources: %m"); |
| return false; |
| } else { |
| error("Unable to allocate resources: %m"); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| /* returns 1 if job and nodes are ready for job to begin, 0 otherwise */ |
| static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc) |
| { |
| double cur_delay = 0; |
| double cur_sleep = 0; |
| int is_ready = 0, i = 0, rc; |
| bool job_killed = false; |
| |
| pending_job_id = alloc->job_id; |
| |
| while (true) { |
| if (i) { |
| /* |
| * First sleep should be very quick to improve |
| * responsiveness. |
| * |
| * Otherwise, increment by POLL_SLEEP for every loop. |
| */ |
| if (cur_delay == 0) |
| cur_sleep = 0.1; |
| else if (cur_sleep < 300) |
| cur_sleep = POLL_SLEEP * i; |
| if (i == 1) |
| verbose("Waiting for resource configuration"); |
| else |
| debug("Waited %f sec and still waiting: next sleep for %f sec", |
| cur_delay, cur_sleep); |
| usleep(USEC_IN_SEC * cur_sleep); |
| cur_delay += cur_sleep; |
| } |
| i += 1; |
| |
| rc = slurm_job_node_ready(alloc->job_id); |
| if (rc == READY_JOB_FATAL) |
| break; /* fatal error */ |
| if (destroy_job || revoke_job) |
| break; |
| if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) |
| continue; /* retry */ |
| if ((rc & READY_JOB_STATE) == 0) { /* job killed */ |
| job_killed = true; |
| break; |
| } |
| if ((rc & READY_NODE_STATE) && |
| (rc & READY_PROLOG_STATE)) { |
| is_ready = 1; |
| break; |
| } |
| } |
| if (is_ready) { |
| if (i > 0) |
| verbose("Nodes %s are ready for job", alloc->node_list); |
| } else if (!destroy_job) { |
| if (job_killed) { |
| error("Job allocation %u has been revoked", |
| alloc->job_id); |
| destroy_job = true; |
| } else |
| error("Nodes %s are still not ready", alloc->node_list); |
| } else /* allocation interrupted and slurmctld not responding */ |
| is_ready = 0; |
| |
| pending_job_id = 0; |
| |
| return is_ready; |
| } |
| |
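| /* |
| * Build a job description from opt_local, resolve the target cluster |
| * if --clusters was given, and ask the controller when (or whether) |
| * the job could run via slurm_job_will_run(), without creating an |
| * allocation. |
| */ |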
| static int _allocate_test(slurm_opt_t *opt_local) |
| { |
| job_desc_msg_t *j; |
| int rc; |
| |
| if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL) |
| return SLURM_ERROR; |
| |
| if (opt_local->clusters && |
| (slurmdb_get_first_avail_cluster(j, opt_local->clusters, |
| &working_cluster_rec) |
| != SLURM_SUCCESS)) { |
| print_db_notok(opt_local->clusters, 0); |
| return SLURM_ERROR; |
| } |
| |
| rc = slurm_job_will_run(j); |
| job_desc_msg_destroy(j); |
| return rc; |
| } |
| |
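| /* |
| * Run the will-run test for every component in opt_list (het job) or |
| * for the single option set in opt, stopping at the first component |
| * that fails the test. |
| */ |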
| extern int allocate_test(void) |
| { |
| int rc = SLURM_SUCCESS; |
| list_itr_t *iter; |
| slurm_opt_t *opt_local; |
| |
| if (opt_list) { |
| iter = list_iterator_create(opt_list); |
| while ((opt_local = list_next(iter))) { |
| if ((rc = _allocate_test(opt_local)) != SLURM_SUCCESS) |
| break; |
| } |
| list_iterator_destroy(iter); |
| } else { |
| rc = _allocate_test(&opt); |
| } |
| |
| return rc; |
| } |
| |
| /* |
| * Allocate nodes from the slurm controller -- retrying the attempt |
| * if the controller appears to be down, and optionally waiting for |
| * resources if none are currently available (see opt.immediate) |
| * |
| * Returns a pointer to a resource_allocation_response_msg which must |
| * be freed with slurm_free_resource_allocation_response_msg() |
| */ |
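| /* |
| * Typical call pattern (a sketch only; the real srun caller also |
| * handles het jobs, environment setup and step launch): |
| * |
| *   resource_allocation_response_msg_t *resp = allocate_nodes(&opt); |
| *   if (resp != NULL) { |
| *           // ... create the job and launch steps here ... |
| *           slurm_free_resource_allocation_response_msg(resp); |
| *   } |
| */ |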
| extern resource_allocation_response_msg_t *allocate_nodes( |
| slurm_opt_t *opt_local) |
| { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| resource_allocation_response_msg_t *resp = NULL; |
| job_desc_msg_t *j; |
| slurm_allocation_callbacks_t callbacks; |
| int i; |
| |
| xassert(srun_opt); |
| |
| if (srun_opt->relative != NO_VAL) |
| fatal("--relative option invalid for job allocation request"); |
| |
| if ((j = _job_desc_msg_create_from_opts(&opt)) == NULL) |
| return NULL; |
| |
| if (opt_local->clusters && |
| (slurmdb_get_first_avail_cluster(j, opt_local->clusters, |
| &working_cluster_rec) |
| != SLURM_SUCCESS)) { |
| print_db_notok(opt_local->clusters, 0); |
| return NULL; |
| } |
| |
| j->origin_cluster = xstrdup(slurm_conf.cluster_name); |
| |
| callbacks.timeout = _timeout_handler; |
| callbacks.job_complete = _job_complete_handler; |
| callbacks.job_suspend = NULL; |
| callbacks.user_msg = _user_msg_handler; |
| callbacks.node_fail = _node_fail_handler; |
| |
| /* create message thread to handle pings and such from slurmctld */ |
| msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks); |
| |
| /* NOTE: Do not process signals in a separate pthread. The signal will |
| * cause slurm_allocate_resources_blocking() to exit immediately. */ |
| xsignal_unblock(sig_array); |
| for (i = 0; sig_array[i]; i++) |
| xsignal(sig_array[i], _signal_while_allocating); |
| |
| while (!resp) { |
| resp = slurm_allocate_resources_blocking(j, |
| opt_local->immediate, |
| _set_pending_job_id); |
| if (destroy_job) { |
| if (pending_job_id != 0) |
| info("Job allocation %u has been revoked", |
| pending_job_id); |
| /* cancelled by signal */ |
| break; |
| } else if (!resp && !_retry()) { |
| break; |
| } |
| } |
| |
| if (resp) |
| print_multi_line_string(resp->job_submit_user_msg, |
| -1, LOG_LEVEL_INFO); |
| |
| if (resp && !destroy_job) { |
| /* |
| * Allocation granted! |
| */ |
| pending_job_id = resp->job_id; |
| |
| /* |
| * These values may have changed while the job was pending, so |
| * overwrite the request with what was actually allocated to avoid |
| * inconsistencies when they are used for step creation. |
| */ |
| opt_local->pn_min_memory = NO_VAL64; |
| opt_local->mem_per_cpu = NO_VAL64; |
| if (resp->pn_min_memory != NO_VAL64) { |
| if (resp->pn_min_memory & MEM_PER_CPU) { |
| opt_local->mem_per_cpu = (resp->pn_min_memory & |
| (~MEM_PER_CPU)); |
| } else { |
| opt_local->pn_min_memory = resp->pn_min_memory; |
| } |
| } |
| |
| opt_local->min_nodes = resp->node_cnt; |
| opt_local->max_nodes = resp->node_cnt; |
| xfree(opt_local->gres); |
| opt_local->gres = xstrdup(resp->tres_per_node); |
| |
| if (resp->working_cluster_rec) |
| slurm_setup_remote_working_cluster(resp); |
| |
| if (!_wait_nodes_ready(resp)) { |
| if (!destroy_job) |
| error("Something is wrong with the boot of the nodes."); |
| goto relinquish; |
| } |
| } else if (destroy_job || revoke_job) { |
| goto relinquish; |
| } |
| |
| xsignal_block(sig_array); |
| |
| job_desc_msg_destroy(j); |
| |
| return resp; |
| |
| relinquish: |
| if (resp) { |
| if (destroy_job || revoke_job) |
| slurm_complete_job(resp->job_id, 1); |
| slurm_free_resource_allocation_response_msg(resp); |
| } |
| exit(error_exit); |
| return NULL; |
| } |
| |
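| /* |
| * list_for_each() callback: copy the response port opened by the |
| * allocation message thread into every het job component so that all |
| * components share the same callback port. |
| */ |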
| static int _copy_other_port(void *x, void *arg) |
| { |
| job_desc_msg_t *desc = x; |
| desc->other_port = *(uint16_t *)arg; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Allocate nodes for heterogeneous job from the slurm controller -- |
| * retrying the attempt if the controller appears to be down, and optionally |
| * waiting for resources if none are currently available (see opt.immediate) |
| * |
| * Returns a pointer to a resource_allocation_response_msg which must |
| * be freed with slurm_free_resource_allocation_response_msg() |
| */ |
| extern list_t *allocate_het_job_nodes(void) |
| { |
| resource_allocation_response_msg_t *resp = NULL; |
| job_desc_msg_t *j, *first_job = NULL; |
| slurm_allocation_callbacks_t callbacks; |
| list_itr_t *opt_iter, *resp_iter; |
| slurm_opt_t *opt_local, *first_opt = NULL; |
| list_t *job_req_list = NULL, *job_resp_list = NULL; |
| uint32_t my_job_id = 0; |
| int i, k; |
| |
| job_req_list = list_create(NULL); |
| opt_iter = list_iterator_create(opt_list); |
| while ((opt_local = list_next(opt_iter))) { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| xassert(srun_opt); |
| if (!first_opt) |
| first_opt = opt_local; |
| if (srun_opt->relative != NO_VAL) |
| fatal("--relative option invalid for job allocation request"); |
| |
| if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL) { |
| FREE_NULL_LIST(job_req_list); |
| return NULL; |
| } |
| if (!first_job) |
| first_job = j; |
| |
| j->origin_cluster = xstrdup(slurm_conf.cluster_name); |
| |
| list_append(job_req_list, j); |
| } |
| list_iterator_destroy(opt_iter); |
| |
| if (!first_job) { |
| error("%s: No job requests found", __func__); |
| FREE_NULL_LIST(job_req_list); |
| return NULL; |
| } |
| |
| if (first_opt && first_opt->clusters && |
| (slurmdb_get_first_het_job_cluster(job_req_list, |
| first_opt->clusters, |
| &working_cluster_rec) |
| != SLURM_SUCCESS)) { |
| print_db_notok(first_opt->clusters, 0); |
| FREE_NULL_LIST(job_req_list); |
| return NULL; |
| } |
| |
| callbacks.timeout = _timeout_handler; |
| callbacks.job_complete = _job_complete_handler; |
| callbacks.job_suspend = NULL; |
| callbacks.user_msg = _user_msg_handler; |
| callbacks.node_fail = _node_fail_handler; |
| |
| /* create message thread to handle pings and such from slurmctld */ |
| msg_thr = slurm_allocation_msg_thr_create(&first_job->other_port, |
| &callbacks); |
| list_for_each(job_req_list, _copy_other_port, &first_job->other_port); |
| |
| /* NOTE: Do not process signals in a separate pthread. The signal will |
| * cause slurm_allocate_resources_blocking() to exit immediately. */ |
| xsignal_unblock(sig_array); |
| for (i = 0; sig_array[i]; i++) |
| xsignal(sig_array[i], _signal_while_allocating); |
| |
| is_het_job = true; |
| |
| while (first_opt && !job_resp_list) { |
| job_resp_list = slurm_allocate_het_job_blocking(job_req_list, |
| first_opt->immediate, _set_pending_job_id); |
| if (destroy_job) { |
| /* cancelled by signal */ |
| if (pending_job_id != 0) |
| info("Job allocation %u has been revoked", |
| pending_job_id); |
| break; |
| } else if (!job_resp_list && !_retry()) { |
| break; |
| } |
| } |
| FREE_NULL_LIST(job_req_list); |
| |
| if (job_resp_list && !destroy_job) { |
| /* |
| * Allocation granted! |
| */ |
| |
| opt_iter = list_iterator_create(opt_list); |
| resp_iter = list_iterator_create(job_resp_list); |
| while ((opt_local = list_next(opt_iter))) { |
| resp = (resource_allocation_response_msg_t *) |
| list_next(resp_iter); |
| if (!resp) |
| break; |
| |
| if (pending_job_id == 0) |
| pending_job_id = resp->job_id; |
| if (my_job_id == 0) { |
| my_job_id = resp->job_id; |
| i = list_count(opt_list); |
| k = list_count(job_resp_list); |
| if (i != k) { |
| error("%s: request count != response count (%d != %d)", |
| __func__, i, k); |
| goto relinquish; |
| } |
| } |
| |
| /* |
| * These values may have changed while the job was pending, so |
| * overwrite the request with what was actually allocated to avoid |
| * inconsistencies when they are used for step creation. |
| */ |
| if (opt_local->pn_min_memory != NO_VAL64) |
| opt_local->pn_min_memory = |
| (resp->pn_min_memory & (~MEM_PER_CPU)); |
| else if (opt_local->mem_per_cpu != NO_VAL64) |
| opt_local->mem_per_cpu = |
| (resp->pn_min_memory & (~MEM_PER_CPU)); |
| |
| opt_local->min_nodes = resp->node_cnt; |
| opt_local->max_nodes = resp->node_cnt; |
| |
| xfree(opt_local->gres); |
| opt_local->gres = xstrdup(resp->tres_per_node); |
| |
| if (resp->working_cluster_rec) |
| slurm_setup_remote_working_cluster(resp); |
| |
| if (!_wait_nodes_ready(resp)) { |
| if (!destroy_job) |
| error("Something is wrong with the " |
| "boot of the nodes."); |
| goto relinquish; |
| } |
| } |
| list_iterator_destroy(resp_iter); |
| list_iterator_destroy(opt_iter); |
| } else if (destroy_job) { |
| goto relinquish; |
| } |
| |
| xsignal_block(sig_array); |
| |
| return job_resp_list; |
| |
| relinquish: |
| if (job_resp_list) { |
| if (my_job_id == 0) { |
| resp = (resource_allocation_response_msg_t *) |
| list_peek(job_resp_list); |
| my_job_id = resp->job_id; |
| } |
| |
| if (destroy_job && my_job_id) { |
| slurm_complete_job(my_job_id, 1); |
| } |
| FREE_NULL_LIST(job_resp_list); |
| } |
| exit(error_exit); |
| return NULL; |
| } |
| |
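| /* No-op handler for callers that need to catch a signal without |
| * taking any action. */ |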
| void |
| ignore_signal(int signo) |
| { |
| /* do nothing */ |
| } |
| |
| int |
| cleanup_allocation(void) |
| { |
| slurm_allocation_msg_thr_destroy(msg_thr); |
| return SLURM_SUCCESS; |
| } |
| |
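| /* |
| * Look up an existing allocation when a job ID is already known |
| * (--jobid or the SLURM_JOB_ID environment). Returns a list of |
| * allocation responses, one per het job component, or NULL if no job |
| * ID was set. On lookup failure srun exits, unless running under a |
| * parallel debugger, in which case a new allocation may be created. |
| */ |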
| extern list_t *existing_allocation(void) |
| { |
| uint32_t old_job_id; |
| list_t *job_resp_list = NULL; |
| |
| if (sropt.jobid == NO_VAL) |
| return NULL; |
| |
| if (opt.clusters) { |
| list_t *clusters = NULL; |
| if (slurm_get_cluster_info(&(clusters), opt.clusters, 0)) { |
| print_db_notok(opt.clusters, 0); |
| fatal("Could not get cluster information"); |
| } |
| working_cluster_rec = list_peek(clusters); |
| debug2("Looking for job %d on cluster %s (addr: %s)", |
| sropt.jobid, |
| working_cluster_rec->name, |
| working_cluster_rec->control_host); |
| } |
| |
| old_job_id = (uint32_t) sropt.jobid; |
| if (slurm_het_job_lookup(old_job_id, &job_resp_list) < 0) { |
| if (sropt.parallel_debug) |
| return NULL; /* create new allocation as needed */ |
| if (errno == ESLURM_ALREADY_DONE) |
| error("Slurm job %u has expired", old_job_id); |
| else |
| error("Unable to confirm allocation for job %u: %m", |
| old_job_id); |
| info("Check SLURM_JOB_ID environment variable. Expired or invalid job %u", |
| old_job_id); |
| exit(error_exit); |
| } |
| |
| return job_resp_list; |
| } |
| |
| /* |
| * Create job description structure based on srun options |
| * (see opt.h) |
| */ |
| static job_desc_msg_t *_job_desc_msg_create_from_opts(slurm_opt_t *opt_local) |
| { |
| srun_opt_t *srun_opt = opt_local->srun_opt; |
| job_desc_msg_t *j = slurm_opt_create_job_desc(opt_local, true); |
| |
| if (!j) { |
| return NULL; |
| } |
| |
| /* |
| * The controller rejects any non-stepmgr allocation requesting |
| * resv-ports. To allow srun to request --resv-ports outside of stepmgr |
| * jobs, clear resv_port_cnt when creating a non-stepmgr allocation. |
| */ |
| if ((opt_local->resv_port_cnt != NO_VAL) && |
| !(opt_local->job_flags & STEPMGR_ENABLED) && |
| !xstrstr(slurm_conf.slurmctld_params, "enable_stepmgr")) |
| j->resv_port_cnt = NO_VAL16; |
| |
| xassert(srun_opt); |
| |
| if (!j->name) |
| j->name = xstrdup(srun_opt->cmd_name); |
| |
| if (opt_local->argc > 0) { |
| j->argc = opt_local->argc; |
| j->argv = opt_local->argv; |
| } |
| |
| j->container = xstrdup(opt_local->container); |
| j->container_id = xstrdup(opt_local->container_id); |
| |
| if (srun_opt->cpu_bind) |
| j->cpu_bind = xstrdup(srun_opt->cpu_bind); |
| if (srun_opt->cpu_bind_type) |
| j->cpu_bind_type = srun_opt->cpu_bind_type; |
| |
| if (!j->x11 && opt.x11) { |
| j->x11 = opt.x11; |
| j->x11_magic_cookie = xstrdup(opt.x11_magic_cookie); |
| j->x11_target = xstrdup(opt.x11_target); |
| j->x11_target_port = opt.x11_target_port; |
| } |
| |
| j->wait_all_nodes = 1; |
| |
| return j; |
| } |
| |
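| /* |
| * Free a job description created by _job_desc_msg_create_from_opts(). |
| * Only req_nodes and the structure itself are released here; argv is |
| * borrowed from the options and must not be freed. |
| */ |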
| void |
| job_desc_msg_destroy(job_desc_msg_t *j) |
| { |
| if (j) { |
| xfree(j->req_nodes); |
| xfree(j); |
| } |
| } |
| |
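| /* |
| * Create the job step through the launch plugin. The allocation |
| * signal handler and the destroy_job flag are passed along so that |
| * step creation can be interrupted the same way as a pending |
| * allocation. |
| */ |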
| extern int create_job_step(srun_job_t *job, bool use_all_cpus, |
| slurm_opt_t *opt_local) |
| { |
| return launch_g_create_job_step(job, use_all_cpus, |
| _signal_while_allocating, |
| &destroy_job, opt_local); |
| } |