/*****************************************************************************\
* src/srun/allocate.c - srun functions for managing node allocations
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark Grondona <mgrondona@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <poll.h>
#include <pwd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include "src/common/env.h"
#include "src/common/fd.h"
#include "src/common/forward.h"
#include "src/interfaces/gres.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/proc_args.h"
#include "src/interfaces/auth.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_time.h"
#include "src/common/xmalloc.h"
#include "src/common/xsignal.h"
#include "src/common/xstring.h"
#include "allocate.h"
#include "opt.h"
#include "launch.h"
#define MAX_ALLOC_WAIT 60 /* seconds */
#define MIN_ALLOC_WAIT 5 /* seconds */
#define MAX_RETRIES 10
#define POLL_SLEEP 0.5 /* retry interval in seconds */
pthread_mutex_t msg_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t msg_cond = PTHREAD_COND_INITIALIZER;
allocation_msg_thread_t *msg_thr = NULL;
struct pollfd global_fds[1];
extern char **environ;
static uint32_t pending_job_id = 0;
/*
* Static Prototypes
*/
static job_desc_msg_t *_job_desc_msg_create_from_opts(slurm_opt_t *opt_local);
static void _set_pending_job_id(uint32_t job_id);
static void _signal_while_allocating(int signo);
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc);
static sig_atomic_t destroy_job = 0; /* set on signal or revoked allocation */
static bool is_het_job = false;      /* allocating a heterogeneous job */
static bool revoke_job = false;      /* slurmctld reported the job complete */
static void _set_pending_job_id(uint32_t job_id)
{
debug2("Pending job allocation %u", job_id);
pending_job_id = job_id;
}
static void *_safe_signal_while_allocating(void *in_data)
{
int signo = *(int *)in_data;
debug("Got signal %d", signo);
xfree(in_data);
if (pending_job_id != 0) {
slurm_complete_job(pending_job_id, 128 + signo);
}
return NULL;
}
static void _signal_while_allocating(int signo)
{
int *local_signal;
/*
 * This function runs in signal handler context, and the log
 * functions are not async-signal-safe: if the signal arrives
 * while the interrupted thread is inside poll() or holds the
 * logging lock, printing from here can deadlock. So once the
 * signal arrives we spawn a detached thread to do any work that
 * might log, and keep the handler itself silent.
 *
 * SO, DON'T PRINT ANYTHING IN THIS FUNCTION.
 */
if (signo == SIGCONT)
return;
destroy_job = 1;
local_signal = xmalloc(sizeof(int));
*local_signal = signo;
slurm_thread_create_detached(_safe_signal_while_allocating,
local_signal);
}
/* This typically signifies the job was cancelled by scancel */
static void _job_complete_handler(srun_job_complete_msg_t *msg)
{
if (!is_het_job && pending_job_id && (pending_job_id != msg->job_id)) {
error("Ignoring job_complete for job %u because our job ID is %u",
msg->job_id, pending_job_id);
return;
}
/* Only print if we know we were signaled */
if (destroy_job)
info("Force Terminated %ps", msg);
revoke_job = true;
}
/*
* Job has been notified of its approaching time limit.
* Job will be killed shortly after timeout.
* This RPC can arrive multiple times with the same or updated timeouts.
* FIXME: We may want to signal the job or perform other action for this.
* FIXME: How much lead time do we want for this message? Some jobs may
* require tens of minutes to gracefully terminate.
*/
static void _timeout_handler(srun_timeout_msg_t *msg)
{
static time_t last_timeout = 0;
if (msg->timeout != last_timeout) {
last_timeout = msg->timeout;
verbose("job time limit to be reached at %s",
slurm_ctime2(&msg->timeout));
}
}
static void _user_msg_handler(srun_user_msg_t *msg)
{
info("%s", msg->msg);
}
static void _node_fail_handler(srun_node_fail_msg_t *msg)
{
error("Node failure on %s", msg->nodelist);
}
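/*
 * Decide whether a failed allocation request should be retried.
 * Controller-not-responding errors back off linearly (sleep 1s,
 * 2s, ...) and give up after MAX_RETRIES attempts.
 */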
static bool _retry(void)
{
static int retries = 0;
static char *msg = "Slurm controller not responding, "
"sleeping and retrying.";
if ((errno == ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || (errno == EAGAIN)) {
if (retries == 0)
error("%s", msg);
else if (retries < MAX_RETRIES)
debug("%s", msg);
else
return false;
sleep (++retries);
} else if (errno == EINTR) {
/* srun may be interrupted by the BLCR checkpoint signal */
/*
 * XXX: this will cause the old job to be cancelled and a new
 * job to be allocated
 */
debug("Syscall interrupted while allocating resources, "
"retrying.");
return true;
} else if (opt.immediate &&
((errno == ETIMEDOUT) || (errno == ESLURM_NODES_BUSY) ||
(errno == ESLURM_PORTS_BUSY))) {
error("Unable to allocate resources: %s",
slurm_strerror(ESLURM_NODES_BUSY));
error_exit = immediate_exit;
return false;
} else if ((errno == SLURM_PROTOCOL_AUTHENTICATION_ERROR) ||
(errno == SLURM_UNEXPECTED_MSG_ERROR) ||
(errno == SLURM_PROTOCOL_INSANE_MSG_LENGTH)) {
static int external_msg_count = 0;
error("Srun communication socket apparently being written to "
"by something other than Slurm");
if (external_msg_count++ < 4)
return true;
error("Unable to allocate resources: %m");
return false;
} else {
error("Unable to allocate resources: %m");
return false;
}
return true;
}
/* Return 1 if the job and its nodes are ready for the job to begin, 0 otherwise */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
double cur_delay = 0;
double cur_sleep = 0;
int is_ready = 0, i = 0, rc;
bool job_killed = false;
pending_job_id = alloc->job_id;
while (true) {
if (i) {
/*
* First sleep should be very quick to improve
* responsiveness.
*
* Otherwise, increment by POLL_SLEEP for every loop.
*/
if (cur_delay == 0)
cur_sleep = 0.1;
else if (cur_sleep < 300)
cur_sleep = POLL_SLEEP * i;
if (i == 1)
verbose("Waiting for resource configuration");
else
debug("Waited %f sec and still waiting: next sleep for %f sec",
cur_delay, cur_sleep);
usleep(USEC_IN_SEC * cur_sleep);
cur_delay += cur_sleep;
}
i += 1;
rc = slurm_job_node_ready(alloc->job_id);
if (rc == READY_JOB_FATAL)
break; /* fatal error */
if (destroy_job || revoke_job)
break;
if ((rc == READY_JOB_ERROR) || (rc == EAGAIN))
continue; /* retry */
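/*
 * rc is a bit mask: a cleared READY_JOB_STATE bit means the job
 * has been killed or revoked, while READY_NODE_STATE and
 * READY_PROLOG_STATE indicate that all nodes have booted and the
 * prolog has completed.
 */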
if ((rc & READY_JOB_STATE) == 0) { /* job killed */
job_killed = true;
break;
}
if ((rc & READY_NODE_STATE) &&
(rc & READY_PROLOG_STATE)) {
is_ready = 1;
break;
}
}
if (is_ready) {
if (i > 0)
verbose("Nodes %s are ready for job", alloc->node_list);
} else if (!destroy_job) {
if (job_killed) {
error("Job allocation %u has been revoked",
alloc->job_id);
destroy_job = true;
} else
error("Nodes %s are still not ready", alloc->node_list);
} else /* destroy_job set by signal and slurmctld not responding */
is_ready = 0;
pending_job_id = 0;
return is_ready;
}
static int _allocate_test(slurm_opt_t *opt_local)
{
job_desc_msg_t *j;
int rc;
if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL)
return SLURM_ERROR;
if (opt_local->clusters &&
(slurmdb_get_first_avail_cluster(j, opt_local->clusters,
&working_cluster_rec)
!= SLURM_SUCCESS)) {
print_db_notok(opt_local->clusters, 0);
job_desc_msg_destroy(j); /* avoid leaking the job description */
return SLURM_ERROR;
}
rc = slurm_job_will_run(j);
job_desc_msg_destroy(j);
return rc;
}
extern int allocate_test(void)
{
int rc = SLURM_SUCCESS;
list_itr_t *iter;
slurm_opt_t *opt_local;
if (opt_list) {
iter = list_iterator_create(opt_list);
while ((opt_local = list_next(iter))) {
if ((rc = _allocate_test(opt_local)) != SLURM_SUCCESS)
break;
}
list_iterator_destroy(iter);
} else {
rc = _allocate_test(&opt);
}
return rc;
}
/*
* Allocate nodes from the slurm controller -- retrying the attempt
* if the controller appears to be down, and optionally waiting for
* resources if none are currently available (see opt.immediate)
*
* Returns a pointer to a resource_allocation_response_msg which must
* be freed with slurm_free_resource_allocation_response_msg()
*/
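/*
 * Typical caller pattern (a sketch only; error handling and the
 * surrounding srun job setup are elided):
 *
 *	resource_allocation_response_msg_t *resp = allocate_nodes(&opt);
 *	if (resp) {
 *		... run job steps on resp->node_list ...
 *		slurm_free_resource_allocation_response_msg(resp);
 *	}
 */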
extern resource_allocation_response_msg_t *allocate_nodes(
slurm_opt_t *opt_local)
{
srun_opt_t *srun_opt = opt_local->srun_opt;
resource_allocation_response_msg_t *resp = NULL;
job_desc_msg_t *j;
slurm_allocation_callbacks_t callbacks;
int i;
xassert(srun_opt);
if (srun_opt->relative != NO_VAL)
fatal("--relative option invalid for job allocation request");
if ((j = _job_desc_msg_create_from_opts(&opt)) == NULL)
return NULL;
if (opt_local->clusters &&
(slurmdb_get_first_avail_cluster(j, opt_local->clusters,
&working_cluster_rec)
!= SLURM_SUCCESS)) {
print_db_notok(opt_local->clusters, 0);
job_desc_msg_destroy(j); /* avoid leaking the job description */
return NULL;
}
j->origin_cluster = xstrdup(slurm_conf.cluster_name);
callbacks.timeout = _timeout_handler;
callbacks.job_complete = _job_complete_handler;
callbacks.job_suspend = NULL;
callbacks.user_msg = _user_msg_handler;
callbacks.node_fail = _node_fail_handler;
/* create message thread to handle pings and such from slurmctld */
msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);
/* NOTE: Do not process signals in separate pthread. The signal will
* cause slurm_allocate_resources_blocking() to exit immediately. */
xsignal_unblock(sig_array);
for (i = 0; sig_array[i]; i++)
xsignal(sig_array[i], _signal_while_allocating);
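/*
 * Block until the controller grants the allocation, a signal
 * interrupts the request, or _retry() decides the error is fatal.
 */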
while (!resp) {
resp = slurm_allocate_resources_blocking(j,
opt_local->immediate,
_set_pending_job_id);
if (destroy_job) {
if (pending_job_id != 0)
info("Job allocation %u has been revoked",
pending_job_id);
/* cancelled by signal */
break;
} else if (!resp && !_retry()) {
break;
}
}
if (resp)
print_multi_line_string(resp->job_submit_user_msg,
-1, LOG_LEVEL_INFO);
if (resp && !destroy_job) {
/*
* Allocation granted!
*/
pending_job_id = resp->job_id;
/*
* These values could be changed while the job was
* pending so overwrite the request with what was
* allocated so we don't have issues when we use them
* in the step creation.
*/
opt_local->pn_min_memory = NO_VAL64;
opt_local->mem_per_cpu = NO_VAL64;
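/*
 * resp->pn_min_memory encodes both limits in one value: if the
 * MEM_PER_CPU flag bit is set the remaining bits are a per-CPU
 * limit, otherwise they are a per-node limit.
 */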
if (resp->pn_min_memory != NO_VAL64) {
if (resp->pn_min_memory & MEM_PER_CPU) {
opt_local->mem_per_cpu = (resp->pn_min_memory &
(~MEM_PER_CPU));
} else {
opt_local->pn_min_memory = resp->pn_min_memory;
}
}
opt_local->min_nodes = resp->node_cnt;
opt_local->max_nodes = resp->node_cnt;
xfree(opt_local->gres);
opt_local->gres = xstrdup(resp->tres_per_node);
if (resp->working_cluster_rec)
slurm_setup_remote_working_cluster(resp);
if (!_wait_nodes_ready(resp)) {
if (!destroy_job)
error("Something is wrong with the boot of the nodes.");
goto relinquish;
}
} else if (destroy_job || revoke_job) {
goto relinquish;
}
xsignal_block(sig_array);
job_desc_msg_destroy(j);
return resp;
relinquish:
if (resp) {
if (destroy_job || revoke_job)
slurm_complete_job(resp->job_id, 1);
slurm_free_resource_allocation_response_msg(resp);
}
exit(error_exit);
return NULL;
}
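/*
 * list_for_each() callback: point every het job component at the
 * same srun callback port, the one registered for the first
 * component by slurm_allocation_msg_thr_create().
 */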
static int _copy_other_port(void *x, void *arg)
{
job_desc_msg_t *desc = x;
desc->other_port = *(uint16_t *)arg;
return SLURM_SUCCESS;
}
/*
* Allocate nodes for heterogeneous job from the slurm controller --
* retrying the attempt if the controller appears to be down, and optionally
* waiting for resources if none are currently available (see opt.immediate)
*
* Returns a list of resource_allocation_response_msg structures; each
* entry must be freed with slurm_free_resource_allocation_response_msg()
*/
list_t *allocate_het_job_nodes(void)
{
resource_allocation_response_msg_t *resp = NULL;
job_desc_msg_t *j, *first_job = NULL;
slurm_allocation_callbacks_t callbacks;
list_itr_t *opt_iter, *resp_iter;
slurm_opt_t *opt_local, *first_opt = NULL;
list_t *job_req_list = NULL, *job_resp_list = NULL;
uint32_t my_job_id = 0;
int i, k;
job_req_list = list_create(NULL);
opt_iter = list_iterator_create(opt_list);
while ((opt_local = list_next(opt_iter))) {
srun_opt_t *srun_opt = opt_local->srun_opt;
xassert(srun_opt);
if (!first_opt)
first_opt = opt_local;
if (srun_opt->relative != NO_VAL)
fatal("--relative option invalid for job allocation request");
if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL) {
FREE_NULL_LIST(job_req_list);
return NULL;
}
if (!first_job)
first_job = j;
j->origin_cluster = xstrdup(slurm_conf.cluster_name);
list_append(job_req_list, j);
}
list_iterator_destroy(opt_iter);
if (!first_job) {
error("%s: No job requests found", __func__);
FREE_NULL_LIST(job_req_list);
return NULL;
}
if (first_opt && first_opt->clusters &&
(slurmdb_get_first_het_job_cluster(job_req_list,
first_opt->clusters,
&working_cluster_rec)
!= SLURM_SUCCESS)) {
print_db_notok(first_opt->clusters, 0);
FREE_NULL_LIST(job_req_list);
return NULL;
}
callbacks.timeout = _timeout_handler;
callbacks.job_complete = _job_complete_handler;
callbacks.job_suspend = NULL;
callbacks.user_msg = _user_msg_handler;
callbacks.node_fail = _node_fail_handler;
/* create message thread to handle pings and such from slurmctld */
msg_thr = slurm_allocation_msg_thr_create(&first_job->other_port,
&callbacks);
list_for_each(job_req_list, _copy_other_port, &first_job->other_port);
/* NOTE: Do not process signals in separate pthread. The signal will
* cause slurm_allocate_resources_blocking() to exit immediately. */
xsignal_unblock(sig_array);
for (i = 0; sig_array[i]; i++)
xsignal(sig_array[i], _signal_while_allocating);
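/*
 * Let _job_complete_handler() know that completion RPCs may arrive
 * with the job ID of any het job component, not just pending_job_id.
 */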
is_het_job = true;
while (first_opt && !job_resp_list) {
job_resp_list = slurm_allocate_het_job_blocking(job_req_list,
first_opt->immediate, _set_pending_job_id);
if (destroy_job) {
/* cancelled by signal */
if (pending_job_id != 0)
info("Job allocation %u has been revoked",
pending_job_id);
break;
} else if (!job_resp_list && !_retry()) {
break;
}
}
FREE_NULL_LIST(job_req_list);
if (job_resp_list && !destroy_job) {
/*
* Allocation granted!
*/
opt_iter = list_iterator_create(opt_list);
resp_iter = list_iterator_create(job_resp_list);
while ((opt_local = list_next(opt_iter))) {
resp = (resource_allocation_response_msg_t *)
list_next(resp_iter);
if (!resp)
break;
if (pending_job_id == 0)
pending_job_id = resp->job_id;
if (my_job_id == 0) {
my_job_id = resp->job_id;
i = list_count(opt_list);
k = list_count(job_resp_list);
if (i != k) {
error("%s: request count != response count (%d != %d)",
__func__, i, k);
goto relinquish;
}
}
/*
* These values could be changed while the job was
* pending so overwrite the request with what was
* allocated so we don't have issues when we use them
* in the step creation.
*/
if (opt_local->pn_min_memory != NO_VAL64)
opt_local->pn_min_memory =
(resp->pn_min_memory & (~MEM_PER_CPU));
else if (opt_local->mem_per_cpu != NO_VAL64)
opt_local->mem_per_cpu =
(resp->pn_min_memory & (~MEM_PER_CPU));
opt_local->min_nodes = resp->node_cnt;
opt_local->max_nodes = resp->node_cnt;
xfree(opt_local->gres);
opt_local->gres = xstrdup(resp->tres_per_node);
if (resp->working_cluster_rec)
slurm_setup_remote_working_cluster(resp);
if (!_wait_nodes_ready(resp)) {
if (!destroy_job)
error("Something is wrong with the "
"boot of the nodes.");
goto relinquish;
}
}
list_iterator_destroy(resp_iter);
list_iterator_destroy(opt_iter);
} else if (destroy_job) {
goto relinquish;
}
xsignal_block(sig_array);
return job_resp_list;
relinquish:
if (job_resp_list) {
if (my_job_id == 0) {
resp = (resource_allocation_response_msg_t *)
list_peek(job_resp_list);
my_job_id = resp->job_id;
}
if (destroy_job && my_job_id) {
slurm_complete_job(my_job_id, 1);
}
FREE_NULL_LIST(job_resp_list);
}
exit(error_exit);
return NULL;
}
void
ignore_signal(int signo)
{
/* do nothing */
}
int
cleanup_allocation(void)
{
slurm_allocation_msg_thr_destroy(msg_thr);
return SLURM_SUCCESS;
}
extern list_t *existing_allocation(void)
{
uint32_t old_job_id;
list_t *job_resp_list = NULL;
if (sropt.jobid == NO_VAL)
return NULL;
if (opt.clusters) {
list_t *clusters = NULL;
if (slurm_get_cluster_info(&(clusters), opt.clusters, 0)) {
print_db_notok(opt.clusters, 0);
fatal("Could not get cluster information");
}
working_cluster_rec = list_peek(clusters);
debug2("Looking for job %d on cluster %s (addr: %s)",
sropt.jobid,
working_cluster_rec->name,
working_cluster_rec->control_host);
}
old_job_id = (uint32_t) sropt.jobid;
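/*
 * slurm_het_job_lookup() yields a list of allocation responses, one
 * per het job component; a regular job yields a single-entry list.
 */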
if (slurm_het_job_lookup(old_job_id, &job_resp_list) < 0) {
if (sropt.parallel_debug)
return NULL; /* create new allocation as needed */
if (errno == ESLURM_ALREADY_DONE)
error("Slurm job %u has expired", old_job_id);
else
error("Unable to confirm allocation for job %u: %m",
old_job_id);
info("Check SLURM_JOB_ID environment variable. Expired or invalid job %u",
old_job_id);
exit(error_exit);
}
return job_resp_list;
}
/*
* Create job description structure based off srun options
* (see opt.h)
*/
static job_desc_msg_t *_job_desc_msg_create_from_opts(slurm_opt_t *opt_local)
{
srun_opt_t *srun_opt = opt_local->srun_opt;
job_desc_msg_t *j = slurm_opt_create_job_desc(opt_local, true);
if (!j) {
return NULL;
}
/*
* The controller rejects any non-stepmgr allocation requesting
* resv-ports. To allow srun to request --resv-ports outside of stepmgr
* jobs, clear resv_port_cnt when creating a non-stepmgr allocation.
*/
if ((opt_local->resv_port_cnt != NO_VAL) &&
!(opt_local->job_flags & STEPMGR_ENABLED) &&
!xstrstr(slurm_conf.slurmctld_params, "enable_stepmgr"))
j->resv_port_cnt = NO_VAL16;
xassert(srun_opt);
if (!j->name)
j->name = xstrdup(srun_opt->cmd_name);
if (opt_local->argc > 0) {
j->argc = opt_local->argc;
j->argv = opt_local->argv;
}
j->container = xstrdup(opt_local->container);
j->container_id = xstrdup(opt_local->container_id);
if (srun_opt->cpu_bind)
j->cpu_bind = xstrdup(srun_opt->cpu_bind);
if (srun_opt->cpu_bind_type)
j->cpu_bind_type = srun_opt->cpu_bind_type;
if (!j->x11 && opt.x11) {
j->x11_magic_cookie = xstrdup(opt.x11_magic_cookie);
j->x11_target = xstrdup(opt.x11_target);
j->x11_target_port = opt.x11_target_port;
}
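/*
 * Ask the controller to delay starting the job until every
 * allocated node is ready, matching --wait-all-nodes=1 behavior.
 */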
j->wait_all_nodes = 1;
return j;
}
void
job_desc_msg_destroy(job_desc_msg_t *j)
{
if (j) {
xfree(j->req_nodes);
xfree(j);
}
}
extern int create_job_step(srun_job_t *job, bool use_all_cpus,
slurm_opt_t *opt_local)
{
return launch_g_create_job_step(job, use_all_cpus,
_signal_while_allocating,
&destroy_job, opt_local);
}