/*****************************************************************************\
* src/srun/allocate.c - srun functions for managing node allocations
* $Id$
*****************************************************************************
* Copyright (C) 2002-2006 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark Grondona <mgrondona@llnl.gov>.
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#if HAVE_CONFIG_H
# include "config.h"
#endif
#include <errno.h>
#include <pwd.h>
#include <stdlib.h>
#include <sys/poll.h>
#include <sys/types.h>
#include <unistd.h>
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/slurm_auth.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xsignal.h"
#include "src/common/xstring.h"
#include "src/common/forward.h"
#include "src/common/env.h"
#include "src/srun/allocate.h"
#include "src/srun/msg.h"
#include "src/srun/opt.h"
#include "src/srun/attach.h"
#define MAX_ALLOC_WAIT 60 /* seconds */
#define MIN_ALLOC_WAIT 5 /* seconds */
#define MAX_RETRIES 10
extern char **environ;
/*
* Static Prototypes
*/
static int _accept_msg_connection(slurm_fd slurmctld_fd,
				  resource_allocation_response_msg_t **resp);
static int _handle_msg(slurm_msg_t *msg,
		       resource_allocation_response_msg_t **resp);
static int _wait_for_alloc_rpc(int sleep_time,
			       resource_allocation_response_msg_t **resp);
static void _wait_for_resources(resource_allocation_response_msg_t **resp);
static bool _retry(void);
static void _intr_handler(int signo);
static job_step_create_request_msg_t *_step_req_create(srun_job_t *j);
static volatile sig_atomic_t destroy_job = 0;
static srun_job_t *allocate_job = NULL;
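/*
 * Test whether the job described by the current options could be
 * scheduled, without actually submitting it.  Returns the result of
 * slurm_job_will_run(), or SLURM_ERROR if the request message could
 * not be built.
 */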
int
allocate_test(void)
{
int rc;
job_desc_msg_t *j = job_desc_msg_create_from_opts (NULL);
	if (!j)
		return SLURM_ERROR;
rc = slurm_job_will_run(j);
job_desc_msg_destroy(j);
return rc;
}
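/*
 * Request a resource allocation from slurmctld and block until it is
 * granted, the request fails, or the user interrupts with SIGINT,
 * SIGQUIT, or SIGTERM.  Returns the allocation response, or NULL on
 * failure.  The response should eventually be freed with
 * slurm_free_resource_allocation_response_msg().
 */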
resource_allocation_response_msg_t *
allocate_nodes(void)
{
int rc = 0;
static int sigarray[] = { SIGQUIT, SIGINT, SIGTERM, 0 };
SigFunc *oquitf, *ointf, *otermf;
sigset_t oset;
resource_allocation_response_msg_t *resp = NULL;
job_desc_msg_t *j = job_desc_msg_create_from_opts (NULL);
	if (!j)
		return NULL;
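	/* Install handlers so an interrupt cancels the pending request,
	 * and unblock the signals so they can be delivered */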
oquitf = xsignal(SIGQUIT, _intr_handler);
ointf = xsignal(SIGINT, _intr_handler);
otermf = xsignal(SIGTERM, _intr_handler);
xsignal_save_mask(&oset);
xsignal_unblock(sigarray);
	/* Do not re-use an existing job id when submitting a new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt.jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		j->job_id = NO_VAL;	/* let slurmctld assign the job id */
	}
	while (((rc = slurm_allocate_resources(j, &resp)) < 0) && _retry()) {
		if (destroy_job)
			goto done;
	}
	if (!resp)
		goto done;
	if ((rc == 0) && (resp->node_list == NULL)) {
		/* Request accepted but still pending (no nodes assigned) */
		if (resp->error_code)
			verbose("Warning: %s",
				slurm_strerror(resp->error_code));
		_wait_for_resources(&resp);
	}
done:
xsignal_set_mask(&oset);
xsignal(SIGINT, ointf);
xsignal(SIGTERM, otermf);
xsignal(SIGQUIT, oquitf);
job_desc_msg_destroy(j);
return resp;
}
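/*
 * Look up an existing allocation for the job id given by the --jobid
 * option or the SLURM_JOBID environment variable.  Returns the
 * allocation response, NULL if no job id was specified (or if the
 * lookup failed but a new allocation may be created instead), and
 * exits on any other lookup failure.
 */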
resource_allocation_response_msg_t *
existing_allocation(void)
{
uint32_t old_job_id;
resource_allocation_response_msg_t *resp = NULL;
	if (opt.jobid == NO_VAL)
		return NULL;
	old_job_id = (uint32_t) opt.jobid;
if (slurm_allocation_lookup_lite(old_job_id, &resp) < 0) {
if (opt.parallel_debug || opt.jobid_set)
return NULL; /* create new allocation as needed */
if (errno == ESLURM_ALREADY_DONE)
error ("SLURM job %u has expired.", old_job_id);
else
error ("Unable to confirm allocation for job %u: %m",
old_job_id);
info ("Check SLURM_JOBID environment variable "
"for expired or invalid job.");
exit(1);
}
return resp;
}
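/*
 * Block until the pending job has actually been allocated resources,
 * polling slurmctld with a delay that grows from MIN_ALLOC_WAIT to
 * MAX_ALLOC_WAIT seconds.  If the user interrupts the wait, the job
 * is cancelled and srun exits.
 */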
static void
_wait_for_resources(resource_allocation_response_msg_t **resp)
{
resource_allocation_response_msg_t *r = *resp;
int sleep_time = MIN_ALLOC_WAIT;
int job_id = r->job_id;
if (!opt.quiet)
info ("job %u queued and waiting for resources", r->job_id);
slurm_free_resource_allocation_response_msg(r);
/* Keep polling until the job is allocated resources */
while (_wait_for_alloc_rpc(sleep_time, resp) <= 0) {
if (slurm_allocation_lookup_lite(job_id, resp) >= 0)
break;
if (slurm_get_errno() == ESLURM_JOB_PENDING)
debug3 ("Still waiting for allocation");
else
fatal ("Unable to confirm allocation for job %u: %m",
job_id);
if (destroy_job) {
verbose("cancelling job %u", job_id);
slurm_complete_job(job_id, 0);
debugger_launch_failure(allocate_job);
exit(0);
}
if (sleep_time < MAX_ALLOC_WAIT)
sleep_time++;
}
if (!opt.quiet)
info ("job %u has been allocated resources", (*resp)->job_id);
}
/* Wait up to sleep_time seconds for an RPC from slurmctld indicating that
 * the resource allocation has occurred.
 * IN sleep_time: delay in seconds
 * OUT resp: resource allocation response message
 * RET 1 if resp is filled in, -1 if the wait was interrupted, 0 otherwise */
static int
_wait_for_alloc_rpc(int sleep_time, resource_allocation_response_msg_t **resp)
{
struct pollfd fds[1];
slurm_fd slurmctld_fd;
if ((slurmctld_fd = slurmctld_msg_init()) < 0) {
sleep (sleep_time);
return (0);
}
fds[0].fd = slurmctld_fd;
fds[0].events = POLLIN;
while (poll (fds, 1, (sleep_time * 1000)) < 0) {
switch (errno) {
case EAGAIN:
case EINTR:
return (-1);
case ENOMEM:
case EINVAL:
case EFAULT:
fatal("poll: %m");
default:
error("poll: %m. Continuing...");
}
}
if (fds[0].revents & POLLIN)
return (_accept_msg_connection(slurmctld_fd, resp));
return (0);
}
/* Accept an RPC from slurmctld and process it.
 * IN slurmctld_fd: file descriptor for slurmctld communications
 * OUT resp: resource allocation response message
 * RET 1 if resp is filled in, SLURM_ERROR on a receive error, 0 otherwise */
static int
_accept_msg_connection(slurm_fd slurmctld_fd,
resource_allocation_response_msg_t **resp)
{
slurm_fd fd;
slurm_msg_t *msg = NULL;
slurm_addr cli_addr;
char host[256];
uint16_t port;
int rc = 0;
fd = slurm_accept_msg_conn(slurmctld_fd, &cli_addr);
if (fd < 0) {
error("Unable to accept connection: %m");
return rc;
}
slurm_get_addr(&cli_addr, &port, host, sizeof(host));
debug2("got message connection from %s:%hu", host, port);
msg = xmalloc(sizeof(slurm_msg_t));
slurm_msg_t_init(msg);
again:
	if (slurm_receive_msg(fd, msg, 0) != 0) {
if (errno == EINTR) {
goto again;
}
error("_accept_msg_connection[%s]: %m", host);
rc = SLURM_ERROR;
goto cleanup;
}
rc = _handle_msg(msg, resp); /* handle_msg frees msg->data */
cleanup:
slurm_free_msg(msg);
slurm_close_accepted_conn(fd);
return rc;
}
/* Process an RPC from slurmctld.
 * IN msg: message received
 * OUT resp: resource allocation response message
 * RET 1 if resp is filled in, 0 otherwise */
static int
_handle_msg(slurm_msg_t *msg, resource_allocation_response_msg_t **resp)
{
uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred);
uid_t uid = getuid();
uid_t slurm_uid = (uid_t) slurm_get_slurm_user_id();
int rc = 0;
srun_timeout_msg_t *to;
srun_user_msg_t *um;
if ((req_uid != slurm_uid) && (req_uid != 0) && (req_uid != uid)) {
error ("Security violation, slurm message from uid %u",
(unsigned int) req_uid);
return 0;
}
switch (msg->msg_type) {
case SRUN_PING:
debug3("slurmctld ping received");
slurm_send_rc_msg(msg, SLURM_SUCCESS);
slurm_free_srun_ping_msg(msg->data);
break;
case SRUN_JOB_COMPLETE:
debug3("job complete received");
/* FIXME: do something here */
slurm_free_srun_job_complete_msg(msg->data);
break;
case RESPONSE_RESOURCE_ALLOCATION:
debug2("resource allocation response received");
slurm_send_rc_msg(msg, SLURM_SUCCESS);
*resp = msg->data;
rc = 1;
break;
case SRUN_TIMEOUT:
debug2("timeout received");
to = msg->data;
timeout_handler(to->timeout);
slurm_free_srun_timeout_msg(msg->data);
break;
case SRUN_USER_MSG:
um = msg->data;
info("%s", um->msg);
slurm_free_srun_user_msg(msg->data);
break;
default:
error("received spurious message type: %d\n",
msg->msg_type);
}
return rc;
}
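/*
 * Decide whether a failed allocation request should be retried.
 * Returns true (after sleeping) if the failure was a transient
 * queue-full condition and fewer than MAX_RETRIES attempts have been
 * made, false otherwise.
 */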
static bool
_retry(void)
{
static int retries = 0;
static char *msg = "Slurm job queue full, sleeping and retrying.";
if (errno == ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) {
if (retries == 0)
error (msg);
else if (retries < MAX_RETRIES)
debug (msg);
else
return false;
sleep (++retries);
} else {
error("Unable to allocate resources: %m");
return false;
}
return true;
}
/*
 * Signal handler (SIGINT, SIGQUIT, SIGTERM) used while waiting for
 * resources to become available; flags the pending job for destruction.
 */
static void
_intr_handler(int signo)
{
destroy_job = 1;
}
/*
 * Create a job description structure based on srun options
 * (see opt.h)
 */
job_desc_msg_t *
job_desc_msg_create_from_opts (char *script)
{
job_desc_msg_t *j = xmalloc(sizeof(*j));
char buf[8192];
hostlist_t hl = NULL;
slurm_init_job_desc_msg(j);
j->contiguous = opt.contiguous;
j->features = opt.constraints;
j->immediate = opt.immediate;
j->name = opt.job_name;
j->req_nodes = xstrdup(opt.nodelist);
	/* Simplify the job allocation nodelist; tasks are not laid out
	 * until the job step is created */
	if (j->req_nodes) {
		hl = hostlist_create(j->req_nodes);
		hostlist_ranged_string(hl, sizeof(buf), buf);
		xfree(opt.nodelist);
		opt.nodelist = xstrdup(buf);
		hostlist_uniq(hl);
		hostlist_ranged_string(hl, sizeof(buf), buf);
		hostlist_destroy(hl);
		xfree(j->req_nodes);
		j->req_nodes = xstrdup(buf);
	}
	if ((opt.distribution == SLURM_DIST_ARBITRARY) && !j->req_nodes) {
		error("With Arbitrary distribution you need to "
		      "specify a nodelist or hostfile with the -w option");
		job_desc_msg_destroy(j);
		return NULL;
	}
j->exc_nodes = opt.exc_nodes;
j->partition = opt.partition;
j->min_nodes = opt.min_nodes;
if (opt.min_sockets_per_node != NO_VAL)
j->min_sockets = opt.min_sockets_per_node;
if (opt.min_cores_per_socket != NO_VAL)
j->min_cores = opt.min_cores_per_socket;
if (opt.min_threads_per_core != NO_VAL)
j->min_threads = opt.min_threads_per_core;
j->user_id = opt.uid;
j->dependency = opt.dependency;
if (opt.nice)
j->nice = NICE_OFFSET + opt.nice;
j->task_dist = opt.distribution;
if (opt.plane_size != NO_VAL)
j->plane_size = opt.plane_size;
j->group_id = opt.gid;
j->mail_type = opt.mail_type;
if (opt.ntasks_per_node != NO_VAL)
j->ntasks_per_node = opt.ntasks_per_node;
if (opt.ntasks_per_socket != NO_VAL)
j->ntasks_per_socket = opt.ntasks_per_socket;
if (opt.ntasks_per_core != NO_VAL)
j->ntasks_per_core = opt.ntasks_per_core;
if (opt.mail_user)
j->mail_user = xstrdup(opt.mail_user);
if (opt.begin)
j->begin_time = opt.begin;
if (opt.network)
j->network = xstrdup(opt.network);
if (opt.account)
j->account = xstrdup(opt.account);
if (opt.comment)
j->comment = xstrdup(opt.comment);
if (opt.hold)
j->priority = 0;
if (opt.jobid != NO_VAL)
j->job_id = opt.jobid;
#if SYSTEM_DIMENSIONS
if (opt.geometry[0] > 0) {
int i;
for (i=0; i<SYSTEM_DIMENSIONS; i++)
j->geometry[i] = opt.geometry[i];
}
#endif
if (opt.conn_type != (uint16_t) NO_VAL)
j->conn_type = opt.conn_type;
if (opt.reboot)
j->reboot = 1;
if (opt.no_rotate)
j->rotate = 0;
if (opt.blrtsimage)
j->blrtsimage = xstrdup(opt.blrtsimage);
if (opt.linuximage)
j->linuximage = xstrdup(opt.linuximage);
if (opt.mloaderimage)
j->mloaderimage = xstrdup(opt.mloaderimage);
if (opt.ramdiskimage)
j->ramdiskimage = xstrdup(opt.ramdiskimage);
if (opt.max_nodes)
j->max_nodes = opt.max_nodes;
if (opt.max_sockets_per_node)
j->max_sockets = opt.max_sockets_per_node;
if (opt.max_cores_per_socket)
j->max_cores = opt.max_cores_per_socket;
if (opt.max_threads_per_core)
j->max_threads = opt.max_threads_per_core;
if (opt.job_min_cpus != NO_VAL)
j->job_min_procs = opt.job_min_cpus;
if (opt.job_min_sockets != NO_VAL)
j->job_min_sockets = opt.job_min_sockets;
if (opt.job_min_cores != NO_VAL)
j->job_min_cores = opt.job_min_cores;
if (opt.job_min_threads != NO_VAL)
j->job_min_threads = opt.job_min_threads;
if (opt.job_min_memory != NO_VAL)
j->job_min_memory = opt.job_min_memory;
if (opt.job_max_memory != NO_VAL)
j->job_max_memory = opt.job_max_memory;
if (opt.job_min_tmp_disk != NO_VAL)
j->job_min_tmp_disk = opt.job_min_tmp_disk;
if (opt.overcommit) {
j->num_procs = opt.min_nodes;
j->overcommit = opt.overcommit;
} else
j->num_procs = opt.nprocs * opt.cpus_per_task;
if (opt.nprocs_set)
j->num_tasks = opt.nprocs;
if (opt.cpus_set)
j->cpus_per_task = opt.cpus_per_task;
if (opt.no_kill)
j->kill_on_node_fail = 0;
if (opt.time_limit != NO_VAL)
j->time_limit = opt.time_limit;
j->shared = opt.shared;
/* srun uses the same listening port for the allocation response
* message as all other messages */
j->alloc_resp_port = slurmctld_comm_addr.port;
j->other_port = slurmctld_comm_addr.port;
/* alloc_resp_hostname and other_hostname are set by slurmctld */
if (script) {
/*
* If script is set then we are building a request for
* a batch job
*/
xassert (opt.batch);
j->environment = NULL;
if (opt.get_user_env_time >= 0) {
struct passwd *pw = NULL;
pw = getpwuid(opt.uid);
if (pw != NULL) {
j->environment = env_array_user_default(
pw->pw_name,
opt.get_user_env_time,
opt.get_user_env_mode);
if (j->environment == NULL)
exit(1); /* error already logged */
}
}
env_array_merge(&j->environment, (const char **)environ);
j->env_size = envcount (j->environment);
j->script = script;
j->argv = remote_argv;
j->argc = remote_argc;
j->err = opt.efname;
j->in = opt.ifname;
j->out = opt.ofname;
j->work_dir = opt.cwd;
j->no_requeue = opt.no_requeue;
}
return (j);
}
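/*
 * Free a job description created by job_desc_msg_create_from_opts(),
 * including the string fields duplicated there.
 */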
void
job_desc_msg_destroy(job_desc_msg_t *j)
{
	if (j) {
		/* free only fields xstrdup()'d in
		 * job_desc_msg_create_from_opts() */
		xfree(j->req_nodes);
		xfree(j->mail_user);
		xfree(j->network);
		xfree(j->account);
		xfree(j->comment);
		xfree(j->blrtsimage);
		xfree(j->linuximage);
		xfree(j->mloaderimage);
		xfree(j->ramdiskimage);
		xfree(j);
	}
}
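/*
 * Build a job step creation request from the srun options and the
 * nodes of the existing allocation.  Returns NULL if more nodes are
 * requested than were allocated.
 */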
static job_step_create_request_msg_t *
_step_req_create(srun_job_t *j)
{
job_step_create_request_msg_t *r = xmalloc(sizeof(*r));
r->job_id = j->jobid;
r->user_id = opt.uid;
r->node_count = j->nhosts;
/* info("send %d or %d? sending %d", opt.max_nodes, */
/* j->nhosts, r->node_count); */
if(r->node_count > j->nhosts) {
error("Asking for more nodes that allocated");
return NULL;
}
r->cpu_count = opt.overcommit ? r->node_count
: (opt.nprocs*opt.cpus_per_task);
if (!opt.nprocs_set && (opt.ntasks_per_node != NO_VAL))
opt.nprocs = r->node_count * opt.ntasks_per_node;
r->num_tasks = opt.nprocs;
r->node_list = xstrdup(opt.nodelist);
r->network = xstrdup(opt.network);
r->name = xstrdup(opt.job_name);
r->relative = (uint16_t)opt.relative;
r->overcommit = opt.overcommit ? 1 : 0;
debug("requesting job %u, user %u, nodes %u including (%s)",
r->job_id, r->user_id, r->node_count, r->node_list);
debug("cpus %u, tasks %u, name %s, relative %u",
r->cpu_count, r->num_tasks, r->name, r->relative);
switch (opt.distribution) {
case SLURM_DIST_BLOCK:
r->task_dist = SLURM_DIST_BLOCK;
break;
case SLURM_DIST_ARBITRARY:
r->task_dist = SLURM_DIST_ARBITRARY;
break;
case SLURM_DIST_CYCLIC:
r->task_dist = SLURM_DIST_CYCLIC;
break;
case SLURM_DIST_CYCLIC_CYCLIC:
r->task_dist = SLURM_DIST_CYCLIC_CYCLIC;
break;
case SLURM_DIST_CYCLIC_BLOCK:
r->task_dist = SLURM_DIST_CYCLIC_BLOCK;
break;
case SLURM_DIST_BLOCK_CYCLIC:
r->task_dist = SLURM_DIST_BLOCK_CYCLIC;
break;
case SLURM_DIST_BLOCK_BLOCK:
r->task_dist = SLURM_DIST_BLOCK_BLOCK;
break;
case SLURM_DIST_PLANE:
r->task_dist = SLURM_DIST_PLANE;
r->plane_size = opt.plane_size;
break;
default:
r->task_dist = (r->num_tasks <= r->node_count)
? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
break;
}
opt.distribution = r->task_dist;
if (slurmctld_comm_addr.port) {
r->host = xstrdup(slurmctld_comm_addr.hostname);
r->port = slurmctld_comm_addr.port;
}
return(r);
}
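/*
 * Create a job step within the job's allocation, retrying with an
 * increasing delay while step creation is temporarily disabled by
 * slurmctld.  On success the step id, layout, credential, and switch
 * information are recorded in *job and 0 is returned; -1 on error.
 */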
int
create_job_step(srun_job_t *job)
{
job_step_create_request_msg_t *req = NULL;
job_step_create_response_msg_t *resp = NULL;
int i;
if (!(req = _step_req_create(job))) {
error ("Unable to allocate step request message");
return -1;
}
for (i=0; ;i++) {
if ((slurm_job_step_create(req, &resp) == SLURM_SUCCESS)
&& (resp != NULL))
break;
		if (slurm_get_errno() != ESLURM_DISABLED) {
			error ("Unable to create job step: %m");
			slurm_free_job_step_create_request_msg(req);
			return -1;
		}
if (i == 0)
info("Job step creation temporarily disabled, retrying");
sleep(MIN((i*10), 60));
}
	job->stepid = resp->job_step_id;
	job->step_layout = resp->step_layout;
	job->cred = resp->cred;
	job->switch_job = resp->switch_job;
	if (!job->step_layout) {
		error("step_layout not returned");
		slurm_free_job_step_create_request_msg(req);
		return -1;
	}
	/* Number of hosts in the job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOBID was set in the user env.
	 * Reset the value here just in case.
	 */
	job->nhosts = job->step_layout->node_cnt;
/*
* Recreate filenames which may depend upon step id
*/
job_update_io_fnames(job);
slurm_free_job_step_create_request_msg(req);
return 0;
}
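/*
 * Record the job so that a failed debugger launch while waiting for
 * resources can be reported against it (see _wait_for_resources()).
 */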
void
set_allocate_job(srun_job_t *job)
{
	allocate_job = job;
}