/* src/api/allocate.c - SchedMD/slurm - Git at Google */

 /*****************************************************************************\
  *  allocate.c - allocate nodes for a job or step with supplied constraints
  *  $Id$
  *****************************************************************************
  *  Copyright (C) 2002 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Morris Jette <jette1@llnl.gov>.
  *  UCRL-CODE-226842.
  *
  *  This file is part of SLURM, a resource management program.
  *  For details, see <http://www.llnl.gov/linux/slurm/>.
  *
  *  SLURM is free software; you can redistribute it and/or modify it under
  *  the terms of the GNU General Public License as published by the Free
  *  Software Foundation; either version 2 of the License, or (at your option)
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission
  *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and
  *  distribute linked combinations including the two. You must obey the GNU
  *  General Public License in all respects for all of the code used other than
  *  OpenSSL. If you modify file(s) with this exception, you may extend this
  *  exception to your version of the file(s), but you are not obligated to do
  *  so. If you do not wish to do so, delete this exception statement from your
  *  version.  If you delete this exception statement from all source files in
  *  the program, then also delete it here.
  *
  *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
  *  details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with SLURM; if not, write to the Free Software Foundation, Inc.,
  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
 \*****************************************************************************/

 #ifdef HAVE_CONFIG_H
 #  include "config.h"
 #endif

 #include <errno.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/poll.h>
 #include <stdbool.h>
 #include <time.h>
 #include <netinet/in.h> /* for ntohs() */

 #ifndef __USE_XOPEN_EXTENDED
 extern pid_t getsid(pid_t pid);		/* missing from <unistd.h> */
 #endif

 #include <slurm/slurm.h>
 #include <stdlib.h>

 #include "src/common/read_config.h"
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/hostlist.h"
 #include "src/common/xmalloc.h"
 #include "src/common/xstring.h"
 #include "src/common/forward.h"
 #include "src/common/fd.h"
 #include "src/common/slurm_auth.h"

 #define BUFFER_SIZE 1024
 #define MAX_ALLOC_WAIT 60	/* seconds */
 #define MIN_ALLOC_WAIT  5	/* seconds */

 /* State of the socket on which the allocation response RPC is awaited */
 typedef struct {
 	slurm_addr address;	/* filled in by slurm_get_stream_addr() for fd */
 	int fd;			/* descriptor from slurm_init_msg_engine_port() */
 	char *hostname;		/* xstrdup'd copy of the interface hostname */
 	uint16_t port;		/* host-order port taken from address.sin_port */
 } listen_t;

 /*
  * Forward declarations of file-local helpers.  The parameter list of
  * _create_allocation_response_socket() is spelled out so the prototype
  * agrees with its definition and with the call site that passes a hostname.
  */
 static int _handle_rc_msg(slurm_msg_t *msg);
 static listen_t *_create_allocation_response_socket(char *interface_hostname);
 static void _destroy_allocation_response_socket(listen_t *listen);
 static resource_allocation_response_msg_t *_wait_for_allocation_response(
 	uint32_t job_id, const listen_t *listen, int timeout);

 /*
  * slurm_allocate_resources - ask the controller for a resource allocation
  * IN req   - description of the resource allocation request; alloc_sid is
  *            filled in with getsid(0) when it is NO_VAL
  * OUT resp - allocation response from the controller, or NULL when the
  *            controller answered with a zero return code
  * RET 0 on success, otherwise return -1 and set errno to indicate the error
  * NOTE: free the response using slurm_free_resource_allocation_response_msg
  */
 int
 slurm_allocate_resources (job_desc_msg_t *req,
 			  resource_allocation_response_msg_t **resp)
 {
 	slurm_msg_t request;
 	slurm_msg_t reply;
 	char local_node[64];
 	bool borrowed_node = false;
 	int send_rc;

 	slurm_msg_t_init(&request);
 	slurm_msg_t_init(&reply);

 	/* default the session id and the submitting node name */
 	if (req->alloc_sid == NO_VAL)
 		req->alloc_sid = getsid(0);

 	if (req->alloc_node == NULL) {
 		if (gethostname_short(local_node, sizeof(local_node)) == 0) {
 			req->alloc_node = local_node;
 			borrowed_node = true;
 		}
 	}

 	request.data     = req;
 	request.msg_type = REQUEST_RESOURCE_ALLOCATION;
 	send_rc = slurm_send_recv_controller_msg(&request, &reply);

 	/* local_node lives on this stack frame: never leave the caller's
 	 * request pointing at it */
 	if (borrowed_node)
 		req->alloc_node = NULL;

 	if (send_rc == SLURM_SOCKET_ERROR)
 		return SLURM_SOCKET_ERROR;

 	if (reply.msg_type == RESPONSE_RESOURCE_ALLOCATION) {
 		*resp = (resource_allocation_response_msg_t *) reply.data;
 	} else if (reply.msg_type == RESPONSE_SLURM_RC) {
 		if (_handle_rc_msg(&reply) < 0)
 			return SLURM_PROTOCOL_ERROR;
 		*resp = NULL;
 	} else {
 		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
 	}

 	return SLURM_PROTOCOL_SUCCESS;
 }


 /*
  * slurm_allocate_resources_blocking
  *	allocate resources for a job request.  This call will block until
  *	the allocation is granted, or the specified timeout limit is reached.
  * IN req - description of resource allocation request
  * IN timeout - amount of time, in seconds, to wait for a response before
  * 	giving up.
  *	A timeout of zero will wait indefinitely.
  * IN pending_callback - If the allocation cannot be granted immediately,
  *      the controller will put the job in the PENDING state.  If
  *      pending callback is not NULL, it will be called with the job_id
  *      of the pending job as the sole parameter.
  *
  * RET allocation structure on success, NULL on error set errno to
  *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
  *      with no allocation granted)
  * NOTE: free the allocation structure using
  *	slurm_free_resource_allocation_response_msg
  */
 resource_allocation_response_msg_t *
 slurm_allocate_resources_blocking (const job_desc_msg_t *user_req,
 				   time_t timeout,
 				   void(*pending_callback)(uint32_t job_id))
 {
 	int rc;
 	slurm_msg_t req_msg;
 	slurm_msg_t resp_msg;
 	resource_allocation_response_msg_t *resp = NULL;
 	char *hostname = NULL;
 	uint32_t job_id;
 	job_desc_msg_t *req;
 	listen_t *listen = NULL;
 	int errnum = SLURM_SUCCESS;

 	slurm_msg_t_init(&req_msg);
 	slurm_msg_t_init(&resp_msg);

 	/* make a copy of the user's job description struct so that we
 	 * can make changes before contacting the controller */
 	req = (job_desc_msg_t *)xmalloc(sizeof(job_desc_msg_t));
 	if (req == NULL)
 		return NULL;
 	memcpy(req, user_req, sizeof(job_desc_msg_t));

 	/*
 	 * set Node and session id for this request
 	 */
 	if (req->alloc_sid == NO_VAL)
 		req->alloc_sid = getsid(0);

 	if (user_req->alloc_node != NULL) {
 		req->alloc_node = xstrdup(user_req->alloc_node);
 	} else if ((hostname = xshort_hostname()) != NULL) {
 		req->alloc_node = hostname;
 	} else {
 		error("Could not get local hostname,"
 		      " forcing immediate allocation mode.");
 		req->immediate = 1;
 	}

 	if (!req->immediate) {
 		listen = _create_allocation_response_socket(hostname);
 		if (listen == NULL) {
 			xfree(req);
 			return NULL;
 		}
 		/* req->alloc_resp_hostname is set by slurmctld */
 		req->alloc_resp_port = listen->port;
 	}

 	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
 	req_msg.data     = req;

 	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

 	if (rc == SLURM_SOCKET_ERROR) {
 		int errnum = errno;
 		destroy_forward(&req_msg.forward);
 		destroy_forward(&resp_msg.forward);
 		if (!req->immediate)
 			_destroy_allocation_response_socket(listen);
 		xfree(req);
 		errno = errnum;
 		return NULL;
 	}

 	switch (resp_msg.msg_type) {
 	case RESPONSE_SLURM_RC:
 		if (_handle_rc_msg(&resp_msg) < 0) {
 			/* will reach this when the allocation fails */
 			errnum = errno;
 		} else {
 			/* shouldn't get here */
 			errnum = -1;
 		}
 		break;
 	case RESPONSE_RESOURCE_ALLOCATION:
 		/* Yay, the controller has acknowledge our request!  But did
 		   we really get an allocation yet? */
 		resp = (resource_allocation_response_msg_t *) resp_msg.data;
 		if (resp->node_cnt > 0) {
 			/* yes, allocation has been granted */
 			errno = SLURM_PROTOCOL_SUCCESS;
 		} else if (!req->immediate) {
 			/* no, we need to wait for a response */
 			job_id = resp->job_id;
 			slurm_free_resource_allocation_response_msg(resp);
 			if (pending_callback != NULL)
 				pending_callback(job_id);
  			resp = _wait_for_allocation_response(job_id, listen,
 							     timeout);
 			/* If NULL, we didn't get the allocation in
 			   the time desired, so just free the job id */
 			if (resp == NULL) {
 				errnum = errno;
 				slurm_complete_job(job_id, -1);
 			}
 		}
 		break;
 	default:
 		errnum = SLURM_UNEXPECTED_MSG_ERROR;
 		return NULL;
 	}

 	destroy_forward(&req_msg.forward);
 	destroy_forward(&resp_msg.forward);
 	if (!req->immediate)
 		_destroy_allocation_response_socket(listen);
 	xfree(req);
 	errno = errnum;
 	return resp;
 }


 /*
  * slurm_job_will_run - ask the controller whether a job would execute
  *	immediately if submitted now
  * IN req - description of resource allocation request
  * RET 0 on success, otherwise return -1 and set errno to indicate the error
  */
 int slurm_job_will_run (job_desc_msg_t *req)
 {
 	int ctld_rc;
 	slurm_msg_t query;

 	/* req.immediate = true;    implicit */
 	slurm_msg_t_init(&query);
 	query.data     = req;
 	query.msg_type = REQUEST_JOB_WILL_RUN;

 	if (slurm_send_recv_controller_rc_msg(&query, &ctld_rc) < 0)
 		return SLURM_SOCKET_ERROR;

 	/* a non-zero return code from the controller becomes errno */
 	if (ctld_rc != 0) {
 		slurm_seterrno_ret(ctld_rc);
 	}

 	return SLURM_PROTOCOL_SUCCESS;
 }

 /*
  * slurm_job_step_create - create a job step for a given job id
  * IN req   - description of job step request
  * OUT resp - job step creation response, or NULL when the controller
  *            answered with a zero return code
  * RET 0 on success, otherwise return -1 and set errno to indicate the error
  * NOTE: free the response using slurm_free_job_step_create_response_msg
  */
 int
 slurm_job_step_create (job_step_create_request_msg_t *req,
                        job_step_create_response_msg_t **resp)
 {
 	slurm_msg_t request;
 	slurm_msg_t reply;

 	slurm_msg_t_init(&request);
 	slurm_msg_t_init(&reply);
 	request.data     = req;
 	request.msg_type = REQUEST_JOB_STEP_CREATE;

 	if (slurm_send_recv_controller_msg(&request, &reply) < 0)
 		return SLURM_ERROR;

 	if (reply.msg_type == RESPONSE_JOB_STEP_CREATE) {
 		*resp = (job_step_create_response_msg_t *) reply.data;
 	} else if (reply.msg_type == RESPONSE_SLURM_RC) {
 		if (_handle_rc_msg(&reply) < 0)
 			return SLURM_PROTOCOL_ERROR;
 		*resp = NULL;
 	} else {
 		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
 	}

 	return SLURM_PROTOCOL_SUCCESS ;
 }

 /*
  * slurm_allocation_lookup - retrieve info for an existing resource allocation
  * IN jobid - job allocation identifier
  * OUT info - job allocation information, or NULL when the controller
  *            answered with a zero return code
  * RET 0 on success, otherwise return -1 and set errno to indicate the error
  * NOTE(review): the original note says to free the result with
  *	slurm_free_resource_allocation_response_msg, although *info is a
  *	job_alloc_info_response_msg_t - confirm the matching free routine
  */
 int
 slurm_allocation_lookup(uint32_t jobid,
 			job_alloc_info_response_msg_t **info)
 {
 	job_alloc_info_msg_t lookup;
 	slurm_msg_t request;
 	slurm_msg_t reply;

 	lookup.job_id = jobid;
 	slurm_msg_t_init(&request);
 	slurm_msg_t_init(&reply);
 	request.data     = &lookup;
 	request.msg_type = REQUEST_JOB_ALLOCATION_INFO;

 	if (slurm_send_recv_controller_msg(&request, &reply) < 0)
 		return SLURM_ERROR;

 	if (reply.msg_type == RESPONSE_JOB_ALLOCATION_INFO) {
 		*info = (job_alloc_info_response_msg_t *)reply.data;
 		return SLURM_PROTOCOL_SUCCESS;
 	}

 	if (reply.msg_type == RESPONSE_SLURM_RC) {
 		if (_handle_rc_msg(&reply) < 0)
 			return SLURM_ERROR;
 		*info = NULL;
 		return SLURM_PROTOCOL_SUCCESS;
 	}

 	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
 }

 /*
  * slurm_allocation_lookup_lite - retrieve info for an existing resource
  *                                allocation without the addrs and such
  * IN jobid - job allocation identifier
  * OUT info - job allocation information, or NULL when the controller
  *            answered with a zero return code
  * RET 0 on success, otherwise return -1 and set errno to indicate the error
  * NOTE: free the response using slurm_free_resource_allocation_response_msg
  */
 int
 slurm_allocation_lookup_lite(uint32_t jobid,
 			     resource_allocation_response_msg_t **info)
 {
 	job_alloc_info_msg_t lookup;
 	slurm_msg_t request;
 	slurm_msg_t reply;

 	lookup.job_id = jobid;
 	slurm_msg_t_init(&request);
 	slurm_msg_t_init(&reply);
 	request.data     = &lookup;
 	request.msg_type = REQUEST_JOB_ALLOCATION_INFO_LITE;

 	if (slurm_send_recv_controller_msg(&request, &reply) < 0)
 		return SLURM_ERROR;

 	if (reply.msg_type == RESPONSE_JOB_ALLOCATION_INFO_LITE) {
 		*info = (resource_allocation_response_msg_t *) reply.data;
 		return SLURM_PROTOCOL_SUCCESS;
 	}

 	if (reply.msg_type == RESPONSE_SLURM_RC) {
 		if (_handle_rc_msg(&reply) < 0)
 			return SLURM_ERROR;
 		*info = NULL;
 		return SLURM_PROTOCOL_SUCCESS;
 	}

 	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
 }

 /*
  *  Consume a return code message.
  *    The message payload is freed in every case.
  *    If the return code is nonzero, errno is set to it and a value < 0 is
  *    returned.  Otherwise, returns 0 (SLURM_SUCCESS).
  */
 static int
 _handle_rc_msg(slurm_msg_t *msg)
 {
 	return_code_msg_t *payload = (return_code_msg_t *) msg->data;
 	int code = payload->return_code;

 	slurm_free_return_code_msg(msg->data);

 	if (code == 0)
 		return SLURM_SUCCESS;

 	slurm_seterrno_ret(code);
 }

 /*
  * Read a SLURM hostfile specified by "filename".  "filename" must contain
  * a list of SLURM NodeNames, one per line.  Reads up to "n" number of hostnames
  * from the file. Returns a string representing a hostlist ranged string of
  * the contents of the file.  This is a helper function, it does not
  * contact any SLURM daemons.
  *
  * Within a line, '#' starts a comment that runs to the end of the line;
  * "\#" yields a literal '#'.  Lines that fill the BUFFER_SIZE read buffer
  * are rejected as too long.
  *
  * Returns a string representing the hostlist.  Returns NULL if there are fewer
  * than "n" hostnames in the file, or if an error occurs.  If "n" ==
  * NO_VAL then the entire file is read in
  *
  * Returned string must be freed with free().
  */
 char *slurm_read_hostfile(char *filename, int n)
 {
 	FILE *fp = NULL;
 	char in_line[BUFFER_SIZE];	/* input line */
 	int i, j;
 	int line_size;
 	int line_num = 0;
 	hostlist_t hostlist = NULL;
 	char *nodelist = NULL;

 	if (filename == NULL || strlen(filename) == 0)
 		return NULL;

 	if((fp = fopen(filename, "r")) == NULL) {
 		error("slurm_allocate_resources error opening file %s, %m",
 		      filename);
 		return NULL;
 	}

 	hostlist = hostlist_create(NULL);
 	if (hostlist == NULL) {
 		/* do not leak the open file */
 		fclose(fp);
 		return NULL;
 	}

 	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
 		line_num++;
 		line_size = strlen(in_line);
 		if (line_size == (BUFFER_SIZE - 1)) {
 			error ("Line %d, of hostfile %s too long",
 			       line_num, filename);
 			fclose (fp);
 			/* nodelist is still NULL; release the hostlist */
 			goto cleanup_hostfile;
 		}

 		for (i = 0; i < line_size; i++) {
 			if (in_line[i] == '\n') {
 				in_line[i] = '\0';
 				break;
 			}
 			if (in_line[i] == '\0')
 				break;
 			if (in_line[i] != '#')
 				continue;
 			if ((i > 0) && (in_line[i - 1] == '\\')) {
 				/* drop the backslash: shift the tail left by
 				 * one, including the terminating NUL so the
 				 * last character is not duplicated */
 				for (j = i; j <= line_size; j++) {
 					in_line[j - 1] = in_line[j];
 				}
 				line_size--;
 				/* the character that followed the escaped
 				 * '#' now sits at index i; step back so the
 				 * loop increment re-examines it */
 				i--;
 				continue;
 			}
 			in_line[i] = '\0';
 			break;
 		}

 		hostlist_push(hostlist, in_line);
 		if(n != (int)NO_VAL && hostlist_count(hostlist) == n)
 			break;
 	}
 	fclose(fp);

 	if (hostlist_count(hostlist) <= 0) {
 		error("Hostlist is empty!\n");
 		goto cleanup_hostfile;
 	}
 	if (hostlist_count(hostlist) < n) {
 		error("Too few NodeNames in SLURM Hostfile");
 		goto cleanup_hostfile;
 	}

 	/* plain malloc(): the caller releases the result with free() */
 	nodelist = (char *)malloc(0xffff);
 	if (!nodelist) {
 		error("Nodelist xmalloc failed");
 		goto cleanup_hostfile;
 	}

 	if (hostlist_ranged_string(hostlist, 0xffff, nodelist) == -1) {
 		error("Hostlist is too long for the allocate RPC!");
 		free(nodelist);
 		nodelist = NULL;
 		goto cleanup_hostfile;
 	}

 	debug2("Hostlist from SLURM_HOSTFILE = %s\n", nodelist);

 cleanup_hostfile:
 	hostlist_destroy(hostlist);

 	return nodelist;
 }

 /***************************************************************************
  * Support functions for slurm_allocate_resources_blocking()
  ***************************************************************************/
 /*
  * Open a non-blocking listening socket on an OS-chosen port for the
  * controller's allocation response.
  * IN interface_hostname - hostname recorded (as an xstrdup'd copy) in the
  *	returned structure
  * RET listen_t describing the socket, or NULL on failure
  * NOTE: release with _destroy_allocation_response_socket()
  */
 static listen_t *_create_allocation_response_socket(char *interface_hostname)
 {
 	listen_t *listen = NULL;

 	listen = xmalloc(sizeof(listen_t));
 	if (listen == NULL)
 		return NULL;

 	/* port "0" lets the operating system pick any port */
 	if ((listen->fd = slurm_init_msg_engine_port(0)) < 0) {
 		error("slurm_init_msg_engine_port error %m");
 		xfree(listen);	/* do not leak the half-built structure */
 		return NULL;
 	}
 	if (slurm_get_stream_addr(listen->fd, &listen->address) < 0) {
 		error("slurm_get_stream_addr error %m");
 		slurm_shutdown_msg_engine(listen->fd);
 		xfree(listen);	/* do not leak the half-built structure */
 		return NULL;
 	}
 	listen->hostname = xstrdup(interface_hostname);
 	/* FIXME - screw it!  I can't seem to get the port number through
 	   slurm_* functions */
 	listen->port = ntohs(listen->address.sin_port);
 	fd_set_nonblocking(listen->fd);

 	return listen;
 }

 /*
  * Shut down the listening socket held in "listen" and release the
  * structure together with its hostname string.
  * IN listen - structure to tear down, must not be NULL
  */
 static void _destroy_allocation_response_socket(listen_t *listen)
 {
 	xassert(listen != NULL);

 	slurm_shutdown_msg_engine(listen->fd);

 	if (listen->hostname != NULL) {
 		xfree(listen->hostname);
 	}

 	xfree(listen);
 }

 /* Process one RPC received on the allocation response socket.
  * Messages are accepted only from the SlurmUser uid, from uid 0 or from
  * the uid of the calling process.
  * IN msg: message received
  * OUT resp: resource allocation response message
  * RET 1 if resp is filled in, 0 otherwise */
 static int
 _handle_msg(slurm_msg_t *msg, resource_allocation_response_msg_t **resp)
 {
 	uid_t sender   = g_slurm_auth_get_uid(msg->auth_cred);
 	uid_t self     = getuid();
 	uid_t slurm_id = (uid_t) slurm_get_slurm_user_id();
 	bool trusted;

 	trusted = (sender == slurm_id) || (sender == 0) || (sender == self);
 	if (!trusted) {
 		error ("Security violation, slurm message from uid %u",
 			(unsigned int) sender);
 		return 0;
 	}

 	if (msg->msg_type == RESPONSE_RESOURCE_ALLOCATION) {
 		debug2("resource allocation response received");
 		slurm_send_rc_msg(msg, SLURM_SUCCESS);
 		/* hand the payload to the caller */
 		*resp = msg->data;
 		return 1;
 	}

 	if (msg->msg_type == SRUN_JOB_COMPLETE) {
 		info("Job has been cancelled");
 	} else {
 		error("received spurious message type: %d\n",
 			 msg->msg_type);
 	}

 	return 0;
 }

 /* Accept one connection on the listening socket, receive a single RPC
  * from it and process it.
  * IN listen_fd: listening file descriptor for slurmctld communications
  * OUT resp: resource allocation response message
  * RET 1 if resp is filled in, 0 otherwise (SLURM_ERROR if receiving the
  *     message failed for a reason other than EINTR) */
 static int
 _accept_msg_connection(int listen_fd,
 		       resource_allocation_response_msg_t **resp)
 {
 	int	     conn_fd;
 	slurm_msg_t  *msg = NULL;
 	slurm_addr   cli_addr;
 	char         host[256];
 	uint16_t     port;
 	int          rc = 0;

 	conn_fd = slurm_accept_msg_conn(listen_fd, &cli_addr);
 	if (conn_fd < 0) {
 		error("Unable to accept connection: %m");
 		return rc;
 	}

 	slurm_get_addr(&cli_addr, &port, host, sizeof(host));
 	debug2("got message connection from %s:%hu", host, port);

 	msg = xmalloc(sizeof(slurm_msg_t));
 	slurm_msg_t_init(msg);

 	if((rc = slurm_receive_msg(conn_fd, msg, 0)) != 0) {
 		/* capture errno before the cleanup calls can overwrite it */
 		int recv_errno = errno;

 		slurm_free_msg(msg);
 		/* the accepted connection is closed on every failure path */
 		slurm_close_accepted_conn(conn_fd);

 		if (recv_errno == EINTR) {
 			*resp = NULL;
 			return 0;
 		}

 		errno = recv_errno;	/* so that %m reports the real cause */
 		error("_accept_msg_connection[%s]: %m", host);
 		errno = recv_errno;
 		return SLURM_ERROR;
 	}

 	/* the message wrapper is released here once it has been handled */
 	rc = _handle_msg(msg, resp);
 	slurm_free_msg(msg);

 	slurm_close_accepted_conn(conn_fd);
 	return rc;
 }

 /* Wait up to sleep_time for an RPC from slurmctld indicating that the
  * resource allocation has occurred.
  * IN listen: socket on which the RPC is expected
  * IN sleep_time: delay in seconds (0 means unbounded wait)
  * OUT resp: resource allocation response message
  * RET 1 if resp is filled in, 0 otherwise (errno is ETIMEDOUT when the
  *     wait timed out); -1 if poll() failed with EAGAIN, EINTR, ENOMEM,
  *     EINVAL or EFAULT, in which case *resp is set to NULL */
 static int
 _wait_for_alloc_rpc(const listen_t *listen, int sleep_time,
 		    resource_allocation_response_msg_t **resp)
 {
 	struct pollfd pfd;
 	int wait_ms = (sleep_time == 0) ? -1 : (sleep_time * 1000);
 	int nready;

 	pfd.fd = listen->fd;
 	pfd.events = POLLIN;

 	for (;;) {
 		nready = poll(&pfd, 1, wait_ms);
 		if (nready >= 0)
 			break;

 		if ((errno == EAGAIN) || (errno == EINTR)) {
 			*resp = NULL;
 			return -1;
 		}
 		if ((errno == ENOMEM) || (errno == EINVAL) ||
 		    (errno == EFAULT)) {
 			error("poll: %m");
 			*resp = NULL;
 			return -1;
 		}
 		/* any other failure is logged and poll() is retried */
 		error("poll: %m. Continuing...");
 	}

 	if (nready == 0) {
 		/* poll timed out */
 		errno = ETIMEDOUT;
 		return 0;
 	}

 	if (pfd.revents & POLLIN)
 		return (_accept_msg_connection(listen->fd, resp));

 	return 0;
 }

 /*
  * Wait for the allocation response RPC of a pending job and, if none is
  * received, ask the controller directly whether the allocation exists.
  * IN job_id  - id of the pending job
  * IN listen  - socket on which the response RPC is expected
  * IN timeout - seconds to wait (0 means unbounded wait)
  * RET allocation response, or NULL if none could be obtained; when the
  *     controller reports ESLURM_JOB_PENDING, errno is restored to the
  *     value left by the wait
  */
 static resource_allocation_response_msg_t *
 _wait_for_allocation_response(uint32_t job_id, const listen_t *listen,
 			      int timeout)
 {
 	resource_allocation_response_msg_t *alloc = NULL;
 	int wait_errno;

 	debug("job %u queued and waiting for resources", job_id);

 	if (_wait_for_alloc_rpc(listen, timeout, &alloc) > 0)
 		return alloc;

 	wait_errno = errno;

 	/* Maybe the resource allocation response RPC got lost
 	 * in the mail; surely it should have arrived by now.
 	 * Let's see if the controller thinks that the allocation
 	 * has been granted.
 	 */
 	if (slurm_allocation_lookup_lite(job_id, &alloc) >= 0)
 		return alloc;

 	if (slurm_get_errno() != ESLURM_JOB_PENDING) {
 		debug3("Unable to confirm allocation for job %u: %m",
 		       job_id);
 		return NULL;
 	}

 	debug3("Still waiting for allocation");
 	errno = wait_errno;
 	return NULL;
 }
	/*****************************************************************************\
	* allocate.c - allocate nodes for a job or step with supplied constraints
	* $Id$
	*****************************************************************************
	* Copyright (C) 2002 The Regents of the University of California.
	* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
	* Written by Morris Jette <jette1@llnl.gov>.
	* UCRL-CODE-226842.
	*
	* This file is part of SLURM, a resource management program.
	* For details, see <http://www.llnl.gov/linux/slurm/>.
	*
	* SLURM is free software; you can redistribute it and/or modify it under
	* the terms of the GNU General Public License as published by the Free
	* Software Foundation; either version 2 of the License, or (at your option)
	* any later version.
	*
	* In addition, as a special exception, the copyright holders give permission
	* to link the code of portions of this program with the OpenSSL library under
	* certain conditions as described in each individual source file, and
	* distribute linked combinations including the two. You must obey the GNU
	* General Public License in all respects for all of the code used other than
	* OpenSSL. If you modify file(s) with this exception, you may extend this
	* exception to your version of the file(s), but you are not obligated to do
	* so. If you do not wish to do so, delete this exception statement from your
	* version. If you delete this exception statement from all source files in
	* the program, then also delete it here.
	*
	* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
	* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
	* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
	* details.
	*
	* You should have received a copy of the GNU General Public License along
	* with SLURM; if not, write to the Free Software Foundation, Inc.,
	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
	\*****************************************************************************/

	#ifdef HAVE_CONFIG_H
	# include "config.h"
	#endif

	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/poll.h>
	#include <stdbool.h>
	#include <time.h>
	#include <netinet/in.h> /* for ntohs() */

	#ifndef __USE_XOPEN_EXTENDED
	extern pid_t getsid(pid_t pid); /* missing from <unistd.h> */
	#endif

	#include <slurm/slurm.h>
	#include <stdlib.h>

	#include "src/common/read_config.h"
	#include "src/common/slurm_protocol_api.h"
	#include "src/common/hostlist.h"
	#include "src/common/xmalloc.h"
	#include "src/common/xstring.h"
	#include "src/common/forward.h"
	#include "src/common/fd.h"
	#include "src/common/slurm_auth.h"

	#define BUFFER_SIZE 1024
	#define MAX_ALLOC_WAIT 60 /* seconds */
	#define MIN_ALLOC_WAIT 5 /* seconds */

	/* State of the socket on which the allocation response RPC is awaited */
	typedef struct {
	slurm_addr address; /* socket address associated with fd */
	int fd; /* listening socket descriptor */
	char *hostname; /* hostname string owned by this structure */
	uint16_t port; /* port number sent to the controller as alloc_resp_port */
	} listen_t;

	/*
	 * Forward declarations of file-local helpers.  The parameter list of
	 * _create_allocation_response_socket() is spelled out so the prototype
	 * agrees with the call site that passes a hostname.
	 */
	static int _handle_rc_msg(slurm_msg_t *msg);
	static listen_t *_create_allocation_response_socket(char *interface_hostname);
	static void _destroy_allocation_response_socket(listen_t *listen);
	static resource_allocation_response_msg_t *_wait_for_allocation_response(
		uint32_t job_id, const listen_t *listen, int timeout);

	/*
	 * slurm_allocate_resources - allocate resources for a job request
	 * IN req   - description of resource allocation request; alloc_sid is
	 *            filled in with getsid(0) when it is NO_VAL
	 * OUT resp - response to request, or NULL when the controller answered
	 *            with a zero return code
	 * RET 0 on success, otherwise return -1 and set errno to indicate the error
	 * NOTE: free the response using slurm_free_resource_allocation_response_msg
	 */
	int
	slurm_allocate_resources (job_desc_msg_t *req,
				  resource_allocation_response_msg_t **resp)
	{
		int rc;
		slurm_msg_t req_msg;
		slurm_msg_t resp_msg;
		bool host_set = false;
		char host[64];

		slurm_msg_t_init(&req_msg);
		slurm_msg_t_init(&resp_msg);

		/*
		 * set Node and session id for this request
		 */
		if (req->alloc_sid == NO_VAL)
			req->alloc_sid = getsid(0);

		if ( (req->alloc_node == NULL)
		    && (gethostname_short(host, sizeof(host)) == 0) ) {
			req->alloc_node = host;
			host_set = true;
		}

		req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
		req_msg.data = req;

		rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

		/*
		 * Clear this hostname if set internally to this function
		 * (memory is on the stack)
		 */
		if (host_set)
			req->alloc_node = NULL;

		if (rc == SLURM_SOCKET_ERROR)
			return SLURM_SOCKET_ERROR;

		switch (resp_msg.msg_type) {
		case RESPONSE_SLURM_RC:
			if (_handle_rc_msg(&resp_msg) < 0)
				return SLURM_PROTOCOL_ERROR;
			*resp = NULL;
			break;
		case RESPONSE_RESOURCE_ALLOCATION:
			/* store through the OUT parameter; the payload is a
			 * pointer to the response structure */
			*resp = (resource_allocation_response_msg_t *) resp_msg.data;
			break;
		default:
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		}

		return SLURM_PROTOCOL_SUCCESS;
	}


	/*
	 * slurm_allocate_resources_blocking
	 *	allocate resources for a job request. This call will block until
	 *	the allocation is granted, or the specified timeout limit is reached.
	 * IN user_req - description of resource allocation request (not modified;
	 *	a private copy is sent to the controller)
	 * IN timeout - amount of time, in seconds, to wait for a response before
	 *	giving up.
	 *	A timeout of zero will wait indefinitely.
	 * IN pending_callback - If the allocation cannot be granted immediately,
	 *	the controller will put the job in the PENDING state. If
	 *	pending callback is not NULL, it will be called with the job_id
	 *	of the pending job as the sole parameter.
	 *
	 * RET allocation structure on success, NULL on error set errno to
	 *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
	 *	with no allocation granted)
	 * NOTE: free the allocation structure using
	 *	slurm_free_resource_allocation_response_msg
	 */
	resource_allocation_response_msg_t *
	slurm_allocate_resources_blocking (const job_desc_msg_t *user_req,
					   time_t timeout,
					   void(*pending_callback)(uint32_t job_id))
	{
		int rc;
		slurm_msg_t req_msg;
		slurm_msg_t resp_msg;
		resource_allocation_response_msg_t *resp = NULL;
		char *hostname = NULL;
		uint32_t job_id;
		job_desc_msg_t *req;
		listen_t *listen = NULL;
		int errnum = SLURM_SUCCESS;

		slurm_msg_t_init(&req_msg);
		slurm_msg_t_init(&resp_msg);

		/* make a copy of the user's job description struct so that we
		 * can make changes before contacting the controller */
		req = (job_desc_msg_t *)xmalloc(sizeof(job_desc_msg_t));
		if (req == NULL)
			return NULL;
		memcpy(req, user_req, sizeof(job_desc_msg_t));

		/*
		 * set Node and session id for this request
		 */
		if (req->alloc_sid == NO_VAL)
			req->alloc_sid = getsid(0);

		/* From here on req->alloc_node is either NULL or a string owned
		 * by this function (assumes xshort_hostname() returns xmalloc'd
		 * memory, like xstrdup() - TODO confirm) */
		if (user_req->alloc_node != NULL) {
			req->alloc_node = xstrdup(user_req->alloc_node);
		} else if ((hostname = xshort_hostname()) != NULL) {
			req->alloc_node = hostname;
		} else {
			error("Could not get local hostname,"
			      " forcing immediate allocation mode.");
			req->immediate = 1;
		}

		if (!req->immediate) {
			listen = _create_allocation_response_socket(hostname);
			if (listen == NULL) {
				errnum = errno;
				xfree(req->alloc_node);
				xfree(req);
				errno = errnum;
				return NULL;
			}
			/* req->alloc_resp_hostname is set by slurmctld */
			req->alloc_resp_port = listen->port;
		}

		req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
		req_msg.data = req;

		rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

		if (rc == SLURM_SOCKET_ERROR) {
			/* resp is still NULL; report the send/recv errno */
			errnum = errno;
			goto cleanup;
		}

		switch (resp_msg.msg_type) {
		case RESPONSE_SLURM_RC:
			if (_handle_rc_msg(&resp_msg) < 0) {
				/* will reach this when the allocation fails */
				errnum = errno;
			} else {
				/* shouldn't get here */
				errnum = -1;
			}
			break;
		case RESPONSE_RESOURCE_ALLOCATION:
			/* Yay, the controller has acknowledge our request! But did
			   we really get an allocation yet? */
			resp = (resource_allocation_response_msg_t *) resp_msg.data;
			if (resp->node_cnt > 0) {
				/* yes, allocation has been granted */
				errno = SLURM_PROTOCOL_SUCCESS;
			} else if (!req->immediate) {
				/* no, we need to wait for a response */
				job_id = resp->job_id;
				slurm_free_resource_allocation_response_msg(resp);
				if (pending_callback != NULL)
					pending_callback(job_id);
				resp = _wait_for_allocation_response(job_id, listen,
								     timeout);
				/* If NULL, we didn't get the allocation in
				   the time desired, so just free the job id */
				if (resp == NULL) {
					errnum = errno;
					slurm_complete_job(job_id, -1);
				}
			}
			break;
		default:
			/* fall through to the common cleanup so that the request
			 * copy and the listening socket are released and errno is
			 * actually set (resp is NULL here) */
			errnum = SLURM_UNEXPECTED_MSG_ERROR;
			break;
		}

	cleanup:
		destroy_forward(&req_msg.forward);
		destroy_forward(&resp_msg.forward);
		if (listen != NULL)
			_destroy_allocation_response_socket(listen);
		xfree(req->alloc_node);
		xfree(req);
		errno = errnum;
		return resp;
	}


	/*
	 * slurm_job_will_run - ask the controller whether a job would execute
	 *	immediately if submitted now
	 * IN req - description of resource allocation request
	 * RET 0 on success, otherwise return -1 and set errno to indicate the error
	 */
	int slurm_job_will_run (job_desc_msg_t *req)
	{
		int ctld_rc;
		slurm_msg_t query;

		/* req.immediate = true; implicit */
		slurm_msg_t_init(&query);
		query.data = req;
		query.msg_type = REQUEST_JOB_WILL_RUN;

		if (slurm_send_recv_controller_rc_msg(&query, &ctld_rc) < 0)
			return SLURM_SOCKET_ERROR;

		/* a non-zero return code from the controller becomes errno */
		if (ctld_rc != 0) {
			slurm_seterrno_ret(ctld_rc);
		}

		return SLURM_PROTOCOL_SUCCESS;
	}

	/*
	 * slurm_job_step_create - create a job step for a given job id
	 * IN req   - description of job step request
	 * OUT resp - response to request, or NULL when the controller answered
	 *            with a zero return code
	 * RET 0 on success, otherwise return -1 and set errno to indicate the error
	 * NOTE: free the response using slurm_free_job_step_create_response_msg
	 */
	int
	slurm_job_step_create (job_step_create_request_msg_t *req,
			       job_step_create_response_msg_t **resp)
	{
		slurm_msg_t req_msg;
		slurm_msg_t resp_msg;

		slurm_msg_t_init(&req_msg);
		slurm_msg_t_init(&resp_msg);
		req_msg.msg_type = REQUEST_JOB_STEP_CREATE;
		req_msg.data = req;

		if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
			return SLURM_ERROR;

		switch (resp_msg.msg_type) {
		case RESPONSE_SLURM_RC:
			if (_handle_rc_msg(&resp_msg) < 0)
				return SLURM_PROTOCOL_ERROR;
			*resp = NULL;
			break;
		case RESPONSE_JOB_STEP_CREATE:
			/* store through the OUT parameter; the payload is a
			 * pointer to the response structure */
			*resp = (job_step_create_response_msg_t *) resp_msg.data;
			break;
		default:
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
			break;
		}

		return SLURM_PROTOCOL_SUCCESS ;
	}

	/*
	 * slurm_allocation_lookup - retrieve info for an existing resource allocation
	 * IN jobid - job allocation identifier
	 * OUT info - job allocation information, or NULL when the controller
	 *            answered with a zero return code
	 * RET 0 on success, otherwise return -1 and set errno to indicate the error
	 * NOTE(review): the original note says to free the result with
	 *	slurm_free_resource_allocation_response_msg, although *info is a
	 *	job_alloc_info_response_msg_t - confirm the matching free routine
	 */
	int
	slurm_allocation_lookup(uint32_t jobid,
				job_alloc_info_response_msg_t **info)
	{
		job_alloc_info_msg_t req;
		slurm_msg_t req_msg;
		slurm_msg_t resp_msg;

		req.job_id = jobid;
		slurm_msg_t_init(&req_msg);
		slurm_msg_t_init(&resp_msg);
		req_msg.msg_type = REQUEST_JOB_ALLOCATION_INFO;
		req_msg.data = &req;

		if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
			return SLURM_ERROR;

		switch(resp_msg.msg_type) {
		case RESPONSE_SLURM_RC:
			if (_handle_rc_msg(&resp_msg) < 0)
				return SLURM_ERROR;
			*info = NULL;
			break;
		case RESPONSE_JOB_ALLOCATION_INFO:
			/* store through the OUT parameter; the payload is a
			 * pointer to the response structure */
			*info = (job_alloc_info_response_msg_t *)resp_msg.data;
			return SLURM_PROTOCOL_SUCCESS;
			break;
		default:
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
			break;
		}

		return SLURM_PROTOCOL_SUCCESS;
	}

	/*
	 * slurm_allocation_lookup_lite - retrieve info for an existing resource
	 *	allocation without the addrs and such
	 * IN jobid - job allocation identifier
	 * OUT info - set to the job allocation information returned by the
	 *	controller, or to NULL when the controller replied with a zero
	 *	return code only
	 * RET SLURM_PROTOCOL_SUCCESS on success; SLURM_ERROR if the exchange with
	 *	the controller fails or the controller returned a non-zero return
	 *	code (errno is set by _handle_rc_msg); an unexpected message type is
	 *	reported via slurm_seterrno_ret()
	 * NOTE: free "info" using slurm_free_resource_allocation_response_msg
	 */
	int
	slurm_allocation_lookup_lite(uint32_t jobid,
				     resource_allocation_response_msg_t **info)
	{
		job_alloc_info_msg_t req;
		slurm_msg_t req_msg;
		slurm_msg_t resp_msg;

		req.job_id = jobid;
		slurm_msg_t_init(&req_msg);
		slurm_msg_t_init(&resp_msg);
		req_msg.msg_type = REQUEST_JOB_ALLOCATION_INFO_LITE;
		req_msg.data = &req;

		if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
			return SLURM_ERROR;

		switch (resp_msg.msg_type) {
		case RESPONSE_SLURM_RC:
			if (_handle_rc_msg(&resp_msg) < 0)
				return SLURM_ERROR;
			*info = NULL;
			break;
		case RESPONSE_JOB_ALLOCATION_INFO_LITE:
			/*
			 * Store through the OUT pointer; assigning to the local
			 * parameter itself would never reach the caller.
			 */
			*info = (resource_allocation_response_msg_t *) resp_msg.data;
			break;
		default:
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
			break;
		}

		return SLURM_PROTOCOL_SUCCESS;
	}

	/*
	 * Consume a return-code message.
	 * The return code is copied out of msg->data and the payload is then
	 * released with slurm_free_return_code_msg().
	 * If the return code is nonzero, it is reported through
	 * slurm_seterrno_ret() (errno set, returns < 0).
	 * Otherwise, returns SLURM_SUCCESS.
	 */
	static int
	_handle_rc_msg(slurm_msg_t *msg)
	{
		/* Read the code first: the payload is freed on the next line. */
		int code = ((return_code_msg_t *) msg->data)->return_code;

		slurm_free_return_code_msg(msg->data);

		if (code == 0) {
			return SLURM_SUCCESS;
		}
		slurm_seterrno_ret(code);
	}

	/*
	 * Normalise one hostfile line in place:
	 *  - the line ends at the first newline,
	 *  - an unescaped '#' starts a comment that runs to the end of the line,
	 *  - the two-character sequence "\#" is replaced by a literal '#'.
	 * The result is always NUL terminated and never longer than the input.
	 */
	static void _strip_hostfile_line(char *line)
	{
		char *src = line;
		char *dst = line;

		while ((*src != '\0') && (*src != '\n')) {
			if (*src == '#') {
				if ((dst == line) || (dst[-1] != '\\'))
					break;	/* unescaped: comment starts here */
				dst[-1] = '#';	/* overwrite the escaping backslash */
				src++;
				continue;
			}
			*dst++ = *src++;
		}
		*dst = '\0';
	}

	/*
	 * Read a SLURM hostfile specified by "filename".  "filename" must contain
	 * a list of SLURM NodeNames, one per line.  Reads up to "n" number of
	 * hostnames from the file.  Returns a string representing a hostlist ranged
	 * string of the contents of the file.  This is a helper function, it does
	 * not contact any SLURM daemons.
	 *
	 * Text following an unescaped '#' on a line is ignored; "\#" yields a
	 * literal '#'.
	 *
	 * Returns a string representing the hostlist.  Returns NULL if there are
	 * fewer than "n" hostnames in the file, if the file cannot be opened, if a
	 * line fills the BUFFER_SIZE input buffer, or if any other error occurs.
	 * If "n" == NO_VAL then the entire file is read in.
	 *
	 * Returned string must be freed with free().
	 */
	char *slurm_read_hostfile(char *filename, int n)
	{
		FILE *fp = NULL;
		char in_line[BUFFER_SIZE];	/* input line */
		int line_size;
		int line_num = 0;
		hostlist_t hostlist = NULL;
		char *nodelist = NULL;

		if (filename == NULL || filename[0] == '\0')
			return NULL;

		if ((fp = fopen(filename, "r")) == NULL) {
			error("slurm_allocate_resources error opening file %s, %m",
			      filename);
			return NULL;
		}

		hostlist = hostlist_create(NULL);
		if (hostlist == NULL)
			goto cleanup_hostfile;	/* still has to close fp */

		while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
			line_num++;
			line_size = strlen(in_line);
			if (line_size == (BUFFER_SIZE - 1)) {
				error ("Line %d, of hostfile %s too long",
				       line_num, filename);
				goto cleanup_hostfile;
			}

			_strip_hostfile_line(in_line);

			hostlist_push(hostlist, in_line);
			if (n != (int)NO_VAL && hostlist_count(hostlist) == n)
				break;
		}
		fclose(fp);
		fp = NULL;

		if (hostlist_count(hostlist) <= 0) {
			error("Hostlist is empty!\n");
			goto cleanup_hostfile;
		}
		if (hostlist_count(hostlist) < n) {
			error("Too few NodeNames in SLURM Hostfile");
			goto cleanup_hostfile;
		}

		/* Plain malloc (not xmalloc): the caller releases this with free(). */
		nodelist = malloc(0xffff);
		if (!nodelist) {
			error("Nodelist xmalloc failed");
			goto cleanup_hostfile;
		}

		if (hostlist_ranged_string(hostlist, 0xffff, nodelist) == -1) {
			error("Hostlist is too long for the allocate RPC!");
			free(nodelist);
			nodelist = NULL;
			goto cleanup_hostfile;
		}

		debug2("Hostlist from SLURM_HOSTFILE = %s\n", nodelist);

	cleanup_hostfile:
		/* Single exit: release whatever is still held on every path. */
		if (fp != NULL)
			fclose(fp);
		if (hostlist != NULL)
			hostlist_destroy(hostlist);

		return nodelist;
	}

	/***************************************************************************
	* Support functions for slurm_allocate_resources_blocking()
	***************************************************************************/
	/*
	 * Open a listening message-engine socket on an OS-chosen port and describe
	 * it in a newly allocated listen_t.
	 * IN interface_hostname - hostname recorded (copied with xstrdup) in the
	 *	returned structure
	 * RET the new listen_t, or NULL on failure; nothing is left allocated or
	 *	open when NULL is returned.  Release a successful result with
	 *	_destroy_allocation_response_socket().
	 */
	static listen_t *_create_allocation_response_socket(char *interface_hostname)
	{
		listen_t *listen = NULL;

		listen = xmalloc(sizeof(listen_t));
		if (listen == NULL)
			return NULL;

		/* port "0" lets the operating system pick any port */
		if ((listen->fd = slurm_init_msg_engine_port(0)) < 0) {
			error("slurm_init_msg_engine_port error %m");
			xfree(listen);
			return NULL;
		}
		if (slurm_get_stream_addr(listen->fd, &listen->address) < 0) {
			error("slurm_get_stream_addr error %m");
			slurm_shutdown_msg_engine(listen->fd);
			xfree(listen);
			return NULL;
		}
		listen->hostname = xstrdup(interface_hostname);
		/* FIXME - the port number is read straight from the socket address
		 * because no slurm_* accessor for it was found */
		listen->port = ntohs(listen->address.sin_port);
		fd_set_nonblocking(listen->fd);

		return listen;
	}

	/*
	 * Tear down a listen_t: shut down its message engine, then release the
	 * hostname copy (when present) and the structure itself.
	 * IN listen - structure to destroy; must not be NULL
	 */
	static void _destroy_allocation_response_socket(listen_t *listen)
	{
		xassert(listen != NULL);

		/* Shut the socket down while the structure holding its fd is live. */
		slurm_shutdown_msg_engine(listen->fd);

		if (listen->hostname != NULL) {
			xfree(listen->hostname);
		}
		xfree(listen);
	}

	/* Process an RPC received on the allocation response socket.
	 * Messages are accepted only when the sender's uid is the SLURM user,
	 * root (0), or the uid of this process.
	 * IN msg: message received (not freed here)
	 * OUT resp: resource allocation response message; on
	 *	RESPONSE_RESOURCE_ALLOCATION it is pointed at msg->data
	 * RET 1 if resp is filled in, 0 otherwise */
	static int
	_handle_msg(slurm_msg_t *msg, resource_allocation_response_msg_t **resp)
	{
		uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred);
		uid_t uid = getuid();
		uid_t slurm_uid = (uid_t) slurm_get_slurm_user_id();
		int rc = 0;

		if ((req_uid != slurm_uid) && (req_uid != 0) && (req_uid != uid)) {
			error ("Security violation, slurm message from uid %u",
			       (unsigned int) req_uid);
			return 0;
		}

		switch (msg->msg_type) {
		case RESPONSE_RESOURCE_ALLOCATION:
			debug2("resource allocation response received");
			/* Acknowledge receipt before handing the payload over. */
			slurm_send_rc_msg(msg, SLURM_SUCCESS);
			*resp = msg->data;
			rc = 1;
			break;
		case SRUN_JOB_COMPLETE:
			info("Job has been cancelled");
			break;
		default:
			error("received spurious message type: %d\n",
			      msg->msg_type);
		}
		return rc;
	}

	/* Accept one connection on the listening socket, receive a single message
	 * from it and process that message with _handle_msg().
	 * IN listen_fd: listening file descriptor for slurmctld communications
	 * OUT resp: resource allocation response message
	 * RET 1 if resp is filled in; 0 if it is not (including a failed accept
	 *	and a receive interrupted by EINTR, in which case *resp is set to
	 *	NULL); SLURM_ERROR if the message could not be received for any
	 *	other reason.  The accepted connection is closed on every path. */
	static int
	_accept_msg_connection(int listen_fd,
			       resource_allocation_response_msg_t **resp)
	{
		int conn_fd;
		slurm_msg_t *msg = NULL;
		slurm_addr cli_addr;
		char host[256];
		uint16_t port;
		int rc = 0;

		conn_fd = slurm_accept_msg_conn(listen_fd, &cli_addr);
		if (conn_fd < 0) {
			error("Unable to accept connection: %m");
			return rc;
		}

		slurm_get_addr(&cli_addr, &port, host, sizeof(host));
		debug2("got message connection from %s:%hu", host, port);

		msg = xmalloc(sizeof(slurm_msg_t));
		slurm_msg_t_init(msg);

		if ((rc = slurm_receive_msg(conn_fd, msg, 0)) != 0) {
			/* Capture errno before the cleanup calls can overwrite it. */
			int recv_errno = errno;

			slurm_free_msg(msg);
			slurm_close_accepted_conn(conn_fd);

			if (recv_errno == EINTR) {
				*resp = NULL;
				return 0;
			}

			errno = recv_errno;	/* consumed by %m below */
			error("_accept_msg_connection[%s]: %m", host);
			return SLURM_ERROR;
		}

		/* _handle_msg() may point *resp at msg->data; it does not free msg,
		 * so the message is released here. */
		rc = _handle_msg(msg, resp);
		slurm_free_msg(msg);

		slurm_close_accepted_conn(conn_fd);
		return rc;
	}

	/* Wait up to sleep_time for an RPC from slurmctld indicating that the
	 * resource allocation has occurred.
	 * IN listen: listening socket description; only listen->fd is used
	 * IN sleep_time: delay in seconds (0 means unbounded wait)
	 * OUT resp: resource allocation response message
	 * RET the result of _accept_msg_connection() when the socket becomes
	 *	readable (1 if resp is filled in); 0 on timeout (errno is set to
	 *	ETIMEDOUT) or when poll reports no POLLIN; -1 with *resp set to NULL
	 *	when poll fails with EAGAIN, EINTR, ENOMEM, EINVAL or EFAULT.  Any
	 *	other poll failure is logged and the poll is retried. */
	static int
	_wait_for_alloc_rpc(const listen_t *listen, int sleep_time,
			    resource_allocation_response_msg_t **resp)
	{
		struct pollfd pfd;
		int timeout_ms = (sleep_time != 0) ? (sleep_time * 1000) : -1;
		int nready;

		pfd.fd = listen->fd;
		pfd.events = POLLIN;

		for (;;) {
			nready = poll(&pfd, 1, timeout_ms);
			if (nready >= 0)
				break;

			/* Interrupted or temporarily unavailable: give up quietly. */
			if ((errno == EAGAIN) || (errno == EINTR)) {
				*resp = NULL;
				return -1;
			}
			/* Failures that a retry cannot cure: log and give up. */
			if ((errno == ENOMEM) || (errno == EINVAL) ||
			    (errno == EFAULT)) {
				error("poll: %m");
				*resp = NULL;
				return -1;
			}
			error("poll: %m. Continuing...");
		}

		if (nready == 0) {
			/* poll timed out */
			errno = ETIMEDOUT;
			return 0;
		}
		if (pfd.revents & POLLIN) {
			return _accept_msg_connection(listen->fd, resp);
		}

		return 0;
	}

	/* Wait for the allocation response RPC for a queued job and, if none is
	 * received, ask the controller directly whether the allocation exists.
	 * IN job_id: job whose allocation is awaited
	 * IN listen: socket on which the response RPC is expected
	 * IN timeout: wait time handed to _wait_for_alloc_rpc()
	 * RET the allocation response, or NULL if it could not be obtained.  When
	 *	the job is still pending, errno is restored to the value left by
	 *	the wait. */
	static resource_allocation_response_msg_t *
	_wait_for_allocation_response(uint32_t job_id, const listen_t *listen,
				      int timeout)
	{
		resource_allocation_response_msg_t *alloc = NULL;
		int wait_errno;

		debug("job %u queued and waiting for resources", job_id);

		if (_wait_for_alloc_rpc(listen, timeout, &alloc) > 0)
			return alloc;

		wait_errno = errno;

		/* The response RPC may have been lost in transit, so check
		 * whether the controller already considers the allocation
		 * granted. */
		if (slurm_allocation_lookup_lite(job_id, &alloc) >= 0)
			return alloc;

		if (slurm_get_errno() != ESLURM_JOB_PENDING) {
			debug3("Unable to confirm allocation for job %u: %m",
			       job_id);
			return NULL;
		}

		debug3("Still waiting for allocation");
		errno = wait_errno;
		return NULL;
	}