/*****************************************************************************\
* job_mgr.c - manage the job information of slurm
* Note: there is a global job list (job_list), time stamp
* (last_job_update), and hash table (job_hash)
*
* $Id$
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* UCRL-CODE-226842.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.llnl.gov/linux/slurm/>.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <slurm/slurm_errno.h>
#include "src/api/job_info.h"
#include "src/common/bitstring.h"
#include "src/common/hostlist.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/slurm_jobcomp.h"
#include "src/common/switch.h"
#include "src/common/xassert.h"
#include "src/common/xstring.h"
#include "src/common/forward.h"
#include "src/common/slurm_jobacct.h"
#include "src/common/slurm_protocol_pack.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/sched_plugin.h"
#include "src/slurmctld/srun_comm.h"
#include "src/slurmctld/trigger_mgr.h"
#define DETAILS_FLAG 0xdddd
#define MAX_RETRIES 10
#define MAX_STR_LEN 1024
#define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0
#define STEP_FLAG 0xbbbb
#define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */
#define JOB_HASH_INX(_job_id) (_job_id % hash_table_size)
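/* Jobs are tracked in two structures: the global job_list and the job_hash
 * table. JOB_HASH_INX maps a job id to a hash bucket; records that collide
 * in a bucket are chained through job_record->job_next (see _add_job_hash()
 * and find_job_record() below). */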
/* Change JOB_STATE_VERSION value when changing the state save format */
#define JOB_STATE_VERSION "VER005"
/* Global variables */
List job_list = NULL; /* job_record list */
time_t last_job_update; /* time of last update to job records */
/* Local variables */
static uint32_t maximum_prio = TOP_PRIORITY;
static int hash_table_size = 0;
static int job_count = 0;		/* jobs in the system */
static uint32_t job_id_sequence = 0; /* first job_id to assign new job */
static struct job_record **job_hash = NULL;
/* Local functions */
static void _add_job_hash(struct job_record *job_ptr);
static int _copy_job_desc_to_file(job_desc_msg_t * job_desc,
uint32_t job_id);
static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
struct job_record **job_ptr,
struct part_record *part_ptr,
bitstr_t ** exc_bitmap,
bitstr_t ** req_bitmap);
static char *_copy_nodelist_no_dup(char *node_list);
static void _del_batch_list_rec(void *x);
static void _delete_job_desc_files(uint32_t job_id);
static void _dump_job_details(struct job_details *detail_ptr,
Buf buffer);
static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer);
static void _excise_node_from_job(struct job_record *job_ptr,
struct node_record *node_ptr);
static int _find_batch_dir(void *x, void *key);
static void _get_batch_job_dir_ids(List batch_dirs);
static void _job_timed_out(struct job_record *job_ptr);
static int _job_create(job_desc_msg_t * job_specs, int allocate, int will_run,
struct job_record **job_rec_ptr, uid_t submit_uid);
static void _list_delete_job(void *job_entry);
static int _list_find_job_id(void *job_entry, void *key);
static int _list_find_job_old(void *job_entry, void *key);
static int _load_job_details(struct job_record *job_ptr, Buf buffer);
static int _load_job_state(Buf buffer);
static void _pack_default_job_details(struct job_details *detail_ptr,
Buf buffer);
static void _pack_pending_job_details(struct job_details *detail_ptr,
Buf buffer);
static int _purge_job_record(uint32_t job_id);
static void _purge_lost_batch_jobs(int node_inx, time_t now);
static void _read_data_array_from_file(char *file_name, char ***data,
uint16_t * size);
static void _read_data_from_file(char *file_name, char **data);
static void _remove_defunct_batch_dirs(List batch_dirs);
static int _reset_detail_bitmaps(struct job_record *job_ptr);
static void _reset_step_bitmaps(struct job_record *job_ptr);
static int _resume_job_nodes(struct job_record *job_ptr);
static void _set_job_id(struct job_record *job_ptr);
static void _set_job_prio(struct job_record *job_ptr);
static void _signal_batch_job(struct job_record *job_ptr, uint16_t signal);
static void _signal_job(struct job_record *job_ptr, int signal);
static void _suspend_job(struct job_record *job_ptr, uint16_t op);
static int _suspend_job_nodes(struct job_record *job_ptr);
static bool _top_priority(struct job_record *job_ptr);
static int _validate_job_create_req(job_desc_msg_t * job_desc);
static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
uid_t submit_uid);
static void _validate_job_files(List batch_dirs);
static int _write_data_to_file(char *file_name, char *data);
static int _write_data_array_to_file(char *file_name, char **data,
uint16_t size);
static void _xmit_new_end_time(struct job_record *job_ptr);
/*
* create_job_record - create an empty job_record including job_details.
* load its values with defaults (zeros, nulls, and magic cookie)
* IN/OUT error_code - set to zero if no error, errno otherwise
* RET pointer to the record or NULL if error
* global: job_list - global job list
* job_count - number of jobs in the system
* last_job_update - time of last job table update
* NOTE: allocates memory that should be xfreed with _list_delete_job
*/
struct job_record *create_job_record(int *error_code)
{
struct job_record *job_ptr;
struct job_details *detail_ptr;
if (job_count >= slurmctld_conf.max_job_cnt) {
error("create_job_record: job_count exceeds limit");
*error_code = EAGAIN;
return NULL;
}
job_count++;
*error_code = 0;
last_job_update = time(NULL);
job_ptr = (struct job_record *) xmalloc(sizeof(struct job_record));
detail_ptr = (struct job_details *)xmalloc(sizeof(struct job_details));
xassert (job_ptr->magic = JOB_MAGIC); /* sets value */
job_ptr->details = detail_ptr;
job_ptr->step_list = list_create(NULL);
if (job_ptr->step_list == NULL)
fatal("memory allocation failure");
	xassert (detail_ptr->magic = DETAILS_MAGIC); /* sets value */
detail_ptr->submit_time = time(NULL);
if (list_append(job_list, job_ptr) == 0)
fatal("list_append memory allocation failure");
return job_ptr;
}
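/* A minimal usage sketch (mirroring what _load_job_state() does below when
 * rebuilding records from a checkpoint); error handling abbreviated:
 *
 *	int rc;
 *	struct job_record *job_ptr = create_job_record(&rc);
 *	if (rc)
 *		return rc;
 *	job_ptr->job_id = job_id;
 *	_add_job_hash(job_ptr);
 */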
/*
 * delete_job_details - delete a job's detail record and clear its pointer
 *	this information can be deleted as soon as the job is allocated
 *	resources and running (except that a batch job may need it to restart)
* IN job_entry - pointer to job_record to clear the record of
*/
void delete_job_details(struct job_record *job_entry)
{
int i;
if (job_entry->details == NULL)
return;
_delete_job_desc_files(job_entry->job_id);
xassert (job_entry->details->magic == DETAILS_MAGIC);
for (i=0; i<job_entry->details->argc; i++)
xfree(job_entry->details->argv[i]);
xfree(job_entry->details->argv);
xfree(job_entry->details->req_nodes);
xfree(job_entry->details->exc_nodes);
FREE_NULL_BITMAP(job_entry->details->req_node_bitmap);
xfree(job_entry->details->req_node_layout);
FREE_NULL_BITMAP(job_entry->details->exc_node_bitmap);
xfree(job_entry->details->features);
xfree(job_entry->details->err);
xfree(job_entry->details->in);
xfree(job_entry->details->out);
xfree(job_entry->details->work_dir);
xfree(job_entry->details->mc_ptr);
xfree(job_entry->details);
}
/* _delete_job_desc_files - delete job descriptor related files */
static void _delete_job_desc_files(uint32_t job_id)
{
char *dir_name, job_dir[20], *file_name;
struct stat sbuf;
dir_name = xstrdup(slurmctld_conf.state_save_location);
	sprintf(job_dir, "/job.%u", job_id);
xstrcat(dir_name, job_dir);
file_name = xstrdup(dir_name);
xstrcat(file_name, "/environment");
(void) unlink(file_name);
xfree(file_name);
file_name = xstrdup(dir_name);
xstrcat(file_name, "/script");
(void) unlink(file_name);
xfree(file_name);
if (stat(dir_name, &sbuf) == 0) /* remove job directory as needed */
(void) rmdir(dir_name);
xfree(dir_name);
}
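/* NOTE: the files removed above are the per-job files written at submit
 * time (see _copy_job_desc_to_file()): "environment" and "script" under
 * the job.<job_id> directory in state_save_location. */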
/* dump_all_job_state - save the state of all jobs to file for checkpoint
* RET 0 or error code */
int dump_all_job_state(void)
{
static int high_buffer_size = (1024 * 1024);
int error_code = 0, log_fd;
char *old_file, *new_file, *reg_file;
/* Locks: Read config and job */
slurmctld_lock_t job_read_lock =
{ READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
ListIterator job_iterator;
struct job_record *job_ptr;
Buf buffer = init_buf(high_buffer_size);
DEF_TIMERS;
START_TIMER;
/* write header: version, time */
packstr(JOB_STATE_VERSION, buffer);
pack_time(time(NULL), buffer);
/*
* write header: job id
* This is needed so that the job id remains persistent even after
* slurmctld is restarted.
*/
pack32( job_id_sequence, buffer);
debug3("Writing job id %u to header record of job_state file",
job_id_sequence);
/* write individual job records */
lock_slurmctld(job_read_lock);
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
xassert (job_ptr->magic == JOB_MAGIC);
_dump_job_state(job_ptr, buffer);
}
	/* Maintain the config lock until state_save_location has been
	 * copied into the file names below; unlock_slurmctld(job_read_lock)
	 * is deferred until then. */
list_iterator_destroy(job_iterator);
/* write the buffer to file */
old_file = xstrdup(slurmctld_conf.state_save_location);
xstrcat(old_file, "/job_state.old");
reg_file = xstrdup(slurmctld_conf.state_save_location);
xstrcat(reg_file, "/job_state");
new_file = xstrdup(slurmctld_conf.state_save_location);
xstrcat(new_file, "/job_state.new");
unlock_slurmctld(job_read_lock);
lock_state_files();
log_fd = creat(new_file, 0600);
	if (log_fd < 0) {
error("Can't save state, create file %s error %m",
new_file);
error_code = errno;
} else {
int pos = 0, nwrite = get_buf_offset(buffer), amount;
char *data = (char *)get_buf_data(buffer);
high_buffer_size = MAX(nwrite, high_buffer_size);
while (nwrite > 0) {
amount = write(log_fd, &data[pos], nwrite);
if ((amount < 0) && (errno != EINTR)) {
error("Error writing file %s, %m", new_file);
error_code = errno;
break;
}
nwrite -= amount;
pos += amount;
}
fsync(log_fd);
close(log_fd);
}
if (error_code)
(void) unlink(new_file);
else { /* file shuffle */
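		/* Rotate the files so that a crash at any single step
		 * still leaves a complete state file on disk: drop the
		 * old backup, demote the current file to the backup
		 * name, then promote the newly written file. */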
(void) unlink(old_file);
(void) link(reg_file, old_file);
(void) unlink(reg_file);
(void) link(new_file, reg_file);
(void) unlink(new_file);
}
xfree(old_file);
xfree(reg_file);
xfree(new_file);
unlock_state_files();
free_buf(buffer);
END_TIMER2("dump_all_job_state");
return error_code;
}
/*
* load_all_job_state - load the job state from file, recover from last
* checkpoint. Execute this after loading the configuration file data.
* RET 0 or error code
*/
int load_all_job_state(void)
{
int data_allocated, data_read = 0, error_code = 0;
uint32_t data_size = 0;
int state_fd, job_cnt = 0;
char *data = NULL, *state_file;
Buf buffer;
time_t buf_time;
uint32_t saved_job_id;
char *ver_str = NULL;
uint16_t ver_str_len;
/* read the file */
state_file = xstrdup(slurmctld_conf.state_save_location);
xstrcat(state_file, "/job_state");
lock_state_files();
state_fd = open(state_file, O_RDONLY);
if (state_fd < 0) {
info("No job state file (%s) to recover", state_file);
error_code = ENOENT;
} else {
data_allocated = BUF_SIZE;
data = xmalloc(data_allocated);
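		/* Read the file in BUF_SIZE chunks, growing the buffer
		 * by the amount actually read on each pass */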
while (1) {
data_read = read(state_fd, &data[data_size],
BUF_SIZE);
if (data_read < 0) {
if (errno == EINTR)
continue;
else {
error("Read error on %s: %m",
state_file);
break;
}
} else if (data_read == 0) /* eof */
break;
data_size += data_read;
data_allocated += data_read;
xrealloc(data, data_allocated);
}
close(state_fd);
}
xfree(state_file);
unlock_state_files();
if (job_id_sequence == 0)
job_id_sequence = slurmctld_conf.first_job_id;
buffer = create_buf(data, data_size);
/*
* The old header of the "job_state" file simply contained a
* timestamp, while the new header contains a "VERXXX" at the
* beginning (VER001, VER002, etc), a timestamp, and the last
* job id. To determine if we're looking at an old header or
* new header, we first check if the file begins with "VER".
*
 * Each field is preceded by two bytes which contain the field
 * size. Since we are bypassing the "pack" functions in order to
 * see if the header contains a "VERXXX" string, we need to make
* sure that there is enough data in the buffer to compare against.
*/
if (size_buf(buffer) >= sizeof(uint16_t) + strlen(JOB_STATE_VERSION))
{
char *ptr = get_buf_data(buffer);
if (memcmp(&ptr[sizeof(uint16_t)], JOB_STATE_VERSION, 3) == 0)
{
safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
debug3("Version string in job_state header is %s",
ver_str);
}
}
if (ver_str && (strcmp(ver_str, JOB_STATE_VERSION) != 0)) {
error("***********************************************");
error("Can not recover job state, incompatable version");
error("***********************************************");
xfree(ver_str);
free_buf(buffer);
return EFAULT;
}
xfree(ver_str);
safe_unpack_time(&buf_time, buffer);
safe_unpack32( &saved_job_id, buffer);
debug3("Job id in job_state header is %u", saved_job_id);
while (remaining_buf(buffer) > 0) {
error_code = _load_job_state(buffer);
if (error_code != SLURM_SUCCESS)
goto unpack_error;
job_cnt++;
}
job_id_sequence = MAX(saved_job_id, job_id_sequence);
debug3("Set job_id_sequence to %u", job_id_sequence);
free_buf(buffer);
info("Recovered state of %d jobs", job_cnt);
return error_code;
unpack_error:
error("Incomplete job data checkpoint file");
info("State of %d jobs recovered", job_cnt);
free_buf(buffer);
return SLURM_FAILURE;
}
/*
* _dump_job_state - dump the state of a specific job, its details, and
* steps to a buffer
* IN dump_job_ptr - pointer to job for which information is requested
* IN/OUT buffer - location to store data, pointers automatically advanced
*/
static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer)
{
struct job_details *detail_ptr;
ListIterator step_iterator;
struct step_record *step_ptr;
/* Dump basic job info */
pack32(dump_job_ptr->job_id, buffer);
pack32(dump_job_ptr->user_id, buffer);
pack32(dump_job_ptr->group_id, buffer);
pack32(dump_job_ptr->time_limit, buffer);
pack32(dump_job_ptr->priority, buffer);
pack32(dump_job_ptr->alloc_sid, buffer);
pack32(dump_job_ptr->dependency, buffer);
pack32(dump_job_ptr->num_procs, buffer);
pack32(dump_job_ptr->exit_code, buffer);
pack_time(dump_job_ptr->start_time, buffer);
pack_time(dump_job_ptr->end_time, buffer);
pack_time(dump_job_ptr->suspend_time, buffer);
pack_time(dump_job_ptr->pre_sus_time, buffer);
pack16(dump_job_ptr->job_state, buffer);
pack16(dump_job_ptr->next_step_id, buffer);
pack16(dump_job_ptr->kill_on_node_fail, buffer);
pack16(dump_job_ptr->kill_on_step_done, buffer);
pack16(dump_job_ptr->batch_flag, buffer);
pack16(dump_job_ptr->alloc_resp_port, buffer);
pack16(dump_job_ptr->other_port, buffer);
pack16(dump_job_ptr->mail_type, buffer);
pack16(dump_job_ptr->state_reason, buffer);
packstr(dump_job_ptr->alloc_resp_host, buffer);
packstr(dump_job_ptr->other_host, buffer);
if (dump_job_ptr->job_state & JOB_COMPLETING) {
if (dump_job_ptr->nodes_completing == NULL) {
dump_job_ptr->nodes_completing =
bitmap2node_name(
dump_job_ptr->node_bitmap);
}
packstr(dump_job_ptr->nodes_completing, buffer);
}
packstr(dump_job_ptr->nodes, buffer);
packstr(dump_job_ptr->partition, buffer);
packstr(dump_job_ptr->name, buffer);
packstr(dump_job_ptr->alloc_node, buffer);
packstr(dump_job_ptr->account, buffer);
packstr(dump_job_ptr->comment, buffer);
packstr(dump_job_ptr->network, buffer);
packstr(dump_job_ptr->mail_user, buffer);
select_g_pack_jobinfo(dump_job_ptr->select_jobinfo,
buffer);
/* Dump job details, if available */
detail_ptr = dump_job_ptr->details;
if (detail_ptr) {
xassert (detail_ptr->magic == DETAILS_MAGIC);
pack16((uint16_t) DETAILS_FLAG, buffer);
_dump_job_details(detail_ptr, buffer);
} else
pack16((uint16_t) 0, buffer); /* no details flag */
/* Dump job steps */
step_iterator = list_iterator_create(dump_job_ptr->step_list);
while ((step_ptr = (struct step_record *)
list_next(step_iterator))) {
pack16((uint16_t) STEP_FLAG, buffer);
dump_job_step_state(step_ptr, buffer);
}
list_iterator_destroy(step_iterator);
pack16((uint16_t) 0, buffer); /* no step flag */
}
/* Unpack a job's state information from a buffer */
static int _load_job_state(Buf buffer)
{
uint32_t job_id, user_id, group_id, time_limit, priority, alloc_sid;
uint32_t dependency, exit_code, num_procs;
time_t start_time, end_time, suspend_time, pre_sus_time;
uint16_t job_state, next_step_id, details, batch_flag, step_flag;
uint16_t kill_on_node_fail, kill_on_step_done, name_len;
uint16_t alloc_resp_port, other_port, mail_type, state_reason;
char *nodes = NULL, *partition = NULL, *name = NULL;
char *alloc_node = NULL, *alloc_resp_host = NULL, *other_host = NULL;
char *account = NULL, *network = NULL, *mail_user = NULL;
char *comment = NULL, *nodes_completing = NULL;
struct job_record *job_ptr;
struct part_record *part_ptr;
int error_code;
select_jobinfo_t select_jobinfo = NULL;
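	/* Unpack fields in exactly the order packed by _dump_job_state()
	 * above; a change to either function requires a matching change
	 * to the other and a new JOB_STATE_VERSION value */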
safe_unpack32(&job_id, buffer);
safe_unpack32(&user_id, buffer);
safe_unpack32(&group_id, buffer);
safe_unpack32(&time_limit, buffer);
safe_unpack32(&priority, buffer);
safe_unpack32(&alloc_sid, buffer);
safe_unpack32(&dependency, buffer);
safe_unpack32(&num_procs, buffer);
safe_unpack32(&exit_code, buffer);
safe_unpack_time(&start_time, buffer);
safe_unpack_time(&end_time, buffer);
safe_unpack_time(&suspend_time, buffer);
safe_unpack_time(&pre_sus_time, buffer);
safe_unpack16(&job_state, buffer);
safe_unpack16(&next_step_id, buffer);
safe_unpack16(&kill_on_node_fail, buffer);
safe_unpack16(&kill_on_step_done, buffer);
safe_unpack16(&batch_flag, buffer);
safe_unpack16(&alloc_resp_port, buffer);
safe_unpack16(&other_port, buffer);
safe_unpack16(&mail_type, buffer);
safe_unpack16(&state_reason, buffer);
safe_unpackstr_xmalloc(&alloc_resp_host, &name_len, buffer);
safe_unpackstr_xmalloc(&other_host, &name_len, buffer);
if (job_state & JOB_COMPLETING) {
safe_unpackstr_xmalloc(&nodes_completing,
&name_len, buffer);
}
safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
safe_unpackstr_xmalloc(&partition, &name_len, buffer);
safe_unpackstr_xmalloc(&name, &name_len, buffer);
safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
safe_unpackstr_xmalloc(&account, &name_len, buffer);
safe_unpackstr_xmalloc(&comment, &name_len, buffer);
safe_unpackstr_xmalloc(&network, &name_len, buffer);
safe_unpackstr_xmalloc(&mail_user, &name_len, buffer);
if (select_g_alloc_jobinfo(&select_jobinfo)
|| select_g_unpack_jobinfo(select_jobinfo, buffer))
goto unpack_error;
	/* validity tests, as much as possible */
if (job_id == 0) {
verbose("Invalid job_id %u", job_id);
goto unpack_error;
}
if (((job_state & (~JOB_COMPLETING)) >= JOB_END) ||
(batch_flag > 2)) {
error("Invalid data for job %u: job_state=%u batch_flag=%u",
job_id, job_state, batch_flag);
goto unpack_error;
}
if (kill_on_step_done > KILL_ON_STEP_DONE) {
error("Invalid data for job %u: kill_on_step_done=%u",
job_id, kill_on_step_done);
goto unpack_error;
}
if (kill_on_node_fail > 1) {
error("Invalid data for job %u: kill_on_node_fail=%u",
job_id, kill_on_node_fail);
goto unpack_error;
}
part_ptr = find_part_record (partition);
if (part_ptr == NULL) {
verbose("Invalid partition (%s) for job_id %u",
partition, job_id);
/* not a fatal error, partition could have been removed,
* reset_job_bitmaps() will clean-up this job */
}
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
job_ptr = create_job_record(&error_code);
if (error_code) {
error("Create job entry failed for job_id %u",
job_id);
goto unpack_error;
}
job_ptr->job_id = job_id;
_add_job_hash(job_ptr);
}
if ((maximum_prio >= priority) && (priority > 1))
maximum_prio = priority;
if (job_id_sequence <= job_id)
job_id_sequence = job_id + 1;
safe_unpack16(&details, buffer);
if ((details == DETAILS_FLAG) &&
(_load_job_details(job_ptr, buffer))) {
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
job_ptr->end_time = time(NULL);
goto unpack_error;
}
job_ptr->user_id = user_id;
job_ptr->group_id = group_id;
job_ptr->time_limit = time_limit;
job_ptr->priority = priority;
job_ptr->alloc_sid = alloc_sid;
job_ptr->start_time = start_time;
job_ptr->end_time = end_time;
job_ptr->suspend_time = suspend_time;
job_ptr->pre_sus_time = pre_sus_time;
job_ptr->job_state = job_state;
job_ptr->next_step_id = next_step_id;
job_ptr->dependency = dependency;
job_ptr->exit_code = exit_code;
job_ptr->state_reason = state_reason;
job_ptr->num_procs = num_procs;
job_ptr->time_last_active = time(NULL);
strncpy(job_ptr->name, name, MAX_JOBNAME_LEN);
xfree(name);
xfree(job_ptr->nodes);
job_ptr->nodes = nodes;
nodes = NULL; /* reused, nothing left to free */
if (nodes_completing) {
xfree(job_ptr->nodes_completing);
job_ptr->nodes_completing = nodes_completing;
nodes_completing = NULL; /* reused, nothing left to free */
}
xfree(job_ptr->alloc_node);
job_ptr->alloc_node = alloc_node;
alloc_node = NULL; /* reused, nothing left to free */
strncpy(job_ptr->partition, partition, MAX_SLURM_NAME);
xfree(partition);
job_ptr->account = account;
account = NULL; /* reused, nothing left to free */
job_ptr->comment = comment;
comment = NULL; /* reused, nothing left to free */
job_ptr->network = network;
network = NULL; /* reused, nothing left to free */
job_ptr->part_ptr = part_ptr;
job_ptr->kill_on_node_fail = kill_on_node_fail;
job_ptr->kill_on_step_done = kill_on_step_done;
job_ptr->batch_flag = batch_flag;
job_ptr->alloc_resp_port = alloc_resp_port;
job_ptr->alloc_resp_host = alloc_resp_host;
job_ptr->other_port = other_port;
job_ptr->other_host = other_host;
job_ptr->mail_type = mail_type;
job_ptr->mail_user = mail_user;
mail_user = NULL; /* reused, nothing left to free */
job_ptr->select_jobinfo = select_jobinfo;
build_node_details(job_ptr); /* set: num_cpu_groups, cpus_per_node,
* cpu_count_reps, node_cnt, and
* node_addr */
info("recovered job id %u", job_id);
safe_unpack16(&step_flag, buffer);
while (step_flag == STEP_FLAG) {
if ((error_code = load_step_state(job_ptr, buffer)))
goto unpack_error;
safe_unpack16(&step_flag, buffer);
}
return SLURM_SUCCESS;
unpack_error:
error("Incomplete job record");
xfree(alloc_resp_host);
xfree(other_host);
xfree(nodes);
xfree(nodes_completing);
xfree(partition);
xfree(name);
xfree(alloc_node);
xfree(account);
xfree(comment);
xfree(mail_user);
select_g_free_jobinfo(&select_jobinfo);
return SLURM_FAILURE;
}
/*
* _dump_job_details - dump the state of a specific job details to
* a buffer
* IN detail_ptr - pointer to job details for which information is requested
* IN/OUT buffer - location to store data, pointers automatically advanced
*/
static void _dump_job_details(struct job_details *detail_ptr, Buf buffer)
{
pack32(detail_ptr->min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
pack32(detail_ptr->total_procs, buffer);
pack32(detail_ptr->num_tasks, buffer);
pack16(detail_ptr->shared, buffer);
pack16(detail_ptr->contiguous, buffer);
pack16(detail_ptr->cpus_per_task, buffer);
pack16(detail_ptr->ntasks_per_node, buffer);
pack16(detail_ptr->no_requeue, buffer);
pack16(detail_ptr->overcommit, buffer);
pack32(detail_ptr->job_min_procs, buffer);
pack32(detail_ptr->job_min_memory, buffer);
pack32(detail_ptr->job_max_memory, buffer);
pack32(detail_ptr->job_min_tmp_disk, buffer);
pack_time(detail_ptr->begin_time, buffer);
pack_time(detail_ptr->submit_time, buffer);
packstr(detail_ptr->req_nodes, buffer);
packstr(detail_ptr->exc_nodes, buffer);
packstr(detail_ptr->features, buffer);
packstr(detail_ptr->err, buffer);
packstr(detail_ptr->in, buffer);
packstr(detail_ptr->out, buffer);
packstr(detail_ptr->work_dir, buffer);
pack_multi_core_data(detail_ptr->mc_ptr, buffer);
packstr_array(detail_ptr->argv, detail_ptr->argc, buffer);
}
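/* NOTE: _load_job_details() below must unpack fields in exactly the order
 * packed here; keep the two functions (and JOB_STATE_VERSION) in sync */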
/* _load_job_details - unpack a job's details from the buffer */
static int _load_job_details(struct job_record *job_ptr, Buf buffer)
{
char *req_nodes = NULL, *exc_nodes = NULL, *features = NULL;
char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL;
char **argv = (char **) NULL;
uint32_t min_nodes, max_nodes;
uint32_t job_min_procs, total_procs;
uint32_t job_min_memory, job_max_memory, job_min_tmp_disk;
uint32_t num_tasks;
uint16_t argc = 0, shared, contiguous, ntasks_per_node;
uint16_t cpus_per_task, name_len, no_requeue, overcommit;
time_t begin_time, submit_time;
int i;
multi_core_data_t *mc_ptr;
/* unpack the job's details from the buffer */
safe_unpack32(&min_nodes, buffer);
safe_unpack32(&max_nodes, buffer);
safe_unpack32(&total_procs, buffer);
safe_unpack32(&num_tasks, buffer);
safe_unpack16(&shared, buffer);
safe_unpack16(&contiguous, buffer);
safe_unpack16(&cpus_per_task, buffer);
safe_unpack16(&ntasks_per_node, buffer);
safe_unpack16(&no_requeue, buffer);
safe_unpack16(&overcommit, buffer);
safe_unpack32(&job_min_procs, buffer);
safe_unpack32(&job_min_memory, buffer);
safe_unpack32(&job_max_memory, buffer);
safe_unpack32(&job_min_tmp_disk, buffer);
safe_unpack_time(&begin_time, buffer);
safe_unpack_time(&submit_time, buffer);
safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer);
safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer);
safe_unpackstr_xmalloc(&features, &name_len, buffer);
safe_unpackstr_xmalloc(&err, &name_len, buffer);
safe_unpackstr_xmalloc(&in, &name_len, buffer);
safe_unpackstr_xmalloc(&out, &name_len, buffer);
safe_unpackstr_xmalloc(&work_dir, &name_len, buffer);
if (unpack_multi_core_data(&mc_ptr, buffer))
goto unpack_error;
safe_unpackstr_array(&argv, &argc, buffer);
	/* validity tests, as much as possible */
if (contiguous > 1) {
error("Invalid data for job %u: contiguous=%u",
job_ptr->job_id, contiguous);
goto unpack_error;
}
if ((no_requeue > 1) || (overcommit > 1)) {
error("Invalid data for job %u: no_requeue=%u overcommit=%u",
no_requeue, overcommit);
goto unpack_error;
}
/* free any left-over detail data */
xfree(job_ptr->details->req_nodes);
xfree(job_ptr->details->exc_nodes);
xfree(job_ptr->details->features);
xfree(job_ptr->details->err);
xfree(job_ptr->details->in);
xfree(job_ptr->details->out);
xfree(job_ptr->details->work_dir);
for (i=0; i<job_ptr->details->argc; i++)
xfree(job_ptr->details->argv[i]);
xfree(job_ptr->details->argv);
/* now put the details into the job record */
job_ptr->details->min_nodes = min_nodes;
job_ptr->details->max_nodes = max_nodes;
job_ptr->details->total_procs = total_procs;
job_ptr->details->num_tasks = num_tasks;
job_ptr->details->shared = shared;
job_ptr->details->contiguous = contiguous;
job_ptr->details->cpus_per_task = cpus_per_task;
job_ptr->details->ntasks_per_node = ntasks_per_node;
job_ptr->details->job_min_procs = job_min_procs;
job_ptr->details->job_min_memory = job_min_memory;
job_ptr->details->job_max_memory = job_max_memory;
job_ptr->details->job_min_tmp_disk = job_min_tmp_disk;
job_ptr->details->no_requeue = no_requeue;
job_ptr->details->overcommit = overcommit;
job_ptr->details->begin_time = begin_time;
job_ptr->details->submit_time = submit_time;
job_ptr->details->req_nodes = req_nodes;
job_ptr->details->exc_nodes = exc_nodes;
job_ptr->details->features = features;
job_ptr->details->err = err;
job_ptr->details->in = in;
job_ptr->details->out = out;
job_ptr->details->work_dir = work_dir;
job_ptr->details->argc = argc;
job_ptr->details->argv = argv;
job_ptr->details->mc_ptr = mc_ptr;
return SLURM_SUCCESS;
unpack_error:
xfree(req_nodes);
xfree(exc_nodes);
xfree(features);
xfree(err);
xfree(in);
xfree(out);
xfree(work_dir);
/* for (i=0; i<argc; i++)
xfree(argv[i]); Don't trust this on unpack error */
xfree(argv);
return SLURM_FAILURE;
}
/* _add_job_hash - add a job hash entry for given job record, job_id must
* already be set
* IN job_ptr - pointer to job record
* Globals: hash table updated
*/
static void _add_job_hash(struct job_record *job_ptr)
{
int inx;
inx = JOB_HASH_INX(job_ptr->job_id);
job_ptr->job_next = job_hash[inx];
job_hash[inx] = job_ptr;
}
/*
* find_job_record - return a pointer to the job record with the given job_id
* IN job_id - requested job's id
* RET pointer to the job's record, NULL on error
* global: job_list - global job list pointer
* job_hash - hash table into job records
*/
struct job_record *find_job_record(uint32_t job_id)
{
struct job_record *job_ptr;
job_ptr = job_hash[JOB_HASH_INX(job_id)];
while (job_ptr) {
if (job_ptr->job_id == job_id)
return job_ptr;
job_ptr = job_ptr->job_next;
}
return NULL;
}
/*
 * kill_job_by_part_name - Given a partition name, deallocate resources for
* its jobs and kill them. All jobs associated with this partition
* will have their partition pointer cleared.
* IN part_name - name of a partition
* RET number of jobs associated with this partition
*/
extern int kill_job_by_part_name(char *part_name)
{
ListIterator job_iterator;
struct job_record *job_ptr;
struct part_record *part_ptr;
int job_count = 0;
time_t now = time(NULL);
part_ptr = find_part_record (part_name);
if (part_ptr == NULL) /* No such partition */
return 0;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
bool suspended = false;
if (job_ptr->part_ptr != part_ptr)
continue;
job_ptr->part_ptr = NULL;
if (job_ptr->job_state == JOB_SUSPENDED)
suspended = true;
		if ((job_ptr->job_state == JOB_RUNNING) || suspended) {
job_count++;
info("Killing job_id %u on defunct partition %s",
job_ptr->job_id, part_name);
job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
job_ptr->state_reason = FAIL_DOWN_PARTITION;
if (suspended)
job_ptr->end_time = job_ptr->suspend_time;
else
job_ptr->end_time = time(NULL);
job_completion_logger(job_ptr);
deallocate_nodes(job_ptr, false, suspended);
} else if (job_ptr->job_state == JOB_PENDING) {
job_count++;
info("Killing job_id %u on defunct partition %s",
job_ptr->job_id, part_name);
job_ptr->job_state = JOB_CANCELLED;
job_ptr->start_time = now;
job_ptr->end_time = now;
job_ptr->exit_code = 1;
job_completion_logger(job_ptr);
}
}
list_iterator_destroy(job_iterator);
if (job_count)
last_job_update = time(NULL);
return job_count;
}
/*
* kill_running_job_by_node_name - Given a node name, deallocate RUNNING
* or COMPLETING jobs from the node or kill them
* IN node_name - name of a node
* IN step_test - if true, only kill the job if a step is running on the node
* RET number of killed jobs
*/
extern int kill_running_job_by_node_name(char *node_name, bool step_test)
{
ListIterator job_iterator;
struct job_record *job_ptr;
struct node_record *node_ptr;
int bit_position;
int job_count = 0;
time_t now = time(NULL);
node_ptr = find_node_record(node_name);
if (node_ptr == NULL) /* No such node */
return 0;
bit_position = node_ptr - node_record_table_ptr;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
bool suspended = false;
if ((job_ptr->node_bitmap == NULL) ||
(!bit_test(job_ptr->node_bitmap, bit_position)))
continue; /* job not on this node */
if (job_ptr->job_state == JOB_SUSPENDED)
suspended = true;
if (job_ptr->job_state & JOB_COMPLETING) {
job_count++;
bit_clear(job_ptr->node_bitmap, bit_position);
if (job_ptr->node_cnt)
(job_ptr->node_cnt)--;
else
error("node_cnt underflow on JobId=%u",
job_ptr->job_id);
if (job_ptr->node_cnt == 0) {
job_ptr->job_state &= (~JOB_COMPLETING);
delete_step_records(job_ptr, 0);
slurm_sched_schedule();
}
if (node_ptr->comp_job_cnt)
(node_ptr->comp_job_cnt)--;
else
error("Node %s comp_job_cnt underflow, "
"JobId=%u",
node_ptr->name, job_ptr->job_id);
} else if ((job_ptr->job_state == JOB_RUNNING) || suspended) {
if (step_test &&
(step_on_node(job_ptr, node_ptr) == 0))
continue;
job_count++;
if ((job_ptr->details) &&
(job_ptr->kill_on_node_fail == 0) &&
(job_ptr->node_cnt > 1)) {
/* keep job running on remaining nodes */
srun_node_fail(job_ptr->job_id, node_name);
error("Removing failed node %s from job_id %u",
node_name, job_ptr->job_id);
_excise_node_from_job(job_ptr, node_ptr);
} else if (job_ptr->batch_flag && job_ptr->details &&
(job_ptr->details->no_requeue == 0)) {
srun_node_fail(job_ptr->job_id, node_name);
info("requeue job %u due to failure of node %s",
job_ptr->job_id, node_name);
_set_job_prio(job_ptr);
job_ptr->time_last_active = now;
if (suspended)
job_ptr->end_time =
job_ptr->suspend_time;
else
job_ptr->end_time = now;
/* We want this job to look like it
* was cancelled in the accounting
* logs. Set a new submit time so the restarted
* job looks like a new job. */
job_ptr->job_state = JOB_CANCELLED;
deallocate_nodes(job_ptr, false, suspended);
job_completion_logger(job_ptr);
job_ptr->job_state = JOB_PENDING;
if (job_ptr->node_cnt)
job_ptr->job_state |= JOB_COMPLETING;
job_ptr->details->submit_time = now;
} else {
info("Killing job_id %u on failed node %s",
job_ptr->job_id, node_name);
srun_node_fail(job_ptr->job_id, node_name);
job_ptr->job_state = JOB_NODE_FAIL |
JOB_COMPLETING;
job_ptr->exit_code =
MAX(job_ptr->exit_code, 1);
job_ptr->state_reason = FAIL_DOWN_NODE;
if (suspended)
job_ptr->end_time =
job_ptr->suspend_time;
else
job_ptr->end_time = time(NULL);
deallocate_nodes(job_ptr, false, suspended);
job_completion_logger(job_ptr);
}
}
}
list_iterator_destroy(job_iterator);
if (job_count)
last_job_update = now;
return job_count;
}
/* Remove one node from a job's allocation */
static void _excise_node_from_job(struct job_record *job_ptr,
struct node_record *node_ptr)
{
make_node_idle(node_ptr, job_ptr); /* updates bitmap */
xfree(job_ptr->nodes);
job_ptr->nodes = bitmap2node_name(job_ptr->node_bitmap);
xfree(job_ptr->cpus_per_node);
xfree(job_ptr->cpu_count_reps);
xfree(job_ptr->node_addr);
/* build_node_details rebuilds everything from node_bitmap */
build_node_details(job_ptr);
}
/*
* dump_job_desc - dump the incoming job submit request message
* IN job_specs - job specification from RPC
*/
void dump_job_desc(job_desc_msg_t * job_specs)
{
long job_id;
long job_min_procs, job_min_sockets, job_min_cores, job_min_threads;
long job_min_memory, job_max_memory, job_min_tmp_disk, num_procs;
long time_limit, priority, contiguous;
long kill_on_node_fail, shared, immediate, dependency;
long cpus_per_task, no_requeue, num_tasks, overcommit;
long ntasks_per_node, ntasks_per_socket, ntasks_per_core;
char buf[100];
if (job_specs == NULL)
return;
job_id = (job_specs->job_id != NO_VAL) ?
(long) job_specs->job_id : -1L;
debug3("JobDesc: user_id=%u job_id=%ld partition=%s name=%s",
job_specs->user_id, job_id,
job_specs->partition, job_specs->name);
num_procs = (job_specs->num_procs != NO_VAL) ?
(long) job_specs->num_procs : -1L;
debug3(" num_procs=%ld", num_procs);
debug3(" -N min-[max]: %u-[%u]:%u-[%u]:%u-[%u]:%u-[%u]",
job_specs->min_nodes, job_specs->max_nodes,
job_specs->min_sockets, job_specs->max_sockets,
job_specs->min_cores, job_specs->max_cores,
job_specs->min_threads, job_specs->max_threads);
job_min_procs = (job_specs->job_min_procs != (uint16_t) NO_VAL) ?
(long) job_specs->job_min_procs : -1L;
job_min_sockets = (job_specs->job_min_sockets != (uint16_t) NO_VAL) ?
(long) job_specs->job_min_sockets : -1L;
job_min_cores = (job_specs->job_min_cores != (uint16_t) NO_VAL) ?
(long) job_specs->job_min_cores : -1L;
job_min_threads = (job_specs->job_min_threads != (uint16_t) NO_VAL) ?
(long) job_specs->job_min_threads : -1L;
debug3(" job_min_procs=%ld job_min_sockets=%ld",
job_min_procs, job_min_sockets);
debug3(" job_min_cores=%ld job_min_threads=%ld",
job_min_cores, job_min_threads);
job_min_memory = (job_specs->job_min_memory != NO_VAL) ?
(long) job_specs->job_min_memory : -1L;
job_max_memory = (job_specs->job_max_memory != NO_VAL) ?
(long) job_specs->job_max_memory : -1L;
job_min_tmp_disk = (job_specs->job_min_tmp_disk != NO_VAL) ?
(long) job_specs->job_min_tmp_disk : -1L;
debug3(" job_min_memory=%ld job_max_memory=%ld job_min_tmp_disk=%ld",
job_min_memory, job_max_memory, job_min_tmp_disk);
immediate = (job_specs->immediate == 0) ? 0L : 1L;
debug3(" immediate=%ld features=%s",
immediate, job_specs->features);
debug3(" req_nodes=%s exc_nodes=%s",
job_specs->req_nodes, job_specs->exc_nodes);
time_limit = (job_specs->time_limit != NO_VAL) ?
(long) job_specs->time_limit : -1L;
priority = (job_specs->priority != NO_VAL) ?
(long) job_specs->priority : -1L;
contiguous = (job_specs->contiguous != (uint16_t) NO_VAL) ?
(long) job_specs->contiguous : -1L;
shared = (job_specs->shared != (uint16_t) NO_VAL) ?
(long) job_specs->shared : -1L;
debug3(" time_limit=%ld priority=%ld contiguous=%ld shared=%ld",
time_limit, priority, contiguous, shared);
kill_on_node_fail = (job_specs->kill_on_node_fail !=
(uint16_t) NO_VAL) ?
(long) job_specs->kill_on_node_fail : -1L;
if (job_specs->script) /* log has problem with string len & null */
debug3(" kill_on_node_fail=%ld script=%.40s...",
kill_on_node_fail, job_specs->script);
else
debug3(" kill_on_node_fail=%ld script=%s",
kill_on_node_fail, job_specs->script);
if (job_specs->argc == 1)
debug3(" argv=\"%s\"",
job_specs->argv[0]);
else if (job_specs->argc == 2)
debug3(" argv=%s,%s",
job_specs->argv[0],
job_specs->argv[1]);
else if (job_specs->argc > 2)
debug3(" argv=%s,%s,%s,...",
job_specs->argv[0],
job_specs->argv[1],
job_specs->argv[2]);
if (job_specs->env_size == 1)
debug3(" environment=\"%s\"",
job_specs->environment[0]);
else if (job_specs->env_size == 2)
debug3(" environment=%s,%s",
job_specs->environment[0],
job_specs->environment[1]);
else if (job_specs->env_size > 2)
debug3(" environment=%s,%s,%s,...",
job_specs->environment[0],
job_specs->environment[1],
job_specs->environment[2]);
debug3(" in=%s out=%s err=%s",
job_specs->in, job_specs->out, job_specs->err);
debug3(" work_dir=%s alloc_node:sid=%s:%u",
job_specs->work_dir,
job_specs->alloc_node, job_specs->alloc_sid);
dependency = (job_specs->dependency != NO_VAL) ?
(long) job_specs->dependency : -1L;
debug3(" alloc_resp_hostname=%s alloc_resp_port=%u",
job_specs->alloc_resp_hostname, job_specs->alloc_resp_port);
debug3(" other_hostname=%s other_port=%u",
job_specs->other_hostname, job_specs->other_port);
debug3(" dependency=%ld account=%s comment=%s",
dependency, job_specs->account, job_specs->comment);
num_tasks = (job_specs->num_tasks != (uint16_t) NO_VAL) ?
(long) job_specs->num_tasks : -1L;
overcommit = (job_specs->overcommit != (uint16_t) NO_VAL) ?
(long) job_specs->overcommit : -1L;
debug3(" mail_type=%u mail_user=%s nice=%d num_tasks=%d "
"overcommit=%d",
job_specs->mail_type, job_specs->mail_user,
(int)job_specs->nice - NICE_OFFSET, num_tasks, overcommit);
slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf));
cpus_per_task = (job_specs->cpus_per_task != (uint16_t) NO_VAL) ?
(long) job_specs->cpus_per_task : -1L;
no_requeue = (job_specs->no_requeue != (uint16_t) NO_VAL) ?
(long) job_specs->no_requeue : -1L;
debug3(" network=%s begin=%s cpus_per_task=%ld no_requeue=%ld",
job_specs->network, buf, cpus_per_task, no_requeue);
ntasks_per_node = (job_specs->ntasks_per_node != (uint16_t) NO_VAL) ?
(long) job_specs->ntasks_per_node : -1L;
ntasks_per_socket = (job_specs->ntasks_per_socket !=
(uint16_t) NO_VAL) ?
(long) job_specs->ntasks_per_socket : -1L;
ntasks_per_core = (job_specs->ntasks_per_core != (uint16_t) NO_VAL) ?
(long) job_specs->ntasks_per_core : -1L;
debug3(" ntasks_per_node=%ld ntasks_per_socket=%ld "
"ntasks_per_core=%ld",
ntasks_per_node, ntasks_per_socket, ntasks_per_core);
select_g_sprint_jobinfo(job_specs->select_jobinfo,
buf, sizeof(buf), SELECT_PRINT_MIXED);
if (buf[0] != '\0')
debug3(" %s", buf);
}
/*
* init_job_conf - initialize the job configuration tables and values.
* this should be called after creating node information, but
* before creating any job entries. Pre-existing job entries are
* left unchanged.
* NOTE: The job hash table size does not change after initial creation.
* RET 0 if no error, otherwise an error code
* global: last_job_update - time of last job table update
* job_list - pointer to global job list
*/
int init_job_conf(void)
{
if (job_list == NULL) {
job_count = 0;
job_list = list_create(_list_delete_job);
if (job_list == NULL)
fatal ("Memory allocation failure");
}
last_job_update = time(NULL);
return SLURM_SUCCESS;
}
/*
* rehash_jobs - Create or rebuild the job hash table.
* NOTE: run lock_slurmctld before entry: Read config, write job
*/
extern void rehash_jobs(void)
{
if (job_hash == NULL) {
hash_table_size = slurmctld_conf.max_job_cnt;
job_hash = (struct job_record **)
xmalloc(hash_table_size * sizeof(struct job_record *));
} else if (hash_table_size < (slurmctld_conf.max_job_cnt / 2)) {
/* If the MaxJobCount grows by too much, the hash table will
* be ineffective without rebuilding. We don't presently bother
* to rebuild the hash table, but cut MaxJobCount back as
* needed. */
error ("MaxJobCount reset too high, restart slurmctld");
slurmctld_conf.max_job_cnt = hash_table_size;
}
}
/*
 * job_allocate - create job_records for the supplied job specification and
* allocate nodes for it.
* IN job_specs - job specifications
* IN immediate - if set then either initiate the job immediately or fail
* IN will_run - don't initiate the job if set, just test if it could run
* now or later
* IN allocate - resource allocation request if set, not a full job
 * IN submit_uid - uid of user issuing the request
* OUT job_pptr - set to pointer to job record
* RET 0 or an error code. If the job would only be able to execute with
* some change in partition configuration then
* ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
* NOTE: If allocating nodes lx[0-7] to a job and those nodes have cpu counts
* of 4, 4, 4, 4, 8, 8, 4, 4 then num_cpu_groups=3, cpus_per_node={4,8,4}
* and cpu_count_reps={4,2,2}
* globals: job_list - pointer to global job list
* list_part - global list of partition info
* default_part_loc - pointer to default partition
* NOTE: lock_slurmctld on entry: Read config Write job, Write node, Read part
*/
extern int job_allocate(job_desc_msg_t * job_specs, int immediate,
int will_run,
int allocate, uid_t submit_uid,
struct job_record **job_pptr)
{
	int error_code;
	bool no_alloc, top_prio, test_only, too_fragmented, independent;
	struct job_record *job_ptr;
	time_t now = time(NULL);
	error_code = _job_create(job_specs, allocate, will_run,
				 &job_ptr, submit_uid);
	*job_pptr = job_ptr;
if (error_code) {
if (immediate && job_ptr) {
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
job_ptr->start_time = job_ptr->end_time = now;
job_completion_logger(job_ptr);
}
return error_code;
}
xassert(job_ptr);
independent = job_independent(job_ptr);
/* Avoid resource fragmentation if important */
if (independent && switch_no_frag() &&
(submit_uid || (job_specs->req_nodes == NULL)) &&
job_is_completing())
too_fragmented = true; /* Don't pick nodes for job now */
/* FIXME: Ideally we only want to refuse the request if the
* required node list is insufficient to satisfy the job's
* processor or node count requirements, but the overhead is
* rather high to do that right here. We let requests from
* user root proceed if a node list is specified, for
* meta-schedulers (e.g. LCRM). */
else
too_fragmented = false;
if (independent && (!too_fragmented))
top_prio = _top_priority(job_ptr);
else
top_prio = true; /* don't bother testing,
					 * it is not runnable anyway */
if (immediate && (too_fragmented || (!top_prio) || (!independent))) {
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
job_ptr->start_time = job_ptr->end_time = now;
job_completion_logger(job_ptr);
if (!independent)
return ESLURM_DEPENDENCY;
else if (too_fragmented)
return ESLURM_FRAGMENTATION;
else
return ESLURM_NOT_TOP_PRIORITY;
}
test_only = will_run || (allocate == 0);
no_alloc = test_only || too_fragmented ||
(!top_prio) || (!independent);
error_code = select_nodes(job_ptr, no_alloc, NULL);
if (!test_only) {
last_job_update = now;
slurm_sched_schedule(); /* work for external scheduler */
}
if ((error_code == ESLURM_NODES_BUSY) ||
(error_code == ESLURM_JOB_HELD) ||
(error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) {
/* Not fatal error, but job can't be scheduled right now */
if (immediate) {
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
job_ptr->start_time = job_ptr->end_time = now;
job_completion_logger(job_ptr);
} else { /* job remains queued */
if (error_code == ESLURM_NODES_BUSY) {
error_code = SLURM_SUCCESS;
}
}
return error_code;
}
if (error_code) { /* fundamental flaw in job request */
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
job_ptr->start_time = job_ptr->end_time = now;
job_completion_logger(job_ptr);
return error_code;
}
if (will_run) { /* job would run, flag job destruction */
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->start_time = job_ptr->end_time = now;
}
return SLURM_SUCCESS;
}
/*
* job_fail - terminate a job due to initiation failure
* IN job_id - id of the job to be killed
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_fail(uint32_t job_id)
{
struct job_record *job_ptr;
time_t now = time(NULL);
bool suspended = false;
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
error("job_fail: invalid job id %u", job_id);
return ESLURM_INVALID_JOB_ID;
}
if (IS_JOB_FINISHED(job_ptr))
return ESLURM_ALREADY_DONE;
if (job_ptr->job_state == JOB_SUSPENDED)
suspended = true;
if ((job_ptr->job_state == JOB_RUNNING) || suspended) {
/* No need to signal steps, deallocate kills them */
job_ptr->time_last_active = now;
if (suspended)
job_ptr->end_time = job_ptr->suspend_time;
else
job_ptr->end_time = now;
last_job_update = now;
job_ptr->job_state = JOB_FAILED | JOB_COMPLETING;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_LAUNCH;
deallocate_nodes(job_ptr, false, suspended);
job_completion_logger(job_ptr);
return SLURM_SUCCESS;
}
/* All other states */
verbose("job_fail: job %u can't be killed from state=%s",
job_id, job_state_string(job_ptr->job_state));
return ESLURM_TRANSITION_STATE_NO_UPDATE;
}
/*
* job_signal - signal the specified job
* IN job_id - id of the job to be signaled
* IN signal - signal to send, SIGKILL == cancel the job
* IN batch_flag - signal batch shell only if set
* IN uid - uid of requesting user
* RET 0 on success, otherwise ESLURM error code
 * global: job_list - pointer to global job list
* last_job_update - time of last job table update
*/
extern int job_signal(uint32_t job_id, uint16_t signal, uint16_t batch_flag,
uid_t uid)
{
struct job_record *job_ptr;
time_t now = time(NULL);
bool super_user;
static bool wiki2_sched = false;
static bool wiki2_sched_test = false;
/* Jobs submitted using Moab command should be cancelled using
* Moab command for accurate job records */
if (!wiki2_sched_test) {
char *sched_type = slurm_get_sched_type();
if (strcmp(sched_type, "sched/wiki2") == 0)
wiki2_sched = true;
xfree(sched_type);
wiki2_sched_test = true;
}
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
info("job_signal: invalid job id %u", job_id);
return ESLURM_INVALID_JOB_ID;
}
super_user = ((uid == 0) || (uid == getuid()));
if ((job_ptr->user_id != uid) && (!super_user)) {
error("Security violation, JOB_CANCEL RPC from uid %d",
uid);
return ESLURM_ACCESS_DENIED;
}
if ((!super_user) && (signal == SIGKILL) && job_ptr->part_ptr &&
(job_ptr->part_ptr->root_only) && wiki2_sched) {
info("Attempt to cancel Moab job using Slurm command from "
"uid %d", uid);
return ESLURM_ACCESS_DENIED;
}
if (IS_JOB_FINISHED(job_ptr))
return ESLURM_ALREADY_DONE;
/* save user ID of the one who requested the job be cancelled */
	if (signal == SIGKILL)
job_ptr->requid = uid;
if ((job_ptr->job_state == (JOB_PENDING | JOB_COMPLETING)) &&
(signal == SIGKILL)) {
job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING;
verbose("job_signal of requeuing job %u successful", job_id);
return SLURM_SUCCESS;
}
if ((job_ptr->job_state == JOB_PENDING) &&
(signal == SIGKILL)) {
last_job_update = now;
job_ptr->job_state = JOB_CANCELLED;
job_ptr->start_time = now;
job_ptr->end_time = now;
srun_allocate_abort(job_ptr);
job_completion_logger(job_ptr);
delete_job_details(job_ptr);
verbose("job_signal of pending job %u successful", job_id);
return SLURM_SUCCESS;
}
if ((job_ptr->job_state == JOB_SUSPENDED)
&& (signal == SIGKILL)) {
last_job_update = now;
job_ptr->end_time = job_ptr->suspend_time;
job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING;
deallocate_nodes(job_ptr, false, true);
job_completion_logger(job_ptr);
verbose("job_signal %u of suspended job %u successful",
signal, job_id);
return SLURM_SUCCESS;
}
if (job_ptr->job_state == JOB_RUNNING) {
if (signal == SIGKILL) {
/* No need to signal steps, deallocate kills them */
job_ptr->time_last_active = now;
job_ptr->end_time = now;
last_job_update = now;
job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING;
deallocate_nodes(job_ptr, false, false);
job_completion_logger(job_ptr);
} else if (batch_flag) {
if (job_ptr->batch_flag)
_signal_batch_job(job_ptr, signal);
else
return ESLURM_JOB_SCRIPT_MISSING;
} else {
_signal_job(job_ptr, signal);
}
verbose("job_signal %u of running job %u successful",
signal, job_id);
return SLURM_SUCCESS;
}
verbose("job_signal: job %u can't be sent signal %u from state=%s",
job_id, signal, job_state_string(job_ptr->job_state));
return ESLURM_TRANSITION_STATE_NO_UPDATE;
}
static void
_signal_batch_job(struct job_record *job_ptr, uint16_t signal)
{
bitoff_t i;
kill_tasks_msg_t *kill_tasks_msg = NULL;
agent_arg_t *agent_args = NULL;
xassert(job_ptr);
i = bit_ffs(job_ptr->node_bitmap);
if (i < 0) {
error("_signal_batch_job JobId=%u lacks assigned nodes");
return;
}
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_SIGNAL_TASKS;
agent_args->retry = 1;
	agent_args->node_count = 1;	/* slurm/477: be sure to update node_count */
agent_args->hostlist =
hostlist_create(node_record_table_ptr[i].name);
kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t));
kill_tasks_msg->job_id = job_ptr->job_id;
kill_tasks_msg->job_step_id = NO_VAL;
kill_tasks_msg->signal = signal;
agent_args->msg_args = kill_tasks_msg;
agent_queue_request(agent_args);
return;
}
/*
* job_complete - note the normal termination the specified job
* IN job_id - id of the job which completed
* IN uid - user id of user issuing the RPC
* IN requeue - job should be run again if possible
* IN job_return_code - job's return code, if set then set state to FAILED
* RET - 0 on success, otherwise ESLURM error code
 * global: job_list - pointer to global job list
* last_job_update - time of last job table update
*/
extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
uint32_t job_return_code)
{
struct job_record *job_ptr;
time_t now = time(NULL);
uint32_t job_comp_flag = 0;
bool suspended = false;
info("completing job %u", job_id);
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
info("job_complete: invalid JobId=%u", job_id);
return ESLURM_INVALID_JOB_ID;
}
if (IS_JOB_FINISHED(job_ptr))
return ESLURM_ALREADY_DONE;
if ((job_ptr->user_id != uid) && (uid != 0) && (uid != getuid())) {
error("Security violation, JOB_COMPLETE RPC from uid %d",
uid);
return ESLURM_USER_ID_MISSING;
}
if (job_ptr->job_state & JOB_COMPLETING)
return SLURM_SUCCESS; /* avoid replay */
if (job_ptr->job_state == JOB_RUNNING)
job_comp_flag = JOB_COMPLETING;
if (job_ptr->job_state == JOB_SUSPENDED) {
job_comp_flag = JOB_COMPLETING;
suspended = true;
}
if (requeue && (job_ptr->batch_flag > 1)) {
/* Failed one requeue, just kill it */
requeue = 0;
if (job_return_code == 0)
job_return_code = 1;
info("Batch job launch failure, JobId=%u", job_ptr->job_id);
}
if (requeue && job_ptr->details && job_ptr->batch_flag) {
job_ptr->batch_flag++; /* only one retry */
job_ptr->job_state = JOB_PENDING | job_comp_flag;
info("Non-responding node, requeue JobId=%u", job_ptr->job_id);
} else if ((job_ptr->job_state == JOB_PENDING) && job_ptr->details &&
job_ptr->batch_flag) {
/* Possible failure mode with DOWN node and job requeue.
* The DOWN node might actually respond to the cancel and
* take us here. */
return SLURM_SUCCESS;
} else {
if (job_return_code == NO_VAL) {
job_ptr->job_state = JOB_CANCELLED | job_comp_flag;
job_ptr->requid = uid;
} else if (WIFEXITED(job_return_code) &&
WEXITSTATUS(job_return_code)) {
job_ptr->job_state = JOB_FAILED | job_comp_flag;
job_ptr->exit_code = job_return_code;
job_ptr->state_reason = FAIL_EXIT_CODE;
} else if (job_comp_flag && /* job was running */
(job_ptr->end_time < now)) { /* over time limit */
job_ptr->job_state = JOB_TIMEOUT | job_comp_flag;
job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
job_ptr->state_reason = FAIL_TIMEOUT;
} else
job_ptr->job_state = JOB_COMPLETE | job_comp_flag;
if (suspended)
job_ptr->end_time = job_ptr->suspend_time;
else
job_ptr->end_time = now;
job_completion_logger(job_ptr);
}
last_job_update = now;
if (job_comp_flag) /* job was running */
deallocate_nodes(job_ptr, false, suspended);
info("job_complete for JobId=%u successful", job_id);
return SLURM_SUCCESS;
}
/*
* _job_create - create a job table record for the supplied specifications.
* this performs only basic tests for request validity (access to
* partition, nodes count in partition, and sufficient processors in
* partition).
* input: job_specs - job specifications
* IN allocate - resource allocation request if set rather than job submit
* IN will_run - job is not to be created, test of validity only
* OUT job_pptr - pointer to the job (NULL on error)
* RET 0 on success, otherwise ESLURM error code. If the job would only be
* able to execute with some change in partition configuration then
* ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
* globals: job_list - pointer to global job list
* list_part - global list of partition info
* default_part_loc - pointer to default partition
* job_hash - hash table into job records
*/
static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
struct job_record **job_pptr, uid_t submit_uid)
{
int error_code = SLURM_SUCCESS, i;
struct job_details *detail_ptr;
enum job_state_reason fail_reason;
struct part_record *part_ptr;
bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
bool super_user = false;
struct job_record *job_ptr;
uint32_t total_nodes, max_procs;
#if SYSTEM_DIMENSIONS
uint16_t geo[SYSTEM_DIMENSIONS];
uint16_t reboot;
uint16_t rotate;
uint16_t conn_type;
#endif
debug2("before alteration asking for nodes %u-%u procs %u",
job_desc->min_nodes, job_desc->max_nodes,
job_desc->num_procs);
select_g_alter_node_cnt(SELECT_SET_NODE_CNT, job_desc);
select_g_get_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_MAX_PROCS, &max_procs);
debug2("after alteration asking for nodes %u-%u procs %u-%u",
job_desc->min_nodes, job_desc->max_nodes,
job_desc->num_procs, max_procs);
*job_pptr = (struct job_record *) NULL;
if ((error_code = _validate_job_desc(job_desc, allocate, submit_uid)))
return error_code;
/* find selected partition */
if (job_desc->partition) {
part_ptr = list_find_first(part_list, &list_find_part,
job_desc->partition);
if (part_ptr == NULL) {
info("_job_create: invalid partition specified: %s",
job_desc->partition);
error_code = ESLURM_INVALID_PARTITION_NAME;
return error_code;
}
} else {
if (default_part_loc == NULL) {
error("_job_create: default partition not set.");
error_code = ESLURM_DEFAULT_PARTITION_NOT_SET;
return error_code;
}
part_ptr = default_part_loc;
}
if ((job_desc->user_id == 0) && part_ptr->disable_root_jobs) {
error("Security violation, SUBMIT_JOB for user root disabled");
return ESLURM_USER_ID_MISSING;
}
/* can this user access this partition */
if ((part_ptr->root_only) && (submit_uid != 0)) {
info("_job_create: uid %u access to partition %s denied, %s",
(unsigned int) submit_uid, part_ptr->name, "not root");
error_code = ESLURM_ACCESS_DENIED;
return error_code;
}
if (validate_group(part_ptr, job_desc->user_id) == 0) {
info("_job_create: uid %u access to partition %s denied, %s",
(unsigned int) job_desc->user_id, part_ptr->name,
"bad group");
error_code = ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP;
return error_code;
}
/* check if the selected partition has sufficient resources to
 * satisfy the request */
/* insure that selected nodes are in this partition */
if (job_desc->req_nodes) {
error_code = node_name2bitmap(job_desc->req_nodes, false,
&req_bitmap);
if (error_code) {
error_code = ESLURM_INVALID_NODE_NAME;
goto cleanup;
}
if (job_desc->contiguous)
bit_fill_gaps(req_bitmap);
if (bit_super_set(req_bitmap, part_ptr->node_bitmap) != 1) {
char *tmp = bitmap2node_name(req_bitmap);
info("_job_create: requested nodes %s not in "
"partition %s", tmp, part_ptr->name);
xfree(tmp);
error_code = ESLURM_REQUESTED_NODES_NOT_IN_PARTITION;
goto cleanup;
}
i = bit_set_count(req_bitmap);
if (i > job_desc->min_nodes)
job_desc->min_nodes = i;
if (i > job_desc->num_procs)
job_desc->num_procs = i;
if(job_desc->max_nodes
&& job_desc->min_nodes > job_desc->max_nodes)
job_desc->max_nodes = job_desc->min_nodes;
}
if (job_desc->exc_nodes) {
error_code = node_name2bitmap(job_desc->exc_nodes, false,
&exc_bitmap);
if (error_code) {
error_code = ESLURM_INVALID_NODE_NAME;
goto cleanup;
}
}
if (exc_bitmap && req_bitmap) {
bitstr_t *tmp_bitmap = NULL;
bitoff_t first_set;
tmp_bitmap = bit_copy(exc_bitmap);
if (tmp_bitmap == NULL)
fatal("bit_copy malloc failure");
bit_and(tmp_bitmap, req_bitmap);
first_set = bit_ffs(tmp_bitmap);
FREE_NULL_BITMAP(tmp_bitmap);
if (first_set != -1) {
info("Job's required and excluded node lists overlap");
error_code = ESLURM_INVALID_NODE_NAME;
goto cleanup;
}
}
if (job_desc->min_nodes == NO_VAL)
job_desc->min_nodes = 1;
#if SYSTEM_DIMENSIONS
select_g_get_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_GEOMETRY, &geo);
if (geo[0] == (uint16_t) NO_VAL) {
for (i=0; i<SYSTEM_DIMENSIONS; i++) {
geo[i] = 0;
}
select_g_set_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_GEOMETRY, &geo);
} else if (geo[0] != 0) {
uint32_t i, tot = 1;
for (i=0; i<SYSTEM_DIMENSIONS; i++)
tot *= geo[i];
if (job_desc->min_nodes > tot) {
info("MinNodes(%d) > GeometryNodes(%d)",
job_desc->min_nodes, tot);
error_code = ESLURM_TOO_MANY_REQUESTED_CPUS;
goto cleanup;
}
job_desc->min_nodes = tot;
}
select_g_get_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_REBOOT, &reboot);
if (reboot == (uint16_t) NO_VAL) {
reboot = 0; /* default is no reboot */
select_g_set_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_REBOOT, &reboot);
}
select_g_get_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_ROTATE, &rotate);
if (rotate == (uint16_t) NO_VAL) {
rotate = 1; /* default is to rotate */
select_g_set_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_ROTATE, &rotate);
}
select_g_get_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_CONN_TYPE, &conn_type);
if (conn_type == (uint16_t) NO_VAL) {
conn_type = (uint16_t) SELECT_TORUS;
select_g_set_jobinfo(job_desc->select_jobinfo,
SELECT_DATA_CONN_TYPE, &conn_type);
}
#endif
if (job_desc->max_nodes == NO_VAL)
job_desc->max_nodes = 0;
if ((part_ptr->state_up)
&& (job_desc->num_procs > part_ptr->total_cpus)) {
info("Job requested too many cpus (%d) of partition %s(%d)",
job_desc->num_procs, part_ptr->name,
part_ptr->total_cpus);
error_code = ESLURM_TOO_MANY_REQUESTED_CPUS;
goto cleanup;
}
total_nodes = part_ptr->total_nodes;
select_g_alter_node_cnt(SELECT_APPLY_NODE_MIN_OFFSET,
&total_nodes);
if ((part_ptr->state_up) && (job_desc->min_nodes > total_nodes)) {
info("Job requested too many nodes (%d) of partition %s(%d)",
job_desc->min_nodes, part_ptr->name,
part_ptr->total_nodes);
error_code = ESLURM_TOO_MANY_REQUESTED_NODES;
goto cleanup;
}
if (job_desc->max_nodes &&
(job_desc->max_nodes < job_desc->min_nodes)) {
info("Job's max_nodes(%u) < min_nodes(%u)",
job_desc->max_nodes, job_desc->min_nodes);
error_code = ESLURM_TOO_MANY_REQUESTED_NODES;
goto cleanup;
}
if ((error_code =_validate_job_create_req(job_desc)))
goto cleanup;
if ((error_code = _copy_job_desc_to_job_record(job_desc,
job_pptr,
part_ptr,
&req_bitmap,
&exc_bitmap))) {
error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY;
goto cleanup;
}
job_ptr = *job_pptr;
if (job_ptr->dependency == job_ptr->job_id) {
info("User specified self as dependent job");
error_code = ESLURM_DEPENDENCY;
goto cleanup;
}
if (job_desc->script
&& (!will_run)) { /* don't bother with copy if just a test */
if ((error_code = _copy_job_desc_to_file(job_desc,
job_ptr->job_id))) {
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
job_ptr->start_time = job_ptr->end_time = time(NULL);
error_code = ESLURM_WRITING_TO_FILE;
goto cleanup;
}
job_ptr->batch_flag = 1;
} else
job_ptr->batch_flag = 0;
/* Insure that requested partition is valid right now,
* otherwise leave job queued and provide warning code */
detail_ptr = job_ptr->details;
fail_reason= WAIT_NO_REASON;
if ((job_desc->user_id == 0) ||
(job_desc->user_id == slurmctld_conf.slurm_user_id))
super_user = true;
if ((!super_user) &&
(job_desc->min_nodes > part_ptr->max_nodes)) {
info("Job %u requested too many nodes (%d) of "
"partition %s(%d)",
job_ptr->job_id, job_desc->min_nodes,
part_ptr->name, part_ptr->max_nodes);
fail_reason = WAIT_PART_NODE_LIMIT;
} else if ((!super_user) &&
(job_desc->max_nodes != 0) && /* no max_nodes for job */
(job_desc->max_nodes < part_ptr->min_nodes)) {
info("Job %u requested too few nodes (%d) of partition %s(%d)",
job_ptr->job_id, job_desc->max_nodes,
part_ptr->name, part_ptr->min_nodes);
fail_reason = WAIT_PART_NODE_LIMIT;
} else if (part_ptr->state_up == 0) {
info("Job %u requested down partition %s",
job_ptr->job_id, part_ptr->name);
fail_reason = WAIT_PART_STATE;
}
if (fail_reason != WAIT_NO_REASON) {
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
job_ptr->priority = 1; /* Move to end of queue */
job_ptr->state_reason = fail_reason;
}
cleanup:
FREE_NULL_BITMAP(req_bitmap);
FREE_NULL_BITMAP(exc_bitmap);
return error_code;
}
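/* Example (hedged sketch, not in the original source): a minimal
 * client-side request that passes _validate_job_desc() supplies a size
 * specification, a user id and, for batch submission, a script:
 *
 *	job_desc_msg_t req;
 *	slurm_init_job_desc_msg(&req);	// leaves fields at NO_VAL
 *	req.user_id   = getuid();
 *	req.min_nodes = 2;
 *	req.script    = "#!/bin/sh\nsrun hostname\n";
 *
 * Fields left at NO_VAL are filled with the defaults assigned in
 * _validate_job_desc() below. */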
/* Perform some size checks on strings we store to prevent
* malicious user filling slurmctld's memory
* RET 0 or error code */
static int _validate_job_create_req(job_desc_msg_t * job_desc)
{
if (job_desc->err && (strlen(job_desc->err) > MAX_STR_LEN)) {
info("_validate_job_create_req: strlen(err) too big (%d)",
strlen(job_desc->err));
return ESLURM_PATHNAME_TOO_LONG;
}
if (job_desc->in && (strlen(job_desc->in) > MAX_STR_LEN)) {
info("_validate_job_create_req: strlen(in) too big (%d)",
strlen(job_desc->in));
return ESLURM_PATHNAME_TOO_LONG;
}
if (job_desc->out && (strlen(job_desc->out) > MAX_STR_LEN)) {
info("_validate_job_create_req: strlen(out) too big (%d)",
strlen(job_desc->out));
return ESLURM_PATHNAME_TOO_LONG;
}
if (job_desc->work_dir && (strlen(job_desc->work_dir) > MAX_STR_LEN)) {
info("_validate_job_create_req: strlen(work_dir) too big (%d)",
strlen(job_desc->work_dir));
return ESLURM_PATHNAME_TOO_LONG;
}
return SLURM_SUCCESS;
}
/* _copy_job_desc_to_file - copy the job script and environment from the RPC
* structure into a file */
static int
_copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id)
{
int error_code = 0;
char *dir_name, job_dir[20], *file_name;
DEF_TIMERS;
START_TIMER;
/* Create state_save_location directory */
dir_name = xstrdup(slurmctld_conf.state_save_location);
/* Create job_id specific directory */
sprintf(job_dir, "/job.%u", job_id);
xstrcat(dir_name, job_dir);
if (mkdir(dir_name, 0700)) {
error("mkdir(%s) error %m", dir_name);
xfree(dir_name);
return ESLURM_WRITING_TO_FILE;
}
/* Create environment file, and write data to it */
file_name = xstrdup(dir_name);
xstrcat(file_name, "/environment");
error_code = _write_data_array_to_file(file_name,
job_desc->environment,
job_desc->env_size);
xfree(file_name);
if (error_code == 0) {
/* Create script file */
file_name = xstrdup(dir_name);
xstrcat(file_name, "/script");
error_code =
_write_data_to_file(file_name, job_desc->script);
xfree(file_name);
}
xfree(dir_name);
END_TIMER2("_copy_job_desc_to_file");
return error_code;
}
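/* Resulting state-save layout (sketch; paths depend on the configured
 * StateSaveLocation):
 *
 *	<state_save_location>/job.<job_id>/environment
 *	<state_save_location>/job.<job_id>/script
 *
 * get_job_env() and get_job_script() below read these files back. */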
/*
* Create file with specified name and write the supplied data array to it
* IN file_name - file to create and write to
* IN data - array of pointers to strings (e.g. env)
* IN size - number of elements in data
*/
static int
_write_data_array_to_file(char *file_name, char **data, uint16_t size)
{
int fd, i, pos, nwrite, amount;
fd = creat(file_name, 0600);
if (fd < 0) {
error("Error creating file %s, %m", file_name);
return ESLURM_WRITING_TO_FILE;
}
amount = write(fd, &size, sizeof(uint16_t));
if (amount < sizeof(uint16_t)) {
error("Error writing file %s, %m", file_name);
close(fd);
return ESLURM_WRITING_TO_FILE;
}
if (data == NULL) {
close(fd);	/* don't leak the descriptor */
return SLURM_SUCCESS;
}
for (i = 0; i < size; i++) {
nwrite = strlen(data[i]) + 1;
pos = 0;
while (nwrite > 0) {
amount = write(fd, &data[i][pos], nwrite);
if ((amount < 0) && (errno != EINTR)) {
error("Error writing file %s, %m",
file_name);
close(fd);
return ESLURM_WRITING_TO_FILE;
}
nwrite -= amount;
pos += amount;
}
}
close(fd);
return SLURM_SUCCESS;
}
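/* On-disk format written above (sketch): a uint16_t record count in
 * host byte order, followed by that many nul-terminated strings packed
 * back to back:
 *
 *	[count][var0=val0\0][var1=val1\0]...[varN=valN\0]
 *
 * _read_data_array_from_file() below relies on exactly this layout. */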
/*
* Create file with specified name and write the supplied data to it
* IN file_name - file to create and write to
* IN data - pointer to a nul-terminated string
*/
static int _write_data_to_file(char *file_name, char *data)
{
int fd, pos, nwrite, amount;
if (data == NULL) {
(void) unlink(file_name);
return SLURM_SUCCESS;
}
fd = creat(file_name, 0700);
if (fd < 0) {
error("Error creating file %s, %m", file_name);
return ESLURM_WRITING_TO_FILE;
}
nwrite = strlen(data) + 1;
pos = 0;
while (nwrite > 0) {
amount = write(fd, &data[pos], nwrite);
if ((amount < 0) && (errno != EINTR)) {
error("Error writing file %s, %m", file_name);
close(fd);
return ESLURM_WRITING_TO_FILE;
}
nwrite -= amount;
pos += amount;
}
close(fd);
return SLURM_SUCCESS;
}
/*
* get_job_env - return the environment variables and their count for a
* given job
* IN job_ptr - pointer to job for which data is required
* OUT env_size - number of elements to read
* RET pointer to array of string pointers containing environment variables
* NOTE: READ lock_slurmctld config before entry
*/
char **get_job_env(struct job_record *job_ptr, uint16_t * env_size)
{
char job_dir[30], *file_name, **environment = NULL;
file_name = xstrdup(slurmctld_conf.state_save_location);
sprintf(job_dir, "/job.%d/environment", job_ptr->job_id);
xstrcat(file_name, job_dir);
_read_data_array_from_file(file_name, &environment, env_size);
xfree(file_name);
return environment;
}
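/* Example use (hedged sketch; assumes the returned strings share the
 * single buffer xmalloc'd by _read_data_array_from_file(), as coded
 * below):
 *
 *	int i;
 *	uint16_t env_cnt = 0;
 *	char **env = get_job_env(job_ptr, &env_cnt);
 *	for (i = 0; i < env_cnt; i++)
 *		debug3("env[%d]=%s", i, env[i]);
 *	if (env) {
 *		xfree(env[0]);	// frees the shared string buffer
 *		xfree(env);	// frees the pointer array
 *	}
 */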
/*
* get_job_script - return the script for a given job
* IN job_ptr - pointer to job for which data is required
* RET pointer to string containing the job script
* NOTE: READ lock_slurmctld config before entry
*/
char *get_job_script(struct job_record *job_ptr)
{
char job_dir[30], *file_name, *script = NULL;
file_name = xstrdup(slurmctld_conf.state_save_location);
sprintf(job_dir, "/job.%d/script", job_ptr->job_id);
xstrcat(file_name, job_dir);
_read_data_from_file(file_name, &script);
xfree(file_name);
return script;
}
/*
* Read a collection of strings from a file
* IN file_name - file to read from
* OUT data - pointer to array of pointers to strings (e.g. env),
* must be xfreed when no longer needed
* OUT size - number of elements in data
* NOTE: The output format of this must be identical with _xduparray2()
*/
static void
_read_data_array_from_file(char *file_name, char ***data, uint16_t * size)
{
int fd, pos, buf_size, amount, i;
char *buffer, **array_ptr;
uint16_t rec_cnt;
xassert(file_name);
xassert(data);
xassert(size);
*data = NULL;
*size = 0;
fd = open(file_name, 0);
if (fd < 0) {
error("Error opening file %s, %m", file_name);
return;
}
amount = read(fd, &rec_cnt, sizeof(uint16_t));
if (amount < sizeof(uint16_t)) {
if (amount != 0) /* incomplete write */
error("Error reading file %s, %m", file_name);
else
verbose("File %s has zero size", file_name);
close(fd);
return;
}
if (rec_cnt == 0) {
*data = NULL;
*size = 0;
close(fd);	/* don't leak the descriptor */
return;
}
pos = 0;
buf_size = BUF_SIZE;
buffer = xmalloc(buf_size);
while (1) {
amount = read(fd, &buffer[pos], BUF_SIZE);
if (amount < 0) {
error("Error reading file %s, %m", file_name);
xfree(buffer);
close(fd);
return;
}
if (amount < BUF_SIZE) /* end of file */
break;
pos += amount;
buf_size += amount;
xrealloc(buffer, buf_size);
}
close(fd);
/* We have all the data, now let's compute the pointers */
pos = 0;
array_ptr = xmalloc(rec_cnt * sizeof(char *));
for (i = 0; i < rec_cnt; i++) {
array_ptr[i] = &buffer[pos];
pos += strlen(&buffer[pos]) + 1;
if ((pos > buf_size) && ((i + 1) < rec_cnt)) {
error("Bad environment file %s", file_name);
break;
}
}
*size = rec_cnt;
*data = array_ptr;
return;
}
/*
* Read a string from a file
* IN file_name - file to read from
* OUT data - pointer to string
* must be xfreed when no longer needed
*/
void _read_data_from_file(char *file_name, char **data)
{
int fd, pos, buf_size, amount;
char *buffer;
xassert(file_name);
xassert(data);
*data = NULL;
fd = open(file_name, 0);
if (fd < 0) {
error("Error opening file %s, %m", file_name);
return;
}
pos = 0;
buf_size = BUF_SIZE;
buffer = xmalloc(buf_size);
while (1) {
amount = read(fd, &buffer[pos], BUF_SIZE);
if (amount < 0) {
error("Error reading file %s, %m", file_name);
xfree(buffer);
close(fd);
return;
}
if (amount < BUF_SIZE) /* end of file */
break;
pos += amount;
buf_size += amount;
xrealloc(buffer, buf_size);
}
*data = buffer;
close(fd);
return;
}
/* Given a job request, return a multi_core_data struct.
* Returns NULL if no values set in the job/step request */
static multi_core_data_t *
_set_multi_core_data(job_desc_msg_t * job_desc)
{
multi_core_data_t * mc_ptr;
if (((job_desc->job_min_sockets == (uint16_t) NO_VAL)
|| (job_desc->job_min_sockets == (uint16_t) 1))
&& ((job_desc->job_min_cores == (uint16_t) NO_VAL)
|| (job_desc->job_min_cores == (uint16_t) 1))
&& ((job_desc->job_min_threads == (uint16_t) NO_VAL)
|| (job_desc->job_min_threads == (uint16_t) 1))
&& ((job_desc->min_sockets == (uint16_t) NO_VAL)
|| (job_desc->min_sockets == (uint16_t) 1))
&& (job_desc->max_sockets == (uint16_t) NO_VAL)
&& ((job_desc->min_cores == (uint16_t) NO_VAL)
|| (job_desc->min_cores == (uint16_t) 1))
&& (job_desc->max_cores == (uint16_t) NO_VAL)
&& ((job_desc->min_threads == (uint16_t) NO_VAL)
|| (job_desc->min_threads == (uint16_t) 1))
&& (job_desc->max_threads == (uint16_t) NO_VAL)
&& (job_desc->ntasks_per_socket == (uint16_t) NO_VAL)
&& (job_desc->ntasks_per_core == (uint16_t) NO_VAL)
&& (job_desc->plane_size == (uint16_t) NO_VAL))
return NULL;
mc_ptr = xmalloc(sizeof(multi_core_data_t));
if (job_desc->job_min_sockets != (uint16_t) NO_VAL)
mc_ptr->job_min_sockets = job_desc->job_min_sockets;
else
mc_ptr->job_min_sockets = 1;
if (job_desc->job_min_cores != (uint16_t) NO_VAL)
mc_ptr->job_min_cores = job_desc->job_min_cores;
else
mc_ptr->job_min_cores = 1;
if (job_desc->job_min_threads != (uint16_t) NO_VAL)
mc_ptr->job_min_threads = job_desc->job_min_threads;
else
mc_ptr->job_min_threads = 1;
if (job_desc->min_sockets != (uint16_t) NO_VAL)
mc_ptr->min_sockets = job_desc->min_sockets;
else
mc_ptr->min_sockets = 1;
if (job_desc->max_sockets != (uint16_t) NO_VAL)
mc_ptr->max_sockets = job_desc->max_sockets;
else
mc_ptr->max_sockets = 0xffff;
if (job_desc->min_cores != (uint16_t) NO_VAL)
mc_ptr->min_cores = job_desc->min_cores;
else
mc_ptr->min_cores = 1;
if (job_desc->max_cores != (uint16_t) NO_VAL)
mc_ptr->max_cores = job_desc->max_cores;
else
mc_ptr->max_cores = 0xffff;
if (job_desc->min_threads != (uint16_t) NO_VAL)
mc_ptr->min_threads = job_desc->min_threads;
else
mc_ptr->min_threads = 1;
if (job_desc->max_threads != (uint16_t) NO_VAL)
mc_ptr->max_threads = job_desc->max_threads;
else
mc_ptr->max_threads = 0xffff;
if (job_desc->ntasks_per_socket != (uint16_t) NO_VAL)
mc_ptr->ntasks_per_socket = job_desc->ntasks_per_socket;
else
mc_ptr->ntasks_per_socket = 0;
if (job_desc->ntasks_per_core != (uint16_t) NO_VAL)
mc_ptr->ntasks_per_core = job_desc->ntasks_per_core;
else
mc_ptr->ntasks_per_core = 0;
if (job_desc->plane_size != (uint16_t) NO_VAL)
mc_ptr->plane_size = job_desc->plane_size;
else
mc_ptr->plane_size = 0;
return mc_ptr;
}
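/* Example (illustrative, not in the original source): a request
 * carrying only min_sockets=2, with every other multi-core field still
 * NO_VAL, fails the all-defaults test above and yields:
 *
 *	mc_ptr->min_sockets = 2;	// from the request
 *	mc_ptr->max_sockets = 0xffff;	// unlimited default
 *	mc_ptr->min_cores = mc_ptr->min_threads = 1;
 *
 * A request with every field at NO_VAL (or its default of 1) returns
 * NULL, so such jobs store no multi_core_data_t at all. */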
/* _copy_job_desc_to_job_record - copy the job descriptor from the RPC
* structure into the actual slurmctld job record */
static int
_copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
struct job_record **job_rec_ptr,
struct part_record *part_ptr,
bitstr_t ** req_bitmap,
bitstr_t ** exc_bitmap)
{
int error_code;
struct job_details *detail_ptr;
struct job_record *job_ptr;
job_ptr = create_job_record(&error_code);
if (error_code)
return error_code;
strncpy(job_ptr->partition, part_ptr->name, MAX_SLURM_NAME);
job_ptr->part_ptr = part_ptr;
if (job_desc->job_id != NO_VAL) /* already confirmed unique */
job_ptr->job_id = job_desc->job_id;
else
_set_job_id(job_ptr);
_add_job_hash(job_ptr);
if (job_desc->name) {
strncpy(job_ptr->name, job_desc->name, MAX_JOBNAME_LEN);
}
job_ptr->user_id = (uid_t) job_desc->user_id;
job_ptr->group_id = (gid_t) job_desc->group_id;
job_ptr->job_state = JOB_PENDING;
job_ptr->time_limit = job_desc->time_limit;
job_ptr->alloc_sid = job_desc->alloc_sid;
job_ptr->alloc_node = xstrdup(job_desc->alloc_node);
job_ptr->account = xstrdup(job_desc->account);
job_ptr->network = xstrdup(job_desc->network);
job_ptr->comment = xstrdup(job_desc->comment);
if (job_desc->dependency != NO_VAL) /* leave as zero */
job_ptr->dependency = job_desc->dependency;
if (job_desc->priority != NO_VAL) /* already confirmed submit_uid==0 */
job_ptr->priority = job_desc->priority;
else {
_set_job_prio(job_ptr);
job_ptr->priority -= ((int)job_desc->nice - NICE_OFFSET);
}
if (job_desc->kill_on_node_fail != (uint16_t) NO_VAL)
job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail;
job_ptr->alloc_resp_port = job_desc->alloc_resp_port;
job_ptr->alloc_resp_host = xstrdup(job_desc->alloc_resp_hostname);
job_ptr->other_port = job_desc->other_port;
job_ptr->other_host = xstrdup(job_desc->other_hostname);
job_ptr->time_last_active = time(NULL);
job_ptr->num_procs = job_desc->num_procs;
job_ptr->cr_enabled = 0;
job_ptr->mail_type = job_desc->mail_type;
job_ptr->mail_user = xstrdup(job_desc->mail_user);
detail_ptr = job_ptr->details;
detail_ptr->argc = job_desc->argc;
detail_ptr->argv = job_desc->argv;
job_desc->argv = (char **) NULL; /* nothing left */
job_desc->argc = 0; /* nothing left */
detail_ptr->min_nodes = job_desc->min_nodes;
detail_ptr->max_nodes = job_desc->max_nodes;
if (job_desc->req_nodes) {
detail_ptr->req_nodes =
_copy_nodelist_no_dup(job_desc->req_nodes);
detail_ptr->req_node_bitmap = *req_bitmap;
detail_ptr->req_node_layout = NULL; /* Layout specified at start time */
*req_bitmap = NULL; /* Reused nothing left to free */
}
if (job_desc->exc_nodes) {
detail_ptr->exc_nodes =
_copy_nodelist_no_dup(job_desc->exc_nodes);
detail_ptr->exc_node_bitmap = *exc_bitmap;
*exc_bitmap = NULL; /* Reused nothing left to free */
}
if (job_desc->features)
detail_ptr->features = xstrdup(job_desc->features);
detail_ptr->shared = job_desc->shared;
if (job_desc->contiguous != (uint16_t) NO_VAL)
detail_ptr->contiguous = job_desc->contiguous;
if (job_desc->task_dist != (uint16_t) NO_VAL)
detail_ptr->task_dist = job_desc->task_dist;
if (job_desc->cpus_per_task != (uint16_t) NO_VAL)
detail_ptr->cpus_per_task = job_desc->cpus_per_task;
if (job_desc->ntasks_per_node != (uint16_t) NO_VAL)
detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
if (job_desc->no_requeue != (uint16_t) NO_VAL)
detail_ptr->no_requeue = job_desc->no_requeue;
if (job_desc->job_min_procs != (uint16_t) NO_VAL)
detail_ptr->job_min_procs = job_desc->job_min_procs;
detail_ptr->job_min_procs = MAX(detail_ptr->job_min_procs,
detail_ptr->cpus_per_task);
if (job_desc->job_min_memory != NO_VAL)
detail_ptr->job_min_memory = job_desc->job_min_memory;
if (job_desc->job_max_memory != NO_VAL)
detail_ptr->job_max_memory = job_desc->job_max_memory;
if (job_desc->job_min_tmp_disk != NO_VAL)
detail_ptr->job_min_tmp_disk = job_desc->job_min_tmp_disk;
if (job_desc->num_tasks != NO_VAL)
detail_ptr->num_tasks = job_desc->num_tasks;
if (job_desc->err)
detail_ptr->err = xstrdup(job_desc->err);
if (job_desc->in)
detail_ptr->in = xstrdup(job_desc->in);
if (job_desc->out)
detail_ptr->out = xstrdup(job_desc->out);
if (job_desc->work_dir)
detail_ptr->work_dir = xstrdup(job_desc->work_dir);
if (job_desc->overcommit != (uint16_t) NO_VAL)
detail_ptr->overcommit = job_desc->overcommit;
if (job_desc->begin_time > time(NULL))
detail_ptr->begin_time = job_desc->begin_time;
job_ptr->select_jobinfo =
select_g_copy_jobinfo(job_desc->select_jobinfo);
detail_ptr->mc_ptr = _set_multi_core_data(job_desc);
*job_rec_ptr = job_ptr;
return SLURM_SUCCESS;
}
/*
* _copy_nodelist_no_dup - Take a node_list string and convert it to an
* expression without duplicate names. For example, we want to convert
* a user's request for nodes "lx1,lx2,lx1,lx3" to "lx[1-3]"
* node_list IN - string describing a list of nodes
* RET a compact node expression, must be xfreed by the user
*/
static char *_copy_nodelist_no_dup(char *node_list)
{
char buf[8192];
hostlist_t hl = hostlist_create(node_list);
if (hl == NULL)
return NULL;
hostlist_uniq(hl);
hostlist_ranged_string(hl, 8192, buf);
hostlist_destroy(hl);
return xstrdup(buf);
}
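/* Example use (sketch):
 *
 *	char *clean = _copy_nodelist_no_dup("lx1,lx2,lx1,lx3");
 *	// clean is "lx[1-3]" and must be xfreed by the caller
 *
 * Note the fixed 8192-byte buffer bounds the returned expression. */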
/*
* job_time_limit - terminate jobs which have exceeded their time limit
* global: job_list - pointer global job list
* last_job_update - time of last job table update
* NOTE: READ lock_slurmctld config before entry
*/
void job_time_limit(void)
{
ListIterator job_iterator;
struct job_record *job_ptr;
time_t now = time(NULL);
time_t old = now - slurmctld_conf.inactive_limit;
job_iterator = list_iterator_create(job_list);
while ((job_ptr =
(struct job_record *) list_next(job_iterator))) {
xassert (job_ptr->magic == JOB_MAGIC);
if (job_ptr->job_state != JOB_RUNNING)
continue;
/* Consider a job active if it has any active steps */
if (job_ptr->step_list
&& (list_count(job_ptr->step_list) > 0))
job_ptr->time_last_active = now;
if (slurmctld_conf.inactive_limit
&& (job_ptr->time_last_active <= old)
&& (job_ptr->part_ptr)
&& (job_ptr->part_ptr->root_only == 0)) {
/* job inactive, kill it */
info("Inactivity time limit reached for JobId=%u",
job_ptr->job_id);
_job_timed_out(job_ptr);
job_ptr->state_reason = FAIL_INACTIVE_LIMIT;
continue;
}
if ((job_ptr->time_limit != INFINITE)
&& (job_ptr->end_time <= now)) {
last_job_update = now;
info("Time limit exhausted for JobId=%u",
job_ptr->job_id);
_job_timed_out(job_ptr);
job_ptr->state_reason = FAIL_TIMEOUT;
continue;
}
/* Give srun command warning message about pending timeout */
if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2))
srun_timeout (job_ptr);
}
list_iterator_destroy(job_iterator);
}
/* Terminate a job that has exhausted its time limit */
static void _job_timed_out(struct job_record *job_ptr)
{
xassert(job_ptr);
if (job_ptr->details) {
time_t now = time(NULL);
job_ptr->end_time = now;
job_ptr->time_last_active = now;
job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING;
job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
deallocate_nodes(job_ptr, true, false);
job_completion_logger(job_ptr);
} else
job_signal(job_ptr->job_id, SIGKILL, 0, 0);
return;
}
/* _validate_job_desc - validate that a job descriptor for job submit or
* allocate has valid data, set values to defaults as required
* IN/OUT job_desc_msg - pointer to job descriptor, modified as needed
* IN allocate - if zero, job is to be queued; if set, allocate for user now
* IN submit_uid - uid from which the request originated
*/
static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
uid_t submit_uid)
{
if ((job_desc_msg->num_procs == NO_VAL)
&& (job_desc_msg->min_nodes == NO_VAL)
&& (job_desc_msg->req_nodes == NULL)) {
info("Job specified no num_procs, min_nodes or req_nodes");
return ESLURM_JOB_MISSING_SIZE_SPECIFICATION;
}
if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) &&
(job_desc_msg->script == NULL)) {
info("_validate_job_desc: job failed to specify Script");
return ESLURM_JOB_SCRIPT_MISSING;
}
if (job_desc_msg->user_id == NO_VAL) {
info("_validate_job_desc: job failed to specify User");
return ESLURM_USER_ID_MISSING;
}
if ( job_desc_msg->group_id == NO_VAL ) {
debug("_validate_job_desc: job failed to specify group");
job_desc_msg->group_id = 0; /* uses user default */
}
if ((job_desc_msg->name) &&
(strlen(job_desc_msg->name) >= MAX_JOBNAME_LEN)) {
job_desc_msg->name[MAX_JOBNAME_LEN-1] = '\0';
}
if (job_desc_msg->contiguous == (uint16_t) NO_VAL)
job_desc_msg->contiguous = 0;
if (job_desc_msg->task_dist == (uint16_t) NO_VAL) {
/* not typically set by salloc or sbatch */
job_desc_msg->task_dist = SLURM_DIST_CYCLIC;
}
if (job_desc_msg->plane_size == (uint16_t) NO_VAL)
job_desc_msg->plane_size = 0;
if (job_desc_msg->kill_on_node_fail == (uint16_t) NO_VAL)
job_desc_msg->kill_on_node_fail = 1;
if (job_desc_msg->job_id != NO_VAL) {
struct job_record *dup_job_ptr;
if ((submit_uid != 0) &&
(submit_uid != slurmctld_conf.slurm_user_id)) {
info("attempt by uid %u to set job_id", submit_uid);
return ESLURM_INVALID_JOB_ID;
}
if (job_desc_msg->job_id == 0) {
info("attempt by uid %u to set zero job_id",
submit_uid);
return ESLURM_INVALID_JOB_ID;
}
dup_job_ptr = find_job_record((uint32_t) job_desc_msg->job_id);
if (dup_job_ptr &&
(!(IS_JOB_FINISHED(dup_job_ptr)))) {
info("attempt re-use active job_id %u",
job_desc_msg->job_id);
return ESLURM_DUPLICATE_JOB_ID;
}
if (dup_job_ptr) /* Purge the record for re-use */
_purge_job_record(job_desc_msg->job_id);
}
if ((submit_uid != 0) /* only root or SlurmUser can set job prio */
&& (submit_uid != slurmctld_conf.slurm_user_id)) {
if (job_desc_msg->priority != 0)
job_desc_msg->priority = NO_VAL;
if (job_desc_msg->nice < NICE_OFFSET)
job_desc_msg->nice = NICE_OFFSET;
}
if (job_desc_msg->min_sockets == (uint16_t) NO_VAL)
job_desc_msg->min_sockets = 1; /* default socket count of 1 */
if (job_desc_msg->min_cores == (uint16_t) NO_VAL)
job_desc_msg->min_cores = 1; /* default core count of 1 */
if (job_desc_msg->min_threads == (uint16_t) NO_VAL)
job_desc_msg->min_threads = 1; /* default thread count of 1 */
if (job_desc_msg->min_nodes == NO_VAL)
job_desc_msg->min_nodes = 1; /* default node count of 1 */
if (job_desc_msg->num_procs == NO_VAL)
job_desc_msg->num_procs = job_desc_msg->min_nodes;
if (job_desc_msg->job_min_procs == (uint16_t) NO_VAL)
job_desc_msg->job_min_procs = 1; /* default 1 cpu per node */
if (job_desc_msg->job_min_sockets == (uint16_t) NO_VAL)
job_desc_msg->job_min_sockets = 1; /* default 1 socket per node */
if (job_desc_msg->job_min_cores == (uint16_t) NO_VAL)
job_desc_msg->job_min_cores = 1; /* default 1 core per socket */
if (job_desc_msg->job_min_threads == (uint16_t) NO_VAL)
job_desc_msg->job_min_threads = 1; /* default 1 thread per core */
if (job_desc_msg->job_min_memory == NO_VAL)
job_desc_msg->job_min_memory = 1; /* default 1MB mem per node */
if (job_desc_msg->job_max_memory == NO_VAL)
job_desc_msg->job_max_memory = 1; /* default 1MB mem per node */
if (job_desc_msg->job_min_tmp_disk == NO_VAL)
job_desc_msg->job_min_tmp_disk = 0;/* default 0MB disk per node */
return SLURM_SUCCESS;
}
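/* Example (hedged sketch): an allocate request carrying only user_id
 * and req_nodes leaves this function with, among other defaults,
 *
 *	min_nodes = 1, num_procs = 1 (both later raised to the
 *	req_nodes count by _job_create), contiguous = 0,
 *	kill_on_node_fail = 1, job_min_procs/sockets/cores/threads = 1,
 *	job_min_memory = 1, job_min_tmp_disk = 0
 *
 * so downstream scheduling code need not test these fields for
 * NO_VAL. */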
/*
* _list_delete_job - delete a job record and its corresponding job_details,
* see common/list.h for documentation
* IN job_entry - pointer to job_record to delete
* global: job_list - pointer to global job list
* job_count - count of job list entries
* job_hash - hash table into job records
*/
static void _list_delete_job(void *job_entry)
{
struct job_record *job_ptr = (struct job_record *) job_entry;
struct job_record **job_pptr;
xassert(job_entry);
xassert (job_ptr->magic == JOB_MAGIC);
/* Remove the record from the hash table */
job_pptr = &job_hash[JOB_HASH_INX(job_ptr->job_id)];
while ((job_pptr != NULL) &&
((job_ptr = *job_pptr) != (struct job_record *) job_entry)) {
job_pptr = &job_ptr->job_next;
}
if (job_pptr == NULL)
fatal("job hash error");
*job_pptr = job_ptr->job_next;
delete_job_details(job_ptr);
xfree(job_ptr->alloc_node);
xfree(job_ptr->nodes);
xfree(job_ptr->nodes_completing);
FREE_NULL_BITMAP(job_ptr->node_bitmap);
xfree(job_ptr->cpus_per_node);
xfree(job_ptr->cpu_count_reps);
xfree(job_ptr->node_addr);
xfree(job_ptr->alloc_resp_host);
xfree(job_ptr->other_host);
xfree(job_ptr->account);
xfree(job_ptr->mail_user);
xfree(job_ptr->network);
xfree(job_ptr->alloc_lps);
xfree(job_ptr->comment);
select_g_free_jobinfo(&job_ptr->select_jobinfo);
if (job_ptr->step_list) {
delete_step_records(job_ptr, 0);
list_destroy(job_ptr->step_list);
}
job_count--;
xfree(job_ptr);
}
/*
* _list_find_job_id - find specific job_id entry in the job list,
* see common/list.h for documentation, key is job_id_ptr
* global: job_list - the global job list
*/
static int _list_find_job_id(void *job_entry, void *key)
{
uint32_t *job_id_ptr = (uint32_t *) key;
if (((struct job_record *) job_entry)->job_id == *job_id_ptr)
return 1;
else
return 0;
}
/*
* _list_find_job_old - find old entries in the job list,
* see common/list.h for documentation, key is ignored
* global: job_list - the global job list
*/
static int _list_find_job_old(void *job_entry, void *key)
{
time_t now = time(NULL);
time_t kill_age = now - (slurmctld_conf.kill_wait + 20);
time_t min_age = now - slurmctld_conf.min_job_age;
struct job_record *job_ptr = (struct job_record *)job_entry;
if ( (job_ptr->job_state & JOB_COMPLETING) &&
(job_ptr->end_time < kill_age) ) {
re_kill_job(job_ptr);
return 0; /* Job still completing */
}
if (slurmctld_conf.min_job_age == 0)
return 0; /* No job record purging */
if (job_ptr->end_time > min_age)
return 0; /* Too new to purge */
if (!(IS_JOB_FINISHED(job_ptr)))
return 0; /* Job still active */
return 1; /* Purge the job */
}
/*
* pack_all_jobs - dump all job information for all jobs in
* machine independent form (for network transmission)
* OUT buffer_ptr - the pointer is set to the allocated buffer.
* OUT buffer_size - set to size of the buffer in bytes
* IN show_flags - job filtering options
* IN uid - uid of user making request (for partition filtering)
* global: job_list - global list of job records
* NOTE: the buffer at *buffer_ptr must be xfreed by the caller
* NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c
* whenever the data format changes
*/
extern void pack_all_jobs(char **buffer_ptr, int *buffer_size,
uint16_t show_flags, uid_t uid)
{
ListIterator job_iterator;
struct job_record *job_ptr;
uint32_t jobs_packed = 0, tmp_offset;
Buf buffer;
time_t now = time(NULL);
buffer_ptr[0] = NULL;
*buffer_size = 0;
buffer = init_buf(BUF_SIZE);
/* write message body header : size and time */
/* put in a place holder job record count of 0 for now */
pack32(jobs_packed, buffer);
pack_time(now, buffer);
/* write individual job records */
part_filter_set(uid);
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
xassert (job_ptr->magic == JOB_MAGIC);
if (((show_flags & SHOW_ALL) == 0) && (uid != 0) &&
(job_ptr->part_ptr) &&
(job_ptr->part_ptr->hidden))
continue;
pack_job(job_ptr, buffer);
jobs_packed++;
}
part_filter_clear();
list_iterator_destroy(job_iterator);
/* put the real record count in the message body header */
tmp_offset = get_buf_offset(buffer);
set_buf_offset(buffer, 0);
pack32(jobs_packed, buffer);
set_buf_offset(buffer, tmp_offset);
*buffer_size = get_buf_offset(buffer);
buffer_ptr[0] = xfer_buf_data(buffer);
}
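/* Matching client-side unpack (hedged sketch; see
 * common/slurm_protocol_pack.c for the authoritative version). The
 * placeholder record count back-patched above arrives as an ordinary
 * leading uint32_t:
 *
 *	uint32_t cnt;
 *	time_t last_update;
 *	safe_unpack32(&cnt, buffer);
 *	unpack_time(&last_update, buffer);
 *	// cnt job records follow, one per pack_job() call
 */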
/*
* pack_job - dump all configuration information about a specific job in
* machine independent form (for network transmission)
* IN dump_job_ptr - pointer to job for which information is requested
* IN/OUT buffer - buffer in which data is placed, pointers automatically
* updated
* NOTE: change _unpack_job_info_members() in common/slurm_protocol_pack.c
* whenever the data format changes
*/
void pack_job(struct job_record *dump_job_ptr, Buf buffer)
{
struct job_details *detail_ptr;
uint32_t size_tmp;
pack32(dump_job_ptr->job_id, buffer);
pack32(dump_job_ptr->user_id, buffer);
pack32(dump_job_ptr->group_id, buffer);
pack16(dump_job_ptr->job_state, buffer);
pack16(dump_job_ptr->batch_flag, buffer);
pack16(dump_job_ptr->state_reason, buffer);
pack32(dump_job_ptr->alloc_sid, buffer);
if ((dump_job_ptr->time_limit == NO_VAL) && dump_job_ptr->part_ptr)
pack32(dump_job_ptr->part_ptr->max_time, buffer);
else
pack32(dump_job_ptr->time_limit, buffer);
if (dump_job_ptr->details) {
pack_time(dump_job_ptr->details->submit_time, buffer);
} else {
pack_time((time_t) 0, buffer);
}
if (IS_JOB_PENDING(dump_job_ptr)) {
if (dump_job_ptr->details)
pack_time(dump_job_ptr->details->begin_time,
buffer);
else
pack_time((time_t) 0, buffer);
} else
pack_time(dump_job_ptr->start_time, buffer);
pack_time(dump_job_ptr->end_time, buffer);
pack_time(dump_job_ptr->suspend_time, buffer);
pack_time(dump_job_ptr->pre_sus_time, buffer);
pack32(dump_job_ptr->priority, buffer);
packstr(dump_job_ptr->nodes, buffer);
packstr(dump_job_ptr->partition, buffer);
packstr(dump_job_ptr->account, buffer);
packstr(dump_job_ptr->network, buffer);
packstr(dump_job_ptr->comment, buffer);
pack32(dump_job_ptr->dependency, buffer);
pack32(dump_job_ptr->exit_code, buffer);
pack16(dump_job_ptr->num_cpu_groups, buffer);
size_tmp = dump_job_ptr->num_cpu_groups; /* unsigned, never negative */
pack32_array(dump_job_ptr->cpus_per_node, size_tmp, buffer);
pack32_array(dump_job_ptr->cpu_count_reps, size_tmp, buffer);
packstr(dump_job_ptr->name, buffer);
packstr(dump_job_ptr->alloc_node, buffer);
pack_bit_fmt(dump_job_ptr->node_bitmap, buffer);
pack32(dump_job_ptr->num_procs, buffer);
select_g_pack_jobinfo(dump_job_ptr->select_jobinfo, buffer);
detail_ptr = dump_job_ptr->details;
/* A few details are always dumped here */
_pack_default_job_details(detail_ptr, buffer);
/* other job details are only dumped until the job starts
* running (at which time they become meaningless) */
if (detail_ptr)
_pack_pending_job_details(detail_ptr, buffer);
else
_pack_pending_job_details(NULL, buffer);
}
/* pack default job details for "get_job_info" RPC */
static void _pack_default_job_details(struct job_details *detail_ptr,
Buf buffer)
{
if (detail_ptr) {
packstr(detail_ptr->features, buffer);
pack32(detail_ptr->min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
} else {
packnull(buffer);
pack32((uint32_t) 0, buffer);
pack32((uint32_t) 0, buffer);
}
}
/* pack pending job details for "get_job_info" RPC */
static void _pack_pending_job_details(struct job_details *detail_ptr,
Buf buffer)
{
if (detail_ptr) {
pack16(detail_ptr->shared, buffer);
pack16(detail_ptr->contiguous, buffer);
pack16(detail_ptr->cpus_per_task, buffer);
pack16(detail_ptr->job_min_procs, buffer);
pack32(detail_ptr->job_min_memory, buffer);
pack32(detail_ptr->job_max_memory, buffer);
pack32(detail_ptr->job_min_tmp_disk, buffer);
packstr(detail_ptr->req_nodes, buffer);
pack_bit_fmt(detail_ptr->req_node_bitmap, buffer);
/* detail_ptr->req_node_layout is not packed */
packstr(detail_ptr->exc_nodes, buffer);
pack_bit_fmt(detail_ptr->exc_node_bitmap, buffer);
pack_multi_core_data(detail_ptr->mc_ptr, buffer);
}
else {
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack32((uint32_t) 0, buffer);
pack32((uint32_t) 0, buffer);
pack32((uint32_t) 0, buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
pack_multi_core_data(NULL, buffer);
}
}
/*
* purge_old_job - purge old job records.
* the jobs must have completed at least MIN_JOB_AGE seconds ago
* global: job_list - global job table
* last_job_update - time of last job table update
* NOTE: READ lock_slurmctld config before entry
*/
void purge_old_job(void)
{
int i;
i = list_delete_all(job_list, &_list_find_job_old, "");
if (i) {
debug2("purge_old_job: purged %d old job records", i);
/* last_job_update = time(NULL); don't worry about state save */
}
}
/*
* _purge_job_record - purge specific job record
* IN job_id - job_id of job record to be purged
* RET int - count of jobs purged
* global: job_list - global job table
*/
static int _purge_job_record(uint32_t job_id)
{
return list_delete_all(job_list, &_list_find_job_id, (void *) &job_id);
}
/*
* reset_job_bitmaps - reestablish bitmaps for existing jobs.
* this should be called after rebuilding node information,
* but before using any job entries.
* global: last_job_update - time of last job table update
* job_list - pointer to global job list
*/
void reset_job_bitmaps(void)
{
ListIterator job_iterator;
struct job_record *job_ptr;
struct part_record *part_ptr;
bool job_fail = false;
xassert(job_list);
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
xassert (job_ptr->magic == JOB_MAGIC);
job_fail = false;
part_ptr = list_find_first(part_list, &list_find_part,
job_ptr->partition);
if (part_ptr == NULL) {
error("Invalid partition (%s) for job_id %u",
job_ptr->partition, job_ptr->job_id);
job_fail = true;
}
job_ptr->part_ptr = part_ptr;
FREE_NULL_BITMAP(job_ptr->node_bitmap);
if ((job_ptr->nodes_completing) &&
(node_name2bitmap(job_ptr->nodes_completing,
false, &job_ptr->node_bitmap))) {
error("Invalid nodes (%s) for job_id %u",
job_ptr->nodes_completing,
job_ptr->job_id);
job_fail = true;
} else if ((job_ptr->node_bitmap == NULL) && job_ptr->nodes &&
(node_name2bitmap(job_ptr->nodes, false,
&job_ptr->node_bitmap))) {
error("Invalid nodes (%s) for job_id %u",
job_ptr->nodes, job_ptr->job_id);
job_fail = true;
}
build_node_details(job_ptr); /* set: num_cpu_groups,
* cpu_count_reps, node_cnt,
* cpus_per_node, node_addr */
if (select_g_update_nodeinfo(job_ptr) != SLURM_SUCCESS) {
error("select_g_update_nodeinfo(%u): %m",
job_ptr->job_id);
/* not critical ... ? */
/* probably job_fail should be set here */
}
if (_reset_detail_bitmaps(job_ptr))
job_fail = true;
_reset_step_bitmaps(job_ptr);
if ((job_ptr->kill_on_step_done)
&& (list_count(job_ptr->step_list) <= 1)) {
info("Single job step done, job is complete");
job_fail = true;
}
if (job_fail) {
if (job_ptr->job_state == JOB_PENDING) {
job_ptr->start_time =
job_ptr->end_time = time(NULL);
job_ptr->job_state = JOB_NODE_FAIL;
} else if (job_ptr->job_state == JOB_RUNNING) {
job_ptr->end_time = time(NULL);
job_ptr->job_state = JOB_NODE_FAIL |
JOB_COMPLETING;
} else if (job_ptr->job_state == JOB_SUSPENDED) {
job_ptr->end_time = job_ptr->suspend_time;
job_ptr->job_state = JOB_NODE_FAIL |
JOB_COMPLETING;
}
job_ptr->exit_code = MAX(job_ptr->exit_code, 1);
job_ptr->state_reason = FAIL_DOWN_NODE;
job_completion_logger(job_ptr);
}
}
list_iterator_destroy(job_iterator);
last_job_update = time(NULL);
}
static int _reset_detail_bitmaps(struct job_record *job_ptr)
{
if (job_ptr->details == NULL)
return SLURM_SUCCESS;
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
xfree(job_ptr->details->req_node_layout); /* layout info is lost
* but should be re-generated
* at job start time */
if ((job_ptr->details->req_nodes) &&
(node_name2bitmap(job_ptr->details->req_nodes, false,
&job_ptr->details->req_node_bitmap))) {
error("Invalid req_nodes (%s) for job_id %u",
job_ptr->details->req_nodes, job_ptr->job_id);
return SLURM_ERROR;
}
FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
if ((job_ptr->details->exc_nodes) &&
(node_name2bitmap(job_ptr->details->exc_nodes, true,
&job_ptr->details->exc_node_bitmap))) {
error("Invalid exc_nodes (%s) for job_id %u",
job_ptr->details->exc_nodes, job_ptr->job_id);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
static void _reset_step_bitmaps(struct job_record *job_ptr)
{
ListIterator step_iterator;
struct step_record *step_ptr;
step_iterator = list_iterator_create (job_ptr->step_list);
while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
if (step_ptr->step_layout &&
step_ptr->step_layout->node_list &&
(node_name2bitmap(step_ptr->step_layout->node_list, false,
&step_ptr->step_node_bitmap))) {
error("Invalid step_node_list (%s) for step_id %u.%u",
step_ptr->step_layout->node_list,
job_ptr->job_id, step_ptr->step_id);
delete_step_record (job_ptr, step_ptr->step_id);
}
}
list_iterator_destroy (step_iterator);
return;
}
/* update first assigned job id as needed on reconfigure
* NOTE: READ lock_slurmctld config before entry */
void reset_first_job_id(void)
{
if (job_id_sequence < slurmctld_conf.first_job_id)
job_id_sequence = slurmctld_conf.first_job_id;
}
/*
* get_next_job_id - return the job_id to be used by default for
* the next job
*/
extern uint32_t get_next_job_id(void)
{
uint32_t next_id;
if (job_id_sequence == 0)
job_id_sequence = slurmctld_conf.first_job_id;
next_id = job_id_sequence + 1;
if (next_id >= MIN_NOALLOC_JOBID)
next_id = slurmctld_conf.first_job_id;
return next_id;
}
/*
* _set_job_id - set a default job_id, insure that it is unique
* IN job_ptr - pointer to the job_record
*/
static void _set_job_id(struct job_record *job_ptr)
{
uint32_t new_id;
if (job_id_sequence == 0)
job_id_sequence = slurmctld_conf.first_job_id;
xassert(job_ptr);
xassert (job_ptr->magic == JOB_MAGIC);
if ((job_ptr->partition == NULL)
|| (strlen(job_ptr->partition) == 0))
fatal("_set_job_id: partition not set");
/* Insure no conflict in job id when the sequence wraps */
while (1) {
if (++job_id_sequence >= MIN_NOALLOC_JOBID)
job_id_sequence = slurmctld_conf.first_job_id;
new_id = job_id_sequence;
if (find_job_record(new_id) == NULL) {
job_ptr->job_id = new_id;
break;
}
}
}
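/* Rollover example (sketch): with job_id_sequence at
 * MIN_NOALLOC_JOBID - 1, the increment above wraps the sequence back to
 * slurmctld_conf.first_job_id, and the find_job_record() test skips any
 * id still held by an active or not-yet-purged job, so assigned ids
 * remain unique across the wrap. */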
/*
* _set_job_prio - set a default job priority
* IN job_ptr - pointer to the job_record
* NOTE: this is a simple prototype, we need to re-establish value on restart
*/
static void _set_job_prio(struct job_record *job_ptr)
{
xassert(job_ptr);
xassert (job_ptr->magic == JOB_MAGIC);
job_ptr->priority = slurm_sched_initial_priority(maximum_prio,
job_ptr);
if (job_ptr->priority > 0)
maximum_prio = MIN(job_ptr->priority, maximum_prio);
}
/* After a node is returned to service, reset the priority of jobs
* which may have been held due to that node being unavailable */
void reset_job_priority(void)
{
ListIterator job_iterator;
struct job_record *job_ptr;
int count = 0;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
if (job_ptr->priority == 1) {
_set_job_prio(job_ptr);
count++;
}
}
list_iterator_destroy(job_iterator);
if (count)
last_job_update = time(NULL);
}
/*
* _top_priority - determine if any other job for this partition has a
* higher priority than specified job
* IN job_ptr - pointer to selected job
* RET true if selected job has highest priority
*/
static bool _top_priority(struct job_record *job_ptr)
{
#ifdef HAVE_BG
/* On BlueGene, all jobs run ASAP.
* Priority only matters within a specific job size. */
return true;
#else
struct job_details *detail_ptr = job_ptr->details;
bool top;
if (job_ptr->priority == 0) /* user held */
top = false;
else {
ListIterator job_iterator;
struct job_record *job_ptr2;
top = true; /* assume top priority until found otherwise */
job_iterator = list_iterator_create(job_list);
while ((job_ptr2 = (struct job_record *)
list_next(job_iterator))) {
if (job_ptr2 == job_ptr)
continue;
if (job_ptr2->job_state != JOB_PENDING)
continue;
if (!job_independent(job_ptr2))
continue;
if ((job_ptr2->priority > job_ptr->priority) &&
(job_ptr2->part_ptr == job_ptr->part_ptr)) {
top = false;
break;
}
}
list_iterator_destroy(job_iterator);
}
if ((!top) && detail_ptr) { /* not top prio */
if (job_ptr->priority == 0) /* user/admin hold */
job_ptr->state_reason = WAIT_HELD;
else if (job_ptr->priority != 1) /* not system hold */
job_ptr->state_reason = WAIT_PRIORITY;
}
return top;
#endif
}
/*
* update_job - update a job's parameters per the supplied specifications
* IN job_specs - a job's specification
* IN uid - uid of user issuing RPC
* RET returns an error code from slurm_errno.h
* global: job_list - global list of job entries
* last_job_update - time of last job table update
*/
int update_job(job_desc_msg_t * job_specs, uid_t uid)
{
int error_code = SLURM_SUCCESS;
int super_user = 0;
struct job_record *job_ptr;
struct job_details *detail_ptr;
struct part_record *tmp_part_ptr;
bitstr_t *exc_bitmap = NULL, *req_bitmap = NULL;
time_t now = time(NULL);
multi_core_data_t *mc_ptr = NULL;
job_ptr = find_job_record(job_specs->job_id);
if (job_ptr == NULL) {
error("update_job: job_id %u does not exist.",
job_specs->job_id);
return ESLURM_INVALID_JOB_ID;
}
if ((uid == 0) || (uid == slurmctld_conf.slurm_user_id))
super_user = 1;
if ((job_ptr->user_id != uid) && (super_user == 0)) {
error("Security violation, JOB_UPDATE RPC from uid %d",
uid);
return ESLURM_USER_ID_MISSING;
}
detail_ptr = job_ptr->details;
if (detail_ptr)
mc_ptr = detail_ptr->mc_ptr;
last_job_update = now;
if ((job_specs->time_limit != NO_VAL) && (!IS_JOB_FINISHED(job_ptr))) {
if (job_ptr->time_limit == job_specs->time_limit) {
verbose("update_job: new time limit identical to old "
"time limit %u", job_specs->job_id);
} else if (super_user ||
(job_ptr->time_limit > job_specs->time_limit)) {
time_t old_time = job_ptr->time_limit;
if (old_time == INFINITE) /* one year in mins */
old_time = (365 * 24 * 60);
job_ptr->time_limit = job_specs->time_limit;
if (job_ptr->time_limit == INFINITE) { /* one year */
job_ptr->end_time = now +
(365 * 24 * 60 * 60);
} else {
/* Update end_time based upon change
* to preserve suspend time info */
job_ptr->end_time = job_ptr->end_time +
((job_ptr->time_limit -
old_time) * 60);
}
if (job_ptr->end_time < now)
job_ptr->end_time = now;
if ((job_ptr->job_state == JOB_RUNNING) &&
(list_is_empty(job_ptr->step_list) == 0))
_xmit_new_end_time(job_ptr);
info("update_job: setting time_limit to %u for "
"job_id %u", job_specs->time_limit,
job_specs->job_id);
} else {
error("Attempt to increase time limit for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->priority != NO_VAL) {
if (super_user
|| (job_ptr->priority > job_specs->priority)) {
job_ptr->priority = job_specs->priority;
info("update_job: setting priority to %u for "
"job_id %u", job_ptr->priority,
job_specs->job_id);
} else {
error("Attempt to increase priority for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->nice != NICE_OFFSET) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else if (super_user || (job_specs->nice < NICE_OFFSET)) {
job_ptr->priority -= ((int)job_specs->nice -
NICE_OFFSET);
info("update_job: setting priority to %u for "
"job_id %u", job_ptr->priority,
job_specs->job_id);
} else {
error("Attempt to increase priority for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->job_min_procs != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user
|| (detail_ptr->job_min_procs
> job_specs->job_min_procs)) {
detail_ptr->job_min_procs = job_specs->job_min_procs;
info("update_job: setting job_min_procs to %u for "
"job_id %u", job_specs->job_min_procs,
job_specs->job_id);
} else {
error("Attempt to increase job_min_procs for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->job_min_sockets != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
mc_ptr->job_min_sockets = job_specs->job_min_sockets;
info("update_job: setting job_min_sockets to %u for "
"job_id %u", job_specs->job_min_sockets,
job_specs->job_id);
}
}
if (job_specs->job_min_cores != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
mc_ptr->job_min_cores = job_specs->job_min_cores;
info("update_job: setting job_min_cores to %u for "
"job_id %u", job_specs->job_min_cores,
job_specs->job_id);
}
}
if (job_specs->job_min_threads != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
mc_ptr->job_min_threads = job_specs->job_min_threads;
info("update_job: setting job_min_threads to %u for "
"job_id %u", job_specs->job_min_threads,
job_specs->job_id);
}
}
if (job_specs->job_min_memory != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user
|| (detail_ptr->job_min_memory
> job_specs->job_min_memory)) {
detail_ptr->job_min_memory = job_specs->job_min_memory;
info("update_job: setting job_min_memory to %u for "
"job_id %u", job_specs->job_min_memory,
job_specs->job_id);
} else {
error("Attempt to increase job_min_memory for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->job_min_tmp_disk != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user
|| (detail_ptr->job_min_tmp_disk
> job_specs->job_min_tmp_disk)) {
detail_ptr->job_min_tmp_disk =
job_specs->job_min_tmp_disk;
info("update_job: setting job_min_tmp_disk to %u for "
"job_id %u", job_specs->job_min_tmp_disk,
job_specs->job_id);
} else {
error("Attempt to increase job_min_tmp_disk "
"for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->num_procs != NO_VAL) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else if (super_user
|| (job_ptr->num_procs > job_specs->num_procs)) {
job_ptr->num_procs = job_specs->num_procs;
info("update_job: setting num_procs to %u for "
"job_id %u", job_specs->num_procs,
job_specs->job_id);
} else {
error("Attempt to increase num_procs for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->min_nodes != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user
|| (detail_ptr->min_nodes > job_specs->min_nodes)) {
detail_ptr->min_nodes = job_specs->min_nodes;
info("update_job: setting min_nodes to %u for "
"job_id %u", job_specs->min_nodes,
job_specs->job_id);
} else {
error("Attempt to increase min_nodes for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->max_nodes != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user
|| (detail_ptr->max_nodes > job_specs->max_nodes)) {
detail_ptr->max_nodes = job_specs->max_nodes;
info("update_job: setting max_nodes to %u for "
"job_id %u", job_specs->max_nodes,
job_specs->job_id);
} else {
error("Attempt to increase max_nodes for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->min_sockets != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
mc_ptr->min_sockets = job_specs->min_sockets;
info("update_job: setting min_sockets to %u for "
"job_id %u", job_specs->min_sockets,
job_specs->job_id);
}
}
if (job_specs->min_cores != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
mc_ptr->min_cores = job_specs->min_cores;
info("update_job: setting min_cores to %u for "
"job_id %u", job_specs->min_cores,
job_specs->job_id);
}
}
if ((job_specs->min_threads != (uint16_t) NO_VAL)) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL))
error_code = ESLURM_DISABLED;
else {
mc_ptr->min_threads = job_specs->min_threads;
info("update_job: setting min_threads to %u for "
"job_id %u", job_specs->min_threads,
job_specs->job_id);
}
}
if (job_specs->shared != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user
|| (detail_ptr->shared > job_specs->shared)) {
detail_ptr->shared = job_specs->shared;
info("update_job: setting shared to %u for job_id %u",
job_specs->shared, job_specs->job_id);
} else {
error("Attempt to remove sharing for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->contiguous != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user
|| (detail_ptr->contiguous > job_specs->contiguous)) {
detail_ptr->contiguous = job_specs->contiguous;
info("update_job: setting contiguous to %u for "
"job_id %u", job_specs->contiguous,
job_specs->job_id);
} else {
error("Attempt to add contiguous for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->features) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user) {
xfree(detail_ptr->features);
if (job_specs->features[0] != '\0') {
detail_ptr->features = job_specs->features;
job_specs->features = NULL;	/* nothing left to free */
info("update_job: setting features to %s for "
"job_id %u", detail_ptr->features,
job_specs->job_id);
} else {
info("update_job: cleared features for job %u",
job_specs->job_id);
}
} else {
error("Attempt to change features for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->comment) {
xfree(job_ptr->comment);
job_ptr->comment = job_specs->comment;
job_specs->comment = NULL; /* Nothing left to free */
info("update_job: setting comment to %s for job_id %u",
job_ptr->comment, job_specs->job_id);
}
if (job_specs->name) {
strncpy(job_ptr->name, job_specs->name, MAX_JOBNAME_LEN);
info("update_job: setting name to %s for job_id %u",
job_specs->name, job_specs->job_id);
}
if (job_specs->partition) {
tmp_part_ptr = find_part_record(job_specs->partition);
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else if (tmp_part_ptr == NULL)
error_code = ESLURM_INVALID_PARTITION_NAME;
else if (super_user) {
strncpy(job_ptr->partition, job_specs->partition,
MAX_SLURM_NAME);
job_ptr->part_ptr = tmp_part_ptr;
info("update_job: setting partition to %s for "
"job_id %u", job_specs->partition,
job_specs->job_id);
} else {
error("Attempt to change partition for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->exc_nodes) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (job_specs->exc_nodes[0] == '\0') {
xfree(detail_ptr->exc_nodes);
FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
} else {
if (node_name2bitmap(job_specs->exc_nodes, false,
&exc_bitmap)) {
error("Invalid node list for job_update: %s",
job_specs->exc_nodes);
FREE_NULL_BITMAP(exc_bitmap);
error_code = ESLURM_INVALID_NODE_NAME;
}
if (exc_bitmap) {
xfree(detail_ptr->exc_nodes);
detail_ptr->exc_nodes =
job_specs->exc_nodes;
FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
detail_ptr->exc_node_bitmap = exc_bitmap;
info("update_job: setting exc_nodes to %s "
"for job_id %u", job_specs->exc_nodes,
job_specs->job_id);
job_specs->exc_nodes = NULL;
}
}
}
if (job_specs->req_nodes) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (job_specs->req_nodes[0] == '\0') {
xfree(detail_ptr->req_nodes);
FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
xfree(detail_ptr->req_node_layout);
} else {
if (node_name2bitmap(job_specs->req_nodes, false,
&req_bitmap)) {
error("Invalid node list for job_update: %s",
job_specs->req_nodes);
FREE_NULL_BITMAP(req_bitmap);
error_code = ESLURM_INVALID_NODE_NAME;
}
if (req_bitmap) {
xfree(detail_ptr->req_nodes);
detail_ptr->req_nodes =
job_specs->req_nodes;
FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
xfree(detail_ptr->req_node_layout);
detail_ptr->req_node_bitmap = req_bitmap;
info("update_job: setting req_nodes to %s "
"for job_id %u", job_specs->req_nodes,
job_specs->job_id);
job_specs->req_nodes = NULL;
}
}
}
if (job_specs->account) {
xfree(job_ptr->account);
if (job_specs->account[0] != '\0') {
job_ptr->account = job_specs->account;
job_specs->account = NULL; /* Nothing left to free */
info("update_job: setting account to %s for job_id %u",
job_ptr->account, job_specs->job_id);
} else {
info("update_job: cleared account for job_id %u",
job_specs->job_id);
}
}
if (job_specs->ntasks_per_node != (uint16_t) NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_DISABLED;
else if (super_user) {
detail_ptr->ntasks_per_node = job_specs->ntasks_per_node;
info("update_job: setting ntasks_per_node to %u for "
"job_id %u", job_specs->ntasks_per_node,
job_specs->job_id);
} else {
error("Not super user: setting ntasks_oper_node to job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (job_specs->dependency != NO_VAL) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else if (job_specs->dependency == job_ptr->job_id)
error_code = ESLURM_DEPENDENCY;
else {
job_ptr->dependency = job_specs->dependency;
info("update_job: setting dependency to %u for "
"job_id %u", job_ptr->dependency,
job_ptr->job_id);
}
}
if (job_specs->begin_time) {
if (IS_JOB_PENDING(job_ptr) && detail_ptr)
detail_ptr->begin_time = job_specs->begin_time;
else
error_code = ESLURM_DISABLED;
}
#ifdef HAVE_BG
{
uint16_t reboot = (uint16_t) NO_VAL;
uint16_t rotate = (uint16_t) NO_VAL;
uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL};
char *image = NULL;
select_g_get_jobinfo(job_specs->select_jobinfo,
SELECT_DATA_ROTATE, &rotate);
if (rotate != (uint16_t) NO_VAL) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
info("update_job: setting rotate to %u for "
"jobid %u", rotate, job_ptr->job_id);
select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_ROTATE, &rotate);
}
}
select_g_get_jobinfo(job_specs->select_jobinfo,
SELECT_DATA_REBOOT, &reboot);
if (reboot != (uint16_t) NO_VAL) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
info("update_job: setting reboot to %u for "
"jobid %u", reboot, job_ptr->job_id);
select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_REBOOT, &reboot);
}
}
select_g_get_jobinfo(job_specs->select_jobinfo,
SELECT_DATA_GEOMETRY, geometry);
if (geometry[0] != (uint16_t) NO_VAL) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else if (super_user) {
uint32_t i, tot = 1;
for (i=0; i<SYSTEM_DIMENSIONS; i++)
tot *= geometry[i];
info("update_job: setting geometry to %ux%ux%u "
"min_nodes=%u for jobid %u",
geometry[0], geometry[1],
geometry[2], tot, job_ptr->job_id);
select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_GEOMETRY, geometry);
detail_ptr->min_nodes = tot;
} else {
error("Attempt to change geometry for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
select_g_get_jobinfo(job_specs->select_jobinfo,
SELECT_DATA_START, geometry);
if (geometry[0] != (uint16_t) NO_VAL) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
		else if (super_user) {
			info("update_job: setting start location to "
			     "%ux%ux%u for jobid %u",
			     geometry[0], geometry[1],
			     geometry[2], job_ptr->job_id);
			select_g_set_jobinfo(job_ptr->select_jobinfo,
					     SELECT_DATA_START, geometry);
} else {
error("Attempt to change geometry for job %u",
job_specs->job_id);
error_code = ESLURM_ACCESS_DENIED;
}
}
select_g_get_jobinfo(job_specs->select_jobinfo,
SELECT_DATA_BLRTS_IMAGE, &image);
if (image) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
info("update_job: setting BlrtsImage to %s for "
"jobid %u", image, job_ptr->job_id);
select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_BLRTS_IMAGE, image);
}
}
select_g_get_jobinfo(job_specs->select_jobinfo,
SELECT_DATA_LINUX_IMAGE, &image);
if (image) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
info("update_job: setting LinuxImage to %s for "
"jobid %u", image, job_ptr->job_id);
select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_LINUX_IMAGE, image);
}
}
select_g_get_jobinfo(job_specs->select_jobinfo,
SELECT_DATA_MLOADER_IMAGE, &image);
if (image) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
info("update_job: setting MloaderImage to %s for "
"jobid %u", image, job_ptr->job_id);
select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_MLOADER_IMAGE,
image);
}
}
select_g_get_jobinfo(job_specs->select_jobinfo,
SELECT_DATA_RAMDISK_IMAGE, &image);
if (image) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_DISABLED;
else {
info("update_job: setting RamdiskImage to %s for "
"jobid %u", image, job_ptr->job_id);
select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_RAMDISK_IMAGE,
image);
}
}
}
#endif
return error_code;
}
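/*
 * Note on the string updates above: fields such as comment, account and
 * exc_nodes are moved, not copied - job_ptr takes ownership of the
 * xmalloc'ed buffer and the job_specs pointer is NULLed ("Nothing left
 * to free"). A hedged sketch of why that matters on the caller side
 * (hypothetical handler context, not compiled):
 */
#if 0
	int rc = update_job(job_specs, uid);
	/* any field consumed by update_job() was set to NULL, so the
	 * generic free of the RPC message cannot double-free it */
	slurm_free_job_desc_msg(job_specs);
#endif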
/*
 * validate_jobs_on_node - validate that any jobs that should be on the node
 *	are actually running; if not, clean up the job records and/or
 *	node records
* IN node_name - node which should have jobs running
* IN/OUT job_count - number of jobs which should be running on specified node
* IN job_id_ptr - pointer to array of job_ids that should be on this node
* IN step_id_ptr - pointer to array of job step ids that should be on node
*/
void
validate_jobs_on_node(char *node_name, uint32_t * job_count,
uint32_t * job_id_ptr, uint16_t * step_id_ptr)
{
int i, node_inx, jobs_on_node;
struct node_record *node_ptr;
struct job_record *job_ptr;
time_t now = time(NULL);
node_ptr = find_node_record(node_name);
if (node_ptr == NULL) {
error("slurmd registered on unknown node %s", node_name);
return;
}
node_inx = node_ptr - node_record_table_ptr;
/* Check that jobs running are really supposed to be there */
for (i = 0; i < *job_count; i++) {
if ( (job_id_ptr[i] >= MIN_NOALLOC_JOBID) &&
(job_id_ptr[i] <= MAX_NOALLOC_JOBID) ) {
info("NoAllocate job %u.%u reported on node %s",
job_id_ptr[i], step_id_ptr[i], node_name);
continue;
}
job_ptr = find_job_record(job_id_ptr[i]);
if (job_ptr == NULL) {
error("Orphan job %u.%u reported on node %s",
job_id_ptr[i], step_id_ptr[i], node_name);
kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
}
else if ((job_ptr->job_state == JOB_RUNNING) ||
(job_ptr->job_state == JOB_SUSPENDED)) {
if (bit_test(job_ptr->node_bitmap, node_inx)) {
debug3("Registered job %u.%u on node %s ",
job_id_ptr[i], step_id_ptr[i],
node_name);
if ((job_ptr->batch_flag) &&
(node_inx == bit_ffs(
job_ptr->node_bitmap))) {
/* NOTE: Used for purging defunct
* batch jobs */
job_ptr->time_last_active = now;
}
} else {
			error("Registered job %u.%u on wrong node %s",
			      job_id_ptr[i], step_id_ptr[i], node_name);
kill_job_on_node(job_id_ptr[i], job_ptr,
node_ptr);
}
}
else if (job_ptr->job_state & JOB_COMPLETING) {
/* Re-send kill request as needed,
* not necessarily an error */
kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
}
else if (job_ptr->job_state == JOB_PENDING) {
error("Registered PENDING job %u.%u on node %s ",
job_id_ptr[i], step_id_ptr[i], node_name);
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
last_job_update = now;
job_ptr->start_time = job_ptr->end_time = now;
kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
job_completion_logger(job_ptr);
delete_job_details(job_ptr);
}
else { /* else job is supposed to be done */
			error("Registered job %u.%u in state %s on node %s",
			      job_id_ptr[i], step_id_ptr[i],
			      job_state_string(job_ptr->job_state),
			      node_name);
kill_job_on_node(job_id_ptr[i], job_ptr, node_ptr);
}
}
jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
if (jobs_on_node)
_purge_lost_batch_jobs(node_inx, now);
if (jobs_on_node != *job_count) {
		/* slurmd will not know of a job unless that job has
		 * steps active at registration time, so this is not
		 * an error condition; slurmd reports steps rather
		 * than jobs */
debug3("resetting job_count on node %s from %d to %d",
node_name, *job_count, jobs_on_node);
*job_count = jobs_on_node;
}
return;
}
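/*
 * Illustrative caller sketch for validate_jobs_on_node(), assuming the
 * slurmd node registration RPC as the source of the job/step arrays
 * (the message field names are an assumption; not compiled):
 */
#if 0
	slurm_node_registration_status_msg_t *reg_msg = msg->data;
	/* Any job slurmd reports that slurmctld cannot match is killed
	 * on that node; job_count may be reduced to the number of jobs
	 * slurmctld actually has recorded there. */
	validate_jobs_on_node(reg_msg->node_name, &(reg_msg->job_count),
			      reg_msg->job_id, reg_msg->step_id);
#endif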
/* Purge any batch job that should have its script running on node
* node_inx, but is not (i.e. its time_last_active != now) */
static void _purge_lost_batch_jobs(int node_inx, time_t now)
{
ListIterator job_iterator;
struct job_record *job_ptr;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
bool job_active = ((job_ptr->job_state == JOB_RUNNING) ||
(job_ptr->job_state == JOB_SUSPENDED));
if ((!job_active) ||
(job_ptr->batch_flag == 0) ||
(job_ptr->time_last_active == now) ||
(node_inx != bit_ffs(job_ptr->node_bitmap)))
continue;
info("Master node lost JobId=%u, killing it",
job_ptr->job_id);
job_complete(job_ptr->job_id, 0, false, 0);
}
list_iterator_destroy(job_iterator);
}
/*
* kill_job_on_node - Kill the specific job_id on a specific node,
* the request is not processed immediately, but queued.
* This is to prevent a flood of pthreads if slurmctld restarts
* without saved state and slurmd daemons register with a
* multitude of running jobs. Slurmctld will not recognize
 *	these jobs and uses this function to kill them - one
* agent request per node as they register.
* IN job_id - id of the job to be killed
* IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned)
* IN node_ptr - pointer to the node on which the job resides
*/
extern void
kill_job_on_node(uint32_t job_id, struct job_record *job_ptr,
struct node_record *node_ptr)
{
agent_arg_t *agent_info;
kill_job_msg_t *kill_req;
debug("Killing job %u on node %s", job_id, node_ptr->name);
kill_req = xmalloc(sizeof(kill_job_msg_t));
kill_req->job_id = job_id;
kill_req->time = time(NULL);
kill_req->nodes = xstrdup(node_ptr->name);
if (job_ptr) { /* NULL if unknown */
kill_req->select_jobinfo =
select_g_copy_jobinfo(job_ptr->select_jobinfo);
}
agent_info = xmalloc(sizeof(agent_arg_t));
agent_info->node_count = 1;
agent_info->retry = 0;
agent_info->hostlist = hostlist_create(node_ptr->name);
agent_info->msg_type = REQUEST_TERMINATE_JOB;
agent_info->msg_args = kill_req;
agent_queue_request(agent_info);
}
/*
* job_alloc_info - get details about an existing job allocation
 * IN uid - uid of the user issuing the request
* IN job_id - ID of job for which info is requested
* OUT job_pptr - set to pointer to job record
*/
extern int
job_alloc_info(uint32_t uid, uint32_t job_id, struct job_record **job_pptr)
{
struct job_record *job_ptr;
job_ptr = find_job_record(job_id);
if (job_ptr == NULL)
return ESLURM_INVALID_JOB_ID;
if ((job_ptr->user_id != uid) &&
(uid != 0) && (uid != slurmctld_conf.slurm_user_id))
return ESLURM_ACCESS_DENIED;
if (IS_JOB_PENDING(job_ptr))
return ESLURM_JOB_PENDING;
if (IS_JOB_FINISHED(job_ptr))
return ESLURM_ALREADY_DONE;
*job_pptr = job_ptr;
return SLURM_SUCCESS;
}
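/*
 * Minimal caller sketch for job_alloc_info(): job_pptr is only set on
 * SLURM_SUCCESS, so the return code must be checked before the pointer
 * is used (hypothetical handler context; not compiled):
 */
#if 0
	struct job_record *job_ptr = NULL;
	int rc = job_alloc_info(uid, job_id, &job_ptr);
	if (rc != SLURM_SUCCESS)
		return rc;	/* bad id, access denied, pending or done */
	/* job_ptr is now safe to dereference, e.g. job_ptr->nodes */
#endif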
/*
 * Synchronize the batch jobs in the system with their files.
 * All pending batch jobs must have script and environment files;
 * no other jobs should have such files.
* NOTE: READ lock_slurmctld config before entry
*/
int sync_job_files(void)
{
List batch_dirs;
batch_dirs = list_create(_del_batch_list_rec);
_get_batch_job_dir_ids(batch_dirs);
_validate_job_files(batch_dirs);
_remove_defunct_batch_dirs(batch_dirs);
list_destroy(batch_dirs);
return SLURM_SUCCESS;
}
/* Append to the batch_dirs list the job_id's associated with
* every batch job directory in existence
* NOTE: READ lock_slurmctld config before entry
*/
static void _get_batch_job_dir_ids(List batch_dirs)
{
DIR *f_dir;
struct dirent *dir_ent;
long long_job_id;
uint32_t *job_id_ptr;
char *endptr;
xassert(slurmctld_conf.state_save_location);
f_dir = opendir(slurmctld_conf.state_save_location);
if (!f_dir) {
error("opendir(%s): %m",
slurmctld_conf.state_save_location);
return;
}
while ((dir_ent = readdir(f_dir))) {
if (strncmp("job.#", dir_ent->d_name, 4))
continue;
long_job_id = strtol(&dir_ent->d_name[4], &endptr, 10);
if ((long_job_id == 0) || (endptr[0] != '\0'))
continue;
debug3("found batch directory for job_id %ld",long_job_id);
job_id_ptr = xmalloc(sizeof(uint32_t));
*job_id_ptr = long_job_id;
list_append (batch_dirs, job_id_ptr);
}
closedir(f_dir);
}
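/*
 * Example of the directory name parsing above: an entry "job.1234"
 * passes the 4-character prefix test, strtol() on &d_name[4] yields
 * 1234 with *endptr == '\0', so 1234 is appended to batch_dirs. An
 * entry such as "job.1234.old" leaves endptr at '.', failing the
 * check, and is skipped.
 */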
/* All pending batch jobs must have a batch_dir entry, otherwise
 * the job is flagged FAILED and not scheduled.
 * If the batch_dir entry exists for a PENDING or RUNNING batch job,
 * remove it from the list (of directories to be deleted) */
static void _validate_job_files(List batch_dirs)
{
ListIterator job_iterator;
struct job_record *job_ptr;
int del_cnt;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
if (!job_ptr->batch_flag)
continue;
if (IS_JOB_FINISHED(job_ptr))
continue;
/* Want to keep this job's files */
del_cnt = list_delete_all(batch_dirs, _find_batch_dir,
&(job_ptr->job_id));
if ((del_cnt == 0) &&
(job_ptr->job_state == JOB_PENDING)) {
error("Script for job %u lost, state set to FAILED",
job_ptr->job_id);
job_ptr->job_state = JOB_FAILED;
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
job_ptr->start_time = job_ptr->end_time = time(NULL);
job_completion_logger(job_ptr);
}
}
list_iterator_destroy(job_iterator);
}
/* List matching function, see common/list.h */
static int _find_batch_dir(void *x, void *key)
{
uint32_t *key1 = x;
uint32_t *key2 = key;
return (int)(*key1 == *key2);
}
/* List entry deletion function, see common/list.h */
static void _del_batch_list_rec(void *x)
{
xfree(x);
}
/* Remove all batch_dir entries in the list
* NOTE: READ lock_slurmctld config before entry */
static void _remove_defunct_batch_dirs(List batch_dirs)
{
ListIterator batch_dir_inx;
uint32_t *job_id_ptr;
batch_dir_inx = list_iterator_create(batch_dirs);
while ((job_id_ptr = list_next(batch_dir_inx))) {
error("Purging files for defunct batch job %u",
*job_id_ptr);
_delete_job_desc_files(*job_id_ptr);
}
list_iterator_destroy(batch_dir_inx);
}
/*
* _xmit_new_end_time
* Tell all slurmd's associated with a job of its new end time
* IN job_ptr - pointer to terminating job
* globals: node_record_count - number of nodes in the system
* node_record_table_ptr - pointer to global node table
*/
static void
_xmit_new_end_time(struct job_record *job_ptr)
{
job_time_msg_t *job_time_msg_ptr;
agent_arg_t *agent_args;
int i;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_UPDATE_JOB_TIME;
agent_args->retry = 1;
agent_args->hostlist = hostlist_create("");
job_time_msg_ptr = xmalloc(sizeof(job_time_msg_t));
job_time_msg_ptr->job_id = job_ptr->job_id;
job_time_msg_ptr->expiration_time = job_ptr->end_time;
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
hostlist_push(agent_args->hostlist,
node_record_table_ptr[i].name);
agent_args->node_count++;
#ifdef HAVE_FRONT_END /* operate only on front-end node */
break;
#endif
}
agent_args->msg_args = job_time_msg_ptr;
agent_queue_request(agent_args);
return;
}
/*
* job_epilog_complete - Note the completion of the epilog script for a
* given job
* IN job_id - id of the job for which the epilog was executed
* IN node_name - name of the node on which the epilog was executed
* IN return_code - return code from epilog script
* RET true if job is COMPLETED, otherwise false
*/
extern bool job_epilog_complete(uint32_t job_id, char *node_name,
uint32_t return_code)
{
struct job_record *job_ptr = find_job_record(job_id);
if (job_ptr == NULL)
return true;
	/* There is a potential race condition that this handles.
	 * If slurmctld cold-starts while slurmd keeps running, slurmd
	 * could notify slurmctld of a job epilog completion before
	 * getting synced up with slurmctld's state. If a new job then
	 * arrives and the job_id is reused, we could try to note the
	 * termination of a job that has not really started. This is
	 * obviously very rare. */
if ((job_ptr->job_state == JOB_PENDING)
|| (job_ptr->node_bitmap == NULL)) {
error("Epilog complete request for non-running job %u, "
"slurmctld and slurmd out of sync", job_id);
return false;
}
#ifdef HAVE_FRONT_END /* operate only on front-end node */
{
int i;
struct node_record *node_ptr;
if (return_code)
error("Epilog error on %s, setting DOWN",
job_ptr->nodes);
for (i=0; i<node_record_count; i++) {
if (!bit_test(job_ptr->node_bitmap, i))
continue;
node_ptr = &node_record_table_ptr[i];
if (return_code)
set_node_down(node_ptr->name, "Epilog error");
else
make_node_idle(node_ptr, job_ptr);
}
}
#else
if (return_code) {
error("Epilog error on %s, setting DOWN", node_name);
set_node_down(node_name, "Epilog error");
} else {
struct node_record *node_ptr = find_node_record(node_name);
if (node_ptr)
make_node_idle(node_ptr, job_ptr);
}
#endif
step_epilog_complete(job_ptr, node_name);
/* nodes_completing is out of date, rebuild when next saved */
xfree(job_ptr->nodes_completing);
if (!(job_ptr->job_state & JOB_COMPLETING)) { /* COMPLETED */
if ((job_ptr->job_state == JOB_PENDING)
&& (job_ptr->batch_flag)) {
info("requeue batch job %u", job_ptr->job_id);
if (job_ptr->details) {
			/* The time stamp on the new batch launch
			 * credential must be larger than the time
			 * stamp on the revoke request. Also, the
			 * I/O must all be cleared out and the
			 * named socket purged, so delay for at
			 * least ten seconds. */
job_ptr->details->begin_time = time(NULL) + 10;
}
}
return true;
} else
return false;
}
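/*
 * Hedged caller sketch for job_epilog_complete(): a true return means
 * the job is fully out of the COMPLETING state, which is a natural
 * point to attempt to schedule pending jobs. Whether the handler
 * actually calls schedule() here is an assumption (not compiled):
 */
#if 0
	if (job_epilog_complete(job_id, node_name, return_code))
		(void) schedule();	/* nodes freed, try to start jobs */
#endif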
/* job_fini - free all memory associated with job records */
void job_fini (void)
{
if (job_list) {
list_destroy(job_list);
job_list = NULL;
}
xfree(job_hash);
}
/* log the completion of the specified job */
extern void job_completion_logger(struct job_record *job_ptr)
{
int base_state;
xassert(job_ptr);
base_state = job_ptr->job_state & (~JOB_COMPLETING);
if ((base_state == JOB_COMPLETE) || (base_state == JOB_CANCELLED)) {
if (job_ptr->mail_type & MAIL_JOB_END)
mail_job_info(job_ptr, MAIL_JOB_END);
} else { /* JOB_FAILED, JOB_NODE_FAIL, or JOB_TIMEOUT */
if (job_ptr->mail_type & MAIL_JOB_FAIL)
mail_job_info(job_ptr, MAIL_JOB_FAIL);
else if (job_ptr->mail_type & MAIL_JOB_END)
mail_job_info(job_ptr, MAIL_JOB_END);
}
g_slurm_jobcomp_write(job_ptr);
srun_job_complete(job_ptr);
}
/*
 * job_independent - determine if this job has a dependent job pending
 *	or if the job's scheduled begin time is in the future
 * IN job_ptr - pointer to job being tested
 * RET - true if the job need no longer be deferred for another job
*/
extern bool job_independent(struct job_record *job_ptr)
{
struct job_record *dep_ptr;
struct job_details *detail_ptr = job_ptr->details;
time_t now = time(NULL);
bool send_acct_rec = false;
if (detail_ptr && (detail_ptr->begin_time > now)) {
job_ptr->state_reason = WAIT_TIME;
return false; /* not yet time */
}
if (job_ptr->dependency == 0)
goto indi;
dep_ptr = find_job_record(job_ptr->dependency);
if (dep_ptr == NULL)
goto indi;
if (((dep_ptr->job_state & JOB_COMPLETING) == 0) &&
(dep_ptr->job_state >= JOB_COMPLETE))
goto indi;
job_ptr->state_reason = WAIT_DEPENDENCY;
return false; /* job exists and incomplete */
indi: /* job is independent, set begin time as needed */
if (detail_ptr && (detail_ptr->begin_time == 0)) {
detail_ptr->begin_time = now;
send_acct_rec = true;
} else if (job_ptr->state_reason == WAIT_TIME) {
job_ptr->state_reason = WAIT_NO_REASON;
send_acct_rec = true;
}
if (send_acct_rec) {
		/* We want to record when a job becomes eligible in
		 * order to calculate reserved time (a measure of
		 * system over-subscription); the job is not really
		 * starting now */
jobacct_g_job_start_slurmctld(job_ptr);
}
return true;
}
/*
* determine if job is ready to execute per the node select plugin
* IN job_id - job to test
 * OUT ready - bitmask of READY_NODE_STATE and READY_JOB_STATE,
 *	zero if the job is not ready to execute
* RET SLURM error code
*/
extern int job_node_ready(uint32_t job_id, int *ready)
{
int rc;
struct job_record *job_ptr;
xassert(ready);
*ready = 0;
job_ptr = find_job_record(job_id);
if (job_ptr == NULL)
return ESLURM_INVALID_JOB_ID;
rc = select_g_job_ready(job_ptr);
if (rc == READY_JOB_FATAL)
return ESLURM_INVALID_PARTITION_NAME;
if (rc == READY_JOB_ERROR)
return EAGAIN;
if (rc)
rc = READY_NODE_STATE;
if (job_ptr->job_state == JOB_RUNNING)
rc |= READY_JOB_STATE;
*ready = rc;
return SLURM_SUCCESS;
}
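/*
 * Note that *ready is a bitmask rather than a boolean: READY_NODE_STATE
 * is set when the select plugin reports the allocated nodes ready and
 * READY_JOB_STATE when the job is RUNNING. Caller sketch (assumed
 * handler context; not compiled):
 */
#if 0
	int ready = 0;
	if ((job_node_ready(job_id, &ready) == SLURM_SUCCESS) &&
	    (ready & READY_NODE_STATE) && (ready & READY_JOB_STATE)) {
		/* nodes booted and job running, e.g. step launch
		 * can proceed */
	}
#endif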
/* Send specified signal to all steps associated with a job */
static void _signal_job(struct job_record *job_ptr, int signal)
{
agent_arg_t *agent_args = NULL;
signal_job_msg_t *signal_job_msg = NULL;
int i;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_SIGNAL_JOB;
agent_args->retry = 1;
agent_args->hostlist = hostlist_create("");
	signal_job_msg = xmalloc(sizeof(signal_job_msg_t));
signal_job_msg->job_id = job_ptr->job_id;
signal_job_msg->signal = signal;
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
hostlist_push(agent_args->hostlist,
node_record_table_ptr[i].name);
agent_args->node_count++;
#ifdef HAVE_FRONT_END /* Operate only on front-end */
break;
#endif
}
if (agent_args->node_count == 0) {
xfree(signal_job_msg);
xfree(agent_args);
return;
}
agent_args->msg_args = signal_job_msg;
agent_queue_request(agent_args);
return;
}
/* Send suspend request to slurmd on all nodes associated with a job */
static void _suspend_job(struct job_record *job_ptr, uint16_t op)
{
agent_arg_t *agent_args;
suspend_msg_t *sus_ptr;
int i;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_SUSPEND;
	agent_args->retry = 0;	/* don't resend, or gang schedulers
				 * (sched/gang or sched/wiki) can
				 * quickly induce a huge backlog
				 * of agent.c RPCs */
agent_args->hostlist = hostlist_create("");
sus_ptr = xmalloc(sizeof(suspend_msg_t));
sus_ptr->job_id = job_ptr->job_id;
sus_ptr->op = op;
for (i = 0; i < node_record_count; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
hostlist_push(agent_args->hostlist,
node_record_table_ptr[i].name);
agent_args->node_count++;
#ifdef HAVE_FRONT_END /* Operate only on front-end */
break;
#endif
}
if (agent_args->node_count == 0) {
xfree(sus_ptr);
xfree(agent_args);
return;
}
agent_args->msg_args = sus_ptr;
agent_queue_request(agent_args);
return;
}
/* Specified job is being suspended, release allocated nodes */
static int _suspend_job_nodes(struct job_record *job_ptr)
{
int i, rc;
struct node_record *node_ptr = node_record_table_ptr;
uint16_t base_state, node_flags;
if ((rc = select_g_job_suspend(job_ptr)) != SLURM_SUCCESS)
return rc;
for (i=0; i<node_record_count; i++, node_ptr++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
if (node_ptr->run_job_cnt)
(node_ptr->run_job_cnt)--;
else {
error("Node %s run_job_cnt underflow",
node_ptr->name);
}
if (job_ptr->details
&& (job_ptr->details->shared == 0)) {
if (node_ptr->no_share_job_cnt)
(node_ptr->no_share_job_cnt)--;
else {
error("Node %s no_share_job_cnt "
"underflow", node_ptr->name);
}
if (node_ptr->no_share_job_cnt == 0)
bit_set(share_node_bitmap, i);
}
base_state = node_ptr->node_state & NODE_STATE_BASE;
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
if ((node_ptr->run_job_cnt == 0)
&& (node_ptr->comp_job_cnt == 0)) {
bit_set(idle_node_bitmap, i);
}
if (base_state == NODE_STATE_DOWN) {
debug3("_suspend_job_nodes: Node %s left DOWN",
node_ptr->name);
} else if (node_ptr->run_job_cnt) {
node_ptr->node_state = NODE_STATE_ALLOCATED |
node_flags;
} else {
node_ptr->node_state = NODE_STATE_IDLE |
node_flags;
}
}
last_job_update = last_node_update = time(NULL);
return rc;
}
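/*
 * The node_state manipulation above relies on node_state packing a base
 * state (IDLE, ALLOCATED, DOWN, ...) in its low bits and orthogonal
 * flag bits (e.g. DRAIN, NO_RESPOND) above them. A minimal sketch of
 * the mask-and-rebuild idiom (not compiled):
 */
#if 0
	uint16_t base  = node_ptr->node_state & NODE_STATE_BASE;
	uint16_t flags = node_ptr->node_state & NODE_STATE_FLAGS;
	if (base != NODE_STATE_DOWN)	/* never revive a DOWN node here */
		node_ptr->node_state = NODE_STATE_IDLE | flags;
#endif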
/* Specified job is being resumed, re-allocate the nodes */
static int _resume_job_nodes(struct job_record *job_ptr)
{
int i, rc;
struct node_record *node_ptr = node_record_table_ptr;
uint16_t base_state, node_flags;
if ((rc = select_g_job_resume(job_ptr)) != SLURM_SUCCESS)
return rc;
for (i=0; i<node_record_count; i++, node_ptr++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
base_state = node_ptr->node_state & NODE_STATE_BASE;
if (base_state == NODE_STATE_DOWN)
return SLURM_ERROR;
}
node_ptr = node_record_table_ptr;
for (i=0; i<node_record_count; i++, node_ptr++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
node_ptr->run_job_cnt++;
if (job_ptr->details
&& (job_ptr->details->shared == 0)) {
node_ptr->no_share_job_cnt++;
if (node_ptr->no_share_job_cnt)
bit_clear(share_node_bitmap, i);
}
bit_clear(idle_node_bitmap, i);
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
node_ptr->node_state = NODE_STATE_ALLOCATED |
node_flags;
}
last_job_update = last_node_update = time(NULL);
return rc;
}
/*
 * job_suspend - perform a suspend/resume operation on a job
* IN sus_ptr - suspend/resume request message
* IN uid - user id of the user issuing the RPC
* IN conn_fd - file descriptor on which to send reply,
* -1 if none
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid,
slurm_fd conn_fd)
{
int rc = SLURM_SUCCESS;
time_t now = time(NULL);
struct job_record *job_ptr = NULL;
slurm_msg_t resp_msg;
return_code_msg_t rc_msg;
/* test if this system configuration
* supports job suspend/resume */
if (strcasecmp(slurmctld_conf.switch_type,
"switch/federation") == 0) {
/* Work is needed to support the
* release and reuse of switch
* windows associated with a job */
rc = ESLURM_NOT_SUPPORTED;
}
#ifdef HAVE_BG
rc = ESLURM_NOT_SUPPORTED;
#endif
if (rc)
goto reply;
/* find the job */
job_ptr = find_job_record (sus_ptr->job_id);
if (job_ptr == NULL) {
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
/* validate the request */
if ((uid != 0) && (uid != getuid())) {
rc = ESLURM_ACCESS_DENIED;
goto reply;
}
if (job_ptr->job_state == JOB_PENDING) {
rc = ESLURM_JOB_PENDING;
goto reply;
}
if (IS_JOB_FINISHED(job_ptr)) {
rc = ESLURM_ALREADY_DONE;
goto reply;
}
/* perform the operation */
if (sus_ptr->op == SUSPEND_JOB) {
if (job_ptr->job_state != JOB_RUNNING) {
rc = ESLURM_DISABLED;
goto reply;
}
rc = _suspend_job_nodes(job_ptr);
if (rc != SLURM_SUCCESS)
goto reply;
_suspend_job(job_ptr, sus_ptr->op);
job_ptr->job_state = JOB_SUSPENDED;
if (job_ptr->suspend_time) {
job_ptr->pre_sus_time +=
difftime(now,
job_ptr->suspend_time);
} else {
job_ptr->pre_sus_time +=
difftime(now,
job_ptr->start_time);
}
suspend_job_step(job_ptr);
} else if (sus_ptr->op == RESUME_JOB) {
if (job_ptr->job_state != JOB_SUSPENDED) {
rc = ESLURM_DISABLED;
goto reply;
}
rc = _resume_job_nodes(job_ptr);
if (rc != SLURM_SUCCESS)
goto reply;
_suspend_job(job_ptr, sus_ptr->op);
job_ptr->job_state = JOB_RUNNING;
if (job_ptr->time_limit != INFINITE) {
/* adjust effective time_limit */
job_ptr->end_time = now +
(job_ptr->time_limit * 60)
- job_ptr->pre_sus_time;
}
}
job_ptr->time_last_active = now;
job_ptr->suspend_time = now;
jobacct_g_suspend_slurmctld(job_ptr);
reply:
if (conn_fd >= 0) {
slurm_msg_t_init(&resp_msg);
resp_msg.msg_type = RESPONSE_SLURM_RC;
rc_msg.return_code = rc;
resp_msg.data = &rc_msg;
slurm_send_node_msg(conn_fd, &resp_msg);
}
return rc;
}
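/*
 * Worked example of the time accounting above (values assumed): a job
 * with a 60 minute limit starts at t=0 and is suspended at t=600s, so
 * pre_sus_time becomes 600. When resumed at t=900s the effective
 * deadline is end_time = 900 + (60 * 60) - 600 = 3900s: the 300
 * suspended seconds do not count against the time limit.
 */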
/*
* job_requeue - Requeue a running or pending batch job
* IN uid - user id of user issuing the RPC
* IN job_id - id of the job to be requeued
* IN conn_fd - file descriptor on which to send reply
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd conn_fd)
{
int rc = SLURM_SUCCESS;
struct job_record *job_ptr = NULL;
bool super_user = false, suspended = false;
slurm_msg_t resp_msg;
return_code_msg_t rc_msg;
time_t now = time(NULL);
/* find the job */
job_ptr = find_job_record (job_id);
if (job_ptr == NULL) {
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
/* validate the request */
if ((uid == 0) || (uid == slurmctld_conf.slurm_user_id))
		super_user = true;
if ((uid != job_ptr->user_id) && (!super_user)) {
rc = ESLURM_ACCESS_DENIED;
goto reply;
}
if (IS_JOB_FINISHED(job_ptr)) {
rc = ESLURM_ALREADY_DONE;
goto reply;
}
if ((job_ptr->details == NULL) || job_ptr->details->no_requeue) {
rc = ESLURM_DISABLED;
goto reply;
}
if (job_ptr->job_state & JOB_COMPLETING) {
rc = ESLURM_TRANSITION_STATE_NO_UPDATE;
goto reply;
}
/* reset the priority */
_set_job_prio(job_ptr);
last_job_update = now;
/* nothing else to do if pending */
if (job_ptr->job_state == JOB_PENDING)
goto reply;
if (job_ptr->batch_flag == 0) {
rc = ESLURM_BATCH_ONLY;
goto reply;
}
if ((job_ptr->job_state != JOB_SUSPENDED)
&& (job_ptr->job_state != JOB_RUNNING)) {
error("job_requeue job %u state is bad %s", job_id,
job_state_string(job_ptr->job_state));
rc = EINVAL;
goto reply;
}
if (job_ptr->job_state == JOB_SUSPENDED)
suspended = true;
job_ptr->time_last_active = now;
if (suspended)
job_ptr->end_time = job_ptr->suspend_time;
else
job_ptr->end_time = now;
/* We want this job to look like it was cancelled in the
* accounting logs. Set a new submit time so the restarted
* job looks like a new job. */
job_ptr->job_state = JOB_CANCELLED;
deallocate_nodes(job_ptr, false, suspended);
xfree(job_ptr->details->req_node_layout);
job_completion_logger(job_ptr);
job_ptr->job_state = JOB_PENDING;
if (job_ptr->node_cnt)
job_ptr->job_state |= JOB_COMPLETING;
job_ptr->details->submit_time = now;
reply:
if (conn_fd >= 0) {
slurm_msg_t_init(&resp_msg);
resp_msg.msg_type = RESPONSE_SLURM_RC;
rc_msg.return_code = rc;
resp_msg.data = &rc_msg;
slurm_send_node_msg(conn_fd, &resp_msg);
}
return rc;
}
/*
* job_end_time - Process JOB_END_TIME
* IN time_req_msg - job end time request
* OUT timeout_msg - job timeout response to be sent
 * RET SLURM_SUCCESS or an error code
*/
extern int job_end_time(job_alloc_info_msg_t *time_req_msg,
srun_timeout_msg_t *timeout_msg)
{
struct job_record *job_ptr;
xassert(timeout_msg);
job_ptr = find_job_record(time_req_msg->job_id);
if (!job_ptr)
return ESLURM_INVALID_JOB_ID;
timeout_msg->job_id = time_req_msg->job_id;
timeout_msg->step_id = NO_VAL;
timeout_msg->timeout = job_ptr->end_time;
return SLURM_SUCCESS;
}
/* Reset nodes_completing field for all jobs.
* Job write lock must be set before calling. */
extern void update_job_nodes_completing(void)
{
ListIterator job_iterator;
struct job_record *job_ptr;
if (!job_list)
return;
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
if (((job_ptr->job_state & JOB_COMPLETING) == 0) ||
(job_ptr->node_bitmap == NULL))
continue;
xfree(job_ptr->nodes_completing);
job_ptr->nodes_completing = bitmap2node_name(job_ptr->node_bitmap);
}
list_iterator_destroy(job_iterator);
}