/*****************************************************************************\
* job_mgr.c - manage the job information of slurm
* Note: there is a global job list (job_list), time stamp
* (last_job_update), and hash table (job_hash)
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#define _GNU_SOURCE
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/common/bitstring.h"
#include "src/common/cpu_frequency.h"
#include "src/common/cron.h"
#include "src/common/fd.h"
#include "src/common/forward.h"
#include "src/common/hostlist.h"
#include "src/common/id_util.h"
#include "src/common/node_features.h"
#include "src/common/parse_time.h"
#include "src/common/port_mgr.h"
#include "src/common/slurm_protocol_pack.h"
#include "src/common/state_save.h"
#include "src/common/timers.h"
#include "src/common/track_script.h"
#include "src/common/tres_bind.h"
#include "src/common/tres_frequency.h"
#include "src/common/xassert.h"
#include "src/common/xstring.h"
#include "src/interfaces/accounting_storage.h"
#include "src/interfaces/acct_gather.h"
#include "src/interfaces/auth.h"
#include "src/interfaces/burst_buffer.h"
#include "src/interfaces/cred.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/hash.h"
#include "src/interfaces/job_submit.h"
#include "src/interfaces/jobcomp.h"
#include "src/interfaces/mcs.h"
#include "src/interfaces/node_features.h"
#include "src/interfaces/preempt.h"
#include "src/interfaces/priority.h"
#include "src/interfaces/sched_plugin.h"
#include "src/interfaces/select.h"
#include "src/interfaces/switch.h"
#include "src/interfaces/topology.h"
#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/agent.h"
#include "src/slurmctld/fed_mgr.h"
#include "src/slurmctld/gang.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/power_save.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/slurmscriptd.h"
#include "src/slurmctld/state_save.h"
#include "src/slurmctld/trigger_mgr.h"
#include "src/stepmgr/gres_stepmgr.h"
#include "src/stepmgr/srun_comm.h"
#include "src/stepmgr/stepmgr.h"
#define ARRAY_ID_BUF_SIZE 32
#define MAX_EXIT_VAL 255 /* Maximum value returned by WIFEXITED() */
#define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0
#define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */
#define PURGE_OLD_JOB_IN_SEC 2592000 /* 30 days in seconds */
#define JOB_HASH_INX(_job_id) (_job_id % hash_table_size)
#define JOB_ARRAY_HASH_INX(_job_id, _task_id) \
((_job_id + _task_id) % hash_table_size)
/* No need to change this; we always pack SLURM_PROTOCOL_VERSION */
#define JOB_STATE_VERSION "PROTOCOL_VERSION"
typedef enum {
JOB_HASH_JOB,
JOB_HASH_ARRAY_JOB,
JOB_HASH_ARRAY_TASK,
} job_hash_type_t;
typedef struct {
int resp_array_cnt;
int resp_array_size;
uint32_t *resp_array_rc;
bitstr_t **resp_array_task_id;
char **err_msg;
} resp_array_struct_t;
typedef struct {
buf_t *buffer;
uint32_t filter_uid;
bool has_qos_lock;
job_record_t *het_leader;
uint32_t jobs_packed;
uint16_t protocol_version;
uint16_t show_flags;
uid_t uid;
slurmdb_user_rec_t user_rec;
bool privileged;
part_record_t **visible_parts;
} _foreach_pack_job_info_t;
typedef struct {
bitstr_t *node_map;
list_t *license_list;
int rc;
} job_overlap_args_t;
typedef struct {
slurm_selected_step_t *filter_id;
bool free_array_bitmap;
job_record_t *job_ptr;
} array_task_filter_t;
typedef struct {
list_t *array_leader_list; /* list of job_record_t */
list_t *pending_array_task_list; /* list of array_task_filter_t */
uid_t auth_uid;
bool filter_specific_job_ids;
job_record_t *het_leader;
kill_jobs_msg_t *kill_msg;
time_t now;
list_t *other_job_list; /* list of job_record_t */
list_t *responses; /* list of kill_jobs_resp_job_t */
} signal_jobs_args_t;
typedef struct {
int curr_count;
kill_jobs_resp_msg_t *resp_msg;
} xfer_signal_jobs_responses_args_t;
#define MAGIC_FOREACH_BY_JOBID_ARGS 0x1a0beebe
typedef struct {
int magic; /* MAGIC_FOREACH_BY_JOBID_ARGS */
foreach_job_by_id_control_t control;
uint32_t count;
JobForEachFunc callback;
JobNullForEachFunc null_callback; /* If not set, then do nothing when
* the job id is not found. */
JobROForEachFunc ro_callback;
void *callback_arg;
job_record_t *job_ptr;
const slurm_selected_step_t *filter;
} for_each_by_job_id_args_t;
typedef struct {
uint32_t error_code;
uint32_t max_nodes;
uint32_t min_nodes;
part_record_t *part_ptr;
uid_t submit_uid;
uint32_t time_limit;
} qos_part_check_t;
typedef struct {
uint32_t het_job_offset;
job_record_t *job_ptr;
uint16_t min_part_prio_tier;
time_t now;
bitstr_t *part_nodes;
bool use_none_resv_nodes;
} top_prio_args_t;
typedef struct {
job_record_t *job_ptr;
hostset_t *hs;
} foreach_hetcomp_args_t;
typedef struct {
job_step_kill_msg_t *job_step_kill_msg;
int rc;
uint32_t uid;
} foreach_kill_hetjob_step_t;
typedef struct {
slurmctld_resv_t *cur_resv;
bool found;
job_record_t *job_ptr2;
} findfirst_resv_overlap_t;
typedef struct {
time_t batch_startup_time;
job_record_t *job_ptr;
time_t node_boot_time;
node_record_t *node_ptr;
int node_inx;
time_t now;
bool power_save_on;
} foreach_purge_missing_jobs_t;
typedef struct {
int kill_job_cnt;
node_record_t *node_ptr;
time_t now;
part_record_t *part_ptr;
bool requeue_on_resume_failure;
} foreach_kill_job_by_t;
typedef struct {
uint16_t flags;
job_record_t *het_job_leader;
bool preempt;
int rc;
uint16_t signal;
uid_t uid;
} foreach_kill_hetjob_t;
typedef struct {
job_record_t *het_job_leader;
uint32_t job_return_code;
bool node_fail;
bool requeue;
int rc;
uid_t uid;
} foreach_complete_hetjob_t;
typedef struct {
char *names;
char *names_pos;
part_record_t *part_ptr;
} foreach_rebuild_names_t;
typedef struct {
bool any_check;
slurmdb_assoc_rec_t *assoc_ptr;
job_desc_msg_t *job_desc;
uint32_t max_nodes_orig;
uint32_t max_time;
uint32_t min_nodes_orig;
slurmdb_qos_rec_t *qos_ptr;
list_t *qos_ptr_list;
int rc;
bitstr_t *req_bitmap;
uid_t submit_uid;
} foreach_valid_part_t;
typedef struct {
uint16_t cpus_per_task;
job_desc_msg_t *job_desc;
uint32_t max_cpus;
uint32_t min_cpus;
uint32_t pn_min_cpus;
uint64_t pn_min_memory;
int rc;
} foreach_valid_pn_min_mem_t;
typedef struct {
char *err_msg;
job_record_t *het_leader;
job_desc_msg_t *job_desc;
int rc;
uid_t uid;
} foreach_update_hetjob_t;
typedef struct {
job_record_t *het_leader;
bool indf_susp;
uint16_t op;
int rc;
} foreach_sus_hetjob_t;
typedef struct {
uint32_t flags;
job_record_t *het_leader;
bool preempt;
int rc;
uid_t uid;
} foreach_requeue_hetjob_t;
typedef struct {
uint32_t id;
int cnt;
} foreach_hold_by_id_t;
/* Global variables */
list_t *job_list = NULL; /* job_record list */
time_t last_job_update; /* time of last update to job records */
list_t *purge_jobs_list = NULL; /* job_record_t entries to free */
/* Local variables */
static int bf_min_age_reserve = 0;
static uint32_t delay_boot = 0;
static uint32_t highest_prio = 0;
static uint32_t lowest_prio = TOP_PRIORITY;
static int hash_table_size = 0;
static int job_count = 0; /* jobs in the system */
static uint32_t job_id_sequence = 0; /* first job_id to assign new job */
static struct job_record **job_hash = NULL;
static struct job_record **job_array_hash_j = NULL;
static struct job_record **job_array_hash_t = NULL;
static bool kill_invalid_dep;
static time_t last_file_write_time = (time_t) 0;
static uint32_t max_array_size = NO_VAL;
static bitstr_t *requeue_exit = NULL;
static bitstr_t *requeue_exit_hold = NULL;
static bool validate_cfgd_licenses = true;
/* Local functions */
static void _signal_pending_job_array_tasks(job_record_t *job_ptr, bitstr_t
**array_bitmap, uint16_t signal,
uid_t uid, int32_t i_last,
time_t now, int *rc);
static void _add_job_hash(job_record_t *job_ptr);
static void _add_job_array_hash(job_record_t *job_ptr);
static void _handle_requeue_limit(job_record_t *job_ptr, const char *caller);
static int _copy_job_desc_to_file(job_desc_msg_t * job_desc,
uint32_t job_id);
static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
job_record_t **job_ptr,
bitstr_t ** exc_bitmap,
bitstr_t ** req_bitmap);
static char *_copy_nodelist_no_dup(char *node_list);
static job_record_t *_create_job_record(uint32_t num_jobs, bool list_add);
static slurmdb_qos_rec_t *_determine_and_validate_qos(
char *resv_name, slurmdb_assoc_rec_t *assoc_ptr, bool privileged,
slurmdb_qos_rec_t *qos_rec, int *error_code, bool locked,
log_level_t log_lvl);
static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src);
static uint64_t _get_def_mem(part_record_t *part_ptr, uint64_t *tres_req_cnt);
static bool _get_whole_hetjob(void);
static bool _higher_precedence(job_record_t *job_ptr, job_record_t *job_ptr2);
static void _job_array_comp(job_record_t *job_ptr, bool was_running,
bool requeue);
static int _job_create(job_desc_msg_t *job_desc, int allocate, int will_run,
bool cron, job_record_t **job_rec_ptr, uid_t submit_uid,
char **err_msg, uint16_t protocol_version);
static void _job_timed_out(job_record_t *job_ptr, bool preempted);
static void _kill_dependent(job_record_t *job_ptr);
static int _list_find_job_old(void *job_entry, void *key);
static bitstr_t *_make_requeue_array(char *conf_buf);
static uint32_t _max_switch_wait(uint32_t input_wait);
static void _move_to_purge_jobs_list(void *job_entry);
static time_t _get_last_job_state_write_time(void);
static void _pack_default_job_details(job_record_t *job_ptr, buf_t *buffer,
uint16_t protocol_version);
static void _pack_pending_job_details(job_details_t *detail_ptr, buf_t *buffer,
uint16_t protocol_version);
static void _purge_missing_jobs(int node_inx, time_t now);
static int _read_data_array_from_file(int fd, char *file_name, char ***data,
uint32_t *size, job_record_t *job_ptr);
static void _remove_job_hash(job_record_t *job_ptr, job_hash_type_t type);
static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr,
uint32_t rc, char *err_msg);
static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id,
uint32_t task_id, uint32_t rc);
static void _resp_array_free(resp_array_struct_t *resp);
static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp,
uint32_t job_id);
static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp);
static void _send_job_kill(job_record_t *job_ptr);
static int _set_job_id(job_record_t *job_ptr);
static void _set_job_requeue_exit_value(job_record_t *job_ptr);
static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal,
uint16_t flags);
static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags);
static void _suspend_job(job_record_t *job_ptr, uint16_t op);
static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp);
static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset);
static int _update_job_nodes_str(job_record_t *job_ptr);
static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid,
bitstr_t *req_bitmap, part_record_t *part_ptr,
list_t *part_ptr_list,
slurmdb_assoc_rec_t *assoc_ptr,
slurmdb_qos_rec_t *qos_ptr,
list_t *qos_ptr_list);
static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate,
bool cron, uid_t submit_uid,
part_record_t *part_ptr, list_t *part_list);
static void _validate_job_files(void);
static int _clear_state_dir_flag(void *x, void *arg);
static int _test_state_dir_flag(void *x, void *arg);
static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg,
part_record_t *part_ptr,
list_t *part_list);
static bool _valid_pn_min_mem(job_desc_msg_t * job_desc_msg,
part_record_t *part_ptr);
static int _write_data_array_to_file(char *file_name, char **data,
uint32_t size);
static char *_get_mail_user(const char *user_name, job_record_t *job_ptr)
{
char *mail_user = NULL;
if (!user_name || (user_name[0] == '\0')) {
mail_user = user_from_job(job_ptr);
/* unqualified sender, append MailDomain if set */
if (slurm_conf.mail_domain)
xstrfmtcat(mail_user, "@%s", slurm_conf.mail_domain);
} else {
mail_user = xstrdup(user_name);
}
return mail_user;
}
static int _job_fail_account(job_record_t *job_ptr, const char *func_name,
bool assoc_locked)
{
int rc = 0; // Return number of pending jobs held
if (IS_JOB_FINISHED(job_ptr)) {
/*
* The acct_policy has already been cleared for this job. Just
* reset the pointer.
*/
job_ptr->assoc_ptr = NULL;
job_ptr->assoc_id = 0;
return rc;
}
if (IS_JOB_PENDING(job_ptr)) {
info("%s: %pJ ineligible due to invalid association",
func_name, job_ptr);
xfree(job_ptr->state_desc);
job_ptr->state_reason = FAIL_ACCOUNT;
if (job_ptr->details) {
/* reset the job */
job_ptr->details->accrue_time = 0;
job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
job_ptr->details->begin_time = 0;
/* Update job with new begin_time. */
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
}
rc = 1;
}
/* This job is no longer eligible, so mark it accordingly. */
if (job_ptr->assoc_ptr) {
part_record_t *tmp_part = job_ptr->part_ptr;
list_t *tmp_part_list = job_ptr->part_ptr_list;
slurmdb_qos_rec_t *tmp_qos = job_ptr->qos_ptr;
/*
* Force a start so the association doesn't get lost, since
* there could be some delay in the start of the job when
* running with the slurmdbd.
*/
if (!IS_JOB_IN_DB(job_ptr))
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
/*
* Don't call acct_policy_remove_accrue_time() here, the cnt on
* parent associations will be handled correctly by the removal
* of the association.
*/
/*
* Clear ptrs so that only association usage is removed.
* Otherwise qos and partition limits will be double accounted
* for when this job finishes. Don't do this for accrual time;
* it has to be on both because the job is ineligible and can't
* accrue time.
*/
job_ptr->part_ptr = NULL;
job_ptr->part_ptr_list = NULL;
job_ptr->qos_ptr = NULL;
acct_policy_remove_job_submit(job_ptr, assoc_locked);
job_ptr->part_ptr = tmp_part;
job_ptr->part_ptr_list = tmp_part_list;
job_ptr->qos_ptr = tmp_qos;
job_ptr->assoc_ptr = NULL;
/* Don't clear assoc_id, since that is what the job requests */
}
job_ptr->assoc_id = 0;
return rc;
}
extern int job_fail_qos(job_record_t *job_ptr, const char *func_name,
bool assoc_locked)
{
int rc = 0; // Return number of pending jobs held
if (IS_JOB_FINISHED(job_ptr)) {
/*
* The acct_policy has already been cleared for this job. Just
* reset the pointer.
*/
job_ptr->qos_ptr = NULL;
job_ptr->qos_id = 0;
return rc;
}
if (IS_JOB_PENDING(job_ptr)) {
info("%s: %pJ ineligible due to invalid qos",
func_name, job_ptr);
xfree(job_ptr->state_desc);
job_ptr->state_reason = FAIL_QOS;
if (job_ptr->details) {
/* reset the job */
acct_policy_remove_accrue_time(job_ptr, assoc_locked);
job_ptr->details->begin_time = 0;
/* Update job with new begin_time. */
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
}
rc = 1;
}
/* This job is no longer eligible, so mark it accordingly. */
if (job_ptr->qos_ptr) {
slurmdb_assoc_rec_t *tmp_assoc = job_ptr->assoc_ptr;
/*
* Force a start so the qos doesn't get lost, since
* there could be some delay in the start of the job when
* running with the slurmdbd.
*/
if (!IS_JOB_IN_DB(job_ptr))
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
/*
* Clear ptrs so that only qos usage is removed. Otherwise
* association limits will be double accounted for when this
* job finishes. Don't do this for accrual time; it has to be
* on both because the job is ineligible and can't accrue time.
*/
job_ptr->assoc_ptr = NULL;
acct_policy_remove_job_submit(job_ptr, assoc_locked);
job_ptr->assoc_ptr = tmp_assoc;
job_ptr->qos_ptr = NULL;
FREE_NULL_LIST(job_ptr->qos_list);
/*
* Don't clear qos_id or details->qos_req, since that is what
* the job requests
*/
}
return rc;
}
/*
* Functions used to manage job array responses with a separate return code
* possible for each task ID
*/
/* Add job record to resp_array_struct_t, free with _resp_array_free() */
static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr,
uint32_t rc, char *err_msg)
{
resp_array_struct_t *loc_resp;
int array_size;
int i;
if ((job_ptr->array_task_id == NO_VAL) &&
(job_ptr->array_recs == NULL)) {
error("%s: called for non-job array %pJ",
__func__, job_ptr);
return;
}
if (max_array_size == NO_VAL)
max_array_size = slurm_conf.max_array_sz;
xassert(resp);
if (*resp == NULL) {
/* Initialize the data structure */
loc_resp = xmalloc(sizeof(resp_array_struct_t));
loc_resp->resp_array_cnt = 0;
loc_resp->resp_array_size = 10;
xrecalloc(loc_resp->resp_array_rc, loc_resp->resp_array_size,
sizeof(uint32_t));
xrecalloc(loc_resp->resp_array_task_id,
loc_resp->resp_array_size,
sizeof(bitstr_t *));
xrecalloc(loc_resp->err_msg, loc_resp->resp_array_size,
sizeof(char *));
*resp = loc_resp;
} else {
loc_resp = *resp;
}
for (i = 0; i < loc_resp->resp_array_cnt; i++) {
if (loc_resp->resp_array_rc[i] != rc)
continue;
/* Add to existing error code record */
if (job_ptr->array_task_id != NO_VAL) {
if (job_ptr->array_task_id <
bit_size(loc_resp->resp_array_task_id[i])) {
bit_set(loc_resp->resp_array_task_id[i],
job_ptr->array_task_id);
} else {
error("%s: found invalid task id %pJ",
__func__, job_ptr);
}
} else if (job_ptr->array_recs &&
job_ptr->array_recs->task_id_bitmap) {
array_size = bit_size(job_ptr->array_recs->
task_id_bitmap);
if (bit_size(loc_resp->resp_array_task_id[i]) !=
array_size) {
bit_realloc(loc_resp->resp_array_task_id[i],
array_size);
}
bit_or(loc_resp->resp_array_task_id[i],
job_ptr->array_recs->task_id_bitmap);
} else {
error("%s: found job %pJ without task ID or bitmap",
__func__, job_ptr);
}
return;
}
/* Need to add a new record for this error code */
if (loc_resp->resp_array_cnt >= loc_resp->resp_array_size) {
/* Need to grow the table size */
loc_resp->resp_array_size += 10;
xrecalloc(loc_resp->resp_array_rc, loc_resp->resp_array_size,
sizeof(uint32_t));
xrecalloc(loc_resp->resp_array_task_id,
loc_resp->resp_array_size,
sizeof(bitstr_t *));
xrecalloc(loc_resp->err_msg, loc_resp->resp_array_size,
sizeof(char *));
}
loc_resp->resp_array_rc[loc_resp->resp_array_cnt] = rc;
loc_resp->err_msg[loc_resp->resp_array_cnt] = xstrdup(err_msg);
if (job_ptr->array_task_id != NO_VAL) {
loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
bit_alloc(max_array_size);
if (job_ptr->array_task_id <
bit_size(loc_resp->resp_array_task_id
[loc_resp->resp_array_cnt])) {
bit_set(loc_resp->resp_array_task_id
[loc_resp->resp_array_cnt],
job_ptr->array_task_id);
}
} else if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) {
loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
bit_copy(job_ptr->array_recs->task_id_bitmap);
} else {
error("%s: found %pJ without task ID or bitmap",
__func__, job_ptr);
loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] =
bit_alloc(max_array_size);
}
loc_resp->resp_array_cnt++;
}
/* Add record to resp_array_struct_t, free with _resp_array_free().
* This is a variant of _resp_array_add for the case where a job/task ID
* is not found, so we use a dummy job record based upon the input IDs. */
static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id,
uint32_t task_id, uint32_t rc)
{
job_record_t job_ptr;
job_ptr.job_id = job_id;
job_ptr.array_job_id = job_id;
job_ptr.array_task_id = task_id;
job_ptr.array_recs = NULL;
_resp_array_add(resp, &job_ptr, rc, NULL);
}
/* Free resp_array_struct_t built by _resp_array_add() */
static void _resp_array_free(resp_array_struct_t *resp)
{
int i;
if (resp) {
for (i = 0; i < resp->resp_array_cnt; i++) {
FREE_NULL_BITMAP(resp->resp_array_task_id[i]);
xfree(resp->err_msg[i]);
}
xfree(resp->err_msg);
xfree(resp->resp_array_task_id);
xfree(resp->resp_array_rc);
xfree(resp);
}
}
/* Translate internal job array data structure into a response message */
static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp,
uint32_t job_id)
{
job_array_resp_msg_t *msg;
char task_str[ARRAY_ID_BUF_SIZE];
int *ffs = NULL;
int i, j, low;
ffs = xcalloc(resp->resp_array_cnt, sizeof(int));
for (i = 0; i < resp->resp_array_cnt; i++) {
ffs[i] = bit_ffs(resp->resp_array_task_id[i]);
}
msg = xmalloc(sizeof(job_array_resp_msg_t));
msg->job_array_count = resp->resp_array_cnt;
msg->job_array_id = xcalloc(resp->resp_array_cnt, sizeof(char *));
msg->error_code = xcalloc(resp->resp_array_cnt, sizeof(uint32_t));
msg->err_msg = xcalloc(resp->resp_array_cnt, sizeof(char *));
for (i = 0; i < resp->resp_array_cnt; i++) {
low = -1;
for (j = 0; j < resp->resp_array_cnt; j++) {
if ((ffs[j] != -1) &&
((low == -1) || (ffs[j] < ffs[low])))
low = j;
}
if (low == -1)
break;
ffs[low] = -1;
msg->error_code[i] = resp->resp_array_rc[low];
msg->err_msg[i] = xstrdup(resp->err_msg[low]);
bit_fmt(task_str, ARRAY_ID_BUF_SIZE,
resp->resp_array_task_id[low]);
if (strlen(task_str) >= ARRAY_ID_BUF_SIZE - 2) {
/* Append "..." to the buffer on overflow */
task_str[ARRAY_ID_BUF_SIZE - 4] = '.';
task_str[ARRAY_ID_BUF_SIZE - 3] = '.';
task_str[ARRAY_ID_BUF_SIZE - 2] = '.';
task_str[ARRAY_ID_BUF_SIZE - 1] = '\0';
}
xstrfmtcat(msg->job_array_id[i], "%u_%s", job_id, task_str);
}
xfree(ffs);
return msg;
}
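/*
 * A minimal end-to-end usage sketch of the resp-array helpers above.
 * Illustrative only: the wrapping function, its arguments, and the error
 * codes chosen here are assumptions, not calls made anywhere in this file.
 */
#if 0
static job_array_resp_msg_t *_example_resp_array_usage(job_record_t *task_a,
job_record_t *task_b)
{
resp_array_struct_t *resp = NULL;
job_array_resp_msg_t *msg;
/* Tasks sharing a return code are merged into one bitmap entry */
_resp_array_add(&resp, task_a, SLURM_SUCCESS, NULL);
_resp_array_add(&resp, task_b, ESLURM_INVALID_JOB_ID, "bad task");
/* A task ID that was never found gets a dummy record */
_resp_array_add_id(&resp, task_a->array_job_id, 777, ESLURM_INVALID_JOB_ID);
/* Translate to the RPC response, then release the working structure */
msg = _resp_array_xlate(resp, task_a->array_job_id);
_resp_array_free(resp);
return msg; /* ownership passes to the caller */
}
#endif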
static int _add_job_record(job_record_t *job_ptr, int num_jobs)
{
if ((job_count + num_jobs) > slurm_conf.max_job_cnt) {
error("%s: MaxJobCount limit from slurm.conf reached (%u)",
__func__, slurm_conf.max_job_cnt);
return SLURM_ERROR;
}
job_count += num_jobs;
last_job_update = time(NULL);
list_append(job_list, job_ptr);
return SLURM_SUCCESS;
}
/*
* _create_job_record - create an empty job_record, including job_details,
* and load its values with defaults (zeros, nulls, and magic cookie)
* IN num_jobs - number of jobs this record should represent
* = 0 - split out a job array record to its own job record
* = 1 - simple job OR job array with one task
* > 1 - job array create with the task count as num_jobs
* IN list_add - add to the joblist or not.
* RET pointer to the record or NULL if error
* NOTE: allocates memory that should be xfreed with job_record_delete
*/
static job_record_t *_create_job_record(uint32_t num_jobs, bool list_add)
{
job_record_t *job_ptr = job_record_create();
if (list_add) {
_add_job_record(job_ptr, num_jobs);
}
return job_ptr;
}
/*
* delete_job_desc_files - delete job descriptor related files
*
* Note that this will be called on all individual job array tasks,
* even though (as of 17.11) individual directories are no longer created.
*/
extern void delete_job_desc_files(uint32_t job_id)
{
char *dir_name = NULL, *file_name = NULL;
int hash = job_id % 10;
DIR *f_dir;
struct dirent *dir_ent;
dir_name = xstrdup_printf("%s/hash.%d/job.%u",
slurm_conf.state_save_location,
hash, job_id);
f_dir = opendir(dir_name);
if (f_dir) {
while ((dir_ent = readdir(f_dir))) {
if (!xstrcmp(dir_ent->d_name, ".") ||
!xstrcmp(dir_ent->d_name, ".."))
continue;
xstrfmtcat(file_name, "%s/%s", dir_name,
dir_ent->d_name);
(void) unlink(file_name);
xfree(file_name);
}
closedir(f_dir);
} else if (errno == ENOENT) {
xfree(dir_name);
return;
} else {
error("opendir(%s): %m", dir_name);
}
(void) rmdir(dir_name);
xfree(dir_name);
}
static uint32_t _max_switch_wait(uint32_t input_wait)
{
static time_t sched_update = 0;
static uint32_t max_wait = 300; /* default max_switch_wait, seconds */
int i;
if (sched_update != slurm_conf.last_update) {
char *tmp_ptr;
sched_update = slurm_conf.last_update;
if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params,
"max_switch_wait="))) {
/* 0123456789012345 */
i = atoi(tmp_ptr + 16);
if (i < 0) {
error("ignoring SchedulerParameters: "
"max_switch_wait of %d", i);
} else {
max_wait = i;
}
}
}
if (max_wait > input_wait)
return input_wait;
return max_wait;
}
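/*
 * Illustrative example (values assumed, not from this file): with
 * SchedulerParameters=...,max_switch_wait=600 in slurm.conf, a job that
 * requested a 3600 second switch wait gets _max_switch_wait(3600) == 600,
 * while a request of 120 seconds passes through unchanged.
 */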
static slurmdb_qos_rec_t *_determine_and_validate_qos(
char *resv_name, slurmdb_assoc_rec_t *assoc_ptr, bool privileged,
slurmdb_qos_rec_t *qos_rec, int *error_code, bool locked,
log_level_t log_lvl)
{
slurmdb_qos_rec_t *qos_ptr = NULL;
/* If enforcing associations, make sure this is a valid qos
for the association. If not, just fill in the qos and
continue. */
xassert(qos_rec);
assoc_mgr_get_default_qos_info(assoc_ptr, qos_rec);
if (assoc_mgr_fill_in_qos(acct_db_conn, qos_rec, accounting_enforce,
&qos_ptr, locked) != SLURM_SUCCESS) {
log_var(log_lvl, "Invalid qos (%s)", qos_rec->name);
*error_code = ESLURM_INVALID_QOS;
return NULL;
}
if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS) && assoc_ptr &&
!privileged &&
(!assoc_ptr->usage->valid_qos ||
!bit_test(assoc_ptr->usage->valid_qos, qos_rec->id))) {
log_var(log_lvl, "This association %d(account='%s', user='%s', partition='%s') does not have access to qos %s",
assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user,
assoc_ptr->partition, qos_rec->name);
*error_code = ESLURM_INVALID_QOS;
return NULL;
}
if (qos_ptr) {
if ((qos_ptr->flags & QOS_FLAG_RELATIVE) &&
(qos_ptr->flags & QOS_FLAG_PART_QOS)) {
log_var(log_lvl, "QOS %s is relative and used as a Partition QOS. This prohibits it from being used as a job's QOS",
qos_rec->name);
*error_code = ESLURM_INVALID_QOS;
return NULL;
}
if ((qos_ptr->flags & QOS_FLAG_REQ_RESV) &&
(!resv_name || resv_name[0] == '\0')) {
log_var(log_lvl, "qos %s can only be used in a reservation",
qos_rec->name);
*error_code = ESLURM_INVALID_QOS;
return NULL;
}
}
*error_code = SLURM_SUCCESS;
return qos_ptr;
}
static list_t *_get_qos_ptr_list(char *qos_req, char *resv_name,
slurmdb_assoc_rec_t *assoc_ptr,
bool privileged, int *error_code, bool locked,
log_level_t log_lvl)
{
list_t *qos_ptr_list = NULL;
char *token, *last = NULL, *tmp_qos_req;
xassert(error_code);
if (!xstrchr(qos_req, ','))
return qos_ptr_list;
tmp_qos_req = xstrdup(qos_req);
token = strtok_r(tmp_qos_req, ",", &last);
while (token) {
slurmdb_qos_rec_t qos_rec = {
.name = token,
};
slurmdb_qos_rec_t *qos_ptr =
_determine_and_validate_qos(resv_name, assoc_ptr,
privileged, &qos_rec,
error_code, locked,
log_lvl);
if (*error_code != SLURM_SUCCESS)
break;
/*
* This should not happen as the error_code check should catch
* issues before we get here.
*/
if (!qos_ptr) {
*error_code = ESLURM_INVALID_QOS;
break;
}
if (!qos_ptr_list)
qos_ptr_list = list_create(NULL);
if (!list_find_first_ro(qos_ptr_list,
slurm_find_ptr_in_list,
qos_ptr)) {
list_append(qos_ptr_list, qos_ptr);
}
token = strtok_r(NULL, ",", &last);
}
xfree(tmp_qos_req);
/* If we have a trailing comma, error out */
if (qos_ptr_list && (list_count(qos_ptr_list) == 1)) {
error("%s: Invalid qos (%s), it appears there is a trailing comma",
__func__, qos_req);
*error_code = ESLURM_INVALID_QOS;
}
if (*error_code != SLURM_SUCCESS)
FREE_NULL_LIST(qos_ptr_list);
if (qos_ptr_list)
list_sort(qos_ptr_list, priority_sort_qos_desc);
return qos_ptr_list;
}
static int _get_qos_info(char *qos_req, uint32_t qos_id, list_t **qos_plist,
slurmdb_qos_rec_t **qos_pptr, char *resv_name,
slurmdb_assoc_rec_t *assoc_ptr, bool privileged,
bool locked, log_level_t log_lvl)
{
int rc = SLURM_SUCCESS;
xassert(qos_plist);
xassert(qos_pptr);
xassert(!*qos_plist);
*qos_plist = _get_qos_ptr_list(qos_req, resv_name, assoc_ptr,
privileged, &rc, locked, log_lvl);
if (!*qos_plist) {
slurmdb_qos_rec_t qos_rec = {
.name = qos_req,
.id = qos_id,
};
*qos_pptr = _determine_and_validate_qos(resv_name, assoc_ptr,
privileged, &qos_rec,
&rc, locked, log_lvl);
} else {
*qos_pptr = list_peek(*qos_plist);
}
return rc;
}
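/*
 * Illustrative behavior of the two helpers above (inputs assumed):
 *   qos_req = "high"        -> *qos_plist stays NULL; *qos_pptr is set by
 *                              _determine_and_validate_qos() alone
 *   qos_req = "high,normal" -> *qos_plist holds both records, sorted by
 *                              priority_sort_qos_desc(); *qos_pptr is the
 *                              highest-priority entry (list_peek())
 *   qos_req = "high,"       -> rejected as ESLURM_INVALID_QOS (a trailing
 *                              comma yields a single-element list)
 */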
/*
* dump_all_job_state - save the state of all jobs to file for checkpoint
* Changes here should be reflected in load_last_job_id() and
* load_all_job_state().
* RET 0 or error code
*/
int dump_all_job_state(void)
{
/* Save high-water mark to avoid buffer growth with copies */
static uint32_t high_buffer_size = (1024 * 1024);
int error_code = SLURM_SUCCESS;
char *reg_file;
struct stat stat_buf;
/* Locks: Read config and job */
slurmctld_lock_t job_read_lock =
{ READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
buf_t *buffer = init_buf(high_buffer_size);
time_t now = time(NULL);
time_t last_state_file_time;
static time_t last_job_state_size_check = 0;
uint32_t jobs_start, jobs_end, jobs_count;
DEF_TIMERS;
START_TIMER;
/*
* Check that last state file was written at expected time.
* This is a check for two slurmctld daemons running at the same
* time in primary mode (a split-brain problem).
*/
last_state_file_time = _get_last_job_state_write_time();
if (last_file_write_time && last_state_file_time &&
(last_file_write_time != last_state_file_time)) {
error("Bad job state save file time. We wrote it at time %u, "
"but the file contains a time stamp of %u.",
(uint32_t) last_file_write_time,
(uint32_t) last_state_file_time);
if (!slurmctld_primary) {
fatal("Two slurmctld daemons are running as primary. "
"Shutting down this daemon to avoid inconsistent "
"state due to split brain.");
}
}
/* write header: version, time */
packstr(JOB_STATE_VERSION, buffer);
pack16(SLURM_PROTOCOL_VERSION, buffer);
pack_time(now, buffer);
/*
* write header: job id
* This is needed so that the job id remains persistent even after
* slurmctld is restarted.
*/
pack32(job_id_sequence, buffer);
debug3("Writing job id %u to header record of job_state file",
job_id_sequence);
/* write individual job records */
lock_slurmctld(job_read_lock);
pack_time(slurmctld_diag_stats.bf_when_last_cycle, buffer);
jobs_start = get_buf_offset(buffer);
list_for_each_ro(job_list, job_mgr_dump_job_state, buffer);
jobs_end = get_buf_offset(buffer);
if ((difftime(now, last_job_state_size_check) > 60) &&
(jobs_count = list_count(job_list))) {
uint64_t ave_job_size = jobs_end - jobs_start;
uint64_t estimated_job_state_size = ave_job_size *
slurm_conf.max_job_cnt;
last_job_state_size_check = time(NULL);
/*
* We assume all jobs were written to buffer, which may not
* be true, but in that case we'd already be flooding the log
* with errors.
*/
estimated_job_state_size /= jobs_count;
estimated_job_state_size += jobs_start;
ave_job_size /= jobs_count;
if (estimated_job_state_size > MAX_BUF_SIZE)
error("Configured MaxJobCount may lead to job_state being larger then maximum buffer size and not saved, based on the average job state size(%.2f KiB) we can save state of %"PRIu64" jobs.",
(float)ave_job_size / 1024,
((uint64_t)(MAX_BUF_SIZE - jobs_start)) /
ave_job_size);
}
unlock_slurmctld(job_read_lock);
reg_file = xstrdup_printf("%s/job_state",
slurm_conf.state_save_location);
if (stat(reg_file, &stat_buf) == 0) {
static time_t last_mtime = (time_t) 0;
int delta_t = difftime(stat_buf.st_mtime, last_mtime);
if (delta_t < -10) {
error("The modification time of %s moved backwards "
"by %d seconds",
reg_file, (0-delta_t));
error("The clock of the file system and this computer "
"appear to not be synchronized");
/* It might be safest to exit here; we likely mounted
* a different file system with the state save files */
}
last_mtime = time(NULL);
}
error_code = save_buf_to_state("job_state", buffer, &high_buffer_size);
if (!error_code)
last_file_write_time = now;
xfree(reg_file);
FREE_NULL_BUFFER(buffer);
END_TIMER2(__func__);
return error_code;
}
static int _find_job_part(void *x, void *arg)
{
job_record_t *job_ptr = x;
if ((job_ptr->part_ptr == arg) && !IS_JOB_FINISHED(job_ptr))
return 1; /* match */
return 0;
}
static int _find_resv_part(void *x, void *key)
{
slurmctld_resv_t *resv_ptr = (slurmctld_resv_t *) x;
if (resv_ptr->part_ptr != (part_record_t *) key)
return 0;
else
return 1; /* match */
}
static int _find_part_assoc(void *x, void *key)
{
part_record_t *part_ptr = (part_record_t *)x;
slurmdb_assoc_rec_t *assoc_ptr = (slurmdb_assoc_rec_t *) key;
slurmdb_assoc_rec_t assoc_rec;
memset(&assoc_rec, 0, sizeof(assoc_rec));
assoc_rec.acct = assoc_ptr->acct;
assoc_rec.partition = part_ptr->name;
assoc_rec.uid = assoc_ptr->uid;
(void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce, NULL, true);
if (assoc_rec.id != assoc_ptr->id) {
info("%s: can't check multiple partitions with partition based associations",
__func__);
return 1;
}
return 0;
}
static int _check_for_part_assocs(list_t *part_ptr_list,
slurmdb_assoc_rec_t *assoc_ptr)
{
if (assoc_ptr && part_ptr_list &&
list_find_first(part_ptr_list, _find_part_assoc, assoc_ptr)) {
return ESLURM_PARTITION_ASSOC;
}
return SLURM_SUCCESS;
}
extern void set_job_failed_assoc_qos_ptr(job_record_t *job_ptr)
{
if (!job_ptr->assoc_ptr && (job_ptr->state_reason == FAIL_ACCOUNT)) {
slurmdb_assoc_rec_t assoc_rec;
memset(&assoc_rec, 0, sizeof(assoc_rec));
/*
* For speed and accuracy we will first see if we once had an
* association record. If not, look for it by
* account, partition, and user_id.
*/
if (job_ptr->assoc_id)
assoc_rec.id = job_ptr->assoc_id;
else {
assoc_rec.acct = job_ptr->account;
if (job_ptr->part_ptr)
assoc_rec.partition = job_ptr->part_ptr->name;
assoc_rec.uid = job_ptr->user_id;
}
if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce,
&job_ptr->assoc_ptr, false) ==
SLURM_SUCCESS) {
job_ptr->assoc_id = assoc_rec.id;
debug("%s: Filling in assoc for %pJ Assoc=%u",
__func__, job_ptr, job_ptr->assoc_id);
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
last_job_update = time(NULL);
}
}
/*
* This shouldn't matter if there is a qos_list as that will get
* handled after this is called.
*/
if (!job_ptr->qos_ptr && (job_ptr->state_reason == FAIL_QOS)) {
int qos_error = SLURM_SUCCESS;
slurmdb_qos_rec_t qos_rec;
memset(&qos_rec, 0, sizeof(qos_rec));
qos_rec.id = job_ptr->qos_id;
job_ptr->qos_ptr = _determine_and_validate_qos(
job_ptr->resv_name, job_ptr->assoc_ptr,
job_ptr->limit_set.qos, &qos_rec,
&qos_error, false, LOG_LEVEL_DEBUG2);
if ((qos_error == SLURM_SUCCESS) && job_ptr->qos_ptr) {
/* job_ptr->qos_id should never start at 0 */
if (job_ptr->qos_id != qos_rec.id) {
error("%s: Changing job_ptr->qos_id from %u to %u; this should never happen",
__func__, job_ptr->qos_id, qos_rec.id);
job_ptr->qos_id = qos_rec.id;
}
debug("%s: Filling in QOS for %pJ QOS=%s(%u)",
__func__, job_ptr, qos_rec.name, job_ptr->qos_id);
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
last_job_update = time(NULL);
}
}
}
extern void set_job_tres_req_str(job_record_t *job_ptr, bool assoc_mgr_locked)
{
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
xassert(job_ptr);
if (!assoc_mgr_locked)
assoc_mgr_lock(&locks);
xfree(job_ptr->tres_req_str);
job_ptr->tres_req_str = assoc_mgr_make_tres_str_from_array(
job_ptr->tres_req_cnt, TRES_STR_FLAG_SIMPLE, true);
xfree(job_ptr->tres_fmt_req_str);
job_ptr->tres_fmt_req_str = assoc_mgr_make_tres_str_from_array(
job_ptr->tres_req_cnt, TRES_STR_CONVERT_UNITS, true);
if (!assoc_mgr_locked)
assoc_mgr_unlock(&locks);
}
/* Note that the backup slurmctld has assumed primary control.
* This function can be called multiple times. */
extern void backup_slurmctld_restart(void)
{
last_file_write_time = (time_t) 0;
}
/* Return the time stamp in the current job state save file, 0 is returned on
* error */
static time_t _get_last_job_state_write_time(void)
{
int error_code = SLURM_SUCCESS;
char *state_file = NULL;
buf_t *buffer;
time_t buf_time = (time_t) 0;
char *ver_str = NULL;
uint16_t protocol_version = NO_VAL16;
/* read the file */
if (!(buffer = state_save_open("job_state", &state_file))) {
info("No job state file (%s) found", state_file);
error_code = ENOENT;
}
xfree(state_file);
if (error_code)
return buf_time;
safe_unpackstr(&ver_str, buffer);
if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
safe_unpack16(&protocol_version, buffer);
safe_unpack_time(&buf_time, buffer);
unpack_error:
xfree(ver_str);
FREE_NULL_BUFFER(buffer);
return buf_time;
}
/*
* load_all_job_state - load the job state from file, recover from last
* checkpoint. Execute this after loading the configuration file data.
* Changes here should be reflected in load_last_job_id().
* RET 0 or error code
*/
extern int load_all_job_state(void)
{
int error_code = SLURM_SUCCESS;
int job_cnt = 0;
char *state_file = NULL;
buf_t *buffer;
time_t buf_time;
uint32_t saved_job_id;
char *ver_str = NULL;
uint16_t protocol_version = NO_VAL16;
/* read the file */
if (!(buffer = state_save_open("job_state", &state_file))) {
if ((clustername_existed == 1) && (!ignore_state_errors))
fatal("No job state file (%s) to recover", state_file);
info("No job state file (%s) to recover", state_file);
xfree(state_file);
return ENOENT;
}
xfree(state_file);
job_id_sequence = MAX(job_id_sequence, slurm_conf.first_job_id);
safe_unpackstr(&ver_str, buffer);
debug3("Version string in job_state header is %s", ver_str);
if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
safe_unpack16(&protocol_version, buffer);
xfree(ver_str);
if (protocol_version == NO_VAL16) {
if (!ignore_state_errors)
fatal("Can not recover job state, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
error("***********************************************");
error("Can not recover job state, incompatible version");
error("***********************************************");
FREE_NULL_BUFFER(buffer);
return EFAULT;
}
safe_unpack_time(&buf_time, buffer);
safe_unpack32(&saved_job_id, buffer);
if (saved_job_id <= slurm_conf.max_job_id)
job_id_sequence = MAX(saved_job_id, job_id_sequence);
debug3("Job id in job_state header is %u", saved_job_id);
safe_unpack_time(&buf_time, buffer); /* bf_when_last_cycle */
if (!slurmctld_diag_stats.bf_when_last_cycle)
slurmctld_diag_stats.bf_when_last_cycle = buf_time;
/*
* Previously we took the tres read lock before this loop. It turned
* out that this created a double lock when steps were being loaded
* during the calls to jobacctinfo_create(), which also takes the read
* lock.
* It ended up being much easier to move the locks for the assoc_mgr
* into the job_mgr_load_job_state function than any other option.
*/
while (remaining_buf(buffer) > 0) {
error_code = job_mgr_load_job_state(buffer, protocol_version);
if (error_code != SLURM_SUCCESS)
goto unpack_error;
job_cnt++;
}
debug3("Set job_id_sequence to %u", job_id_sequence);
FREE_NULL_BUFFER(buffer);
info("Recovered information about %d jobs", job_cnt);
return error_code;
unpack_error:
if (!ignore_state_errors)
fatal("Incomplete job state save file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
error("Incomplete job state save file");
info("Recovered information about %d jobs", job_cnt);
FREE_NULL_BUFFER(buffer);
return SLURM_ERROR;
}
/*
* load_last_job_id - load only the last job ID from state save file.
* Changes here should be reflected in load_all_job_state().
* RET 0 or error code
*/
extern int load_last_job_id( void )
{
char *state_file = NULL;
buf_t *buffer;
time_t buf_time;
char *ver_str = NULL;
uint16_t protocol_version = NO_VAL16;
/* read the file */
if (!(buffer = state_save_open("job_state", &state_file))) {
debug("No job state file (%s) to recover", state_file);
xfree(state_file);
return ENOENT;
}
xfree(state_file);
safe_unpackstr(&ver_str, buffer);
debug3("Version string in job_state header is %s", ver_str);
if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION))
safe_unpack16(&protocol_version, buffer);
xfree(ver_str);
if (protocol_version == NO_VAL16) {
if (!ignore_state_errors)
fatal("Can not recover last job ID, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
debug("*************************************************");
debug("Can not recover last job ID, incompatible version");
debug("*************************************************");
FREE_NULL_BUFFER(buffer);
return EFAULT;
}
safe_unpack_time(&buf_time, buffer);
safe_unpack32( &job_id_sequence, buffer);
debug3("Job ID in job_state header is %u", job_id_sequence);
/* Ignore the state for individual jobs stored here */
xfree(ver_str);
FREE_NULL_BUFFER(buffer);
return SLURM_SUCCESS;
unpack_error:
if (!ignore_state_errors)
fatal("Invalid job data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
error("Invalid job data checkpoint file");
xfree(ver_str);
FREE_NULL_BUFFER(buffer);
return SLURM_ERROR;
}
extern int job_mgr_dump_job_state(void *object, void *arg)
{
job_record_t *dump_job_ptr = object;
buf_t *buffer = arg;
xassert(dump_job_ptr->magic == JOB_MAGIC);
/* Don't pack "unlinked" job. */
if (dump_job_ptr->job_id == NO_VAL)
return 0;
if (dump_job_ptr->array_recs)
build_array_str(dump_job_ptr);
_update_job_nodes_str(dump_job_ptr);
job_record_pack(dump_job_ptr, slurmctld_tres_cnt, buffer,
SLURM_PROTOCOL_VERSION);
return 0;
}
extern int job_mgr_load_job_state(buf_t *buffer,
uint16_t protocol_version)
{
char *err_part = NULL;
time_t now = time(NULL);
job_record_t *job_ptr = NULL;
int rc;
slurmdb_assoc_rec_t assoc_rec;
bool job_finished = false;
assoc_mgr_lock_t locks = {
.assoc = WRITE_LOCK,
.qos = WRITE_LOCK,
.tres = READ_LOCK,
.user = READ_LOCK,
};
if (job_record_unpack(&job_ptr, slurmctld_tres_cnt, buffer,
protocol_version)) {
error("failed to load job from state");
goto unpack_error;
}
if (find_job_record(job_ptr->job_id)) {
error("duplicate job state record found for %pJ", job_ptr);
goto unpack_error;
} else if (_add_job_record(job_ptr, 1)) {
rc = SLURM_SUCCESS;
job_record_delete(job_ptr);
job_ptr = NULL;
goto free_it;
}
/* "Don't load "unlinked" job. */
if (job_ptr->job_id == NO_VAL) {
debug("skipping unlinked job");
rc = SLURM_SUCCESS;
goto free_it;
}
if ((job_ptr->job_state & JOB_STATE_BASE) >= JOB_END) {
error("Invalid data for JobId=%u: job_state=%u",
job_ptr->job_id, job_ptr->job_state);
goto unpack_error;
}
if (job_ptr->kill_on_node_fail > 1) {
error("Invalid data for JobId=%u: kill_on_node_fail=%u",
job_ptr->job_id, job_ptr->kill_on_node_fail);
goto unpack_error;
}
if ((job_ptr->priority > 1) && (job_ptr->direct_set_prio == 0)) {
highest_prio = MAX(highest_prio, job_ptr->priority);
lowest_prio = MIN(lowest_prio, job_ptr->priority);
}
get_part_list(job_ptr->partition, &job_ptr->part_ptr_list,
&job_ptr->part_ptr, &err_part);
if (job_ptr->part_ptr == NULL) {
verbose("Invalid partition (%s) for JobId=%u",
err_part, job_ptr->job_id);
xfree(err_part);
/* Not a fatal error: the partition could have
* been removed; reset_job_bitmaps() will clean
* up this job */
}
#if 0
/*
* This is not necessary since the job_id_sequence is checkpointed and
* the jobid will be checked if it's in use in get_next_job_id().
*/
/* Base job_id_sequence off of local job id but only if the job
* originated from this cluster -- so that the local job id of a
* different cluster isn't restored here. */
if (!job_fed_details ||
!xstrcmp(job_fed_details->origin_str, slurm_conf.cluster_name))
local_job_id = fed_mgr_get_local_id(job_id);
if (job_id_sequence <= local_job_id)
job_id_sequence = local_job_id + 1;
#endif
if (job_ptr->array_recs && (job_ptr->array_recs->task_cnt > 1))
job_count += (job_ptr->array_recs->task_cnt - 1);
xstrtolower(job_ptr->account);
job_state_set(job_ptr, job_ptr->job_state);
job_ptr->time_last_active = now;
if (IS_JOB_PENDING(job_ptr))
job_ptr->node_cnt_wag = job_ptr->total_nodes;
/*
* This always needs to be initialized to "true". The select
* plugin will deal with it every time it goes through the
* logic if req_switch or wait4switch are set.
*/
job_ptr->best_switch = true;
/* If start_protocol_ver is too old, reset to current version. */
if (job_ptr->start_protocol_ver < SLURM_MIN_PROTOCOL_VERSION)
job_ptr->start_protocol_ver = SLURM_PROTOCOL_VERSION;
/* Handle this after user_id and other identity has been filled in */
if (!job_ptr->mail_user) {
job_ptr->mail_user = _get_mail_user(NULL, job_ptr);
}
_add_job_hash(job_ptr);
_add_job_array_hash(job_ptr);
memset(&assoc_rec, 0, sizeof(assoc_rec));
/*
* For speed and accuracy we will first see if we once had an
* association record. If not, look for it by
* account, partition, and user_id.
*/
if (job_ptr->assoc_id)
assoc_rec.id = job_ptr->assoc_id;
else {
assoc_rec.acct = job_ptr->account;
if (job_ptr->part_ptr)
assoc_rec.partition = job_ptr->part_ptr->name;
assoc_rec.uid = job_ptr->user_id;
}
assoc_mgr_lock(&locks);
if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce,
&job_ptr->assoc_ptr, true) &&
(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) {
_job_fail_account(job_ptr, __func__, true);
} else {
job_ptr->assoc_id = assoc_rec.id;
info("Recovered %pJ Assoc=%u", job_ptr, job_ptr->assoc_id);
if (job_ptr->state_reason == FAIL_ACCOUNT) {
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
}
/* make sure we have started this job in accounting */
if (!IS_JOB_IN_DB(job_ptr)) {
debug("starting %pJ in accounting", job_ptr);
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
if (slurmctld_init_db
&& IS_JOB_SUSPENDED(job_ptr)) {
jobacct_storage_g_job_suspend(acct_db_conn,
job_ptr);
}
}
/* make sure we have this job completed in the database */
if (IS_JOB_FINISHED(job_ptr)) {
if (slurmctld_init_db &&
!(job_ptr->bit_flags & TRES_STR_CALC) &&
job_ptr->tres_alloc_cnt &&
(job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
assoc_mgr_set_job_tres_alloc_str(job_ptr,
false);
jobacct_storage_g_job_complete(
acct_db_conn, job_ptr);
job_finished = true;
}
}
if (!job_finished && (job_ptr->qos_id || job_ptr->details->qos_req) &&
(job_ptr->state_reason != FAIL_ACCOUNT)) {
int qos_error = _get_qos_info(job_ptr->details->qos_req,
job_ptr->qos_id,
&job_ptr->qos_list,
&job_ptr->qos_ptr,
job_ptr->resv_name,
job_ptr->assoc_ptr,
job_ptr->limit_set.qos,
true, LOG_LEVEL_ERROR);
if ((qos_error != SLURM_SUCCESS) &&
!job_ptr->limit_set.qos) {
job_fail_qos(job_ptr, __func__, true);
} else if (job_ptr->qos_ptr) {
job_ptr->qos_id = job_ptr->qos_ptr->id;
if (job_ptr->state_reason == FAIL_QOS) {
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
}
}
}
/*
* do this after the format string just in case for some
* reason the tres_alloc_str is NULL but not the fmt_str
*/
if (job_ptr->tres_alloc_str)
assoc_mgr_set_tres_cnt_array(
&job_ptr->tres_alloc_cnt, job_ptr->tres_alloc_str,
0, true, false, NULL);
else
job_set_alloc_tres(job_ptr, true);
if (job_ptr->tres_req_str)
assoc_mgr_set_tres_cnt_array(
&job_ptr->tres_req_cnt, job_ptr->tres_req_str, 0, true,
false, NULL);
else
job_set_req_tres(job_ptr, true);
assoc_mgr_unlock(&locks);
build_node_details(job_ptr, false); /* set node_addr */
gres_stepmgr_job_build_details(
job_ptr->gres_list_alloc, job_ptr->nodes,
&job_ptr->gres_detail_cnt,
&job_ptr->gres_detail_str,
&job_ptr->gres_used);
on_job_state_change(job_ptr, job_ptr->job_state);
last_job_update = now;
return SLURM_SUCCESS;
unpack_error:
error("Incomplete job record");
rc = SLURM_ERROR;
free_it:
if (job_ptr) {
if (job_ptr->job_id == 0)
job_ptr->job_id = NO_VAL;
purge_job_record(job_ptr->job_id);
}
return rc;
}
/* _add_job_hash - add a job hash entry for given job record, job_id must
* already be set
* IN job_ptr - pointer to job record
* Globals: hash table updated
*/
static void _add_job_hash(job_record_t *job_ptr)
{
int inx;
inx = JOB_HASH_INX(job_ptr->job_id);
job_ptr->job_next = job_hash[inx];
job_hash[inx] = job_ptr;
}
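/*
 * Illustrative lookup sketch: an assumption of how a reader of job_hash
 * would walk a bucket's collision chain. find_job_record() (declared in
 * slurmctld.h) serves this purpose in the real code; this is not its
 * actual implementation.
 */
#if 0
static job_record_t *_example_find_job(uint32_t job_id)
{
/* Hash to a bucket, then follow the job_next collision chain */
job_record_t *job_ptr = job_hash[JOB_HASH_INX(job_id)];
while (job_ptr && (job_ptr->job_id != job_id))
job_ptr = job_ptr->job_next;
return job_ptr; /* NULL if not found */
}
#endif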
/* _remove_job_hash - remove a job hash entry for given job record, job_id must
* already be set
* IN job_entry - pointer to job record
* IN type - which hash to work with
* Globals: hash table updated
*/
static void _remove_job_hash(job_record_t *job_entry, job_hash_type_t type)
{
job_record_t *job_ptr, **job_pptr;
xassert(job_entry);
on_job_state_change(job_entry, NO_VAL);
switch (type) {
case JOB_HASH_JOB:
job_pptr = &job_hash[JOB_HASH_INX(job_entry->job_id)];
break;
case JOB_HASH_ARRAY_JOB:
job_pptr = &job_array_hash_j[
JOB_HASH_INX(job_entry->array_job_id)];
break;
case JOB_HASH_ARRAY_TASK:
job_pptr = &job_array_hash_t[
JOB_ARRAY_HASH_INX(job_entry->array_job_id,
job_entry->array_task_id)];
break;
default:
fatal("%s: unknown job_hash_type_t %d", __func__, type);
return;
}
while ((job_pptr != NULL) && (*job_pptr != NULL) &&
((job_ptr = *job_pptr) != job_entry)) {
xassert(job_ptr->magic == JOB_MAGIC);
switch (type) {
case JOB_HASH_JOB:
job_pptr = &job_ptr->job_next;
break;
case JOB_HASH_ARRAY_JOB:
job_pptr = &job_ptr->job_array_next_j;
break;
case JOB_HASH_ARRAY_TASK:
job_pptr = &job_ptr->job_array_next_t;
break;
}
}
if (job_pptr == NULL || *job_pptr == NULL) {
if (job_entry->job_id == NO_VAL)
return;
switch (type) {
case JOB_HASH_JOB:
error("%s: Could not find hash entry for JobId=%u",
__func__, job_entry->job_id);
break;
case JOB_HASH_ARRAY_JOB:
error("%s: job array hash error %u", __func__,
job_entry->array_job_id);
break;
case JOB_HASH_ARRAY_TASK:
error("%s: job array, task ID hash error %u_%u",
__func__,
job_entry->array_job_id,
job_entry->array_task_id);
break;
}
return;
}
switch (type) {
case JOB_HASH_JOB:
*job_pptr = job_entry->job_next;
job_entry->job_next = NULL;
break;
case JOB_HASH_ARRAY_JOB:
*job_pptr = job_entry->job_array_next_j;
job_entry->job_array_next_j = NULL;
break;
case JOB_HASH_ARRAY_TASK:
*job_pptr = job_entry->job_array_next_t;
job_entry->job_array_next_t = NULL;
break;
}
}
/* _add_job_array_hash - add a job hash entry for given job record,
* array_job_id and array_task_id must already be set
* IN job_ptr - pointer to job record
* Globals: hash table updated
*/
static void _add_job_array_hash(job_record_t *job_ptr)
{
int inx;
if (job_ptr->array_task_id == NO_VAL)
return; /* Not a job array */
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
inx = JOB_HASH_INX(job_ptr->array_job_id);
job_ptr->job_array_next_j = job_array_hash_j[inx];
job_array_hash_j[inx] = job_ptr;
inx = JOB_ARRAY_HASH_INX(job_ptr->array_job_id,job_ptr->array_task_id);
job_ptr->job_array_next_t = job_array_hash_t[inx];
job_array_hash_t[inx] = job_ptr;
}
/* For the job array data structure, build the string representation of the
* bitmap.
* NOTE: bit_fmt_hexmask() is far more scalable than bit_fmt(). */
extern void build_array_str(job_record_t *job_ptr)
{
job_array_struct_t *array_recs = job_ptr->array_recs;
if (!array_recs || array_recs->task_id_str ||
!array_recs->task_id_bitmap ||
(job_ptr->array_task_id != NO_VAL) ||
(bit_ffs(job_ptr->array_recs->task_id_bitmap) == -1))
return;
array_recs->task_id_str = bit_fmt_hexmask(array_recs->task_id_bitmap);
/* Update the job in the database. */
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
}
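/*
 * Illustrative example (assumed values): a task_id_bitmap with tasks 0-3
 * set yields task_id_str "0xF" from bit_fmt_hexmask(), where bit_fmt()
 * would have produced the range string "0-3".
 */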
/* Return true if ALL tasks of the specified array job ID are complete */
extern bool test_job_array_complete(uint32_t array_job_id)
{
job_record_t *job_ptr;
int inx;
job_ptr = find_job_record(array_job_id);
if (job_ptr) {
if (!IS_JOB_COMPLETE(job_ptr))
return false;
if (job_ptr->array_recs && job_ptr->array_recs->max_exit_code)
return false;
}
/* Need to test individual job array records */
inx = JOB_HASH_INX(array_job_id);
job_ptr = job_array_hash_j[inx];
while (job_ptr) {
if (job_ptr->array_job_id == array_job_id) {
if (!IS_JOB_COMPLETE(job_ptr))
return false;
}
job_ptr = job_ptr->job_array_next_j;
}
return true;
}
/* Return true if ALL tasks of the specified array job ID are completed */
extern bool test_job_array_completed(uint32_t array_job_id)
{
job_record_t *job_ptr;
int inx;
job_ptr = find_job_record(array_job_id);
if (job_ptr) {
if (!IS_JOB_COMPLETED(job_ptr))
return false;
}
/* Need to test individual job array records */
inx = JOB_HASH_INX(array_job_id);
job_ptr = job_array_hash_j[inx];
while (job_ptr) {
if (job_ptr->array_job_id == array_job_id) {
if (!IS_JOB_COMPLETED(job_ptr))
return false;
}
job_ptr = job_ptr->job_array_next_j;
}
return true;
}
/*
* Return true if ALL tasks of the specified array job ID are completed AND
* all except for the head job have been purged.
*/
static bool _test_job_array_purged(uint32_t array_job_id)
{
job_record_t *job_ptr, *head_job_ptr;
int inx;
head_job_ptr = find_job_record(array_job_id);
if (head_job_ptr) {
if (!IS_JOB_COMPLETED(head_job_ptr))
return false;
}
/* Need to test individual job array records */
inx = JOB_HASH_INX(array_job_id);
job_ptr = job_array_hash_j[inx];
while (job_ptr) {
if ((job_ptr->array_job_id == array_job_id) &&
(job_ptr != head_job_ptr)) {
return false;
}
job_ptr = job_ptr->job_array_next_j;
}
return true;
}
/* Return true if ALL tasks of the specified array job ID are finished */
extern bool test_job_array_finished(uint32_t array_job_id)
{
job_record_t *job_ptr;
int inx;
job_ptr = find_job_record(array_job_id);
if (job_ptr) {
if (!IS_JOB_FINISHED(job_ptr))
return false;
}
/* Need to test individual job array records */
inx = JOB_HASH_INX(array_job_id);
job_ptr = job_array_hash_j[inx];
while (job_ptr) {
if (job_ptr->array_job_id == array_job_id) {
if (!IS_JOB_FINISHED(job_ptr))
return false;
}
job_ptr = job_ptr->job_array_next_j;
}
return true;
}
/* Return true if ANY tasks of the specified array job ID are pending */
extern bool test_job_array_pending(uint32_t array_job_id)
{
job_record_t *job_ptr;
int inx;
job_ptr = find_job_record(array_job_id);
if (job_ptr) {
if (IS_JOB_PENDING(job_ptr) || IS_JOB_CONFIGURING(job_ptr))
return true;
if (job_ptr->array_recs && job_ptr->array_recs->task_cnt)
return true;
}
/* Need to test individual job array records */
inx = JOB_HASH_INX(array_job_id);
job_ptr = job_array_hash_j[inx];
while (job_ptr) {
if (job_ptr->array_job_id == array_job_id) {
if (IS_JOB_PENDING(job_ptr) ||
IS_JOB_CONFIGURING(job_ptr))
return true;
}
job_ptr = job_ptr->job_array_next_j;
}
return false;
}
/* For a given job ID return the number of PENDING tasks which have their
* own separate job_record (do not count tasks in pending META job record) */
extern int num_pending_job_array_tasks(uint32_t array_job_id)
{
job_record_t *job_ptr;
int count = 0, inx;
inx = JOB_HASH_INX(array_job_id);
job_ptr = job_array_hash_j[inx];
while (job_ptr) {
if ((job_ptr->array_job_id == array_job_id) &&
IS_JOB_PENDING(job_ptr))
count++;
job_ptr = job_ptr->job_array_next_j;
}
return count;
}
static void _foreach_by_job_callback(job_record_t *job_ptr,
for_each_by_job_id_args_t *args)
{
xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS);
if (!job_ptr || !job_ptr->job_id)
return;
xassert(!!args->ro_callback != !!args->callback); /* xor */
xassert(args->control == FOR_EACH_JOB_BY_ID_EACH_CONT);
if (args->ro_callback)
args->control = args->ro_callback(job_ptr, args->filter,
args->callback_arg);
else
args->control = args->callback(job_ptr, args->filter,
args->callback_arg);
xassert(args->control > FOR_EACH_JOB_BY_ID_EACH_INVALID);
xassert(args->control < FOR_EACH_JOB_BY_ID_EACH_INVALID_MAX);
}
static int _foreach_job_by_id_single(void *x, void *arg)
{
job_record_t *job_ptr = x;
for_each_by_job_id_args_t *args = arg;
xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS);
_foreach_by_job_callback(job_ptr, args);
switch (args->control)
{
case FOR_EACH_JOB_BY_ID_EACH_CONT:
return SLURM_SUCCESS;
case FOR_EACH_JOB_BY_ID_EACH_STOP:
case FOR_EACH_JOB_BY_ID_EACH_FAIL:
/* must return error as only way to stop list foreach */
return SLURM_ERROR;
case FOR_EACH_JOB_BY_ID_EACH_INVALID_MAX:
case FOR_EACH_JOB_BY_ID_EACH_INVALID:
fatal_abort("should never happen");
}
return SLURM_SUCCESS;
}
static int _foreach_by_het_job(void *x, void *arg)
{
job_record_t *job_ptr = x;
for_each_by_job_id_args_t *args = arg;
xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS);
/* Filter to only this HetJob */
if (job_ptr->het_job_id != args->job_ptr->het_job_id)
return SLURM_SUCCESS;
if ((args->filter->het_job_offset != NO_VAL) &&
(job_ptr->het_job_offset != args->filter->het_job_offset))
return SLURM_SUCCESS;
return _foreach_job_by_id_single(job_ptr, args);
}
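/*
* Return the first record in the job array hash chain whose array_job_id
* matches, or NULL if none is found.
*/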
static job_record_t *_find_first_job_array_rec(uint32_t array_job_id)
{
job_record_t *job_ptr;
int inx;
inx = JOB_HASH_INX(array_job_id);
job_ptr = job_array_hash_j[inx];
while (job_ptr) {
if (job_ptr->array_job_id == array_job_id)
return job_ptr;
job_ptr = job_ptr->job_array_next_j;
}
return NULL;
}
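/*
* Invoke the callback for every record belonging to a job array: first any
* task records linked in job_array_hash_j, then the META record still
* holding the unsplit pending tasks (unless the hash walk already covered
* it), and finally, if no linked record was reported, tasks resolved
* through cleared bits in the META record's task_id_bitmap. Iteration
* stops early on a stop or fail result from the callback.
*/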
static void _foreach_job_by_id_array(for_each_by_job_id_args_t *args)
{
job_record_t *meta, *start;
bool dumped_meta = false, dumped_linked = false;
const uint32_t array_job_id = args->job_ptr->array_job_id;
xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS);
start = _find_first_job_array_rec(array_job_id);
for (job_record_t *j = start; j; j = j->job_array_next_j) {
if (j->array_job_id != array_job_id)
continue;
if (j->array_recs)
dumped_meta = true;
if ((args->filter->array_task_id != NO_VAL) &&
(j->array_task_id != args->filter->array_task_id))
continue;
debug3("%pJ->array_recs=%"PRIxPTR" linked to %pJ->array_recs=%"PRIxPTR,
start, (uintptr_t) (start ? start->array_recs : NULL), j,
(uintptr_t) j->array_recs);
_foreach_by_job_callback(j, args);
if (args->control != FOR_EACH_JOB_BY_ID_EACH_CONT)
return;
dumped_linked = true;
}
if (dumped_meta)
return;
meta = find_job_record(args->job_ptr->array_job_id);
if (!meta)
return;
if (!meta->array_recs) {
debug3("%pJ->array_recs = NULL", meta);
return;
} else if (!meta->array_recs->task_id_bitmap) {
debug3("%pJ->array_recs->task_id_bitmap = NULL", meta);
return;
}
xassert(meta->array_task_id == NO_VAL);
xassert(meta->array_job_id == meta->job_id);
_foreach_by_job_callback(meta, args);
if (args->control != FOR_EACH_JOB_BY_ID_EACH_CONT)
return;
if (dumped_linked)
return;
for (int i = 0; i < bit_size(meta->array_recs->task_id_bitmap); i++) {
if (!bit_test(meta->array_recs->task_id_bitmap, i)) {
job_record_t *job_ptr =
find_job_array_rec(meta->array_job_id, i);
if (!job_ptr)
continue;
if ((args->filter->array_task_id != NO_VAL) &&
(job_ptr->array_task_id !=
args->filter->array_task_id))
continue;
debug3("%pJ resolving bit:%d=%c to %pJ",
meta, i,
(bit_test(meta->array_recs->task_id_bitmap, i) ?
'1' : '0'), job_ptr);
_foreach_by_job_callback(job_ptr, args);
if (args->control != FOR_EACH_JOB_BY_ID_EACH_CONT)
return;
}
}
}
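/*
* Resolve each task ID set in filter->array_bitmap: append split-out task
* records to match_job_list, record nonexistent tasks in not_found_tasks
* (when provided), and append the pending META record last so it is
* visited only once.
*/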
static void _find_array_expression_jobs(const slurm_selected_step_t *filter,
for_each_by_job_id_args_t *args,
list_t *match_job_list,
slurm_selected_step_t *not_found_tasks)
{
int32_t i_first, i_last;
uint32_t job_id = filter->step_id.job_id;
bitstr_t *array_bitmap = filter->array_bitmap;
job_record_t *job_ptr;
job_record_t *meta_job = NULL;
i_first = bit_ffs(array_bitmap);
if (i_first >= 0)
i_last = bit_fls(array_bitmap);
else
i_last = -2;
for (int i = i_first; i <= i_last; i++) {
if (!bit_test(array_bitmap, i))
continue;
job_ptr = find_job_array_rec(job_id, i);
/* If !job_ptr, the array task does not exist. */
if (!job_ptr && !not_found_tasks)
continue;
if (!job_ptr && not_found_tasks) {
bit_set(not_found_tasks->array_bitmap, i);
continue;
}
if (IS_JOB_PENDING(job_ptr) && job_ptr->array_recs) {
/* Found the meta job, or a task in the meta job */
meta_job = job_ptr;
continue;
}
/*
* Found an array task that has been split from the meta record,
* or the meta record is not pending and all tasks have already
* been split out.
*/
list_append(match_job_list, job_ptr);
}
if (meta_job)
list_append(match_job_list, meta_job);
}
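/*
* Handle a filter expressed as a job array bitmap: collect the matching
* records, invoke the no-match callback once for any tasks that do not
* exist, then invoke the match callback for each record found.
*/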
static void _foreach_array_bitmap(const slurm_selected_step_t *filter,
for_each_by_job_id_args_t *args)
{
list_t *match_job_list = list_create(NULL); /* list of job_record_t */
slurm_selected_step_t *not_found_tasks = NULL;
foreach_job_by_id_control_t tmp_control =
FOR_EACH_JOB_BY_ID_EACH_INVALID;
/*
* Call the callback once per record that has been split off.
* Then call it once for the meta record.
*/
if (args->null_callback) {
not_found_tasks = xmalloc(sizeof(*not_found_tasks));
memcpy(not_found_tasks, filter, sizeof(*not_found_tasks));
not_found_tasks->array_bitmap =
bit_alloc(bit_size(filter->array_bitmap));
}
_find_array_expression_jobs(filter, args, match_job_list,
not_found_tasks);
/*
* Because this is a single filter, call both callbacks (no-match and
* match). Then, set args->control to the max of each callback return
* value.
*/
if (not_found_tasks) {
if (bit_ffs(not_found_tasks->array_bitmap) != -1)
tmp_control = args->null_callback(not_found_tasks,
args->callback_arg);
FREE_NULL_BITMAP(not_found_tasks->array_bitmap);
xfree(not_found_tasks);
}
if (list_count(match_job_list))
(void) list_for_each(match_job_list, _foreach_job_by_id_single,
args);
FREE_NULL_LIST(match_job_list);
if (tmp_control != FOR_EACH_JOB_BY_ID_EACH_INVALID)
args->control = MAX(args->control, tmp_control);
}
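/*
* Dispatch a job filter to the proper walker: a job_id of 0 matches
* nothing, NO_VAL walks every job, an array expression bitmap goes through
* _foreach_array_bitmap(), and a single id resolves to an array task,
* hetjob component, or plain job record. Returns the number of callback
* invocations, negated if any callback reported failure.
*/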
static int _walk_jobs_by_selected_step(const slurm_selected_step_t *filter,
for_each_by_job_id_args_t *args)
{
xassert(args->magic == MAGIC_FOREACH_BY_JOBID_ARGS);
if (!filter->step_id.job_id) {
/* 0 is never a valid job so just return now */
goto done;
} else if (filter->step_id.job_id == NO_VAL) {
/* walk all jobs */
(void) list_for_each_ro(job_list, _foreach_job_by_id_single,
args);
goto done;
}
xassert(!((filter->array_task_id != NO_VAL) &&
(filter->het_job_offset != NO_VAL)));
if (filter->array_bitmap) {
_foreach_array_bitmap(filter, args);
goto done;
}
if (filter->array_task_id != NO_VAL)
args->job_ptr = find_job_array_rec(filter->step_id.job_id,
filter->array_task_id);
else if (filter->het_job_offset != NO_VAL)
args->job_ptr = find_job_record(filter->step_id.job_id +
filter->het_job_offset);
else /* not array task or het component */
args->job_ptr = find_job_record(filter->step_id.job_id);
if (!args->job_ptr) {
if (!args->null_callback) {
args->control = FOR_EACH_JOB_BY_ID_EACH_CONT;
} else {
args->control = args->null_callback(filter,
args->callback_arg);
}
goto done;
}
if (args->job_ptr->het_job_list) {
xassert(args->job_ptr->het_job_id > 0);
(void) list_for_each(args->job_ptr->het_job_list,
_foreach_by_het_job, args);
} else if (args->job_ptr->array_job_id != args->job_ptr->job_id) {
/* Regular (not array/het) job, or an array task already split out */
_foreach_by_job_callback(args->job_ptr, args);
} else {
/* array job */
_foreach_job_by_id_array(args);
}
done:
switch (args->control) {
case FOR_EACH_JOB_BY_ID_EACH_STOP:
case FOR_EACH_JOB_BY_ID_EACH_CONT:
return args->count;
case FOR_EACH_JOB_BY_ID_EACH_FAIL:
return args->count * -1;
case FOR_EACH_JOB_BY_ID_EACH_INVALID_MAX:
case FOR_EACH_JOB_BY_ID_EACH_INVALID:
fatal_abort("should never happen");
}
fatal_abort("should never happen");
}
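/*
* Minimal usage sketch for the walker below (the _count_pending callback
* and its counter are hypothetical, shown only to illustrate the expected
* signatures; the caller must hold the job write lock, or use
* foreach_job_by_id_ro() under a read lock):
*
*	static foreach_job_by_id_control_t _count_pending(
*		job_record_t *job_ptr,
*		const slurm_selected_step_t *filter, void *arg)
*	{
*		if (IS_JOB_PENDING(job_ptr))
*			(*(int *) arg)++;
*		return FOR_EACH_JOB_BY_ID_EACH_CONT;
*	}
*
*	int pending = 0;
*	slurm_selected_step_t filter = {
*		.step_id = { .job_id = NO_VAL, .step_id = NO_VAL },
*		.array_task_id = NO_VAL,
*		.het_job_offset = NO_VAL,
*	};
*	(void) foreach_job_by_id(&filter, _count_pending, NULL, &pending);
*/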
extern int foreach_job_by_id(const slurm_selected_step_t *filter,
JobForEachFunc callback,
JobNullForEachFunc null_callback, void *arg)
{
for_each_by_job_id_args_t args = {
.magic = MAGIC_FOREACH_BY_JOBID_ARGS,
.control = FOR_EACH_JOB_BY_ID_EACH_CONT,
.count = 0,
.callback = callback,
.callback_arg = arg,
.null_callback = null_callback,
.filter = filter,
};
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
return _walk_jobs_by_selected_step(filter, &args);
}
extern int foreach_job_by_id_ro(const slurm_selected_step_t *filter,
JobROForEachFunc callback,
JobNullForEachFunc null_callback, void *arg)
{
for_each_by_job_id_args_t args = {
.magic = MAGIC_FOREACH_BY_JOBID_ARGS,
.control = FOR_EACH_JOB_BY_ID_EACH_CONT,
.count = 0,
.ro_callback = callback,
.callback_arg = arg,
.null_callback = null_callback,
.filter = filter,
};
xassert(verify_lock(JOB_LOCK, READ_LOCK));
return _walk_jobs_by_selected_step(filter, &args);
}
/*
* find_job_array_rec - return a pointer to the job record with the given
* array_job_id/array_task_id
* IN array_job_id - requested job array's id
* IN array_task_id - requested job's task id,
* NO_VAL if none specified (i.e. not a job array)
* INFINITE to return any task for the specified job id,
* preferring one that is not yet finished
* RET pointer to the job's record, NULL on error
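* Example: find_job_array_rec(123, 4) returns the record for task 123_4,
* whether it was split into its own record or is still represented by the
* META record's task_id_bitmap; find_job_array_rec(123, NO_VAL) is
* equivalent to find_job_record(123).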
*/
extern job_record_t *find_job_array_rec(uint32_t array_job_id,
uint32_t array_task_id)
{
job_record_t *job_ptr, *match_job_ptr = NULL;
int inx;
if (array_task_id == NO_VAL)
return find_job_record(array_job_id);
if (array_task_id == INFINITE) { /* find by job ID */
/* Look for job record with all of the pending tasks */
job_ptr = find_job_record(array_job_id);
if (job_ptr && job_ptr->array_recs &&
(job_ptr->array_job_id == array_job_id))
return job_ptr;
inx = JOB_HASH_INX(array_job_id);
job_ptr = job_array_hash_j[inx];
while (job_ptr) {
if (job_ptr->array_job_id == array_job_id) {
match_job_ptr = job_ptr;
if (!IS_JOB_FINISHED(job_ptr)) {
return job_ptr;
}
}
job_ptr = job_ptr->job_array_next_j;
}
return match_job_ptr;
} else { /* Find specific task ID */
inx = JOB_ARRAY_HASH_INX(array_job_id, array_task_id);
job_ptr = job_array_hash_t[inx];
while (job_ptr) {
if ((job_ptr->array_job_id == array_job_id) &&
(job_ptr->array_task_id == array_task_id)) {
return job_ptr;
}
job_ptr = job_ptr->job_array_next_t;
}
/* Look for job record with all of the pending tasks */
job_ptr = find_job_record(array_job_id);
if (job_ptr && job_ptr->array_recs &&
job_ptr->array_recs->task_id_bitmap) {
inx = bit_size(job_ptr->array_recs->task_id_bitmap);
if ((array_task_id < inx) &&
bit_test(job_ptr->array_recs->task_id_bitmap,
array_task_id)) {
return job_ptr;
}
}
return NULL; /* None found */
}
}
static int _find_het_job(void *x, void *arg)
{
job_record_t *het_job = x;
job_record_t *search_job_ptr = arg;
if (search_job_ptr->het_job_id != het_job->het_job_id) {
error("%s: Bad het_job_list for %u", __func__,
search_job_ptr->job_id);
return 0;
}
if (het_job->het_job_offset == search_job_ptr->het_job_offset)
return 1;
return 0;
}
/*
* find_het_job_record - return a pointer to the job record with the given ID
* IN job_id - requested job's ID
* IN het_job_offset - hetjob component offset
* RET pointer to the job's record, NULL on error
*/
extern job_record_t *find_het_job_record(uint32_t job_id,
uint32_t het_job_offset)
{
job_record_t *het_job_leader = find_job_record(job_id);
job_record_t search_job_rec = { 0 };
if (!het_job_leader)
return NULL;
if (het_job_leader->het_job_offset == het_job_offset)
return het_job_leader;
if (!het_job_leader->het_job_list)
return NULL;
search_job_rec.job_id = het_job_leader->job_id;
search_job_rec.het_job_id = het_job_leader->het_job_id;
search_job_rec.het_job_offset = het_job_offset;
return list_find_first(het_job_leader->het_job_list, _find_het_job,
&search_job_rec);
}
/*
* find_job_record - return a pointer to the job record with the given job_id
* IN job_id - requested job's id
* RET pointer to the job's record, NULL on error
*/
extern job_record_t *find_job_record(uint32_t job_id)
{
job_record_t *job_ptr;
xassert(verify_lock(JOB_LOCK, READ_LOCK));
job_ptr = job_hash[JOB_HASH_INX(job_id)];
while (job_ptr) {
if (job_ptr->job_id == job_id)
return job_ptr;
job_ptr = job_ptr->job_next;
}
return NULL;
}
/*
* Set a requeued job back to PENDING, retaining the COMPLETING flag while
* any nodes are still allocated or the EpilogSlurmctld is still running
*/
static void _set_requeued_job_pending_completing(job_record_t *job_ptr)
{
/* Do this after the epilog completes; setting it here is too early */
//job_record_set_sluid(job_ptr);
//job_ptr->details->submit_time = now;
if (job_ptr->node_cnt || job_ptr->epilog_running)
job_state_set(job_ptr, (JOB_PENDING | JOB_COMPLETING));
else
job_state_set(job_ptr, JOB_PENDING);
}
/*
* Kill job or job step
*
* IN job_step_kill_msg - msg with specs on which job/step to cancel.
* IN job_ptr - pointer to job_record_t to cancel.
* IN uid - uid of user requesting job/step cancel.
*/
static int _kill_job_step(job_step_kill_msg_t *job_step_kill_msg,
job_record_t *job_ptr, uint32_t uid)
{
DEF_TIMERS;
int error_code = SLURM_SUCCESS;
xassert(job_ptr);
xassert(job_ptr->job_id == job_step_kill_msg->step_id.job_id);
START_TIMER;
log_flag(TRACE_JOBS, "%s: enter %pJ", __func__, job_ptr);
/* do RPC call */
if (job_step_kill_msg->step_id.step_id == NO_VAL) {
/* NO_VAL means the whole job, not individual steps */
error_code = job_signal(job_ptr,
job_step_kill_msg->signal,
job_step_kill_msg->flags, uid,
false);
END_TIMER2(__func__);
/* return result */
if (error_code) {
log_flag(STEPS, "Signal %u %pJ by UID=%u: %s",
job_step_kill_msg->signal, job_ptr, uid,
slurm_strerror(error_code));
} else {
if (job_step_kill_msg->signal == SIGKILL) {
log_flag(STEPS, "%s: Cancel of %pJ by UID=%u, %s",
__func__, job_ptr, uid, TIME_STR);
slurmctld_diag_stats.jobs_canceled++;
} else
log_flag(STEPS, "%s: Signal %u of %pJ by UID=%u, %s",
__func__, job_step_kill_msg->signal,
job_ptr, uid, TIME_STR);
/* Below function provides its own locking */
schedule_job_save();
}
} else {
error_code = job_step_signal(&job_step_kill_msg->step_id,
job_step_kill_msg->signal,
job_step_kill_msg->flags,
uid);
END_TIMER2(__func__);
/* return result */
if (error_code) {
log_flag(STEPS, "Signal %u of JobId=%u StepId=%u by UID=%u: %s",
job_step_kill_msg->signal,
job_step_kill_msg->step_id.job_id,
job_step_kill_msg->step_id.step_id, uid,
slurm_strerror(error_code));
} else {
if (job_step_kill_msg->signal == SIGKILL)
log_flag(STEPS, "%s: Cancel of JobId=%u StepId=%u by UID=%u %s",
__func__,
job_step_kill_msg->step_id.job_id,
job_step_kill_msg->step_id.step_id,
uid,
TIME_STR);
else
log_flag(STEPS, "%s: Signal %u of JobId=%u StepId=%u by UID=%u %s",
__func__, job_step_kill_msg->signal,
job_step_kill_msg->step_id.job_id,
job_step_kill_msg->step_id.step_id,
uid,
TIME_STR);
/* Below function provides its own locking */
schedule_job_save();
}
}
log_flag(TRACE_JOBS, "%s: return %pJ", __func__, job_ptr);
return error_code;
}
static int _foreach_kill_hetjob_step(void *x, void *arg)
{
job_record_t *het_job_ptr = x;
foreach_kill_hetjob_step_t *foreach_kill_hetjob_step = arg;
job_step_kill_msg_t *job_step_kill_msg =
foreach_kill_hetjob_step->job_step_kill_msg;
int rc;
job_step_kill_msg->step_id.job_id = het_job_ptr->job_id;
rc = _kill_job_step(job_step_kill_msg, het_job_ptr,
foreach_kill_hetjob_step->uid);
if (rc != SLURM_SUCCESS)
foreach_kill_hetjob_step->rc = rc;
return 0;
}
/*
* Kill job or job step
*
* IN job_step_kill_msg - msg with specs on which job/step to cancel.
* IN uid - uid of user requesting job/step cancel.
*/
extern int kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid)
{
/* Locks: Read config, write job, write node, read fed */
slurmctld_lock_t job_write_lock = {
.conf = READ_LOCK,
.job = WRITE_LOCK,
.node = WRITE_LOCK,
.fed = READ_LOCK,
};
job_record_t *job_ptr;
int error_code = SLURM_SUCCESS;
lock_slurmctld(job_write_lock);
job_ptr = find_job_record(job_step_kill_msg->step_id.job_id);
if (!job_ptr) {
info("%s: invalid JobId=%u",
__func__, job_step_kill_msg->step_id.job_id);
error_code = ESLURM_INVALID_JOB_ID;
goto endit;
}
if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account, false)) {
error("Security violation, JOB_CANCEL RPC for %pJ from uid %u",
job_ptr, uid);
error_code = ESLURM_ACCESS_DENIED;
goto endit;
}
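/*
* A SIGKILL aimed at a specific step of a hetjob is fanned out below so
* that the matching step in every component is signalled.
*/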
if (job_ptr->het_job_list &&
(job_step_kill_msg->signal == SIGKILL) &&
(job_step_kill_msg->step_id.step_id != NO_VAL)) {
foreach_kill_hetjob_step_t foreach_kill_hetjob_step = {
.job_step_kill_msg = job_step_kill_msg,
.rc = SLURM_SUCCESS,
.uid = uid,
};
(void) list_for_each(job_ptr->het_job_list,
_foreach_kill_hetjob_step,
&foreach_kill_hetjob_step);
if (foreach_kill_hetjob_step.rc != SLURM_SUCCESS)
error_code = foreach_kill_hetjob_step.rc;
} else {
error_code = _kill_job_step(job_step_kill_msg, job_ptr, uid);
}
endit:
unlock_slurmctld(job_write_lock);
return error_code;
}
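/*
* list_for_each() helper for kill_job_by_part_name(): remove the defunct
* partition from each job's candidate partition list, then kill any job
* whose active partition is the one being removed.
*/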
static int _foreach_kill_job_by_part_name(void *x, void *arg)
{
job_record_t *job_ptr = x;
foreach_kill_job_by_t *foreach_kill_job_by = arg;
part_record_t *part_ptr = foreach_kill_job_by->part_ptr;
time_t now = foreach_kill_job_by->now;
bool pending = false, suspended = false;
pending = IS_JOB_PENDING(job_ptr);
if (job_ptr->part_ptr_list) {
/* Remove this partition from the job's candidate partition list */
int rebuild_name_list =
list_delete_first(job_ptr->part_ptr_list,
slurm_find_ptr_in_list,
part_ptr);
if (rebuild_name_list == -1) {
error("%s: Processing part_ptr_list, this should never happen.",
__func__);
} else if (rebuild_name_list) {
if (list_count(job_ptr->part_ptr_list) > 0) {
rebuild_job_part_list(job_ptr);
job_ptr->part_ptr =
list_peek(job_ptr->part_ptr_list);
} else {
FREE_NULL_LIST(job_ptr->part_ptr_list);
}
}
}
if (job_ptr->part_ptr != part_ptr)
return 0;
if (IS_JOB_SUSPENDED(job_ptr)) {
uint32_t suspend_job_state = job_ptr->job_state;
/* The job must not appear as suspended when the
* accounting record is written, so temporarily mark it
* CANCELLED.
*/
job_state_set(job_ptr, JOB_CANCELLED);
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
job_state_set(job_ptr, suspend_job_state);
suspended = true;
}
if (IS_JOB_RUNNING(job_ptr) || suspended) {
foreach_kill_job_by->kill_job_cnt++;
info("Killing %pJ on defunct partition %s",
job_ptr, part_ptr->name);
job_state_set(job_ptr, (JOB_NODE_FAIL | JOB_COMPLETING));
build_cg_bitmap(job_ptr);
job_ptr->state_reason = FAIL_DOWN_PARTITION;
xfree(job_ptr->state_desc);
if (suspended) {
job_ptr->end_time = job_ptr->suspend_time;
job_ptr->tot_sus_time +=
difftime(now, job_ptr->suspend_time);
} else
job_ptr->end_time = now;
job_ptr->exit_code = 1;
job_completion_logger(job_ptr, false);
if (!pending)
deallocate_nodes(job_ptr, false, suspended, false);
} else if (pending) {
foreach_kill_job_by->kill_job_cnt++;
info("Killing %pJ on defunct partition %s",
job_ptr, part_ptr->name);
job_state_set(job_ptr, JOB_CANCELLED);
job_ptr->start_time = now;
job_ptr->end_time = now;
job_ptr->exit_code = 1;
job_completion_logger(job_ptr, false);
fed_mgr_job_complete(job_ptr, 0, now);
}
job_ptr->part_ptr = NULL;
FREE_NULL_LIST(job_ptr->part_ptr_list);
return 0;
}
/*
* kill_job_by_part_name - Given a partition name, deallocate resources for
* its jobs and kill them. All jobs associated with this partition
* will have their partition pointer cleared.
* IN part_name - name of a partition
* RET number of jobs associated with this partition
*/
extern int kill_job_by_part_name(char *part_name)
{
foreach_kill_job_by_t foreach_kill_job_by = {
.now = time(NULL),
.part_ptr = find_part_record(part_name),
};
if (!foreach_kill_job_by.part_ptr) /* No such partition */
return 0;
(void) list_for_each(job_list, _foreach_kill_job_by_part_name,
&foreach_kill_job_by);
if (foreach_kill_job_by.kill_job_cnt)
last_job_update = foreach_kill_job_by.now;
return foreach_kill_job_by.kill_job_cnt;
}
/*
* partition_in_use - determine whether a partition is in use by a RUNNING,
* PENDING, or SUSPENDED job or by a reservation
* IN part_name - name of a partition
* RET true if the partition is in use, else false
*/
extern bool partition_in_use(char *part_name)
{
part_record_t *part_ptr;
part_ptr = find_part_record(part_name);
if (part_ptr == NULL) /* No such partition */
return false;
/* check jobs */
if (list_find_first(job_list, _find_job_part, part_ptr))
return true;
/* check reservations */
if (list_find_first(resv_list, _find_resv_part, part_ptr))
return true;
return false;
}
static bool _job_node_test(job_record_t *job_ptr, int node_inx)
{
if (job_ptr->node_bitmap &&
bit_test(job_ptr->node_bitmap, node_inx))
return true;
return false;
}
static int _find_het_job_on_node(void *x, void *arg)
{
job_record_t *het_job = x;
int node_inx = *(int *)arg;
if (_job_node_test(het_job, node_inx))
return 1;
/*
* After a DOWN node is removed from another job component,
* we have no way to identify other hetjob components with
* the same node, so assume if one component is in NODE_FAILED
* state, they all should be.
*/
if (IS_JOB_NODE_FAILED(het_job))
return 1;
return 0;
}
static bool _het_job_on_node(job_record_t *job_ptr, int node_inx)
{
job_record_t *het_job_leader;
if (!job_ptr->het_job_id)
return _job_node_test(job_ptr, node_inx);
het_job_leader = find_job_record(job_ptr->het_job_id);
if (!het_job_leader) {
error("%s: Hetjob leader %pJ not found",
__func__, job_ptr);
return _job_node_test(job_ptr, node_inx);
}
if (!het_job_leader->het_job_list) {
error("%s: Hetjob leader %pJ job list is NULL",
__func__, job_ptr);
return _job_node_test(job_ptr, node_inx);
}
return list_find_first(het_job_leader->het_job_list,
_find_het_job_on_node, &node_inx);
}
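/*
* list_for_each() helper for kill_running_job_by_node_ptr(). For each job
* using the failed node: clear the node from a COMPLETING job, excise the
* node and keep the job running (kill_on_node_fail=0 on a multi-node
* allocation), requeue a batch job that allows it, or kill the job.
*/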
static int _foreach_kill_running_job_by_node(void *x, void *arg)
{
job_record_t *job_ptr = x;
foreach_kill_job_by_t *foreach_kill_job_by = arg;
node_record_t *node_ptr = foreach_kill_job_by->node_ptr;
bool suspended = false;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
xassert(node_ptr);
if (!_het_job_on_node(job_ptr, node_ptr->index))
return 0; /* job not on this node */
if (IS_JOB_SUSPENDED(job_ptr)) {
uint32_t suspend_job_state = job_ptr->job_state;
/*
* The job must not appear as suspended when the accounting
* record is written, so temporarily mark it CANCELLED.
*/
job_state_set(job_ptr, JOB_CANCELLED);
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
job_state_set(job_ptr, suspend_job_state);
suspended = true;
}
if (IS_JOB_COMPLETING(job_ptr)) {
if (!bit_test(job_ptr->node_bitmap_cg, node_ptr->index))
return 0;
foreach_kill_job_by->kill_job_cnt++;
bit_clear(job_ptr->node_bitmap_cg, node_ptr->index);
job_update_tres_cnt(job_ptr, node_ptr->index);
if (job_ptr->node_cnt)
(job_ptr->node_cnt)--;
else {
error("node_cnt underflow on %pJ", job_ptr);
}
cleanup_completing(job_ptr, true);
if (node_ptr->comp_job_cnt)
node_ptr->comp_job_cnt--;
else {
error("Node %s comp_job_cnt underflow, %pJ",
node_ptr->name, job_ptr);
}
} else if (IS_JOB_RUNNING(job_ptr) || suspended) {
foreach_kill_job_by->kill_job_cnt++;
if ((job_ptr->details) &&
(job_ptr->kill_on_node_fail == 0) &&
(job_ptr->node_cnt > 1) &&
!IS_JOB_CONFIGURING(job_ptr)) {
bitstr_t *orig_job_node_bitmap;
/* keep job running on remaining nodes */
srun_node_fail(job_ptr, node_ptr->name);
error("Removing failed node %s from %pJ",
node_ptr->name, job_ptr);
job_pre_resize_acctg(job_ptr);
kill_step_on_node(job_ptr, node_ptr, true);
orig_job_node_bitmap =
bit_copy(job_resrcs_ptr->node_bitmap);
excise_node_from_job(job_ptr, node_ptr);
/* Resize the bitmaps of the job's steps */
rebuild_step_bitmaps(job_ptr, orig_job_node_bitmap);
FREE_NULL_BITMAP(orig_job_node_bitmap);
(void) gs_job_start(job_ptr);
gres_stepmgr_job_build_details(
job_ptr->gres_list_alloc,
job_ptr->nodes,
&job_ptr->gres_detail_cnt,
&job_ptr->gres_detail_str,
&job_ptr->gres_used);
job_post_resize_acctg(job_ptr);
} else if (job_ptr->batch_flag &&
((job_ptr->details &&
job_ptr->details->requeue) ||
(foreach_kill_job_by->requeue_on_resume_failure &&
IS_NODE_POWERED_DOWN(node_ptr) &&
IS_JOB_CONFIGURING(job_ptr)))) {
srun_node_fail(job_ptr, node_ptr->name);
info("requeue job %pJ due to failure of node %s",
job_ptr, node_ptr->name);
job_ptr->time_last_active = foreach_kill_job_by->now;
if (suspended) {
job_ptr->end_time = job_ptr->suspend_time;
job_ptr->tot_sus_time +=
difftime(foreach_kill_job_by->now,
job_ptr->suspend_time);
} else
job_ptr->end_time = foreach_kill_job_by->now;
/*
* We want this job to look like it
* was terminated in the accounting logs.
* Set a new submit time so the restarted
* job looks like a new job.
*/
job_state_set(job_ptr, JOB_NODE_FAIL);
job_ptr->failed_node = xstrdup(node_ptr->name);
build_cg_bitmap(job_ptr);
job_ptr->exit_code = 1;
job_completion_logger(job_ptr, true);
deallocate_nodes(job_ptr, false, suspended, false);
_set_requeued_job_pending_completing(job_ptr);
job_ptr->restart_cnt++;
/* clear signal sent flag on requeue */
job_ptr->warn_flags &= ~WARN_SENT;
job_ptr->exit_code = 0;
/*
* The job completion logger removed this job's submit
* count from the accounting policy, so add it back.
*/
acct_policy_add_job_submit(job_ptr, false);
if (!job_ptr->node_bitmap_cg ||
bit_ffs(job_ptr->node_bitmap_cg) == -1)
batch_requeue_fini(job_ptr);
} else {
info("Killing %pJ on failed node %s",
job_ptr, node_ptr->name);
srun_node_fail(job_ptr, node_ptr->name);
job_state_set(job_ptr,
(JOB_NODE_FAIL | JOB_COMPLETING));
job_ptr->failed_node = xstrdup(node_ptr->name);
build_cg_bitmap(job_ptr);
job_ptr->state_reason = FAIL_DOWN_NODE;
xfree(job_ptr->state_desc);
if (suspended) {
job_ptr->end_time = job_ptr->suspend_time;
job_ptr->tot_sus_time +=
difftime(foreach_kill_job_by->now,
job_ptr->suspend_time);
} else
job_ptr->end_time = foreach_kill_job_by->now;
job_ptr->exit_code = 1;
job_completion_logger(job_ptr, false);
deallocate_nodes(job_ptr, false, suspended, false);
}
}
return 0;
}
extern int kill_running_job_by_node_ptr(node_record_t *node_ptr)
{
static time_t sched_update = 0;
static bool requeue_on_resume_failure = false;
foreach_kill_job_by_t foreach_kill_job_by = {
.node_ptr = node_ptr,
.now = time(NULL),
};
if (sched_update != slurm_conf.last_update) {
requeue_on_resume_failure =
xstrcasestr(slurm_conf.sched_params,
"requeue_on_resume_failure");
sched_update = slurm_conf.last_update;
}
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
if (!foreach_kill_job_by.node_ptr) /* No such node */
return 0;
foreach_kill_job_by.requeue_on_resume_failure =
requeue_on_resume_failure;
list_for_each(job_list, _foreach_kill_running_job_by_node,
&foreach_kill_job_by);
if (foreach_kill_job_by.kill_job_cnt)
last_job_update = foreach_kill_job_by.now;
return foreach_kill_job_by.kill_job_cnt;
}
/* Remove one node from a job's allocation */
extern void excise_node_from_job(job_record_t *job_ptr,
node_record_t *node_ptr)
{
make_node_idle(node_ptr, job_ptr); /* updates bitmap */
xfree(job_ptr->nodes);
job_ptr->nodes = bitmap2node_name(job_ptr->node_bitmap);
job_ptr->total_nodes = job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap);
(void) select_g_job_resized(job_ptr, node_ptr);
}
/*
* dump_job_desc - dump the incoming job submit request message
* IN job_desc - job specification from RPC
*/
void dump_job_desc(job_desc_msg_t *job_desc)
{
long pn_min_cpus, pn_min_tmp_disk, min_cpus;
uint64_t pn_min_memory;
long time_limit, priority, contiguous, nice, time_min;
long kill_on_node_fail, shared, immediate, wait_all_nodes;
long cpus_per_task, requeue, num_tasks, overcommit;
long ntasks_per_node, ntasks_per_socket, ntasks_per_core;
long ntasks_per_tres;
int spec_count;
char *mem_type, buf[256], freq_buf[64], *signal_flags, *spec_type, *job_id;
if (get_log_level() < LOG_LEVEL_DEBUG3)
return;
if (job_desc == NULL)
return;
if (job_desc->job_id_str)
job_id = job_desc->job_id_str;
else if (job_desc->job_id == NO_VAL)
job_id = "N/A";
else {
snprintf(buf, sizeof(buf), "%u", job_desc->job_id);
job_id = buf;
}
debug3("JobDesc: user_id=%u JobId=%s partition=%s name=%s",
job_desc->user_id, job_id,
job_desc->partition, job_desc->name);
min_cpus = (job_desc->min_cpus != NO_VAL) ?
(long) job_desc->min_cpus : -1L;
pn_min_cpus = (job_desc->pn_min_cpus != NO_VAL16) ?
(long) job_desc->pn_min_cpus : -1L;
if (job_desc->core_spec == NO_VAL16) {
spec_type = "core";
spec_count = -1;
} else if (job_desc->core_spec & CORE_SPEC_THREAD) {
spec_type = "thread";
spec_count = job_desc->core_spec & (~CORE_SPEC_THREAD);
} else {
spec_type = "core";
spec_count = job_desc->core_spec;
}
debug3(" cpus=%ld-%u pn_min_cpus=%ld %s_spec=%d",
min_cpus, job_desc->max_cpus, pn_min_cpus,
spec_type, spec_count);
debug3(" Nodes=%u-[%u] Sock/Node=%u Core/Sock=%u Thread/Core=%u",
job_desc->min_nodes, job_desc->max_nodes,
job_desc->sockets_per_node, job_desc->cores_per_socket,
job_desc->threads_per_core);
if (job_desc->pn_min_memory == NO_VAL64) {
pn_min_memory = -1L;
mem_type = "job";
} else if (job_desc->pn_min_memory & MEM_PER_CPU) {
pn_min_memory = job_desc->pn_min_memory & (~MEM_PER_CPU);
mem_type = "cpu";
} else {
pn_min_memory = job_desc->pn_min_memory;
mem_type = "job";
}
pn_min_tmp_disk = (job_desc->pn_min_tmp_disk != NO_VAL) ?
(long) job_desc->pn_min_tmp_disk : -1L;
debug3(" pn_min_memory_%s=%"PRIu64" pn_min_tmp_disk=%ld",
mem_type, pn_min_memory, pn_min_tmp_disk);
immediate = (job_desc->immediate == 0) ? 0L : 1L;
debug3(" immediate=%ld reservation=%s",
immediate, job_desc->reservation);
debug3(" features=%s batch_features=%s cluster_features=%s prefer=%s",
job_desc->features, job_desc->batch_features,
job_desc->cluster_features, job_desc->prefer);
debug3(" req_nodes=%s exc_nodes=%s",
job_desc->req_nodes, job_desc->exc_nodes);
time_limit = (job_desc->time_limit != NO_VAL) ?
(long) job_desc->time_limit : -1L;
time_min = (job_desc->time_min != NO_VAL) ?
(long) job_desc->time_min : time_limit;
priority = (job_desc->priority != NO_VAL) ?
(long) job_desc->priority : -1L;
contiguous = (job_desc->contiguous != NO_VAL16) ?
(long) job_desc->contiguous : -1L;
shared = (job_desc->shared != NO_VAL16) ?
(long) job_desc->shared : -1L;
debug3(" time_limit=%ld-%ld priority=%ld contiguous=%ld shared=%ld",
time_min, time_limit, priority, contiguous, shared);
kill_on_node_fail = (job_desc->kill_on_node_fail !=
NO_VAL16) ?
(long) job_desc->kill_on_node_fail : -1L;
if (job_desc->script) /* logging can't handle very long or NULL strings */
debug3(" kill_on_node_fail=%ld script=%.40s...",
kill_on_node_fail, job_desc->script);
else
debug3(" kill_on_node_fail=%ld script=(null)",
kill_on_node_fail);
if (job_desc->argc == 1)
debug3(" argv=\"%s\"",
job_desc->argv[0]);
else if (job_desc->argc == 2)
debug3(" argv=%s,%s",
job_desc->argv[0],
job_desc->argv[1]);
else if (job_desc->argc > 2)
debug3(" argv=%s,%s,%s,...",
job_desc->argv[0],
job_desc->argv[1],
job_desc->argv[2]);
if (job_desc->env_size == 1)
debug3(" environment=\"%s\"",
job_desc->environment[0]);
else if (job_desc->env_size == 2)
debug3(" environment=%s,%s",
job_desc->environment[0],
job_desc->environment[1]);
else if (job_desc->env_size > 2)
debug3(" environment=%s,%s,%s,...",
job_desc->environment[0],
job_desc->environment[1],
job_desc->environment[2]);
if (job_desc->spank_job_env_size == 1)
debug3(" spank_job_env=\"%s\"",
job_desc->spank_job_env[0]);
else if (job_desc->spank_job_env_size == 2)
debug3(" spank_job_env=%s,%s",
job_desc->spank_job_env[0],
job_desc->spank_job_env[1]);
else if (job_desc->spank_job_env_size > 2)
debug3(" spank_job_env=%s,%s,%s,...",
job_desc->spank_job_env[0],
job_desc->spank_job_env[1],
job_desc->spank_job_env[2]);
debug3(" stdin=%s stdout=%s stderr=%s",
job_desc->std_in, job_desc->std_out, job_desc->std_err);
debug3(" work_dir=%s alloc_node:sid=%s:%u",
job_desc->work_dir,
job_desc->alloc_node, job_desc->alloc_sid);
debug3(" resp_host=%s alloc_resp_port=%u other_port=%u",
job_desc->resp_host,
job_desc->alloc_resp_port, job_desc->other_port);
debug3(" dependency=%s account=%s qos=%s comment=%s",
job_desc->dependency, job_desc->account,
job_desc->qos, job_desc->comment);
num_tasks = (job_desc->num_tasks != NO_VAL) ?
(long) job_desc->num_tasks : -1L;
overcommit = (job_desc->overcommit != NO_VAL8) ?
(long) job_desc->overcommit : -1L;
nice = (job_desc->nice != NO_VAL) ?
((int64_t)job_desc->nice - NICE_OFFSET) : 0;
debug3(" mail_type=%u mail_user=%s nice=%ld num_tasks=%ld "
"open_mode=%u overcommit=%ld acctg_freq=%s",
job_desc->mail_type, job_desc->mail_user, nice, num_tasks,
job_desc->open_mode, overcommit, job_desc->acctg_freq);
slurm_make_time_str(&job_desc->begin_time, buf, sizeof(buf));
cpus_per_task = (job_desc->cpus_per_task != NO_VAL16) ?
(long) job_desc->cpus_per_task : -1L;
requeue = (job_desc->requeue != NO_VAL16) ?
(long) job_desc->requeue : -1L;
debug3(" network=%s begin=%s cpus_per_task=%ld requeue=%ld "
"licenses=%s",
job_desc->network, buf, cpus_per_task, requeue,
job_desc->licenses);
slurm_make_time_str(&job_desc->end_time, buf, sizeof(buf));
wait_all_nodes = (job_desc->wait_all_nodes != NO_VAL16) ?
(long) job_desc->wait_all_nodes : -1L;
if (job_desc->warn_flags & KILL_JOB_BATCH)
signal_flags = "B:";
else
signal_flags = "";
/*
* Format the CPU frequency into its own buffer so the end_time string
* still in buf is not clobbered before it is logged.
*/
cpu_freq_debug(NULL, NULL, freq_buf, sizeof(freq_buf),
job_desc->cpu_freq_gov, job_desc->cpu_freq_min,
job_desc->cpu_freq_max, NO_VAL);
debug3(" end_time=%s signal=%s%u@%u wait_all_nodes=%ld cpu_freq=%s",
buf, signal_flags, job_desc->warn_signal, job_desc->warn_time,
wait_all_nodes, freq_buf);
ntasks_per_node = (job_desc->ntasks_per_node != NO_VAL16) ?
(long) job_desc->ntasks_per_node : -1L;
ntasks_per_socket = (job_desc->ntasks_per_socket !=
NO_VAL16) ?
(long) job_desc->ntasks_per_socket : -1L;
ntasks_per_core = (job_desc->ntasks_per_core != NO_VAL16) ?
(long) job_desc->ntasks_per_core : -1L;
ntasks_per_tres = (job_desc->ntasks_per_tres != NO_VAL16) ?
(long) job_desc->ntasks_per_tres : -1L;
debug3(" ntasks_per_node=%ld ntasks_per_socket=%ld ntasks_per_core=%ld ntasks_per_tres=%ld",
ntasks_per_node, ntasks_per_socket, ntasks_per_core,
ntasks_per_tres);
debug3(" mem_bind=%u:%s plane_size:%u",
job_desc->mem_bind_type, job_desc->mem_bind,
job_desc->plane_size);
debug3(" array_inx=%s", job_desc->array_inx);
debug3(" burst_buffer=%s", job_desc->burst_buffer);
debug3(" mcs_label=%s", job_desc->mcs_label);
slurm_make_time_str(&job_desc->deadline, buf, sizeof(buf));
debug3(" deadline=%s", buf);
debug3(" bitflags=0x%"PRIx64" delay_boot=%u",
job_desc->bitflags, job_desc->delay_boot);
if (job_desc->cpus_per_tres)
debug3(" CPUs_per_TRES=%s", job_desc->cpus_per_tres);
if (job_desc->mem_per_tres)
debug3(" Mem_per_TRES=%s", job_desc->mem_per_tres);
if (job_desc->tres_bind)
debug3(" TRES_bind=%s", job_desc->tres_bind);
if (job_desc->tres_freq)
debug3(" TRES_freq=%s", job_desc->tres_freq);
if (job_desc->tres_per_job)
debug3(" TRES_per_job=%s", job_desc->tres_per_job);
if (job_desc->tres_per_node)
debug3(" TRES_per_node=%s", job_desc->tres_per_node);
if (job_desc->tres_per_socket)
debug3(" TRES_per_socket=%s", job_desc->tres_per_socket);
if (job_desc->tres_per_task)
debug3(" TRES_per_task=%s", job_desc->tres_per_task);
if (job_desc->container || job_desc->container_id)
debug3(" container=%s container-id=%s",
job_desc->container, job_desc->container_id);
}
/*
* init_job_conf - initialize the job configuration tables and values.
* this should be called after creating node information, but
* before creating any job entries. Pre-existing job entries are
* left unchanged.
* NOTE: The job hash table size does not change after initial creation.
* global: last_job_update - time of last job table update
* job_list - pointer to global job list
* purge_files_list - list of jobs whose files await purging
* purge_jobs_list - list of job records awaiting purge
*/
void init_job_conf(void)
{
if (job_list == NULL) {
job_count = 0;
job_list = list_create(_move_to_purge_jobs_list);
}
last_job_update = time(NULL);
if (!purge_files_list) {
purge_files_list = list_create(xfree_ptr);
}
if (!purge_jobs_list)
purge_jobs_list = list_create(job_record_delete);
}
/*
* rehash_jobs - Create or rebuild the job hash table.
*/
extern void rehash_jobs(void)
{
xassert(verify_lock(CONF_LOCK, READ_LOCK));
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
if (job_hash == NULL) {
hash_table_size = slurm_conf.max_job_cnt;
job_hash = xcalloc(hash_table_size, sizeof(job_record_t *));
job_array_hash_j = xcalloc(hash_table_size,
sizeof(job_record_t *));
job_array_hash_t = xcalloc(hash_table_size,
sizeof(job_record_t *));
if (xstrcasestr(slurm_conf.sched_params,
"enable_job_state_cache"))
setup_job_state_hash(hash_table_size);
} else if (hash_table_size < (slurm_conf.max_job_cnt / 2)) {
/* If the MaxJobCount grows by too much, the hash table will
* be ineffective without rebuilding. We don't presently bother
* to rebuild the hash table, but cut MaxJobCount back as
* needed. */
error ("MaxJobCount reset too high, restart slurmctld");
slurm_conf.max_job_cnt = hash_table_size;
}
}
/* Create an exact copy of an existing job record for a job array.
* IN job_ptr - META job record for a job array, which is to become an
* individual task of the job array. The caller must have set its
* array_task_id to the task being split out.
* RET - The new job record, which becomes the new META job record. */
extern job_record_t *job_array_split(job_record_t *job_ptr, bool list_add)
{
job_record_t *job_ptr_pend = NULL;
job_details_t *job_details, *details_new, *save_details;
uint32_t save_job_id, save_db_flags = job_ptr->db_flags;
uint64_t save_db_index = job_ptr->db_index;
priority_factors_t *save_prio_factors;
list_t *save_step_list = NULL;
int i;
job_ptr_pend = _create_job_record(0, list_add);
_remove_job_hash(job_ptr, JOB_HASH_JOB);
job_ptr_pend->job_id = job_ptr->job_id;
if (_set_job_id(job_ptr) != SLURM_SUCCESS)
fatal("%s: _set_job_id error", __func__);
if (!job_ptr->array_recs) {
fatal_abort("%s: %pJ record lacks array structure",
__func__, job_ptr);
}
/*
* Copy most of original job data.
* This could be done in parallel, but performance was worse.
*/
save_job_id = job_ptr_pend->job_id;
save_details = job_ptr_pend->details;
save_prio_factors = job_ptr_pend->prio_factors;
save_step_list = job_ptr_pend->step_list;
memcpy(job_ptr_pend, job_ptr, sizeof(job_record_t));
job_ptr_pend->job_id = save_job_id;
job_ptr_pend->details = save_details;
job_ptr_pend->db_flags = save_db_flags;
job_ptr_pend->step_list = save_step_list;
job_ptr_pend->db_index = save_db_index;
job_ptr_pend->prio_factors = save_prio_factors;
slurm_copy_priority_factors(job_ptr_pend->prio_factors,
job_ptr->prio_factors);
job_ptr_pend->account = xstrdup(job_ptr->account);
job_ptr_pend->admin_comment = xstrdup(job_ptr->admin_comment);
job_ptr_pend->alias_list = NULL;
job_ptr_pend->alloc_node = xstrdup(job_ptr->alloc_node);
job_ptr_pend->node_addrs = NULL;
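/*
* Ownership of array_recs moves to the new pending META record; the
* original record keeps only the task being split out, so that task's bit
* is cleared from the META record's task_id_bitmap below.
*/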
job_ptr_pend->array_recs = job_ptr->array_recs;
job_ptr->array_recs = NULL;
if (job_ptr_pend->array_recs &&
job_ptr_pend->array_recs->task_id_bitmap) {
bit_clear(job_ptr_pend->array_recs->task_id_bitmap,
job_ptr_pend->array_task_id);
}
xfree(job_ptr_pend->array_recs->task_id_str);
if (job_ptr_pend->array_recs->task_cnt) {
job_ptr_pend->array_recs->task_cnt--;
if (job_ptr_pend->array_recs->task_cnt <= 1) {
/*
* This is the last task of the job array, so we need to
* set array_task_id to a specific task id. We also
* need to call job_array_post_sched() to do cleanup
* on the array, specifically how job_array_post_sched()
* handles adding the job to the array_hash, otherwise
* we'll get errors.
*/
i = bit_ffs(job_ptr_pend->array_recs->task_id_bitmap);
if (i < 0) {
error("%s: No tasks in task_id_bitmap for %pJ",
__func__, job_ptr_pend);
job_ptr_pend->array_task_id = NO_VAL;
} else {
job_ptr_pend->array_task_id = i;
job_array_post_sched(job_ptr_pend, true);
}
} else {
/* Still have tasks left to split off in the array */
job_ptr_pend->array_task_id = NO_VAL;
}
} else {
error("%pJ array_recs->task_cnt underflow",
job_ptr);
job_ptr_pend->array_task_id = NO_VAL;
}
job_ptr_pend->batch_features = xstrdup(job_ptr->batch_features);
job_ptr_pend->batch_host = NULL;
job_ptr_pend->burst_buffer = xstrdup(job_ptr->burst_buffer);
job_ptr_pend->burst_buffer_state = xstrdup(job_ptr->burst_buffer_state);
job_ptr_pend->clusters = xstrdup(job_ptr->clusters);
job_ptr_pend->comment = xstrdup(job_ptr->comment);
job_ptr_pend->container = xstrdup(job_ptr->container);
job_ptr_pend->container_id = xstrdup(job_ptr->container_id);
job_ptr_pend->extra = xstrdup(job_ptr->extra);
if ((extra_constraints_parse(job_ptr_pend->extra,
&job_ptr_pend->extra_constraints)) !=
SLURM_SUCCESS)
error("%s: %pJ Invalid extra_constraints %s",
__func__, job_ptr, job_ptr_pend->extra);
job_ptr_pend->fed_details = _dup_job_fed_details(job_ptr->fed_details);
/* job_details_t *details; *** NOTE: Copied below */
job_ptr_pend->limit_set.tres = xcalloc(slurmctld_tres_cnt,
sizeof(uint16_t));
memcpy(job_ptr_pend->limit_set.tres, job_ptr->limit_set.tres,
sizeof(uint16_t) * slurmctld_tres_cnt);
_add_job_hash(job_ptr); /* Sets job_next */
_add_job_hash(job_ptr_pend); /* Sets job_next */
_add_job_array_hash(job_ptr);
job_ptr_pend->job_resrcs = NULL;
job_ptr_pend->id = copy_identity(job_ptr->id);
job_ptr_pend->licenses = xstrdup(job_ptr->licenses);
job_ptr_pend->licenses_allocated = NULL;
job_ptr_pend->license_list = license_copy(job_ptr->license_list);
job_ptr_pend->licenses_to_preempt = NULL;
job_ptr_pend->lic_req = xstrdup(job_ptr->lic_req);
job_ptr_pend->mail_user = xstrdup(job_ptr->mail_user);
job_ptr_pend->mcs_label = xstrdup(job_ptr->mcs_label);
job_ptr_pend->name = xstrdup(job_ptr->name);
job_ptr_pend->network = xstrdup(job_ptr->network);
job_ptr_pend->node_bitmap = NULL;
job_ptr_pend->node_bitmap_cg = NULL;
job_ptr_pend->node_bitmap_pr = NULL;
job_ptr_pend->node_bitmap_preempt = NULL;
job_ptr_pend->nodes = NULL;
job_ptr_pend->nodes_completing = NULL;
job_ptr_pend->nodes_pr = NULL;
job_ptr_pend->origin_cluster = xstrdup(job_ptr->origin_cluster);
job_ptr_pend->partition = xstrdup(job_ptr->partition);
job_ptr_pend->part_ptr_list = part_list_copy(job_ptr->part_ptr_list);
/* On jobs that are held the priority_array isn't set up yet,
* so check to see if it exists before copying. */
if ((job_ptr->part_ptr_list || job_ptr->qos_list) &&
job_ptr->prio_mult) {
job_ptr_pend->prio_mult =
xmalloc(sizeof(*job_ptr_pend->prio_mult));
if (job_ptr->prio_mult->priority_array) {
i = xsize(job_ptr->prio_mult->priority_array);
job_ptr_pend->prio_mult->priority_array = xmalloc(i);
memcpy(job_ptr_pend->prio_mult->priority_array,
job_ptr->prio_mult->priority_array, i);
}
job_ptr_pend->prio_mult->priority_array_names =
xstrdup(job_ptr->prio_mult->priority_array_names);
} else if (job_ptr->prio_mult) {
/* this should never happen */
error("%s: prio_mult is set without part_ptr_list or qos_list, setting prio_mult to NULL.",
__func__);
job_ptr_pend->prio_mult = NULL;
}
if (job_ptr->qos_list)
job_ptr_pend->qos_list = list_shallow_copy(job_ptr->qos_list);
job_ptr_pend->resv_name = xstrdup(job_ptr->resv_name);
if (job_ptr->resv_list)
job_ptr_pend->resv_list = list_shallow_copy(job_ptr->resv_list);
job_ptr_pend->resv_ports = NULL;
job_ptr_pend->resv_port_array = NULL;
job_ptr_pend->resp_host = xstrdup(job_ptr->resp_host);
job_ptr_pend->selinux_context = xstrdup(job_ptr->selinux_context);
job_ptr_pend->sched_nodes = NULL;
if (job_ptr->spank_job_env_size) {
job_ptr_pend->spank_job_env =
xcalloc((job_ptr->spank_job_env_size + 1),
sizeof(char *));
for (i = 0; i < job_ptr->spank_job_env_size; i++) {
job_ptr_pend->spank_job_env[i] =
xstrdup(job_ptr->spank_job_env[i]);
}
}
job_ptr_pend->state_desc = xstrdup(job_ptr->state_desc);
job_ptr_pend->system_comment = xstrdup(job_ptr->system_comment);
i = sizeof(uint64_t) * slurmctld_tres_cnt;
job_ptr_pend->tres_req_cnt = xmalloc(i);
memcpy(job_ptr_pend->tres_req_cnt, job_ptr->tres_req_cnt, i);
job_ptr_pend->tres_req_str = xstrdup(job_ptr->tres_req_str);
job_ptr_pend->tres_fmt_req_str = xstrdup(job_ptr->tres_fmt_req_str);
job_ptr_pend->tres_alloc_str = NULL;
job_ptr_pend->tres_fmt_alloc_str = NULL;
job_ptr_pend->tres_alloc_cnt = NULL;
job_ptr_pend->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres);
job_ptr_pend->mem_per_tres = xstrdup(job_ptr->mem_per_tres);
job_ptr_pend->tres_bind = xstrdup(job_ptr->tres_bind);
job_ptr_pend->tres_freq = xstrdup(job_ptr->tres_freq);
job_ptr_pend->tres_per_job = xstrdup(job_ptr->tres_per_job);
job_ptr_pend->tres_per_node = xstrdup(job_ptr->tres_per_node);
job_ptr_pend->tres_per_socket = xstrdup(job_ptr->tres_per_socket);
job_ptr_pend->tres_per_task = xstrdup(job_ptr->tres_per_task);
job_ptr_pend->user_name = xstrdup(job_ptr->user_name);
job_ptr_pend->wckey = xstrdup(job_ptr->wckey);
job_ptr_pend->deadline = job_ptr->deadline;
job_details = job_ptr->details;
details_new = job_ptr_pend->details;
memcpy(details_new, job_details, sizeof(job_details_t));
/*
* Reset the preempt_start_time or high priority array jobs will hang
* for a period before preempting more jobs.
*/
details_new->preempt_start_time = 0;
details_new->acctg_freq = xstrdup(job_details->acctg_freq);
if (job_details->argc) {
details_new->argv =
xcalloc((job_details->argc + 1), sizeof(char *));
for (i = 0; i < job_details->argc; i++) {
details_new->argv[i] = xstrdup(job_details->argv[i]);
}
}
details_new->cpu_bind = xstrdup(job_details->cpu_bind);
details_new->cpu_bind_type = job_details->cpu_bind_type;
details_new->cpu_freq_min = job_details->cpu_freq_min;
details_new->cpu_freq_max = job_details->cpu_freq_max;
details_new->cpu_freq_gov = job_details->cpu_freq_gov;
details_new->depend_list = depended_list_copy(job_details->depend_list);
details_new->dependency = xstrdup(job_details->dependency);
details_new->orig_dependency = xstrdup(job_details->orig_dependency);
if (job_details->env_cnt) {
details_new->env_sup =
xcalloc((job_details->env_cnt + 1), sizeof(char *));
for (i = 0; i < job_details->env_cnt; i++) {
details_new->env_sup[i] =
xstrdup(job_details->env_sup[i]);
}
}
if (job_details->exc_node_bitmap) {
details_new->exc_node_bitmap =
bit_copy(job_details->exc_node_bitmap);
}
details_new->exc_nodes = xstrdup(job_details->exc_nodes);
details_new->feature_list =
feature_list_copy(job_details->feature_list);
details_new->features = xstrdup(job_details->features);
details_new->cluster_features = xstrdup(job_details->cluster_features);
if (job_details->job_size_bitmap) {
details_new->job_size_bitmap =
bit_copy(job_details->job_size_bitmap);
}
details_new->prefer = xstrdup(job_details->prefer);
details_new->prefer_list =
feature_list_copy(job_details->prefer_list);
set_job_features_use(details_new);
if (job_details->mc_ptr) {
i = sizeof(multi_core_data_t);
details_new->mc_ptr = xmalloc(i);
memcpy(details_new->mc_ptr, job_details->mc_ptr, i);
}
details_new->mem_bind = xstrdup(job_details->mem_bind);
details_new->mem_bind_type = job_details->mem_bind_type;
details_new->qos_req = xstrdup(job_details->qos_req);
details_new->resv_req = xstrdup(job_details->resv_req);
if (job_details->req_node_bitmap) {
details_new->req_node_bitmap =
bit_copy(job_details->req_node_bitmap);
}
details_new->req_context = xstrdup(job_details->req_context);
details_new->req_nodes = xstrdup(job_details->req_nodes);
details_new->std_err = xstrdup(job_details->std_err);
details_new->std_in = xstrdup(job_details->std_in);
details_new->std_out = xstrdup(job_details->std_out);
details_new->submit_line = xstrdup(job_details->submit_line);
details_new->work_dir = xstrdup(job_details->work_dir);
details_new->x11_magic_cookie = xstrdup(job_details->x11_magic_cookie);
details_new->env_hash = xstrdup(job_details->env_hash);
details_new->script_hash = xstrdup(job_details->script_hash);
if (job_ptr->gres_list_req) {
if (details_new->whole_node & WHOLE_NODE_REQUIRED) {
multi_core_data_t *mc_ptr = details_new->mc_ptr;
gres_job_state_validate_t gres_js_val = {
.cpus_per_tres = job_ptr_pend->cpus_per_tres,
.mem_per_tres = job_ptr_pend->mem_per_tres,
.tres_freq = job_ptr_pend->tres_freq,
.tres_per_job = job_ptr_pend->tres_per_job,
.tres_per_node = job_ptr_pend->tres_per_node,
.tres_per_socket = job_ptr->tres_per_socket,
.tres_per_task = job_ptr->tres_per_task,
.cpus_per_task =
&details_new->orig_cpus_per_task,
.max_nodes = &details_new->max_nodes,
.min_cpus = &details_new->min_cpus,
.min_nodes = &details_new->min_nodes,
.ntasks_per_node =
&details_new->ntasks_per_node,
.ntasks_per_socket = &mc_ptr->ntasks_per_socket,
.ntasks_per_tres =
&details_new->ntasks_per_tres,
.num_tasks = &details_new->num_tasks,
.sockets_per_node = &mc_ptr->sockets_per_node,
.gres_list = &job_ptr_pend->gres_list_req,
};
/*
* We need to reset the gres_list to what was requested
* instead of what was given exclusively.
*/
job_ptr_pend->gres_list_req = NULL;
(void)gres_job_state_validate(&gres_js_val);
} else
job_ptr_pend->gres_list_req =
gres_job_state_list_dup(job_ptr->gres_list_req);
}
job_ptr_pend->gres_list_req_accum = NULL;
job_ptr_pend->gres_list_alloc = NULL;
job_ptr_pend->gres_detail_cnt = 0;
job_ptr_pend->gres_detail_str = NULL;
job_ptr_pend->gres_used = NULL;
if (job_ptr->fed_details) {
add_fed_job_info(job_ptr);
/*
* The new (split) job needs its remote dependencies tested
* separately from just the meta job, so send remote
* dependencies to siblings if needed.
*/
if (job_ptr->details->dependency &&
job_ptr->details->depend_list)
fed_mgr_submit_remote_dependencies(job_ptr, false,
false);
}
on_job_state_change(job_ptr, job_ptr->job_state);
on_job_state_change(job_ptr_pend, job_ptr_pend->job_state);
return job_ptr_pend;
}
/* Add job array data structure to the job record */
static void _create_job_array(job_record_t *job_ptr, job_desc_msg_t *job_desc)
{
job_details_t *details;
char *sep = NULL;
int max_run_tasks, min_task_id, max_task_id, step_task_id = 1, task_cnt;
if (!job_desc->array_bitmap)
return;
if ((min_task_id = bit_ffs(job_desc->array_bitmap)) == -1) {
info("%s: %pJ array_bitmap is empty", __func__, job_ptr);
return;
}
job_ptr->array_job_id = job_ptr->job_id;
job_ptr->array_recs = xmalloc(sizeof(job_array_struct_t));
max_task_id = bit_fls(job_desc->array_bitmap);
task_cnt = bit_set_count(job_desc->array_bitmap);
bit_realloc(job_desc->array_bitmap, max_task_id + 1);
job_ptr->array_recs->task_id_bitmap = job_desc->array_bitmap;
job_desc->array_bitmap = NULL;
job_ptr->array_recs->task_cnt =
bit_set_count(job_ptr->array_recs->task_id_bitmap);
if (job_ptr->array_recs->task_cnt > 1)
job_count += (job_ptr->array_recs->task_cnt - 1);
if (job_desc->array_inx)
sep = strchr(job_desc->array_inx, '%');
if (sep) {
max_run_tasks = atoi(sep + 1);
if (max_run_tasks > 0)
job_ptr->array_recs->max_run_tasks = max_run_tasks;
}
details = job_ptr->details;
if (details) {
if (job_desc->array_inx) {
sep = strchr(job_desc->array_inx, ':');
if (sep)
step_task_id = atoi(sep + 1);
}
xrecalloc(details->env_sup,
MAX(job_ptr->details->env_cnt, 1) + 4,
sizeof(char *));
xstrfmtcat(details->env_sup[details->env_cnt++],
"SLURM_ARRAY_TASK_COUNT=%d", task_cnt);
xstrfmtcat(details->env_sup[details->env_cnt++],
"SLURM_ARRAY_TASK_MIN=%d", min_task_id);
xstrfmtcat(details->env_sup[details->env_cnt++],
"SLURM_ARRAY_TASK_MAX=%d", max_task_id);
xstrfmtcat(details->env_sup[details->env_cnt++],
"SLURM_ARRAY_TASK_STEP=%d", step_task_id);
}
on_job_state_change(job_ptr, job_ptr->job_state);
}
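/*
* Try to allocate nodes for the job within the current
* partition/QOS/reservation context. Returns SLURM_SUCCESS once a
* definitive answer has been reached (stop iterating over contexts) or
* SLURM_ERROR to keep trying others; the allocation result itself is
* recorded in job_node_select->rc and ->rc_best.
*/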
static int _select_nodes_base(job_node_select_t *job_node_select)
{
job_node_select->rc_part_limits =
job_limits_check(&job_node_select->job_ptr, false);
if ((job_node_select->rc_part_limits != WAIT_NO_REASON) &&
(slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ANY))
return SLURM_ERROR;
if ((job_node_select->rc_part_limits != WAIT_NO_REASON) &&
(slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ALL)) {
if (job_node_select->rc_part_limits != WAIT_PART_DOWN) {
job_node_select->rc_best =
ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
return SLURM_SUCCESS;
} else {
job_node_select->rc_best = ESLURM_PARTITION_DOWN;
}
}
if (job_node_select->rc_part_limits == WAIT_NO_REASON) {
job_node_select->rc = select_nodes(job_node_select,
job_node_select->test_only,
true,
SLURMDB_JOB_FLAG_SUBMIT);
} else if (job_node_select->rc_part_limits != WAIT_PART_CONFIG) {
job_node_select->rc = select_nodes(job_node_select,
true,
true,
SLURMDB_JOB_FLAG_SUBMIT);
if ((job_node_select->rc == SLURM_SUCCESS) &&
(job_node_select->rc_part_limits == WAIT_PART_DOWN))
job_node_select->rc = ESLURM_PARTITION_DOWN;
}
if ((job_node_select->rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
(slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ALL)) {
/* Job can not run */
job_node_select->rc_best = job_node_select->rc;
return SLURM_SUCCESS;
}
if ((job_node_select->rc != ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) &&
(job_node_select->rc != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
(job_node_select->rc != ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE) &&
(job_node_select->rc != ESLURM_RESERVATION_BUSY) &&
(job_node_select->rc != ESLURM_NODES_BUSY)) {
/* Job can run now */
job_node_select->rc_best = job_node_select->rc;
if ((slurm_conf.enforce_part_limits ==
PARTITION_ENFORCE_ANY) ||
(slurm_conf.enforce_part_limits ==
PARTITION_ENFORCE_NONE) ||
(!job_node_select->test_only &&
(job_node_select->rc_part_limits == WAIT_NO_REASON)))
return SLURM_SUCCESS;
}
if (((job_node_select->rc == ESLURM_NODES_BUSY) ||
(job_node_select->rc == ESLURM_RESERVATION_BUSY) ||
(job_node_select->rc == ESLURM_PORTS_BUSY)) &&
(job_node_select->rc_best == -1)) {
if (job_node_select->test_only)
return SLURM_SUCCESS;
/* Keep looking for partition where job can start now */
job_node_select->rc_best = job_node_select->rc;
}
if ((job_node_select->job_ptr->preempt_in_progress) &&
(job_node_select->rc != ESLURM_NODES_BUSY)) {
/* Already started preempting jobs; don't
* consider starting this job in another
* partition as we iterate over the others. */
job_node_select->test_only = true;
}
return SLURM_ERROR;
}
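/*
* The helpers below nest the node selection search: for each candidate
* partition (outermost), each QOS in the job's qos_list, and each
* reservation in its resv_list, _select_nodes_base() is tried until a
* definitive result ends the iteration.
*/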
static int _foreach_select_nodes_resvs(void *object, void *args)
{
slurmctld_resv_t *resv_ptr = object;
job_node_select_t *job_node_select = args;
job_record_t *job_ptr = job_node_select->job_ptr;
job_ptr->resv_ptr = resv_ptr;
job_ptr->resv_id = resv_ptr->resv_id;
if ((job_ptr->bit_flags & JOB_PART_ASSIGNED) && resv_ptr->part_ptr)
job_ptr->part_ptr = resv_ptr->part_ptr;
debug2("Try %pJ on next reservation %s", job_ptr, resv_ptr->name);
if ((job_node_select->rc_resv =
_select_nodes_base(job_node_select)) == SLURM_SUCCESS) {
/* break if success */
if ((job_node_select->rc != ESLURM_RESERVATION_NOT_USABLE) &&
(job_node_select->rc != ESLURM_RESERVATION_BUSY)) {
return -1;
}
}
return 0;
}
static int _select_nodes_resvs(job_node_select_t *job_node_select)
{
job_record_t *job_ptr = job_node_select->job_ptr;
if (!job_ptr->resv_list)
return _select_nodes_base(job_node_select);
job_node_select->rc_resv = SLURM_ERROR;
(void) list_for_each(job_ptr->resv_list,
_foreach_select_nodes_resvs,
job_node_select);
return job_node_select->rc_resv;
}
static int _foreach_select_nodes_qos(void *object, void *args)
{
slurmdb_qos_rec_t *qos_ptr = object;
job_node_select_t *job_node_select = args;
job_record_t *job_ptr = job_node_select->job_ptr;
job_ptr->qos_ptr = qos_ptr;
debug2("Try %pJ on next QOS %s", job_ptr, qos_ptr->name);
/* break if success */
if ((job_node_select->rc_qos =
_select_nodes_resvs(job_node_select)) == SLURM_SUCCESS)
return -1;
return 0;
}
static int _select_nodes_qos(job_node_select_t *job_node_select)
{
job_record_t *job_ptr = job_node_select->job_ptr;
if (!job_ptr->qos_list)
return _select_nodes_resvs(job_node_select);
job_node_select->rc_qos = SLURM_ERROR;
(void) list_for_each(job_ptr->qos_list,
_foreach_select_nodes_qos,
job_node_select);
return job_node_select->rc_qos;
}
static int _foreach_select_nodes_part_list(void *x, void *arg)
{
part_record_t *part_ptr = x;
job_node_select_t *job_node_select = arg;
job_record_t *job_ptr = job_node_select->job_ptr;
job_ptr->part_ptr = part_ptr;
debug2("Try %pJ on next partition %s", job_ptr, part_ptr->name);
if (_select_nodes_qos(job_node_select) == SLURM_SUCCESS)
return -1;
return 0;
}
/*
* Wrapper for select_nodes() function that will test all valid partitions
* for a new job
* IN job_ptr - pointer to the job record
* IN test_only - if set do not allocate nodes, just confirm they
* could be allocated now
* OUT err_msg - error message for job, caller must xfree
* RET SLURM_SUCCESS or an error code indicating why the job cannot run now
*/
static int _select_nodes_parts(job_record_t *job_ptr, bool test_only,
char **err_msg)
{
job_node_select_t job_node_select = {
.err_msg = err_msg,
.job_ptr = job_ptr,
.rc_part_limits = WAIT_NO_REASON,
.rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE,
.rc_best = -1,
.test_only = test_only,
};
int rc, best_rc, part_limits_rc;
if (job_ptr->part_ptr_list) {
/* part_ptr_list is already sorted */
(void) list_find_first(job_ptr->part_ptr_list,
_foreach_select_nodes_part_list,
&job_node_select);
} else {
/*
* We don't need to check the return code of this as the rc we
* are sending in is the rc we care about.
*/
(void)_select_nodes_qos(&job_node_select);
}
rc = job_node_select.rc;
best_rc = job_node_select.rc_best;
part_limits_rc = job_node_select.rc_part_limits;
if (best_rc != -1)
rc = best_rc;
else if (part_limits_rc == WAIT_PART_DOWN)
rc = ESLURM_PARTITION_DOWN;
if (rc == ESLURM_NODES_BUSY)
job_ptr->state_reason = WAIT_RESOURCES;
else if ((rc == ESLURM_RESERVATION_BUSY) ||
(rc == ESLURM_RESERVATION_NOT_USABLE))
job_ptr->state_reason = WAIT_RESERVATION;
else if (rc == ESLURM_JOB_HELD)
/* Do not reset the state_reason field here. select_nodes()
* already set the state_reason field, and this error code
* does not distinguish between user and admin holds. */
;
else if (rc == ESLURM_NODE_NOT_AVAIL)
job_ptr->state_reason = WAIT_NODE_NOT_AVAIL;
else if (rc == ESLURM_QOS_THRES)
job_ptr->state_reason = WAIT_QOS_THRES;
else if (rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)
job_ptr->state_reason = WAIT_PART_CONFIG;
else if (rc == ESLURM_BURST_BUFFER_WAIT)
job_ptr->state_reason = WAIT_BURST_BUFFER_RESOURCE;
else if (rc == ESLURM_PARTITION_DOWN)
job_ptr->state_reason = WAIT_PART_DOWN;
else if (rc == ESLURM_INVALID_QOS)
job_ptr->state_reason = FAIL_QOS;
else if (rc == ESLURM_INVALID_ACCOUNT)
job_ptr->state_reason = FAIL_ACCOUNT;
return rc;
}
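/*
* If the job has a deadline set, wake the scheduler (so the deadline is
* evaluated promptly) and return true.
*/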
static inline bool _has_deadline(job_record_t *job_ptr)
{
if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) {
queue_job_scheduler();
return true;
}
return false;
}
/*
* job_allocate - create job_records for the supplied job specification and
* allocate nodes for it.
* IN job_desc - job specifications
* IN immediate - if set then either initiate the job immediately or fail
* IN will_run - don't initiate the job if set, just test if it could run
* now or later
* OUT resp - will run response (includes start location, time, etc.)
* IN allocate - resource allocation request only if set, batch job if zero
 * IN submit_uid - uid of the user issuing the request
 * IN cron - true if the job was submitted through scrontab
* OUT job_pptr - set to pointer to job record
* OUT err_msg - Custom error message to the user, caller to xfree results
* IN protocol_version - version of the code the caller is using
* RET 0 or an error code. If the job would only be able to execute with
* some change in partition configuration then
* ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
* globals: job_list - pointer to global job list
* list_part - global list of partition info
* default_part_loc - pointer to default partition
*/
extern int job_allocate(job_desc_msg_t *job_desc, int immediate,
int will_run, will_run_response_msg_t **resp,
int allocate, uid_t submit_uid, bool cron,
job_record_t **job_pptr, char **err_msg,
uint16_t protocol_version)
{
static time_t sched_update = 0;
static bool defer_batch = false, defer_sched = false;
static bool ignore_prefer_val = false, ignore_constraint_val = false;
int error_code, i;
bool no_alloc, top_prio, test_only, too_fragmented, independent;
job_record_t *job_ptr;
time_t now = time(NULL);
bool held_user = false;
bool defer_this = false;
xassert(verify_lock(CONF_LOCK, READ_LOCK));
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
xassert(verify_lock(PART_LOCK, READ_LOCK));
if (sched_update != slurm_conf.last_update) {
char *tmp_ptr;
sched_update = slurm_conf.last_update;
defer_batch = defer_sched = false;
if (xstrcasestr(slurm_conf.sched_params, "defer_batch"))
defer_batch = true;
else if (xstrcasestr(slurm_conf.sched_params, "defer"))
defer_sched = true;
if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params,
"delay_boot="))) {
char *tmp_comma;
if ((tmp_comma = xstrstr(tmp_ptr, ",")))
*tmp_comma = '\0';
i = time_str2secs(tmp_ptr + 11);
if (i != NO_VAL)
delay_boot = i;
if (tmp_comma)
*tmp_comma = ',';
}
bf_min_age_reserve = 0;
if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params,
"bf_min_age_reserve="))) {
int min_age = atoi(tmp_ptr + 19);
if (min_age > 0)
bf_min_age_reserve = min_age;
}
if (xstrcasestr(slurm_conf.sched_params, "allow_zero_lic"))
validate_cfgd_licenses = false;
if (xstrcasestr(slurm_conf.sched_params,
"ignore_prefer_validation"))
ignore_prefer_val = true;
else
ignore_prefer_val = false;
if (xstrcasestr(slurm_conf.sched_params,
"ignore_constraint_validation"))
ignore_constraint_val = true;
else
ignore_constraint_val = false;
}
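	/*
	 * Illustrative (hypothetical) slurm.conf setting exercising the
	 * options parsed above:
	 *   SchedulerParameters=defer_batch,delay_boot=10:00,bf_min_age_reserve=300
	 * With that line, only batch jobs are deferred, delay_boot becomes
	 * 600 seconds (time_str2secs("10:00")), and bf_min_age_reserve
	 * becomes 300.
	 */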
if (job_desc->array_bitmap)
i = bit_set_count(job_desc->array_bitmap);
else
i = 1;
if ((job_count + i) > slurm_conf.max_job_cnt) {
error("%s: MaxJobCount limit from slurm.conf reached (%u)",
__func__, slurm_conf.max_job_cnt);
return EAGAIN;
}
error_code = _job_create(job_desc, allocate, will_run, cron,
&job_ptr, submit_uid, err_msg,
protocol_version);
*job_pptr = job_ptr;
if (error_code) {
if (job_ptr && (immediate || will_run)) {
/* this should never really happen here */
job_state_set(job_ptr, JOB_FAILED);
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
xfree(job_ptr->state_desc);
job_ptr->start_time = job_ptr->end_time = now;
job_completion_logger(job_ptr, false);
error("%s: setting %pJ to \"%s\"",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason));
}
return error_code;
}
xassert(job_ptr);
if (job_desc->array_bitmap)
independent = false;
else
independent = job_independent(job_ptr);
/*
* priority needs to be calculated after this since we set a
* begin time in job_independent and that lets us know if the
* job is eligible.
*/
if (job_ptr->priority == NO_VAL)
set_job_prio(job_ptr);
if (job_ptr->state_reason == WAIT_HELD_USER)
held_user = true;
/* Avoid resource fragmentation if important */
if ((submit_uid || (job_desc->req_nodes == NULL)) &&
independent && job_is_completing(NULL))
too_fragmented = true; /* Don't pick nodes for job now */
/*
* FIXME: Ideally we only want to refuse the request if the
* required node list is insufficient to satisfy the job's
* processor or node count requirements, but the overhead is
* rather high to do that right here. We let requests from
* user root proceed if a node list is specified, for
* meta-schedulers (e.g. Maui, Moab, etc.).
*/
else
too_fragmented = false;
defer_this = defer_sched || (defer_batch && job_ptr->batch_flag);
if (independent && (!too_fragmented) && !defer_this)
top_prio = _top_priority(job_ptr, job_desc->het_job_offset);
else
top_prio = true; /* don't bother testing,
				 * it is not runnable anyway */
if (immediate &&
(too_fragmented || (!top_prio) || (!independent) || defer_this)) {
job_state_set(job_ptr, JOB_FAILED);
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
xfree(job_ptr->state_desc);
job_ptr->start_time = job_ptr->end_time = now;
job_completion_logger(job_ptr, false);
if (!independent) {
debug2("%s: setting %pJ to \"%s\" due to dependency (%s)",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason),
slurm_strerror(ESLURM_DEPENDENCY));
return ESLURM_DEPENDENCY;
}
else if (too_fragmented) {
debug2("%s: setting %pJ to \"%s\" due to fragmentation (%s)",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason),
slurm_strerror(ESLURM_FRAGMENTATION));
return ESLURM_FRAGMENTATION;
}
else if (!top_prio) {
debug2("%s: setting %pJ to \"%s\" because it's not top priority (%s)",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason),
slurm_strerror(ESLURM_NOT_TOP_PRIORITY));
return ESLURM_NOT_TOP_PRIORITY;
} else {
job_ptr->state_reason = FAIL_DEFER;
debug2("%s: setting %pJ to \"%s\" due to SchedulerParameters=defer (%s)",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason),
slurm_strerror(ESLURM_DEFER));
return ESLURM_DEFER;
}
}
if (will_run && resp) {
int rc;
rc = job_start_data(job_ptr, resp);
job_state_set(job_ptr, JOB_FAILED);
job_ptr->exit_code = 1;
job_ptr->start_time = job_ptr->end_time = now;
purge_job_record(job_ptr->job_id);
return rc;
}
/*
	 * We should have a job_ptr->details here; if not, something is
	 * really wrong.
*/
xassert(job_ptr->details);
/*
* fed jobs need to go to the siblings first so don't attempt to
* schedule the job now.
*/
test_only = will_run || job_ptr->deadline || (allocate == 0) ||
job_ptr->fed_details;
no_alloc = test_only || too_fragmented || _has_deadline(job_ptr) ||
(!top_prio) || (!independent) ||
(job_desc->het_job_offset != NO_VAL) || defer_this ||
(job_ptr->details->prefer && ignore_prefer_val) ||
(job_ptr->details->features && ignore_constraint_val);
no_alloc = no_alloc || (bb_g_job_test_stage_in(job_ptr, no_alloc) != 1);
no_alloc = no_alloc || (!job_ptr->resv_name &&
get_magnetic_resv_count());
/*
* If we have a prefer feature list check that, if not check the
* normal features.
*/
if (job_ptr->details->prefer && !ignore_prefer_val) {
job_ptr->details->features_use = job_ptr->details->prefer;
job_ptr->details->feature_list_use =
job_ptr->details->prefer_list;
} else if (!ignore_constraint_val) {
job_ptr->details->features_use = job_ptr->details->features;
job_ptr->details->feature_list_use =
job_ptr->details->feature_list;
} else {
/*
* Set features_use to "" because ignore_constraint_val is set.
* We also set no_alloc to true to avoid actually allocating
* with this setup.
* We are using an empty string rather than NULL because
* valid_feature_counts() will use features rather than
* features_use if it is NULL.
*/
job_ptr->details->features_use = "";
job_ptr->details->feature_list_use = NULL;
}
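	/*
	 * Illustrative (hypothetical) request: for a job submitted with
	 * --prefer=nvme and --constraint=gpu, this pass runs with
	 * features_use = "nvme" (the prefer list); the required feature
	 * "gpu" is only selected here when no prefer list exists.
	 */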
error_code = _select_nodes_parts(job_ptr, no_alloc, err_msg);
set_job_features_use(job_ptr->details);
if (!test_only) {
last_job_update = now;
}
if (held_user)
job_ptr->state_reason = WAIT_HELD_USER;
/*
	 * _create_job_array() is called here, when a job array is
	 * submitted, because we want to know the array task count when
	 * we check the job against QOS/Assoc limits.
*/
_create_job_array(job_ptr, job_desc);
slurmctld_diag_stats.jobs_submitted +=
(job_ptr->array_recs && job_ptr->array_recs->task_cnt) ?
job_ptr->array_recs->task_cnt : 1;
acct_policy_add_job_submit(job_ptr, false);
/*
* This only needs to happen if the job didn't schedule immediately.
* select_nodes() can start it if there are nodes available, but if
	 * that didn't happen, send the start record now.
*/
if (!IS_JOB_IN_DB(job_ptr))
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
if ((error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) &&
(slurm_conf.enforce_part_limits != PARTITION_ENFORCE_NONE))
; /* Reject job submission */
else if ((error_code == ESLURM_NODES_BUSY) ||
(error_code == ESLURM_RESERVATION_BUSY) ||
(error_code == ESLURM_JOB_HELD) ||
(error_code == ESLURM_NODE_NOT_AVAIL) ||
(error_code == ESLURM_QOS_THRES) ||
(error_code == ESLURM_ACCOUNTING_POLICY) ||
(error_code == ESLURM_RESERVATION_NOT_USABLE) ||
(error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) ||
(error_code == ESLURM_BURST_BUFFER_WAIT) ||
(error_code == ESLURM_PARTITION_DOWN) ||
(error_code == ESLURM_LICENSES_UNAVAILABLE) ||
(error_code == ESLURM_PORTS_BUSY) ||
((error_code == ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) &&
(job_ptr->state_reason == FAIL_CONSTRAINTS))) {
/*
* Non-fatal error, but job can't be scheduled right now.
*
* Note: Keep list in sync with nonfatal_errors[] in
* openapi/slurmctld.
*/
if (immediate) {
job_state_set(job_ptr, JOB_FAILED);
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
xfree(job_ptr->state_desc);
job_ptr->start_time = job_ptr->end_time = now;
job_completion_logger(job_ptr, false);
debug2("%s: setting %pJ to \"%s\" because it cannot be immediately allocated (%s)",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason),
slurm_strerror(error_code));
} else { /* job remains queued */
if ((error_code == ESLURM_NODES_BUSY) ||
(error_code == ESLURM_BURST_BUFFER_WAIT) ||
(error_code == ESLURM_RESERVATION_BUSY) ||
(error_code == ESLURM_ACCOUNTING_POLICY) ||
(error_code == ESLURM_PORTS_BUSY) ||
((error_code == ESLURM_PARTITION_DOWN) &&
(job_ptr->batch_flag))) {
job_ptr->details->features_use = NULL;
job_ptr->details->feature_list_use = NULL;
error_code = SLURM_SUCCESS;
}
}
return error_code;
}
if (error_code) { /* fundamental flaw in job request */
job_state_set(job_ptr, JOB_FAILED);
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_BAD_CONSTRAINTS;
xfree(job_ptr->state_desc);
job_ptr->start_time = job_ptr->end_time = now;
job_completion_logger(job_ptr, false);
debug2("%s: setting %pJ to \"%s\" due to a flaw in the job request (%s)",
__func__, job_ptr,
job_state_reason_string(job_ptr->state_reason),
slurm_strerror(error_code));
return error_code;
}
if (will_run) { /* job would run, flag job destruction */
job_state_set(job_ptr, JOB_FAILED);
job_ptr->exit_code = 1;
job_ptr->start_time = job_ptr->end_time = now;
purge_job_record(job_ptr->job_id);
}
if (!will_run) {
sched_debug2("%pJ allocated resources: NodeList=%s",
job_ptr, job_ptr->nodes);
rebuild_job_part_list(job_ptr);
}
return SLURM_SUCCESS;
}
/*
 * _job_fail - terminate a job due to initiation failure
* IN job_ptr - Pointer to job to be killed
* IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.)
* RET 0 on success, otherwise ESLURM error code
*/
static int _job_fail(job_record_t *job_ptr, uint32_t job_state)
{
time_t now = time(NULL);
bool suspended = false;
if (IS_JOB_FINISHED(job_ptr))
return ESLURM_ALREADY_DONE;
if (IS_JOB_SUSPENDED(job_ptr)) {
uint32_t suspend_job_state = job_ptr->job_state;
/*
* we can't have it as suspended when we call the
* accounting stuff.
*/
job_state_set(job_ptr, JOB_CANCELLED);
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
job_state_set(job_ptr, suspend_job_state);
suspended = true;
}
if (IS_JOB_CONFIGURING(job_ptr) || IS_JOB_RUNNING(job_ptr) ||
suspended) {
/* No need to signal steps, deallocate kills them */
job_ptr->time_last_active = now;
if (suspended) {
job_ptr->end_time = job_ptr->suspend_time;
job_ptr->tot_sus_time +=
difftime(now, job_ptr->suspend_time);
} else
job_ptr->end_time = now;
last_job_update = now;
job_state_set(job_ptr, (job_state | JOB_COMPLETING));
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_LAUNCH;
xfree(job_ptr->state_desc);
job_completion_logger(job_ptr, false);
if (job_ptr->node_bitmap) {
build_cg_bitmap(job_ptr);
deallocate_nodes(job_ptr, false, suspended, false);
}
return SLURM_SUCCESS;
}
/* All other states */
verbose("job_fail: %pJ can't be killed from state=%s",
job_ptr, job_state_string(job_ptr->job_state));
return ESLURM_TRANSITION_STATE_NO_UPDATE;
}
/*
* IN signal_args - Append the response to signal_args->responses.
 * IN sibling_name - If set, then this identifies the sibling cluster that the
 *	job is running on or originated from.
 * IN error_code - Error code to use in the response.
 * IN err_msg - If set, use this as the response error message.
 * IN id - Identifier for the job. This id is different from the actual job id
 *	if the job is an array task or a het job component that is not the
 *	het job leader.
* IN real_job_id - The real job id or NO_VAL
*/
static void _add_signal_job_resp(signal_jobs_args_t *signal_args,
char *sibling_name, int error_code,
char *err_msg, slurm_selected_step_t *id,
uint32_t real_job_id)
{
kill_jobs_resp_job_t *job_resp = xmalloc(sizeof(*job_resp));
job_resp->error_code = error_code;
if (err_msg)
job_resp->error_msg = err_msg;
else if (error_code != SLURM_SUCCESS)
job_resp->error_msg = xstrdup(slurm_strerror(error_code));
job_resp->id = xmalloc(sizeof(*job_resp->id));
memcpy(job_resp->id, id, sizeof(*id));
/* Full copy job_resp->id->array_bitmap */
if (id->array_bitmap)
job_resp->id->array_bitmap = bit_copy(id->array_bitmap);
job_resp->real_job_id = real_job_id;
job_resp->sibling_name = sibling_name;
list_append(signal_args->responses, job_resp);
}
static int _match_part_name(void *x, void *key)
{
part_record_t *part_ptr = x;
char *part_name = key;
if (!xstrcmp(part_ptr->name, part_name))
return 1;
return 0;
}
static int _match_resv_name(void *x, void *key)
{
slurmctld_resv_t *resv_ptr = x;
char *resv_name = key;
if (!xstrcmp(resv_ptr->name, resv_name))
return 1;
return 0;
}
static void _slurm_selected_step_init(job_record_t *job_ptr,
slurm_selected_step_t *id)
{
xassert(job_ptr);
id->array_bitmap = NULL;
id->array_task_id = job_ptr->array_task_id;
if (job_ptr->array_task_id != NO_VAL)
id->step_id.job_id = job_ptr->array_job_id;
else if (job_ptr->het_job_offset)
id->step_id.job_id = job_ptr->het_job_id;
else
id->step_id.job_id = job_ptr->job_id;
if (job_ptr->het_job_offset)
id->het_job_offset = job_ptr->het_job_offset;
else
id->het_job_offset = NO_VAL;
id->step_id.step_het_comp = NO_VAL;
id->step_id.step_id = NO_VAL;
}
static void _handle_signal_filter_mismatch(job_record_t *job_ptr,
signal_jobs_args_t *signal_args,
uint32_t error_code,
char *filter_err_msg)
{
slurm_selected_step_t id;
char *err_msg = NULL;
/*
* If the job is revoked on this cluster and started on a sibling, the
* revoked job's state, reservation, and partition will not necessarily
* match the other cluster, and the other cluster has the cluster lock
* for this job. For example, this job's state is 0+REVOKED and the job
* state on the other cluster could be suspended, running, etc.
* In that case, always send a response back to the client that we
* could not signal the job.
*/
if (fed_mgr_fed_rec && fed_mgr_job_started_on_sib(job_ptr)) {
char *sib_name;
sib_name = fed_mgr_get_cluster_name(
job_ptr->fed_details->cluster_lock);
err_msg = xstrdup_printf("Job started on sibling cluster %s: %s",
sib_name, slurm_strerror(error_code));
_slurm_selected_step_init(job_ptr, &id);
_add_signal_job_resp(signal_args, sib_name, error_code,
err_msg, &id, job_ptr->job_id);
/* sib_name is added to the job_resp, do not free */
return;
}
if (!signal_args->filter_specific_job_ids)
return;
if (filter_err_msg)
err_msg = xstrdup_printf("%s: %s",
filter_err_msg,
slurm_strerror(error_code));
else
err_msg = xstrdup_printf("%s", slurm_strerror(error_code));
_slurm_selected_step_init(job_ptr, &id);
_add_signal_job_resp(signal_args, NULL, error_code,
err_msg, &id, job_ptr->job_id);
}
static bool _signal_job_matches_filter(job_record_t *job_ptr,
signal_jobs_args_t *signal_args)
{
bool matches_filter = true;
int error_code = ESLURM_JOB_SIGNAL_FAILED;
uint32_t job_base_state = job_ptr->job_state & JOB_STATE_BASE;
char *filter_err_msg = NULL;
kill_jobs_msg_t *kill_msg = signal_args->kill_msg;
if (IS_JOB_FINISHED(job_ptr)) {
error_code = ESLURM_ALREADY_DONE;
matches_filter = false;
goto fini;
}
if (kill_msg->account && xstrcmp(job_ptr->account, kill_msg->account)) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg = xstrdup_printf("Job account %s != filter account %s",
job_ptr->account,
kill_msg->account);
}
matches_filter = false;
goto fini;
}
if (kill_msg->job_name && xstrcmp(job_ptr->name, kill_msg->job_name)) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg = xstrdup_printf("Job name %s != filter name %s",
job_ptr->name,
kill_msg->job_name);
}
matches_filter = false;
goto fini;
}
/*
* If the job is submitted to multiple partitions, then its partition
	 * string is all the partitions. If the job is still pending, we need
	 * to check whether the requested partition matches any of the
	 * partitions that the job was submitted to. If the job is running,
	 * only check the partition the job is running in.
*/
if (kill_msg->partition) {
if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr_list) {
if (!list_find_first(job_ptr->part_ptr_list,
_match_part_name,
kill_msg->partition))
matches_filter = false;
} else if (job_ptr->part_ptr) {
if (xstrcmp(job_ptr->part_ptr->name,
kill_msg->partition))
matches_filter = false;
} else {
if (xstrcmp(job_ptr->partition, kill_msg->partition))
matches_filter = false;
}
if (!matches_filter) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg =
xstrdup_printf("Job partition %s does not include filter partition %s",
job_ptr->partition,
kill_msg->partition);
}
goto fini;
}
}
if (kill_msg->qos) {
char *qos_name = "NULL";
if (!job_ptr->qos_ptr)
matches_filter = false;
else if (xstrcmp(job_ptr->qos_ptr->name, kill_msg->qos)) {
matches_filter = false;
qos_name = job_ptr->qos_ptr->name;
}
if (!matches_filter) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg = xstrdup_printf("Job qos %s != filter qos %s",
qos_name,
kill_msg->qos);
}
goto fini;
}
}
if (kill_msg->reservation) {
if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) {
slurmctld_resv_t *resv_ptr =
find_resv_name(kill_msg->reservation);
if (!(resv_ptr &&
(resv_ptr->resv_id == job_ptr->resv_id)))
matches_filter = false;
} else if (job_ptr->resv_list) {
if (!list_find_first(job_ptr->resv_list,
_match_resv_name,
kill_msg->reservation))
matches_filter = false;
} else if (job_ptr->resv_ptr) {
if (xstrcmp(job_ptr->resv_ptr->name,
kill_msg->reservation))
matches_filter = false;
} else {
if (xstrcmp(job_ptr->resv_name, kill_msg->reservation))
matches_filter = false;
}
if (!matches_filter) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg =
xstrdup_printf("Job reservation %s does not include filter reservation %s",
job_ptr->resv_name,
kill_msg->reservation);
}
goto fini;
}
}
if ((kill_msg->state != JOB_END) &&
(job_base_state != kill_msg->state)) {
if (signal_args->filter_specific_job_ids) {
char *msg_state_str = job_state_string(kill_msg->state);
char *job_state_str = job_state_string(job_base_state);
filter_err_msg = xstrdup_printf("Job state %s != filter state %s",
job_state_str,
msg_state_str);
}
matches_filter = false;
goto fini;
}
if (kill_msg->user_name && (job_ptr->user_id != kill_msg->user_id)) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg = xstrdup_printf("Job user id %u != filter user id %u",
job_ptr->user_id,
kill_msg->user_id);
}
matches_filter = false;
goto fini;
}
if (kill_msg->nodelist) {
hostset_t *hs;
bool intersects;
if (!job_ptr->nodes) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg =
xstrdup_printf("Job does not have nodes but filter has nodes %s",
kill_msg->nodelist);
}
matches_filter = false;
goto fini;
}
hs = hostset_create(job_ptr->nodes);
intersects = hostset_intersects(hs, kill_msg->nodelist);
hostset_destroy(hs);
if (!intersects) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg =
xstrdup_printf("Job nodes %s does not intersect with filter nodes %s",
job_ptr->nodes,
kill_msg->nodelist);
}
matches_filter = false;
goto fini;
}
}
if (kill_msg->wckey) {
char *job_key = job_ptr->wckey;
/*
* A wckey that begins with '*' indicates that the wckey
* was applied by default. When the --wckey option does
* not begin with a '*', act on all wckeys with the same
* name, default or not.
*/
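		/*
		 * Example (hypothetical values): a job whose wckey is
		 * "*proj" (default-assigned) matches a filter of "proj",
		 * since the leading '*' is skipped below, while a filter
		 * of "*proj" only matches jobs whose wckey is literally
		 * "*proj".
		 */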
if ((kill_msg->wckey[0] != '*') && job_key &&
(job_key[0] == '*'))
job_key++;
if (xstrcmp(job_key, kill_msg->wckey)) {
if (signal_args->filter_specific_job_ids) {
filter_err_msg =
xstrdup_printf("Job wckey %s != filter wckey %s",
job_ptr->wckey,
kill_msg->wckey);
}
matches_filter = false;
goto fini;
}
}
if (job_ptr->het_job_offset) {
if (signal_args->het_leader &&
signal_args->het_leader->job_id &&
(job_ptr->het_job_id ==
signal_args->het_leader->het_job_id)) {
/*
* Filter out HetJob non-leader component as its leader
* should have already been evaluated and hasn't been
* filtered out.
*
* The leader RPC signal handler will affect all the
* components, so this avoids extra unneeded RPCs, races
* and issues interpreting multiple error codes.
*
* This can be done assuming the walking of the loaded
* jobs is guaranteed to evaluate in an order such that
* HetJob leaders are evaluated before their matching
* non-leaders and the whole HetJob is evaluated
* contiguously. The slurmctld job_list is ordered by
* job creation time (always leader first) and HetJobs
* are created in a row.
*/
return false;
}
/*
* Het job components may not be signalled individually if they
* are pending or if whole_hetjob is set.
*/
if (IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_NOT_WHOLE_HET_JOB;
if (signal_args->filter_specific_job_ids)
filter_err_msg = xstrdup("Het job component cannot be signalled while pending");
goto fini;
}
if (_get_whole_hetjob()) {
error_code = ESLURM_NOT_WHOLE_HET_JOB;
if (signal_args->filter_specific_job_ids)
filter_err_msg = xstrdup("slurm.conf whole_hetjob is set");
goto fini;
}
}
fini:
if (!matches_filter)
_handle_signal_filter_mismatch(job_ptr, signal_args,
error_code, filter_err_msg);
else {
/* Track most recent het leader. */
if (job_ptr->het_job_id && !job_ptr->het_job_offset)
signal_args->het_leader = job_ptr;
}
xfree(filter_err_msg);
return matches_filter;
}
/*
* Figure out if the job (job_ptr) matches the specified filters:
* - filter_id describes a job or set of jobs if it is an array expression.
* - signal_args->kill_msg has filters requested by the client.
*
* If the job does not match the specified filters in signal_args, then
* _signal_job_matches_filter() adds a response message for the job and we
* return.
*
* If the job matches the specified filters, but the user is not authorized to
* signal the job, add a response message and return.
*
* If the job matches the specified filters and the user is authorized to signal
* the job, place the job into the appropriate list of jobs which will later be
* signaled. The lists are in signal_args.
* - pending_array_task_list: A meta record with pending array tasks that are
* requested to be signaled, or a single pending array task that has not yet
* been split from the meta record.
* - array_leader_list - A meta record for an array where that entire array has
* been requested to be signaled.
* - other_job_list - All other jobs to be signaled.
*/
static void _apply_signal_jobs_filter(job_record_t *job_ptr,
slurm_selected_step_t *filter_id,
signal_jobs_args_t *signal_args)
{
bool is_pending_meta_record_with_tasks;
uid_t auth_uid = signal_args->auth_uid;
if (!_signal_job_matches_filter(job_ptr, signal_args))
return;
/* Verify that the user can kill the requested job */
if ((job_ptr->user_id != auth_uid) &&
!validate_operator_locked(auth_uid) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, auth_uid,
job_ptr->account, true)) {
slurm_selected_step_t *use_id;
slurm_selected_step_t id;
if (filter_id)
use_id = filter_id;
else {
_slurm_selected_step_init(job_ptr, &id);
use_id = &id;
}
_add_signal_job_resp(signal_args, NULL, ESLURM_ACCESS_DENIED,
NULL, use_id, job_ptr->job_id);
return;
}
is_pending_meta_record_with_tasks = (IS_JOB_PENDING(job_ptr) &&
job_ptr->array_recs &&
job_ptr->array_recs->task_cnt);
if (filter_id && !filter_id->array_bitmap &&
(filter_id->array_task_id != NO_VAL) &&
is_pending_meta_record_with_tasks) {
/*
* A pending job array task that has not been split from the
* meta array record.
*/
array_task_filter_t *atf = xmalloc(sizeof(*atf));
/* Copy filter_id, but use a new array_bitmap */
atf->filter_id = xmalloc(sizeof(*atf->filter_id));
memcpy(atf->filter_id, filter_id, sizeof(*filter_id));
atf->filter_id->array_bitmap = bit_alloc(max_array_size);
bit_set(atf->filter_id->array_bitmap, filter_id->array_task_id);
atf->free_array_bitmap = true;
atf->job_ptr = job_ptr;
list_append(signal_args->pending_array_task_list, atf);
} else if (filter_id && filter_id->array_bitmap &&
is_pending_meta_record_with_tasks) {
/* A job array expression with pending array tasks */
array_task_filter_t *atf = xmalloc(sizeof(*atf));
atf->filter_id = xmalloc(sizeof(*atf->filter_id));
memcpy(atf->filter_id, filter_id, sizeof(*filter_id));
atf->job_ptr = job_ptr;
list_append(signal_args->pending_array_task_list, atf);
} else if (job_ptr->array_recs)
list_append(signal_args->array_leader_list, job_ptr);
else
list_append(signal_args->other_job_list, job_ptr);
}
static int _foreach_filter_job_list(void *x, void *arg)
{
job_record_t *job_ptr = x;
signal_jobs_args_t *signal_args = arg;
_apply_signal_jobs_filter(job_ptr, NULL, signal_args);
return SLURM_SUCCESS;
}
static int _foreach_signal_job(void *x, void *arg)
{
int error_code;
job_record_t *job_ptr = x;
signal_jobs_args_t *signal_args = arg;
kill_jobs_msg_t *kill_msg = signal_args->kill_msg;
if (job_ptr->het_job_list)
error_code = het_job_signal(job_ptr, kill_msg->signal,
kill_msg->flags,
signal_args->auth_uid, 0);
else
error_code = job_signal(job_ptr, kill_msg->signal,
kill_msg->flags,
signal_args->auth_uid, 0);
if (error_code || (kill_msg->flags & KILL_JOBS_VERBOSE)) {
slurm_selected_step_t id;
_slurm_selected_step_init(job_ptr, &id);
_add_signal_job_resp(signal_args, NULL, error_code, NULL, &id,
job_ptr->job_id);
}
return SLURM_SUCCESS;
}
static int _foreach_signal_job_array_tasks(void *x, void *arg)
{
array_task_filter_t *atf = x;
signal_jobs_args_t *signal_args = arg;
kill_jobs_msg_t *kill_msg = signal_args->kill_msg;
int32_t i_last;
int error_code = SLURM_SUCCESS;
/*
* Signal the pending array tasks in the array job. The tasks that
* have already been split out are not part of the meta job's array
* bitmap and are handled elsewhere.
*
* _signal_pending_job_array_tasks() removes the pending tasks from
	 * array_bitmap. For the response to the client, we want the pending
* tasks that were signalled. To get that, operate on a copy of
* array_bitmap which will be returned with the running tasks. Then
* remove the running tasks from the original bitmap (bit_and_not).
*/
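	/*
	 * Worked example (hypothetical task ids): filter bitmap {1,2,3,4}
	 * with tasks {2,4} still pending in the meta record. The helper
	 * clears the pending tasks from the copy, so the copy comes back as
	 * {1,3} (already-split/running tasks). bit_and_not() then leaves
	 * {2,4} in the original bitmap: exactly the pending tasks that were
	 * signalled, which is what the client response reports.
	 */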
i_last = bit_fls(atf->filter_id->array_bitmap);
if (i_last >= 0) {
bitstr_t *array_bitmap_running =
bit_copy(atf->filter_id->array_bitmap);
_signal_pending_job_array_tasks(atf->job_ptr,
&array_bitmap_running,
kill_msg->signal,
signal_args->auth_uid,
i_last, signal_args->now,
&error_code);
bit_and_not(atf->filter_id->array_bitmap, array_bitmap_running);
FREE_NULL_BITMAP(array_bitmap_running);
}
if (error_code || (kill_msg->flags & KILL_JOBS_VERBOSE))
_add_signal_job_resp(signal_args, NULL, error_code, NULL,
atf->filter_id, atf->job_ptr->job_id);
return 0;
}
static foreach_job_by_id_control_t _job_not_found(const slurm_selected_step_t
*id,
void *arg)
{
signal_jobs_args_t *signal_args = arg;
uint32_t job_id = id->step_id.job_id;
if (fed_mgr_fed_rec && !fed_mgr_is_origin_job_id(job_id)) {
int error_code = ESLURM_JOB_SIGNAL_FAILED;
char *err_msg = NULL;
err_msg = xstrdup_printf("Job id not in federation: %s",
slurm_strerror(error_code));
_add_signal_job_resp(signal_args, NULL, error_code,
err_msg, (slurm_selected_step_t *) id,
NO_VAL);
} else {
_add_signal_job_resp(signal_args, NULL, ESLURM_INVALID_JOB_ID,
NULL, (slurm_selected_step_t *) id,
NO_VAL);
}
return FOR_EACH_JOB_BY_ID_EACH_CONT;
}
static foreach_job_by_id_control_t _filter_job(job_record_t *job_ptr,
const slurm_selected_step_t *id,
void *arg)
{
_apply_signal_jobs_filter(job_ptr, (slurm_selected_step_t *) id, arg);
return FOR_EACH_JOB_BY_ID_EACH_CONT;
}
static void _filter_jobs_ids(slurm_selected_step_t **job_ids, uint32_t cnt,
signal_jobs_args_t *signal_args)
{
signal_args->filter_specific_job_ids = true;
for (int i = 0; i < cnt; i++) {
slurm_selected_step_t *filter = job_ids[i];
uint32_t job_id = filter->step_id.job_id;
int rc;
if (fed_mgr_cluster_rec && !fed_mgr_is_job_id_in_fed(job_id)) {
rc = ESLURM_JOB_NOT_FEDERATED;
_add_signal_job_resp(signal_args, NULL, rc, NULL,
filter, NO_VAL);
continue;
}
(void) foreach_job_by_id(filter, _filter_job, _job_not_found,
signal_args);
}
}
static int _foreach_xfer_responses(void *x, void *arg)
{
kill_jobs_resp_job_t *job_resp = x;
xfer_signal_jobs_responses_args_t *args = arg;
memcpy(&args->resp_msg->job_responses[args->curr_count], job_resp,
sizeof(*job_resp));
/*
* Pointers in job_resp were transferred and will be free'd with
* job_responses
*/
xfree(job_resp);
args->curr_count++;
return SLURM_SUCCESS;
}
static void _build_kill_jobs_resp_msg(signal_jobs_args_t *signal_args,
kill_jobs_resp_msg_t **resp_msg_p)
{
kill_jobs_resp_msg_t *resp_msg = xmalloc(sizeof(*resp_msg));
xfer_signal_jobs_responses_args_t foreach_args = {
.resp_msg = resp_msg,
};
*resp_msg_p = resp_msg;
resp_msg->jobs_cnt = list_count(signal_args->responses);
if (!resp_msg->jobs_cnt)
return;
resp_msg->job_responses = xcalloc(resp_msg->jobs_cnt,
sizeof(*resp_msg->job_responses));
list_for_each(signal_args->responses, _foreach_xfer_responses,
&foreach_args);
}
/*
* Signal a job based upon job pointer.
* Authentication and authorization checks must be performed before calling.
*/
extern int job_signal(job_record_t *job_ptr, uint16_t signal,
uint16_t flags, uid_t uid, bool preempt)
{
uint16_t job_term_state;
time_t now = time(NULL);
log_flag(TRACE_JOBS, "%s: enter %pJ", __func__, job_ptr);
if (IS_JOB_STAGE_OUT(job_ptr) && (flags & KILL_HURRY)) {
job_ptr->bit_flags |= JOB_KILL_HURRY;
return bb_g_job_cancel(job_ptr);
}
if (IS_JOB_FINISHED(job_ptr))
return ESLURM_ALREADY_DONE;
/*
	 * If this is the origin job, then cancel the sibling jobs (if any
	 * exist) -- the origin job is used because it knows where the
	 * siblings are. If the job is running locally, just do the normal
	 * signaling.
*/
if (!(flags & KILL_NO_SIBS) && !IS_JOB_RUNNING(job_ptr) &&
job_ptr->fed_details && fed_mgr_fed_rec) {
uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id);
slurmdb_cluster_rec_t *origin =
fed_mgr_get_cluster_by_id(origin_id);
if (origin && (origin == fed_mgr_cluster_rec) &&
fed_mgr_job_started_on_sib(job_ptr)) {
/*
* If the job is running on a remote cluster then wait
* for the job to report back that it's completed,
* otherwise just signal the pending siblings and itself
* (by not returning).
*/
return fed_mgr_job_cancel(job_ptr, signal, flags, uid,
false);
} else if (origin && (origin == fed_mgr_cluster_rec)) {
/* cancel origin job and revoke sibling jobs */
fed_mgr_job_revoke_sibs(job_ptr);
fed_mgr_remove_remote_dependencies(job_ptr);
} else if (!origin ||
!origin->fed.send ||
!((persist_conn_t *) origin->fed.send)->tls_conn) {
/*
			 * The origin is down, so just signal all of the
			 * viable sibling jobs.
*/
fed_mgr_job_cancel(job_ptr, signal, flags, uid, true);
}
}
last_job_update = now;
/*
* Handle jobs submitted through scrontab.
*/
if (job_ptr->bit_flags & CRON_JOB) {
cron_entry_t *entry =
(cron_entry_t *) job_ptr->details->crontab_entry;
/*
* The KILL_CRON flag being set here is indicating that the
* user has specifically requested killing scrontab jobs. To
* avoid interfering with other possible ways of killing jobs,
* the KILL_CRON flag being set must mean that killing cron
* jobs is permitted.
*/
if (xstrcasestr(slurm_conf.scron_params, "explicit_scancel") &&
!(flags & KILL_CRON))
return ESLURM_CANNOT_CANCEL_CRON_JOB;
job_ptr->bit_flags &= ~CRON_JOB;
error("cancelling cron job, lines %u %u",
entry->line_start, entry->line_end);
crontab_add_disabled_lines(job_ptr->user_id, entry->line_start,
entry->line_end);
}
/* save user ID of the one who requested the job be cancelled */
if (signal == SIGKILL)
job_ptr->requid = uid;
if (IS_JOB_PENDING(job_ptr) && IS_JOB_COMPLETING(job_ptr) &&
(signal == SIGKILL)) {
/* Prevent job requeue, otherwise preserve state */
job_state_set(job_ptr, (JOB_CANCELLED | JOB_COMPLETING));
/* build_cg_bitmap() not needed, job already completing */
verbose("%s: %u of requeuing %pJ successful",
__func__, signal, job_ptr);
return SLURM_SUCCESS;
}
if (flags & KILL_HURRY)
job_ptr->bit_flags |= JOB_KILL_HURRY;
if (IS_JOB_CONFIGURING(job_ptr) && (signal == SIGKILL)) {
last_job_update = now;
job_ptr->end_time = now;
job_state_set(job_ptr, (JOB_CANCELLED | JOB_COMPLETING));
if (flags & KILL_FED_REQUEUE)
job_state_set_flag(job_ptr, JOB_REQUEUE);
slurmscriptd_flush_job(job_ptr->job_id);
track_script_flush_job(job_ptr->job_id);
build_cg_bitmap(job_ptr);
job_completion_logger(job_ptr, false);
deallocate_nodes(job_ptr, false, false, false);
if (flags & KILL_FED_REQUEUE)
job_state_unset_flag(job_ptr, JOB_REQUEUE);
verbose("%s: %u of configuring %pJ successful",
__func__, signal, job_ptr);
return SLURM_SUCCESS;
}
if (IS_JOB_PENDING(job_ptr) && (signal == SIGKILL)) {
job_state_set(job_ptr, JOB_CANCELLED);
if (flags & KILL_FED_REQUEUE)
job_state_set_flag(job_ptr, JOB_REQUEUE);
job_ptr->start_time = now;
job_ptr->end_time = now;
srun_allocate_abort(job_ptr);
slurmscriptd_flush_job(job_ptr->job_id);
track_script_flush_job(job_ptr->job_id);
job_completion_logger(job_ptr, false);
if (flags & KILL_FED_REQUEUE)
job_state_unset_flag(job_ptr, JOB_REQUEUE);
/*
		 * Send back a response to the origin cluster; in other cases
		 * where the job is running, the job will send back a response
		 * after it is completed. This can happen when the pending
		 * origin job is put into a hold state and the siblings are
		 * removed, or when the job is canceled from the origin.
*/
fed_mgr_job_complete(job_ptr, 0, now);
verbose("%s: %u of pending %pJ successful",
__func__, signal, job_ptr);
return SLURM_SUCCESS;
}
if (preempt)
job_term_state = JOB_PREEMPTED;
else if (flags & KILL_FAIL_JOB)
job_term_state = JOB_FAILED;
else
job_term_state = JOB_CANCELLED;
if (IS_JOB_SUSPENDED(job_ptr) && (signal == SIGKILL)) {
last_job_update = now;
job_ptr->end_time = job_ptr->suspend_time;
job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time);
job_state_set(job_ptr, (job_term_state | JOB_COMPLETING));
if (flags & KILL_FED_REQUEUE)
job_state_set_flag(job_ptr, JOB_REQUEUE);
build_cg_bitmap(job_ptr);
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
job_completion_logger(job_ptr, false);
if (flags & KILL_FED_REQUEUE)
job_state_unset_flag(job_ptr, JOB_REQUEUE);
deallocate_nodes(job_ptr, false, true, preempt);
verbose("%s: %u of suspended %pJ successful",
__func__, signal, job_ptr);
return SLURM_SUCCESS;
}
if (IS_JOB_RUNNING(job_ptr)) {
if ((signal == SIGSTOP) || (signal == SIGCONT)) {
if (IS_JOB_SIGNALING(job_ptr)) {
verbose("%s: %u not send to %pJ 0x%x",
__func__, signal, job_ptr,
job_ptr->job_state);
return ESLURM_TRANSITION_STATE_NO_UPDATE;
}
job_state_set_flag(job_ptr, JOB_SIGNALING);
}
if ((signal == SIGKILL)
&& !(flags & KILL_STEPS_ONLY)
&& !(flags & KILL_JOB_BATCH)) {
/* No need to signal steps, deallocate kills them
*/
job_ptr->time_last_active = now;
job_ptr->end_time = now;
last_job_update = now;
job_state_set(job_ptr, (job_term_state |
JOB_COMPLETING));
if (flags & KILL_FED_REQUEUE)
job_state_set_flag(job_ptr, JOB_REQUEUE);
build_cg_bitmap(job_ptr);
job_completion_logger(job_ptr, false);
deallocate_nodes(job_ptr, false, false, preempt);
if (flags & KILL_FED_REQUEUE)
job_state_unset_flag(job_ptr, JOB_REQUEUE);
} else if (job_ptr->batch_flag && (flags & KILL_JOB_BATCH)) {
_signal_batch_job(job_ptr, signal, flags);
} else if ((flags & KILL_JOB_BATCH) && !job_ptr->batch_flag) {
if ((signal == SIGSTOP) || (signal == SIGCONT))
job_state_unset_flag(job_ptr, JOB_SIGNALING);
return ESLURM_JOB_SCRIPT_MISSING;
} else {
_signal_job(job_ptr, signal, flags);
}
verbose("%s: %u of running %pJ successful 0x%x",
__func__, signal, job_ptr, job_ptr->job_state);
return SLURM_SUCCESS;
}
verbose("%s: %pJ can't be sent signal %u from state=%s",
__func__, job_ptr, signal,
job_state_string(job_ptr->job_state));
log_flag(TRACE_JOBS, "%s: return %pJ", __func__, job_ptr);
return ESLURM_TRANSITION_STATE_NO_UPDATE;
}
static int foreach_het_job_signal(void *x, void *arg)
{
job_record_t *het_job = x;
foreach_kill_hetjob_t *foreach_kill_hetjob = arg;
if (foreach_kill_hetjob->het_job_leader->het_job_id !=
het_job->het_job_id) {
error("%s: Bad het_job_list for %pJ",
__func__, foreach_kill_hetjob->het_job_leader);
} else {
int rc1 = job_signal(het_job,
foreach_kill_hetjob->signal,
foreach_kill_hetjob->flags,
foreach_kill_hetjob->uid,
foreach_kill_hetjob->preempt);
if (rc1 != SLURM_SUCCESS)
foreach_kill_hetjob->rc = rc1;
}
return 0;
}
/* Signal all components of a hetjob */
extern int het_job_signal(job_record_t *het_job_leader, uint16_t signal,
uint16_t flags, uid_t uid, bool preempt)
{
foreach_kill_hetjob_t foreach_kill_hetjob = {
.flags = flags,
.het_job_leader = het_job_leader,
.preempt = preempt,
.rc = SLURM_SUCCESS,
.signal = signal,
.uid = uid,
};
if (!het_job_leader->het_job_id)
return ESLURM_NOT_HET_JOB;
else if (!het_job_leader->het_job_list)
return ESLURM_NOT_HET_JOB_LEADER;
(void) list_for_each(het_job_leader->het_job_list,
foreach_het_job_signal,
&foreach_kill_hetjob);
return foreach_kill_hetjob.rc;
}
/*
* Returns average pn_min_memory, considering DefMemPer{CPU,Node,GPU} from both
* the partition and cluster configuration
* WARNING: assumes memory is evenly distributed across all nodes in job,
* may return an inaccurate value if this is not the case
*/
static uint64_t _get_def_mem(part_record_t *part_ptr, uint64_t *tres_req_cnt)
{
if (part_ptr && part_ptr->def_mem_per_cpu &&
(part_ptr->def_mem_per_cpu != MEM_PER_CPU) &&
(part_ptr->def_mem_per_cpu != NO_VAL64))
return part_ptr->def_mem_per_cpu;
else if (tres_req_cnt && tres_req_cnt[TRES_ARRAY_MEM] &&
(tres_req_cnt[TRES_ARRAY_MEM] != NO_VAL64)) {
xassert(tres_req_cnt[TRES_ARRAY_NODE]);
return tres_req_cnt[TRES_ARRAY_MEM] /
tres_req_cnt[TRES_ARRAY_NODE];
} else
return slurm_conf.def_mem_per_cpu;
}
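/*
 * Worked example (hypothetical numbers) for _get_def_mem() above: with no
 * usable partition DefMemPerCPU, a job requesting tres_req_cnt[TRES_ARRAY_MEM]
 * = 8192 (MB) across tres_req_cnt[TRES_ARRAY_NODE] = 4 nodes yields
 * 8192 / 4 = 2048 MB per node; only if neither source applies is
 * slurm_conf.def_mem_per_cpu returned.
 */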
static bool _get_whole_hetjob(void)
{
static time_t sched_update = 0;
static bool whole_hetjob = false;
if (sched_update != slurm_conf.last_update) {
sched_update = slurm_conf.last_update;
if (xstrcasestr(slurm_conf.sched_params, "whole_hetjob") ||
xstrcasestr(slurm_conf.sched_params, "whole_pack"))
whole_hetjob = true;
else
whole_hetjob = false;
}
return whole_hetjob;
}
static job_record_t *_find_meta_job_record(uint32_t job_id)
{
job_record_t *job_ptr;
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
while (job_ptr) {
if (job_ptr->array_job_id == job_id)
break;
job_ptr = job_ptr->job_array_next_j;
}
}
if ((job_ptr == NULL) ||
((job_ptr->array_task_id == NO_VAL) &&
(job_ptr->array_recs == NULL)))
return NULL;
return job_ptr;
}
static void _signal_pending_job_array_tasks(job_record_t *job_ptr,
bitstr_t **array_bitmap,
uint16_t signal,
uid_t uid,
int32_t i_last,
time_t now,
int *rc)
{
int len;
xassert(job_ptr);
if (!(IS_JOB_PENDING(job_ptr) && job_ptr->array_recs &&
job_ptr->array_recs->task_id_bitmap))
return; /* No tasks to signal */
/* Ensure bitmap sizes match for AND operations */
len = bit_size(job_ptr->array_recs->task_id_bitmap);
i_last++;
if (i_last < len) {
bit_realloc(*array_bitmap, len);
} else {
bit_realloc(*array_bitmap, i_last);
bit_realloc(job_ptr->array_recs->task_id_bitmap, i_last);
}
if (signal == SIGKILL) {
uint32_t orig_task_cnt, new_task_count;
/* task_id_bitmap changes, so we need a copy of it */
bitstr_t *task_id_bitmap_orig =
bit_copy(job_ptr->array_recs->task_id_bitmap);
bit_and_not(job_ptr->array_recs->task_id_bitmap,
*array_bitmap);
xfree(job_ptr->array_recs->task_id_str);
orig_task_cnt = job_ptr->array_recs->task_cnt;
new_task_count = bit_set_count(job_ptr->array_recs->
task_id_bitmap);
if (!new_task_count) {
last_job_update = now;
job_state_set(job_ptr, JOB_CANCELLED);
job_ptr->start_time = now;
job_ptr->end_time = now;
job_ptr->requid = uid;
srun_allocate_abort(job_ptr);
job_completion_logger(job_ptr, false);
/*
* Master job record, even without tasks,
* counts as one job record
*/
job_count -= (orig_task_cnt - 1);
} else {
_job_array_comp(job_ptr, false, false);
job_count -= (orig_task_cnt - new_task_count);
/*
* Since we are altering the job array's
			 * task_cnt, we must also adjust this count in
			 * the acct_policy code as if those tasks were
			 * finishing (accrue_cnt/job_submit etc...).
*/
if (job_ptr->array_recs->task_cnt >
new_task_count) {
uint32_t tmp_state = job_ptr->job_state;
job_state_set(job_ptr, JOB_CANCELLED);
job_ptr->array_recs->task_cnt -=
new_task_count;
acct_policy_remove_job_submit(job_ptr,
false);
job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
job_state_set(job_ptr, tmp_state);
}
}
/*
* Set the task_cnt here since
* job_completion_logger needs the total
* pending count to handle the acct_policy
* limit for submitted jobs correctly.
*/
job_ptr->array_recs->task_cnt = new_task_count;
bit_and_not(*array_bitmap, task_id_bitmap_orig);
FREE_NULL_BITMAP(task_id_bitmap_orig);
} else {
bit_and_not(*array_bitmap,
job_ptr->array_recs->task_id_bitmap);
*rc = ESLURM_TRANSITION_STATE_NO_UPDATE;
}
}
/*
* job_str_signal - signal the specified job
 * IN job_id_str - id of the job to be signaled; valid formats include "#",
 *	"#_#", "#_[expr]" and "#+#" (hetjob component)
* IN signal - signal to send, SIGKILL == cancel the job
* IN flags - see KILL_JOB_* flags in slurm.h
* IN uid - uid of requesting user
* IN preempt - true if job being preempted
* RET 0 on success, otherwise ESLURM error code
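 *
 * Example (hypothetical) job_id_str values: "123" (whole job or full job
 *	array), "123_7" (single array task), "123_[1-5,7]" (array task
 *	expression), "123+2" (hetjob component at offset 2)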
*/
extern int job_str_signal(char *job_id_str, uint16_t signal, uint16_t flags,
uid_t uid, bool preempt)
{
job_record_t *job_ptr;
uint32_t job_id;
time_t now = time(NULL);
char *end_ptr = NULL;
long int long_id;
bitstr_t *array_bitmap = NULL;
int32_t i, i_first, i_last;
int rc = SLURM_SUCCESS, rc2;
if (max_array_size == NO_VAL) {
max_array_size = slurm_conf.max_array_sz;
}
long_id = strtol(job_id_str, &end_ptr, 10);
if ((long_id <= 0) || (long_id == LONG_MAX) ||
((end_ptr[0] != '\0') && (end_ptr[0] != '_') &&
(end_ptr[0] != '+'))) {
info("%s(1): invalid JobId=%s", __func__, job_id_str);
return ESLURM_INVALID_JOB_ID;
}
if ((end_ptr[0] == '_') && (end_ptr[1] == '*'))
end_ptr += 2; /* Defaults to full job array */
if (end_ptr[0] == '+') { /* Signal hetjob element */
job_id = (uint32_t) long_id;
long_id = strtol(end_ptr + 1, &end_ptr, 10);
if ((long_id < 0) || (long_id == LONG_MAX) ||
(end_ptr[0] != '\0')) {
info("%s(2): invalid JobId=%s", __func__, job_id_str);
return ESLURM_INVALID_JOB_ID;
}
job_ptr = find_het_job_record(job_id, (uint32_t) long_id);
if (!job_ptr)
return ESLURM_INVALID_JOB_ID;
if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account, false)) {
error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u",
job_ptr, uid);
return ESLURM_ACCESS_DENIED;
}
if (!job_ptr->het_job_id)
return ESLURM_NOT_HET_JOB;
if (!job_ptr->het_job_offset)
/*
* HetJob leader. Attempt to signal all components no
* matter what. If we cared about state or whole_hetjob
			 * for the leader, we would be inconsistent with
			 * direct format '#' below. But even if we made an
			 * exception here for leader R and no whole_hetjob,
			 * job_complete() would end all the components anyway.
*/
return het_job_signal(job_ptr, signal, flags, uid,
preempt);
/* HetJob non-leader component. */
if (_get_whole_hetjob()) {
/* Attempt to signal all components no matter state. */
job_record_t *het_leader = NULL;
if (!(het_leader = find_het_job_record(job_id, 0))) {
/* Leader not found. Attempt individual. */
error("%s: can't find HetJob leader for HetJob component %pJ",
__func__, job_ptr);
return job_signal(job_ptr, signal,
flags, uid, preempt);
} else {
/* Got the leader, signal all. */
return het_job_signal(het_leader,
signal, flags,
uid, preempt);
}
}
if (IS_JOB_PENDING(job_ptr))
return ESLURM_NOT_WHOLE_HET_JOB;
else
return job_signal(job_ptr, signal, flags, uid, preempt);
}
last_job_update = now;
job_id = (uint32_t) long_id;
if (end_ptr[0] == '\0') { /* Single job (or full job array) */
int jobs_done = 0, jobs_signaled = 0;
job_record_t *job_ptr_done = NULL;
job_ptr = find_job_record(job_id);
if (job_ptr && (job_ptr->user_id != uid) &&
!validate_operator(uid) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account, false)) {
error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u",
job_ptr, uid);
return ESLURM_ACCESS_DENIED;
}
if (job_ptr && job_ptr->het_job_list) { /* Hetjob leader */
return het_job_signal(job_ptr, signal, flags, uid,
preempt);
}
if (job_ptr && job_ptr->het_job_id && _get_whole_hetjob()) {
job_record_t *het_job_leader;
het_job_leader = find_job_record(job_ptr->het_job_id);
if (het_job_leader && het_job_leader->het_job_list) {
return het_job_signal(het_job_leader, signal,
flags, uid, preempt);
}
error("%s: Hetjob leader %pJ not found",
__func__, job_ptr);
}
if (job_ptr && job_ptr->het_job_id && IS_JOB_PENDING(job_ptr))
return ESLURM_NOT_WHOLE_HET_JOB;/* Hetjob child */
if (job_ptr &&
(((job_ptr->array_task_id == NO_VAL) &&
(job_ptr->array_recs == NULL)) ||
((job_ptr->array_task_id != NO_VAL) &&
((job_ptr->array_job_id != job_id) ||
(flags & KILL_ARRAY_TASK))))) {
/*
* This is a regular job or a single task of a job
* array. KILL_ARRAY_TASK indicates that the meta job
* should be treated as a single task.
*/
return job_signal(job_ptr, signal, flags, uid, preempt);
}
/*
* This will kill the meta record that holds all
* pending jobs. We want to kill this first so we
* don't start jobs just to kill them as we are
* killing other elements of the array.
*/
if (job_ptr && job_ptr->array_recs) {
/* This is a job array */
job_ptr_done = job_ptr;
rc = job_signal(job_ptr, signal, flags, uid, preempt);
if (rc == ESLURM_ACCESS_DENIED)
return rc;
jobs_signaled++;
if (rc == ESLURM_ALREADY_DONE) {
jobs_done++;
rc = SLURM_SUCCESS;
}
}
/* Signal all tasks of this job array */
job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
if (!job_ptr && !job_ptr_done) {
info("%s(3): invalid JobId=%u", __func__, job_id);
return ESLURM_INVALID_JOB_ID;
}
while (job_ptr) {
if (job_ptr->array_job_id == job_id)
break;
job_ptr = job_ptr->job_array_next_j;
}
while (job_ptr) {
if ((job_ptr->array_job_id == job_id) &&
(job_ptr != job_ptr_done)) {
rc2 = job_signal(job_ptr, signal, flags, uid,
preempt);
jobs_signaled++;
if (rc2 == ESLURM_ALREADY_DONE) {
jobs_done++;
} else {
rc = MAX(rc, rc2);
}
}
job_ptr = job_ptr->job_array_next_j;
}
if ((rc == SLURM_SUCCESS) && (jobs_done == jobs_signaled))
return ESLURM_ALREADY_DONE;
return rc;
}
array_bitmap = slurm_array_str2bitmap(end_ptr + 1, max_array_size,
&i_last);
if (!array_bitmap) {
info("%s(4): invalid JobId=%s", __func__, job_id_str);
rc = ESLURM_INVALID_JOB_ID;
goto endit;
}
/* Find some job record and validate the user signaling the job */
if (!(job_ptr = _find_meta_job_record(job_id))) {
info("%s(5): invalid JobId=%s", __func__, job_id_str);
rc = ESLURM_INVALID_JOB_ID;
goto endit;
}
if ((job_ptr->user_id != uid) && !validate_operator(uid) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account, false)) {
error("%s: Security violation JOB_CANCEL RPC for %pJ from uid %u",
__func__, job_ptr, uid);
rc = ESLURM_ACCESS_DENIED;
goto endit;
}
_signal_pending_job_array_tasks(job_ptr, &array_bitmap, signal, uid,
i_last, now, &rc);
i_first = bit_ffs(array_bitmap);
if (i_first >= 0)
i_last = bit_fls(array_bitmap);
else
i_last = -2;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(array_bitmap, i))
continue;
job_ptr = find_job_array_rec(job_id, i);
if (job_ptr == NULL) {
info("%s(6): invalid JobId=%u_%d",
__func__, job_id, i);
rc = ESLURM_INVALID_JOB_ID;
continue;
}
rc2 = job_signal(job_ptr, signal, flags, uid, preempt);
rc = MAX(rc, rc2);
}
endit:
FREE_NULL_BITMAP(array_bitmap);
return rc;
}
static void _free_selected_step_array(slurm_selected_step_t ***jobs_p,
uint32_t cnt)
{
slurm_selected_step_t **jobs = *jobs_p;
for (int i = 0; i < cnt; i++)
slurm_destroy_selected_step(jobs[i]);
xfree(jobs);
*jobs_p = NULL;
}
static void _free_array_task_filter(void *x)
{
array_task_filter_t *rec = x;
if (!rec)
return;
/*
* Do not use slurm_destroy_selected_step() as that will
* unconditionally free the bitmap.
*/
if (rec->free_array_bitmap)
FREE_NULL_BITMAP(rec->filter_id->array_bitmap);
xfree(rec->filter_id);
/* Do not free rec->job_ptr */
xfree(rec);
}
static int _parse_jobs_array(char **jobs_array, uint32_t jobs_cnt,
slurm_selected_step_t ***jobs_p)
{
slurm_selected_step_t **jobs = NULL;
if (!jobs_array)
return SLURM_SUCCESS;
if (max_array_size == NO_VAL)
max_array_size = slurm_conf.max_array_sz;
jobs = xcalloc(jobs_cnt, sizeof(*jobs));
for (int i = 0; i < jobs_cnt; i++) {
int rc;
jobs[i] = xmalloc(sizeof(*jobs[i]));
rc = unfmt_job_id_string(jobs_array[i], jobs[i],
max_array_size);
if (rc != SLURM_SUCCESS) {
_free_selected_step_array(&jobs, i + 1);
return rc;
}
}
*jobs_p = jobs;
return SLURM_SUCCESS;
}
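/*
 * Sketch of expected input for _parse_jobs_array() above (assuming
 * unfmt_job_id_string() accepts the same id formats as job_str_signal()):
 * jobs_array = {"123", "456_[0-3]"} with jobs_cnt = 2 yields two
 * slurm_selected_step_t records, the second carrying an array_bitmap with
 * bits 0-3 set.
 */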
static bool _verify_kill_jobs_msg(kill_jobs_msg_t *kill_msg)
{
/* At least one job id or filter must be specified */
if (!kill_msg->account && !kill_msg->job_name &&
!kill_msg->jobs_cnt && !kill_msg->partition && !kill_msg->qos &&
!kill_msg->reservation &&
((kill_msg->state & JOB_STATE_BASE) == JOB_END) &&
!kill_msg->user_name && !kill_msg->wckey && !kill_msg->nodelist)
return false;
return true;
}
extern int job_mgr_signal_jobs(kill_jobs_msg_t *kill_msg, uid_t auth_uid,
kill_jobs_resp_msg_t **resp_msg_p)
{
int rc = 0;
signal_jobs_args_t signal_args = {
.auth_uid = auth_uid,
.kill_msg = kill_msg,
};
slurm_selected_step_t **jobs = NULL;
assoc_mgr_lock_t assoc_lock = {
.user = READ_LOCK,
};
xassert(verify_lock(CONF_LOCK, READ_LOCK));
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
xassert(verify_lock(FED_LOCK, READ_LOCK));
if (!_verify_kill_jobs_msg(kill_msg))
return ESLURM_SIGNAL_JOBS_INVALID;
/*
* Items in the signal_args.responses list are free'd in
* _foreach_xfer_responses
*/
signal_args.responses = list_create(NULL);
signal_args.array_leader_list = list_create(NULL);
signal_args.other_job_list = list_create(NULL);
if (kill_msg->jobs_cnt) {
rc = _parse_jobs_array(kill_msg->jobs_array,
kill_msg->jobs_cnt, &jobs);
if (rc != SLURM_SUCCESS)
return rc;
signal_args.pending_array_task_list =
list_create(_free_array_task_filter);
}
if (max_array_size == NO_VAL)
max_array_size = slurm_conf.max_array_sz;
/*
* Get a list of jobs to signal first, then signal the jobs outside of
* the job_list lock. Array job leaders need to be signalled before
* the tasks in their array. Try to signal each job; add each failure
* to signal_args.responses.
*
* We check if the auth_uid is able to signal the job on every possible
* job that matches the filter. Lock the assoc lock once here rather
* than every time we check.
*/
assoc_mgr_lock(&assoc_lock);
if (jobs)
_filter_jobs_ids(jobs, kill_msg->jobs_cnt, &signal_args);
else
list_for_each_ro(job_list, _foreach_filter_job_list,
&signal_args);
/*
* het_leader is only used during filtering; explicitly NULL it out
* so it cannot accidentally be used later.
*/
signal_args.het_leader = NULL;
assoc_mgr_unlock(&assoc_lock);
list_for_each(signal_args.array_leader_list, _foreach_signal_job,
&signal_args);
if (signal_args.pending_array_task_list) {
signal_args.now = time(NULL);
list_for_each(signal_args.pending_array_task_list,
_foreach_signal_job_array_tasks, &signal_args);
}
list_for_each(signal_args.other_job_list, _foreach_signal_job,
&signal_args);
_build_kill_jobs_resp_msg(&signal_args, resp_msg_p);
/* Cleanup */
_free_selected_step_array(&jobs, kill_msg->jobs_cnt);
FREE_NULL_LIST(signal_args.array_leader_list);
FREE_NULL_LIST(signal_args.pending_array_task_list);
FREE_NULL_LIST(signal_args.other_job_list);
FREE_NULL_LIST(signal_args.responses);
return SLURM_SUCCESS;
}
static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal,
uint16_t flags)
{
bitoff_t i;
signal_tasks_msg_t *signal_tasks_msg = NULL;
agent_arg_t *agent_args = NULL;
node_record_t *node_ptr;
xassert(job_ptr);
xassert(job_ptr->batch_host);
i = bit_ffs(job_ptr->node_bitmap);
if (i < 0) {
error("%s: %pJ lacks assigned nodes", __func__, job_ptr);
return;
}
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_SIGNAL_TASKS;
agent_args->retry = 1;
agent_args->node_count = 1;
if ((node_ptr = find_node_record(job_ptr->batch_host)))
agent_args->protocol_version = node_ptr->protocol_version;
agent_args->hostlist = hostlist_create(job_ptr->batch_host);
signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t));
signal_tasks_msg->step_id.job_id = job_ptr->job_id;
signal_tasks_msg->step_id.step_id = SLURM_BATCH_SCRIPT;
signal_tasks_msg->step_id.step_het_comp = NO_VAL;
signal_tasks_msg->flags = flags;
signal_tasks_msg->signal = signal;
agent_args->msg_args = signal_tasks_msg;
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_args);
}
/*
* prolog_complete - note the normal termination of the prolog
* IN job_id - id of the job which completed
 * IN prolog_return_code - prolog's return code; if non-zero, an error is
 *	logged and the job's exit code is set
* RET - 0 on success, otherwise ESLURM error code
 * global: job_list - pointer to global job list
* last_job_update - time of last job table update
*/
extern int prolog_complete(uint32_t job_id, uint32_t prolog_return_code,
char *node_name)
{
job_record_t *job_ptr;
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
info("prolog_complete: invalid JobId=%u", job_id);
return ESLURM_INVALID_JOB_ID;
}
if (IS_JOB_COMPLETING(job_ptr))
return SLURM_SUCCESS;
if (prolog_return_code) {
error("Prolog launch failure, %pJ", job_ptr);
job_ptr->exit_code = prolog_return_code;
}
/*
* job_ptr->node_bitmap_pr is always NULL for front end systems
*/
if (job_ptr->node_bitmap_pr) {
node_record_t *node_ptr = NULL;
if (node_name)
node_ptr = find_node_record(node_name);
if (node_ptr) {
bit_clear(job_ptr->node_bitmap_pr, node_ptr->index);
} else {
if (node_name)
error("%s: can't find node:%s",
__func__, node_name);
bit_clear_all(job_ptr->node_bitmap_pr);
}
}
if (!job_ptr->node_bitmap_pr ||
(bit_ffs(job_ptr->node_bitmap_pr) == -1))
{
job_ptr->state_reason = WAIT_NO_REASON;
agent_trigger(999, false, true);
}
last_job_update = time(NULL);
return SLURM_SUCCESS;
}
static void _handle_requeue_limit(job_record_t *job_ptr, const char *caller)
{
if (job_ptr->batch_flag <= slurm_conf.max_batch_requeue)
return;
debug("%s: Holding %pJ, repeated requeue failures",
caller, job_ptr);
job_state_set_flag(job_ptr, JOB_REQUEUE_HOLD);
job_ptr->state_reason = WAIT_MAX_REQUEUE;
xfree(job_ptr->state_desc);
job_ptr->state_desc =
xstrdup("launch failure limit exceeded requeued held");
job_ptr->batch_flag = 1;
job_ptr->priority = 0;
}
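/*
 * Note on _handle_requeue_limit() above: job_ptr->batch_flag doubles as a
 * launch/requeue counter (it is incremented on each batch requeue in
 * _job_complete() below), so with e.g. slurm_conf.max_batch_requeue = 5 the
 * job is requeued held once batch_flag exceeds 5.
 */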
static int _job_complete(job_record_t *job_ptr, uid_t uid, bool requeue,
bool node_fail, uint32_t job_return_code)
{
node_record_t *node_ptr;
time_t now = time(NULL);
uint32_t job_comp_flag = 0;
bool suspended = false;
int i;
int use_cloud = false;
uint16_t over_time_limit;
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
xassert(verify_lock(FED_LOCK, READ_LOCK));
if (IS_JOB_FINISHED(job_ptr)) {
if (job_ptr->exit_code == 0)
job_ptr->exit_code = job_return_code;
return ESLURM_ALREADY_DONE;
}
if (IS_JOB_COMPLETING(job_ptr))
return SLURM_SUCCESS; /* avoid replay */
if ((job_return_code & 0xff) == SIG_OOM) {
info("%s: %pJ OOM failure", __func__, job_ptr);
} else if (WIFSIGNALED(job_return_code)) {
info("%s: %pJ WTERMSIG %d",
__func__, job_ptr, WTERMSIG(job_return_code));
} else if (WIFEXITED(job_return_code)) {
info("%s: %pJ WEXITSTATUS %d",
__func__, job_ptr, WEXITSTATUS(job_return_code));
}
if (IS_JOB_RUNNING(job_ptr))
job_comp_flag = JOB_COMPLETING;
else if (IS_JOB_PENDING(job_ptr)) {
job_return_code = NO_VAL;
fed_mgr_job_revoke_sibs(job_ptr);
}
if ((job_return_code == NO_VAL) &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) {
if (node_fail) {
info("%s: %pJ cancelled by node failure",
__func__, job_ptr);
} else {
info("%s: %pJ cancelled by interactive user",
__func__, job_ptr);
}
}
if (IS_JOB_SUSPENDED(job_ptr)) {
uint32_t suspend_job_state = job_ptr->job_state;
/*
 * The job can't be recorded as suspended when we call the
 * accounting functions, so temporarily mark it cancelled
 * around the call.
 */
job_state_set(job_ptr, JOB_CANCELLED);
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
job_state_set(job_ptr, suspend_job_state);
job_comp_flag = JOB_COMPLETING;
suspended = true;
}
if (job_comp_flag && (job_ptr->node_cnt == 0)) {
/*
* Job has no resources left (used to expand another job).
* Avoid duplicate run of epilog and underflow in CPU count.
*/
job_comp_flag = 0;
}
if (requeue && job_ptr->details && job_ptr->batch_flag) {
/*
 * We want this job to look like it was terminated in the
 * accounting logs. A new submit time is set later (after the
 * epilog completes) so the restarted job looks like a new job.
 */
job_ptr->end_time = now;
if (job_ptr->bit_flags & GRACE_PREEMPT) {
job_state_set(job_ptr, (JOB_PREEMPTED | job_comp_flag));
/* clear signal sent on GracePeriod start */
job_ptr->bit_flags &= (~GRACE_PREEMPT);
} else {
job_state_set(job_ptr, JOB_NODE_FAIL);
job_ptr->exit_code = job_return_code;
}
job_completion_logger(job_ptr, true);
/*
 * Do this after the epilog completes.
 * Setting it here is too early.
 */
//job_record_set_sluid(job_ptr);
//job_ptr->details->submit_time = now + 1;
if (job_ptr->node_bitmap) {
i = bit_ffs(job_ptr->node_bitmap);
if (i >= 0) {
node_ptr = node_record_table_ptr[i];
if (IS_NODE_CLOUD(node_ptr))
use_cloud = true;
}
}
if (!use_cloud)
job_ptr->batch_flag++; /* only one retry */
job_ptr->restart_cnt++;
/* clear signal sent flag on requeue */
job_ptr->warn_flags &= ~WARN_SENT;
job_state_set(job_ptr, (JOB_PENDING | job_comp_flag));
job_ptr->exit_code = 0;
/*
* Since the job completion logger removes the job submit
* information, we need to add it again.
*/
acct_policy_add_job_submit(job_ptr, false);
if (node_fail) {
info("%s: requeue %pJ due to node failure",
__func__, job_ptr);
} else {
info("%s: requeue %pJ per user/system request",
__func__, job_ptr);
}
/* hold job if over requeue limit */
_handle_requeue_limit(job_ptr, __func__);
} else if (IS_JOB_PENDING(job_ptr) && job_ptr->details &&
job_ptr->batch_flag) {
/*
 * Possible failure mode with DOWN node and job requeue.
 * The DOWN node might actually respond to the cancel and
 * take us here. Don't run job_completion_logger here; this
 * branch exists to catch duplicate cancels from slowly
 * responding slurmds.
 */
return SLURM_SUCCESS;
} else {
if (job_ptr->part_ptr &&
(job_ptr->part_ptr->over_time_limit != NO_VAL16)) {
over_time_limit = job_ptr->part_ptr->over_time_limit;
} else {
over_time_limit = slurm_conf.over_time_limit;
}
if (node_fail) {
job_state_set(job_ptr, (JOB_NODE_FAIL | job_comp_flag));
job_ptr->exit_code = job_return_code;
job_ptr->requid = uid;
} else if (job_ptr->bit_flags & GRACE_PREEMPT) {
job_state_set(job_ptr, (JOB_PREEMPTED | job_comp_flag));
} else if (job_return_code == NO_VAL) {
job_state_set(job_ptr, (JOB_CANCELLED | job_comp_flag));
job_ptr->requid = uid;
} else if ((job_return_code & 0xff) == SIG_OOM) {
job_state_set(job_ptr, (JOB_OOM | job_comp_flag));
job_ptr->exit_code = job_return_code;
job_ptr->state_reason = FAIL_OOM;
xfree(job_ptr->state_desc);
} else if (WIFEXITED(job_return_code) &&
WEXITSTATUS(job_return_code)) {
job_state_set(job_ptr, (JOB_FAILED | job_comp_flag));
job_ptr->exit_code = job_return_code;
job_ptr->state_reason = FAIL_EXIT_CODE;
xfree(job_ptr->state_desc);
} else if (WIFSIGNALED(job_return_code)) {
job_state_set(job_ptr, (JOB_FAILED | job_comp_flag));
job_ptr->exit_code = job_return_code;
job_ptr->state_reason = FAIL_SIGNAL;
xfree(job_ptr->state_desc);
xstrfmtcat(job_ptr->state_desc,
"RaisedSignal:%d(%s)",
WTERMSIG(job_return_code),
strsignal(WTERMSIG(job_return_code)));
} else if (job_comp_flag
&& ((job_ptr->end_time
+ over_time_limit * 60) < now)) {
/*
 * The job ran past its end time plus the allowed
 * over-time limit, so treat it as a timeout.
 */
job_state_set(job_ptr, (JOB_TIMEOUT | job_comp_flag));
job_ptr->state_reason = FAIL_TIMEOUT;
xfree(job_ptr->state_desc);
} else {
job_state_set(job_ptr, (JOB_COMPLETE | job_comp_flag));
job_ptr->exit_code = job_return_code;
}
if (suspended) {
job_ptr->end_time = job_ptr->suspend_time;
job_ptr->tot_sus_time +=
difftime(now, job_ptr->suspend_time);
} else
job_ptr->end_time = now;
job_completion_logger(job_ptr, false);
}
last_job_update = now;
job_ptr->time_last_active = now; /* Timer for resending kill RPC */
if (job_comp_flag) { /* job was running */
build_cg_bitmap(job_ptr);
deallocate_nodes(job_ptr, false, suspended, false);
}
/* Check for and clean up stuck scripts */
if (IS_JOB_PENDING(job_ptr) || IS_JOB_CONFIGURING(job_ptr) ||
(job_ptr->details && job_ptr->details->prolog_running)) {
slurmscriptd_flush_job(job_ptr->job_id);
track_script_flush_job(job_ptr->job_id);
}
info("%s: %pJ done", __func__, job_ptr);
return SLURM_SUCCESS;
}
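/*
 * list_for_each() callback applying _job_complete() to one component of a
 * heterogeneous job; the last non-zero return code is preserved in the
 * foreach_complete_hetjob_t argument.
 */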
static int _foreach_het_job_complete(void *x, void *arg)
{
job_record_t *het_job_ptr = x;
foreach_complete_hetjob_t *foreach_complete_hetjob = arg;
job_record_t *het_job_leader = foreach_complete_hetjob->het_job_leader;
int rc;
if (het_job_leader->het_job_id != het_job_ptr->het_job_id) {
error("%s: Bad het_job_list for %pJ",
__func__, het_job_leader);
return 0;
}
rc = _job_complete(het_job_ptr,
foreach_complete_hetjob->uid,
foreach_complete_hetjob->requeue,
foreach_complete_hetjob->node_fail,
foreach_complete_hetjob->job_return_code);
if (rc != SLURM_SUCCESS)
foreach_complete_hetjob->rc = rc;
return 0;
}
/*
 * job_complete - note the normal termination of the specified job
 * IN job_id - id of the job which completed
 * IN uid - user id of user issuing the RPC
 * IN requeue - job should be run again if possible
 * IN node_fail - true if job terminated due to node failure
 * IN job_return_code - job's return code, if set then set state to FAILED
 * RET - 0 on success, otherwise ESLURM error code
 * global: job_list - pointer to global job list
 *	last_job_update - time of last job table update
 */
extern int job_complete(uint32_t job_id, uid_t uid, bool requeue,
bool node_fail, uint32_t job_return_code)
{
job_record_t *job_ptr;
int rc;
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
xassert(verify_lock(FED_LOCK, READ_LOCK));
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
info("%s: invalid JobId=%u", __func__, job_id);
return ESLURM_INVALID_JOB_ID;
}
if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) {
error("%s: Security violation, JOB_COMPLETE RPC for %pJ from uid %u",
__func__, job_ptr, uid);
return ESLURM_USER_ID_MISSING;
}
if (job_ptr->het_job_list) {
foreach_complete_hetjob_t foreach_complete_hetjob = {
.het_job_leader = job_ptr,
.job_return_code = job_return_code,
.node_fail = node_fail,
.requeue = requeue,
.rc = SLURM_SUCCESS,
.uid = uid,
};
(void) list_for_each(job_ptr->het_job_list,
_foreach_het_job_complete,
&foreach_complete_hetjob);
rc = foreach_complete_hetjob.rc;
} else {
rc = _job_complete(job_ptr, uid, requeue, node_fail,
job_return_code);
}
return rc;
}
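/*
 * If the given partition is not accepting job submissions, walk its chain
 * of alternate partitions looking for one that is.
 * OUT part_ptr_new - set to the usable alternate, or left NULL if the
 *	original partition was usable as-is
 * RET SLURM_SUCCESS, or an ESLURM error code if no alternate is available
 */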
static int _alt_part_test(part_record_t *part_ptr, part_record_t **part_ptr_new)
{
part_record_t *alt_part_ptr = NULL;
char *alt_name;
*part_ptr_new = NULL;
if ((part_ptr->state_up & PARTITION_SUBMIT) == 0) {
info("_alt_part_test: original partition is not available "
"(drain or inactive): %s", part_ptr->name);
alt_name = part_ptr->alternate;
while (alt_name) {
alt_part_ptr = find_part_record(alt_name);
if (alt_part_ptr == NULL) {
info("_alt_part_test: invalid alternate "
"partition name specified: %s", alt_name);
return ESLURM_INVALID_PARTITION_NAME;
}
if (alt_part_ptr == part_ptr) {
info("_alt_part_test: no valid alternate "
"partition is available");
return ESLURM_PARTITION_NOT_AVAIL;
}
if (alt_part_ptr->state_up & PARTITION_SUBMIT)
break;
/* Try next alternate in the sequence */
alt_name = alt_part_ptr->alternate;
}
if (alt_name == NULL) {
info("_alt_part_test: no valid alternate partition is "
"available");
return ESLURM_PARTITION_NOT_AVAIL;
}
*part_ptr_new = alt_part_ptr;
}
return SLURM_SUCCESS;
}
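/*
 * list_for_each() callback to validate a job's node counts and time limit
 * against one partition's limits, unless the QOS carries a flag overriding
 * the corresponding partition limit. Also validates the QOS itself against
 * the partition when EnforcePartLimits is set.
 */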
static int _qos_part_check(void *object, void *arg)
{
slurmdb_qos_rec_t *qos_ptr = object;
qos_part_check_t *qos_part_check = arg;
part_record_t *part_ptr = qos_part_check->part_ptr;
if ((part_ptr->state_up & PARTITION_SCHED) &&
(qos_part_check->min_nodes != NO_VAL) &&
(qos_part_check->min_nodes < part_ptr->min_nodes) &&
(!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_MIN_NODE))) {
debug2("%s: Job requested for nodes (%u) smaller than partition %s(%u) min nodes",
__func__, qos_part_check->min_nodes,
part_ptr->name, part_ptr->min_nodes);
qos_part_check->error_code = ESLURM_INVALID_NODE_COUNT;
return -1;
}
if ((part_ptr->state_up & PARTITION_SCHED) &&
(qos_part_check->max_nodes != NO_VAL) &&
(qos_part_check->max_nodes > part_ptr->max_nodes) &&
(!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_MAX_NODE))) {
debug2("%s: Job requested for nodes (%u) greater than partition %s(%u) max nodes",
__func__, qos_part_check->max_nodes,
part_ptr->name, part_ptr->max_nodes);
qos_part_check->error_code = ESLURM_INVALID_NODE_COUNT;
return -1;
}
if ((part_ptr->state_up & PARTITION_SCHED) &&
(qos_part_check->time_limit != NO_VAL) &&
(qos_part_check->time_limit > part_ptr->max_time) &&
(!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT))) {
debug2("%s: Job time limit (%u) exceeds limit of partition %s(%u)",
__func__, qos_part_check->time_limit,
part_ptr->name, part_ptr->max_time);
qos_part_check->error_code = ESLURM_INVALID_TIME_LIMIT;
return -1;
}
if (slurm_conf.enforce_part_limits) {
if ((qos_part_check->error_code =
part_policy_valid_qos(part_ptr, qos_ptr,
qos_part_check->submit_uid,
NULL)) != SLURM_SUCCESS)
return -1;
}
return 0;
}
/*
* Test if this job can use this partition
*
* NOTE: This function is also called with a dummy job_desc_msg_t from
* job_limits_check(). If any new check is added here, you may also have to
* set the corresponding field in the job_desc_msg_t built in that function.
*/
static int _part_access_check(part_record_t *part_ptr, job_desc_msg_t *job_desc,
bitstr_t *req_bitmap, uid_t submit_uid,
slurmdb_qos_rec_t *qos_ptr,
list_t *qos_ptr_list, char *acct)
{
uint32_t total_nodes;
qos_part_check_t qos_part_check = {
.error_code = SLURM_SUCCESS,
.max_nodes = job_desc->max_nodes,
.min_nodes = job_desc->min_nodes,
.part_ptr = part_ptr,
.submit_uid = submit_uid,
.time_limit = job_desc->time_limit,
};
int rc = SLURM_SUCCESS;
xassert(verify_assoc_lock(ASSOC_LOCK, READ_LOCK));
xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK));
if ((part_ptr->flags & PART_FLAG_REQ_RESV) &&
(!job_desc->reservation || job_desc->reservation[0] == '\0')) {
debug2("%s: uid %u access to partition %s "
"denied, requires reservation", __func__,
(unsigned int) submit_uid, part_ptr->name);
return ESLURM_ACCESS_DENIED;
}
if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && (submit_uid != 0) &&
(submit_uid != slurm_conf.slurm_user_id)) {
debug2("%s: uid %u access to partition %s "
"denied, not root", __func__,
(unsigned int) submit_uid, part_ptr->name);
return ESLURM_ACCESS_DENIED;
}
if ((job_desc->user_id == 0) && (part_ptr->flags & PART_FLAG_NO_ROOT)) {
error("%s: Security violation, SUBMIT_JOB for "
"user root disabled", __func__);
return ESLURM_USER_ID_MISSING;
}
if (validate_alloc_node(part_ptr, job_desc->alloc_node) == 0) {
debug2("%s: uid %u access to partition %s "
"denied, bad allocating node: %s", __func__,
(unsigned int) job_desc->user_id, part_ptr->name,
job_desc->alloc_node);
return ESLURM_ACCESS_DENIED;
}
if ((part_ptr->state_up & PARTITION_SCHED) &&
(job_desc->min_cpus != NO_VAL)) {
if (job_desc->min_cpus > part_ptr->total_cpus) {
debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)",
__func__, job_desc->min_cpus, part_ptr->name,
part_ptr->total_cpus);
return ESLURM_TOO_MANY_REQUESTED_CPUS;
} else if (job_desc->min_cpus >
(part_ptr->max_cpus_per_node *
part_ptr->total_nodes)) {
debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)",
__func__, job_desc->min_cpus, part_ptr->name,
(part_ptr->max_cpus_per_node *
part_ptr->total_nodes));
return ESLURM_TOO_MANY_REQUESTED_CPUS;
}
}
/* Check against total nodes on the partition */
total_nodes = part_ptr->total_nodes;
if ((part_ptr->state_up & PARTITION_SCHED) &&
(job_desc->min_nodes != NO_VAL) &&
(job_desc->min_nodes > total_nodes)) {
debug2("%s: Job requested too many nodes (%u) "
"of partition %s(%u)", __func__,
job_desc->min_nodes, part_ptr->name, total_nodes);
return ESLURM_INVALID_NODE_COUNT;
}
if (req_bitmap && !bit_super_set(req_bitmap, part_ptr->node_bitmap)) {
debug2("%s: requested nodes %s not in partition %s", __func__,
job_desc->req_nodes, part_ptr->name);
return ESLURM_REQUESTED_NODES_NOT_IN_PARTITION;
}
/* Check against min/max node limits in the partition */
if (qos_ptr_list)
(void) list_for_each(qos_ptr_list,
_qos_part_check,
&qos_part_check);
else
(void) _qos_part_check(qos_ptr, &qos_part_check);
if (qos_part_check.error_code != SLURM_SUCCESS)
return qos_part_check.error_code;
if (slurm_conf.enforce_part_limits) {
if (!validate_group(part_ptr, job_desc->user_id)) {
debug2("%s: uid %u not in group permitted to use this partition (%s). groups allowed: %s",
__func__, job_desc->user_id, part_ptr->name,
part_ptr->allow_groups);
rc = ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP;
goto fini;
}
if ((rc = part_policy_valid_acct(part_ptr, acct, NULL))
!= SLURM_SUCCESS)
goto fini;
}
fini:
return rc;
}
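/*
 * list_for_each() callback rebuilding the comma-separated partition name
 * string after list entries were removed or replaced; also records the
 * first partition in the list as the job's primary partition pointer.
 */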
static int _foreach_rebuild_part_names(void *x, void *arg)
{
part_record_t *part_ptr = x;
foreach_rebuild_names_t *foreach_rebuild_names = arg;
if (!foreach_rebuild_names->names)
foreach_rebuild_names->part_ptr = part_ptr;
else
xstrcatat(foreach_rebuild_names->names,
&foreach_rebuild_names->names_pos,
",");
xstrcatat(foreach_rebuild_names->names,
&foreach_rebuild_names->names_pos,
part_ptr->name);
return 0;
}
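/*
 * Resolve the partition(s) a job may use: honor an explicit partition
 * request, fall back to a reservation's partition or the default
 * partition, and substitute alternate partitions for any that are not
 * accepting submissions.
 * OUT part_pptr - primary partition
 * OUT part_pptr_list - list of partitions when more than one was requested
 * OUT err_msg - error message for the user, when one can be generated
 * RET SLURM_SUCCESS or an ESLURM error code
 */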
static int _get_job_parts(job_desc_msg_t *job_desc, part_record_t **part_pptr,
list_t **part_pptr_list, char **err_msg)
{
part_record_t *part_ptr = NULL, *part_ptr_new = NULL;
list_t *part_ptr_list = NULL;
int rc = SLURM_SUCCESS;
/* Identify partition(s) and set pointer(s) to their struct */
if (job_desc->partition) {
char *err_part = NULL;
get_part_list(job_desc->partition, &part_ptr_list, &part_ptr,
&err_part);
if (part_ptr == NULL) {
info("%s: invalid partition specified: %s",
__func__, job_desc->partition);
if (err_msg) {
xfree(*err_msg);
xstrfmtcat(*err_msg,
"invalid partition specified: %s",
err_part);
xfree(err_part);
}
FREE_NULL_LIST(part_ptr_list);
return ESLURM_INVALID_PARTITION_NAME;
}
} else if (job_desc->reservation && job_desc->reservation[0] != '\0' ) {
slurmctld_resv_t *resv_ptr = NULL;
resv_ptr = find_resv_name(job_desc->reservation);
if (resv_ptr)
part_ptr = resv_ptr->part_ptr;
if (part_ptr)
job_desc->partition = xstrdup(part_ptr->name);
}
if (!part_ptr) {
if (default_part_loc == NULL) {
error("%s: default partition not set", __func__);
return ESLURM_DEFAULT_PARTITION_NOT_SET;
}
part_ptr = default_part_loc;
job_desc->partition = xstrdup(part_ptr->name);
job_desc->bitflags |= JOB_PART_ASSIGNED;
}
/* Change partition pointer(s) to alternates as needed */
if (part_ptr_list) {
int fail_rc = SLURM_SUCCESS;
part_record_t *part_ptr_tmp;
bool rebuild_name_list = false;
list_itr_t *iter = list_iterator_create(part_ptr_list);
/*
 * Iterating manually (rather than with list_for_each()) since we are
 * replacing items in the list. This is the only place in the code we
 * use list_insert(). There are probably other ways of doing this;
 * saving that for future generations.
 */
while ((part_ptr_tmp = list_next(iter))) {
rc = _alt_part_test(part_ptr_tmp, &part_ptr_new);
if (rc != SLURM_SUCCESS) {
fail_rc = rc;
list_remove(iter);
rebuild_name_list = true;
continue;
}
if (part_ptr_new) {
list_insert(iter, part_ptr_new);
list_remove(iter);
rebuild_name_list = true;
}
}
list_iterator_destroy(iter);
if (list_is_empty(part_ptr_list)) {
if (fail_rc != SLURM_SUCCESS)
rc = fail_rc;
else
rc = ESLURM_PARTITION_NOT_AVAIL;
goto fini;
}
rc = SLURM_SUCCESS; /* At least some partition usable */
if (rebuild_name_list) {
foreach_rebuild_names_t foreach_rebuild_names = { 0 };
(void) list_for_each(part_ptr_list,
_foreach_rebuild_part_names,
&foreach_rebuild_names);
part_ptr = foreach_rebuild_names.part_ptr;
xfree(job_desc->partition);
job_desc->partition = foreach_rebuild_names.names;
foreach_rebuild_names.names = NULL;
if (!part_ptr) {
rc = ESLURM_PARTITION_NOT_AVAIL;
goto fini;
}
}
} else {
rc = _alt_part_test(part_ptr, &part_ptr_new);
if (rc != SLURM_SUCCESS)
goto fini;
if (part_ptr_new) {
part_ptr = part_ptr_new;
xfree(job_desc->partition);
job_desc->partition = xstrdup(part_ptr->name);
}
}
*part_pptr = part_ptr;
if (part_pptr_list) {
*part_pptr_list = part_ptr_list;
part_ptr_list = NULL;
} else
FREE_NULL_LIST(part_ptr_list);
fini:
return rc;
}
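/*
 * list_for_each() callback wrapping _part_access_check() for one partition
 * of a multi-partition request. A hard failure (access denied, missing
 * user, or EnforcePartLimits=ALL) stops the iteration; otherwise the
 * loosest min/max node and time limits seen so far are accumulated.
 */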
static int _foreach_valid_part(void *x, void *arg)
{
part_record_t *part_ptr = x;
foreach_valid_part_t *foreach_valid_part = arg;
int rc;
/*
 * Associations should have already been checked before this. A
 * multiple-partition request is not allowed with partition-based
 * associations.
 */
rc = _part_access_check(part_ptr,
foreach_valid_part->job_desc,
foreach_valid_part->req_bitmap,
foreach_valid_part->submit_uid,
foreach_valid_part->qos_ptr,
foreach_valid_part->qos_ptr_list,
foreach_valid_part->assoc_ptr ?
foreach_valid_part->assoc_ptr->acct : NULL);
if ((rc != SLURM_SUCCESS) &&
((rc == ESLURM_ACCESS_DENIED) ||
(rc == ESLURM_USER_ID_MISSING) ||
(slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ALL))) {
foreach_valid_part->rc = rc;
return -1;
} else if (rc != SLURM_SUCCESS) {
foreach_valid_part->rc = rc;
} else {
foreach_valid_part->any_check = true;
}
/* Set to success since we found a usable partition */
if (foreach_valid_part->any_check &&
(slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ANY))
foreach_valid_part->rc = SLURM_SUCCESS;
foreach_valid_part->min_nodes_orig =
MIN(foreach_valid_part->min_nodes_orig,
part_ptr->min_nodes_orig);
foreach_valid_part->max_nodes_orig =
MAX(foreach_valid_part->max_nodes_orig,
part_ptr->max_nodes_orig);
foreach_valid_part->max_time =
MAX(foreach_valid_part->max_time, part_ptr->max_time);
return 0;
}
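/*
 * Validate a job request against the partition(s) it may run in. For a
 * multi-partition request the check passes if any partition is usable
 * (subject to EnforcePartLimits), and the job's node counts, time limits
 * and deadline are tested against the most permissive limits of the set.
 * RET SLURM_SUCCESS or an ESLURM error code
 */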
static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid,
bitstr_t *req_bitmap, part_record_t *part_ptr,
list_t *part_ptr_list,
slurmdb_assoc_rec_t *assoc_ptr,
slurmdb_qos_rec_t *qos_ptr,
list_t *qos_ptr_list)
{
int rc = SLURM_SUCCESS;
uint32_t min_nodes_orig = INFINITE, max_nodes_orig = 1;
uint32_t max_time = 0;
bool any_check = false;
xassert(verify_assoc_lock(ASSOC_LOCK, READ_LOCK));
xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK));
/* Change partition pointer(s) to alternates as needed */
if (part_ptr_list) {
foreach_valid_part_t foreach_valid_part = {
.any_check = any_check,
.assoc_ptr = assoc_ptr,
.job_desc = job_desc,
.max_nodes_orig = 1,
.max_time = 0,
.min_nodes_orig = INFINITE,
.qos_ptr = qos_ptr,
.qos_ptr_list = qos_ptr_list,
.req_bitmap = req_bitmap,
.submit_uid = submit_uid,
};
(void) list_for_each(part_ptr_list, _foreach_valid_part,
&foreach_valid_part);
if (list_is_empty(part_ptr_list) ||
(slurm_conf.enforce_part_limits &&
(foreach_valid_part.rc != SLURM_SUCCESS))) {
if (slurm_conf.enforce_part_limits ==
PARTITION_ENFORCE_ALL)
rc = foreach_valid_part.rc;
else if (slurm_conf.enforce_part_limits ==
PARTITION_ENFORCE_ANY && !any_check)
rc = foreach_valid_part.rc;
else {
rc = ESLURM_PARTITION_NOT_AVAIL;
}
goto fini;
}
any_check = foreach_valid_part.any_check;
min_nodes_orig = foreach_valid_part.min_nodes_orig;
max_nodes_orig = foreach_valid_part.max_nodes_orig;
max_time = foreach_valid_part.max_time;
rc = SLURM_SUCCESS; /* At least some partition usable */
} else {
min_nodes_orig = part_ptr->min_nodes_orig;
max_nodes_orig = part_ptr->max_nodes_orig;
max_time = part_ptr->max_time;
rc = _part_access_check(part_ptr, job_desc, req_bitmap,
submit_uid, qos_ptr, qos_ptr_list,
assoc_ptr ? assoc_ptr->acct : NULL);
if ((rc != SLURM_SUCCESS) &&
((rc == ESLURM_ACCESS_DENIED) ||
(rc == ESLURM_USER_ID_MISSING) ||
slurm_conf.enforce_part_limits))
goto fini;
/* Enforce Part Limit = no */
rc = SLURM_SUCCESS;
}
/* Validate job limits against partition limits */
/* Check Partition with the highest limits when there are multiple */
if (job_desc->min_nodes == NO_VAL) {
/* Avoid setting the job request to 0 nodes unless requested */
if (!min_nodes_orig)
job_desc->min_nodes = 1;
else
job_desc->min_nodes = min_nodes_orig;
} else if ((job_desc->min_nodes > max_nodes_orig) &&
slurm_conf.enforce_part_limits &&
(!qos_ptr || !(qos_ptr->flags &
QOS_FLAG_PART_MAX_NODE))) {
info("%s: job's min nodes greater than "
"partition's max nodes (%u > %u)",
__func__, job_desc->min_nodes, max_nodes_orig);
rc = ESLURM_INVALID_NODE_COUNT;
goto fini;
} else if ((job_desc->min_nodes < min_nodes_orig) &&
((job_desc->max_nodes == NO_VAL) ||
(job_desc->max_nodes >= min_nodes_orig))) {
job_desc->min_nodes = min_nodes_orig;
}
if ((job_desc->max_nodes != NO_VAL) &&
slurm_conf.enforce_part_limits &&
(job_desc->max_nodes < min_nodes_orig) &&
(!qos_ptr || !(qos_ptr->flags
& QOS_FLAG_PART_MIN_NODE))) {
info("%s: job's max nodes less than partition's "
"min nodes (%u < %u)",
__func__, job_desc->max_nodes, min_nodes_orig);
rc = ESLURM_INVALID_NODE_COUNT;
goto fini;
}
/* Zero node count OK for persistent burst buffer create or destroy */
if ((job_desc->min_nodes == 0) &&
(job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) ||
(!job_desc->burst_buffer && !job_desc->script))) {
info("%s: min_nodes is zero", __func__);
rc = ESLURM_INVALID_NODE_COUNT;
goto fini;
}
if ((job_desc->time_limit == NO_VAL) &&
(part_ptr->default_time == 0)) {
info("%s: job's default time is 0", __func__);
rc = ESLURM_INVALID_TIME_LIMIT;
goto fini;
}
if ((job_desc->time_limit == NO_VAL) &&
(part_ptr->default_time != NO_VAL))
job_desc->time_limit = part_ptr->default_time;
if ((job_desc->time_min != NO_VAL) &&
(job_desc->time_min > max_time) &&
(!qos_ptr || !(qos_ptr->flags &
QOS_FLAG_PART_TIME_LIMIT))) {
info("%s: job's min time greater than "
"partition's (%u > %u)",
__func__, job_desc->time_min, max_time);
rc = ESLURM_INVALID_TIME_MIN_LIMIT;
goto fini;
}
if ((job_desc->time_limit != NO_VAL) &&
(job_desc->time_limit > max_time) &&
(job_desc->time_min == NO_VAL) &&
slurm_conf.enforce_part_limits &&
(!qos_ptr || !(qos_ptr->flags &
QOS_FLAG_PART_TIME_LIMIT))) {
info("%s: job's time limit greater than "
"partition's (%u > %u)",
__func__, job_desc->time_limit, max_time);
rc = ESLURM_INVALID_TIME_LIMIT;
goto fini;
}
if ((job_desc->time_min != NO_VAL) &&
(job_desc->time_min > job_desc->time_limit) &&
(!qos_ptr || !(qos_ptr->flags &
QOS_FLAG_PART_TIME_LIMIT))) {
info("%s: job's min_time greater time limit "
"(%u > %u)",
__func__, job_desc->time_min, job_desc->time_limit);
rc = ESLURM_INVALID_TIME_MIN_LIMIT;
goto fini;
}
if ((job_desc->deadline) && (job_desc->deadline != NO_VAL)) {
char time_str_earliest[256];
char time_str_deadline[256];
time_t now = time(NULL);
time_t begin_time = job_desc->begin_time;
time_t earliest_start = MAX(begin_time, now);
time_t limit_in_sec = job_desc->time_limit * 60;
time_t min_in_sec = job_desc->time_min * 60;
slurm_make_time_str(&job_desc->deadline, time_str_deadline,
sizeof(time_str_deadline));
slurm_make_time_str(&earliest_start, time_str_earliest,
sizeof(time_str_earliest));
if (job_desc->deadline < earliest_start) {
info("%s: job's deadline is before its earliest start time (%s < %s)",
__func__, time_str_deadline, time_str_earliest);
rc = ESLURM_INVALID_TIME_LIMIT;
goto fini;
}
if ((job_desc->time_min) && (job_desc->time_min != NO_VAL) &&
(job_desc->deadline < (earliest_start + min_in_sec))) {
info("%s: job's min_time exceeds the deadline (%s + %lu > %s)",
__func__, time_str_earliest, min_in_sec,
time_str_deadline);
rc = ESLURM_INVALID_TIME_MIN_LIMIT;
goto fini;
}
if ((!job_desc->time_min || job_desc->time_min == NO_VAL) &&
(job_desc->time_limit) &&
(job_desc->time_limit != NO_VAL) &&
(job_desc->deadline < (earliest_start + limit_in_sec))) {
info("%s: job's time_limit exceeds the deadline (%s + %lu > %s)",
__func__, time_str_earliest, limit_in_sec,
time_str_deadline);
rc = ESLURM_INVALID_TIME_LIMIT;
goto fini;
}
}
fini:
return rc;
}
/*
 * job_limits_check - check the limits specified for the job.
 * IN job_pptr - pointer to the job table entry pointer.
 * IN check_min_time - if true test job's minimum time limit,
 * otherwise test maximum time limit
 * RET WAIT_NO_REASON on success, fail status otherwise.
 */
extern int job_limits_check(job_record_t **job_pptr, bool check_min_time)
{
job_details_t *detail_ptr;
enum job_state_reason fail_reason;
part_record_t *part_ptr = NULL;
job_record_t *job_ptr = NULL;
slurmdb_qos_rec_t *qos_ptr;
slurmdb_assoc_rec_t *assoc_ptr;
job_desc_msg_t job_desc;
int rc;
assoc_mgr_lock_t assoc_mgr_read_lock = {
.assoc = READ_LOCK,
.qos = READ_LOCK,
.user = READ_LOCK,
};
assoc_mgr_lock(&assoc_mgr_read_lock);
job_ptr = *job_pptr;
detail_ptr = job_ptr->details;
part_ptr = job_ptr->part_ptr;
qos_ptr = job_ptr->qos_ptr;
assoc_ptr = job_ptr->assoc_ptr;
if (!detail_ptr || !part_ptr) {
fatal_abort("%pJ has NULL details_ptr and/or part_ptr",
job_ptr);
assoc_mgr_unlock(&assoc_mgr_read_lock);
return WAIT_NO_REASON; /* To prevent CLANG error */
}
fail_reason = WAIT_NO_REASON;
/*
* Here we need to pretend we are just submitting the job so we can
* utilize the already existing function _part_access_check. If any
* additional fields in that function are ever checked, the fields set
* below will need to be modified.
*/
slurm_init_job_desc_msg(&job_desc);
job_desc.reservation = job_ptr->resv_name;
job_desc.user_id = job_ptr->user_id;
job_desc.alloc_node = job_ptr->alloc_node;
job_desc.min_cpus = detail_ptr->orig_min_cpus;
job_desc.min_nodes = detail_ptr->min_nodes;
/* _part_access_check looks for NO_VAL instead of 0 */
job_desc.max_nodes = detail_ptr->max_nodes ?
detail_ptr->max_nodes : NO_VAL;
if (check_min_time && job_ptr->time_min)
job_desc.time_limit = job_ptr->time_min;
else
job_desc.time_limit = job_ptr->time_limit;
/* For qos_ptr_list we are checking that now, so send in NULL */
if ((rc = _part_access_check(part_ptr, &job_desc, NULL,
job_ptr->user_id, qos_ptr,
NULL,
job_ptr->account))) {
debug2("%pJ can't run in partition %s: %s",
job_ptr, part_ptr->name, slurm_strerror(rc));
switch (rc) {
case ESLURM_INVALID_TIME_LIMIT:
case ESLURM_INVALID_TIME_MIN_LIMIT:
if (job_ptr->limit_set.time != ADMIN_SET_LIMIT)
fail_reason = WAIT_PART_TIME_LIMIT;
break;
case ESLURM_INVALID_NODE_COUNT:
fail_reason = WAIT_PART_NODE_LIMIT;
break;
/* FIXME */
/* case ESLURM_TOO_MANY_REQUESTED_CPUS: */
/* fail_reason = NON_EXISTANT_WAIT_PART_CPU_LIMIT; */
/* break; */
default:
fail_reason = WAIT_PART_CONFIG;
break;
}
} else if (part_ptr->state_up == PARTITION_DOWN) {
debug2("%pJ requested down partition %s",
job_ptr, part_ptr->name);
fail_reason = WAIT_PART_DOWN;
} else if (part_ptr->state_up == PARTITION_INACTIVE) {
debug2("%pJ requested inactive partition %s",
job_ptr, part_ptr->name);
fail_reason = WAIT_PART_INACTIVE;
} else if (qos_ptr && assoc_ptr &&
(qos_ptr->flags & QOS_FLAG_ENFORCE_USAGE_THRES) &&
(!fuzzy_equal(qos_ptr->usage_thres, NO_VAL))) {
if (!job_ptr->prio_factors) {
job_ptr->prio_factors =
xmalloc(sizeof(priority_factors_t));
}
if (!job_ptr->prio_factors->priority_fs) {
if (fuzzy_equal(assoc_ptr->usage->usage_efctv, NO_VAL))
priority_g_set_assoc_usage(assoc_ptr);
job_ptr->prio_factors->priority_fs =
priority_g_calc_fs_factor(
assoc_ptr->usage->usage_efctv,
(long double)assoc_ptr->usage->
shares_norm);
}
if (job_ptr->prio_factors->priority_fs < qos_ptr->usage_thres){
debug2("%pJ exceeds usage threshold", job_ptr);
fail_reason = WAIT_QOS_THRES;
}
} else if (fail_reason == WAIT_NO_REASON) {
/*
 * Here we need to pretend we are just submitting the job so we
 * can utilize the already existing function _valid_pn_min_mem.
 * If anything else is ever checked in that function this will
 * most likely have to be updated. Some of the needed members
 * were already initialized above for the _part_access_check()
 * call, as was job_desc itself via slurm_init_job_desc_msg().
 */
if (job_ptr->bit_flags & JOB_MEM_SET)
job_desc.pn_min_memory = detail_ptr->orig_pn_min_memory;
else {
/*
* Don't consider DefMemPerGPU here when coming up with
* a pn_min_memory, we don't know how many nodes the
* gpus may be split over yet so _get_def_mem may
* overestimate.
*/
job_desc.pn_min_memory = _get_def_mem(part_ptr, NULL);
}
if (detail_ptr->orig_cpus_per_task == NO_VAL16)
job_desc.cpus_per_task = 1;
else
job_desc.cpus_per_task = detail_ptr->orig_cpus_per_task;
/*
* Passing the value directly since detail_ptr->num_tasks
* already set correctly. If it is zero _valid_pn_min_mem()
* already handles it.
*/
job_desc.num_tasks = detail_ptr->num_tasks;
//job_desc.min_cpus = detail_ptr->min_cpus; /* init'ed above */
job_desc.max_cpus = detail_ptr->orig_max_cpus;
job_desc.shared = (uint16_t)detail_ptr->share_res;
/*
* At this point detail_ptr->ntasks_per_node is expected to
* hold 0 (not set) or a regular value, but never NO_VAL16.
* _valid_pn_min_mem will check for job_desc.ntasks_per_node
* being different than NO_VAL16, which is its initial value.
*/
if (detail_ptr->ntasks_per_node)
job_desc.ntasks_per_node = detail_ptr->ntasks_per_node;
job_desc.ntasks_per_tres = detail_ptr->ntasks_per_tres;
job_desc.pn_min_cpus = detail_ptr->orig_pn_min_cpus;
job_desc.job_id = job_ptr->job_id;
job_desc.bitflags = job_ptr->bit_flags;
job_desc.tres_per_task = xstrdup(job_ptr->tres_per_task);
if (!_valid_pn_min_mem(&job_desc, part_ptr)) {
/* debug2 message already logged inside the function. */
fail_reason = WAIT_PN_MEM_LIMIT;
} else {
/* Copy back to job_record adjusted members */
detail_ptr->pn_min_memory = job_desc.pn_min_memory;
detail_ptr->cpus_per_task = job_desc.cpus_per_task;
detail_ptr->min_cpus = job_desc.min_cpus;
detail_ptr->max_cpus = job_desc.max_cpus;
detail_ptr->pn_min_cpus = job_desc.pn_min_cpus;
SWAP(job_ptr->tres_per_task, job_desc.tres_per_task);
}
xfree(job_desc.tres_per_task);
}
assoc_mgr_unlock(&assoc_mgr_read_lock);
return (fail_reason);
}
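/*
 * Build job_desc->licenses_tot, the job's total license request: the
 * explicitly requested licenses plus any per-task license requests from
 * tres_per_task scaled by the task count. job_ptr is NULL at initial
 * submission; when updating a job it supplies the current values.
 */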
static void _set_tot_license_req(job_desc_msg_t *job_desc,
job_record_t *job_ptr)
{
char *lic_req = NULL, *lic_req_pos = NULL;
uint32_t num_tasks = job_desc->num_tasks;
char *tres_per_task = job_desc->tres_per_task;
/*
 * If tres_per_task is not set, check whether num_tasks has changed.
 * If it has, use the job's current tres_per_task.
 */
if (job_ptr && !tres_per_task && (job_desc->bitflags & TASKS_CHANGED)) {
tres_per_task = job_ptr->tres_per_task;
}
/*
 * Here we check whether something explicit is being set; if so, set
 * it. If we are only changing tasks we need the licenses that were
 * already on the job.
 */
if (job_desc->licenses && (job_desc->licenses[0] ||
(job_desc->bitflags & RESET_LIC_JOB)))
xstrfmtcatat(lic_req, &lic_req_pos, "%s", job_desc->licenses);
else if (tres_per_task &&
!(job_desc->bitflags & RESET_LIC_JOB) &&
job_ptr &&
job_ptr->lic_req)
xstrfmtcatat(lic_req, &lic_req_pos, "%s", job_ptr->lic_req);
if (job_desc->bitflags & RESET_LIC_TASK) {
/* removed tres */
if (!lic_req)
lic_req = xstrdup("");
} else if (tres_per_task) {
char *lic_tmp = slurm_get_tres_sub_string(
tres_per_task, "license", num_tasks, false, false);
if (lic_tmp) {
if (lic_req) {
xstrfmtcatat(lic_req, &lic_req_pos,
",%s", lic_tmp);
xfree(lic_tmp);
} else {
lic_req = lic_tmp;
lic_tmp = NULL;
}
}
}
xfree(job_desc->licenses_tot);
job_desc->licenses_tot = lic_req;
lic_req = NULL;
}
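/*
 * Set or clear STEPMGR_ENABLED on the job: enabled when requested by the
 * job or globally via SlurmctldParameters=enable_stepmgr, except for het
 * job components, and only when PrologFlags=contain is configured.
 */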
static void _enable_stepmgr(job_record_t *job_ptr, job_desc_msg_t *job_desc)
{
static bool first_time = true;
static bool stepmgr_enabled = false;
if (first_time) {
first_time = false;
stepmgr_enabled = xstrstr(slurm_conf.slurmctld_params,
"enable_stepmgr");
}
if ((stepmgr_enabled || (job_desc->bitflags & STEPMGR_ENABLED)) &&
(job_desc->het_job_offset == NO_VAL)) {
job_ptr->bit_flags |= STEPMGR_ENABLED;
} else {
job_ptr->bit_flags &= ~STEPMGR_ENABLED;
}
if ((job_ptr->bit_flags & STEPMGR_ENABLED) &&
!(slurm_conf.prolog_flags & PROLOG_FLAG_CONTAIN)) {
error("STEP_MGR not supported without PrologFlags=contain");
job_ptr->bit_flags &= ~STEPMGR_ENABLED;
}
}
/*
 * _job_create - create a job table record for the supplied specifications.
 * This performs only basic tests for request validity (access to
 * partition, node count in partition, and sufficient processors in
 * partition).
 * IN job_desc - job specifications
 * IN allocate - if set, this is a resource allocation request rather than
 *	a job submission
 * IN will_run - job is not to be created, test of validity only
 * IN cron - true if this is a cron (scrontab) job
 * OUT job_pptr - pointer to the job (NULL on error)
 * IN submit_uid - uid issuing the request
 * OUT err_msg - error message for user
 * IN protocol_version - RPC protocol version of the submitting client
 * RET 0 on success, otherwise ESLURM error code. If the job would only be
 * able to execute with some change in partition configuration then
 * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned
 */
static int _job_create(job_desc_msg_t *job_desc, int allocate, int will_run,
bool cron, job_record_t **job_pptr, uid_t submit_uid,
char **err_msg, uint16_t protocol_version)
{
int error_code = SLURM_SUCCESS;
part_record_t *part_ptr = NULL;
list_t *part_ptr_list = NULL, *qos_ptr_list = NULL;
bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
job_record_t *job_ptr = NULL;
slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL;
list_t *license_list = NULL, *gres_list = NULL;
bool valid;
slurmdb_qos_rec_t *qos_ptr;
uint32_t user_submit_priority, acct_reason = 0;
uint32_t qos_id = 0;
acct_policy_limit_set_t acct_policy_limit_set;
assoc_mgr_lock_t assoc_mgr_read_lock = {
.assoc = READ_LOCK,
.qos = READ_LOCK,
.user = READ_LOCK,
};
gres_job_state_validate_t gres_js_val = {
.cpus_per_task = &job_desc->cpus_per_task,
.max_nodes = &job_desc->max_nodes,
.min_cpus = &job_desc->min_cpus,
.min_nodes = &job_desc->min_nodes,
.ntasks_per_node = &job_desc->ntasks_per_node,
.ntasks_per_socket = &job_desc->ntasks_per_socket,
.ntasks_per_tres = &job_desc->ntasks_per_tres,
.num_tasks = &job_desc->num_tasks,
.sockets_per_node = &job_desc->sockets_per_node,
.gres_list = &gres_list,
};
memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set));
acct_policy_limit_set.tres = xcalloc(slurmctld_tres_cnt,
sizeof(uint16_t));
*job_pptr = NULL;
user_submit_priority = job_desc->priority;
/* ensure that selected nodes are in this partition */
if (job_desc->req_nodes) {
error_code = node_name2bitmap(job_desc->req_nodes, false,
&req_bitmap, NULL);
if (error_code) {
error_code = ESLURM_INVALID_NODE_NAME;
goto cleanup_fail;
}
if ((job_desc->contiguous != NO_VAL16) &&
(job_desc->contiguous))
bit_fill_gaps(req_bitmap);
if (bit_set_count(req_bitmap) > job_desc->min_nodes) {
/*
* If a nodelist has been provided with more nodes than
* are required for the job, translate this into an
* exclusion of all nodes except those requested.
*/
exc_bitmap = bit_alloc(node_record_count);
bit_or_not(exc_bitmap, req_bitmap);
FREE_NULL_BITMAP(req_bitmap);
}
}
/* Zero node count OK for persistent burst buffer create or destroy */
if ((job_desc->max_nodes == 0) &&
(job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) ||
(!job_desc->burst_buffer && !job_desc->script))) {
info("%s: max_nodes is zero", __func__);
error_code = ESLURM_INVALID_NODE_COUNT;
goto cleanup_fail;
}
error_code = _get_job_parts(job_desc, &part_ptr, &part_ptr_list,
err_msg);
if (error_code != SLURM_SUCCESS)
goto cleanup_fail;
memset(&assoc_rec, 0, sizeof(assoc_rec));
assoc_rec.acct = job_desc->account;
assoc_rec.partition = part_ptr->name;
assoc_rec.uid = job_desc->user_id;
/*
* Checks are done later to validate assoc_ptr, so we don't
* need to lock outside of fill_in_assoc.
*/
assoc_mgr_lock(&assoc_mgr_read_lock);
if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce, &assoc_ptr, true)) {
info("%s: invalid account or partition for user %u, "
"account '%s', and partition '%s'", __func__,
job_desc->user_id, assoc_rec.acct, assoc_rec.partition);
error_code = ESLURM_INVALID_ACCOUNT;
assoc_mgr_unlock(&assoc_mgr_read_lock);
goto cleanup_fail;
} else if (slurm_with_slurmdbd() &&
!assoc_ptr &&
!(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) {
/*
* If not enforcing associations we want to look for the
* default account and use it to avoid getting trash in the
* accounting records.
*/
assoc_rec.acct = NULL;
(void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce, &assoc_ptr,
true);
if (assoc_ptr) {
info("%s: account '%s' has no association for user %u "
"using default account '%s'",
__func__, job_desc->account, job_desc->user_id,
assoc_rec.acct);
xfree(job_desc->account);
}
}
if ((error_code = _check_for_part_assocs(
part_ptr_list, assoc_ptr)) != SLURM_SUCCESS) {
assoc_mgr_unlock(&assoc_mgr_read_lock);
goto cleanup_fail;
}
if (job_desc->account == NULL)
job_desc->account = xstrdup(assoc_rec.acct);
/* This must be done after we have the assoc_ptr set */
error_code = _get_qos_info(job_desc->qos, 0,
&qos_ptr_list,
&qos_ptr,
job_desc->reservation,
assoc_ptr,
false, true, LOG_LEVEL_ERROR);
if (error_code != SLURM_SUCCESS) {
assoc_mgr_unlock(&assoc_mgr_read_lock);
goto cleanup_fail;
}
error_code = _valid_job_part(job_desc, submit_uid, req_bitmap,
part_ptr, part_ptr_list,
assoc_ptr, qos_ptr, qos_ptr_list);
if (qos_ptr)
qos_id = qos_ptr->id;
assoc_mgr_unlock(&assoc_mgr_read_lock);
if (error_code != SLURM_SUCCESS)
goto cleanup_fail;
if ((error_code = _validate_job_desc(job_desc, allocate, cron,
submit_uid, part_ptr,
part_ptr_list))) {
goto cleanup_fail;
}
job_desc->tres_req_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t));
_set_tot_license_req(job_desc, NULL);
license_list =
license_validate(job_desc->licenses_tot, validate_cfgd_licenses,
true, false, job_desc->tres_req_cnt, &valid);
if (!valid) {
info("Job's requested licenses are invalid: %s",
job_desc->licenses_tot);
error_code = ESLURM_INVALID_LICENSES;
goto cleanup_fail;
}
if ((job_desc->bitflags & GRES_ONE_TASK_PER_SHARING) &&
(!(slurm_conf.select_type_param &
SELECT_MULTIPLE_SHARING_GRES_PJ))) {
info("%s: one-task-per-sharing requires MULTIPLE_SHARING_GRES_PJ",
__func__);
error_code = ESLURM_INVALID_GRES;
goto cleanup_fail;
}
gres_js_val.cpus_per_tres = job_desc->cpus_per_tres;
gres_js_val.mem_per_tres = job_desc->mem_per_tres;
gres_js_val.tres_freq = job_desc->tres_freq;
gres_js_val.tres_per_job = job_desc->tres_per_job;
gres_js_val.tres_per_node = job_desc->tres_per_node;
gres_js_val.tres_per_socket = job_desc->tres_per_socket;
gres_js_val.tres_per_task = job_desc->tres_per_task;
if ((error_code = gres_job_state_validate(&gres_js_val)))
goto cleanup_fail;
if (!assoc_mgr_valid_tres_cnt(job_desc->cpus_per_tres, 0) ||
!assoc_mgr_valid_tres_cnt(job_desc->mem_per_tres, 0) ||
tres_bind_verify_cmdline(job_desc->tres_bind) ||
tres_freq_verify_cmdline(job_desc->tres_freq) ||
!assoc_mgr_valid_tres_cnt(job_desc->tres_per_job, 0) ||
!assoc_mgr_valid_tres_cnt(job_desc->tres_per_node, 0) ||
!assoc_mgr_valid_tres_cnt(job_desc->tres_per_socket, 0) ||
!assoc_mgr_valid_tres_cnt(job_desc->tres_per_task, 0)) {
error_code = ESLURM_INVALID_TRES;
goto cleanup_fail;
}
gres_stepmgr_set_job_tres_cnt(
gres_list,
job_desc->min_nodes,
job_desc->tres_req_cnt,
false);
/* gres_job_state_validate() can update min_nodes and min_cpus. */
job_desc->tres_req_cnt[TRES_ARRAY_NODE] = job_desc->min_nodes;
job_desc->tres_req_cnt[TRES_ARRAY_CPU] = job_desc->min_cpus;
/* Get GRES before mem so we can pass gres_list to job_get_tres_mem() */
job_desc->tres_req_cnt[TRES_ARRAY_MEM] =
job_get_tres_mem(NULL,
job_desc->pn_min_memory,
job_desc->tres_req_cnt[TRES_ARRAY_CPU],
job_desc->min_nodes, part_ptr,
gres_list,
job_desc->bitflags & JOB_MEM_SET,
job_desc->sockets_per_node,
job_desc->num_tasks);
/*
 * Do this last, after the other TRES values have been set, as it uses
 * them to calculate the billing value.
 */
job_desc->tres_req_cnt[TRES_ARRAY_BILLING] =
assoc_mgr_tres_weighted(job_desc->tres_req_cnt,
part_ptr->billing_weights,
slurm_conf.priority_flags, false);
if ((error_code = bb_g_job_validate(job_desc, submit_uid, err_msg))
!= SLURM_SUCCESS)
goto cleanup_fail;
if (job_desc->deadline && (job_desc->time_limit == NO_VAL) &&
(job_desc->time_min == NO_VAL))
job_desc->time_min = 1;
if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) &&
(!acct_policy_validate(job_desc, part_ptr, part_ptr_list,
assoc_ptr, qos_ptr, &acct_reason,
&acct_policy_limit_set, 0))) {
if (err_msg) {
xfree(*err_msg);
*err_msg =
xstrdup(job_state_reason_string(acct_reason));
}
info("%s: exceeded association/QOS limit for user %u: %s",
__func__, job_desc->user_id,
err_msg ? *err_msg : job_state_reason_string(acct_reason));
error_code = ESLURM_ACCOUNTING_POLICY;
goto cleanup_fail;
}
if (job_desc->exc_nodes) {
bitstr_t *old_exc_bitmap = exc_bitmap;
error_code = node_name2bitmap(job_desc->exc_nodes, false,
&exc_bitmap, NULL);
if (error_code) {
error_code = ESLURM_INVALID_NODE_NAME;
goto cleanup_fail;
}
if (old_exc_bitmap)
bit_or(exc_bitmap, old_exc_bitmap);
FREE_NULL_BITMAP(old_exc_bitmap);
}
if (exc_bitmap && req_bitmap) {
bitstr_t *tmp_bitmap = NULL;
bitoff_t first_set;
tmp_bitmap = bit_copy(exc_bitmap);
bit_and(tmp_bitmap, req_bitmap);
first_set = bit_ffs(tmp_bitmap);
FREE_NULL_BITMAP(tmp_bitmap);
if (first_set != -1) {
info("Job's required and excluded node lists overlap");
error_code = ESLURM_INVALID_NODE_NAME;
goto cleanup_fail;
}
}
if (job_desc->min_nodes == NO_VAL)
job_desc->min_nodes = 1;
if (job_desc->max_nodes == NO_VAL)
job_desc->max_nodes = 0;
if (job_desc->max_nodes &&
(job_desc->max_nodes < job_desc->min_nodes)) {
info("%s: Job's max_nodes(%u) < min_nodes(%u)",
__func__, job_desc->max_nodes, job_desc->min_nodes);
error_code = ESLURM_INVALID_NODE_COUNT;
goto cleanup_fail;
}
if ((error_code = _copy_job_desc_to_job_record(job_desc,
job_pptr,
&req_bitmap,
&exc_bitmap))) {
if (error_code == SLURM_ERROR)
error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY;
job_ptr = *job_pptr;
goto cleanup_fail;
}
job_ptr = *job_pptr;
job_ptr->start_protocol_ver = protocol_version;
job_ptr->part_ptr = part_ptr;
job_ptr->part_ptr_list = part_ptr_list;
job_ptr->qos_list = qos_ptr_list;
job_ptr->bit_flags |= JOB_DEPENDENT;
job_ptr->last_sched_eval = time(NULL);
part_ptr_list = NULL;
qos_ptr_list = NULL;
memcpy(&job_ptr->limit_set, &acct_policy_limit_set,
sizeof(acct_policy_limit_set_t));
acct_policy_limit_set.tres = NULL;
job_ptr->assoc_id = assoc_rec.id;
job_ptr->assoc_ptr = (void *) assoc_ptr;
job_ptr->qos_ptr = (void *) qos_ptr;
job_ptr->qos_id = qos_id;
if (mcs_g_set_mcs_label(job_ptr, job_desc->mcs_label) != 0 ) {
if (job_desc->mcs_label == NULL) {
error("Failed to create job: No valid mcs_label found");
} else {
error("Failed to create job: Invalid mcs-label: %s",
job_desc->mcs_label);
}
error_code = ESLURM_INVALID_MCS_LABEL;
goto cleanup_fail;
}
/*
* Permission for altering priority was confirmed above. The job_submit
* plugin may have set the priority directly or put the job on hold. If
* the priority is not given, we will figure it out later after we see
* if the job is eligible or not. So we want NO_VAL if not set.
*/
job_ptr->priority = job_desc->priority;
if (job_ptr->priority == 0) {
if (user_submit_priority == 0)
job_ptr->state_reason = WAIT_HELD_USER;
else
job_ptr->state_reason = WAIT_HELD;
} else if ((job_ptr->priority != NO_VAL) &&
(job_ptr->priority != INFINITE)) {
job_ptr->direct_set_prio = 1;
} else if ((job_ptr->priority == INFINITE) &&
(user_submit_priority == INFINITE)) {
/* This happens when "hold": false is specified to slurmrestd */
job_ptr->priority = NO_VAL;
}
/*
 * The job submit plugin sets site_factor to NO_VAL so that it can
 * only be set by the job submit plugin at submission.
 */
if (job_desc->site_factor != NO_VAL)
job_ptr->site_factor = job_desc->site_factor;
error_code = update_job_dependency(job_ptr, job_desc->dependency);
if (error_code != SLURM_SUCCESS)
goto cleanup_fail;
job_ptr->details->orig_dependency = xstrdup(job_ptr->details->
dependency);
if ((error_code = build_feature_list(job_ptr, false, false)))
goto cleanup_fail;
if ((error_code = build_feature_list(job_ptr, true, false)))
goto cleanup_fail;
error_code = extra_constraints_parse(job_ptr->extra,
&job_ptr->extra_constraints);
if (error_code != SLURM_SUCCESS)
goto cleanup_fail;
/*
 * NOTE: If this job is being used to expand another job, this job's
 * gres_list has already been filled in with a copy of the gres_list of
 * the job to be expanded by update_job_dependency()
 */
if (!job_ptr->details->expanding_jobid) {
job_ptr->gres_list_req = gres_list;
gres_list = NULL;
}
job_ptr->gres_detail_cnt = 0;
job_ptr->gres_detail_str = NULL;
gres_job_state_log(job_ptr->gres_list_req, job_ptr->job_id);
if ((error_code = validate_job_resv(job_ptr)))
goto cleanup_fail;
if (job_desc->script
&& (!will_run)) { /* don't bother with copy if just a test */
char *tmp;
if ((error_code = _copy_job_desc_to_file(job_desc,
job_ptr->job_id))) {
error_code = ESLURM_WRITING_TO_FILE;
goto cleanup_fail;
}
job_ptr->batch_flag = 1;
if (slurm_conf.conf_flags & CONF_FLAG_SJE) {
tmp = xstring_bytes2hex(job_desc->env_hash.hash,
sizeof(job_desc->env_hash.hash),
NULL);
job_ptr->details->env_hash =
xstrdup_printf("%d:%s",
job_desc->env_hash.type,
tmp);
xfree(tmp);
}
if (slurm_conf.conf_flags & CONF_FLAG_SJS) {
tmp = xstring_bytes2hex(
job_desc->script_hash.hash,
sizeof(job_desc->script_hash.hash), NULL);
job_ptr->details->script_hash =
xstrdup_printf("%d:%s",
job_desc->script_hash.type,
tmp);
xfree(tmp);
}
} else
job_ptr->batch_flag = 0;
if (!will_run &&
(error_code = bb_g_job_validate2(job_ptr, err_msg)))
goto cleanup_fail;
job_ptr->license_list = license_list;
license_list = NULL;
if (job_desc->req_switch != NO_VAL) { /* Max # of switches */
job_ptr->req_switch = job_desc->req_switch;
if (job_desc->wait4switch != NO_VAL) {
job_ptr->wait4switch =
_max_switch_wait(job_desc->wait4switch);
} else
job_ptr->wait4switch = _max_switch_wait(INFINITE);
}
job_ptr->best_switch = true;
_enable_stepmgr(job_ptr, job_desc);
FREE_NULL_LIST(license_list);
FREE_NULL_LIST(gres_list);
FREE_NULL_BITMAP(req_bitmap);
FREE_NULL_BITMAP(exc_bitmap);
return error_code;
cleanup_fail:
if (job_ptr) {
job_state_set(job_ptr, JOB_FAILED);
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
xfree(job_ptr->state_desc);
job_ptr->start_time = job_ptr->end_time = time(NULL);
purge_job_record(job_ptr->job_id);
*job_pptr = NULL;
}
FREE_NULL_LIST(license_list);
xfree(acct_policy_limit_set.tres);
FREE_NULL_LIST(gres_list);
FREE_NULL_LIST(part_ptr_list);
FREE_NULL_LIST(qos_ptr_list);
FREE_NULL_BITMAP(req_bitmap);
FREE_NULL_BITMAP(exc_bitmap);
return error_code;
}
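/*
 * Return ESLURM_PATHNAME_TOO_LONG if test_str exceeds max_str_len
 * characters, logging str_name; a NULL string is treated as length 0.
 */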
static int _test_strlen(char *test_str, char *str_name, int max_str_len)
{
int i = 0;
if (test_str)
i = strlen(test_str);
if (i > max_str_len) {
info("job_create_request: strlen(%s) too big (%d > %d)",
str_name, i, max_str_len);
return ESLURM_PATHNAME_TOO_LONG;
}
return SLURM_SUCCESS;
}
/* Translate a job array expression into the equivalent bitmap */
static bool _valid_array_inx(job_desc_msg_t *job_desc)
{
static time_t sched_update = 0;
static uint32_t max_task_cnt = NO_VAL;
uint32_t task_cnt;
bool valid = true;
char *tmp, *tok, *last = NULL;
FREE_NULL_BITMAP(job_desc->array_bitmap);
if (!job_desc->array_inx || !job_desc->array_inx[0])
return true;
if (!job_desc->script || !job_desc->script[0])
return false;
if (max_array_size == NO_VAL) {
max_array_size = slurm_conf.max_array_sz;
}
if (max_array_size == 0) {
verbose("Job arrays disabled, MaxArraySize=0");
return false;
}
if (sched_update != slurm_conf.last_update) {
char *key;
max_task_cnt = max_array_size;
sched_update = slurm_conf.last_update;
if ((key = xstrcasestr(slurm_conf.sched_params,
"max_array_tasks="))) {
key += 16;
max_task_cnt = atoi(key);
}
}
/* We have a job array request */
job_desc->immediate = 0; /* Disable immediate option */
job_desc->array_bitmap = bit_alloc(max_array_size);
tmp = xstrdup(job_desc->array_inx);
tok = strtok_r(tmp, ",", &last);
while (tok && valid) {
valid = slurm_parse_array_tok(tok, job_desc->array_bitmap,
max_array_size);
tok = strtok_r(NULL, ",", &last);
}
xfree(tmp);
if (valid && (max_task_cnt < max_array_size)) {
task_cnt = bit_set_count(job_desc->array_bitmap);
if (task_cnt > max_task_cnt) {
debug("max_array_tasks exceeded (%u > %u)",
task_cnt, max_task_cnt);
valid = false;
}
}
return valid;
}
/* Make sure a job descriptor's strings are not huge, which could result in
* a denial of service attack due to memory demands by the slurmctld */
static int _test_job_desc_fields(job_desc_msg_t * job_desc)
{
static time_t sched_update = 0;
static int max_script = DEFAULT_BATCH_SCRIPT_LIMIT;
static int max_submit_line = DEFAULT_MAX_SUBMIT_LINE_SIZE;
if (sched_update != slurm_conf.last_update) {
char *tmp_ptr;
sched_update = slurm_conf.last_update;
if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params,
"max_script_size="))) {
max_script = atoi(tmp_ptr + 16);
} else {
max_script = DEFAULT_BATCH_SCRIPT_LIMIT;
}
if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params,
"max_submit_line_size="))) {
max_submit_line = atoi(tmp_ptr + 21);
} else {
max_submit_line = DEFAULT_MAX_SUBMIT_LINE_SIZE;
}
}
if (_test_strlen(job_desc->account, "account", 1024) ||
_test_strlen(job_desc->alloc_node, "alloc_node", 1024) ||
_test_strlen(job_desc->array_inx, "array_inx", 1024 * 4) ||
_test_strlen(job_desc->burst_buffer, "burst_buffer",1024*8) ||
_test_strlen(job_desc->comment, "comment", 1024) ||
_test_strlen(job_desc->cpu_bind, "cpu-bind", 1024 * 128) ||
_test_strlen(job_desc->cpus_per_tres, "cpus_per_tres", 1024)||
_test_strlen(job_desc->dependency, "dependency", 1024*128) ||
_test_strlen(job_desc->extra, "extra", 1024) ||
_test_strlen(job_desc->features, "features", 1024) ||
_test_strlen(
job_desc->cluster_features, "cluster_features", 1024) ||
_test_strlen(job_desc->licenses_tot, "licenses", 1024) ||
_test_strlen(job_desc->mail_user, "mail_user", 1024) ||
_test_strlen(job_desc->mcs_label, "mcs_label", 1024) ||
_test_strlen(job_desc->mem_bind, "mem-bind", 1024 * 128) ||
_test_strlen(job_desc->mem_per_tres, "mem_per_tres", 1024) ||
_test_strlen(job_desc->name, "name", 1024) ||
_test_strlen(job_desc->network, "network", 1024) ||
_test_strlen(job_desc->partition, "partition", 1024) ||
_test_strlen(job_desc->prefer, "prefer", 1024) ||
_test_strlen(job_desc->qos, "qos", 1024) ||
_test_strlen(job_desc->reservation, "reservation", 1024) ||
_test_strlen(job_desc->script, "script", max_script) ||
_test_strlen(job_desc->std_err, "std_err", PATH_MAX) ||
_test_strlen(job_desc->std_in, "std_in", PATH_MAX) ||
_test_strlen(job_desc->std_out, "std_out", PATH_MAX) ||
_test_strlen(job_desc->submit_line, "submit_line",
max_submit_line) ||
_test_strlen(job_desc->tres_bind, "tres_bind", 1024) ||
_test_strlen(job_desc->tres_freq, "tres_freq", 1024) ||
_test_strlen(job_desc->tres_per_job, "tres_per_job", 1024) ||
_test_strlen(job_desc->tres_per_node, "tres_per_node", 1024)||
_test_strlen(job_desc->tres_per_socket, "tres_per_socket", 1024) ||
_test_strlen(job_desc->tres_per_task, "tres_per_task", 1024)||
_test_strlen(job_desc->wckey, "wckey", 1024) ||
_test_strlen(job_desc->work_dir, "work_dir", PATH_MAX))
return ESLURM_PATHNAME_TOO_LONG;
return SLURM_SUCCESS;
}
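/*
 * Derive an implicit task count from the node count and ntasks-per-node or
 * ntasks-per-tres when the user did not give one, and set the TASKS_CHANGED
 * and JOB_NTASKS_SET bitflags accordingly. job_ptr is NULL at submission;
 * on an update, unset job_desc fields are filled from the existing record.
 */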
static void _figure_out_num_tasks(
job_desc_msg_t *job_desc, job_record_t *job_ptr)
{
uint32_t num_tasks = job_desc->num_tasks;
uint32_t min_nodes = job_desc->min_nodes;
uint32_t max_nodes = job_desc->max_nodes;
uint16_t ntasks_per_node = job_desc->ntasks_per_node;
uint16_t ntasks_per_tres = job_desc->ntasks_per_tres;
/*
* Don't figure out num tasks / bitflags if updating the job and none
* of the relevant influencing fields in job_desc are set.
*/
if (job_ptr &&
(job_desc->num_tasks == NO_VAL && job_desc->min_nodes == NO_VAL &&
job_desc->ntasks_per_node == NO_VAL16 &&
job_desc->ntasks_per_tres == NO_VAL16))
return;
if (num_tasks != NO_VAL) {
job_desc->bitflags |= JOB_NTASKS_SET;
}
if (job_ptr) {
if (min_nodes == NO_VAL)
min_nodes = job_ptr->details->min_nodes;
if (max_nodes == NO_VAL)
max_nodes = job_ptr->details->max_nodes;
if (max_nodes == 0)
max_nodes = min_nodes;
if ((ntasks_per_node == NO_VAL16) &&
job_ptr->details->ntasks_per_node)
ntasks_per_node = job_ptr->details->ntasks_per_node;
else if ((ntasks_per_tres == NO_VAL16) &&
job_ptr->details->ntasks_per_tres)
ntasks_per_tres = job_ptr->details->ntasks_per_tres;
} else if (job_desc->min_nodes == NO_VAL) {
min_nodes = job_desc->min_nodes = 1;
}
/* If we are creating the job we want the tasks to be set every time. */
if ((num_tasks == NO_VAL) &&
(min_nodes != NO_VAL) &&
(!job_ptr || (min_nodes == max_nodes))) {
/* Implicitly set task count */
if (ntasks_per_tres != NO_VAL16)
num_tasks = min_nodes * ntasks_per_tres;
else if (ntasks_per_node != NO_VAL16)
num_tasks = min_nodes * ntasks_per_node;
}
if (job_ptr) {
if ((num_tasks != NO_VAL) &&
(num_tasks != job_ptr->details->num_tasks)) {
job_desc->num_tasks = num_tasks;
job_desc->bitflags |= TASKS_CHANGED;
}
} else if (num_tasks != job_desc->num_tasks) {
job_desc->num_tasks = num_tasks;
job_desc->bitflags |= TASKS_CHANGED;
}
}
/* Perform some size checks on strings we store to prevent
* a malicious user from filling slurmctld's memory
* IN job_desc - user job submit request
* IN submit_uid - UID making job submit request
* OUT err_msg - custom error message to return
* RET 0 or error code */
extern int validate_job_create_req(job_desc_msg_t * job_desc, uid_t submit_uid,
char **err_msg)
{
job_record_t *job_ptr = NULL;
int rc;
/*
* Check user permission for negative 'nice' and non-0 priority values
* (restricted to root, SlurmUser, or SLURMDB_ADMIN_OPERATOR) _before_
* running the job_submit plugin.
*/
if (!validate_operator(submit_uid)) {
if (job_desc->priority != 0)
job_desc->priority = NO_VAL;
if (job_desc->nice < NICE_OFFSET)
return ESLURM_INVALID_NICE;
}
if (!validate_super_user(submit_uid)) {
/* AdminComment can only be set by an Admin. */
if (job_desc->admin_comment)
return ESLURM_ACCESS_DENIED;
if (job_desc->reboot && (job_desc->reboot != NO_VAL16)) {
*err_msg = xstrdup("rebooting of nodes is only allowed for admins");
return ESLURM_ACCESS_DENIED;
}
}
rc = job_submit_g_submit(job_desc, submit_uid, err_msg);
if (rc != SLURM_SUCCESS)
return rc;
/* Reject jobs requesting arbitrary distribution without a task count */
if (((job_desc->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY) && (job_desc->num_tasks == NO_VAL)) {
*err_msg = xstrdup("task count required for arbitrary distribution");
return ESLURM_BAD_TASK_COUNT;
}
/* Add a temporary job_ptr for node_features_g_job_valid */
job_ptr = xmalloc(sizeof(job_record_t));
job_ptr->details = xmalloc(sizeof(job_details_t));
/* Point, don't dup, so don't free */
job_ptr->details->features = job_desc->features;
job_ptr->details->prefer = job_desc->prefer;
/* job_ptr->job_id = 0; */
job_ptr->user_id = job_desc->user_id;
if ((rc = build_feature_list(job_ptr, false, false)) != SLURM_SUCCESS)
goto fini;
rc = node_features_g_job_valid(job_desc->features,
job_ptr->details->feature_list);
if (rc != SLURM_SUCCESS)
goto fini;
if (build_feature_list(job_ptr, true, false) != SLURM_SUCCESS) {
rc = ESLURM_INVALID_PREFER;
goto fini;
}
rc = node_features_g_job_valid(job_desc->prefer,
job_ptr->details->prefer_list);
if (rc == ESLURM_INVALID_FEATURE)
rc = ESLURM_INVALID_PREFER;
if (rc != SLURM_SUCCESS) {
goto fini;
}
rc = _test_job_desc_fields(job_desc);
if (rc != SLURM_SUCCESS)
goto fini;
if (!_valid_array_inx(job_desc)) {
rc = ESLURM_INVALID_ARRAY;
goto fini;
}
if (job_desc->x11 && !(slurm_conf.prolog_flags & PROLOG_FLAG_X11)) {
rc = ESLURM_X11_NOT_AVAIL;
goto fini;
}
/* Make sure anything that may be put in the database will be
* lower case */
xstrtolower(job_desc->account);
xstrtolower(job_desc->wckey);
if (job_desc->wckey && (job_desc->wckey[0] == '*')) {
rc = ESLURM_INVALID_WCKEY;
goto fini;
}
/* Basic validation of some parameters */
if (job_desc->req_nodes && (job_desc->min_nodes == NO_VAL)) {
bitstr_t *node_bitmap = NULL;
if (node_name2bitmap(job_desc->req_nodes, false,
&node_bitmap, NULL)) {
/* likely a badly formatted hostlist */
error("validate_job_create_req: bad hostlist");
rc = ESLURM_INVALID_NODE_NAME;
goto fini;
}
job_desc->min_nodes = bit_set_count(node_bitmap);
FREE_NULL_BITMAP(node_bitmap);
}
_figure_out_num_tasks(job_desc, NULL);
/* Only set min and max cpus if overcommit isn't set */
if ((job_desc->overcommit == NO_VAL8) &&
((job_desc->min_cpus == NO_VAL) ||
((job_desc->min_cpus != NO_VAL) &&
(job_desc->num_tasks != NO_VAL) &&
(job_desc->num_tasks > job_desc->min_cpus)))) {
if (job_desc->num_tasks != NO_VAL)
job_desc->min_cpus = job_desc->num_tasks;
else if (job_desc->min_nodes != NO_VAL)
job_desc->min_cpus = job_desc->min_nodes;
else
job_desc->min_cpus = 1;
if (job_desc->cpus_per_task != NO_VAL16)
job_desc->min_cpus *= job_desc->cpus_per_task;
/* This is just a sanity check as we wouldn't ever have a
* max_cpus if we didn't have a min_cpus.
*/
if ((job_desc->max_cpus != NO_VAL) &&
(job_desc->max_cpus < job_desc->min_cpus))
job_desc->max_cpus = job_desc->min_cpus;
}
if (job_desc->reboot && (job_desc->reboot != NO_VAL16))
job_desc->shared = 0;
fini:
on_job_state_change(job_ptr, NO_VAL);
FREE_NULL_LIST(job_ptr->details->feature_list);
FREE_NULL_LIST(job_ptr->details->prefer_list);
xfree(job_ptr->details);
xfree(job_ptr);
return rc;
}
/* _copy_job_desc_to_file - copy the job script and environment from the RPC
* structure into a file */
static int
_copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id)
{
int error_code = 0, hash;
char *dir_name, *file_name;
DEF_TIMERS;
START_TIMER;
if (!job_desc->container &&
(!job_desc->environment || job_desc->env_size == 0)) {
error("%s: batch job cannot run without an environment",
__func__);
return ESLURM_ENVIRONMENT_MISSING;
}
/* Create directory based upon job ID due to limitations on the number
* of files possible in a directory on some file system types (e.g.
* up to 64k files on a FAT32 file system). */
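	/*
	 * e.g. (hypothetical) JobId=1233 hashes to hash.3, giving
	 * StateSaveLocation/hash.3/job.1233/{environment,script}.
	 */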
hash = job_id % 10;
dir_name = xstrdup_printf("%s/hash.%d",
slurm_conf.state_save_location, hash);
(void) mkdir(dir_name, 0700);
/* Create job_id specific directory */
xstrfmtcat(dir_name, "/job.%u", job_id);
if (mkdir(dir_name, 0700)) {
if (!slurmctld_primary && (errno == EEXIST)) {
error("Apparent duplicate JobId=%u. Two primary slurmctld daemons might currently be active",
job_id);
}
error("mkdir(%s) error %m", dir_name);
xfree(dir_name);
return ESLURM_WRITING_TO_FILE;
}
/* Create environment file, and write data to it */
file_name = xstrdup_printf("%s/environment", dir_name);
error_code = _write_data_array_to_file(file_name,
job_desc->environment,
job_desc->env_size);
xfree(file_name);
if (error_code == 0) {
/* Create script file */
file_name = xstrdup_printf("%s/script", dir_name);
error_code = write_data_to_file(file_name, job_desc->script);
xfree(file_name);
}
xfree(dir_name);
END_TIMER2(__func__);
return error_code;
}
/* Return true if the specified job ID already has a batch directory so
* that a different job ID can be created. This is to help limit damage from
* split-brain, where two slurmctld daemons are running as primary. */
static bool _dup_job_file_test(uint32_t job_id)
{
char *dir_name_src;
struct stat buf;
int rc, hash = job_id % 10;
dir_name_src = xstrdup_printf("%s/hash.%d/job.%u",
slurm_conf.state_save_location,
hash, job_id);
rc = stat(dir_name_src, &buf);
xfree(dir_name_src);
if (rc == 0) {
error("Vestigial state files for JobId=%u, but no job record. This may be the result of two slurmctld running in primary mode",
job_id);
return true;
}
errno = 0; /* don't care about errno */
return false;
}
/*
* Create file with specified name and write the supplied data array to it
* IN file_name - file to create and write to
* IN data - array of pointers to strings (e.g. env)
* IN size - number of elements in data
*/
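/*
 * On-disk layout written below (and read back by
 * _read_data_array_from_file()): a uint32_t record count followed by
 * "size" NUL-terminated strings packed back to back.
 */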
static int
_write_data_array_to_file(char *file_name, char **data, uint32_t size)
{
int fd, i, pos, nwrite, amount;
fd = creat(file_name, 0600);
if (fd < 0) {
error("Error creating file %s, %m", file_name);
return ESLURM_WRITING_TO_FILE;
}
amount = write(fd, &size, sizeof(uint32_t));
	if (amount < (int) sizeof(uint32_t)) {
error("Error writing file %s, %m", file_name);
close(fd);
return ESLURM_WRITING_TO_FILE;
}
if (data == NULL) {
close(fd);
return SLURM_SUCCESS;
}
for (i = 0; i < size; i++) {
nwrite = strlen(data[i]) + 1;
pos = 0;
while (nwrite > 0) {
amount = write(fd, &data[i][pos], nwrite);
if ((amount < 0) && (errno != EINTR)) {
error("Error writing file %s, %m",
file_name);
close(fd);
return ESLURM_WRITING_TO_FILE;
}
nwrite -= amount;
pos += amount;
}
}
close(fd);
return SLURM_SUCCESS;
}
/*
 * Create file with specified name and write the supplied data string to it
* IN file_name - file to create and write to
* IN data - pointer to string
*/
extern int write_data_to_file(char *file_name, char *data)
{
int fd, pos, nwrite, amount;
if (data == NULL) {
(void) unlink(file_name);
return SLURM_SUCCESS;
}
fd = creat(file_name, 0700);
if (fd < 0) {
error("Error creating file %s, %m", file_name);
return ESLURM_WRITING_TO_FILE;
}
nwrite = strlen(data) + 1;
pos = 0;
while (nwrite > 0) {
amount = write(fd, &data[pos], nwrite);
if ((amount < 0) && (errno != EINTR)) {
error("Error writing file %s, %m", file_name);
close(fd);
return ESLURM_WRITING_TO_FILE;
}
nwrite -= amount;
pos += amount;
}
close(fd);
return SLURM_SUCCESS;
}
/*
* get_job_env - return the environment variables and their count for a
* given job
* IN job_ptr - pointer to job for which data is required
* OUT env_size - number of elements to read
 * RET pointer to array of string pointers containing environment variables
*/
char **get_job_env(job_record_t *job_ptr, uint32_t *env_size)
{
char *file_name = NULL, **environment = NULL;
int cc, fd = -1, hash;
uint32_t use_id;
use_id = (job_ptr->array_task_id != NO_VAL) ?
job_ptr->array_job_id : job_ptr->job_id;
hash = use_id % 10;
file_name = xstrdup_printf("%s/hash.%d/job.%u/environment",
slurm_conf.state_save_location,
hash, use_id);
	fd = open(file_name, O_RDONLY);
if (fd >= 0) {
cc = _read_data_array_from_file(fd, file_name, &environment,
env_size, job_ptr);
if (cc < 0)
environment = NULL;
close(fd);
} else {
error("Could not open environment file for %pJ", job_ptr);
}
xfree(file_name);
return environment;
}
/*
* get_job_script - return the script for a given job
* IN job_ptr - pointer to job for which data is required
 * RET buf_t * containing the job script
*/
buf_t *get_job_script(const job_record_t *job_ptr)
{
char *file_name = NULL;
int hash;
uint32_t use_id;
buf_t *buf;
if (!job_ptr->batch_flag)
return NULL;
use_id = (job_ptr->array_task_id != NO_VAL) ?
job_ptr->array_job_id : job_ptr->job_id;
hash = use_id % 10;
file_name = xstrdup_printf("%s/hash.%d/job.%u/script",
slurm_conf.state_save_location,
hash, use_id);
if (!(buf = create_mmap_buf(file_name)))
error("Could not open script file for %pJ", job_ptr);
xfree(file_name);
return buf;
}
extern uint16_t job_get_sockets_per_node(job_record_t *job_ptr)
{
xassert(job_ptr);
if (job_ptr->details && job_ptr->details->mc_ptr &&
job_ptr->details->mc_ptr->sockets_per_node &&
(job_ptr->details->mc_ptr->sockets_per_node != NO_VAL16))
return job_ptr->details->mc_ptr->sockets_per_node;
return 1;
}
/*
* Read a collection of strings from a file
* IN fd - file descriptor
* IN file_name - file to read from
* OUT data - pointer to array of pointers to strings (e.g. env),
* must be xfreed when no longer needed
* OUT size - number of elements in data
* IN job_ptr - job
* RET 0 on success, -1 on error
* NOTE: The output format of this must be identical with _xduparray2()
*/
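/*
 * Supplemental ("env_sup") variables are merged in after the file is
 * read: an entry whose "NAME=" prefix matches an existing record
 * replaces it, otherwise it is appended. e.g. (hypothetical) env_sup
 * entry "PATH=/custom" supersedes any "PATH=..." read from the file.
 */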
static int _read_data_array_from_file(int fd, char *file_name, char ***data,
uint32_t *size, job_record_t *job_ptr)
{
int pos, buf_size, amount, i, j;
char *buffer, **array_ptr;
uint32_t rec_cnt;
xassert(file_name);
xassert(data);
xassert(size);
*data = NULL;
*size = 0;
amount = read(fd, &rec_cnt, sizeof(uint32_t));
	if (amount < (int) sizeof(uint32_t)) {
if (amount != 0) /* incomplete write */
error("Error reading file %s, %m", file_name);
else
verbose("File %s has zero size", file_name);
return -1;
}
if (rec_cnt >= INT_MAX) {
error("%s: unreasonable record counter %d in file %s",
__func__, rec_cnt, file_name);
return -1;
}
if (rec_cnt == 0) {
*data = NULL;
*size = 0;
return 0;
}
pos = 0;
buf_size = BUF_SIZE;
buffer = xmalloc(buf_size + 1);
while (1) {
amount = read(fd, &buffer[pos], BUF_SIZE);
if (amount < 0) {
error("Error reading file %s, %m", file_name);
xfree(buffer);
return -1;
}
buffer[pos + amount] = '\0';
pos += amount;
if (amount < BUF_SIZE) /* end of file */
break;
buf_size += amount;
xrealloc(buffer, buf_size + 1);
}
/* Allocate extra space for supplemental environment variables */
if (job_ptr->details->env_cnt) {
for (j = 0; j < job_ptr->details->env_cnt; j++)
pos += (strlen(job_ptr->details->env_sup[j]) + 1);
xrealloc(buffer, pos);
}
/* We have all the data, now let's compute the pointers */
array_ptr = xcalloc((rec_cnt + job_ptr->details->env_cnt) + 1,
sizeof(char *));
for (i = 0, pos = 0; i < rec_cnt; i++) {
array_ptr[i] = &buffer[pos];
pos += strlen(&buffer[pos]) + 1;
if ((pos > buf_size) && ((i + 1) < rec_cnt)) {
error("Bad environment file %s", file_name);
rec_cnt = i;
break;
}
}
/* Add supplemental environment variables */
if (job_ptr->details->env_cnt) {
char *tmp_chr;
int env_len, name_len;
for (j = 0; j < job_ptr->details->env_cnt; j++) {
tmp_chr = strchr(job_ptr->details->env_sup[j], '=');
if (tmp_chr == NULL) {
error("Invalid supplemental environment "
"variable: %s",
job_ptr->details->env_sup[j]);
continue;
}
env_len = strlen(job_ptr->details->env_sup[j]) + 1;
name_len = tmp_chr - job_ptr->details->env_sup[j] + 1;
/* search for duplicate */
for (i = 0; i < rec_cnt; i++) {
if (xstrncmp(array_ptr[i],
job_ptr->details->env_sup[j],
name_len)) {
continue;
}
/*
			 * If the duplicate is at the front we cannot
			 * overwrite that spot in place; clear it and then
			 * add the new value to the end of the array.
*/
if (i == 0) {
array_ptr[0][0] = '\0';
i = rec_cnt;
break;
}
/* over-write duplicate */
memcpy(&buffer[pos],
job_ptr->details->env_sup[j], env_len);
array_ptr[i] = &buffer[pos];
pos += env_len;
break;
}
if (i >= rec_cnt) { /* add env to array end */
memcpy(&buffer[pos],
job_ptr->details->env_sup[j], env_len);
array_ptr[rec_cnt++] = &buffer[pos];
pos += env_len;
}
}
}
*size = rec_cnt;
*data = array_ptr;
return 0;
}
/* Given a job request, return a multi_core_data struct.
 * Returns NULL if no values are set in the job/step request */
static multi_core_data_t *
_set_multi_core_data(job_desc_msg_t * job_desc)
{
multi_core_data_t * mc_ptr;
if ((job_desc->sockets_per_node == NO_VAL16) &&
(job_desc->cores_per_socket == NO_VAL16) &&
(job_desc->threads_per_core == NO_VAL16) &&
(job_desc->ntasks_per_socket == NO_VAL16) &&
(job_desc->ntasks_per_core == NO_VAL16) &&
(job_desc->plane_size == NO_VAL16))
return NULL;
mc_ptr = xmalloc(sizeof(multi_core_data_t));
mc_ptr->sockets_per_node = job_desc->sockets_per_node;
mc_ptr->cores_per_socket = job_desc->cores_per_socket;
mc_ptr->threads_per_core = job_desc->threads_per_core;
if (job_desc->ntasks_per_socket != NO_VAL16)
mc_ptr->ntasks_per_socket = job_desc->ntasks_per_socket;
else
mc_ptr->ntasks_per_socket = INFINITE16;
if (job_desc->ntasks_per_core != NO_VAL16)
mc_ptr->ntasks_per_core = job_desc->ntasks_per_core;
else if (slurm_conf.select_type_param & SELECT_ONE_TASK_PER_CORE)
mc_ptr->ntasks_per_core = 1;
else
mc_ptr->ntasks_per_core = INFINITE16;
if (job_desc->plane_size != NO_VAL16)
mc_ptr->plane_size = job_desc->plane_size;
else
mc_ptr->plane_size = 0;
return mc_ptr;
}
/* Return default "wait_all_nodes" option for a new job */
static uint16_t _default_wait_all_nodes(job_desc_msg_t *job_desc)
{
static uint16_t default_batch_wait = NO_VAL16;
static time_t sched_update = 0;
if (!job_desc->script)
return 0;
if ((default_batch_wait != NO_VAL16) &&
(sched_update == slurm_conf.last_update))
return default_batch_wait;
if (xstrcasestr(slurm_conf.sched_params, "sbatch_wait_nodes"))
default_batch_wait = 1;
else
default_batch_wait = 0;
sched_update = slurm_conf.last_update;
return default_batch_wait;
}
static int _unroll_min_max_node(job_record_t *job_ptr)
{
static int max_unroll = -1;
static time_t topo_update = 0;
job_details_t *detail_ptr = job_ptr->details;
int i;
if (topo_update != slurm_conf.last_update) {
char *tmp_ptr;
topo_update = slurm_conf.last_update;
char *unroll_opt_str = "TopoMaxSizeUnroll=";
if ((topology_get_plugin_id() == TOPOLOGY_PLUGIN_BLOCK) &&
(tmp_ptr = xstrcasestr(slurm_conf.topology_param,
unroll_opt_str))) {
i = atoi(tmp_ptr + strlen(unroll_opt_str));
if (i < 0) {
error("ignoring TopologyParam: TopoMaxSizeUnroll %d",
i);
} else {
max_unroll = i;
}
}
}
if (max_unroll < 0)
return SLURM_SUCCESS;
if (detail_ptr->job_size_bitmap)
return SLURM_SUCCESS;
if (!detail_ptr->max_nodes ||
(detail_ptr->max_nodes == detail_ptr->min_nodes))
return SLURM_SUCCESS;
if ((detail_ptr->max_nodes < MAX_JOB_SIZE_BITMAP) &&
((detail_ptr->max_nodes - detail_ptr->min_nodes) < max_unroll)) {
bitstr_t *size_bitmap;
size_bitmap = bit_alloc(detail_ptr->max_nodes + 1);
bit_nset(size_bitmap, detail_ptr->min_nodes,
detail_ptr->max_nodes);
detail_ptr->job_size_bitmap = size_bitmap;
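		/*
		 * e.g. (hypothetical) min_nodes=2 and max_nodes=5 within the
		 * unroll limit yields a job_size_bitmap with bits 2-5 set.
		 */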
} else {
return ESLURM_INVALID_NODE_COUNT;
}
return SLURM_SUCCESS;
}
/* _copy_job_desc_to_job_record - copy the job descriptor from the RPC
* structure into the actual slurmctld job record */
static int _copy_job_desc_to_job_record(job_desc_msg_t *job_desc,
job_record_t **job_rec_ptr,
bitstr_t **req_bitmap,
bitstr_t **exc_bitmap)
{
int error_code;
job_details_t *detail_ptr;
job_record_t *job_ptr;
if (slurm_conf.conf_flags & CONF_FLAG_WCKEY) {
if (!job_desc->wckey) {
/* get the default wckey for this user since none was
* given */
slurmdb_user_rec_t user_rec;
memset(&user_rec, 0, sizeof(user_rec));
user_rec.uid = job_desc->user_id;
assoc_mgr_fill_in_user(acct_db_conn, &user_rec,
accounting_enforce, NULL, false);
if (user_rec.default_wckey)
job_desc->wckey = xstrdup_printf(
"*%s", user_rec.default_wckey);
else if (!(accounting_enforce &
ACCOUNTING_ENFORCE_WCKEYS))
job_desc->wckey = xstrdup("*");
else {
error("Job didn't specify wckey and user "
"%d has no default.", job_desc->user_id);
return ESLURM_INVALID_WCKEY;
}
} else if (job_desc->wckey) {
slurmdb_wckey_rec_t wckey_rec, *wckey_ptr = NULL;
memset(&wckey_rec, 0, sizeof(wckey_rec));
wckey_rec.uid = job_desc->user_id;
wckey_rec.name = job_desc->wckey;
if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
accounting_enforce,
&wckey_ptr, false)) {
if (accounting_enforce &
ACCOUNTING_ENFORCE_WCKEYS) {
error("%s: invalid wckey '%s' for "
"user %u.",
__func__, wckey_rec.name,
job_desc->user_id);
return ESLURM_INVALID_WCKEY;
}
}
} else if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) {
/* This should never happen */
info("%s: no wckey was given for job submit", __func__);
return ESLURM_INVALID_WCKEY;
}
}
job_ptr = _create_job_record(1, true);
*job_rec_ptr = job_ptr;
job_ptr->partition = xstrdup(job_desc->partition);
if (job_desc->profile != ACCT_GATHER_PROFILE_NOT_SET)
job_ptr->profile = job_desc->profile;
if (job_desc->job_id != NO_VAL) { /* already confirmed unique */
job_ptr->job_id = job_desc->job_id;
} else {
error_code = _set_job_id(job_ptr);
if (error_code)
return error_code;
}
job_ptr->name = xstrdup(job_desc->name);
job_ptr->wckey = xstrdup(job_desc->wckey);
/* Since this is only used in the slurmctld, copy it now. */
job_ptr->tres_req_cnt = job_desc->tres_req_cnt;
job_desc->tres_req_cnt = NULL;
set_job_tres_req_str(job_ptr, false);
_add_job_hash(job_ptr);
job_ptr->user_id = (uid_t) job_desc->user_id;
job_ptr->group_id = (gid_t) job_desc->group_id;
/* skip copy, just take ownership */
job_ptr->id = job_desc->id;
job_desc->id = NULL;
job_state_set(job_ptr, JOB_PENDING);
job_ptr->time_limit = job_desc->time_limit;
job_ptr->deadline = job_desc->deadline;
if (job_desc->delay_boot == NO_VAL)
job_ptr->delay_boot = delay_boot;
else
job_ptr->delay_boot = job_desc->delay_boot;
if (job_desc->time_min != NO_VAL)
job_ptr->time_min = job_desc->time_min;
job_ptr->alloc_sid = job_desc->alloc_sid;
job_ptr->alloc_node = xstrdup(job_desc->alloc_node);
job_ptr->account = xstrdup(job_desc->account);
job_ptr->batch_features = xstrdup(job_desc->batch_features);
job_ptr->burst_buffer = xstrdup(job_desc->burst_buffer);
job_ptr->network = xstrdup(job_desc->network);
job_ptr->resv_name = xstrdup(job_desc->reservation);
job_ptr->restart_cnt = job_desc->restart_cnt;
job_ptr->comment = xstrdup(job_desc->comment);
job_ptr->extra = xstrdup(job_desc->extra);
job_ptr->container = xstrdup(job_desc->container);
job_ptr->container_id = xstrdup(job_desc->container_id);
job_ptr->admin_comment = xstrdup(job_desc->admin_comment);
if (job_desc->kill_on_node_fail != NO_VAL16)
job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail;
job_ptr->resp_host = xstrdup(job_desc->resp_host);
job_ptr->alloc_resp_port = job_desc->alloc_resp_port;
job_ptr->alloc_tls_cert = xstrdup(job_desc->alloc_tls_cert);
job_ptr->other_port = job_desc->other_port;
job_ptr->time_last_active = time(NULL);
job_ptr->derived_ec = 0;
job_ptr->licenses = xstrdup(job_desc->licenses_tot);
job_ptr->lic_req = xstrdup(job_desc->licenses);
job_ptr->mail_user = _get_mail_user(job_desc->mail_user,
job_ptr);
if (job_desc->mail_type &&
(job_desc->mail_type != NO_VAL16)) {
job_ptr->mail_type = job_desc->mail_type;
}
job_ptr->bit_flags = job_desc->bitflags;
job_ptr->bit_flags &= ~TASKS_CHANGED;
job_ptr->bit_flags &= ~BACKFILL_TEST;
job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
job_ptr->resv_port_cnt = job_desc->resv_port_cnt;
if (job_desc->resv_port_cnt != NO_VAL16) {
error_code = resv_port_check_job_request_cnt(job_ptr);
if (error_code)
return error_code;
}
job_ptr->spank_job_env = job_desc->spank_job_env;
job_ptr->spank_job_env_size = job_desc->spank_job_env_size;
job_desc->spank_job_env = (char **) NULL; /* nothing left to free */
job_desc->spank_job_env_size = 0; /* nothing left to free */
job_ptr->mcs_label = xstrdup(job_desc->mcs_label);
job_ptr->origin_cluster = xstrdup(job_desc->origin_cluster);
job_ptr->cpus_per_tres = xstrdup(job_desc->cpus_per_tres);
job_ptr->mem_per_tres = xstrdup(job_desc->mem_per_tres);
job_ptr->tres_bind = xstrdup(job_desc->tres_bind);
job_ptr->tres_freq = xstrdup(job_desc->tres_freq);
job_ptr->tres_per_job = xstrdup(job_desc->tres_per_job);
job_ptr->tres_per_node = xstrdup(job_desc->tres_per_node);
job_ptr->tres_per_socket = xstrdup(job_desc->tres_per_socket);
job_ptr->tres_per_task = xstrdup(job_desc->tres_per_task);
if (job_desc->wait_all_nodes == NO_VAL16)
job_ptr->wait_all_nodes = _default_wait_all_nodes(job_desc);
else
job_ptr->wait_all_nodes = job_desc->wait_all_nodes;
job_ptr->warn_flags = job_desc->warn_flags;
job_ptr->warn_signal = job_desc->warn_signal;
job_ptr->warn_time = job_desc->warn_time;
detail_ptr = job_ptr->details;
detail_ptr->argc = job_desc->argc;
detail_ptr->argv = job_desc->argv;
job_desc->argv = (char **) NULL; /* nothing left to free */
job_desc->argc = 0; /* nothing left to free */
detail_ptr->acctg_freq = xstrdup(job_desc->acctg_freq);
detail_ptr->cpu_bind_type = job_desc->cpu_bind_type;
detail_ptr->cpu_bind = xstrdup(job_desc->cpu_bind);
detail_ptr->cpu_freq_gov = job_desc->cpu_freq_gov;
detail_ptr->cpu_freq_max = job_desc->cpu_freq_max;
detail_ptr->cpu_freq_min = job_desc->cpu_freq_min;
detail_ptr->nice = job_desc->nice;
detail_ptr->open_mode = job_desc->open_mode;
detail_ptr->min_cpus = job_desc->min_cpus;
detail_ptr->orig_min_cpus = job_desc->min_cpus;
detail_ptr->max_cpus = job_desc->max_cpus;
detail_ptr->orig_max_cpus = job_desc->max_cpus;
detail_ptr->min_nodes = job_desc->min_nodes;
detail_ptr->max_nodes = job_desc->max_nodes;
detail_ptr->qos_req = xstrdup(job_desc->qos);
if (job_desc->job_size_str && detail_ptr->max_nodes) {
if (detail_ptr->max_nodes >= MAX_JOB_SIZE_BITMAP)
return ESLURM_INVALID_NODE_COUNT;
detail_ptr->job_size_bitmap =
bit_alloc(detail_ptr->max_nodes + 1);
if (bit_unfmt(detail_ptr->job_size_bitmap,
job_desc->job_size_str))
FREE_NULL_BITMAP(detail_ptr->job_size_bitmap);
} else {
error_code = _unroll_min_max_node(job_ptr);
if (error_code)
return error_code;
}
detail_ptr->req_context = xstrdup(job_desc->req_context);
detail_ptr->resv_req = xstrdup(job_desc->reservation);
detail_ptr->x11 = job_desc->x11;
detail_ptr->x11_magic_cookie = xstrdup(job_desc->x11_magic_cookie);
detail_ptr->x11_target = xstrdup(job_desc->x11_target);
detail_ptr->x11_target_port = job_desc->x11_target_port;
if (job_desc->req_nodes) {
if ((job_desc->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY) {
detail_ptr->req_nodes = xstrdup(job_desc->req_nodes);
if ((error_code =
job_record_calc_arbitrary_tpn(job_ptr)))
return error_code;
} else {
detail_ptr->req_nodes =
_copy_nodelist_no_dup(job_desc->req_nodes);
}
detail_ptr->req_node_bitmap = *req_bitmap;
		*req_bitmap = NULL; /* Reused, nothing left to free */
detail_ptr->exc_node_bitmap = *exc_bitmap;
}
if (job_desc->exc_nodes) {
detail_ptr->exc_nodes =
_copy_nodelist_no_dup(job_desc->exc_nodes);
detail_ptr->exc_node_bitmap = *exc_bitmap;
}
if (job_desc->exc_nodes || job_desc->req_nodes)
		*exc_bitmap = NULL; /* Reused, nothing left to free */
detail_ptr->features = xstrdup(job_desc->features);
detail_ptr->cluster_features = xstrdup(job_desc->cluster_features);
detail_ptr->prefer = xstrdup(job_desc->prefer);
if (job_desc->fed_siblings_viable) {
job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t));
job_ptr->fed_details->siblings_viable =
job_desc->fed_siblings_viable;
update_job_fed_details(job_ptr);
}
if (job_desc->shared == JOB_SHARED_NONE) {
detail_ptr->share_res = 0;
detail_ptr->whole_node = WHOLE_NODE_REQUIRED;
} else if (job_desc->shared == JOB_SHARED_OK) {
detail_ptr->share_res = 1;
detail_ptr->whole_node = 0;
} else if (job_desc->shared == JOB_SHARED_USER) {
detail_ptr->share_res = NO_VAL8;
detail_ptr->whole_node = WHOLE_NODE_USER;
} else if (job_desc->shared == JOB_SHARED_MCS) {
detail_ptr->share_res = NO_VAL8;
detail_ptr->whole_node = WHOLE_NODE_MCS;
} else if (job_desc->shared == JOB_SHARED_TOPO) {
detail_ptr->share_res = NO_VAL8;
detail_ptr->whole_node = WHOLE_TOPO;
} else {
detail_ptr->share_res = NO_VAL8;
detail_ptr->whole_node = 0;
}
if (job_desc->contiguous != NO_VAL16)
detail_ptr->contiguous = job_desc->contiguous;
if (slurm_conf.conf_flags & CONF_FLAG_ASRU)
detail_ptr->core_spec = job_desc->core_spec;
else
detail_ptr->core_spec = NO_VAL16;
if (detail_ptr->core_spec != NO_VAL16)
detail_ptr->whole_node = WHOLE_NODE_REQUIRED;
if (job_desc->task_dist != NO_VAL)
detail_ptr->task_dist = job_desc->task_dist;
if (job_desc->cpus_per_task == NO_VAL16) {
detail_ptr->cpus_per_task = 1;
detail_ptr->orig_cpus_per_task = NO_VAL16;
} else {
detail_ptr->cpus_per_task = MAX(job_desc->cpus_per_task, 1);
detail_ptr->orig_cpus_per_task = detail_ptr->cpus_per_task;
}
if (job_desc->pn_min_cpus != NO_VAL16)
detail_ptr->pn_min_cpus = job_desc->pn_min_cpus;
if (job_desc->overcommit != NO_VAL8)
detail_ptr->overcommit = job_desc->overcommit;
if (job_desc->num_tasks != NO_VAL)
detail_ptr->num_tasks = job_desc->num_tasks;
if (job_desc->ntasks_per_node != NO_VAL16) {
detail_ptr->ntasks_per_node = job_desc->ntasks_per_node;
if ((detail_ptr->overcommit == 0) &&
(detail_ptr->num_tasks > 1)) {
detail_ptr->pn_min_cpus =
MAX(detail_ptr->pn_min_cpus,
(detail_ptr->cpus_per_task *
detail_ptr->ntasks_per_node));
}
}
if (job_desc->ntasks_per_tres != NO_VAL16)
detail_ptr->ntasks_per_tres = job_desc->ntasks_per_tres;
detail_ptr->pn_min_cpus = MAX(detail_ptr->pn_min_cpus,
detail_ptr->cpus_per_task);
detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus;
if (job_desc->reboot != NO_VAL16)
job_ptr->reboot = MIN(job_desc->reboot, 1);
else
job_ptr->reboot = 0;
if (job_desc->requeue != NO_VAL16)
detail_ptr->requeue = MIN(job_desc->requeue, 1);
else
detail_ptr->requeue = slurm_conf.job_requeue;
if (job_desc->pn_min_memory != NO_VAL64)
detail_ptr->pn_min_memory = job_desc->pn_min_memory;
detail_ptr->orig_pn_min_memory = detail_ptr->pn_min_memory;
if (job_desc->pn_min_tmp_disk != NO_VAL)
detail_ptr->pn_min_tmp_disk = job_desc->pn_min_tmp_disk;
detail_ptr->oom_kill_step = job_desc->oom_kill_step;
detail_ptr->segment_size = job_desc->segment_size;
detail_ptr->std_err = xstrdup(job_desc->std_err);
detail_ptr->std_in = xstrdup(job_desc->std_in);
detail_ptr->std_out = xstrdup(job_desc->std_out);
detail_ptr->submit_line = xstrdup(job_desc->submit_line);
detail_ptr->work_dir = xstrdup(job_desc->work_dir);
if (job_desc->begin_time > time(NULL))
detail_ptr->begin_time = job_desc->begin_time;
job_ptr->clusters = xstrdup(job_desc->clusters);
/*
* The priority needs to be set after this since we don't have
* an association rec yet
*/
detail_ptr->mc_ptr = _set_multi_core_data(job_desc);
if ((job_ptr->bit_flags & SPREAD_JOB) && (detail_ptr->max_nodes == 0) &&
(detail_ptr->num_tasks != 0)) {
if (detail_ptr->min_nodes == 0)
detail_ptr->min_nodes = 1;
detail_ptr->max_nodes = MIN(active_node_record_count,
detail_ptr->num_tasks);
}
job_ptr->selinux_context = xstrdup(job_desc->selinux_context);
return SLURM_SUCCESS;
}
/*
* _copy_nodelist_no_dup - Take a node_list string and convert it to an
* expression without duplicate names. For example, we want to convert
 * a user's request for nodes "lx1,lx2,lx1,lx3" to "lx[1-3]"
* node_list IN - string describing a list of nodes
* RET a compact node expression, must be xfreed by the user
*/
static char *_copy_nodelist_no_dup(char *node_list)
{
char *buf;
hostlist_t *hl = hostlist_create(node_list);
if (hl == NULL)
return NULL;
hostlist_uniq(hl);
buf = hostlist_ranged_string_xmalloc(hl);
hostlist_destroy(hl);
return buf;
}
/* Return usable memory (real_memory minus mem_spec_limit) on the first
 * node in the identified partition */
static uint64_t _mem_per_node_part(part_record_t *part_ptr)
{
int node_inx = -1;
node_record_t *node_ptr;
if (!part_ptr)
return 0;
if (part_ptr->node_bitmap)
node_inx = bit_ffs(part_ptr->node_bitmap);
if (node_inx >= 0) {
node_ptr = node_record_table_ptr[node_inx];
return (node_ptr->config_ptr->real_memory -
node_ptr->mem_spec_limit);
}
return 0;
}
/*
* Test if this job exceeds any of MaxMemPer[CPU|Node] limits and potentially
* adjust mem / cpu ratios.
*
* NOTE: This function is also called with a dummy job_desc_msg_t from
 * job_limits_check(). If any new check is added here, you may also have
 * to add the corresponding parameter to the dummy job_desc_msg_t there.
*/
static bool _valid_pn_min_mem(job_desc_msg_t *job_desc_msg,
part_record_t *part_ptr)
{
uint64_t job_mem_limit = job_desc_msg->pn_min_memory;
uint64_t sys_mem_limit;
uint16_t cpus_per_node;
if (part_ptr && part_ptr->max_mem_per_cpu)
sys_mem_limit = part_ptr->max_mem_per_cpu;
else
sys_mem_limit = slurm_conf.max_mem_per_cpu;
if ((sys_mem_limit == 0) || (sys_mem_limit == MEM_PER_CPU))
return true;
if ((job_mem_limit & MEM_PER_CPU) && (sys_mem_limit & MEM_PER_CPU)) {
uint64_t mem_ratio;
job_mem_limit &= (~MEM_PER_CPU);
sys_mem_limit &= (~MEM_PER_CPU);
if (job_mem_limit <= sys_mem_limit)
return true;
mem_ratio = ROUNDUP(job_mem_limit, sys_mem_limit);
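		/*
		 * Worked example (hypothetical values): --mem-per-cpu=8000
		 * against MaxMemPerCPU=2000 gives mem_ratio =
		 * ROUNDUP(8000, 2000) = 4, so cpus_per_task is scaled up by
		 * 4 and pn_min_memory is reduced to 2000 MB per CPU below.
		 */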
debug("JobId=%u: increasing cpus_per_task and decreasing mem_per_cpu by factor of %"PRIu64" based upon mem_per_cpu limits",
job_desc_msg->job_id, mem_ratio);
if (job_desc_msg->cpus_per_task == NO_VAL16)
job_desc_msg->cpus_per_task = mem_ratio;
else
job_desc_msg->cpus_per_task *= mem_ratio;
		/* Update cpus in tres_per_task, but only if cpus was explicitly set */
if (job_desc_msg->bitflags & JOB_CPUS_SET)
slurm_option_update_tres_per_task(
job_desc_msg->cpus_per_task, "cpu",
&job_desc_msg->tres_per_task);
job_desc_msg->pn_min_memory =
ROUNDUP(job_mem_limit, mem_ratio) | MEM_PER_CPU;
if ((job_desc_msg->num_tasks != NO_VAL) &&
(job_desc_msg->num_tasks != 0) &&
(job_desc_msg->min_cpus != NO_VAL)) {
job_desc_msg->min_cpus =
job_desc_msg->num_tasks *
job_desc_msg->cpus_per_task;
if ((job_desc_msg->max_cpus != NO_VAL) &&
(job_desc_msg->max_cpus < job_desc_msg->min_cpus)) {
job_desc_msg->max_cpus = job_desc_msg->min_cpus;
}
} else {
job_desc_msg->pn_min_cpus = job_desc_msg->cpus_per_task;
}
return true;
}
if (job_mem_limit == 0)
job_mem_limit = _mem_per_node_part(part_ptr);
if (((job_mem_limit & MEM_PER_CPU) == 0) &&
((sys_mem_limit & MEM_PER_CPU) == 0)) {
if (job_mem_limit <= sys_mem_limit)
return true;
debug2("JobId=%u mem=%"PRIu64"M > MaxMemPerNode=%"PRIu64"M in partition %s",
job_desc_msg->job_id, job_mem_limit, sys_mem_limit,
(part_ptr && part_ptr->name) ? part_ptr->name : "N/A");
return false;
}
	/* Job and system have different memory limit forms (i.e. one is
	 * per-CPU and the other is per-node). Convert them both to per-node
	 * values for comparison. */
if (part_ptr && (!part_ptr->max_share || !job_desc_msg->shared)) {
/* Whole node allocation */
cpus_per_node = part_ptr->max_cpu_cnt;
} else {
if ((job_desc_msg->ntasks_per_node != NO_VAL16) &&
(job_desc_msg->ntasks_per_node != 0))
cpus_per_node = job_desc_msg->ntasks_per_node;
else
cpus_per_node = 1;
if ((job_desc_msg->num_tasks != NO_VAL) &&
(job_desc_msg->num_tasks != 0) &&
(job_desc_msg->max_nodes != NO_VAL) &&
(job_desc_msg->max_nodes != 0)) {
cpus_per_node = MAX(cpus_per_node,
ROUNDUP(job_desc_msg->num_tasks,
job_desc_msg->max_nodes));
}
if ((job_desc_msg->cpus_per_task != NO_VAL16) &&
(job_desc_msg->cpus_per_task != 0))
cpus_per_node *= job_desc_msg->cpus_per_task;
if ((job_desc_msg->pn_min_cpus != NO_VAL16) &&
(job_desc_msg->pn_min_cpus > cpus_per_node))
cpus_per_node = job_desc_msg->pn_min_cpus;
}
if (job_mem_limit & MEM_PER_CPU) {
/* Job has per-CPU memory limit, system has per-node limit */
job_mem_limit &= (~MEM_PER_CPU);
job_mem_limit *= cpus_per_node;
} else {
/* Job has per-node memory limit, system has per-CPU limit */
uint32_t min_cpus;
sys_mem_limit &= (~MEM_PER_CPU);
min_cpus = ROUNDUP(job_mem_limit, sys_mem_limit);
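		/*
		 * e.g. (hypothetical) --mem=10000 per node against
		 * MaxMemPerCPU=4000 requires at least
		 * ROUNDUP(10000, 4000) = 3 CPUs on each node.
		 */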
if ((job_desc_msg->pn_min_cpus == NO_VAL16) ||
(job_desc_msg->pn_min_cpus < min_cpus)) {
job_desc_msg->pn_min_cpus = min_cpus;
if (min_cpus > job_desc_msg->min_cpus) {
job_desc_msg->min_cpus = min_cpus;
job_desc_msg->max_cpus =
MAX(min_cpus, job_desc_msg->max_cpus);
}
cpus_per_node = MAX(cpus_per_node, min_cpus);
if (job_desc_msg->ntasks_per_node != NO_VAL16) {
job_desc_msg->cpus_per_task =
ROUNDUP(job_desc_msg->pn_min_cpus,
job_desc_msg->ntasks_per_node);
job_desc_msg->pn_min_cpus =
MAX(job_desc_msg->cpus_per_task *
job_desc_msg->ntasks_per_node,
job_desc_msg->pn_min_cpus);
} else if (job_desc_msg->num_tasks &&
(job_desc_msg->num_tasks != NO_VAL) &&
job_desc_msg->min_nodes &&
(job_desc_msg->min_nodes != NO_VAL)) {
/*
* Calculate a new value of cpus/task given the
* current nodes and tasks values:
* CPUs/Task = (min_cpus_per_node * min_nodes) / num_tasks
*/
uint32_t cpus =
min_cpus * job_desc_msg->min_nodes;
job_desc_msg->cpus_per_task =
ROUNDUP(cpus, job_desc_msg->num_tasks);
/*
* Recalculate pn_min_cpus based on the new
* CPUs/task. This formula aims to get
* an allocation with the least amount of
* CPUs combining all the nodes from the job.
*/
min_cpus = (job_desc_msg->cpus_per_task *
job_desc_msg->num_tasks) /
job_desc_msg->min_nodes;
job_desc_msg->pn_min_cpus = min_cpus;
job_desc_msg->min_cpus =
MAX(min_cpus,
job_desc_msg->pn_min_cpus);
} else if (!job_desc_msg->num_tasks) {
/*
				 * The job did not explicitly request a task
				 * count. Assume one task per node.
*/
job_desc_msg->cpus_per_task =
MAX(job_desc_msg->pn_min_cpus,
job_desc_msg->cpus_per_task);
}
debug("JobId=%u: Setting job's pn_min_cpus to %u due to memory limit",
job_desc_msg->job_id,
job_desc_msg->pn_min_cpus);
}
sys_mem_limit *= cpus_per_node;
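		/*
		 * e.g. (hypothetical) MaxMemPerCPU=4000 with 4 usable CPUs
		 * per node gives a 16000 MB per-node ceiling for the
		 * comparison below.
		 */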
}
if (job_mem_limit <= sys_mem_limit)
return true;
debug2("JobId=%u mem=%"PRIu64"M > MaxMemPer%s=%"PRIu64"M in partition:%s",
job_desc_msg->job_id, job_mem_limit,
(job_mem_limit & MEM_PER_CPU) ? "CPU" : "Node", sys_mem_limit,
(part_ptr && part_ptr->name) ? part_ptr->name : "N/A");
return false;
}
/*
* Increment time limit of one job record for node configuration.
*/
static void _job_time_limit_incr(job_record_t *job_ptr, uint32_t boot_job_id)
{
time_t delta_t, now = time(NULL);
delta_t = difftime(now, job_ptr->start_time);
if ((job_ptr->job_id != boot_job_id) && !IS_JOB_CONFIGURING(job_ptr))
job_ptr->tot_sus_time = delta_t;
if ((job_ptr->time_limit != INFINITE) &&
((job_ptr->job_id == boot_job_id) || (delta_t != 0))) {
if (delta_t && !IS_JOB_CONFIGURING(job_ptr)) {
verbose("Extending %pJ time limit by %u secs for configuration",
job_ptr, (uint32_t) delta_t);
}
job_ptr->end_time = now + (job_ptr->time_limit * 60);
job_ptr->end_time_exp = job_ptr->end_time;
}
}
static int _foreach_het_job_time_limit_incr(void *x, void *arg)
{
_job_time_limit_incr(x, *(uint32_t *)arg);
return 0;
}
/*
* Increment time limit for all components of a hetjob for node configuration.
* job_ptr IN - pointer to job record for which configuration is complete
* boot_job_id - job ID of record with newly powered up node or 0
*/
static void _het_job_time_limit_incr(job_record_t *job_ptr,
uint32_t boot_job_id)
{
job_record_t *het_job_leader;
if (!job_ptr->het_job_id) {
_job_time_limit_incr(job_ptr, boot_job_id);
return;
}
het_job_leader = find_job_record(job_ptr->het_job_id);
if (!het_job_leader) {
error("%s: Hetjob leader %pJ not found",
__func__, job_ptr);
_job_time_limit_incr(job_ptr, boot_job_id);
return;
}
if (!het_job_leader->het_job_list) {
error("%s: Hetjob leader %pJ job list is NULL",
__func__, job_ptr);
_job_time_limit_incr(job_ptr, boot_job_id);
return;
}
(void) list_for_each(het_job_leader->het_job_list,
_foreach_het_job_time_limit_incr,
&boot_job_id);
}
/* Clear job's CONFIGURING flag and advance end time as needed */
extern void job_config_fini(job_record_t *job_ptr)
{
time_t now = time(NULL);
last_job_update = now;
job_state_unset_flag(job_ptr, JOB_CONFIGURING);
if (IS_JOB_POWER_UP_NODE(job_ptr)) {
info("Resetting %pJ start time for node power up", job_ptr);
job_state_unset_flag(job_ptr, JOB_POWER_UP_NODE);
job_ptr->start_time = now;
_het_job_time_limit_incr(job_ptr, job_ptr->job_id);
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
} else {
_het_job_time_limit_incr(job_ptr, 0);
}
if (job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD"))
set_job_alias_list(job_ptr);
/*
* Request asynchronous launch of a prolog for a non-batch job.
* PROLOG_FLAG_CONTAIN also turns on PROLOG_FLAG_ALLOC.
*/
if (slurm_conf.prolog_flags & PROLOG_FLAG_ALLOC)
launch_prolog(job_ptr);
}
/*
 * Determine if the nodes are ready to run a job
* RET true if ready
*/
extern bool test_job_nodes_ready(job_record_t *job_ptr)
{
if (IS_JOB_PENDING(job_ptr))
return false;
if (!job_ptr->node_bitmap) /* Revoked allocation */
return true;
if (bit_overlap_any(job_ptr->node_bitmap, power_down_node_bitmap))
return false;
if (!job_ptr->batch_flag ||
job_ptr->batch_features ||
job_ptr->wait_all_nodes || job_ptr->burst_buffer) {
/* Make sure all nodes ready to start job */
if ((select_g_job_ready(job_ptr) & READY_NODE_STATE) == 0)
return false;
} else if (job_ptr->batch_flag) {
/* Make sure first node is ready to start batch job */
node_record_t *node_ptr =
find_node_record(job_ptr->batch_host);
if (!node_ptr ||
IS_NODE_POWERED_DOWN(node_ptr) ||
IS_NODE_POWERING_UP(node_ptr)) {
return false;
}
}
return true;
}
static int _foreach_het_job_configuring_test(void *x, void *arg)
{
job_record_t *het_job = x;
if (IS_JOB_CONFIGURING(het_job))
return 1;
return 0;
}
/*
* For non-hetjob, return true if this job is configuring.
* For hetjob, return true if any component of the job is configuring.
*/
static bool _het_job_configuring_test(job_record_t *job_ptr)
{
job_record_t *het_job_leader;
if (IS_JOB_CONFIGURING(job_ptr))
return true;
if (!job_ptr->het_job_id)
return false;
het_job_leader = find_job_record(job_ptr->het_job_id);
if (!het_job_leader) {
error("%s: Hetjob leader %pJ not found", __func__, job_ptr);
return false;
}
if (!het_job_leader->het_job_list) {
error("%s: Hetjob leader %pJ job list is NULL",
__func__, job_ptr);
return false;
}
return list_find_first(het_job_leader->het_job_list,
_foreach_het_job_configuring_test,
NULL);
}
/*
* job_time_limit - terminate jobs which have exceeded their time limit
 * global: job_list - pointer to the global job list
* last_job_update - time of last job table update
*/
void job_time_limit(void)
{
list_itr_t *job_iterator;
job_record_t *job_ptr;
time_t now = time(NULL);
time_t old = now - ((slurm_conf.inactive_limit * 4 / 3) +
slurm_conf.msg_timeout + 1);
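	/*
	 * e.g. (hypothetical) InactiveLimit=120 and MessageTimeout=10 treat
	 * jobs idle for more than (120 * 4 / 3) + 10 + 1 = 171 seconds as
	 * inactive; the padding allows for delayed status reports.
	 */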
time_t over_run;
uint16_t over_time_limit;
uint8_t prolog;
int job_test_count = 0;
uint32_t resv_over_run = slurm_conf.resv_over_run;
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
if (resv_over_run == INFINITE16)
resv_over_run = YEAR_SECONDS;
else
resv_over_run *= 60;
/*
	 * Locks are the same as in _slurmctld_background() (currently the
	 * only place this is called).
*/
slurmctld_lock_t job_write_lock = {
READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
DEF_TIMERS;
/*
* Not making this list_next loop a list_for_each. This loop unlocks the
	 * job_write lock if held too long, but that would not unlock the
	 * list's write lock inside a list_for_each. Until that can be
	 * handled, this must remain a list_next loop.
*/
job_iterator = list_iterator_create(job_list);
START_TIMER;
while ((job_ptr = list_next(job_iterator))) {
xassert (job_ptr->magic == JOB_MAGIC);
job_test_count++;
if (job_ptr->details)
prolog = job_ptr->details->prolog_running;
else
prolog = 0;
if ((prolog == 0) && IS_JOB_CONFIGURING(job_ptr) &&
test_job_nodes_ready(job_ptr)) {
info("%s: Configuration for %pJ complete",
__func__, job_ptr);
job_config_fini(job_ptr);
if (job_ptr->batch_flag)
launch_job(job_ptr);
}
/*
		 * Features have been changed on some node; make the job
		 * eligible to run and test to see if it can run now
*/
if (node_features_updated &&
(job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) &&
IS_JOB_PENDING(job_ptr) && (job_ptr->priority == 0)) {
job_ptr->state_reason = WAIT_NO_REASON;
set_job_prio(job_ptr);
last_job_update = now;
}
/* Don't enforce time limits for configuring hetjobs */
if (_het_job_configuring_test(job_ptr))
continue;
/*
* Only running jobs can be killed due to timeout. Do not kill
* suspended jobs due to timeout.
*/
if (!IS_JOB_RUNNING(job_ptr))
continue;
/*
* everything above here is considered "quick", and skips the
* timeout at the bottom of the loop by using a continue.
* everything below is considered "slow", and needs to jump to
* time_check before the next job is tested
*/
if (job_ptr->preempt_time) {
(void)slurm_job_preempt(job_ptr, NULL,
slurm_job_preempt_mode(job_ptr),
false);
goto time_check;
}
if (slurm_conf.inactive_limit && (job_ptr->batch_flag == 0) &&
(job_ptr->time_last_active <= old) &&
(job_ptr->other_port) &&
(job_ptr->part_ptr) &&
(!(job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY))) {
/* job inactive, kill it */
info("%s: inactivity time limit reached for %pJ",
__func__, job_ptr);
_job_timed_out(job_ptr, false);
job_ptr->state_reason = FAIL_INACTIVE_LIMIT;
xfree(job_ptr->state_desc);
goto time_check;
}
if (job_ptr->time_limit != INFINITE) {
send_job_warn_signal(job_ptr, false);
if ((job_ptr->mail_type & MAIL_JOB_TIME100) &&
(now >= job_ptr->end_time)) {
job_ptr->mail_type &= (~MAIL_JOB_TIME100);
mail_job_info(job_ptr, MAIL_JOB_TIME100);
}
if ((job_ptr->mail_type & MAIL_JOB_TIME90) &&
(now + (job_ptr->time_limit * 60 * 0.1) >=
job_ptr->end_time)) {
job_ptr->mail_type &= (~MAIL_JOB_TIME90);
mail_job_info(job_ptr, MAIL_JOB_TIME90);
}
if ((job_ptr->mail_type & MAIL_JOB_TIME80) &&
(now + (job_ptr->time_limit * 60 * 0.2) >=
job_ptr->end_time)) {
job_ptr->mail_type &= (~MAIL_JOB_TIME80);
mail_job_info(job_ptr, MAIL_JOB_TIME80);
}
if ((job_ptr->mail_type & MAIL_JOB_TIME50) &&
(now + (job_ptr->time_limit * 60 * 0.5) >=
job_ptr->end_time)) {
job_ptr->mail_type &= (~MAIL_JOB_TIME50);
mail_job_info(job_ptr, MAIL_JOB_TIME50);
}
if (job_ptr->part_ptr &&
(job_ptr->part_ptr->over_time_limit != NO_VAL16)) {
over_time_limit =
job_ptr->part_ptr->over_time_limit;
} else {
over_time_limit = slurm_conf.over_time_limit;
}
if (over_time_limit == INFINITE16)
over_run = now - YEAR_SECONDS;
else
over_run = now - (over_time_limit * 60);
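			/*
			 * e.g. (hypothetical) OverTimeLimit=10 lets a job run
			 * 10 minutes past its end_time before it is treated
			 * as timed out below.
			 */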
if (job_ptr->end_time <= over_run) {
last_job_update = now;
info("Time limit exhausted for %pJ", job_ptr);
_job_timed_out(job_ptr, false);
job_ptr->state_reason = FAIL_TIMEOUT;
xfree(job_ptr->state_desc);
goto time_check;
}
}
if (job_ptr->resv_ptr &&
!(job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) &&
(job_ptr->resv_ptr->end_time + resv_over_run) < time(NULL)){
last_job_update = now;
info("Reservation ended for %pJ", job_ptr);
xfree(job_ptr->state_desc);
xstrfmtcat(job_ptr->state_desc, "Reservation %s, which this job was running under, has ended",
job_ptr->resv_ptr->name);
_job_timed_out(job_ptr, false);
job_ptr->state_reason = FAIL_TIMEOUT;
goto time_check;
}
/*
* check if any individual job steps have exceeded
* their time limit
*/
list_for_each(job_ptr->step_list, check_job_step_time_limit,
&now);
acct_policy_job_time_out(job_ptr);
if (job_ptr->state_reason == FAIL_TIMEOUT) {
last_job_update = now;
_job_timed_out(job_ptr, false);
xfree(job_ptr->state_desc);
goto time_check;
}
/* Give srun command warning message about pending timeout */
if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2))
srun_timeout (job_ptr);
/*
* _job_timed_out() and other calls can take a long time on
* some platforms. This loop is holding the job_write lock;
* if a lot of jobs need to be timed out within the same cycle
* this stalls other threads from running and causes
* communication issues within the cluster.
*
* This test happens last, as job_ptr may be pointing to a job
* that would be deleted by a separate thread when the job_write
* lock is released. However, list_next itself is thread safe,
* and can be used again once the locks are reacquired.
* list_peek_next is used in the unlikely event the timer has
* expired just as the end of the job_list is reached.
*/
time_check:
/* Use a hard-coded 3 second timeout, with a 1 second sleep. */
if (slurm_delta_tv(&tv1) >= 3000000 &&
list_peek_next(job_iterator)) {
END_TIMER;
debug("%s: yielding locks after testing %d jobs, %s",
__func__, job_test_count, TIME_STR);
unlock_slurmctld(job_write_lock);
usleep(1000000);
lock_slurmctld(job_write_lock);
START_TIMER;
job_test_count = 0;
}
}
list_iterator_destroy(job_iterator);
node_features_updated = false;
}
extern void job_set_req_tres(job_record_t *job_ptr, bool assoc_mgr_locked)
{
uint32_t cpu_cnt = 0, node_cnt = 0;
uint64_t mem_cnt = 0;
uint16_t sockets_per_node;
uint32_t num_tasks = 1; /* Default to 1 if it's not set */
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
xfree(job_ptr->tres_req_str);
xfree(job_ptr->tres_fmt_req_str);
xfree(job_ptr->tres_req_cnt);
if (!assoc_mgr_locked)
assoc_mgr_lock(&locks);
job_ptr->tres_req_cnt = xcalloc(g_tres_count, sizeof(uint64_t));
if (job_ptr->details) {
node_cnt = job_ptr->details->min_nodes;
cpu_cnt = job_ptr->details->min_cpus;
if (job_ptr->details->pn_min_memory)
mem_cnt = job_ptr->details->pn_min_memory;
num_tasks = job_ptr->details->num_tasks;
}
/* if this is set just override */
if (job_ptr->total_cpus)
cpu_cnt = job_ptr->total_cpus;
if (job_ptr->node_cnt)
node_cnt = job_ptr->node_cnt;
job_ptr->tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)node_cnt;
job_ptr->tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)cpu_cnt;
sockets_per_node = job_get_sockets_per_node(job_ptr);
job_ptr->tres_req_cnt[TRES_ARRAY_MEM] =
job_get_tres_mem(job_ptr->job_resrcs,
mem_cnt, cpu_cnt,
node_cnt,
job_ptr->part_ptr,
job_ptr->gres_list_req,
(job_ptr->bit_flags & JOB_MEM_SET),
sockets_per_node,
num_tasks);
license_set_job_tres_cnt(job_ptr->license_list,
job_ptr->tres_req_cnt,
true);
/* FIXME: this assumes that all nodes have equal TRES */
gres_stepmgr_set_job_tres_cnt(
job_ptr->gres_list_req,
node_cnt,
job_ptr->tres_req_cnt,
true);
bb_g_job_set_tres_cnt(job_ptr,
job_ptr->tres_req_cnt,
true);
/*
* Do this last as it calculates off of everything else.
	 * Don't use calc_job_billable_tres() as it relies on allocated TRES.
	 * If the partition was destroyed the part_ptr will be NULL, as this
	 * could be run on already-finished jobs held in the assoc mgr
	 * cache.
*/
if (job_ptr->part_ptr)
job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] =
assoc_mgr_tres_weighted(
job_ptr->tres_req_cnt,
job_ptr->part_ptr->billing_weights,
slurm_conf.priority_flags, true);
/* now that the array is filled lets make the string from it */
set_job_tres_req_str(job_ptr, true);
if (!assoc_mgr_locked)
assoc_mgr_unlock(&locks);
}
extern void job_set_alloc_tres(job_record_t *job_ptr, bool assoc_mgr_locked)
{
uint32_t alloc_nodes = 0;
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
xfree(job_ptr->tres_alloc_str);
xfree(job_ptr->tres_alloc_cnt);
xfree(job_ptr->tres_fmt_alloc_str);
/*
* We only need to do this on non-pending jobs.
* Requeued jobs are marked as PENDING|COMPLETING until the epilog is
* finished so we still need the alloc tres until then.
*/
if (IS_JOB_PENDING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
return;
if (!assoc_mgr_locked)
assoc_mgr_lock(&locks);
job_ptr->tres_alloc_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t));
job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU] = (uint64_t)job_ptr->total_cpus;
alloc_nodes = job_ptr->node_cnt;
job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE] = (uint64_t)alloc_nodes;
job_ptr->tres_alloc_cnt[TRES_ARRAY_MEM] =
job_get_tres_mem(job_ptr->job_resrcs,
job_ptr->details->pn_min_memory,
job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU],
job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE],
job_ptr->part_ptr,
job_ptr->gres_list_req,
job_ptr->bit_flags & JOB_MEM_SET,
job_get_sockets_per_node(job_ptr),
job_ptr->details->num_tasks);
job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] = NO_VAL64;
license_set_job_tres_cnt(job_ptr->license_list,
job_ptr->tres_alloc_cnt,
true);
gres_stepmgr_set_job_tres_cnt(
job_ptr->gres_list_alloc,
alloc_nodes,
job_ptr->tres_alloc_cnt,
true);
bb_g_job_set_tres_cnt(job_ptr,
job_ptr->tres_alloc_cnt,
true);
/* Do this last as it calculates off of everything else. */
job_ptr->tres_alloc_cnt[TRES_ARRAY_BILLING] =
calc_job_billable_tres(job_ptr, job_ptr->start_time, true);
/* now that the array is filled lets make the string from it */
assoc_mgr_set_job_tres_alloc_str(job_ptr, true);
if (!assoc_mgr_locked)
assoc_mgr_unlock(&locks);
}
/*
* job_update_tres_cnt - when job is completing remove allocated tres
* from count.
* IN/OUT job_ptr - job structure to be updated
* IN node_inx - node bit that is finished with job.
 * RET SLURM_SUCCESS on success, SLURM_ERROR on cpu_cnt underflow
*/
extern int job_update_tres_cnt(job_record_t *job_ptr, int node_inx)
{
int cpu_cnt, offset = -1, rc = SLURM_SUCCESS;
xassert(job_ptr);
if (job_ptr->details->whole_node & WHOLE_NODE_REQUIRED) {
/*
* Since we are allocating whole nodes don't rely on
* the job_resrcs since it could be less because the
		 * node could have only used 1 thread per core.
*/
node_record_t *node_ptr =
node_record_table_ptr[node_inx];
cpu_cnt = node_ptr->config_ptr->cpus;
} else {
if ((offset = job_resources_node_inx_to_cpu_inx(
job_ptr->job_resrcs, node_inx)) < 0) {
error("%s: problem getting offset of %pJ",
__func__, job_ptr);
job_ptr->cpu_cnt = 0;
return SLURM_ERROR;
}
cpu_cnt = job_ptr->job_resrcs->cpus[offset];
}
if (cpu_cnt > job_ptr->cpu_cnt) {
error("%s: cpu_cnt underflow (%d > %u) on %pJ", __func__,
cpu_cnt, job_ptr->cpu_cnt, job_ptr);
job_ptr->cpu_cnt = 0;
rc = SLURM_ERROR;
} else
job_ptr->cpu_cnt -= cpu_cnt;
if (IS_JOB_RESIZING(job_ptr)) {
if (cpu_cnt > job_ptr->total_cpus) {
error("%s: total_cpus underflow on %pJ",
__func__, job_ptr);
job_ptr->total_cpus = 0;
rc = SLURM_ERROR;
} else
job_ptr->total_cpus -= cpu_cnt;
job_set_alloc_tres(job_ptr, false);
}
return rc;
}
/* Terminate a job that has exhausted its time limit */
static void _job_timed_out(job_record_t *job_ptr, bool preempted)
{
xassert(job_ptr);
srun_timeout(job_ptr);
if (job_ptr->details) {
time_t now = time(NULL);
job_ptr->end_time = now;
job_ptr->time_last_active = now;
if (!job_ptr->preempt_time)
job_state_set(job_ptr, (JOB_TIMEOUT | JOB_COMPLETING));
build_cg_bitmap(job_ptr);
job_completion_logger(job_ptr, false);
deallocate_nodes(job_ptr, !preempted, false, preempted);
} else
job_signal(job_ptr, SIGKILL, 0, 0, false);
}
/* _validate_job_desc - validate that a job descriptor for job submit or
* allocate has valid data, set values to defaults as required
* IN/OUT job_desc_msg - pointer to job descriptor, modified as needed
* IN allocate - if clear job to be queued, if set allocate for user now
 * IN submit_uid - uid from which the request originated
*/
static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate,
bool cron, uid_t submit_uid,
part_record_t *part_ptr, list_t *part_list)
{
if ((job_desc_msg->min_cpus == NO_VAL) &&
(job_desc_msg->min_nodes == NO_VAL) &&
(job_desc_msg->req_nodes == NULL)) {
info("%s: job specified no min_cpus, min_nodes or req_nodes",
__func__);
return ESLURM_JOB_MISSING_SIZE_SPECIFICATION;
}
if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) &&
(job_desc_msg->script == NULL) &&
!(job_desc_msg->bitflags & EXTERNAL_JOB)) {
info("%s: job failed to specify Script", __func__);
return ESLURM_JOB_SCRIPT_MISSING;
}
if (job_desc_msg->script && job_desc_msg->x11) {
info("%s: batch job cannot use X11 forwarding", __func__);
return ESLURM_X11_NOT_AVAIL;
}
if (job_desc_msg->user_id == NO_VAL) {
info("%s: job failed to specify User", __func__);
return ESLURM_USER_ID_MISSING;
}
	if (job_desc_msg->group_id == NO_VAL) {
debug("%s: job failed to specify group", __func__);
return ESLURM_GROUP_ID_MISSING;
}
if (!job_desc_msg->container_id && !job_desc_msg->container &&
(!job_desc_msg->work_dir || !job_desc_msg->work_dir[0])) {
debug("%s: job working directory has to be set", __func__);
return ESLURM_MISSING_WORK_DIR;
}
if ((job_desc_msg->warn_flags & KILL_JOB_RESV) &&
(slurm_conf.preempt_mode == PREEMPT_MODE_OFF)) {
debug("%s: job specified \"R:\" option of --signal, which is incompatible with PreemptMode=OFF",
__func__);
return ESLURM_PREEMPTION_REQUIRED;
}
if (job_desc_msg->contiguous == NO_VAL16)
job_desc_msg->contiguous = 0;
if (job_desc_msg->task_dist == NO_VAL) {
/* not typically set by salloc or sbatch */
job_desc_msg->task_dist = SLURM_DIST_CYCLIC;
}
if (job_desc_msg->plane_size == NO_VAL16)
job_desc_msg->plane_size = 0;
if (job_desc_msg->segment_size == NO_VAL16)
job_desc_msg->segment_size = 0;
if (job_desc_msg->kill_on_node_fail == NO_VAL16)
job_desc_msg->kill_on_node_fail = 1;
if (job_desc_msg->job_id != NO_VAL) {
job_record_t *dup_job_ptr;
if (!fed_mgr_fed_rec &&
(submit_uid != 0) &&
(submit_uid != slurm_conf.slurm_user_id)) {
info("attempt by uid %u to set JobId=%u",
submit_uid, job_desc_msg->job_id);
return ESLURM_INVALID_JOB_ID;
}
if (job_desc_msg->job_id == 0) {
info("attempt by uid %u to set JobId=0",
submit_uid);
return ESLURM_INVALID_JOB_ID;
}
dup_job_ptr = find_job_record(job_desc_msg->job_id);
if (dup_job_ptr) {
info("attempt to reuse active %pJ", dup_job_ptr);
return ESLURM_DUPLICATE_JOB_ID;
}
}
if (job_desc_msg->nice == NO_VAL)
job_desc_msg->nice = NICE_OFFSET;
if (job_desc_msg->pn_min_memory == NO_VAL64)
job_desc_msg->pn_min_memory = _get_def_mem(part_ptr, NULL);
else if (!_validate_min_mem_partition(job_desc_msg, part_ptr,
part_list)) {
return ESLURM_INVALID_TASK_MEMORY;
} else {
/* Memory limit explicitly set by user */
job_desc_msg->bitflags |= JOB_MEM_SET;
}
job_desc_msg->bitflags &= ~BACKFILL_TEST;
job_desc_msg->bitflags &= ~BF_WHOLE_NODE_TEST;
job_desc_msg->bitflags &= ~JOB_ACCRUE_OVER;
job_desc_msg->bitflags &= ~JOB_KILL_HURRY;
job_desc_msg->bitflags &= ~SIB_JOB_FLUSH;
job_desc_msg->bitflags &= ~TRES_STR_CALC;
job_desc_msg->bitflags &= ~JOB_WAS_RUNNING;
if (!cron)
job_desc_msg->bitflags &= ~CRON_JOB;
if (job_desc_msg->pn_min_memory == MEM_PER_CPU) {
/* Map --mem-per-cpu=0 to --mem=0 for simpler logic */
job_desc_msg->pn_min_memory = 0;
}
/* Validate a job's accounting frequency, if specified */
if (acct_gather_check_acct_freq_task(
job_desc_msg->pn_min_memory, job_desc_msg->acctg_freq))
return ESLURMD_INVALID_ACCT_FREQ;
if (job_desc_msg->min_nodes == NO_VAL)
job_desc_msg->min_nodes = 1; /* default node count of 1 */
if (job_desc_msg->min_cpus == NO_VAL)
job_desc_msg->min_cpus = job_desc_msg->min_nodes;
if ((job_desc_msg->pn_min_cpus == NO_VAL16) ||
(job_desc_msg->pn_min_cpus == 0))
job_desc_msg->pn_min_cpus = 1; /* default 1 cpu per node */
if (job_desc_msg->pn_min_tmp_disk == NO_VAL)
job_desc_msg->pn_min_tmp_disk = 0;/* default 0MB disk per node */
return SLURM_SUCCESS;
}
static int _foreach_valid_pn_min_mem(void *x, void *arg)
{
part_record_t *part_ptr = x;
foreach_valid_pn_min_mem_t *foreach_valid_pn_min_mem = arg;
job_desc_msg_t *job_desc_msg = foreach_valid_pn_min_mem->job_desc;
foreach_valid_pn_min_mem->rc =
_valid_pn_min_mem(job_desc_msg, part_ptr);
/* for ALL we have to test them all */
if (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ALL) {
if (!foreach_valid_pn_min_mem->rc)
return -1;
} else if (foreach_valid_pn_min_mem->rc) /* break, we found one! */
return -1;
else if (slurm_conf.enforce_part_limits == PARTITION_ENFORCE_ANY) {
debug("%s: Job requested for (%"PRIu64")MB is invalid for partition %s",
__func__, job_desc_msg->pn_min_memory,
part_ptr->name);
}
job_desc_msg->pn_min_memory = foreach_valid_pn_min_mem->pn_min_memory;
job_desc_msg->cpus_per_task = foreach_valid_pn_min_mem->cpus_per_task;
job_desc_msg->min_cpus = foreach_valid_pn_min_mem->min_cpus;
job_desc_msg->max_cpus = foreach_valid_pn_min_mem->max_cpus;
job_desc_msg->pn_min_cpus = foreach_valid_pn_min_mem->pn_min_cpus;
return 0;
}
/*
* Traverse the list of partitions and invoke the
* function validating the job memory specification.
*/
static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg,
part_record_t *part_ptr,
list_t *part_list)
{
uint64_t tmp_pn_min_memory;
uint16_t tmp_cpus_per_task;
uint32_t tmp_min_cpus;
uint32_t tmp_max_cpus;
uint32_t tmp_pn_min_cpus;
bool cc = false;
/* no reason to check them here as we aren't enforcing them */
if (!slurm_conf.enforce_part_limits)
return true;
tmp_pn_min_memory = job_desc_msg->pn_min_memory;
tmp_cpus_per_task = job_desc_msg->cpus_per_task;
tmp_min_cpus = job_desc_msg->min_cpus;
tmp_max_cpus = job_desc_msg->max_cpus;
tmp_pn_min_cpus = job_desc_msg->pn_min_cpus;
if (part_list == NULL) {
cc = _valid_pn_min_mem(job_desc_msg, part_ptr);
} else {
foreach_valid_pn_min_mem_t foreach_valid_pn_min_mem = {
.cpus_per_task = tmp_cpus_per_task,
.job_desc = job_desc_msg,
.max_cpus = tmp_max_cpus,
.min_cpus = tmp_min_cpus,
.pn_min_cpus = tmp_pn_min_cpus,
.pn_min_memory = tmp_pn_min_memory,
};
(void) list_for_each(part_list, _foreach_valid_pn_min_mem,
&foreach_valid_pn_min_mem);
cc = foreach_valid_pn_min_mem.rc;
}
/*
	 * Restore the original values; if necessary,
	 * they will be modified in job_limits_check()
*/
job_desc_msg->pn_min_memory = tmp_pn_min_memory;
job_desc_msg->cpus_per_task = tmp_cpus_per_task;
job_desc_msg->min_cpus = tmp_min_cpus;
job_desc_msg->max_cpus = tmp_max_cpus;
job_desc_msg->pn_min_cpus = tmp_pn_min_cpus;
return cc;
}
static void _delete_job_common(job_record_t *job_ptr)
{
if (!job_ptr->job_id)
return;
/* Remove record from fed_job_list */
fed_mgr_remove_fed_job_info(job_ptr->job_id);
/* Remove the record from job hash table */
_remove_job_hash(job_ptr, JOB_HASH_JOB);
/* Remove the record from job array hash tables, if applicable */
if (job_ptr->array_task_id != NO_VAL) {
_remove_job_hash(job_ptr, JOB_HASH_ARRAY_JOB);
_remove_job_hash(job_ptr, JOB_HASH_ARRAY_TASK);
}
}
/*
* Remove the job record from hash tables and append to purge_jobs_list.
*/
static void _move_to_purge_jobs_list(void *job_entry)
{
job_record_t *job_ptr = job_entry;
int job_array_size;
if (!job_entry)
return;
xassert(job_ptr->magic == JOB_MAGIC);
_delete_job_common(job_ptr);
if (job_ptr->array_recs) {
job_array_size = MAX(1, job_ptr->array_recs->task_cnt);
} else if (!job_ptr->job_id) { /* reservation */
job_array_size = 0;
} else {
job_array_size = 1;
}
if (job_array_size > job_count) {
error("job_count underflow");
job_count = 0;
} else {
job_count -= job_array_size;
}
list_append(purge_jobs_list, job_ptr);
}
/*
* find specific job_id entry in the job list, key is job_id_ptr
*/
static int _list_find_job_id(void *job_entry, void *key)
{
job_record_t *job_ptr = (job_record_t *) job_entry;
uint32_t *job_id_ptr = (uint32_t *) key;
if (job_ptr->job_id == *job_id_ptr)
return 1;
return 0;
}
/*
* _list_find_job_old - find old entries in the job list,
* see common/list.h for documentation, key is ignored
* job_entry IN - job pointer
* key IN - if not NULL, then skip hetjobs
*/
static int _list_find_job_old(void *job_entry, void *key)
{
time_t kill_age, min_age, now = time(NULL);
job_record_t *job_ptr = (job_record_t *) job_entry;
if ((job_ptr->job_id == NO_VAL) && IS_JOB_REVOKED(job_ptr))
return 1;
if (job_ptr->het_job_id && (job_ptr->bit_flags & HETJOB_PURGE))
return 1;
if (IS_JOB_COMPLETING(job_ptr) && !LOTS_OF_AGENTS) {
kill_age = now - (slurm_conf.kill_wait +
2 * slurm_conf.msg_timeout);
if (job_ptr->time_last_active < kill_age) {
job_ptr->time_last_active = now;
re_kill_job(job_ptr);
}
return 0; /* Job still completing */
}
if (job_ptr->epilog_running)
return 0; /* EpilogSlurmctld still running */
if (slurm_conf.min_job_age == 0)
return 0; /* No job record purging */
if (fed_mgr_fed_rec && job_ptr->fed_details &&
!fed_mgr_is_origin_job(job_ptr)) {
uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id);
slurmdb_cluster_rec_t *origin =
fed_mgr_get_cluster_by_id(origin_id);
/* keep job around until origin comes back and is synced */
if (origin &&
(!origin->fed.send ||
!((persist_conn_t *) origin->fed.send)->tls_conn ||
!origin->fed.sync_sent))
return 0;
}
min_age = now - slurm_conf.min_job_age;
if (job_ptr->end_time > min_age)
return 0; /* Too new to purge */
if (!(IS_JOB_COMPLETED(job_ptr)))
return 0; /* Job still active */
if (job_ptr->step_list && list_count(job_ptr->step_list)) {
debug("%pJ still has %d active steps",
job_ptr, list_count(job_ptr->step_list));
/*
 * If the job has been around more than 30 days the steps are
 * bogus. Blow the job away. This was witnessed <= 16.05 but
 * hasn't been seen since. This is here just to clear them out if
 * this ever shows up again.
 */
min_age = now - PURGE_OLD_JOB_IN_SEC;
if (job_ptr->end_time <= min_age) {
info("Force purge of %pJ. It ended over 30 days ago, the slurmctld thinks there are still steps running but they are most likely bogus. In any case you might want to check nodes %s to make sure nothing remains of the job.",
job_ptr, job_ptr->nodes);
goto end_it;
} else
return 0; /* steps are still active */
}
if (job_ptr->array_recs) {
if (job_ptr->array_recs->tot_run_tasks ||
!_test_job_array_purged(job_ptr->array_job_id)) {
/* Some tasks from this job array still active */
return 0;
}
}
if (bb_g_job_test_stage_out(job_ptr) != 1)
return 0; /* Stage out in progress */
end_it:
return 1; /* Purge the job */
}
static int _foreach_is_part_visible(void *x, void *arg)
{
part_record_t *part_ptr = x;
part_record_t **visible_parts = arg;
for (int i = 0; visible_parts[i]; i++) {
if (visible_parts[i] == part_ptr) {
return -1;
}
}
return 0;
}
/* Determine if ALL partitions associated with a job are hidden */
static bool _all_parts_hidden(job_record_t *job_ptr,
part_record_t **visible_parts)
{
if (job_ptr->part_ptr_list) {
if (list_find_first(part_list, _foreach_is_part_visible,
visible_parts))
return false;
return true;
}
if (job_ptr->part_ptr) {
if (_foreach_is_part_visible(job_ptr->part_ptr, visible_parts))
return false;
}
return true;
}
/* Determine if a given job should be seen by a specific user */
static bool _hide_job_user_rec(job_record_t *job_ptr, slurmdb_user_rec_t *user,
uint16_t show_flags)
{
if (!job_ptr)
return true;
if ((slurm_conf.private_data & PRIVATE_DATA_JOBS) &&
(job_ptr->user_id != user->uid) &&
(((slurm_mcs_get_privatedata() == 0) &&
!assoc_mgr_is_user_acct_coord_user_rec(user, job_ptr->account)) ||
((slurm_mcs_get_privatedata() == 1) &&
(mcs_g_check_mcs_label(user->uid, job_ptr->mcs_label,
true) != 0))))
return true;
return false;
}
static int _pack_job(void *object, void *arg)
{
job_record_t *job_ptr = (job_record_t *)object;
_foreach_pack_job_info_t *pack_info = (_foreach_pack_job_info_t *)arg;
xassert (job_ptr->magic == JOB_MAGIC);
if ((pack_info->filter_uid != NO_VAL) &&
(pack_info->filter_uid != job_ptr->user_id))
return SLURM_SUCCESS;
if (!(pack_info->show_flags & SHOW_ALL) && IS_JOB_REVOKED(job_ptr))
return SLURM_SUCCESS;
if (!pack_info->privileged) {
if (((pack_info->show_flags & SHOW_ALL) == 0) &&
_all_parts_hidden(job_ptr, pack_info->visible_parts))
return SLURM_SUCCESS;
if (_hide_job_user_rec(job_ptr, &pack_info->user_rec,
pack_info->show_flags))
return SLURM_SUCCESS;
}
pack_job(job_ptr, pack_info->show_flags, pack_info->buffer,
pack_info->protocol_version, pack_info->uid,
pack_info->has_qos_lock);
pack_info->jobs_packed++;
return SLURM_SUCCESS;
}
static int _foreach_pack_het_job(void *x, void *arg)
{
job_record_t *het_job_ptr = x;
_foreach_pack_job_info_t *pack_info = arg;
xassert(pack_info->het_leader);
xassert(verify_assoc_lock(QOS_LOCK, READ_LOCK));
if (het_job_ptr->het_job_id != pack_info->het_leader->het_job_id) {
error("%s: Bad het_job_list for %pJ", __func__,
pack_info->het_leader);
return 0;
}
pack_job(het_job_ptr, pack_info->show_flags, pack_info->buffer,
pack_info->protocol_version, pack_info->uid,
pack_info->has_qos_lock);
pack_info->jobs_packed++;
return 0;
}
static int _foreach_pack_jobid(void *object, void *arg)
{
job_record_t *job_ptr;
uint32_t job_id = *(uint32_t *)object;
_foreach_pack_job_info_t *info = (_foreach_pack_job_info_t *)arg;
if (!(job_ptr = find_job_record(job_id)))
return SLURM_SUCCESS;
return _pack_job(job_ptr, info);
}
/*
* _pack_init_job_info - create buffer with header packed for a job_info_msg_t
*
* NOTE: change _unpack_job_info_msg() in common/slurm_protocol_pack.c
* whenever the data format changes
*/
static buf_t *_pack_init_job_info(uint16_t protocol_version)
{
buf_t *buffer = init_buf(BUF_SIZE);
/* write message body header: size and time */
/* put in a placeholder job record count of 0 for now */
if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
pack32(0, buffer);
pack_time(time(NULL), buffer);
pack_time(slurmctld_diag_stats.bf_when_last_cycle, buffer);
}
return buffer;
}
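/*
 * The record count packed above is only a placeholder. Once the real
 * number of packed jobs is known, callers back-patch it with the
 * save/seek/restore idiom used by pack_all_jobs() and pack_spec_jobs():
 *
 *	uint32_t off = get_buf_offset(buffer);	// remember current end
 *	set_buf_offset(buffer, 0);		// rewind to the header
 *	pack32(jobs_packed, buffer);		// overwrite the placeholder
 *	set_buf_offset(buffer, off);		// restore the end position
 */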
/*
* pack_all_jobs - dump all job information for all jobs in
* machine independent form (for network transmission)
* IN show_flags - job filtering options
* IN uid - uid of user making request (for partition filtering)
* IN filter_uid - pack only jobs belonging to this user if not NO_VAL
* OUT buffer
* global: job_list - global list of job records
 * NOTE: the returned buffer must be freed by the caller
*/
extern buf_t *pack_all_jobs(uint16_t show_flags, uid_t uid, uint32_t filter_uid,
uint16_t protocol_version)
{
uint32_t tmp_offset;
_foreach_pack_job_info_t pack_info = {
.buffer = _pack_init_job_info(protocol_version),
.filter_uid = filter_uid,
.jobs_packed = 0,
.protocol_version = protocol_version,
.show_flags = show_flags,
.uid = uid,
.has_qos_lock = true,
.user_rec.uid = uid,
};
assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .user = READ_LOCK,
.qos = READ_LOCK };
assoc_mgr_lock(&locks);
assoc_mgr_fill_in_user(acct_db_conn, &pack_info.user_rec,
accounting_enforce, NULL, true);
pack_info.privileged = validate_operator_user_rec(&pack_info.user_rec);
pack_info.visible_parts = build_visible_parts(
uid, (pack_info.privileged || (show_flags & SHOW_ALL)));
list_for_each_ro(job_list, _pack_job, &pack_info);
assoc_mgr_unlock(&locks);
/* put the real record count in the message body header */
tmp_offset = get_buf_offset(pack_info.buffer);
set_buf_offset(pack_info.buffer, 0);
pack32(pack_info.jobs_packed, pack_info.buffer);
set_buf_offset(pack_info.buffer, tmp_offset);
xfree(pack_info.visible_parts);
return pack_info.buffer;
}
/*
* pack_spec_jobs - dump job information for specified jobs in
* machine independent form (for network transmission)
* IN show_flags - job filtering options
* IN job_ids - list of job_ids to pack
* IN uid - uid of user making request (for partition filtering)
* IN filter_uid - pack only jobs belonging to this user if not NO_VAL
* OUT buffer
* global: job_list - global list of job records
 * NOTE: the returned buffer must be freed by the caller
*/
extern buf_t *pack_spec_jobs(list_t *job_ids, uint16_t show_flags, uid_t uid,
uint32_t filter_uid, uint16_t protocol_version)
{
uint32_t tmp_offset;
_foreach_pack_job_info_t pack_info = {
.buffer = _pack_init_job_info(protocol_version),
.filter_uid = filter_uid,
.jobs_packed = 0,
.protocol_version = protocol_version,
.show_flags = show_flags,
.uid = uid,
.has_qos_lock = true,
.user_rec.uid = uid,
};
assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .user = READ_LOCK,
.qos = READ_LOCK };
xassert(job_ids);
assoc_mgr_lock(&locks);
assoc_mgr_fill_in_user(acct_db_conn, &pack_info.user_rec,
accounting_enforce, NULL, true);
pack_info.privileged = validate_operator_user_rec(&pack_info.user_rec);
pack_info.visible_parts = build_visible_parts(
uid, (pack_info.privileged || (show_flags & SHOW_ALL)));
list_for_each_ro(job_ids, _foreach_pack_jobid, &pack_info);
assoc_mgr_unlock(&locks);
/* put the real record count in the message body header */
tmp_offset = get_buf_offset(pack_info.buffer);
set_buf_offset(pack_info.buffer, 0);
pack32(pack_info.jobs_packed, pack_info.buffer);
set_buf_offset(pack_info.buffer, tmp_offset);
xfree(pack_info.visible_parts);
return pack_info.buffer;
}
/*
 * pack_one_job - dump information for one job in
* machine independent form (for network transmission)
* IN job_id - ID of job that we want info for
* IN show_flags - job filtering options
* IN uid - uid of user making request (for partition filtering)
* OUT buffer
*/
extern buf_t *pack_one_job(uint32_t job_id, uint16_t show_flags, uid_t uid,
uint16_t protocol_version)
{
job_record_t *job_ptr;
uint32_t jobs_packed = 0, tmp_offset;
buf_t *buffer;
assoc_mgr_lock_t locks = { .qos = READ_LOCK, .user = READ_LOCK };
slurmdb_user_rec_t user_rec = { 0 };
bool hide_job = false;
bool valid_operator;
buffer = _pack_init_job_info(protocol_version);
assoc_mgr_lock(&locks);
user_rec.uid = uid;
assoc_mgr_fill_in_user(acct_db_conn, &user_rec,
accounting_enforce, NULL, true);
job_ptr = find_job_record(job_id);
if (!(valid_operator = validate_operator_user_rec(&user_rec)))
hide_job = _hide_job_user_rec(job_ptr, &user_rec, show_flags);
if (!(show_flags & SHOW_ALL) && job_ptr && IS_JOB_REVOKED(job_ptr))
hide_job = true;
if (job_ptr && job_ptr->het_job_list) {
/* Pack heterogeneous job components */
if (!hide_job) {
_foreach_pack_job_info_t pack_info = {
.buffer = buffer,
.het_leader = job_ptr,
.jobs_packed = 0,
.protocol_version = protocol_version,
.show_flags = show_flags,
.uid = uid,
.has_qos_lock = true,
};
(void) list_for_each(job_ptr->het_job_list,
_foreach_pack_het_job,
&pack_info);
jobs_packed = pack_info.jobs_packed;
buffer = pack_info.buffer;
}
} else if (job_ptr && (job_ptr->array_task_id == NO_VAL) &&
!job_ptr->array_recs) {
/* Pack regular (not array) job */
if (!hide_job) {
pack_job(job_ptr, show_flags, buffer, protocol_version,
uid, true);
jobs_packed++;
}
} else {
bool packed_head = false;
/* Either the job is not found or it is a job array */
if (job_ptr) {
packed_head = true;
if (!hide_job) {
pack_job(job_ptr, show_flags, buffer,
protocol_version, uid, true);
jobs_packed++;
}
}
job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
while (job_ptr) {
if ((job_ptr->job_id == job_id) && packed_head) {
; /* Already packed */
} else if (!(show_flags & SHOW_ALL) &&
IS_JOB_REVOKED(job_ptr)) {
/*
				 * Array jobs can't be federated, but to be
				 * consistent and future-proof, don't pack
* revoked array jobs.
*/
} else if (job_ptr->array_job_id == job_id) {
if (valid_operator ||
!_hide_job_user_rec(job_ptr, &user_rec,
show_flags)) {
pack_job(job_ptr, show_flags, buffer,
protocol_version, uid, true);
jobs_packed++;
}
}
job_ptr = job_ptr->job_array_next_j;
}
}
assoc_mgr_unlock(&locks);
if (jobs_packed == 0) {
FREE_NULL_BUFFER(buffer);
return NULL;
}
/* put the real record count in the message body header */
tmp_offset = get_buf_offset(buffer);
set_buf_offset(buffer, 0);
pack32(jobs_packed, buffer);
set_buf_offset(buffer, tmp_offset);
return buffer;
}
static void _pack_job_gres(job_record_t *dump_job_ptr, buf_t *buffer,
uint16_t protocol_version)
{
if (!IS_JOB_STARTED(dump_job_ptr) || IS_JOB_FINISHED(dump_job_ptr) ||
(dump_job_ptr->gres_list_req == NULL)) {
packstr_array(NULL, 0, buffer);
return;
}
packstr_array(dump_job_ptr->gres_detail_str,
dump_job_ptr->gres_detail_cnt, buffer);
}
/*
* pack_job - dump all configuration information about a specific job in
* machine independent form (for network transmission)
* IN dump_job_ptr - pointer to job for which information is requested
* IN show_flags - job filtering options
* IN/OUT buffer - buffer in which data is placed, pointers automatically
* updated
* IN uid - user requesting the data
* NOTE: change _unpack_job_info_members() in common/slurm_protocol_pack.c
* whenever the data format changes
*/
void pack_job(job_record_t *dump_job_ptr, uint16_t show_flags, buf_t *buffer,
uint16_t protocol_version, uid_t uid, bool has_qos_lock)
{
job_details_t *detail_ptr;
time_t accrue_time = 0, begin_time = 0, start_time = 0, end_time = 0;
uint32_t time_limit;
char *nodelist = NULL;
assoc_mgr_lock_t locks = { .qos = READ_LOCK };
xassert(!has_qos_lock || verify_assoc_lock(QOS_LOCK, READ_LOCK));
	/*
	 * NOTE: There are nested pack blocks in
	 * job_record_pack_common() and
	 * job_record_pack_details_common(). Bump this protocol block when
	 * bumping the blocks in these functions to help keep symmetry between
	 * pack and unpack.
	 */
if (protocol_version >= SLURM_25_05_PROTOCOL_VERSION) {
job_record_pack_common(dump_job_ptr, false, buffer,
protocol_version);
if (dump_job_ptr->array_recs) {
build_array_str(dump_job_ptr);
packstr(dump_job_ptr->array_recs->task_id_str, buffer);
pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
} else {
job_record_t *array_head = NULL;
packnull(buffer);
if (dump_job_ptr->array_job_id) {
array_head = find_job_record(
dump_job_ptr->array_job_id);
}
if (array_head && array_head->array_recs) {
pack32(array_head->array_recs->max_run_tasks,
buffer);
} else {
pack32(0, buffer);
}
}
if ((dump_job_ptr->time_limit == NO_VAL) &&
dump_job_ptr->part_ptr)
time_limit = dump_job_ptr->part_ptr->max_time;
else
time_limit = dump_job_ptr->time_limit;
pack32(time_limit, buffer);
if (IS_JOB_STARTED(dump_job_ptr)) {
/* Report actual start time, in past */
start_time = dump_job_ptr->start_time;
end_time = dump_job_ptr->end_time;
} else if (dump_job_ptr->start_time != 0) {
/*
* Report expected start time,
* making sure that time is not in the past
*/
start_time = MAX(dump_job_ptr->start_time, time(NULL));
if (time_limit != NO_VAL) {
end_time = MAX(dump_job_ptr->end_time,
(start_time + time_limit * 60));
}
} else if (dump_job_ptr->details->begin_time > time(NULL)) {
/* earliest start time in the future */
start_time = dump_job_ptr->details->begin_time;
if (time_limit != NO_VAL) {
end_time = MAX(dump_job_ptr->end_time,
(start_time + time_limit * 60));
}
}
pack_time(start_time, buffer);
pack_time(end_time, buffer);
if (dump_job_ptr->prio_mult) {
pack32_array(dump_job_ptr->prio_mult->priority_array,
(dump_job_ptr->prio_mult->priority_array) ?
list_count(dump_job_ptr->part_ptr_list) :
0, buffer);
packstr(dump_job_ptr->prio_mult->priority_array_names,
buffer);
} else {
packnull(buffer);
packnull(buffer);
}
packstr(slurm_conf.cluster_name, buffer);
/*
* Only send the allocated nodelist since we are only sending
* the number of cpus and nodes that are currently allocated.
*/
if (!IS_JOB_COMPLETING(dump_job_ptr))
packstr(dump_job_ptr->nodes, buffer);
else {
nodelist = bitmap2node_name(
dump_job_ptr->node_bitmap_cg);
packstr(nodelist, buffer);
xfree(nodelist);
}
packstr(dump_job_ptr->sched_nodes, buffer);
if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
packstr(dump_job_ptr->part_ptr->name, buffer);
else
packstr(dump_job_ptr->partition, buffer);
if (IS_JOB_PENDING(dump_job_ptr) &&
dump_job_ptr->details->qos_req)
packstr(dump_job_ptr->details->qos_req, buffer);
else {
if (!has_qos_lock)
assoc_mgr_lock(&locks);
if (dump_job_ptr->qos_ptr)
packstr(dump_job_ptr->qos_ptr->name, buffer);
else {
if (assoc_mgr_qos_list) {
packstr(slurmdb_qos_str(
assoc_mgr_qos_list,
dump_job_ptr->qos_id),
buffer);
} else
packnull(buffer);
}
}
if (IS_JOB_STARTED(dump_job_ptr) &&
(slurm_conf.preempt_mode != PREEMPT_MODE_OFF) &&
(slurm_job_preempt_mode(dump_job_ptr) !=
PREEMPT_MODE_OFF)) {
time_t preemptable = acct_policy_get_preemptable_time(
dump_job_ptr);
pack_time(preemptable, buffer);
} else {
pack_time(0, buffer);
}
if (!has_qos_lock)
assoc_mgr_unlock(&locks);
if (show_flags & SHOW_DETAIL) {
pack_job_resources(dump_job_ptr->job_resrcs, buffer,
protocol_version);
_pack_job_gres(dump_job_ptr, buffer, protocol_version);
} else {
pack32(NO_VAL, buffer);
pack32((uint32_t)0, buffer);
}
if (!IS_JOB_COMPLETING(dump_job_ptr))
pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
else
pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
/* A few details are always dumped here */
_pack_default_job_details(dump_job_ptr, buffer,
protocol_version);
/*
* other job details are only dumped until the job starts
* running (at which time they become meaningless)
*/
_pack_pending_job_details(dump_job_ptr->details,
buffer, protocol_version);
} else if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) {
job_record_pack_common(dump_job_ptr, false, buffer,
protocol_version);
if (dump_job_ptr->array_recs) {
build_array_str(dump_job_ptr);
packstr(dump_job_ptr->array_recs->task_id_str, buffer);
pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
} else {
job_record_t *array_head = NULL;
packnull(buffer);
if (dump_job_ptr->array_job_id) {
array_head = find_job_record(
dump_job_ptr->array_job_id);
}
if (array_head && array_head->array_recs) {
pack32(array_head->array_recs->max_run_tasks,
buffer);
} else {
pack32(0, buffer);
}
}
if ((dump_job_ptr->time_limit == NO_VAL) &&
dump_job_ptr->part_ptr)
time_limit = dump_job_ptr->part_ptr->max_time;
else
time_limit = dump_job_ptr->time_limit;
pack32(time_limit, buffer);
if (IS_JOB_STARTED(dump_job_ptr)) {
/* Report actual start time, in past */
start_time = dump_job_ptr->start_time;
end_time = dump_job_ptr->end_time;
} else if (dump_job_ptr->start_time != 0) {
/*
* Report expected start time,
* making sure that time is not in the past
*/
start_time = MAX(dump_job_ptr->start_time, time(NULL));
if (time_limit != NO_VAL) {
end_time = MAX(dump_job_ptr->end_time,
(start_time + time_limit * 60));
}
} else if (dump_job_ptr->details->begin_time > time(NULL)) {
/* earliest start time in the future */
start_time = dump_job_ptr->details->begin_time;
if (time_limit != NO_VAL) {
end_time = MAX(dump_job_ptr->end_time,
(start_time + time_limit * 60));
}
}
pack_time(start_time, buffer);
pack_time(end_time, buffer);
if (dump_job_ptr->prio_mult) {
pack32_array(dump_job_ptr->prio_mult->priority_array,
(dump_job_ptr->prio_mult->priority_array) ?
list_count(dump_job_ptr->part_ptr_list) :
0, buffer);
packstr(dump_job_ptr->prio_mult->priority_array_names,
buffer);
} else {
packnull(buffer);
packnull(buffer);
}
packstr(slurm_conf.cluster_name, buffer);
/*
* Only send the allocated nodelist since we are only sending
* the number of cpus and nodes that are currently allocated.
*/
if (!IS_JOB_COMPLETING(dump_job_ptr))
packstr(dump_job_ptr->nodes, buffer);
else {
nodelist = bitmap2node_name(
dump_job_ptr->node_bitmap_cg);
packstr(nodelist, buffer);
xfree(nodelist);
}
packstr(dump_job_ptr->sched_nodes, buffer);
if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
packstr(dump_job_ptr->part_ptr->name, buffer);
else
packstr(dump_job_ptr->partition, buffer);
if (IS_JOB_PENDING(dump_job_ptr) &&
dump_job_ptr->details->qos_req)
packstr(dump_job_ptr->details->qos_req, buffer);
else {
if (!has_qos_lock)
assoc_mgr_lock(&locks);
if (dump_job_ptr->qos_ptr)
packstr(dump_job_ptr->qos_ptr->name, buffer);
else {
if (assoc_mgr_qos_list) {
packstr(slurmdb_qos_str(
assoc_mgr_qos_list,
dump_job_ptr->qos_id),
buffer);
} else
packnull(buffer);
}
}
if (IS_JOB_STARTED(dump_job_ptr) &&
(slurm_conf.preempt_mode != PREEMPT_MODE_OFF) &&
(slurm_job_preempt_mode(dump_job_ptr) !=
PREEMPT_MODE_OFF)) {
time_t preemptable = acct_policy_get_preemptable_time(
dump_job_ptr);
pack_time(preemptable, buffer);
} else {
pack_time(0, buffer);
}
if (!has_qos_lock)
assoc_mgr_unlock(&locks);
if (show_flags & SHOW_DETAIL) {
pack_job_resources(dump_job_ptr->job_resrcs, buffer,
protocol_version);
_pack_job_gres(dump_job_ptr, buffer, protocol_version);
} else {
pack32(NO_VAL, buffer);
pack32((uint32_t)0, buffer);
}
if (!IS_JOB_COMPLETING(dump_job_ptr))
pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
else
pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
/* A few details are always dumped here */
_pack_default_job_details(dump_job_ptr, buffer,
protocol_version);
/*
* other job details are only dumped until the job starts
* running (at which time they become meaningless)
*/
_pack_pending_job_details(dump_job_ptr->details,
buffer, protocol_version);
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
detail_ptr = dump_job_ptr->details;
pack32(dump_job_ptr->array_job_id, buffer);
pack32(dump_job_ptr->array_task_id, buffer);
if (dump_job_ptr->array_recs) {
build_array_str(dump_job_ptr);
packstr(dump_job_ptr->array_recs->task_id_str, buffer);
pack32(dump_job_ptr->array_recs->max_run_tasks, buffer);
} else {
job_record_t *array_head = NULL;
packnull(buffer);
if (dump_job_ptr->array_job_id) {
array_head = find_job_record(
dump_job_ptr->array_job_id);
}
if (array_head && array_head->array_recs) {
pack32(array_head->array_recs->max_run_tasks,
buffer);
} else {
pack32(0, buffer);
}
}
pack32(dump_job_ptr->assoc_id, buffer);
packstr(dump_job_ptr->container, buffer);
packstr(dump_job_ptr->container_id, buffer);
pack32(dump_job_ptr->delay_boot, buffer);
packstr(dump_job_ptr->failed_node, buffer);
pack32(dump_job_ptr->job_id, buffer);
pack32(dump_job_ptr->user_id, buffer);
pack32(dump_job_ptr->group_id, buffer);
pack32(dump_job_ptr->het_job_id, buffer);
packstr(dump_job_ptr->het_job_id_set, buffer);
pack32(dump_job_ptr->het_job_offset, buffer);
pack32(dump_job_ptr->profile, buffer);
pack32(dump_job_ptr->job_state, buffer);
pack16(dump_job_ptr->batch_flag, buffer);
pack32(dump_job_ptr->state_reason, buffer);
pack8(0, buffer); /* was power_flags */
pack8(dump_job_ptr->reboot, buffer);
pack16(dump_job_ptr->restart_cnt, buffer);
pack16(show_flags, buffer);
pack_time(dump_job_ptr->deadline, buffer);
pack32(dump_job_ptr->alloc_sid, buffer);
if ((dump_job_ptr->time_limit == NO_VAL) &&
dump_job_ptr->part_ptr)
time_limit = dump_job_ptr->part_ptr->max_time;
else
time_limit = dump_job_ptr->time_limit;
pack32(time_limit, buffer);
pack32(dump_job_ptr->time_min, buffer);
if (dump_job_ptr->details) {
pack32(dump_job_ptr->details->nice, buffer);
pack_time(dump_job_ptr->details->submit_time, buffer);
/* Earliest possible begin time */
begin_time = dump_job_ptr->details->begin_time;
/* When we started accruing time for priority */
accrue_time = dump_job_ptr->details->accrue_time;
} else { /* Some job details may be purged after completion */
pack32(NICE_OFFSET, buffer); /* Best guess */
pack_time((time_t)0, buffer);
}
pack_time(begin_time, buffer);
pack_time(accrue_time, buffer);
if (IS_JOB_STARTED(dump_job_ptr)) {
/* Report actual start time, in past */
start_time = dump_job_ptr->start_time;
end_time = dump_job_ptr->end_time;
} else if (dump_job_ptr->start_time != 0) {
/*
* Report expected start time,
* making sure that time is not in the past
*/
start_time = MAX(dump_job_ptr->start_time, time(NULL));
if (time_limit != NO_VAL) {
end_time = MAX(dump_job_ptr->end_time,
(start_time + time_limit * 60));
}
} else if (begin_time > time(NULL)) {
/* earliest start time in the future */
start_time = begin_time;
if (time_limit != NO_VAL) {
end_time = MAX(dump_job_ptr->end_time,
(start_time + time_limit * 60));
}
}
pack_time(start_time, buffer);
pack_time(end_time, buffer);
pack_time(dump_job_ptr->suspend_time, buffer);
pack_time(dump_job_ptr->pre_sus_time, buffer);
pack_time(dump_job_ptr->resize_time, buffer);
pack_time(dump_job_ptr->last_sched_eval, buffer);
pack_time(dump_job_ptr->preempt_time, buffer);
pack32(dump_job_ptr->priority, buffer);
if (dump_job_ptr->prio_mult) {
pack32_array(dump_job_ptr->prio_mult->priority_array,
(dump_job_ptr->prio_mult->priority_array) ?
list_count(dump_job_ptr->part_ptr_list) :
0, buffer);
packstr(dump_job_ptr->prio_mult->priority_array_names,
buffer);
} else {
packnull(buffer);
packnull(buffer);
}
packdouble(dump_job_ptr->billable_tres, buffer);
packstr(slurm_conf.cluster_name, buffer);
/*
* Only send the allocated nodelist since we are only sending
* the number of cpus and nodes that are currently allocated.
*/
if (!IS_JOB_COMPLETING(dump_job_ptr))
packstr(dump_job_ptr->nodes, buffer);
else {
nodelist = bitmap2node_name(
dump_job_ptr->node_bitmap_cg);
packstr(nodelist, buffer);
xfree(nodelist);
}
packstr(dump_job_ptr->sched_nodes, buffer);
if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr)
packstr(dump_job_ptr->part_ptr->name, buffer);
else
packstr(dump_job_ptr->partition, buffer);
packstr(dump_job_ptr->account, buffer);
packstr(dump_job_ptr->admin_comment, buffer);
pack32(dump_job_ptr->site_factor, buffer);
packstr(dump_job_ptr->network, buffer);
packstr(dump_job_ptr->comment, buffer);
packstr(dump_job_ptr->extra, buffer);
packstr(dump_job_ptr->container, buffer);
packstr(dump_job_ptr->batch_features, buffer);
packstr(dump_job_ptr->batch_host, buffer);
packstr(dump_job_ptr->burst_buffer, buffer);
packstr(dump_job_ptr->burst_buffer_state, buffer);
packstr(dump_job_ptr->system_comment, buffer);
if (!has_qos_lock)
assoc_mgr_lock(&locks);
if (dump_job_ptr->qos_ptr)
packstr(dump_job_ptr->qos_ptr->name, buffer);
else {
if (assoc_mgr_qos_list) {
packstr(slurmdb_qos_str(assoc_mgr_qos_list,
dump_job_ptr->qos_id),
buffer);
} else
packnull(buffer);
}
if (IS_JOB_STARTED(dump_job_ptr) &&
(slurm_conf.preempt_mode != PREEMPT_MODE_OFF) &&
(slurm_job_preempt_mode(dump_job_ptr) !=
PREEMPT_MODE_OFF)) {
time_t preemptable = acct_policy_get_preemptable_time(
dump_job_ptr);
pack_time(preemptable, buffer);
} else {
pack_time(0, buffer);
}
if (!has_qos_lock)
assoc_mgr_unlock(&locks);
packstr(dump_job_ptr->licenses, buffer);
packstr(dump_job_ptr->state_desc, buffer);
packstr(dump_job_ptr->resv_name, buffer);
packstr(dump_job_ptr->resv_ports, buffer);
packstr(dump_job_ptr->mcs_label, buffer);
pack32(dump_job_ptr->exit_code, buffer);
pack32(dump_job_ptr->derived_ec, buffer);
packstr(dump_job_ptr->gres_used, buffer);
if (show_flags & SHOW_DETAIL) {
pack_job_resources(dump_job_ptr->job_resrcs, buffer,
protocol_version);
_pack_job_gres(dump_job_ptr, buffer, protocol_version);
} else {
pack32(NO_VAL, buffer);
pack32((uint32_t)0, buffer);
}
packstr(dump_job_ptr->name, buffer);
packstr(dump_job_ptr->user_name, buffer);
packstr(dump_job_ptr->wckey, buffer);
pack32(dump_job_ptr->req_switch, buffer);
pack32(dump_job_ptr->wait4switch, buffer);
packstr(dump_job_ptr->alloc_node, buffer);
if (!IS_JOB_COMPLETING(dump_job_ptr))
pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer);
else
pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer);
/* A few details are always dumped here */
_pack_default_job_details(dump_job_ptr, buffer,
protocol_version);
/*
* other job details are only dumped until the job starts
* running (at which time they become meaningless)
*/
if (detail_ptr)
_pack_pending_job_details(detail_ptr, buffer,
protocol_version);
else
_pack_pending_job_details(NULL, buffer,
protocol_version);
pack64(dump_job_ptr->bit_flags, buffer);
packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
packstr(dump_job_ptr->tres_fmt_req_str, buffer);
pack16(dump_job_ptr->start_protocol_ver, buffer);
if (dump_job_ptr->fed_details) {
packstr(dump_job_ptr->fed_details->origin_str, buffer);
pack64(dump_job_ptr->fed_details->siblings_active,
buffer);
packstr(dump_job_ptr->fed_details->siblings_active_str,
buffer);
pack64(dump_job_ptr->fed_details->siblings_viable,
buffer);
packstr(dump_job_ptr->fed_details->siblings_viable_str,
buffer);
} else {
packnull(buffer);
pack64((uint64_t)0, buffer);
packnull(buffer);
pack64((uint64_t)0, buffer);
packnull(buffer);
}
packstr(dump_job_ptr->cpus_per_tres, buffer);
packstr(dump_job_ptr->mem_per_tres, buffer);
packstr(dump_job_ptr->tres_bind, buffer);
packstr(dump_job_ptr->tres_freq, buffer);
packstr(dump_job_ptr->tres_per_job, buffer);
packstr(dump_job_ptr->tres_per_node, buffer);
packstr(dump_job_ptr->tres_per_socket, buffer);
packstr(dump_job_ptr->tres_per_task, buffer);
pack16(dump_job_ptr->mail_type, buffer);
packstr(dump_job_ptr->mail_user, buffer);
packstr(dump_job_ptr->selinux_context, buffer);
} else {
error("pack_job: protocol_version "
"%hu not supported", protocol_version);
}
}
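/*
 * Illustrative sketch with hypothetical fields, not part of the build:
 * each protocol block above must be mirrored field-for-field by the
 * unpack side in common/slurm_protocol_pack.c. A symmetric pair looks
 * roughly like:
 *
 *	pack32(job->some_id, buffer);    <->  safe_unpack32(&some_id, buffer);
 *	packstr(job->some_name, buffer); <->  safe_unpackstr(&some_name, buffer);
 *
 * Adding, removing, or reordering a field on one side without the other
 * desynchronizes the byte stream for that protocol version.
 */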
static void _find_node_config(int *cpu_cnt_ptr, int *core_cnt_ptr)
{
static int max_cpu_cnt = -1, max_core_cnt = -1;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
int i;
node_record_t *node_ptr;
slurm_mutex_lock(&lock);
if (max_cpu_cnt == -1) {
for (i = 0; (node_ptr = next_node(&i)); i++) {
/* Only data from config_record used for scheduling */
max_cpu_cnt = MAX(max_cpu_cnt,
node_ptr->config_ptr->cpus);
max_core_cnt = MAX(max_core_cnt,
node_ptr->config_ptr->cores);
}
}
slurm_mutex_unlock(&lock);
*cpu_cnt_ptr = max_cpu_cnt;
*core_cnt_ptr = max_core_cnt;
}
/* pack default job details for "get_job_info" RPC */
static void _pack_default_job_details(job_record_t *job_ptr, buf_t *buffer,
uint16_t protocol_version)
{
int max_cpu_cnt = -1, max_core_cnt = -1;
job_details_t *detail_ptr = job_ptr->details;
uint16_t shared = 0;
shared = get_job_share_value(job_ptr);
if (job_ptr->part_ptr && job_ptr->part_ptr->max_cpu_cnt) {
max_cpu_cnt = job_ptr->part_ptr->max_cpu_cnt;
max_core_cnt = job_ptr->part_ptr->max_core_cnt;
} else
_find_node_config(&max_cpu_cnt, &max_core_cnt);
if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) {
if (!detail_ptr) {
packbool(false, buffer);
if (job_ptr->total_cpus)
pack32(job_ptr->total_cpus, buffer);
else
pack32(job_ptr->cpu_cnt, buffer);
pack32(job_ptr->node_cnt, buffer);
pack32(NICE_OFFSET, buffer); /* Best guess */
return;
}
packbool(true, buffer);
job_record_pack_details_common(detail_ptr, buffer,
protocol_version);
if (!IS_JOB_PENDING(job_ptr)) {
packstr(detail_ptr->features_use, buffer);
packnull(buffer);
} else {
packstr(detail_ptr->features, buffer);
packstr(detail_ptr->prefer, buffer);
}
if (detail_ptr->argv)
packstr(detail_ptr->argv[0], buffer);
else
packnull(buffer);
if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) {
pack32(job_ptr->cpu_cnt, buffer);
pack32((uint32_t) 0, buffer);
} else if (job_ptr->total_cpus &&
!IS_JOB_PENDING(job_ptr)) {
/* If job is PENDING ignore total_cpus,
* which may have been set by previous run
* followed by job requeue. */
pack32(job_ptr->total_cpus, buffer);
pack32((uint32_t) 0, buffer);
} else {
pack32(detail_ptr->min_cpus, buffer);
if (detail_ptr->max_cpus != NO_VAL)
pack32(detail_ptr->max_cpus, buffer);
else
pack32((uint32_t) 0, buffer);
}
if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) {
pack32(job_ptr->node_cnt, buffer);
pack32((uint32_t) 0, buffer);
} else if (job_ptr->total_nodes) {
pack32(job_ptr->total_nodes, buffer);
pack32((uint32_t) 0, buffer);
} else if (job_ptr->node_cnt_wag) {
/* This should catch everything else, but
* just in case this is 0 (startup or
* whatever) we will keep the rest of
* this if statement around.
*/
pack32(job_ptr->node_cnt_wag, buffer);
pack32((uint32_t) detail_ptr->max_nodes,
buffer);
} else if (detail_ptr->ntasks_per_node) {
/* min_nodes based upon task count and ntasks
* per node */
uint32_t min_nodes;
min_nodes = detail_ptr->num_tasks /
detail_ptr->ntasks_per_node;
min_nodes = MAX(min_nodes,
detail_ptr->min_nodes);
pack32(min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
} else if (detail_ptr->cpus_per_task > 1) {
/* min_nodes based upon task count and cpus
* per task */
uint32_t ntasks_per_node, min_nodes;
ntasks_per_node = max_cpu_cnt /
detail_ptr->cpus_per_task;
ntasks_per_node = MAX(ntasks_per_node, 1);
min_nodes = detail_ptr->num_tasks /
ntasks_per_node;
min_nodes = MAX(min_nodes,
detail_ptr->min_nodes);
pack32(min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
} else if (detail_ptr->mc_ptr &&
detail_ptr->mc_ptr->ntasks_per_core &&
(detail_ptr->mc_ptr->ntasks_per_core
!= INFINITE16)) {
/* min_nodes based upon task count and ntasks
* per core */
uint32_t min_cores, min_nodes;
min_cores = ROUNDUP(detail_ptr->num_tasks,
detail_ptr->mc_ptr->
ntasks_per_core);
min_nodes = ROUNDUP(min_cores, max_core_cnt);
min_nodes = MAX(min_nodes,
detail_ptr->min_nodes);
pack32(min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
} else {
/* min_nodes based upon task count only */
uint32_t min_nodes;
uint32_t max_nodes;
min_nodes = ROUNDUP(detail_ptr->num_tasks,
max_cpu_cnt);
min_nodes = MAX(min_nodes,
detail_ptr->min_nodes);
max_nodes = MAX(min_nodes,
detail_ptr->max_nodes);
pack32(min_nodes, buffer);
pack32(max_nodes, buffer);
}
if (detail_ptr->num_tasks)
pack32(detail_ptr->num_tasks, buffer);
else if (IS_JOB_PENDING(job_ptr))
pack32(detail_ptr->min_nodes, buffer);
else if (job_ptr->tres_alloc_cnt)
pack32((uint32_t)
job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE],
buffer);
else
pack32(NO_VAL, buffer);
pack16(shared, buffer);
if (detail_ptr->crontab_entry)
packstr(detail_ptr->crontab_entry->cronspec,
buffer);
else
packnull(buffer);
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
if (detail_ptr) {
if (!IS_JOB_PENDING(job_ptr)) {
packstr(detail_ptr->features_use, buffer);
packnull(buffer);
} else {
packstr(detail_ptr->features, buffer);
packstr(detail_ptr->prefer, buffer);
}
packstr(detail_ptr->cluster_features, buffer);
packstr(detail_ptr->work_dir, buffer);
packstr(detail_ptr->dependency, buffer);
if (detail_ptr->argv)
packstr(detail_ptr->argv[0], buffer);
else
packnull(buffer);
if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) {
pack32(job_ptr->cpu_cnt, buffer);
pack32((uint32_t) 0, buffer);
} else if (job_ptr->total_cpus &&
!IS_JOB_PENDING(job_ptr)) {
/* If job is PENDING ignore total_cpus,
* which may have been set by previous run
* followed by job requeue. */
pack32(job_ptr->total_cpus, buffer);
pack32((uint32_t) 0, buffer);
} else {
pack32(detail_ptr->min_cpus, buffer);
if (detail_ptr->max_cpus != NO_VAL)
pack32(detail_ptr->max_cpus, buffer);
else
pack32((uint32_t) 0, buffer);
}
if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) {
pack32(job_ptr->node_cnt, buffer);
pack32((uint32_t) 0, buffer);
} else if (job_ptr->total_nodes) {
pack32(job_ptr->total_nodes, buffer);
pack32((uint32_t) 0, buffer);
} else if (job_ptr->node_cnt_wag) {
/* This should catch everything else, but
* just in case this is 0 (startup or
* whatever) we will keep the rest of
* this if statement around.
*/
pack32(job_ptr->node_cnt_wag, buffer);
pack32((uint32_t) detail_ptr->max_nodes,
buffer);
} else if (detail_ptr->ntasks_per_node) {
/* min_nodes based upon task count and ntasks
* per node */
uint32_t min_nodes;
min_nodes = detail_ptr->num_tasks /
detail_ptr->ntasks_per_node;
min_nodes = MAX(min_nodes,
detail_ptr->min_nodes);
pack32(min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
} else if (detail_ptr->cpus_per_task > 1) {
/* min_nodes based upon task count and cpus
* per task */
uint32_t ntasks_per_node, min_nodes;
ntasks_per_node = max_cpu_cnt /
detail_ptr->cpus_per_task;
ntasks_per_node = MAX(ntasks_per_node, 1);
min_nodes = detail_ptr->num_tasks /
ntasks_per_node;
min_nodes = MAX(min_nodes,
detail_ptr->min_nodes);
pack32(min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
} else if (detail_ptr->mc_ptr &&
detail_ptr->mc_ptr->ntasks_per_core &&
(detail_ptr->mc_ptr->ntasks_per_core
!= INFINITE16)) {
/* min_nodes based upon task count and ntasks
* per core */
uint32_t min_cores, min_nodes;
min_cores = ROUNDUP(detail_ptr->num_tasks,
detail_ptr->mc_ptr->
ntasks_per_core);
min_nodes = ROUNDUP(min_cores, max_core_cnt);
min_nodes = MAX(min_nodes,
detail_ptr->min_nodes);
pack32(min_nodes, buffer);
pack32(detail_ptr->max_nodes, buffer);
} else {
/* min_nodes based upon task count only */
uint32_t min_nodes;
uint32_t max_nodes;
min_nodes = ROUNDUP(detail_ptr->num_tasks,
max_cpu_cnt);
min_nodes = MAX(min_nodes,
detail_ptr->min_nodes);
max_nodes = MAX(min_nodes,
detail_ptr->max_nodes);
pack32(min_nodes, buffer);
pack32(max_nodes, buffer);
}
pack_bit_str_hex(detail_ptr->job_size_bitmap, buffer);
pack16(detail_ptr->requeue, buffer);
pack16(detail_ptr->ntasks_per_node, buffer);
pack16(detail_ptr->ntasks_per_tres, buffer);
if (detail_ptr->num_tasks)
pack32(detail_ptr->num_tasks, buffer);
else if (IS_JOB_PENDING(job_ptr))
pack32(detail_ptr->min_nodes, buffer);
else if (job_ptr->tres_alloc_cnt)
pack32((uint32_t)
job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE],
buffer);
else
pack32(NO_VAL, buffer);
pack16(shared, buffer);
pack32(detail_ptr->cpu_freq_min, buffer);
pack32(detail_ptr->cpu_freq_max, buffer);
pack32(detail_ptr->cpu_freq_gov, buffer);
if (detail_ptr->crontab_entry)
packstr(detail_ptr->crontab_entry->cronspec,
buffer);
else
packnull(buffer);
} else {
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
if (job_ptr->total_cpus)
pack32(job_ptr->total_cpus, buffer);
else
pack32(job_ptr->cpu_cnt, buffer);
pack32((uint32_t) 0, buffer);
pack32(job_ptr->node_cnt, buffer);
pack32((uint32_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack32((uint32_t) 0, buffer);
pack32((uint32_t) 0, buffer);
pack32((uint32_t) 0, buffer);
packnull(buffer);
}
} else {
error("_pack_default_job_details: protocol_version "
"%hu not supported", protocol_version);
}
}
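/*
 * Worked example of the min_nodes estimates above (illustrative numbers
 * only). With num_tasks = 100 and ntasks_per_node = 4, the estimate is
 * 100 / 4 = 25 nodes. With cpus_per_task = 3 and a largest configured
 * per-node CPU count of 8, ntasks_per_node is derived as 8 / 3 = 2,
 * giving 100 / 2 = 50 nodes. The fallback rounds up rather than
 * truncating, e.g. ROUNDUP(100, 48) = 3 nodes when only the task count
 * and a 48-CPU per-node maximum are known. Every estimate is then
 * clamped to at least detail_ptr->min_nodes.
 */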
/* pack pending job details for "get_job_info" RPC */
static void _pack_pending_job_details(job_details_t *detail_ptr, buf_t *buffer,
uint16_t protocol_version)
{
if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) {
if (detail_ptr) {
pack16(detail_ptr->contiguous, buffer);
pack16(detail_ptr->core_spec, buffer);
pack16(detail_ptr->cpus_per_task, buffer);
pack16(detail_ptr->pn_min_cpus, buffer);
pack64(detail_ptr->pn_min_memory, buffer);
pack32(detail_ptr->pn_min_tmp_disk, buffer);
pack16(detail_ptr->oom_kill_step, buffer);
packstr(detail_ptr->req_nodes, buffer);
pack_bit_str_hex(detail_ptr->req_node_bitmap, buffer);
packstr(detail_ptr->exc_nodes, buffer);
pack_bit_str_hex(detail_ptr->exc_node_bitmap, buffer);
packstr(detail_ptr->std_err, buffer);
packstr(detail_ptr->std_in, buffer);
packstr(detail_ptr->std_out, buffer);
pack_multi_core_data(detail_ptr->mc_ptr, buffer,
protocol_version);
} else {
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack64((uint64_t) 0, buffer);
pack32((uint32_t) 0, buffer);
pack16((uint16_t) 0, buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
pack_multi_core_data(NULL, buffer, protocol_version);
}
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
if (detail_ptr) {
pack16(detail_ptr->contiguous, buffer);
pack16(detail_ptr->core_spec, buffer);
pack16(detail_ptr->cpus_per_task, buffer);
pack16(detail_ptr->pn_min_cpus, buffer);
pack64(detail_ptr->pn_min_memory, buffer);
pack32(detail_ptr->pn_min_tmp_disk, buffer);
packstr(detail_ptr->req_nodes, buffer);
pack_bit_str_hex(detail_ptr->req_node_bitmap, buffer);
packstr(detail_ptr->exc_nodes, buffer);
pack_bit_str_hex(detail_ptr->exc_node_bitmap, buffer);
packstr(detail_ptr->std_err, buffer);
packstr(detail_ptr->std_in, buffer);
packstr(detail_ptr->std_out, buffer);
pack_multi_core_data(detail_ptr->mc_ptr, buffer,
protocol_version);
} else {
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack16((uint16_t) 0, buffer);
pack64((uint64_t) 0, buffer);
pack32((uint32_t) 0, buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
packnull(buffer);
pack_multi_core_data(NULL, buffer, protocol_version);
}
} else {
error("%s: protocol_version %hu not supported", __func__,
protocol_version);
}
}
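/*
 * Note the one-field difference between the two protocol branches above:
 * the newer format packs oom_kill_step (a pack16) after pn_min_tmp_disk,
 * so its NULL-details placeholder branch emits one extra pack16(0).
 * Both branches must stay field-for-field identical to their unpack
 * counterparts.
 */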
static int _foreach_set_het_job_for_purge(void *x, void *arg)
{
job_record_t *het_job = x;
job_record_t *het_leader = arg;
if (het_leader->het_job_id != het_job->het_job_id) {
error("%s: Bad het_job_list for %pJ", __func__, het_leader);
return 0;
}
het_job->bit_flags |= HETJOB_PURGE;
return 0;
}
static int _foreach_check_old_het_job(void *x, void *arg)
{
job_record_t *het_job = x;
job_record_t *het_leader = arg;
if (het_leader->het_job_id != het_job->het_job_id) {
error("%s: Bad het_job_list for %pJ", __func__, het_leader);
return 0;
}
if (!_list_find_job_old(het_job, NULL))
return -1;
return 0;
}
/* If this is a hetjob leader and all components are complete,
 * then purge all of its hetjob component records
 * RET true if this record was purged */
static inline bool _purge_complete_het_job(job_record_t *het_job_leader)
{
if (!het_job_leader->het_job_list)
return false; /* Not hetjob leader */
if (!IS_JOB_FINISHED(het_job_leader))
return false; /* Hetjob leader incomplete */
if (list_find_first(het_job_leader->het_job_list,
_foreach_check_old_het_job,
het_job_leader))
return false;
(void) list_for_each(het_job_leader->het_job_list,
_foreach_set_het_job_for_purge,
het_job_leader);
return true;
}
static int _foreach_pre_purge_old_job(void *x, void *arg)
{
job_record_t *job_ptr = x;
if (_purge_complete_het_job(job_ptr))
return 0;
if (!IS_JOB_PENDING(job_ptr))
return 0;
if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL) &&
!deadline_ok(job_ptr, __func__))
return 0;
/*
* If the dependency is already invalid there's no reason to
* keep checking it.
*/
if (job_ptr->state_reason == WAIT_DEP_INVALID)
return 0;
if (test_job_dependency(job_ptr, NULL) == FAIL_DEPEND) {
		/* Check the job's disposition for dealing
		 * with an invalid dependency
		 */
handle_invalid_dependency(job_ptr);
}
return 0;
}
/*
* If the job or slurm.conf requests to not kill on invalid dependency,
* then set the job state reason to WAIT_DEP_INVALID. Otherwise, kill the
* job.
*/
void handle_invalid_dependency(job_record_t *job_ptr)
{
job_ptr->state_reason = WAIT_DEP_INVALID;
xfree(job_ptr->state_desc);
if (job_ptr->mail_type & MAIL_INVALID_DEPEND)
mail_job_info(job_ptr, MAIL_INVALID_DEPEND);
if (job_ptr->bit_flags & KILL_INV_DEP) {
_kill_dependent(job_ptr);
} else if (job_ptr->bit_flags & NO_KILL_INV_DEP) {
debug("%s: %pJ job dependency never satisfied",
__func__, job_ptr);
} else if (kill_invalid_dep) {
_kill_dependent(job_ptr);
} else {
debug("%s: %pJ job dependency never satisfied",
__func__, job_ptr);
job_ptr->state_reason = WAIT_DEP_INVALID;
}
fed_mgr_remove_remote_dependencies(job_ptr);
}
/*
* purge_old_job - purge old job records.
 * The jobs must have completed at least MIN_JOB_AGE seconds ago.
* Test job dependencies, handle after_ok, after_not_ok before
* purging any jobs.
*/
void purge_old_job(void)
{
int i, purge_job_count;
xassert(verify_lock(CONF_LOCK, READ_LOCK));
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
xassert(verify_lock(NODE_LOCK, WRITE_LOCK));
xassert(verify_lock(FED_LOCK, READ_LOCK));
if ((purge_job_count = list_count(purge_files_list)))
debug("%s: job file deletion is falling behind, "
"%d left to remove", __func__, purge_job_count);
(void) list_for_each(job_list, _foreach_pre_purge_old_job, NULL);
fed_mgr_test_remote_dependencies();
i = list_delete_all(job_list, &_list_find_job_old, "");
if (i) {
debug2("purge_old_job: purged %d old job records", i);
last_job_update = time(NULL);
slurm_mutex_lock(&purge_thread_lock);
slurm_cond_signal(&purge_thread_cond);
slurm_mutex_unlock(&purge_thread_lock);
}
}
extern void free_old_jobs(void)
{
job_record_t *job_ptr;
/*
* Delete records one-by-one to avoid blocking purge_job_record().
*/
while ((job_ptr = list_pop(purge_jobs_list)))
job_record_delete(job_ptr);
}
/*
 * purge_job_record - purge a specific job record. No testing is performed to
 *	ensure the job record has no active references. Use only for job
 *	records that were never fully operational (e.g. WILL_RUN test, failed
 *	job load, failed job create, etc.).
 * IN job_id - job_id of job record to be purged
 * RET int - count of jobs purged
 * global: job_list - global job table
 */
extern int purge_job_record(uint32_t job_id)
{
int count = 0;
count = list_delete_all(job_list, _list_find_job_id, (void *)&job_id);
if (count) {
last_job_update = time(NULL);
slurm_mutex_lock(&purge_thread_lock);
slurm_cond_signal(&purge_thread_cond);
slurm_mutex_unlock(&purge_thread_lock);
}
return count;
}
extern void unlink_job_record(job_record_t *job_ptr)
{
uint32_t *job_id;
xassert(job_ptr->magic == JOB_MAGIC);
_delete_job_common(job_ptr);
job_id = xmalloc(sizeof(uint32_t));
*job_id = job_ptr->job_id;
list_enqueue(purge_files_list, job_id);
job_ptr->job_id = NO_VAL;
last_job_update = time(NULL);
slurm_mutex_lock(&purge_thread_lock);
slurm_cond_signal(&purge_thread_cond);
slurm_mutex_unlock(&purge_thread_lock);
}
/* update first assigned job id as needed on reconfigure */
void reset_first_job_id(void)
{
xassert(verify_lock(CONF_LOCK, READ_LOCK));
job_id_sequence = MAX(job_id_sequence, slurm_conf.first_job_id);
}
/*
* Return the next available job_id to be used.
*
* IN test_only - if true, doesn't advance the job_id sequence, just returns
* what the next job id will be.
* RET a valid job_id or SLURM_ERROR if all job_ids are exhausted.
*/
extern uint32_t get_next_job_id(bool test_only)
{
int i;
uint32_t new_id, max_jobs, tmp_id_sequence;
xassert(verify_lock(JOB_LOCK, READ_LOCK));
xassert(test_only || verify_lock(JOB_LOCK, WRITE_LOCK));
xassert(verify_lock(FED_LOCK, READ_LOCK));
max_jobs = slurm_conf.max_job_id - slurm_conf.first_job_id;
tmp_id_sequence = MAX(job_id_sequence, slurm_conf.first_job_id);
/* Ensure no conflict in job id if we roll over 32 bits */
for (i = 0; i < max_jobs; i++) {
if (tmp_id_sequence >= slurm_conf.max_job_id)
tmp_id_sequence = slurm_conf.first_job_id;
new_id = fed_mgr_get_job_id(tmp_id_sequence);
if (find_job_record(new_id)) {
tmp_id_sequence++;
continue;
}
if (_dup_job_file_test(new_id)) {
tmp_id_sequence++;
continue;
}
if (!test_only)
job_id_sequence = tmp_id_sequence + 1;
return new_id;
}
error("We have exhausted our supply of valid job id values. FirstJobId=%u MaxJobId=%u",
slurm_conf.first_job_id, slurm_conf.max_job_id);
return SLURM_ERROR;
}
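/*
 * Illustrative example values: the loop above treats the id space
 * [FirstJobId, MaxJobId) as a ring. With FirstJobId=1000, MaxJobId=1005
 * and job_id_sequence=1004, candidates are tried in the order
 * 1004, 1000, 1001, 1002, 1003; each is skipped if an active job record
 * or a leftover state file (_dup_job_file_test()) already uses it.
 */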
/*
* _set_job_id - set a default job_id, ensure that it is unique
* IN job_ptr - pointer to the job_record
*/
static int _set_job_id(job_record_t *job_ptr)
{
uint32_t new_id;
xassert(job_ptr);
xassert (job_ptr->magic == JOB_MAGIC);
if ((new_id = get_next_job_id(false)) != SLURM_ERROR) {
job_ptr->job_id = new_id;
		/* When we get a new job id we might as well make sure
		 * the db_index is set, since there is no way it will be
		 * correct otherwise :). */
job_record_set_sluid(job_ptr);
return SLURM_SUCCESS;
}
job_ptr->job_id = NO_VAL;
return EAGAIN;
}
/*
* set_job_prio - set a default job priority
* IN job_ptr - pointer to the job_record
*/
extern void set_job_prio(job_record_t *job_ptr)
{
uint32_t relative_prio;
xassert(job_ptr);
xassert (job_ptr->magic == JOB_MAGIC);
if (IS_JOB_FINISHED(job_ptr))
return;
job_ptr->priority = priority_g_set(lowest_prio, job_ptr);
if ((job_ptr->priority == 0) || (job_ptr->direct_set_prio))
return;
relative_prio = job_ptr->priority;
if (job_ptr->details && (job_ptr->details->nice != NICE_OFFSET)) {
int64_t offset = job_ptr->details->nice;
offset -= NICE_OFFSET;
relative_prio += offset;
}
lowest_prio = MIN(relative_prio, lowest_prio);
}
/* After recovering job state, if using priority/basic then we increment the
* priorities of all jobs to avoid decrementing the base down to zero */
extern void sync_job_priorities(void)
{
uint32_t prio_boost = 0;
if ((highest_prio != 0) && (highest_prio < TOP_PRIORITY))
prio_boost = TOP_PRIORITY - highest_prio;
prio_boost = priority_g_recover(prio_boost);
lowest_prio += prio_boost;
}
/*
* _higher_precedence - determine if job_ptr should be considered before
* job_ptr2 when scheduling jobs at submission time.
* This compares priority, submit time, and job id (in this order).
*
* IN job_ptr - pointer to first job
* IN job_ptr2 - pointer to second job
 * RET true if job_ptr has higher scheduling precedence than job_ptr2
*/
static bool _higher_precedence(job_record_t *job_ptr, job_record_t *job_ptr2)
{
xassert(job_ptr);
xassert(job_ptr2);
/* Compare priority */
if (job_ptr->priority > job_ptr2->priority)
return true;
if (job_ptr2->priority > job_ptr->priority)
return false;
/* Compare submit time */
if (job_ptr->details->submit_time && job_ptr2->details->submit_time) {
if (job_ptr->details->submit_time <
job_ptr2->details->submit_time)
return true;
if (job_ptr2->details->submit_time <
job_ptr->details->submit_time)
return false;
}
/* Compare job id */
return job_ptr->job_id < job_ptr2->job_id;
}
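/*
 * Illustrative example values: two jobs with priority 500 and identical
 * submit times fall through both comparisons, so the lower job id wins
 * and job 4210 precedes job 4211. The final job id comparison keeps the
 * ordering deterministic even when priority and submit time tie.
 */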
static int _is_flex_or_any_nodes(void *x, void *none)
{
slurmctld_resv_t *resv_ptr = x;
xassert(resv_ptr);
if (resv_ptr->flags & (RESERVE_FLAG_FLEX | RESERVE_FLAG_ANY_NODES))
return true;
return false;
}
static bool _use_none_resv_nodes(job_record_t *job_ptr)
{
if (!job_ptr->resv_name)
return true; /* no reservation is used */
if (!job_ptr->resv_list)
return _is_flex_or_any_nodes(job_ptr->resv_ptr, NULL);
return list_find_first(job_ptr->resv_list, _is_flex_or_any_nodes, NULL);
}
static int _match_resv_id(void *x, void *key)
{
slurmctld_resv_t *resv_ptr = (slurmctld_resv_t *) x;
uint32_t *resv_id = (uint32_t *) key;
xassert(resv_ptr);
if (resv_ptr->resv_id != *resv_id)
return 0;
else
return 1; /* match */
}
static int _will_resv_allow_warn_time(void *x, void *arg)
{
slurmctld_resv_t *resv_ptr = x;
uint16_t *warn_time = arg;
xassert(resv_ptr);
xassert(warn_time);
if (resv_ptr->max_start_delay &&
(*warn_time <= resv_ptr->max_start_delay))
return true;
return false;
}
static int _findfirst_resv_overlap_internal(void *x, void *arg)
{
slurmctld_resv_t *cur_resv_in = x;
findfirst_resv_overlap_t *findfirst_resv_overlap = arg;
slurmctld_resv_t *cur_resv_check = findfirst_resv_overlap->cur_resv;
if (cur_resv_check->resv_id == cur_resv_in->resv_id) {
findfirst_resv_overlap->found = true;
return -1;
} else if (cur_resv_check->resv_id < cur_resv_in->resv_id) {
return -1;
}
return 0;
}
static int _findfirst_resv_overlap(void *x, void *arg)
{
findfirst_resv_overlap_t *findfirst_resv_overlap = arg;
job_record_t *job_ptr2 = findfirst_resv_overlap->job_ptr2;
findfirst_resv_overlap->cur_resv = x;
	/*
	 * Continue with the next resv of the first job if cur_resv is
	 * still smaller than some resv_id in the second job's sorted
	 * list (no match is possible for cur_resv itself, but a later
	 * resv may match). Otherwise stop and report
	 * findfirst_resv_overlap.found.
	 */
if (!list_find_first(job_ptr2->resv_list,
_findfirst_resv_overlap_internal,
findfirst_resv_overlap) ||
findfirst_resv_overlap->found)
return -1;
return 0;
}
static bool _can_resv_overlap(top_prio_args_t *job_args, job_record_t *job_ptr2)
{
job_record_t *job_ptr1 = job_args->job_ptr;
findfirst_resv_overlap_t findfirst_resv_overlap = {
.found = false,
.job_ptr2 = job_ptr2,
};
if (job_args->use_none_resv_nodes && _use_none_resv_nodes(job_ptr2))
return true;
/*
* If job_ptr1 does not have a resv but uses --signal=R, check if any of
* job_ptr2's resv will allow overlap.
*/
if (!job_ptr1->resv_ptr && job_ptr2->resv_ptr &&
(job_ptr1->warn_flags & KILL_JOB_RESV)) {
if (!job_ptr2->resv_list)
return _will_resv_allow_warn_time(job_ptr2->resv_ptr,
&job_ptr1->warn_time);
return list_find_first(job_ptr2->resv_list,
_will_resv_allow_warn_time,
&job_ptr1->warn_time);
}
	/* If each job uses at most one resv, see if they match */
if (!job_ptr1->resv_list && !job_ptr2->resv_list)
return !xstrcmp(job_ptr1->resv_name, job_ptr2->resv_name);
/* If one doesn't use resv at this point they can't overlap */
if (!job_ptr1->resv_ptr || !job_ptr2->resv_ptr)
return false;
/* If one has a list of resv and the other has one resv */
if (job_ptr1->resv_list && !job_ptr2->resv_list)
return list_find_first(job_ptr1->resv_list, _match_resv_id,
&job_ptr2->resv_ptr->resv_id);
if (job_ptr2->resv_list && !job_ptr1->resv_list)
return list_find_first(job_ptr2->resv_list, _match_resv_id,
&job_ptr1->resv_ptr->resv_id);
/* Both jobs have resv lists - Note resv_list is sorted by id */
(void) list_find_first(job_ptr1->resv_list, _findfirst_resv_overlap,
&findfirst_resv_overlap);
return findfirst_resv_overlap.found;
}
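/*
 * Conceptual sketch, not part of the build: because both resv_lists are
 * sorted by resv_id, the nested list_find_first() calls above implement
 * a merge-style intersection test:
 *
 *	for each id1 in list1 (ascending):
 *		scan list2 (ascending) until some id2 >= id1
 *		if id2 == id1: overlap found, stop
 *		if list2 is exhausted (id1 > all id2): no overlap possible, stop
 *		otherwise (id2 > id1): try the next id1
 */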
static int _union_part_nodes(void *x, void *arg)
{
part_record_t *part_ptr = x;
bitstr_t *node_bitmap = arg;
xassert(part_ptr);
xassert(node_bitmap);
bit_or(node_bitmap, part_ptr->node_bitmap);
return SLURM_SUCCESS;
}
static bitstr_t *_get_all_part_nodes(job_record_t *job_ptr)
{
bitstr_t *node_bitmap = NULL;
if (!job_ptr->part_ptr_list)
return bit_copy(job_ptr->part_ptr->node_bitmap);
node_bitmap = bit_alloc(bit_size(job_ptr->part_ptr->node_bitmap));
list_for_each(job_ptr->part_ptr_list, _union_part_nodes, node_bitmap);
return node_bitmap;
}
/* Return 1 if higher, 0 if the same, and -1 if lower */
static int _cmp_part_prio_tier(top_prio_args_t *job_args,
job_record_t *job_ptr2)
{
uint16_t max_prio_tier2 = job_ptr2->part_ptr->priority_tier;
if (job_ptr2->part_ptr_list) {
/* part_ptr_list is sorted by priority tier */
part_record_t *part_ptr = list_peek(job_ptr2->part_ptr_list);
max_prio_tier2 = part_ptr->priority_tier;
}
/*
* Comparing the min partition priority tier of job_ptr1
* (the job in job_args) to the max of job_ptr2 is an optimization. It
* will prevent job_ptr1 from being considered top priority if it is
* possible for it to start in a lower priority tier partition than what
* job_ptr2 could start in, even if job_ptr1 could also potentially
* start in a higher priority tier partition.
*/
if (job_args->min_part_prio_tier > max_prio_tier2)
return 1;
if (job_args->min_part_prio_tier == max_prio_tier2)
return 0;
return -1;
}
static int _set_min_prio_tier(void *x, void *arg)
{
part_record_t * part_ptr = x;
uint16_t *min_prio_tier = arg;
xassert(part_ptr);
xassert(min_prio_tier);
if (part_ptr->priority_tier < *min_prio_tier)
*min_prio_tier = part_ptr->priority_tier;
return SLURM_SUCCESS;
}
static void _destroy_top_prio_args(top_prio_args_t *args)
{
if (!args || !args->job_ptr)
return;
/* Intentionally not freeing the job_ptr */
FREE_NULL_BITMAP(args->part_nodes);
}
static int _foreach_top_priority(void *x, void *arg)
{
job_record_t *job_ptr2 = x;
top_prio_args_t *job_args = arg;
job_record_t *job_ptr = job_args->job_ptr;
bool overlap_with_resv = false;
bool parts_overlap = false;
int part_prio_cmp;
bitstr_t *node_bitmap2 = NULL;
xassert(job_args->job_ptr);
if (job_ptr2 == job_ptr)
return 0;
if ((job_args->het_job_offset != NO_VAL) &&
(job_ptr->job_id == (job_ptr2->job_id + job_args->het_job_offset)))
return 0;
if (!IS_JOB_PENDING(job_ptr2))
return 0;
if (IS_JOB_COMPLETING(job_ptr2)) {
/* Job is hung in pending & completing state,
* indicative of job requeue */
return 0;
}
if (bf_min_age_reserve) {
int pend_time;
if (!job_ptr2->details->begin_time)
return 0;
pend_time = difftime(job_args->now,
job_ptr2->details->begin_time);
if (pend_time < bf_min_age_reserve)
return 0;
}
if (job_state_reason_check(job_ptr2->state_reason,
JSR_QOS_ASSOC | JSR_MISC | JSR_PART) ||
!job_independent(job_ptr2))
return 0;
if (job_ptr->resv_name && !job_ptr2->resv_name)
		return 0; /* jobs with resv have priority */
if (!_can_resv_overlap(job_args, job_ptr2))
return 0; /* job can't overlap nodes */
if (!job_ptr->resv_name && job_ptr2->resv_name)
overlap_with_resv = true;
if (bb_g_job_test_stage_in(job_ptr2, true) != 1)
return 0; /* Waiting for buffer */
	/*
	 * Priority tiers don't matter if job_ptr2 uses a resv
	 * and job_ptr does not, since reservations take precedence
	 */
part_prio_cmp = overlap_with_resv ?
-1 : _cmp_part_prio_tier(job_args, job_ptr2);
if ((part_prio_cmp == 1) ||
((part_prio_cmp == 0) && _higher_precedence(job_ptr, job_ptr2)))
return 0;
/*
* Here job_ptr2 is either in a higher priority tier
* partition or is using a resv while job_ptr is not.
* If partitions overlap job_ptr is not top priority.
*/
if (!job_args->part_nodes)
job_args->part_nodes = _get_all_part_nodes(job_ptr);
node_bitmap2 = _get_all_part_nodes(job_ptr2);
parts_overlap = bit_overlap_any(job_args->part_nodes, node_bitmap2);
FREE_NULL_BITMAP(node_bitmap2);
if (!parts_overlap)
return 0; /* no nodes overlap in partitions */
return -1;
}
/*
* _top_priority - determine if any other job has a higher priority than the
* specified job
* IN job_ptr - pointer to selected job
* RET true if selected job has highest priority
*/
static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset)
{
job_details_t *detail_ptr = job_ptr->details;
bool top;
if (job_ptr->priority == 0) /* user held */
top = false;
else {
top_prio_args_t job_args = {
.het_job_offset = het_job_offset,
.job_ptr = job_ptr,
.min_part_prio_tier = job_ptr->part_ptr->priority_tier,
.now = time(NULL),
.use_none_resv_nodes = _use_none_resv_nodes(job_ptr),
};
if (job_ptr->part_ptr_list)
list_for_each(job_ptr->part_ptr_list,
_set_min_prio_tier,
&job_args.min_part_prio_tier);
top = true; /* assume top priority until found otherwise */
if (list_find_first(job_list, _foreach_top_priority, &job_args))
top = false;
_destroy_top_prio_args(&job_args);
}
if ((!top) && detail_ptr) { /* not top prio */
if (job_ptr->priority == 0) { /* user/admin hold */
if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS
&& (job_ptr->state_reason != WAIT_RESV_DELETED)
&& (job_ptr->state_reason != FAIL_BURST_BUFFER_OP)
&& (job_ptr->state_reason != FAIL_ACCOUNT)
&& (job_ptr->state_reason != FAIL_QOS)
&& (job_ptr->state_reason != WAIT_HELD)
&& (job_ptr->state_reason != WAIT_HELD_USER)
&& job_ptr->state_reason != WAIT_MAX_REQUEUE) {
job_ptr->state_reason = WAIT_HELD;
xfree(job_ptr->state_desc);
}
} else if (job_ptr->state_reason == WAIT_NO_REASON &&
het_job_offset == NO_VAL) {
job_ptr->state_reason = WAIT_PRIORITY;
xfree(job_ptr->state_desc);
}
}
return top;
}
static void _merge_job_licenses(job_record_t *shrink_job_ptr,
job_record_t *expand_job_ptr)
{
xassert(shrink_job_ptr);
xassert(expand_job_ptr);
/* FIXME: do we really need to update accounting here? It
* might already happen */
if (!shrink_job_ptr->licenses) /* No licenses to add */
return;
if (!expand_job_ptr->licenses) { /* Just transfer licenses */
expand_job_ptr->licenses = shrink_job_ptr->licenses;
shrink_job_ptr->licenses = NULL;
FREE_NULL_LIST(expand_job_ptr->license_list);
expand_job_ptr->license_list = shrink_job_ptr->license_list;
shrink_job_ptr->license_list = NULL;
return;
}
/* Merge the license information into expanding job */
xstrcat(expand_job_ptr->licenses, ",");
xstrcat(expand_job_ptr->licenses, shrink_job_ptr->licenses);
xfree(shrink_job_ptr->licenses);
FREE_NULL_LIST(expand_job_ptr->license_list);
FREE_NULL_LIST(shrink_job_ptr->license_list);
license_job_merge(expand_job_ptr);
}
static void _hold_job_rec(job_record_t *job_ptr, uid_t uid)
{
int i, j;
time_t now = time(NULL);
job_ptr->direct_set_prio = 1;
job_ptr->priority = 0;
if (job_ptr->details && (job_ptr->details->begin_time < now))
job_ptr->details->begin_time = 0;
/* Update job with new begin_time. */
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
if (IS_JOB_PENDING(job_ptr))
acct_policy_remove_accrue_time(job_ptr, false);
if (job_ptr->part_ptr_list &&
job_ptr->prio_mult &&
job_ptr->prio_mult->priority_array) {
j = list_count(job_ptr->part_ptr_list);
for (i = 0; i < j; i++) {
job_ptr->prio_mult->priority_array[i] = 0;
}
}
sched_info("%s: hold on %pJ by uid %u", __func__, job_ptr, uid);
}
static int _foreach_hold_het_comp(void *x, void *arg)
{
_hold_job_rec(x, *(uid_t *) arg);
return 0;
}
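/*
 * Hold a job. If _get_whole_hetjob() indicates whole-hetjob operation and
 * this is a hetjob component, hold every component of the heterogeneous job.
 */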
static void _hold_job(job_record_t *job_ptr, uid_t uid)
{
job_record_t *het_job_leader = NULL;
if (job_ptr->het_job_id && _get_whole_hetjob())
het_job_leader = find_job_record(job_ptr->het_job_id);
if (het_job_leader && het_job_leader->het_job_list)
(void) list_for_each(het_job_leader->het_job_list,
_foreach_hold_het_comp, &uid);
else
_hold_job_rec(job_ptr, uid);
}
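/*
 * Release a held job record: recompute its priority, clear the hold state
 * reason and exit code, and resubmit any federated sibling jobs.
 */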
static void _release_job_rec(job_record_t *job_ptr, uid_t uid)
{
time_t now = time(NULL);
if (job_ptr->details && (job_ptr->details->begin_time < now))
job_ptr->details->begin_time = 0;
job_ptr->direct_set_prio = 0;
set_job_prio(job_ptr);
job_ptr->state_reason = WAIT_NO_REASON;
job_state_unset_flag(job_ptr, JOB_SPECIAL_EXIT);
xfree(job_ptr->state_desc);
job_ptr->exit_code = 0;
fed_mgr_job_requeue(job_ptr); /* submit sibling jobs */
sched_info("%s: release hold on %pJ by uid %u",
__func__, job_ptr, uid);
}
static int _foreach_release_het_comp(void *x, void *arg)
{
_release_job_rec(x, *(uid_t *) arg);
return 0;
}
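/*
 * Release a job. If _get_whole_hetjob() indicates whole-hetjob operation,
 * release every component of the heterogeneous job.
 */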
static void _release_job(job_record_t *job_ptr, uid_t uid)
{
job_record_t *het_job_leader = NULL;
if (job_ptr->het_job_id && _get_whole_hetjob())
het_job_leader = find_job_record(job_ptr->het_job_id);
if (het_job_leader && het_job_leader->het_job_list)
(void) list_for_each(het_job_leader->het_job_list,
_foreach_release_het_comp, &uid);
else
_release_job_rec(job_ptr, uid);
}
/*
 * Get a new association, giving priority to the parameters in job_desc and
 * falling back to the job_ptr ones when they are absent.
 * IN job_desc: The new job description to use for getting the assoc_ptr.
 * IN job_ptr: The original job_ptr to use when parameters are not in job_desc.
 * RET assoc_rec, the new association combining the most up-to-date
 * information from job_desc.
 */
static slurmdb_assoc_rec_t *_retrieve_new_assoc(job_desc_msg_t *job_desc,
job_record_t *job_ptr)
{
slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL;
memset(&assoc_rec, 0, sizeof(assoc_rec));
if (job_desc->partition) {
part_record_t *part_ptr = NULL;
int error_code =
_get_job_parts(job_desc, &part_ptr, NULL, NULL);
		/* We don't need the rest; we only care about part_ptr */
if (error_code != SLURM_SUCCESS) {
errno = error_code;
return NULL;
} else if (!(part_ptr->state_up & PARTITION_SUBMIT)) {
errno = ESLURM_PARTITION_NOT_AVAIL;
return NULL;
}
assoc_rec.partition = part_ptr->name;
} else if (job_ptr->part_ptr)
assoc_rec.partition = job_ptr->part_ptr->name;
if (job_desc->account)
assoc_rec.acct = job_desc->account;
else
assoc_rec.acct = job_ptr->account;
assoc_rec.uid = job_ptr->user_id;
if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce,
&assoc_ptr, false)) {
info("%s: invalid account %s for %pJ",
__func__, assoc_rec.acct, job_ptr);
errno = ESLURM_INVALID_ACCOUNT;
return NULL;
} else if (slurm_with_slurmdbd() &&
!assoc_ptr &&
!(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) &&
assoc_rec.acct) {
		/*
		 * If not enforcing associations, look up the default
		 * account and use it to avoid putting garbage in the
		 * accounting records.
		 */
assoc_rec.acct = NULL;
(void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce,
&assoc_ptr, false);
}
return assoc_ptr;
}
/* Allocate nodes to new job. Old job info will be cleared at epilog complete */
static void _realloc_nodes(job_record_t *job_ptr, bitstr_t *orig_node_bitmap)
{
bitstr_t *node_bitmap;
node_record_t *node_ptr;
xassert(job_ptr);
xassert(orig_node_bitmap);
if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap)
return;
node_bitmap = job_ptr->job_resrcs->node_bitmap;
for (int i = 0; (node_ptr = next_node_bitmap(node_bitmap, &i)); i++) {
if (bit_test(orig_node_bitmap, i))
continue;
make_node_alloc(node_ptr, job_ptr);
}
node_mgr_make_node_blocked(job_ptr, true);
}
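/*
 * Return true if SchedulerParameters contains permit_job_expansion
 * (e.g. SchedulerParameters=permit_job_expansion in slurm.conf).
 * The value is cached and refreshed whenever the configuration changes.
 */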
extern bool permit_job_expansion(void)
{
static time_t sched_update = 0;
static bool permit_job_expansion = false;
if (sched_update != slurm_conf.last_update) {
sched_update = slurm_conf.last_update;
if (xstrcasestr(slurm_conf.sched_params,
"permit_job_expansion"))
permit_job_expansion = true;
else
permit_job_expansion = false;
}
return permit_job_expansion;
}
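/*
 * Return true unless SchedulerParameters contains disable_job_shrink.
 * The value is cached and refreshed whenever the configuration changes.
 */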
extern bool permit_job_shrink(void)
{
static time_t sched_update = 0;
static bool permit_job_shrink = false;
if (sched_update != slurm_conf.last_update) {
sched_update = slurm_conf.last_update;
if (xstrcasestr(slurm_conf.sched_params, "disable_job_shrink"))
permit_job_shrink = false;
else
permit_job_shrink = true;
}
return permit_job_shrink;
}
/*
* Job expansion is not allowed for jobs that requested OR licenses.
*/
static bool _valid_license_job_expansion(job_record_t *job_ptr1,
job_record_t *job_ptr2)
{
if (xstrchr(job_ptr1->licenses, '|') ||
xstrchr(job_ptr2->licenses, '|'))
return false;
return true;
}
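/*
 * _update_job - update a job per the supplied specification
 * IN job_ptr - pointer to the job record to modify
 * IN job_desc - specification of the fields to update
 * IN uid - uid of the user issuing the update RPC
 * OUT err_msg - error message returned to the client, when set
 * RET SLURM_SUCCESS or an error code; processing stops at the first error
 */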
static int _update_job(job_record_t *job_ptr, job_desc_msg_t *job_desc,
uid_t uid, char **err_msg)
{
int error_code = SLURM_SUCCESS;
enum job_state_reason fail_reason;
bool privileged = false;
bool is_coord_oldacc = false, is_coord_newacc = false;
uint32_t save_min_nodes = 0, save_max_nodes = 0;
uint32_t save_min_cpus = 0, save_max_cpus = 0;
job_details_t *detail_ptr;
part_record_t *new_part_ptr = NULL, *use_part_ptr = NULL;
bitstr_t *exc_bitmap = NULL, *new_req_bitmap = NULL;
bitstr_t *orig_job_node_bitmap = NULL;
time_t now = time(NULL);
multi_core_data_t *mc_ptr = NULL;
bool update_accounting = false, new_req_bitmap_given = false;
acct_policy_limit_set_t acct_policy_limit_set;
uint16_t tres[slurmctld_tres_cnt];
bool acct_limit_already_exceeded;
bool tres_changed = false;
int tres_pos;
uint64_t tres_req_cnt[slurmctld_tres_cnt];
bool tres_req_cnt_set = false, valid_licenses = false;
list_t *gres_list = NULL, *license_list = NULL;
list_t *part_ptr_list = NULL;
uint32_t orig_time_limit;
bool gres_update = false;
slurmdb_assoc_rec_t *new_assoc_ptr = NULL, *use_assoc_ptr = NULL;
slurmdb_qos_rec_t *new_qos_ptr = NULL, *use_qos_ptr = NULL;
slurmctld_resv_t *new_resv_ptr = NULL;
list_t *new_resv_list = NULL;
list_t *new_qos_list = NULL;
uint32_t user_site_factor;
uint32_t new_qos_id = 0;
uint64_t mem_req;
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
assoc_mgr_lock_t assoc_mgr_read_lock = {
.assoc = READ_LOCK,
.qos = READ_LOCK,
.user = READ_LOCK,
};
/*
* Block scontrol updates of scrontab jobs.
*/
if (job_ptr->bit_flags & CRON_JOB)
return ESLURM_CANNOT_MODIFY_CRON_JOB;
privileged = validate_operator(uid);
/* Check authorization for modifying this job */
is_coord_oldacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account,
false);
is_coord_newacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_desc->account,
false);
if ((job_ptr->user_id != uid) && !privileged) {
/*
* Fail if we are not coordinators of the current account or
* if we are changing an account and we are not coordinators
* of both src and dest accounts.
*/
if (!is_coord_oldacc ||
(!is_coord_newacc && job_desc->account)) {
error("Security violation, JOB_UPDATE RPC from uid %u",
uid);
return ESLURM_USER_ID_MISSING;
}
}
if (job_desc->burst_buffer) {
		/*
		 * burst_buffer contents are validated at job submit time and
		 * data may be staged at later times. It cannot be changed
		 * except to clear the value on a completed job and purge the
		 * record in order to recover from a failure mode
		 */
if (IS_JOB_COMPLETED(job_ptr) && privileged &&
(job_desc->burst_buffer[0] == '\0')) {
xfree(job_ptr->burst_buffer);
last_job_update = now;
} else {
error_code = ESLURM_NOT_SUPPORTED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->array_inx && job_ptr->array_recs) {
int throttle;
throttle = strtoll(job_desc->array_inx, (char **) NULL, 10);
if (throttle >= 0) {
info("%s: set max_run_tasks to %d for job array %pJ",
__func__, throttle, job_ptr);
job_ptr->array_recs->max_run_tasks = throttle;
} else {
info("%s: invalid max_run_tasks of %d for job array %pJ, ignored",
__func__, throttle, job_ptr);
error_code = ESLURM_BAD_TASK_COUNT;
}
/*
* Even if the job is complete, permit changing
* ArrayTaskThrottle for other elements of the task array
*/
if (IS_JOB_FINISHED(job_ptr))
goto fini;
}
if (IS_JOB_FINISHED(job_ptr)) {
error_code = ESLURM_JOB_FINISHED;
goto fini;
}
/*
* Validate before job_submit_g_modify() so that the job_submit
* plugin can make changes to the field without triggering an auth
* issue.
*/
if (job_desc->admin_comment && !validate_super_user(uid)) {
error("Attempt to change admin_comment for %pJ", job_ptr);
error_code = ESLURM_ACCESS_DENIED;
goto fini;
}
/* Save before submit plugin potentially modifies it. */
user_site_factor = job_desc->site_factor;
if (job_desc->user_id == SLURM_AUTH_NOBODY) {
		/*
		 * Used by job_submit/lua to find the default partition and
		 * by the access control logic below to validate a partition
		 * change
		 */
job_desc->user_id = job_ptr->user_id;
}
error_code = job_submit_g_modify(job_desc, job_ptr, uid, err_msg);
if (error_code != SLURM_SUCCESS)
return error_code;
error_code = _test_job_desc_fields(job_desc);
if (error_code != SLURM_SUCCESS)
return error_code;
/* Do not update MCS label unless explicitly provided */
if (job_desc->mcs_label) {
/* Only pending jobs can be updated */
if (!IS_JOB_PENDING(job_ptr))
return ESLURM_JOB_NOT_PENDING;
/* This is an attempt to explicitly reset the value */
if (job_desc->mcs_label[0] == '\0')
xfree(job_desc->mcs_label);
if (mcs_g_set_mcs_label(job_ptr, job_desc->mcs_label)) {
if (!job_desc->mcs_label)
error("Failed to update job: No valid mcs_label found");
else
error("Failed to update job: Invalid mcs-label: %s",
job_desc->mcs_label);
return ESLURM_INVALID_MCS_LABEL;
}
}
memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set));
acct_policy_limit_set.tres = tres;
if (privileged) {
/* set up the acct_policy if we are at least an operator */
for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++)
acct_policy_limit_set.tres[tres_pos] = ADMIN_SET_LIMIT;
acct_policy_limit_set.time = ADMIN_SET_LIMIT;
acct_policy_limit_set.qos = ADMIN_SET_LIMIT;
} else
memset(tres, 0, sizeof(tres));
detail_ptr = job_ptr->details;
if (detail_ptr)
mc_ptr = detail_ptr->mc_ptr;
last_job_update = now;
	/*
	 * Check whether the newly requested job_desc exceeds any existing
	 * limit. If it passes, we will check the new association/qos/part
	 * later in the code and fail if it is wrong.
	 *
	 * If it doesn't pass, some limit was already exceeded before the
	 * update request, so let the user keep exceeding the limit if that
	 * is what they want. We do this by not exiting on the later call to
	 * acct_policy_validate() if it fails.
	 *
	 * We also prevent the update from returning a confusing error code,
	 * since many things could update successfully while we were already
	 * violating a limit. The job won't be allowed to run, but the update
	 * will be allowed to happen, which is most likely what was desired.
	 *
	 * Changes in between this check and the next acct_policy_validate()
	 * will not be constrained by accounting enforce limits.
	 */
orig_time_limit = job_desc->time_limit;
/*
* We need to figure out if we changed task cnt.
*/
_figure_out_num_tasks(job_desc, job_ptr);
memcpy(tres_req_cnt, job_ptr->tres_req_cnt, sizeof(tres_req_cnt));
job_desc->tres_req_cnt = tres_req_cnt;
tres_req_cnt_set = true;
acct_limit_already_exceeded = false;
if (!privileged && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
if (!acct_policy_validate(job_desc, job_ptr->part_ptr,
job_ptr->part_ptr_list,
job_ptr->assoc_ptr, job_ptr->qos_ptr,
NULL, &acct_policy_limit_set,
true)) {
debug("%s: already exceeded association's cpu, node, "
"memory or time limit for user %u",
__func__, job_desc->user_id);
acct_limit_already_exceeded = true;
}
job_desc->time_limit = orig_time_limit;
}
	/*
	 * The partition, assoc, qos, reservation, and req_node_bitmap all
	 * have to be set before the checks below, so store them in temporary
	 * variables here and only set them in the job much later.
	 */
if (job_desc->partition &&
!xstrcmp(job_desc->partition, job_ptr->partition)) {
sched_debug("%s: new partition identical to old partition %pJ",
__func__, job_ptr);
} else if (job_desc->partition) {
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
}
error_code = _get_job_parts(job_desc,
&new_part_ptr,
&part_ptr_list, NULL);
if (error_code != SLURM_SUCCESS)
;
else if ((new_part_ptr->state_up & PARTITION_SUBMIT) == 0)
error_code = ESLURM_PARTITION_NOT_AVAIL;
else if (!part_ptr_list &&
!xstrcmp(new_part_ptr->name, job_ptr->partition)) {
sched_debug("%s: 2 new partition identical to old partition %pJ",
__func__, job_ptr);
new_part_ptr = NULL;
}
if (error_code != SLURM_SUCCESS)
goto fini;
}
use_part_ptr = new_part_ptr ? new_part_ptr : job_ptr->part_ptr;
/* Check the account and the partition as both affect the association */
if (job_desc->account || new_part_ptr) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_JOB_NOT_PENDING;
else {
new_assoc_ptr = _retrieve_new_assoc(job_desc, job_ptr);
if (!new_assoc_ptr)
error_code = errno;
else if (new_assoc_ptr == job_ptr->assoc_ptr) {
new_assoc_ptr = NULL;
sched_debug("%s: new association identical to old association %u",
__func__, job_ptr->job_id);
}
/*
* Clear errno that may have been set by
* _retrieve_new_assoc.
*/
errno = 0;
}
if (error_code != SLURM_SUCCESS)
goto fini;
}
use_assoc_ptr = new_assoc_ptr ? new_assoc_ptr : job_ptr->assoc_ptr;
if (job_desc->qos) {
char *resv_name;
assoc_mgr_lock_t qos_read_lock = {
.qos = READ_LOCK,
};
if (job_desc->reservation
&& job_desc->reservation[0] != '\0')
resv_name = job_desc->reservation;
else
resv_name = job_ptr->resv_name;
assoc_mgr_lock(&qos_read_lock);
error_code =
_get_qos_info(job_desc->qos, 0, &new_qos_list,
&new_qos_ptr, resv_name, use_assoc_ptr,
privileged, true, LOG_LEVEL_ERROR);
if ((error_code == SLURM_SUCCESS) && new_qos_ptr) {
if (!new_qos_list &&
(job_ptr->qos_ptr == new_qos_ptr)) {
sched_debug("%s: new QOS identical to old QOS %pJ",
__func__, job_ptr);
new_qos_ptr = NULL;
} else if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_JOB_NOT_PENDING;
new_qos_ptr = NULL;
}
}
if (new_qos_ptr)
new_qos_id = new_qos_ptr->id;
assoc_mgr_unlock(&qos_read_lock);
if (error_code != SLURM_SUCCESS)
goto fini;
}
use_qos_ptr = new_qos_ptr ? new_qos_ptr : job_ptr->qos_ptr;
if (job_desc->bitflags & RESET_ACCRUE_TIME) {
if (!IS_JOB_PENDING(job_ptr) || !detail_ptr) {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
} else if (detail_ptr->accrue_time) {
uint64_t bit_flags = job_ptr->bit_flags;
acct_policy_remove_accrue_time(job_ptr, false);
			/*
			 * Set the accrue_time to 'now' since we are not
			 * removing this job, but resetting the time instead.
			 * acct_policy_remove_accrue_time() sets it to 0,
			 * which would cause the next pass through
			 * acct_policy_handle_accrue_time() to restore the
			 * original time, making it as if nothing happened
			 * here.
			 *
			 * We also reset the bit_flags to be the same as
			 * before so we don't lose JOB_ACCRUE_OVER if it was
			 * set beforehand.
			 */
job_ptr->bit_flags = bit_flags;
detail_ptr->accrue_time = now;
}
}
	/*
	 * Before taking any action on the excluded or required nodes, reset
	 * them to their original values.
	 *
	 * We will decide later if those values need updating, or even if we
	 * need to merge the negated required list into the excluded one
	 * (when -N < size of the required list).
	 */
FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
if (detail_ptr->exc_nodes) {
/* This error should never happen */
if (node_name2bitmap(detail_ptr->exc_nodes,
false, &exc_bitmap, NULL)) {
sched_info("%s: Invalid excluded nodes list in job records: %s",
__func__, detail_ptr->exc_nodes);
FREE_NULL_BITMAP(exc_bitmap);
error_code = ESLURM_INVALID_NODE_NAME;
goto fini;
}
detail_ptr->exc_node_bitmap = exc_bitmap;
exc_bitmap = NULL;
}
FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
if (detail_ptr->req_nodes) {
/* This error should never happen */
if (node_name2bitmap(detail_ptr->req_nodes,
false, &new_req_bitmap, NULL)) {
sched_info("%s: Invalid required nodes list in job records: %s",
__func__, detail_ptr->req_nodes);
FREE_NULL_BITMAP(new_req_bitmap);
error_code = ESLURM_INVALID_NODE_NAME;
goto fini;
}
detail_ptr->req_node_bitmap = new_req_bitmap;
new_req_bitmap = NULL;
}
if (job_desc->exc_nodes && detail_ptr &&
!xstrcmp(job_desc->exc_nodes, detail_ptr->exc_nodes)) {
sched_debug("%s: new exc_nodes identical to old exc_nodes %s",
__func__, job_desc->exc_nodes);
} else if (job_desc->exc_nodes) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (job_desc->exc_nodes[0] == '\0') {
xfree(detail_ptr->exc_nodes);
FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
} else {
if (node_name2bitmap(job_desc->exc_nodes, false,
&exc_bitmap, NULL)) {
sched_error("%s: Invalid node list for update of %pJ: %s",
__func__, job_ptr,
job_desc->exc_nodes);
FREE_NULL_BITMAP(exc_bitmap);
error_code = ESLURM_INVALID_NODE_NAME;
}
if (exc_bitmap) {
xfree(detail_ptr->exc_nodes);
detail_ptr->exc_nodes =
xstrdup(job_desc->exc_nodes);
FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap);
detail_ptr->exc_node_bitmap = exc_bitmap;
sched_info("%s: setting exc_nodes to %s for %pJ",
__func__, job_desc->exc_nodes, job_ptr);
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
/*
* Must check req_nodes to set the job_ptr->details->req_node_bitmap
* before we validate it later.
*/
if (job_desc->req_nodes && detail_ptr &&
!xstrcmp(job_desc->req_nodes, detail_ptr->req_nodes)) {
sched_debug("%s: new req_nodes identical to old req_nodes %s",
__func__, job_desc->req_nodes);
} else if (job_desc->req_nodes && detail_ptr &&
(detail_ptr->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY) {
sched_info("%s: Cannot update node list of %pJ. Not compatible with arbitrary distribution",
__func__, job_ptr);
error_code = ESLURM_NOT_SUPPORTED;
goto fini;
} else if (job_desc->req_nodes &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
		/*
		 * Use req_nodes to change the nodes associated with a
		 * running job, for lack of another field in the job request
		 * to use
		 */
if (!permit_job_shrink()) {
error("%s: request to shrink %pJ denied by configuration",
__func__, job_ptr);
error_code = ESLURM_NOT_SUPPORTED;
goto fini;
}
if ((job_desc->req_nodes[0] == '\0') ||
node_name2bitmap(job_desc->req_nodes, false,
&new_req_bitmap, NULL) ||
!bit_super_set(new_req_bitmap, job_ptr->node_bitmap) ||
(job_ptr->details && job_ptr->details->expanding_jobid)) {
sched_info("%s: Invalid node list (%s) for %pJ update",
__func__, job_desc->req_nodes, job_ptr);
error_code = ESLURM_INVALID_NODE_NAME;
goto fini;
}
if (new_req_bitmap) {
node_record_t *node_ptr;
bitstr_t *rem_nodes;
/*
* They requested a new list of nodes for the job. If
* the batch host isn't in this list, then deny this
* request.
*/
if (job_ptr->batch_flag) {
int batch_inx = node_name_get_inx(
job_ptr->batch_host);
if (batch_inx == -1)
error("%s: Invalid batch host %s for %pJ; this should never happen",
__func__, job_ptr->batch_host,
job_ptr);
else if (!bit_test(new_req_bitmap, batch_inx)) {
error("%s: Batch host %s for %pJ is not in the requested node list %s. You cannot remove the batch host from a job when resizing.",
__func__, job_ptr->batch_host,
job_ptr, job_desc->req_nodes);
error_code = ESLURM_INVALID_NODE_NAME;
goto fini;
}
}
sched_info("%s: setting nodes to %s for %pJ",
__func__, job_desc->req_nodes, job_ptr);
job_pre_resize_acctg(job_ptr);
rem_nodes = bit_copy(job_ptr->node_bitmap);
bit_and_not(rem_nodes, new_req_bitmap);
abort_job_on_nodes(job_ptr, rem_nodes);
orig_job_node_bitmap =
bit_copy(job_ptr->job_resrcs->node_bitmap);
for (int i = 0;
(node_ptr = next_node_bitmap(rem_nodes, &i));
i++) {
kill_step_on_node(job_ptr, node_ptr, false);
excise_node_from_job(job_ptr, node_ptr);
}
/* Resize the core bitmaps of the job's steps */
rebuild_step_bitmaps(job_ptr, orig_job_node_bitmap);
FREE_NULL_BITMAP(orig_job_node_bitmap);
FREE_NULL_BITMAP(rem_nodes);
(void) gs_job_start(job_ptr);
gres_stepmgr_job_build_details(
job_ptr->gres_list_alloc,
job_ptr->nodes,
&job_ptr->gres_detail_cnt,
&job_ptr->gres_detail_str,
&job_ptr->gres_used);
job_post_resize_acctg(job_ptr);
/*
* Since job_post_resize_acctg will restart
* things, don't do it again.
*/
update_accounting = false;
} else {
update_accounting = true;
}
FREE_NULL_BITMAP(new_req_bitmap);
} else if (job_desc->req_nodes) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (job_desc->req_nodes[0] == '\0')
new_req_bitmap_given = true;
else {
if (node_name2bitmap(job_desc->req_nodes, false,
&new_req_bitmap, NULL)) {
sched_info("%s: Invalid node list for job_update: %s",
__func__, job_desc->req_nodes);
FREE_NULL_BITMAP(new_req_bitmap);
error_code = ESLURM_INVALID_NODE_NAME;
} else
new_req_bitmap_given = true;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
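	/* Commit the new required node list for a pending job */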
if (new_req_bitmap_given) {
xfree(detail_ptr->req_nodes);
if (job_desc->req_nodes[0] != '\0')
detail_ptr->req_nodes = xstrdup(job_desc->req_nodes);
FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
detail_ptr->req_node_bitmap = new_req_bitmap;
new_req_bitmap = NULL;
sched_info("%s: setting req_nodes to %s for %pJ",
__func__, job_desc->req_nodes, job_ptr);
}
/* this needs to be after partition and QOS checks */
if (job_desc->reservation
&& (!xstrcmp(job_desc->reservation, job_ptr->resv_name) ||
(!job_ptr->resv_name && job_desc->reservation[0] == '\0'))) {
sched_debug("%s: new reservation identical to old reservation %pJ",
__func__, job_ptr);
} else if (job_desc->reservation) {
if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr)) {
error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING;
} else {
job_record_t tmp_job_rec;
memcpy(&tmp_job_rec, job_ptr, sizeof(job_record_t));
tmp_job_rec.resv_name = xstrdup(job_desc->reservation);
tmp_job_rec.resv_ptr = NULL;
tmp_job_rec.resv_list = NULL;
tmp_job_rec.part_ptr = use_part_ptr;
tmp_job_rec.qos_ptr = use_qos_ptr;
tmp_job_rec.assoc_ptr = use_assoc_ptr;
error_code = validate_job_resv(&tmp_job_rec);
			/*
			 * It doesn't matter what this is; just set it, since
			 * on failure it will be NULL.
			 */
new_resv_ptr = tmp_job_rec.resv_ptr;
new_resv_list = tmp_job_rec.resv_list;
/*
* Make sure this job isn't using a partition or QOS
* that requires it to be in a reservation.
*/
if ((error_code == SLURM_SUCCESS) && !new_resv_ptr) {
if (use_part_ptr
&& use_part_ptr->flags & PART_FLAG_REQ_RESV)
error_code = ESLURM_ACCESS_DENIED;
if (use_qos_ptr
&& use_qos_ptr->flags & QOS_FLAG_REQ_RESV)
error_code = ESLURM_INVALID_QOS;
}
if (job_ptr->state_reason == WAIT_RESV_INVALID)
_release_job(job_ptr, uid);
xfree(tmp_job_rec.resv_name);
}
if (error_code != SLURM_SUCCESS)
goto fini;
}
if (job_desc->cpus_per_tres || job_desc->tres_per_job ||
job_desc->tres_per_node || job_desc->tres_per_socket ||
job_desc->tres_per_task || job_desc->mem_per_tres ||
(job_desc->bitflags & TASKS_CHANGED))
gres_update = true;
if (gres_update) {
uint16_t orig_ntasks_per_socket = NO_VAL16;
gres_job_state_validate_t gres_js_val = {
.cpus_per_task = &job_desc->cpus_per_task,
.max_nodes = &job_desc->max_nodes,
.min_cpus = &job_desc->min_cpus,
.min_nodes = &job_desc->min_nodes,
.ntasks_per_node = &job_desc->ntasks_per_node,
.ntasks_per_socket = &job_desc->ntasks_per_socket,
.ntasks_per_tres = &job_desc->ntasks_per_tres,
.num_tasks = &job_desc->num_tasks,
.sockets_per_node = &job_desc->sockets_per_node,
.gres_list = &gres_list,
};
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) ||
(detail_ptr->expanding_jobid != 0)) {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
}
if (!job_desc->cpus_per_tres)
job_desc->cpus_per_tres =
xstrdup(job_ptr->cpus_per_tres);
if (!job_desc->tres_freq)
job_desc->tres_freq = xstrdup(job_ptr->tres_freq);
if (!job_desc->tres_per_job)
job_desc->tres_per_job = xstrdup(job_ptr->tres_per_job);
if (!job_desc->tres_per_node)
job_desc->tres_per_node =
xstrdup(job_ptr->tres_per_node);
if (!job_desc->tres_per_socket)
job_desc->tres_per_socket =
xstrdup(job_ptr->tres_per_socket);
if (!job_desc->tres_per_task)
job_desc->tres_per_task =
xstrdup(job_ptr->tres_per_task);
if (!job_desc->mem_per_tres)
job_desc->mem_per_tres = xstrdup(job_ptr->mem_per_tres);
if (job_desc->num_tasks == NO_VAL)
job_desc->num_tasks = detail_ptr->num_tasks;
if (job_desc->min_cpus == NO_VAL)
job_desc->min_cpus = 0; /* min_cpus could decrease */
if (job_desc->min_nodes == NO_VAL)
job_desc->min_nodes = detail_ptr->min_nodes;
if (job_desc->max_nodes == NO_VAL)
job_desc->max_nodes = detail_ptr->max_nodes;
if (job_desc->ntasks_per_node == NO_VAL16)
job_desc->ntasks_per_node = detail_ptr->ntasks_per_node;
if ((job_desc->ntasks_per_socket == NO_VAL16) &&
(detail_ptr->mc_ptr) &&
(detail_ptr->mc_ptr->ntasks_per_socket != INFINITE16)) {
job_desc->ntasks_per_socket =
mc_ptr->ntasks_per_socket;
orig_ntasks_per_socket = job_desc->ntasks_per_socket;
}
if (job_desc->sockets_per_node == NO_VAL16)
job_desc->sockets_per_node =
detail_ptr->mc_ptr->sockets_per_node;
if (job_desc->cpus_per_task == NO_VAL16)
job_desc->cpus_per_task =
detail_ptr->orig_cpus_per_task;
if (!job_desc->ntasks_per_tres)
job_desc->ntasks_per_tres = detail_ptr->ntasks_per_tres;
gres_js_val.cpus_per_tres = job_desc->cpus_per_tres;
gres_js_val.mem_per_tres = job_desc->mem_per_tres;
gres_js_val.tres_freq = job_desc->tres_freq;
gres_js_val.tres_per_job = job_desc->tres_per_job;
gres_js_val.tres_per_node = job_desc->tres_per_node;
gres_js_val.tres_per_socket = job_desc->tres_per_socket;
gres_js_val.tres_per_task = job_desc->tres_per_task;
if ((error_code = gres_job_state_validate(&gres_js_val))) {
sched_info("%s: invalid GRES for %pJ",
__func__, job_ptr);
goto fini;
}
if (job_desc->num_tasks == detail_ptr->num_tasks)
job_desc->num_tasks = NO_VAL; /* Unchanged */
if ((job_desc->min_cpus == detail_ptr->min_cpus) ||
(job_desc->min_cpus == 0)) /* Unchanged */
job_desc->min_cpus = NO_VAL;
if (job_desc->min_nodes == detail_ptr->min_nodes)
job_desc->min_nodes = NO_VAL; /* Unchanged */
if (job_desc->max_nodes == detail_ptr->max_nodes)
job_desc->max_nodes = NO_VAL; /* Unchanged */
if (job_desc->ntasks_per_node == detail_ptr->ntasks_per_node)
job_desc->ntasks_per_node = NO_VAL16; /* Unchanged */
if (job_desc->ntasks_per_socket == orig_ntasks_per_socket)
job_desc->ntasks_per_socket = NO_VAL16; /* Unchanged */
if (job_desc->sockets_per_node ==
detail_ptr->mc_ptr->sockets_per_node)
job_desc->sockets_per_node = NO_VAL16;
if (job_desc->cpus_per_task == detail_ptr->cpus_per_task)
job_desc->cpus_per_task = NO_VAL16; /* Unchanged */
if (job_desc->ntasks_per_tres == detail_ptr->ntasks_per_tres)
job_desc->ntasks_per_tres = 0;
if (!xstrcmp(job_desc->cpus_per_tres, job_ptr->cpus_per_tres))
xfree(job_desc->cpus_per_tres);
if (!xstrcmp(job_desc->tres_freq, job_ptr->tres_freq))
xfree(job_desc->tres_freq);
if (!xstrcmp(job_desc->tres_per_job, job_ptr->tres_per_job))
xfree(job_desc->tres_per_job);
if (!xstrcmp(job_desc->tres_per_node, job_ptr->tres_per_node))
xfree(job_desc->tres_per_node);
if (!xstrcmp(job_desc->tres_per_socket,
job_ptr->tres_per_socket))
xfree(job_desc->tres_per_socket);
if (!xstrcmp(job_desc->tres_per_task, job_ptr->tres_per_task))
xfree(job_desc->tres_per_task);
if (!xstrcmp(job_desc->mem_per_tres, job_ptr->mem_per_tres))
xfree(job_desc->mem_per_tres);
}
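	/*
	 * With an explicit node count, keep the task and cpu counts
	 * consistent: at least one task per node, and enough cpus to cover
	 * both tasks * cpus_per_task and pn_min_cpus * nodes.
	 */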
if ((job_desc->min_nodes != NO_VAL) &&
(job_desc->min_nodes != INFINITE)) {
uint32_t min_cpus = (job_desc->pn_min_cpus != NO_VAL16 ?
job_desc->pn_min_cpus : detail_ptr->pn_min_cpus) *
job_desc->min_nodes;
uint32_t num_cpus = job_desc->min_cpus != NO_VAL ?
job_desc->min_cpus :
IS_JOB_PENDING(job_ptr) ?
job_ptr->tres_req_cnt[TRES_ARRAY_CPU] :
job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU];
uint32_t num_tasks = job_desc->num_tasks != NO_VAL ?
job_desc->num_tasks : detail_ptr->num_tasks;
if (!num_tasks) {
num_tasks = job_desc->min_nodes;
} else if (num_tasks < job_desc->min_nodes) {
info("%s: adjusting num_tasks (prev: %u) to be at least min_nodes: %u",
__func__, num_tasks, job_desc->min_nodes);
num_tasks = job_desc->min_nodes;
if (IS_JOB_PENDING(job_ptr))
job_desc->num_tasks = num_tasks;
}
num_tasks *= job_desc->cpus_per_task != NO_VAL16 ?
job_desc->cpus_per_task : detail_ptr->cpus_per_task;
num_tasks = MAX(num_tasks, min_cpus);
if (num_tasks > num_cpus) {
info("%s: adjusting min_cpus (prev: %u) to be at least : %u",
__func__, num_cpus, num_tasks);
job_desc->min_cpus = num_tasks;
job_desc->pn_min_memory =
job_desc->pn_min_memory != NO_VAL64 ?
job_desc->pn_min_memory :
detail_ptr->pn_min_memory;
}
assoc_mgr_lock(&locks);
if (!job_desc->licenses) {
license_set_job_tres_cnt(job_ptr->license_list,
job_desc->tres_req_cnt,
true);
}
assoc_mgr_unlock(&locks);
job_desc->tres_req_cnt[TRES_ARRAY_NODE] = job_desc->min_nodes;
}
if (job_desc->min_cpus != NO_VAL)
job_desc->tres_req_cnt[TRES_ARRAY_CPU] = job_desc->min_cpus;
else if ((job_desc->pn_min_cpus != NO_VAL16) &&
(job_desc->pn_min_cpus != 0)) {
job_desc->tres_req_cnt[TRES_ARRAY_CPU] =
job_desc->pn_min_cpus *
(job_desc->min_nodes != NO_VAL ?
job_desc->min_nodes :
detail_ptr ? detail_ptr->min_nodes : 1);
job_desc->min_cpus = job_desc->tres_req_cnt[TRES_ARRAY_CPU];
} else if (job_desc->bitflags & TASKS_CHANGED) {
job_desc->tres_req_cnt[TRES_ARRAY_CPU] = job_desc->min_cpus =
job_desc->num_tasks;
}
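	/*
	 * Recompute the memory request from the (possibly updated) cpu and
	 * node counts, the partition, and the GRES request.
	 */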
mem_req =
job_get_tres_mem(NULL,
job_desc->pn_min_memory,
job_desc->tres_req_cnt[TRES_ARRAY_CPU] ?
job_desc->tres_req_cnt[TRES_ARRAY_CPU] :
job_ptr->tres_req_cnt[TRES_ARRAY_CPU],
job_desc->min_nodes != NO_VAL ?
job_desc->min_nodes :
detail_ptr ? detail_ptr->min_nodes : 1,
use_part_ptr,
gres_list ? gres_list : job_ptr->gres_list_req,
(job_desc->pn_min_memory != NO_VAL64),
job_desc->sockets_per_node,
job_desc->num_tasks);
if (mem_req)
job_desc->tres_req_cnt[TRES_ARRAY_MEM] = mem_req;
if (gres_update) {
gres_stepmgr_set_job_tres_cnt(
gres_list,
job_desc->tres_req_cnt[TRES_ARRAY_NODE],
job_desc->tres_req_cnt, false);
}
/* Check if we are clearing licenses */
if (job_desc->licenses && !job_desc->licenses[0])
job_desc->bitflags |= RESET_LIC_JOB;
if (job_desc->tres_per_task &&
!xstrcasestr(job_desc->tres_per_task, "license/"))
job_desc->bitflags |= RESET_LIC_TASK;
_set_tot_license_req(job_desc, job_ptr);
if (job_desc->licenses_tot && !xstrcmp(job_desc->licenses_tot,
job_ptr->licenses)) {
sched_debug("%s: new licenses identical to old licenses \"%s\"",
__func__, job_ptr->licenses);
} else if (job_desc->licenses_tot) {
bool pending = IS_JOB_PENDING(job_ptr);
license_list =
license_validate(job_desc->licenses_tot, true, true,
false,
pending ? job_desc->tres_req_cnt :
NULL,
&valid_licenses);
if (!valid_licenses) {
sched_info("%s: invalid licenses: %s",
__func__, job_desc->licenses_tot);
error_code = ESLURM_INVALID_LICENSES;
} else if (!license_list)
xfree(job_desc->licenses_tot);
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->min_nodes == INFINITE) {
/* Used by scontrol just to get current configuration info */
job_desc->min_nodes = NO_VAL;
}
if ((job_desc->min_nodes != NO_VAL) &&
(job_desc->min_nodes > job_ptr->node_cnt) &&
!permit_job_expansion() &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
info("%s: Change of size for %pJ not supported", __func__,
job_ptr);
error_code = ESLURM_NOT_SUPPORTED;
goto fini;
}
if (job_desc->req_switch != NO_VAL) {
job_ptr->req_switch = job_desc->req_switch;
info("%s: Change of switches to %u %pJ",
__func__, job_desc->req_switch, job_ptr);
}
if (job_desc->wait4switch != NO_VAL) {
job_ptr->wait4switch = _max_switch_wait(job_desc->wait4switch);
info("%s: Change of switch wait to %u secs %pJ",
__func__, job_ptr->wait4switch, job_ptr);
}
if (job_desc->admin_comment) {
if (!validate_super_user(uid)) {
error("%s: Attempt to change admin_comment for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
} else {
xfree(job_ptr->admin_comment);
job_ptr->admin_comment =
xstrdup(job_desc->admin_comment);
info("%s: setting admin_comment to %s for %pJ",
__func__, job_ptr->admin_comment, job_ptr);
}
}
if (job_desc->comment) {
xfree(job_ptr->comment);
job_ptr->comment = xstrdup(job_desc->comment);
info("%s: setting comment to %s for %pJ",
__func__, job_ptr->comment, job_ptr);
}
if (job_desc->extra) {
elem_t *head = NULL;
error_code = extra_constraints_parse(job_desc->extra, &head);
if (error_code != SLURM_SUCCESS) {
error("%s: Invalid extra constraints", __func__);
} else {
xfree(job_ptr->extra);
job_ptr->extra = xstrdup(job_desc->extra);
FREE_NULL_EXTRA_CONSTRAINTS(job_ptr->extra_constraints);
job_ptr->extra_constraints = head;
info("%s: setting extra to %s for %pJ",
__func__, job_ptr->extra, job_ptr);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
	/*
	 * Now that we know what the new part, qos, and association are going
	 * to be, let's check the limits.
	 * If a limit was already exceeded before this update request, assume
	 * it is expected and allow the change to happen.
	 */
if (new_qos_ptr || new_assoc_ptr || new_part_ptr) {
list_t *use_part_list = new_part_ptr ?
part_ptr_list : job_ptr->part_ptr_list;
assoc_mgr_lock(&assoc_mgr_read_lock);
if ((error_code = _check_for_part_assocs(
use_part_list, use_assoc_ptr)) != SLURM_SUCCESS) {
assoc_mgr_unlock(&assoc_mgr_read_lock);
goto fini;
}
assoc_mgr_unlock(&assoc_mgr_read_lock);
if (!privileged &&
(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) {
uint32_t acct_reason = 0;
char *resv_orig = NULL;
bool resv_reset = false, min_reset = false,
max_reset = false,
time_min_reset = false;
if (!acct_policy_validate(job_desc, use_part_ptr,
use_part_list,
use_assoc_ptr, use_qos_ptr,
&acct_reason,
&acct_policy_limit_set,
true)
&& !acct_limit_already_exceeded) {
info("%s: exceeded association/QOS limit for user %u: %s",
__func__, job_desc->user_id,
job_state_reason_string(acct_reason));
error_code = ESLURM_ACCOUNTING_POLICY;
goto fini;
}
/*
* We need to set the various parts of job_desc below
* to something since _valid_job_part() will validate
* them. Note the reservation part is validated in the
* sub call to _part_access_check().
*/
if (job_desc->min_nodes == NO_VAL) {
job_desc->min_nodes = detail_ptr->min_nodes;
min_reset = true;
}
if ((job_desc->max_nodes == NO_VAL) &&
(detail_ptr->max_nodes != 0)) {
job_desc->max_nodes = detail_ptr->max_nodes;
max_reset = true;
}
if ((job_desc->time_min == NO_VAL) &&
(job_ptr->time_min != 0)) {
job_desc->time_min = job_ptr->time_min;
time_min_reset = true;
}
/*
* This always gets reset, so don't worry about tracking
* it.
*/
if (job_desc->time_limit == NO_VAL)
job_desc->time_limit = job_ptr->time_limit;
if (!job_desc->reservation
|| job_desc->reservation[0] == '\0') {
resv_reset = true;
resv_orig = job_desc->reservation;
job_desc->reservation = job_ptr->resv_name;
}
assoc_mgr_lock(&assoc_mgr_read_lock);
if ((error_code = _valid_job_part(
job_desc, uid,
new_req_bitmap_given ?
new_req_bitmap :
job_ptr->details->req_node_bitmap,
use_part_ptr,
new_part_ptr ?
part_ptr_list : job_ptr->part_ptr_list,
use_assoc_ptr, use_qos_ptr, NULL))) {
assoc_mgr_unlock(&assoc_mgr_read_lock);
goto fini;
}
assoc_mgr_unlock(&assoc_mgr_read_lock);
if (min_reset)
job_desc->min_nodes = NO_VAL;
if (max_reset)
job_desc->max_nodes = NO_VAL;
if (time_min_reset)
job_desc->time_min = NO_VAL;
if (resv_reset)
job_desc->reservation = resv_orig;
job_desc->time_limit = orig_time_limit;
}
		/*
		 * Since we have been successful to this point, remove the job
		 * from the old qos/assoc's
		 */
acct_policy_remove_job_submit(job_ptr, false);
acct_policy_remove_accrue_time(job_ptr, false);
}
if (new_qos_ptr) {
/* Change QOS */
job_ptr->qos_id = new_qos_id;
job_ptr->qos_ptr = new_qos_ptr;
FREE_NULL_LIST(job_ptr->qos_list);
job_ptr->qos_list = new_qos_list;
new_qos_list = NULL;
xfree(detail_ptr->qos_req);
detail_ptr->qos_req = job_desc->qos;
job_desc->qos = NULL;
job_ptr->limit_set.qos = acct_policy_limit_set.qos;
if (job_ptr->state_reason == FAIL_QOS) {
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
}
info("%s: setting QOS to %s for %pJ",
__func__, detail_ptr->qos_req, job_ptr);
}
if (new_assoc_ptr) {
/* Change account/association */
xfree(job_ptr->account);
job_ptr->account = xstrdup(new_assoc_ptr->acct);
job_ptr->assoc_id = new_assoc_ptr->id;
job_ptr->assoc_ptr = new_assoc_ptr;
if (job_ptr->state_reason == FAIL_ACCOUNT) {
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
}
info("%s: setting account to %s for %pJ",
__func__, job_ptr->account, job_ptr);
}
if (new_part_ptr) {
/* Change partition */
job_ptr->part_ptr = new_part_ptr;
job_ptr->bit_flags &= ~JOB_PART_ASSIGNED;
FREE_NULL_LIST(job_ptr->part_ptr_list);
job_ptr->part_ptr_list = part_ptr_list;
part_ptr_list = NULL; /* nothing to free */
rebuild_job_part_list(job_ptr);
/* Rebuilt in priority/multifactor plugin */
if (job_ptr->prio_mult)
xfree(job_ptr->prio_mult->priority_array);
info("%s: setting partition to %s for %pJ",
__func__, job_desc->partition, job_ptr);
}
/* Now add the job to the new qos/assoc's */
if (new_qos_ptr || new_assoc_ptr || new_part_ptr) {
update_accounting = true;
acct_policy_add_job_submit(job_ptr, false);
}
if (new_resv_ptr) {
FREE_NULL_LIST(job_ptr->resv_list);
xfree(job_ptr->resv_name);
job_ptr->resv_name = xstrdup(job_desc->reservation);
xfree(job_ptr->details->resv_req);
job_ptr->details->resv_req = xstrdup(job_desc->reservation);
job_ptr->resv_list = new_resv_list;
job_ptr->resv_id = new_resv_ptr->resv_id;
job_ptr->resv_ptr = new_resv_ptr;
sched_info("%s: setting reservation to %s for %pJ", __func__,
job_ptr->resv_name, job_ptr);
update_accounting = true;
} else if (job_desc->reservation &&
job_desc->reservation[0] == '\0' &&
job_ptr->resv_name) {
FREE_NULL_LIST(job_ptr->resv_list);
xfree(job_ptr->resv_name);
job_ptr->resv_id = 0;
job_ptr->resv_ptr = NULL;
sched_info("%s: setting reservation to '' for %pJ",
__func__, job_ptr);
update_accounting = true;
}
/* Reset min and max cpu counts as needed, ensure consistency */
if (job_desc->min_cpus != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (job_desc->min_cpus < 1)
error_code = ESLURM_INVALID_CPU_COUNT;
else {
save_min_cpus = detail_ptr->min_cpus;
detail_ptr->min_cpus = job_desc->min_cpus;
}
}
if (job_desc->max_cpus != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else {
save_max_cpus = detail_ptr->max_cpus;
detail_ptr->max_cpus = job_desc->max_cpus;
}
}
if ((save_min_cpus || save_max_cpus) && detail_ptr->max_cpus &&
(detail_ptr->max_cpus < detail_ptr->min_cpus)) {
error_code = ESLURM_INVALID_CPU_COUNT;
if (save_min_cpus) {
detail_ptr->min_cpus = save_min_cpus;
save_min_cpus = 0;
}
if (save_max_cpus) {
detail_ptr->max_cpus = save_max_cpus;
save_max_cpus = 0;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (save_min_cpus && (detail_ptr->min_cpus != save_min_cpus)) {
info("%s: setting min_cpus from %u to %u for %pJ",
__func__, save_min_cpus, detail_ptr->min_cpus, job_ptr);
job_ptr->limit_set.tres[TRES_ARRAY_CPU] =
acct_policy_limit_set.tres[TRES_ARRAY_CPU];
detail_ptr->orig_min_cpus = job_desc->min_cpus;
update_accounting = true;
}
if (save_max_cpus && (detail_ptr->max_cpus != save_max_cpus)) {
info("%s: setting max_cpus from %u to %u for %pJ",
__func__, save_max_cpus, detail_ptr->max_cpus, job_ptr);
		/*
		 * Always use the acct_policy_limit_set.* since, if set by a
		 * super user, it will be set correctly
		 */
job_ptr->limit_set.tres[TRES_ARRAY_CPU] =
acct_policy_limit_set.tres[TRES_ARRAY_CPU];
detail_ptr->orig_max_cpus = job_desc->max_cpus;
update_accounting = true;
}
if ((job_desc->pn_min_cpus != NO_VAL16) &&
(job_desc->pn_min_cpus != 0)) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
} else {
detail_ptr->pn_min_cpus = job_desc->pn_min_cpus;
detail_ptr->orig_pn_min_cpus = job_desc->pn_min_cpus;
info("%s: setting pn_min_cpus to %u for %pJ",
__func__, job_desc->pn_min_cpus, job_ptr);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->cpus_per_task != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
} else if (job_desc->cpus_per_task == 0) {
error("%s: trying to set cpus_per_task to an erroneous value: %u",
__func__, job_desc->cpus_per_task);
error_code = ESLURM_INVALID_CPU_COUNT;
} else if (detail_ptr->cpus_per_task !=
job_desc->cpus_per_task) {
info("%s: setting cpus_per_task from %u to %u for %pJ",
__func__, detail_ptr->cpus_per_task,
job_desc->cpus_per_task, job_ptr);
detail_ptr->cpus_per_task = job_desc->cpus_per_task;
detail_ptr->orig_cpus_per_task =
job_desc->cpus_per_task;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
/* Reset min and max node counts as needed, ensure consistency */
if (job_desc->min_nodes != NO_VAL) {
if (job_ptr->details &&
(job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY) {
info("%s: Cannot update node count of %pJ. Not compatible with arbitrary distribution",
__func__, job_ptr);
error_code = ESLURM_NOT_SUPPORTED;
} else if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
; /* shrink running job, processed later */
else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (job_desc->min_nodes < 1) {
info("%s: min_nodes < 1 for %pJ", __func__, job_ptr);
error_code = ESLURM_INVALID_NODE_COUNT;
} else {
/* Resize of pending job */
save_min_nodes = detail_ptr->min_nodes;
detail_ptr->min_nodes = job_desc->min_nodes;
}
}
if (job_desc->max_nodes != NO_VAL) {
if ((IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) &&
(job_desc->max_nodes == job_desc->min_nodes))
; /* shrink running job, processed later */
else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else {
save_max_nodes = detail_ptr->max_nodes;
detail_ptr->max_nodes = job_desc->max_nodes;
}
}
if ((save_min_nodes || save_max_nodes) && detail_ptr->max_nodes &&
(detail_ptr->max_nodes < detail_ptr->min_nodes)) {
info("%s: max_nodes < min_nodes (%u < %u) for %pJ", __func__,
detail_ptr->max_nodes, detail_ptr->min_nodes,
job_ptr);
error_code = ESLURM_INVALID_NODE_COUNT;
if (save_min_nodes) {
detail_ptr->min_nodes = save_min_nodes;
save_min_nodes = 0;
}
if (save_max_nodes) {
detail_ptr->max_nodes = save_max_nodes;
save_max_nodes = 0;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
	if (save_min_nodes && (save_min_nodes != detail_ptr->min_nodes)) {
info("%s: setting min_nodes from %u to %u for %pJ", __func__,
save_min_nodes, detail_ptr->min_nodes, job_ptr);
job_ptr->limit_set.tres[TRES_ARRAY_NODE] =
acct_policy_limit_set.tres[TRES_ARRAY_NODE];
update_accounting = true;
FREE_NULL_BITMAP(detail_ptr->job_size_bitmap);
}
if (save_max_nodes && (save_max_nodes != detail_ptr->max_nodes)) {
info("%s: setting max_nodes from %u to %u for %pJ", __func__,
save_max_nodes, detail_ptr->max_nodes, job_ptr);
		/*
		 * Always use the acct_policy_limit_set.* since, if set by a
		 * super user, it will be set correctly
		 */
job_ptr->limit_set.tres[TRES_ARRAY_NODE] =
acct_policy_limit_set.tres[TRES_ARRAY_NODE];
update_accounting = true;
FREE_NULL_BITMAP(detail_ptr->job_size_bitmap);
}
if (job_desc->job_size_str) {
if ((!IS_JOB_PENDING(job_ptr)) || !detail_ptr)
error_code = ESLURM_JOB_NOT_PENDING;
else if (detail_ptr->min_nodes && detail_ptr->max_nodes &&
(detail_ptr->max_nodes != NO_VAL) &&
(detail_ptr->max_nodes < MAX_JOB_SIZE_BITMAP)) {
bitstr_t *new_size_bitmap;
new_size_bitmap = bit_alloc(detail_ptr->max_nodes + 1);
if (bit_unfmt(new_size_bitmap,
job_desc->job_size_str)) {
FREE_NULL_BITMAP(new_size_bitmap);
info("%s: %pJ: invalid job_size_str:%s",
__func__, job_ptr, job_desc->job_size_str);
error_code = ESLURM_INVALID_NODE_COUNT;
} else {
FREE_NULL_BITMAP(detail_ptr->job_size_bitmap);
detail_ptr->job_size_bitmap = new_size_bitmap;
}
} else {
info("%s: %pJ: invalid job_size_str:%s", __func__,
job_ptr, job_desc->job_size_str);
error_code = ESLURM_INVALID_NODE_COUNT;
}
} else {
error_code = _unroll_min_max_node(job_ptr);
}
if (error_code != SLURM_SUCCESS)
goto fini;
if ((job_desc->num_tasks != NO_VAL) &&
(job_desc->bitflags & TASKS_CHANGED)) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_JOB_NOT_PENDING;
else if (job_desc->num_tasks < 1)
error_code = ESLURM_BAD_TASK_COUNT;
else {
detail_ptr->num_tasks = job_desc->num_tasks;
			/*
			 * Once ntasks has actually been requested, the job
			 * will get SLURM_NTASKS in its environment. There is
			 * no way to remove that.
			 */
if (job_desc->bitflags & JOB_NTASKS_SET)
job_ptr->bit_flags |= JOB_NTASKS_SET;
info("%s: setting num_tasks to %u for %pJ",
__func__, job_desc->num_tasks, job_ptr);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
	/*
	 * If the job record now holds a required nodelist with more nodes
	 * than are required, translate this list into an exclusion of all
	 * nodes except those requested.
	 *
	 * Merge the resulting negated version into the excluded nodelist of
	 * the job.
	 */
if (detail_ptr->req_node_bitmap &&
(bit_set_count(detail_ptr->req_node_bitmap) >
detail_ptr->min_nodes)) {
if (!detail_ptr->exc_node_bitmap)
detail_ptr->exc_node_bitmap =
bit_alloc(node_record_count);
bit_or_not(detail_ptr->exc_node_bitmap,
detail_ptr->req_node_bitmap);
FREE_NULL_BITMAP(detail_ptr->req_node_bitmap);
}
if (job_desc->time_limit != NO_VAL) {
if (IS_JOB_FINISHED(job_ptr) || job_ptr->preempt_time)
error_code = ESLURM_JOB_FINISHED;
else if (job_ptr->time_limit == job_desc->time_limit) {
sched_debug("%s: new time limit identical to old time limit %pJ",
__func__, job_ptr);
} else if (privileged ||
(job_ptr->time_limit > job_desc->time_limit)) {
time_t old_time = job_ptr->time_limit;
uint32_t use_time_min = job_desc->time_min != NO_VAL ?
job_desc->time_min : job_ptr->time_min;
if (old_time == INFINITE) /* one year in mins */
old_time = (365 * 24 * 60);
if (job_desc->time_limit < use_time_min) {
sched_info("%s: attempt to set time_limit < time_min (%u < %u)",
__func__,
job_desc->time_limit,
use_time_min);
error_code = ESLURM_INVALID_TIME_MIN_LIMIT;
goto fini;
}
acct_policy_alter_job(job_ptr, job_desc->time_limit);
job_ptr->time_limit = job_desc->time_limit;
if (IS_JOB_RUNNING(job_ptr) ||
IS_JOB_SUSPENDED(job_ptr)) {
if (job_ptr->preempt_time) {
; /* Preemption in progress */
} else if (job_ptr->time_limit == INFINITE) {
/* Set end time in one year */
job_ptr->end_time = now +
(365 * 24 * 60 * 60);
} else {
/*
* Update end_time based upon change
* to preserve suspend time info
*/
job_ptr->end_time = job_ptr->end_time +
((job_ptr->time_limit -
old_time) * 60);
}
if (job_ptr->end_time < now)
job_ptr->end_time = now;
job_ptr->end_time_exp = job_ptr->end_time;
}
sched_info("%s: setting time_limit to %u for %pJ",
__func__, job_desc->time_limit, job_ptr);
			/*
			 * Always use the acct_policy_limit_set.* since,
			 * if set by a super user, it will be set correctly
			 */
job_ptr->limit_set.time = acct_policy_limit_set.time;
update_accounting = true;
} else if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr &&
(job_ptr->part_ptr->max_time >=
job_desc->time_limit)) {
job_ptr->time_limit = job_desc->time_limit;
sched_info("%s: setting time_limit to %u for %pJ",
__func__, job_desc->time_limit, job_ptr);
			/*
			 * Always use the acct_policy_limit_set.* since,
			 * if set by a super user, it will be set correctly
			 */
job_ptr->limit_set.time = acct_policy_limit_set.time;
update_accounting = true;
} else {
sched_info("%s: Attempt to increase time limit for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if ((job_desc->time_min != NO_VAL) && IS_JOB_PENDING(job_ptr)) {
if (job_desc->time_min > job_ptr->time_limit) {
info("%s: attempt to set TimeMin > TimeLimit (%u > %u)",
__func__, job_desc->time_min, job_ptr->time_limit);
error_code = ESLURM_INVALID_TIME_MIN_LIMIT;
} else if (job_ptr->time_min != job_desc->time_min) {
job_ptr->time_min = job_desc->time_min;
info("%s: setting TimeMin to %u for %pJ",
__func__, job_desc->time_min, job_ptr);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->end_time) {
if (!IS_JOB_RUNNING(job_ptr) || job_ptr->preempt_time) {
/*
* We may want to use this for deadline scheduling
* at some point in the future. For now only reset
* the time limit of running jobs.
*/
error_code = ESLURM_JOB_NOT_RUNNING;
} else if (job_desc->end_time < now) {
error_code = ESLURM_INVALID_TIME_VALUE;
} else if (privileged ||
(job_ptr->end_time > job_desc->end_time)) {
int delta_t = job_desc->end_time - job_ptr->end_time;
job_ptr->end_time = job_desc->end_time;
job_ptr->time_limit += (delta_t+30)/60; /* Sec->min */
sched_info("%s: setting time_limit to %u for %pJ",
__func__, job_ptr->time_limit, job_ptr);
			/* Always use the acct_policy_limit_set.* since,
			 * if set by a super user, it will be set correctly */
job_ptr->limit_set.time = acct_policy_limit_set.time;
update_accounting = true;
} else {
sched_info("%s: Attempt to extend end time for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
}
if ((job_desc->deadline) && (!IS_JOB_RUNNING(job_ptr))) {
char time_str[256];
slurm_make_time_str(&job_ptr->deadline, time_str,
sizeof(time_str));
if (job_desc->deadline < now) {
error_code = ESLURM_INVALID_TIME_VALUE;
} else if (privileged) {
/* update deadline */
job_ptr->deadline = job_desc->deadline;
sched_info("%s: setting deadline to %s for %pJ",
__func__, time_str, job_ptr);
			/*
			 * Always use the acct_policy_limit_set.* since,
			 * if set by a super user, it will be set correctly
			 */
job_ptr->limit_set.time = acct_policy_limit_set.time;
update_accounting = true;
} else {
sched_info("%s: Attempt to extend end time for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->delay_boot != NO_VAL) {
job_ptr->delay_boot = job_desc->delay_boot;
sched_info("%s: setting delay_boot to %u for %pJ",
__func__, job_desc->delay_boot, job_ptr);
}
if ((job_desc->requeue != NO_VAL16) && detail_ptr) {
detail_ptr->requeue = MIN(job_desc->requeue, 1);
sched_info("%s: setting requeue to %u for %pJ",
__func__, job_desc->requeue, job_ptr);
}
if (job_desc->priority != NO_VAL) {
		/*
		 * If we are doing time slicing, we could update the priority
		 * of the job while running to give it a better position
		 * (larger time slices) than competing jobs
		 */
if (IS_JOB_FINISHED(job_ptr) || (detail_ptr == NULL))
error_code = ESLURM_JOB_FINISHED;
else if (job_ptr->priority == job_desc->priority) {
debug("%s: setting priority to current value",__func__);
if ((job_ptr->priority == 0) && privileged) {
/*
* Authorized user can change from user hold
* to admin hold or admin hold to user hold
*/
if (job_desc->alloc_sid == ALLOC_SID_USER_HOLD)
job_ptr->state_reason = WAIT_HELD_USER;
else
job_ptr->state_reason = WAIT_HELD;
}
} else if ((job_ptr->priority == 0) &&
(job_desc->priority == INFINITE) &&
(privileged ||
(job_ptr->state_reason == WAIT_RESV_DELETED) ||
(job_ptr->state_reason == WAIT_HELD_USER))) {
_release_job(job_ptr, uid);
} else if ((job_ptr->priority == 0) &&
(job_desc->priority != INFINITE)) {
info("%s: ignore priority reset request on held %pJ",
__func__, job_ptr);
error_code = ESLURM_JOB_HELD;
} else if (privileged ||
(job_ptr->priority > job_desc->priority)) {
if (job_desc->priority != 0)
job_ptr->details->nice = NICE_OFFSET;
if (job_desc->priority == INFINITE) {
job_ptr->direct_set_prio = 0;
set_job_prio(job_ptr);
} else if (job_desc->priority == 0) {
_hold_job(job_ptr, uid);
} else {
if (privileged) {
/*
* Only administrator can make
* persistent change to a job's
* priority, except holding a job
*/
job_ptr->direct_set_prio = 1;
} else
error_code = ESLURM_PRIO_RESET_FAIL;
job_ptr->priority = job_desc->priority;
if (job_ptr->part_ptr_list &&
job_ptr->prio_mult &&
job_ptr->prio_mult->priority_array) {
int i, j = list_count(
job_ptr->part_ptr_list);
for (i = 0; i < j; i++) {
job_ptr->prio_mult->
priority_array[i] =
job_desc->priority;
}
}
}
sched_info("%s: set priority to %u for %pJ",
__func__, job_ptr->priority, job_ptr);
update_accounting = true;
if (job_ptr->priority == 0) {
if (!privileged || (job_desc->alloc_sid ==
ALLOC_SID_USER_HOLD)) {
job_ptr->state_reason = WAIT_HELD_USER;
} else
job_ptr->state_reason = WAIT_HELD;
xfree(job_ptr->state_desc);
/* remove pending remote sibling jobs */
if (IS_JOB_PENDING(job_ptr) &&
!IS_JOB_REVOKED(job_ptr)) {
fed_mgr_job_revoke_sibs(job_ptr);
}
}
} else if ((job_ptr->priority != 0) &&
(job_desc->priority == INFINITE)) {
/*
* If the job was already released, ignore another
* release request.
*/
debug("%s: %pJ already released, ignoring request",
__func__, job_ptr);
} else {
sched_error("Attempt to modify priority for %pJ",
job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
} else if (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) {
		/*
		 * We need to check if the state is BadConstraints here since,
		 * as we are altering the job, the bad constraint might have
		 * gone away. If it did, the priority (0) wouldn't get reset,
		 * so the job would just go into JobAdminHeld otherwise.
		 */
job_ptr->direct_set_prio = 0;
set_job_prio(job_ptr);
sched_debug("%s: job request changed somehow, removing the bad constraints to reevaluate %pJ uid %u",
__func__, job_ptr, uid);
job_ptr->state_reason = WAIT_NO_REASON;
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->nice != NO_VAL) {
if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL))
error_code = ESLURM_JOB_FINISHED;
else if (job_ptr->details &&
(job_ptr->details->nice == job_desc->nice))
sched_debug("%s: new nice identical to old nice %pJ",
__func__, job_ptr);
else if (job_ptr->direct_set_prio && job_ptr->priority != 0)
info("%s: ignore nice set request on %pJ",
__func__, job_ptr);
else if (privileged || (job_desc->nice >= NICE_OFFSET)) {
if (!xstrcmp(slurm_conf.priority_type,
"priority/basic")) {
int64_t new_prio = job_ptr->priority;
new_prio += job_ptr->details->nice;
new_prio -= job_desc->nice;
job_ptr->priority = MAX(new_prio, 2);
sched_info("%s: nice changed from %u to %u, setting priority to %u for %pJ",
__func__, job_ptr->details->nice,
job_desc->nice,
job_ptr->priority, job_ptr);
}
job_ptr->details->nice = job_desc->nice;
update_accounting = true;
} else {
sched_error("%s: Attempt to modify nice for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->pn_min_memory != NO_VAL64) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
} else if (job_desc->pn_min_memory
== detail_ptr->pn_min_memory) {
sched_debug("%s: new memory limit identical to old limit for %pJ",
__func__, job_ptr);
} else {
char *entity;
if (job_desc->pn_min_memory == MEM_PER_CPU) {
/* Map --mem-per-cpu=0 to --mem=0 */
job_desc->pn_min_memory = 0;
}
if (job_desc->pn_min_memory & MEM_PER_CPU)
entity = "cpu";
else
entity = "job";
detail_ptr->pn_min_memory = job_desc->pn_min_memory;
detail_ptr->orig_pn_min_memory =
job_desc->pn_min_memory;
job_ptr->bit_flags |= JOB_MEM_SET;
sched_info("%s: setting min_memory_%s to %"PRIu64" for %pJ",
__func__, entity,
(job_desc->pn_min_memory & (~MEM_PER_CPU)),
job_ptr);
			/*
			 * Always use the acct_policy_limit_set.* since,
			 * if set by a super user, it will be set correctly
			 */
job_ptr->limit_set.tres[TRES_ARRAY_MEM] =
acct_policy_limit_set.tres[TRES_ARRAY_MEM];
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->pn_min_tmp_disk != NO_VAL) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
} else {
detail_ptr->pn_min_tmp_disk =
job_desc->pn_min_tmp_disk;
sched_info("%s: setting job_min_tmp_disk to %u for %pJ",
__func__, job_desc->pn_min_tmp_disk,
job_ptr);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->sockets_per_node != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
} else {
mc_ptr->sockets_per_node = job_desc->sockets_per_node;
sched_info("%s: setting sockets_per_node to %u for %pJ",
__func__, job_desc->sockets_per_node,
job_ptr);
}
}
if (job_desc->cores_per_socket != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
} else {
mc_ptr->cores_per_socket = job_desc->cores_per_socket;
sched_info("%s: setting cores_per_socket to %u for %pJ",
__func__, job_desc->cores_per_socket,
job_ptr);
}
}
	if (job_desc->threads_per_core != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
} else {
mc_ptr->threads_per_core = job_desc->threads_per_core;
sched_info("%s: setting threads_per_core to %u for %pJ",
__func__, job_desc->threads_per_core,
job_ptr);
}
}
if (job_desc->shared != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
} else if (!privileged) {
sched_error("%s: Attempt to change sharing for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
} else {
if (job_desc->shared) {
detail_ptr->share_res = 1;
detail_ptr->whole_node = 0;
} else {
detail_ptr->share_res = 0;
}
sched_info("%s: setting shared to %u for %pJ",
__func__, job_desc->shared, job_ptr);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->contiguous != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (privileged ||
(detail_ptr->contiguous > job_desc->contiguous)) {
detail_ptr->contiguous = job_desc->contiguous;
sched_info("%s: setting contiguous to %u for %pJ",
__func__, job_desc->contiguous, job_ptr);
} else {
sched_error("%s: Attempt to add contiguous for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->core_spec != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (privileged &&
(slurm_conf.conf_flags & CONF_FLAG_ASRU)) {
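/* core_spec == INFINITE16 in the request means clear it */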
if (job_desc->core_spec == INFINITE16)
detail_ptr->core_spec = NO_VAL16;
else
detail_ptr->core_spec = job_desc->core_spec;
sched_info("%s: setting core_spec to %u for %pJ",
__func__, detail_ptr->core_spec, job_ptr);
if (detail_ptr->core_spec != NO_VAL16)
detail_ptr->whole_node |= WHOLE_NODE_REQUIRED;
} else {
sched_error("%s Attempt to modify core_spec for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->features && detail_ptr &&
!xstrcmp(job_desc->features, detail_ptr->features)) {
sched_debug("%s: new features identical to old features %s",
__func__, job_desc->features);
} else if (job_desc->features) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (job_desc->features[0] != '\0') {
char *old_features = detail_ptr->features;
list_t *old_list = detail_ptr->feature_list;
detail_ptr->features = xstrdup(job_desc->features);
detail_ptr->feature_list = NULL;
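/*
 * Rebuild the feature list from the new string; on any
 * failure restore the old string and list so the job is
 * left unchanged.
 */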
if (build_feature_list(job_ptr, false, false)) {
sched_info("%s: invalid features(%s) for %pJ",
__func__, job_desc->features,
job_ptr);
FREE_NULL_LIST(detail_ptr->feature_list);
xfree(detail_ptr->features);
detail_ptr->features = old_features;
detail_ptr->feature_list = old_list;
error_code = ESLURM_INVALID_FEATURE;
} else if (node_features_g_job_valid(
detail_ptr->features,
detail_ptr->feature_list) !=
SLURM_SUCCESS) {
FREE_NULL_LIST(detail_ptr->feature_list);
xfree(detail_ptr->features);
detail_ptr->features = old_features;
detail_ptr->feature_list = old_list;
error_code = ESLURM_INVALID_FEATURE;
} else {
sched_info("%s: setting features to %s for %pJ",
__func__, job_desc->features,
job_ptr);
xfree(old_features);
FREE_NULL_LIST(old_list);
detail_ptr->features_use = detail_ptr->features;
detail_ptr->feature_list_use =
detail_ptr->feature_list;
}
} else {
sched_info("%s: cleared features for %pJ", __func__,
job_ptr);
xfree(detail_ptr->features);
FREE_NULL_LIST(detail_ptr->feature_list);
detail_ptr->features_use = NULL;
detail_ptr->feature_list_use = NULL;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->prefer && detail_ptr &&
!xstrcmp(job_desc->prefer, detail_ptr->prefer)) {
sched_debug("%s: new prefer identical to old prefer %s",
__func__, job_desc->prefer);
} else if (job_desc->prefer) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (job_desc->prefer[0] != '\0') {
char *old_prefer = detail_ptr->prefer;
list_t *old_list = detail_ptr->prefer_list;
detail_ptr->prefer = xstrdup(job_desc->prefer);
detail_ptr->prefer_list = NULL;
if (build_feature_list(job_ptr, true, false)) {
sched_info("%s: invalid prefer(%s) for %pJ",
__func__, job_desc->prefer,
job_ptr);
FREE_NULL_LIST(detail_ptr->prefer_list);
xfree(detail_ptr->prefer);
detail_ptr->prefer = old_prefer;
detail_ptr->prefer_list = old_list;
error_code = ESLURM_INVALID_PREFER;
} else if (node_features_g_job_valid(
detail_ptr->prefer,
detail_ptr->prefer_list) !=
SLURM_SUCCESS) {
FREE_NULL_LIST(detail_ptr->prefer_list);
xfree(detail_ptr->prefer);
detail_ptr->prefer = old_prefer;
detail_ptr->prefer_list = old_list;
error_code = ESLURM_INVALID_PREFER;
} else {
sched_info("%s: setting prefer to %s for %pJ",
__func__, job_desc->prefer,
job_ptr);
xfree(old_prefer);
FREE_NULL_LIST(old_list);
detail_ptr->features_use = detail_ptr->prefer;
detail_ptr->feature_list_use =
detail_ptr->prefer_list;
}
} else {
sched_info("%s: cleared prefer for %pJ", __func__,
job_ptr);
xfree(detail_ptr->prefer);
FREE_NULL_LIST(detail_ptr->prefer_list);
detail_ptr->features_use = NULL;
detail_ptr->feature_list_use = NULL;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->cluster_features &&
(error_code = fed_mgr_update_job_cluster_features(
job_ptr, job_desc->cluster_features)))
goto fini;
if (job_desc->clusters &&
(error_code = fed_mgr_update_job_clusters(job_ptr,
job_desc->clusters)))
goto fini;
if (gres_list) {
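/*
 * Take ownership of the per-TRES request strings from job_desc
 * (NULL them there so they are not freed twice) and log the
 * combined changes.
 */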
char *tmp = NULL;
if (job_desc->cpus_per_tres) {
xstrfmtcat(tmp, "cpus_per_tres:%s ",
job_desc->cpus_per_tres);
xfree(job_ptr->cpus_per_tres);
job_ptr->cpus_per_tres = job_desc->cpus_per_tres;
job_desc->cpus_per_tres = NULL;
}
if (job_desc->tres_per_job) {
xstrfmtcat(tmp, "tres_per_job:%s ",
job_desc->tres_per_job);
xfree(job_ptr->tres_per_job);
job_ptr->tres_per_job = job_desc->tres_per_job;
job_desc->tres_per_job = NULL;
}
if (job_desc->tres_per_node) {
xstrfmtcat(tmp, "tres_per_node:%s ",
job_desc->tres_per_node);
xfree(job_ptr->tres_per_node);
job_ptr->tres_per_node = job_desc->tres_per_node;
job_desc->tres_per_node = NULL;
}
if (job_desc->tres_per_socket) {
xstrfmtcat(tmp, "tres_per_socket:%s ",
job_desc->tres_per_socket);
xfree(job_ptr->tres_per_socket);
job_ptr->tres_per_socket = job_desc->tres_per_socket;
job_desc->tres_per_socket = NULL;
}
if (job_desc->tres_per_task) {
xstrfmtcat(tmp, "tres_per_task:%s ",
job_desc->tres_per_task);
xfree(job_ptr->tres_per_task);
job_ptr->tres_per_task = job_desc->tres_per_task;
job_desc->tres_per_task = NULL;
}
if (job_desc->mem_per_tres) {
xstrfmtcat(tmp, "mem_per_tres:%s ",
job_desc->mem_per_tres);
xfree(job_ptr->mem_per_tres);
job_ptr->mem_per_tres = job_desc->mem_per_tres;
job_desc->mem_per_tres = NULL;
}
if (tmp) {
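/* Each string appended to tmp above ends with a space, hence "%sfor" */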
sched_info("%s: setting %sfor %pJ",
__func__, tmp, job_ptr);
xfree(tmp);
}
FREE_NULL_LIST(job_ptr->gres_list_req);
job_ptr->gres_list_req = gres_list;
gres_list = NULL;
}
if (job_desc->name) {
if (IS_JOB_FINISHED(job_ptr)) {
error_code = ESLURM_JOB_FINISHED;
goto fini;
} else if (!xstrcmp(job_desc->name, job_ptr->name)) {
sched_debug("%s: new name identical to old name %pJ",
__func__, job_ptr);
} else {
xfree(job_ptr->name);
job_ptr->name = xstrdup(job_desc->name);
sched_info("%s: setting name to %s for %pJ",
__func__, job_ptr->name, job_ptr);
update_accounting = true;
}
}
if (job_desc->work_dir && detail_ptr &&
!xstrcmp(job_desc->work_dir, detail_ptr->work_dir)) {
sched_debug("%s: new work_dir identical to old work_dir %s",
__func__, job_desc->work_dir);
} else if (job_desc->work_dir) {
if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
} else if (detail_ptr) {
xfree(detail_ptr->work_dir);
detail_ptr->work_dir = xstrdup(job_desc->work_dir);
sched_info("%s: setting work_dir to %s for %pJ",
__func__, detail_ptr->work_dir, job_ptr);
update_accounting = true;
}
}
if (job_desc->std_err && detail_ptr &&
!xstrcmp(job_desc->std_err, detail_ptr->std_err)) {
sched_debug("%s: new std_err identical to old std_err %s",
__func__, job_desc->std_err);
} else if (job_desc->std_err) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_JOB_NOT_PENDING;
else if (detail_ptr && job_desc->std_err[0] == '\0')
xfree(detail_ptr->std_err);
else if (detail_ptr) {
xfree(detail_ptr->std_err);
detail_ptr->std_err = xstrdup(job_desc->std_err);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->std_in && detail_ptr &&
!xstrcmp(job_desc->std_in, detail_ptr->std_in)) {
sched_debug("%s: new std_in identical to old std_in %s",
__func__, job_desc->std_in);
} else if (job_desc->std_in) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_JOB_NOT_PENDING;
else if (detail_ptr && job_desc->std_in[0] == '\0')
xfree(detail_ptr->std_in);
else if (detail_ptr) {
xfree(detail_ptr->std_in);
detail_ptr->std_in = xstrdup(job_desc->std_in);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->std_out && detail_ptr &&
!xstrcmp(job_desc->std_out, detail_ptr->std_out)) {
sched_debug("%s: new std_out identical to old std_out %s",
__func__, job_desc->std_out);
} else if (job_desc->std_out) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_JOB_NOT_PENDING;
else if (detail_ptr && job_desc->std_out[0] == '\0')
xfree(detail_ptr->std_out);
else if (detail_ptr) {
xfree(detail_ptr->std_out);
detail_ptr->std_out = xstrdup(job_desc->std_out);
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->wckey
&& !xstrcmp(job_desc->wckey, job_ptr->wckey)) {
sched_debug("%s: new wckey identical to old wckey %pJ",
__func__, job_ptr);
} else if (job_desc->wckey) {
if (!IS_JOB_PENDING(job_ptr))
error_code = ESLURM_JOB_NOT_PENDING;
else {
int rc = update_job_wckey((char *) __func__,
job_ptr, job_desc->wckey);
if (rc != SLURM_SUCCESS)
error_code = rc;
else
update_accounting = true;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if ((job_desc->min_nodes != NO_VAL) &&
(IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) {
uint32_t new_min_task_cnt;
/*
 * Use req_nodes to change the nodes associated with a running job,
 * for lack of another field in the job request to use
 */
if ((job_desc->min_nodes == 0) && (job_ptr->node_cnt > 0) &&
job_ptr->details && job_ptr->details->expanding_jobid) {
job_record_t *expand_job_ptr;
bitstr_t *orig_job_node_bitmap, *orig_jobx_node_bitmap;
expand_job_ptr = find_job_record(job_ptr->details->
expanding_jobid);
if (expand_job_ptr == NULL) {
info("%s: Invalid node count (%u) for %pJ update, JobId=%u to expand not found",
__func__, job_desc->min_nodes, job_ptr,
job_ptr->details->expanding_jobid);
error_code = ESLURM_INVALID_JOB_ID;
goto fini;
}
if (IS_JOB_SUSPENDED(job_ptr) ||
IS_JOB_SUSPENDED(expand_job_ptr)) {
info("%s: Can not expand %pJ from %pJ, job is suspended",
__func__, expand_job_ptr, job_ptr);
error_code = ESLURM_JOB_SUSPENDED;
goto fini;
}
if ((job_ptr->step_list != NULL) &&
(list_count(job_ptr->step_list) != 0)) {
info("%s: Attempt to merge %pJ with active steps into %pJ",
__func__, job_ptr, expand_job_ptr);
error_code = ESLURMD_STEP_EXISTS;
goto fini;
}
if (!_valid_license_job_expansion(job_ptr,
expand_job_ptr)) {
info("%s: Cannot merge %pJ with %pJ - cannot mix AND and OR licenses (%s vs %s)",
__func__, job_ptr, expand_job_ptr,
job_ptr->licenses,
expand_job_ptr->licenses);
error_code = ESLURM_INVALID_LICENSES;
goto fini;
}
sched_info("%s: killing %pJ and moving all resources to %pJ",
__func__, job_ptr, expand_job_ptr);
job_pre_resize_acctg(job_ptr);
job_pre_resize_acctg(expand_job_ptr);
_send_job_kill(job_ptr);
xassert(job_ptr->job_resrcs);
xassert(job_ptr->job_resrcs->node_bitmap);
xassert(expand_job_ptr->job_resrcs->node_bitmap);
orig_job_node_bitmap = bit_copy(job_ptr->node_bitmap);
orig_jobx_node_bitmap = bit_copy(expand_job_ptr->
job_resrcs->
node_bitmap);
error_code = select_g_job_expand(job_ptr,
expand_job_ptr);
if (error_code == SLURM_SUCCESS) {
_merge_job_licenses(job_ptr, expand_job_ptr);
FREE_NULL_BITMAP(job_ptr->node_bitmap);
job_ptr->node_bitmap = orig_job_node_bitmap;
orig_job_node_bitmap = NULL;
deallocate_nodes(job_ptr, false, false, false);
bit_clear_all(job_ptr->node_bitmap);
job_state_set(job_ptr, (JOB_COMPLETE |
(job_ptr->job_state &
JOB_STATE_FLAGS)));
_realloc_nodes(expand_job_ptr,
orig_jobx_node_bitmap);
rebuild_step_bitmaps(expand_job_ptr,
orig_jobx_node_bitmap);
(void) gs_job_fini(job_ptr);
(void) gs_job_start(expand_job_ptr);
}
FREE_NULL_BITMAP(orig_job_node_bitmap);
FREE_NULL_BITMAP(orig_jobx_node_bitmap);
job_post_resize_acctg(job_ptr);
job_post_resize_acctg(expand_job_ptr);
/*
* Since job_post_resize_acctg will restart things,
* don't do it again.
*/
update_accounting = false;
if (error_code)
goto fini;
} else if ((job_desc->min_nodes == 0) ||
(job_desc->min_nodes > job_ptr->node_cnt) ||
job_ptr->details->expanding_jobid) {
sched_info("%s: Invalid node count (%u) for %pJ update",
__func__, job_desc->min_nodes, job_ptr);
error_code = ESLURM_INVALID_NODE_COUNT;
goto fini;
} else if (job_desc->min_nodes == job_ptr->node_cnt) {
debug2("%s: No change in node count update for %pJ",
__func__, job_ptr);
} else if (!permit_job_shrink()) {
error("%s: request to shrink %pJ denied by configuration",
__func__, job_ptr);
error_code = ESLURM_NOT_SUPPORTED;
goto fini;
} else {
int total = 0;
node_record_t *node_ptr;
bitstr_t *rem_nodes, *tmp_nodes;
sched_info("%s: set node count to %u for %pJ", __func__,
job_desc->min_nodes, job_ptr);
job_pre_resize_acctg(job_ptr);
/*
* Don't remove the batch host from the job. The batch
* host isn't guaranteed to be the first bit set in
* job_ptr->node_bitmap because the batch host can be
* selected with the --batch and --constraint sbatch
* flags.
*/
tmp_nodes = bit_copy(job_ptr->node_bitmap);
if (job_ptr->batch_host) {
bitstr_t *batch_host_bitmap;
if (node_name2bitmap(job_ptr->batch_host, false,
&batch_host_bitmap, NULL))
error("%s: Invalid batch host %s for %pJ; this should never happen",
__func__, job_ptr->batch_host,
job_ptr);
else {
bit_and_not(tmp_nodes,
batch_host_bitmap);
FREE_NULL_BITMAP(batch_host_bitmap);
/*
* Set total to 1 since we're
* guaranteeing that we won't remove the
* batch host.
*/
total = 1;
}
}
rem_nodes = bit_alloc(bit_size(tmp_nodes));
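/*
 * Keep the first min_nodes nodes (the batch host was already
 * removed from tmp_nodes and counted via "total" above) and
 * mark every additional node for removal.
 */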
for (int i = 0; next_node_bitmap(tmp_nodes, &i); i++) {
if (++total <= job_desc->min_nodes)
continue;
bit_set(rem_nodes, i);
}
abort_job_on_nodes(job_ptr, rem_nodes);
orig_job_node_bitmap =
bit_copy(job_ptr->job_resrcs->node_bitmap);
for (int i = 0;
(node_ptr = next_node_bitmap(rem_nodes, &i));
i++) {
kill_step_on_node(job_ptr, node_ptr, false);
excise_node_from_job(job_ptr, node_ptr);
}
/* Resize the core bitmaps of the job's steps */
rebuild_step_bitmaps(job_ptr, orig_job_node_bitmap);
FREE_NULL_BITMAP(orig_job_node_bitmap);
FREE_NULL_BITMAP(rem_nodes);
FREE_NULL_BITMAP(tmp_nodes);
(void) gs_job_start(job_ptr);
job_post_resize_acctg(job_ptr);
sched_info("%s: set nodes to %s for %pJ",
__func__, job_ptr->nodes, job_ptr);
/*
 * Since job_post_resize_acctg() will restart
 * things, don't do it again.
 */
update_accounting = false;
}
gres_stepmgr_job_build_details(
job_ptr->gres_list_alloc,
job_ptr->nodes,
&job_ptr->gres_detail_cnt,
&job_ptr->gres_detail_str,
&job_ptr->gres_used);
/*
 * Ensure that num_tasks does not exceed the CPU count
 * divided by cpus_per_task, now that the task count can
 * be changed for a running job.
 */
new_min_task_cnt = job_ptr->cpu_cnt / detail_ptr->cpus_per_task;
if (detail_ptr->num_tasks > new_min_task_cnt)
detail_ptr->num_tasks = new_min_task_cnt;
tres_req_cnt_set = false;
}
if (job_desc->ntasks_per_node != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL))
error_code = ESLURM_JOB_NOT_PENDING;
else if (privileged) {
detail_ptr->ntasks_per_node =
job_desc->ntasks_per_node;
if (detail_ptr->pn_min_cpus <
detail_ptr->ntasks_per_node) {
detail_ptr->pn_min_cpus =
detail_ptr->orig_pn_min_cpus =
job_desc->ntasks_per_node;
}
sched_info("%s: setting ntasks_per_node to %u for %pJ",
__func__, job_desc->ntasks_per_node, job_ptr);
} else {
sched_error("%s: Not super user: ignore ntasks_per_node change for job %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->ntasks_per_socket != NO_VAL16) {
if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) ||
(detail_ptr->mc_ptr == NULL)) {
error_code = ESLURM_JOB_NOT_PENDING;
} else if (privileged) {
detail_ptr->mc_ptr->ntasks_per_socket =
job_desc->ntasks_per_socket;
sched_info("%s: setting ntasks_per_socket to %u for %pJ",
__func__, job_desc->ntasks_per_socket,
job_ptr);
} else {
sched_error("%s: Not super user: ignore ntasks_per_socket change for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->dependency) {
/* Can't update dependency of revoked job */
if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL) ||
IS_JOB_REVOKED(job_ptr))
error_code = ESLURM_JOB_NOT_PENDING;
else if (!fed_mgr_is_origin_job(job_ptr)) {
/*
* If the job became independent because of a dependency
* update, that job gets requeued on siblings and then
* the dependency update gets sent to siblings. So we
* silently ignore this update on the sibling.
*/
} else {
int rc;
rc = update_job_dependency(job_ptr,
job_desc->dependency);
if (rc != SLURM_SUCCESS)
error_code = rc;
/*
 * Because dependencies were updated and we don't know what
 * they used to be, send the dependencies to all siblings
 * so the siblings can update their dependency lists.
 */
else {
rc = fed_mgr_submit_remote_dependencies(job_ptr,
true,
false);
if (rc) {
error("%s: %pJ Failed to send remote dependencies to some or all siblings.",
__func__, job_ptr);
error_code = rc;
}
/*
* Even if we fail to send remote dependencies,
* we already succeeded in updating the job's
* dependency locally, so we still need to
* do these things.
*/
xfree(job_ptr->details->orig_dependency);
job_ptr->details->orig_dependency =
xstrdup(job_ptr->details->dependency);
sched_info("%s: setting dependency to %s for %pJ",
__func__,
job_ptr->details->dependency,
job_ptr);
/*
* If the job isn't independent, remove pending
* remote sibling jobs
*/
if (!job_independent(job_ptr))
fed_mgr_job_revoke_sibs(job_ptr);
}
}
}
if (error_code != SLURM_SUCCESS)
goto fini;
if (job_desc->begin_time) {
if (IS_JOB_PENDING(job_ptr) && detail_ptr) {
char time_str[256];
/*
 * Make sure this time is current; it does no good for
 * accounting to say this job could have started before
 * now
 */
if (job_desc->begin_time < now)
job_desc->begin_time = now;
if (detail_ptr->begin_time != job_desc->begin_time) {
detail_ptr->begin_time = job_desc->begin_time;
update_accounting = true;
slurm_make_time_str(&detail_ptr->begin_time,
time_str, sizeof(time_str));
sched_info("%s: setting begin to %s for %pJ",
__func__, time_str, job_ptr);
acct_policy_remove_accrue_time(job_ptr, false);
} else
sched_debug("%s: new begin time identical to old begin time %pJ",
__func__, job_ptr);
} else {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
}
}
if (valid_licenses) {
if (IS_JOB_PENDING(job_ptr)) {
FREE_NULL_LIST(job_ptr->license_list);
job_ptr->license_list = license_list;
license_list = NULL;
sched_info("%s: changing licenses from '%s' to '%s' for pending %pJ",
__func__, job_ptr->licenses,
job_desc->licenses_tot, job_ptr);
xfree(job_ptr->licenses);
job_ptr->licenses = xstrdup(job_desc->licenses_tot);
if (job_desc->bitflags & RESET_LIC_JOB)
xfree(job_ptr->lic_req);
else if (job_desc->licenses) {
xfree(job_ptr->lic_req);
job_ptr->lic_req = xstrdup(job_desc->licenses);
}
} else if (IS_JOB_RUNNING(job_ptr)) {
/*
* Operators can modify license counts on running jobs,
* regular users can only completely remove license
* counts on running jobs.
*/
if (!privileged && license_list) {
sched_error("%s: Not operator user: ignore licenses change for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
goto fini;
}
/*
* NOTE: This can result in oversubscription of
* licenses
*/
license_job_return(job_ptr);
FREE_NULL_LIST(job_ptr->license_list);
job_ptr->license_list = license_list;
license_list = NULL;
sched_info("%s: changing licenses from '%s' to '%s' for running %pJ",
__func__, job_ptr->licenses,
job_desc->licenses, job_ptr);
xfree(job_ptr->licenses);
job_ptr->licenses = xstrdup(job_desc->licenses);
license_job_get(job_ptr, false);
} else {
/*
* licenses are valid, but job state or user not
* allowed to make changes
*/
sched_info("%s: could not change licenses for %pJ",
__func__, job_ptr);
error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING;
FREE_NULL_LIST(license_list);
}
update_accounting = true;
}
if (error_code != SLURM_SUCCESS)
goto fini;
fail_reason = job_limits_check(&job_ptr, false);
if (fail_reason != WAIT_NO_REASON) {
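/*
 * Held jobs and partition-related waits are tolerated (the job
 * simply stays queued with the reason recorded); other wait
 * reasons are returned to the caller as errors.
 */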
if (fail_reason == WAIT_QOS_THRES)
error_code = ESLURM_QOS_THRES;
else if ((fail_reason == WAIT_PART_TIME_LIMIT) ||
(fail_reason == WAIT_PART_NODE_LIMIT) ||
(fail_reason == WAIT_PART_DOWN) ||
(fail_reason == WAIT_HELD))
error_code = SLURM_SUCCESS;
else
error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
if (error_code != SLURM_SUCCESS) {
if ((job_ptr->state_reason != WAIT_HELD) &&
(job_ptr->state_reason != WAIT_HELD_USER) &&
(job_ptr->state_reason != WAIT_RESV_DELETED)) {
job_ptr->state_reason = fail_reason;
xfree(job_ptr->state_desc);
}
goto fini;
}
} else if ((job_ptr->state_reason != WAIT_HELD)
&& (job_ptr->state_reason != WAIT_HELD_USER)
&& (job_ptr->state_reason != WAIT_RESV_DELETED)
/*
* A job update can come while the prolog is running.
* Don't change state_reason if the prolog is running.
* _is_prolog_finished() relies on state_reason==WAIT_PROLOG
* to know if the prolog is running. If we change it here,
* then slurmctld will think that the prolog isn't running
* anymore and _slurm_rpc_job_ready will tell srun that the
* prolog is done even if it isn't. Then srun can launch a
* job step before the prolog is done, which breaks the
* behavior of PrologFlags=alloc and means that the job step
* could launch before the extern step sets up x11.
*/
&& (job_ptr->state_reason != WAIT_PROLOG)
&& (job_ptr->state_reason != WAIT_MAX_REQUEUE)) {
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
}
if (job_desc->reboot != NO_VAL16) {
if (!validate_super_user(uid)) {
error("%s: Attempt to change reboot for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
} else if (!IS_JOB_PENDING(job_ptr)) {
error_code = ESLURM_JOB_NOT_PENDING;
goto fini;
} else {
sched_info("%s: setting reboot to %u for %pJ",
__func__, job_desc->reboot, job_ptr);
if (job_desc->reboot == 0)
job_ptr->reboot = 0;
else
job_ptr->reboot = MAX(1, job_desc->reboot);
}
}
if (job_desc->network && !xstrcmp(job_desc->network,
job_ptr->network)) {
sched_debug("%s: new network identical to old network %s",
__func__, job_ptr->network);
} else if (job_desc->network) {
xfree(job_ptr->network);
if (!strlen(job_desc->network)
|| !xstrcmp(job_desc->network, "none")) {
sched_info("%s: clearing Network option for %pJ",
__func__, job_ptr);
} else {
job_ptr->network = xstrdup(job_desc->network);
sched_info("%s: setting Network to %s for %pJ",
__func__, job_ptr->network, job_ptr);
}
}
if (job_desc->fed_siblings_viable) {
if (!job_ptr->fed_details) {
error_code = ESLURM_JOB_NOT_FEDERATED;
goto fini;
}
info("%s: setting fed_siblings from %"PRIu64" to %"PRIu64" for %pJ",
__func__, job_ptr->fed_details->siblings_viable,
job_desc->fed_siblings_viable, job_ptr);
job_ptr->fed_details->siblings_viable =
job_desc->fed_siblings_viable;
update_job_fed_details(job_ptr);
}
if (job_desc->cpus_per_tres) {
if (!assoc_mgr_valid_tres_cnt(job_desc->cpus_per_tres, 0)) {
error_code = ESLURM_INVALID_TRES;
goto fini;
}
xfree(job_ptr->cpus_per_tres);
if (!strlen(job_desc->cpus_per_tres)) {
sched_info("%s: clearing CpusPerTres option for %pJ",
__func__, job_ptr);
} else {
job_ptr->cpus_per_tres =
xstrdup(job_desc->cpus_per_tres);
sched_info("%s: setting CpusPerTres to %s for %pJ",
__func__, job_ptr->cpus_per_tres, job_ptr);
}
}
if (job_desc->mem_per_tres) {
if (!assoc_mgr_valid_tres_cnt(job_desc->mem_per_tres, 0)) {
error_code = ESLURM_INVALID_TRES;
goto fini;
}
xfree(job_ptr->mem_per_tres);
if (!strlen(job_desc->mem_per_tres)) {
sched_info("%s: clearing MemPerTres option for %pJ",
__func__, job_ptr);
} else {
job_ptr->mem_per_tres =
xstrdup(job_desc->mem_per_tres);
sched_info("%s: setting MemPerTres to %s for %pJ",
__func__, job_ptr->mem_per_tres, job_ptr);
}
}
if (job_desc->tres_bind) {
if (tres_bind_verify_cmdline(job_desc->tres_bind)) {
error_code = ESLURM_INVALID_TRES;
goto fini;
}
xfree(job_ptr->tres_bind);
if (!strlen(job_desc->tres_bind)) {
sched_info("%s: clearing TresBind option for %pJ",
__func__, job_ptr);
} else {
job_ptr->tres_bind = xstrdup(job_desc->tres_bind);
sched_info("%s: setting TresBind to %s for %pJ",
__func__, job_ptr->tres_bind, job_ptr);
}
}
if (job_desc->tres_freq) {
if (tres_freq_verify_cmdline(job_desc->tres_freq)) {
error_code = ESLURM_INVALID_TRES;
goto fini;
}
xfree(job_ptr->tres_freq);
if (!strlen(job_desc->tres_freq)) {
sched_info("%s: clearing TresFreq option for %pJ",
__func__, job_ptr);
} else {
job_ptr->tres_freq = xstrdup(job_desc->tres_freq);
sched_info("%s: setting TresFreq to %s for %pJ",
__func__, job_ptr->tres_freq, job_ptr);
}
}
if (job_desc->tres_per_job) {
if (!assoc_mgr_valid_tres_cnt(job_desc->tres_per_job, 0)) {
error_code = ESLURM_INVALID_TRES;
goto fini;
}
xfree(job_ptr->tres_per_job);
if (!strlen(job_desc->tres_per_job)) {
sched_info("%s: clearing TresPerJob option for %pJ",
__func__, job_ptr);
} else {
job_ptr->tres_per_job =
xstrdup(job_desc->tres_per_job);
sched_info("%s: setting TresPerJob to %s for %pJ",
__func__, job_ptr->tres_per_job, job_ptr);
}
}
if (job_desc->tres_per_node) {
if (!assoc_mgr_valid_tres_cnt(job_desc->tres_per_node, 0)) {
error_code = ESLURM_INVALID_TRES;
goto fini;
}
xfree(job_ptr->tres_per_node);
if (!strlen(job_desc->tres_per_node)) {
sched_info("%s: clearing TresPerNode option for %pJ",
__func__, job_ptr);
} else {
job_ptr->tres_per_node =
xstrdup(job_desc->tres_per_node);
sched_info("%s: setting TresPerNode to %s for %pJ",
__func__, job_ptr->tres_per_node, job_ptr);
}
}
if (job_desc->tres_per_socket) {
if (!assoc_mgr_valid_tres_cnt(job_desc->tres_per_socket, 0)) {
error_code = ESLURM_INVALID_TRES;
goto fini;
}
xfree(job_ptr->tres_per_socket);
if (!strlen(job_desc->tres_per_socket)) {
sched_info("%s: clearing TresPerSocket option for %pJ",
__func__, job_ptr);
} else {
job_ptr->tres_per_socket =
xstrdup(job_desc->tres_per_socket);
sched_info("%s: setting TresPerSocket to %s for %pJ",
__func__, job_ptr->tres_per_socket, job_ptr);
}
}
if (job_desc->tres_per_task) {
if (!assoc_mgr_valid_tres_cnt(job_desc->tres_per_task, 0)) {
error_code = ESLURM_INVALID_TRES;
goto fini;
}
xfree(job_ptr->tres_per_task);
if (!strlen(job_desc->tres_per_task)) {
sched_info("%s: clearing TresPerTask option for %pJ",
__func__, job_ptr);
} else {
job_ptr->tres_per_task =
xstrdup(job_desc->tres_per_task);
sched_info("%s: setting TresPerTask to %s for %pJ",
__func__, job_ptr->tres_per_task, job_ptr);
}
}
if (job_desc->mail_type != NO_VAL16) {
job_ptr->mail_type = job_desc->mail_type;
sched_info("%s: setting mail_type to %u for %pJ",
__func__, job_ptr->mail_type, job_ptr);
}
if (job_desc->mail_user) {
xfree(job_ptr->mail_user);
job_ptr->mail_user = _get_mail_user(job_desc->mail_user,
job_ptr);
sched_info("%s: setting mail_user to %s for %pJ",
__func__, job_ptr->mail_user, job_ptr);
}
/*
* The job submit plugin sets site_factor to NO_VAL before calling
* the plugin to prevent the user from specifying it.
*/
if (user_site_factor != NO_VAL) {
if (!privileged) {
error("%s: Attempt to change SiteFactor for %pJ",
__func__, job_ptr);
error_code = ESLURM_ACCESS_DENIED;
job_desc->site_factor = NO_VAL;
} else
job_desc->site_factor = user_site_factor;
}
if (job_desc->site_factor != NO_VAL) {
sched_info("%s: setting AdinPrioFactor to %u for %pJ",
__func__, job_desc->site_factor, job_ptr);
job_ptr->site_factor = job_desc->site_factor;
}
fini:
FREE_NULL_BITMAP(new_req_bitmap);
FREE_NULL_LIST(part_ptr_list);
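/*
 * On success, apply any changed TRES request counts and
 * recompute the billing TRES and formatted request string.
 */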
if ((error_code == SLURM_SUCCESS) && tres_req_cnt_set) {
for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++) {
if (tres_req_cnt[tres_pos] ==
job_ptr->tres_req_cnt[tres_pos])
continue;
job_ptr->tres_req_cnt[tres_pos] =
tres_req_cnt[tres_pos];
tres_changed = true;
}
if (tres_changed) {
job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] =
assoc_mgr_tres_weighted(
job_ptr->tres_req_cnt,
job_ptr->part_ptr->billing_weights,
slurm_conf.priority_flags, false);
set_job_tres_req_str(job_ptr, false);
update_accounting = true;
job_ptr->node_cnt_wag = 0;
}
}
/* This was a local variable, so set it back to NULL */
job_desc->tres_req_cnt = NULL;
if (!list_count(job_ptr->gres_list_req))
FREE_NULL_LIST(job_ptr->gres_list_req);
FREE_NULL_LIST(gres_list);
FREE_NULL_LIST(license_list);
if (update_accounting) {
info("%s: updating accounting", __func__);
/* Update job record in accounting to reflect changes */
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
}
/*
 * If the job isn't held, recalculate the priority when not using
 * priority/basic, since many factors of an update may affect
 * priority considerations. Do this whether or not the update was
 * successful.
 */
if ((job_ptr->priority != 0) &&
xstrcmp(slurm_conf.priority_type, "priority/basic"))
set_job_prio(job_ptr);
if ((error_code == SLURM_SUCCESS) &&
fed_mgr_fed_rec &&
job_ptr->fed_details && fed_mgr_is_origin_job(job_ptr)) {
/* Send updates to sibling jobs */
/* Add the siblings_active to be updated. They could have been
* updated if the job's ClusterFeatures were updated. */
job_desc->fed_siblings_viable =
job_ptr->fed_details->siblings_viable;
fed_mgr_update_job(job_ptr->job_id, job_desc,
job_ptr->fed_details->siblings_active, uid);
}
return error_code;
}
static int _foreach_update_hetjob(void *x, void *arg)
{
job_record_t *het_job = x;
foreach_update_hetjob_t *update_hetjob = arg;
if (update_hetjob->het_leader->het_job_id != het_job->het_job_id) {
error("%s: Bad het_job_list for %pJ",
__func__, update_hetjob->het_leader);
return 0;
}
if (update_hetjob->job_desc->array_inx) {
update_hetjob->err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId");
update_hetjob->rc = ESLURM_NOT_SUPPORTED;
return -1;
} else {
update_hetjob->rc = _update_job(het_job,
update_hetjob->job_desc,
update_hetjob->uid,
&update_hetjob->err_msg);
}
return 0;
}
/*
* update_job - update a job's parameters per the supplied specifications
* IN msg - RPC to update job, including change specification
* IN uid - uid of user issuing RPC
* IN send_msg - whether to send msg back or not
* RET returns an error code from slurm_errno.h
* global: job_list - global list of job entries
* last_job_update - time of last job table update
*/
extern int update_job(slurm_msg_t *msg, uid_t uid, bool send_msg)
{
job_desc_msg_t *job_desc = msg->data;
job_record_t *job_ptr;
char *hostname = auth_g_get_host(msg);
char *err_msg = NULL;
int rc;
xfree(job_desc->job_id_str);
xstrfmtcat(job_desc->job_id_str, "%u", job_desc->job_id);
if (hostname) {
xfree(job_desc->alloc_node);
job_desc->alloc_node = hostname;
}
job_ptr = find_job_record(job_desc->job_id);
if (job_ptr == NULL) {
info("%s: JobId=%u does not exist",
__func__, job_desc->job_id);
rc = ESLURM_INVALID_JOB_ID;
} else {
if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap)
job_desc->array_bitmap =
bit_copy(job_ptr->array_recs->task_id_bitmap);
rc = _update_job(job_ptr, job_desc, uid, &err_msg);
}
if (send_msg)
slurm_send_rc_err_msg(msg, rc, err_msg);
xfree(job_desc->job_id_str);
return rc;
}
/*
* update_job_str - update a job's parameters per the supplied specification
* IN msg - RPC to update job, including change specification
* IN uid - uid of user issuing RPC
* RET returns an error code from slurm_errno.h
* global: job_list - global list of job entries
* last_job_update - time of last job table update
*/
extern int update_job_str(slurm_msg_t *msg, uid_t uid)
{
job_desc_msg_t *job_desc = msg->data;
job_record_t *job_ptr, *new_job_ptr;
char *hostname = auth_g_get_host(msg);
long int long_id;
uint32_t job_id = 0, het_job_offset;
bitstr_t *array_bitmap = NULL, *tmp_bitmap;
int32_t i, i_first, i_last;
int len, rc = SLURM_SUCCESS, rc2;
char *end_ptr, *tmp = NULL;
char *job_id_str;
char *err_msg = NULL;
resp_array_struct_t *resp_array = NULL;
job_id_str = job_desc->job_id_str;
if (hostname) {
xfree(job_desc->alloc_node);
job_desc->alloc_node = hostname;
}
if (max_array_size == NO_VAL)
max_array_size = slurm_conf.max_array_sz;
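/*
 * Accepted JobId forms: "1234" (single job or entire array),
 * "1234_<task spec>" (array tasks) and "1234+<offset>"
 * (hetjob component).
 */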
long_id = strtol(job_id_str, &end_ptr, 10);
if ((long_id <= 0) || (long_id == LONG_MAX) ||
((end_ptr[0] != '\0') && (end_ptr[0] != '_') &&
(end_ptr[0] != '+'))) {
info("%s: invalid JobId=%s", __func__, job_id_str);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
job_id = (uint32_t) long_id;
if (end_ptr[0] == '\0') { /* Single job (or full job array) */
job_record_t *job_ptr_done = NULL;
job_ptr = find_job_record(job_id);
if (job_ptr && job_ptr->het_job_list) {
foreach_update_hetjob_t update_hetjob = {
.het_leader = job_ptr,
.job_desc = job_desc,
.rc = SLURM_SUCCESS,
.uid = uid,
};
(void) list_for_each(job_ptr->het_job_list,
_foreach_update_hetjob,
&update_hetjob);
rc = update_hetjob.rc;
err_msg = update_hetjob.err_msg;
update_hetjob.err_msg = NULL;
goto reply;
}
if (job_ptr &&
(((job_ptr->array_task_id == NO_VAL) &&
(job_ptr->array_recs == NULL)) ||
((job_ptr->array_task_id != NO_VAL) &&
(job_ptr->array_job_id != job_id)))) {
/* This is a regular job or single task of job array */
if (job_desc->array_inx) {
err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId");
rc = ESLURM_NOT_SUPPORTED;
} else
rc = _update_job(job_ptr, job_desc, uid,
&err_msg);
goto reply;
}
if (job_ptr && job_ptr->array_recs) {
/* This is a job array */
job_ptr_done = job_ptr;
if (job_ptr->array_recs->task_id_bitmap)
job_desc->array_bitmap = bit_copy(
job_ptr->array_recs->task_id_bitmap);
rc2 = _update_job(job_ptr, job_desc, uid, &err_msg);
_resp_array_add(&resp_array, job_ptr, rc2, err_msg);
xfree(err_msg);
}
/* Update all tasks of this job array */
job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
if (!job_ptr && !job_ptr_done) {
info("%s: invalid JobId=%u", __func__, job_id);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
while (job_ptr) {
if ((job_ptr->array_job_id == job_id) &&
(job_ptr != job_ptr_done)) {
rc2 = _update_job(job_ptr, job_desc, uid,
&err_msg);
_resp_array_add(&resp_array, job_ptr, rc2,
err_msg);
xfree(err_msg);
}
job_ptr = job_ptr->job_array_next_j;
}
goto reply;
} else if (end_ptr[0] == '+') { /* Hetjob element */
long_id = strtol(end_ptr+1, &tmp, 10);
if ((long_id < 0) || (long_id == LONG_MAX) ||
(tmp[0] != '\0')) {
info("%s: invalid JobId=%s", __func__, job_id_str);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
het_job_offset = (uint32_t) long_id;
job_ptr = find_het_job_record(job_id, het_job_offset);
if (!job_ptr) {
info("%s: invalid JobId=%u", __func__, job_id);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
if (job_desc->array_inx) {
err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId");
rc = ESLURM_NOT_SUPPORTED;
} else {
rc = _update_job(job_ptr, job_desc, uid, &err_msg);
}
goto reply;
}
array_bitmap = slurm_array_str2bitmap(end_ptr + 1, max_array_size,
&i_last);
if (!array_bitmap) {
info("%s: invalid JobId=%s", __func__, job_id_str);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
job_ptr = find_job_record(job_id);
if (job_ptr && IS_JOB_PENDING(job_ptr) &&
job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) {
/* Ensure bitmap sizes match for AND operations */
len = bit_size(job_ptr->array_recs->task_id_bitmap);
i_last++;
if (i_last < len) {
bit_realloc(array_bitmap, len);
} else {
bit_realloc(array_bitmap, i_last);
bit_realloc(job_ptr->array_recs->task_id_bitmap,
i_last);
}
if (!bit_overlap_any(job_ptr->array_recs->task_id_bitmap,
array_bitmap)) {
/* Nothing to do with this job record */
} else if (bit_super_set(job_ptr->array_recs->task_id_bitmap,
array_bitmap)) {
/* Update the record with all pending tasks */
job_desc->array_bitmap =
bit_copy(job_ptr->array_recs->task_id_bitmap);
if (job_desc->array_inx) {
err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId");
rc2 = ESLURM_NOT_SUPPORTED;
} else
rc2 = _update_job(job_ptr, job_desc, uid,
&err_msg);
_resp_array_add(&resp_array, job_ptr, rc2, err_msg);
xfree(err_msg);
bit_and_not(array_bitmap, job_desc->array_bitmap);
} else {
/* Need to split out tasks to separate job records */
tmp_bitmap = bit_copy(job_ptr->array_recs->
task_id_bitmap);
bit_and(tmp_bitmap, array_bitmap);
i_first = bit_ffs(tmp_bitmap);
if (i_first >= 0)
i_last = bit_fls(tmp_bitmap);
else
i_last = -2;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(tmp_bitmap, i))
continue;
job_ptr->array_task_id = i;
new_job_ptr = job_array_split(job_ptr, true);
/*
* The array_recs structure is moved to the
* new job record copy.
*/
bb_g_job_validate2(job_ptr, NULL);
job_ptr = new_job_ptr;
}
FREE_NULL_BITMAP(tmp_bitmap);
}
}
i_first = bit_ffs(array_bitmap);
if (i_first >= 0)
i_last = bit_fls(array_bitmap);
else
i_last = -2;
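/* -2 makes the following loop a no-op for an empty bitmap */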
for (i = i_first; i <= i_last; i++) {
if (!bit_test(array_bitmap, i))
continue;
job_ptr = find_job_array_rec(job_id, i);
if (job_ptr == NULL) {
info("%s: invalid JobId=%u_%d", __func__, job_id, i);
_resp_array_add_id(&resp_array, job_id, i,
ESLURM_INVALID_JOB_ID);
continue;
}
if (job_desc->array_inx) {
err_msg = xstrdup("Update of ArrayTaskThrottle is only allowed on ArrayJobId");
rc2 = ESLURM_NOT_SUPPORTED;
} else
rc2 = _update_job(job_ptr, job_desc, uid, &err_msg);
_resp_array_add(&resp_array, job_ptr, rc2, err_msg);
xfree(err_msg);
}
reply:
if (msg->tls_conn) {
if (resp_array) {
job_array_resp_msg_t *resp_array_msg =
_resp_array_xlate(resp_array, job_id);
(void) send_msg_response(msg, RESPONSE_JOB_ARRAY_ERRORS,
resp_array_msg);
slurm_free_job_array_resp(resp_array_msg);
} else {
slurm_send_rc_err_msg(msg, rc, err_msg);
}
}
xfree(err_msg);
_resp_array_free(resp_array);
FREE_NULL_BITMAP(array_bitmap);
return rc;
}
extern kill_job_msg_t *create_kill_job_msg(job_record_t *job_ptr,
uint16_t protocol_version)
{
slurm_cred_arg_t cred_arg;
kill_job_msg_t *msg = xmalloc(sizeof(*msg));
xassert(job_ptr);
xassert(job_ptr->details);
setup_cred_arg(&cred_arg, job_ptr);
cred_arg.step_id.job_id = job_ptr->job_id;
cred_arg.step_id.step_het_comp = NO_VAL;
cred_arg.step_id.step_id = NO_VAL;
msg->cred = slurm_cred_create(&cred_arg, false, protocol_version);
msg->derived_ec = job_ptr->derived_ec;
msg->details = xstrdup(job_ptr->state_desc);
msg->exit_code = job_ptr->exit_code;
msg->het_job_id = job_ptr->het_job_id;
msg->job_gres_prep = gres_g_prep_build_env(job_ptr->gres_list_alloc,
job_ptr->nodes);
msg->job_state = job_ptr->job_state;
msg->job_uid = job_ptr->user_id;
msg->job_gid = job_ptr->group_id;
msg->start_time = job_ptr->start_time;
msg->step_id.job_id = job_ptr->job_id;
msg->step_id.step_het_comp = NO_VAL;
msg->step_id.step_id = NO_VAL;
msg->spank_job_env = xduparray(job_ptr->spank_job_env_size,
job_ptr->spank_job_env);
msg->spank_job_env_size = job_ptr->spank_job_env_size;
msg->time = time(NULL);
msg->work_dir = xstrdup(job_ptr->details->work_dir);
return msg;
}
static void _send_job_kill(job_record_t *job_ptr)
{
agent_arg_t *agent_args = NULL;
node_record_t *node_ptr;
kill_job_msg_t *kill_job;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_TERMINATE_JOB;
agent_args->retry = 0; /* re_kill_job() resends as needed */
agent_args->hostlist = hostlist_create(NULL);
last_node_update = time(NULL);
if (!job_ptr->node_bitmap_cg)
build_cg_bitmap(job_ptr);
agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
for (int i = 0;
(node_ptr = next_node_bitmap(job_ptr->node_bitmap_cg, &i)); i++) {
if (agent_args->protocol_version > node_ptr->protocol_version)
agent_args->protocol_version =
node_ptr->protocol_version;
hostlist_push_host(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
if (PACK_FANOUT_ADDRS(node_ptr))
agent_args->msg_flags |= SLURM_PACK_ADDRS;
}
if (agent_args->node_count == 0) {
if (job_ptr->details->expanding_jobid == 0) {
error("%s: %pJ allocated no nodes to be killed on",
__func__, job_ptr);
}
hostlist_destroy(agent_args->hostlist);
xfree(agent_args);
return;
}
kill_job = create_kill_job_msg(job_ptr, agent_args->protocol_version);
kill_job->nodes = xstrdup(job_ptr->nodes);
agent_args->msg_args = kill_job;
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_args);
}
/* Record accounting information for a job immediately before changing size */
extern void job_pre_resize_acctg(job_record_t *job_ptr)
{
job_state_set_flag(job_ptr, JOB_RESIZING);
job_ptr->resize_time = time(NULL);
/* NOTE: job_completion_logger() calls
* acct_policy_remove_job_submit() */
job_completion_logger(job_ptr, false);
/* This doesn't happen in job_completion_logger, but gets
* added back in with job_post_resize_acctg so remove it here. */
acct_policy_job_fini(job_ptr, false);
/* NOTE: The RESIZING flag needs to be cleared with
 * job_post_resize_acctg() */
}
/* Record accounting information for a job immediately after changing size */
extern void job_post_resize_acctg(job_record_t *job_ptr)
{
/*
 * NOTE: The RESIZING flag must have been set by
 * job_pre_resize_acctg(); the assert is here to make sure we
 * code it that way.
 */
xassert(IS_JOB_RESIZING(job_ptr));
acct_policy_add_job_submit(job_ptr, false);
/* job_set_alloc_tres() must be called before acct_policy_job_begin() */
job_set_alloc_tres(job_ptr, false);
/*
 * Clear out the old request and replace it with the new alloc.
 * This probably isn't totally perfect in all situations, but it
 * makes tres_req_* correct enough for the user. The tres_req_*
 * values aren't used to make any decisions; they are stored in
 * the database, but only as a reference for non-pending jobs,
 * which will always be the case here.
 */
memcpy(job_ptr->tres_req_cnt, job_ptr->tres_alloc_cnt,
slurmctld_tres_cnt * sizeof(uint64_t));
xfree(job_ptr->tres_req_str);
job_ptr->tres_req_str = xstrdup(job_ptr->tres_alloc_str);
xfree(job_ptr->tres_fmt_req_str);
job_ptr->tres_fmt_req_str = xstrdup(job_ptr->tres_fmt_alloc_str);
acct_policy_job_begin(job_ptr, false);
resv_replace_update(job_ptr);
/*
* Get new sluid now that we are basically a new job.
*/
job_record_set_sluid(job_ptr);
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
job_state_unset_flag(job_ptr, JOB_RESIZING);
/*
* Reset the end_time_exp that was probably set to NO_VAL when
* ending the job on the resize. If using the
* priority/multifactor plugin if the end_time_exp is NO_VAL
* it will not run again for the job.
*/
job_ptr->end_time_exp = job_ptr->end_time;
}
/*
* validate_jobs_on_node - validate that any jobs that should be on the node
* are actually running; if not, clean up the job records and/or node
* records.
* IN slurm_msg - contains the node registration message
*/
extern void validate_jobs_on_node(slurm_msg_t *slurm_msg)
{
int i, jobs_on_node;
node_record_t *node_ptr;
job_record_t *job_ptr;
step_record_t *step_ptr;
time_t now = time(NULL);
slurm_node_registration_status_msg_t *reg_msg = slurm_msg->data;
node_ptr = find_node_record(reg_msg->node_name);
if (node_ptr == NULL) {
error("slurmd registered on unknown node %s",
reg_msg->node_name);
return;
}
/*
* Set protocol_version now because abort_job_on_node() needs to know
* the node's correct version. validate_node_specs() sets it but that's
* too late.
*/
node_ptr->protocol_version = slurm_msg->protocol_version;
if (reg_msg->energy)
memcpy(node_ptr->energy, reg_msg->energy,
sizeof(acct_gather_energy_t));
if (node_ptr->up_time > reg_msg->up_time) {
verbose("Node %s rebooted %u secs ago",
reg_msg->node_name, reg_msg->up_time);
}
if (reg_msg->up_time <= now) {
node_ptr->up_time = reg_msg->up_time;
node_ptr->boot_time = now - reg_msg->up_time;
node_ptr->slurmd_start_time = reg_msg->slurmd_start_time;
} else {
error("Node up_time is invalid: %u>%u", reg_msg->up_time,
(uint32_t) now);
}
if (waiting_for_node_boot(node_ptr) ||
waiting_for_node_power_down(node_ptr))
return;
/* Check that jobs running are really supposed to be there */
for (i = 0; i < reg_msg->job_count; i++) {
if ( (reg_msg->step_id[i].job_id >= MIN_NOALLOC_JOBID) &&
(reg_msg->step_id[i].job_id <= MAX_NOALLOC_JOBID) ) {
info("NoAllocate %ps reported on node %s",
&reg_msg->step_id[i], reg_msg->node_name);
continue;
}
job_ptr = find_job_record(reg_msg->step_id[i].job_id);
if (job_ptr == NULL) {
error("Orphan %ps reported on node %s",
&reg_msg->step_id[i],
reg_msg->node_name);
abort_job_on_node(reg_msg->step_id[i].job_id,
job_ptr, node_ptr->name);
}
else if (IS_JOB_RUNNING(job_ptr) ||
IS_JOB_SUSPENDED(job_ptr)) {
if (bit_test(job_ptr->node_bitmap, node_ptr->index)) {
if ((job_ptr->batch_flag) &&
(node_ptr->index == bit_ffs(
job_ptr->node_bitmap))) {
/* NOTE: Used for purging defunct
* batch jobs */
job_ptr->time_last_active = now;
}
step_ptr = find_step_record(job_ptr,
&reg_msg->
step_id[i]);
if (step_ptr)
step_ptr->time_last_active = now;
debug3("Registered %pS on node %s",
step_ptr, reg_msg->node_name);
} else {
/* Typically indicates a job requeue and
 * restart on other nodes. A node from the
 * original allocation just responded here. */
error("Registered %pJ %ps on wrong node %s",
job_ptr,
&reg_msg->step_id[i],
reg_msg->node_name);
info("%s: job nodes %s count %d inx %d",
__func__, job_ptr->nodes,
job_ptr->node_cnt, node_ptr->index);
abort_job_on_node(reg_msg->step_id[i].job_id,
job_ptr,
node_ptr->name);
}
}
else if (IS_JOB_COMPLETING(job_ptr)) {
/*
* Re-send kill request as needed,
* not necessarily an error
*/
kill_job_on_node(job_ptr, node_ptr);
}
else if (IS_JOB_PENDING(job_ptr)) {
/* Typically indicates a job requeue and the hung
* slurmd that went DOWN is now responding */
error("Registered PENDING %pJ %ps on node %s",
job_ptr,
&reg_msg->step_id[i],
reg_msg->node_name);
abort_job_on_node(reg_msg->step_id[i].job_id,
job_ptr, node_ptr->name);
} else if (difftime(now, job_ptr->end_time) <
slurm_conf.msg_timeout) {
/* Race condition */
debug("Registered newly completed %pJ %ps on %s",
job_ptr,
&reg_msg->step_id[i],
node_ptr->name);
}
else { /* else job is supposed to be done */
error("Registered %pJ %ps in state %s on node %s",
job_ptr,
&reg_msg->step_id[i],
job_state_string(job_ptr->job_state),
reg_msg->node_name);
kill_job_on_node(job_ptr, node_ptr);
}
}
jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
if (jobs_on_node)
_purge_missing_jobs(node_ptr->index, now);
if (jobs_on_node != reg_msg->job_count) {
/* slurmd will not know of a job unless the job has
 * steps active at registration time, so this is not
 * an error condition; slurmd is also reporting steps
 * rather than jobs */
debug3("resetting job_count on node %s from %u to %d",
reg_msg->node_name, reg_msg->job_count, jobs_on_node);
reg_msg->job_count = jobs_on_node;
}
}
static int _foreach_notify_srun_missing_step(void *x, void *arg)
{
step_record_t *step_ptr = x;
foreach_purge_missing_jobs_t *foreach_purge_missing_jobs = arg;
job_record_t *job_ptr = foreach_purge_missing_jobs->job_ptr;
time_t node_boot_time = foreach_purge_missing_jobs->node_boot_time;
int node_inx = foreach_purge_missing_jobs->node_inx;
time_t now = foreach_purge_missing_jobs->now;
char *node_name = node_record_table_ptr[node_inx]->name;
if ((step_ptr->step_id.step_id == SLURM_EXTERN_CONT) ||
(step_ptr->step_id.step_id == SLURM_BATCH_SCRIPT) ||
(step_ptr->state != JOB_RUNNING))
return 0;
if (!bit_test(step_ptr->step_node_bitmap, node_inx))
return 0;
if (step_ptr->time_last_active >= now) {
/* Back up timer in case more than one node
* registration happens at this same time.
* We don't want this node's registration
* to count toward a different node's
* registration message. */
step_ptr->time_last_active = now - 1;
} else if (step_ptr->host && step_ptr->port) {
/* srun may be able to verify step exists on
* this node using I/O sockets and kill the
* job as needed */
srun_step_missing(step_ptr, node_name);
} else if ((step_ptr->start_time < node_boot_time) &&
!(step_ptr->flags & SSF_NO_KILL)) {
/* There is a risk that the job step's tasks completed
* on this node before its reboot, but that should be
* very rare and there is no srun to work with (POE) */
info("Node %s rebooted, killing missing step %u.%u",
node_name, job_ptr->job_id, step_ptr->step_id.step_id);
signal_step_tasks_on_node(node_name, step_ptr, SIGKILL,
REQUEST_TERMINATE_TASKS);
}
return 0;
}
static int _foreach_purge_missing_jobs(void *x, void *arg)
{
job_record_t *job_ptr = x;
foreach_purge_missing_jobs_t *foreach_purge_missing_jobs = arg;
time_t startup_time = foreach_purge_missing_jobs->batch_startup_time;
if ((IS_JOB_CONFIGURING(job_ptr) ||
(!IS_JOB_RUNNING(job_ptr) &&
!IS_JOB_SUSPENDED(job_ptr))) ||
(!bit_test(job_ptr->node_bitmap,
foreach_purge_missing_jobs->node_inx)))
return 0;
if (job_ptr->batch_flag &&
foreach_purge_missing_jobs->power_save_on &&
(job_ptr->start_time <
foreach_purge_missing_jobs->node_boot_time))
startup_time -= slurm_conf.resume_timeout;
if (job_ptr->batch_flag &&
!job_ptr->het_job_offset &&
(job_ptr->time_last_active < startup_time) &&
(job_ptr->start_time < startup_time) &&
(foreach_purge_missing_jobs->node_ptr ==
find_node_record(job_ptr->batch_host))) {
bool requeue = false;
char *requeue_msg = "";
if (job_ptr->details && job_ptr->details->requeue) {
requeue = true;
requeue_msg = ", Requeuing job";
}
info("Batch %pJ missing from batch node %s (not found BatchStartTime after startup)%s",
job_ptr, job_ptr->batch_host, requeue_msg);
xfree(job_ptr->failed_node);
job_ptr->failed_node = xstrdup(job_ptr->batch_host);
job_complete(job_ptr->job_id, slurm_conf.slurm_user_id,
requeue, true, 1);
} else {
foreach_purge_missing_jobs->job_ptr = job_ptr;
(void) list_for_each(job_ptr->step_list,
_foreach_notify_srun_missing_step,
foreach_purge_missing_jobs);
}
return 0;
}
/* Purge any batch job that should have its script running on node
* node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds
* for startup.
*
* Purge all job steps that were started before the node was last booted.
*
* Also notify srun if any job steps should be active on this node
* but are not found. */
static void _purge_missing_jobs(int node_inx, time_t now)
{
static bool power_save_on = false;
static time_t sched_update = 0;
foreach_purge_missing_jobs_t foreach_purge_missing_jobs = {
.node_inx = node_inx,
.node_ptr = node_record_table_ptr[node_inx],
.now = now,
};
if (sched_update != slurm_conf.last_update) {
power_save_on = power_save_test();
sched_update = slurm_conf.last_update;
}
foreach_purge_missing_jobs.power_save_on = power_save_on;
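/*
 * node_boot_time stays zero unless the node reports a boot
 * time large enough to subtract the message timeout slack.
 */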
if (foreach_purge_missing_jobs.node_ptr->boot_time >
(slurm_conf.msg_timeout + 5)) {
/* allow for message timeout and other delays */
foreach_purge_missing_jobs.node_boot_time =
foreach_purge_missing_jobs.node_ptr->boot_time -
(slurm_conf.msg_timeout + 5);
}
foreach_purge_missing_jobs.batch_startup_time =
now - slurm_conf.batch_start_timeout -
MIN(DEFAULT_MSG_TIMEOUT, slurm_conf.msg_timeout);
(void) list_for_each(job_list, _foreach_purge_missing_jobs,
&foreach_purge_missing_jobs);
}
/*
* abort_job_on_node - Kill the specific job_id on a specific node,
* the request is not processed immediately, but queued.
* This is to prevent a flood of pthreads if slurmctld restarts
* without saved state and slurmd daemons register with a
* multitude of running jobs. Slurmctld will not recognize
* these jobs and uses this function to kill them - one
* agent request per node as they register.
* IN job_id - id of the job to be killed
* IN job_ptr - pointer to terminating job (NULL if unknown, e.g. job reported
* by slurmd on some node, but job records already purged from
* slurmctld)
* IN node_name - name of the node on which the job resides
*/
extern void abort_job_on_node(uint32_t job_id, job_record_t *job_ptr,
char *node_name)
{
agent_arg_t *agent_info;
kill_job_msg_t *kill_req;
agent_info = xmalloc(sizeof(agent_arg_t));
agent_info->node_count = 1;
agent_info->retry = 0;
agent_info->hostlist = hostlist_create(node_name);
node_record_t *node_ptr;
if ((node_ptr = find_node_record(node_name)))
agent_info->protocol_version = node_ptr->protocol_version;
if (job_ptr)
debug("Aborting %pJ on node %s", job_ptr, node_name);
else
debug("Aborting JobId=%u on node %s", job_id, node_name);
if (job_ptr) { /* NULL if unknown */
kill_req = create_kill_job_msg(job_ptr,
agent_info->protocol_version);
} else {
kill_req = xmalloc(sizeof(*kill_req));
kill_req->step_id.job_id = job_id;
kill_req->step_id.step_id = NO_VAL;
kill_req->step_id.step_het_comp = NO_VAL;
kill_req->time = time(NULL);
/* kill_req->start_time = 0; Default value */
}
kill_req->nodes = xstrdup(node_name);
agent_info->msg_type = REQUEST_ABORT_JOB;
agent_info->msg_args = kill_req;
set_agent_arg_r_uid(agent_info, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_info);
}
/*
* abort_job_on_nodes - Kill the specific job on the specific nodes,
* the request is not processed immediately, but queued.
* This is to prevent a flood of pthreads if slurmctld restarts
* without saved state and slurmd daemons register with a
* multitude of running jobs. Slurmctld will not recognize
* these jobs and uses this function to kill them - one
* agent request per node as they register.
* IN job_ptr - pointer to terminating job
* IN node_bitmap - bitmap of the nodes on which the job resides
*/
extern void abort_job_on_nodes(job_record_t *job_ptr,
bitstr_t *node_bitmap)
{
bitstr_t *full_node_bitmap, *tmp_node_bitmap;
node_record_t *node_ptr;
int zero = 0;
agent_arg_t *agent_info;
kill_job_msg_t *kill_req;
uint16_t protocol_version;
xassert(node_bitmap);
/* Send a separate message for nodes at different protocol_versions */
full_node_bitmap = bit_copy(node_bitmap);
while ((node_ptr = next_node_bitmap(full_node_bitmap, &zero))) {
protocol_version = node_ptr->protocol_version;
tmp_node_bitmap = bit_alloc(bit_size(node_bitmap));
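/*
 * Move every remaining node that shares this protocol version
 * from full_node_bitmap into tmp_node_bitmap.
 */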
for (int i = 0;
(node_ptr = next_node_bitmap(full_node_bitmap, &i)); i++) {
if (node_ptr->protocol_version != protocol_version)
continue;
bit_clear(full_node_bitmap, i);
bit_set(tmp_node_bitmap, i);
}
kill_req = create_kill_job_msg(job_ptr, protocol_version);
kill_req->nodes = bitmap2node_name_sortable(tmp_node_bitmap,
false);
agent_info = xmalloc(sizeof(agent_arg_t));
agent_info->node_count = bit_set_count(tmp_node_bitmap);
agent_info->retry = 1;
agent_info->hostlist = hostlist_create(kill_req->nodes);
debug("Aborting %pJ on nodes %s", job_ptr, kill_req->nodes);
agent_info->msg_type = REQUEST_ABORT_JOB;
agent_info->msg_args = kill_req;
agent_info->protocol_version = protocol_version;
set_agent_arg_r_uid(agent_info, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_info);
FREE_NULL_BITMAP(tmp_node_bitmap);
}
FREE_NULL_BITMAP(full_node_bitmap);
}
/*
* kill_job_on_node - Kill the specific job on a specific node.
* IN job_ptr - pointer to terminating job (must not be NULL)
* IN node_ptr - pointer to the node on which the job resides
*/
extern void kill_job_on_node(job_record_t *job_ptr,
node_record_t *node_ptr)
{
agent_arg_t *agent_info;
kill_job_msg_t *kill_req;
agent_info = xmalloc(sizeof(agent_arg_t));
agent_info->node_count = 1;
agent_info->retry = 0;
agent_info->protocol_version = node_ptr->protocol_version;
agent_info->hostlist = hostlist_create(node_ptr->name);
debug("Killing %pJ on node %s", job_ptr, node_ptr->name);
kill_req = create_kill_job_msg(job_ptr, agent_info->protocol_version);
kill_req->nodes = xstrdup(node_ptr->name);
agent_info->msg_type = REQUEST_TERMINATE_JOB;
agent_info->msg_args = kill_req;
set_agent_arg_r_uid(agent_info, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_info);
}
static int _foreach_job_all_finished(void *x, void *arg)
{
job_record_t *het_job = x;
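/* Any unfinished component is a match for list_find_first() */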
if (!IS_JOB_FINISHED(het_job))
return -1;
return 0;
}
/*
* Return true if this job is complete (including all elements of a hetjob)
*/
static bool _job_all_finished(job_record_t *job_ptr)
{
if (!IS_JOB_FINISHED(job_ptr))
return false;
if (job_ptr->het_job_list &&
list_find_first(job_ptr->het_job_list,
_foreach_job_all_finished,
NULL))
return false;
return true;
}
/*
* job_alloc_info_ptr - get details about an existing job allocation
* IN uid - uid of user issuing the request
* IN job_ptr - pointer to job record
* NOTE: See job_alloc_info() if job pointer not known
*/
extern int job_alloc_info_ptr(uint32_t uid, job_record_t *job_ptr)
{
uint8_t prolog = 0;
if ((slurm_conf.private_data & PRIVATE_DATA_JOBS) &&
(job_ptr->user_id != uid) && !validate_operator(uid) &&
(((slurm_mcs_get_privatedata() == 0) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account, false)) ||
((slurm_mcs_get_privatedata() == 1) &&
(mcs_g_check_mcs_label(uid, job_ptr->mcs_label, false) != 0))))
return ESLURM_ACCESS_DENIED;
if (IS_JOB_PENDING(job_ptr))
return ESLURM_JOB_PENDING;
if (_job_all_finished(job_ptr))
return ESLURM_ALREADY_DONE;
if (job_ptr->details)
prolog = job_ptr->details->prolog_running;
if (job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") &&
(prolog == 0) && job_ptr->node_bitmap &&
(bit_overlap_any(power_down_node_bitmap,
job_ptr->node_bitmap) == 0)) {
last_job_update = time(NULL);
set_job_alias_list(job_ptr);
}
return SLURM_SUCCESS;
}
/*
* job_alloc_info - get details about an existing job allocation
* IN uid - uid of user issuing the request
* IN job_id - ID of job for which info is requested
* OUT job_pptr - set to pointer to job record
* NOTE: See job_alloc_info_ptr() if job pointer is known
*/
extern int job_alloc_info(uint32_t uid, uint32_t job_id,
job_record_t **job_pptr)
{
job_record_t *job_ptr;
job_ptr = find_job_record(job_id);
if (job_ptr == NULL)
return ESLURM_INVALID_JOB_ID;
if (job_pptr)
*job_pptr = job_ptr;
return job_alloc_info_ptr(uid, job_ptr);
}
/*
* If we can't find the job_id we remove the defunct files. If we do find it we
* set HAS_STATE_DIR.
*/
static void _sync_job_with_batch_dir(uint32_t job_id)
{
job_record_t *job_ptr = find_job_record(job_id);
if (job_ptr) {
job_ptr->bit_flags |= HAS_STATE_DIR;
if (job_ptr->array_recs) { /* Update all tasks */
uint32_t array_job_id = job_ptr->array_job_id;
job_ptr = job_array_hash_j[JOB_HASH_INX(array_job_id)];
while (job_ptr) {
if (job_ptr->array_job_id == array_job_id)
job_ptr->bit_flags |= HAS_STATE_DIR;
job_ptr = job_ptr->job_array_next_j;
}
}
} else {
info("Purged files for defunct batch JobId=%u", job_id);
delete_job_desc_files(job_id);
}
}
/*
* Synchronize the batch jobs in the system with their files.
* All pending batch jobs must have script and environment files;
* no other jobs should have such files.
*/
int sync_job_files(void)
{
xassert(verify_lock(CONF_LOCK, READ_LOCK));
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
if (!slurmctld_primary) /* Don't purge files from backup slurmctld */
return SLURM_SUCCESS;
list_for_each(job_list, _clear_state_dir_flag, NULL);
_validate_job_files();
list_for_each(job_list, _test_state_dir_flag, NULL);
return SLURM_SUCCESS;
}
static void _validate_job_files(void)
{
DIR *f_dir, *h_dir;
struct dirent *dir_ent, *hash_ent;
uint32_t job_id;
char *endptr;
xassert(verify_lock(CONF_LOCK, READ_LOCK));
xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
xassert(slurm_conf.state_save_location);
f_dir = opendir(slurm_conf.state_save_location);
if (!f_dir) {
error("opendir(%s): %m", slurm_conf.state_save_location);
return;
}
while ((dir_ent = readdir(f_dir))) {
if (!xstrncmp("hash.#", dir_ent->d_name, 5)) {
char *h_path = NULL;
xstrfmtcat(h_path, "%s/%s",
slurm_conf.state_save_location,
dir_ent->d_name);
h_dir = opendir(h_path);
xfree(h_path);
if (!h_dir)
continue;
while ((hash_ent = readdir(h_dir))) {
if (xstrncmp("job.#", hash_ent->d_name, 4))
continue;
job_id = strtoul(&hash_ent->d_name[4],
&endptr, 10);
if ((job_id == 0) || (endptr[0] != '\0'))
continue;
debug3("Found batch directory for JobId=%u",
job_id);
_sync_job_with_batch_dir(job_id);
}
closedir(h_dir);
}
}
closedir(f_dir);
}
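/*
* Layout note (a sketch of the on-disk structure implied by the scan
* above; the exact hash function is an implementation detail): batch
* script and environment files live under per-hash subdirectories of
* StateSaveLocation, e.g. <StateSaveLocation>/hash.3/job.1203/, where
* the hash digit is typically derived from the job ID. Only entries
* matching the "hash." and "job." prefixes are examined here.
*/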
static int _clear_state_dir_flag(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *) x;
job_ptr->bit_flags &= ~HAS_STATE_DIR;
return 0;
}
static int _test_state_dir_flag(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *) x;
if (job_ptr->bit_flags & HAS_STATE_DIR) {
job_ptr->bit_flags &= ~HAS_STATE_DIR;
return 0;
}
if (!job_ptr->batch_flag || !IS_JOB_PENDING(job_ptr) ||
(job_ptr->het_job_offset > 0))
return 0; /* No files expected */
error("Script for %pJ lost, state set to FAILED", job_ptr);
job_state_set(job_ptr, JOB_FAILED);
job_ptr->exit_code = 1;
job_ptr->state_reason = FAIL_SYSTEM;
xfree(job_ptr->state_desc);
job_ptr->start_time = job_ptr->end_time = time(NULL);
job_completion_logger(job_ptr, false);
return 0;
}
/* Get requested gres but only if mem_per_gres was set for that gres */
static int _get_req_gres(void *x, void *arg)
{
gres_state_t *gres_state_job = x;
gres_job_state_t *gres_js_out = arg;
gres_job_state_t *gres_js = gres_state_job->gres_data;
/*
* This assumes that only one gres name has mem_per_gres in the job.
* This won't work if two different gres names (for example, "gpu" and
* "license") both have mem_per_gres. Right now we only allow
* mem_per_gres for GPU so this works.
*/
if (!gres_js->mem_per_gres)
return SLURM_SUCCESS;
/*
* In theory MAX(mem_per_gres) shouldn't matter because we should only
* allow one gres name to have mem_per_gres and it should be the same
* for all types (e.g., gpu:k80 vs gpu:tesla) of that same gres (gpu).
*/
gres_js_out->mem_per_gres = MAX(gres_js_out->mem_per_gres,
gres_js->mem_per_gres);
gres_js_out->gres_per_job += gres_js->gres_per_job;
gres_js_out->gres_per_node += gres_js->gres_per_node;
gres_js_out->gres_per_socket += gres_js->gres_per_socket;
gres_js_out->gres_per_task += gres_js->gres_per_task;
return SLURM_SUCCESS;
}
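/*
* Worked example of the fold above (hypothetical request): two typed
* entries of the same gres, gpu:k80:2 and gpu:tesla:2, each carrying
* mem_per_gres=8192, collapse into mem_per_gres=8192 (the MAX) and
* gres_per_node=4 (the sum), which job_get_tres_mem() below uses to
* estimate the job's memory.
*/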
extern uint64_t job_get_tres_mem(struct job_resources *job_res,
uint64_t pn_min_memory, uint32_t cpu_cnt,
uint32_t node_cnt, part_record_t *part_ptr,
list_t *gres_list, bool user_set_mem,
uint16_t min_sockets_per_node,
uint32_t num_tasks)
{
uint64_t mem_total = 0;
int i;
if (job_res) {
for (i = 0; i < job_res->nhosts; i++) {
mem_total += job_res->memory_allocated[i];
}
return mem_total;
}
if (pn_min_memory == NO_VAL64)
return mem_total;
if (!user_set_mem && gres_list && running_cons_tres()) {
/* mem_per_[cpu|node] not set, check if mem_per_gres was set */
gres_job_state_t gres_js;
gres_state_t *gres_state_job;
uint32_t gpu_plugin_id;
memset(&gres_js, 0, sizeof(gres_js));
list_for_each(gres_list, _get_req_gres, &gres_js);
if (gres_js.mem_per_gres) {
/* Default node_cnt to 1 if not given */
if (node_cnt == NO_VAL)
node_cnt = 1;
/* Estimate requested gres per job */
if (gres_js.gres_per_job)
return gres_js.mem_per_gres *
gres_js.gres_per_job;
if (gres_js.gres_per_node)
return gres_js.mem_per_gres *
gres_js.gres_per_node * node_cnt;
if (gres_js.gres_per_socket) {
if (min_sockets_per_node &&
(min_sockets_per_node != NO_VAL16))
return gres_js.mem_per_gres *
gres_js.gres_per_socket *
node_cnt * min_sockets_per_node;
else
return gres_js.mem_per_gres *
gres_js.gres_per_socket *
node_cnt;
}
if (gres_js.gres_per_task) {
if (num_tasks && (num_tasks != NO_VAL))
return gres_js.mem_per_gres *
gres_js.gres_per_task *
num_tasks;
else
return gres_js.mem_per_gres *
gres_js.gres_per_task;
}
/*
* mem_per_gres set but no gres requested.
* We shouldn't get here.
*/
return 0;
}
/*
* If no mem_per_gres was explicitly set,
* fall back to DefMemPerGPU.
*/
gpu_plugin_id = gres_get_gpu_plugin_id();
gres_state_job = list_find_first(
gres_list, gres_find_id, &gpu_plugin_id);
if (gres_state_job) {
gres_job_state_t *gres_js_gpu =
gres_state_job->gres_data;
mem_total = NO_VAL64;
if (part_ptr && part_ptr->job_defaults_list) {
mem_total = slurm_get_def_mem_per_gpu(
part_ptr->job_defaults_list);
}
if ((mem_total == NO_VAL64) &&
slurm_conf.job_defaults_list) {
mem_total = slurm_get_def_mem_per_gpu(
slurm_conf.job_defaults_list);
}
if (mem_total != NO_VAL64) {
mem_total = mem_total * gres_js_gpu->total_gres;
return mem_total;
}
}
}
if (pn_min_memory == 0)
pn_min_memory = _mem_per_node_part(part_ptr);
if (pn_min_memory & MEM_PER_CPU) {
if (cpu_cnt != NO_VAL) {
mem_total = pn_min_memory & (~MEM_PER_CPU);
mem_total *= cpu_cnt;
}
} else if (node_cnt != NO_VAL)
mem_total = pn_min_memory * node_cnt;
return mem_total;
}
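/*
* Worked example (hypothetical values): with no explicit --mem and a
* request carrying mem_per_gres=4096 and gres_per_node=2 across
* node_cnt=3, the estimate above yields 4096 * 2 * 3 = 24576 MiB.
* If instead pn_min_memory has MEM_PER_CPU set with a base of 2048
* and cpu_cnt=16, the fallback path returns 2048 * 16 = 32768 MiB.
*/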
/*
* job_epilog_complete - Note the completion of the epilog script for a
* given job
* IN job_id - id of the job for which the epilog was executed
* IN node_name - name of the node on which the epilog was executed
* IN return_code - return code from epilog script
* RET true if job is COMPLETED, otherwise false
*/
extern bool job_epilog_complete(uint32_t job_id, char *node_name,
uint32_t return_code)
{
job_record_t *job_ptr = find_job_record(job_id);
node_record_t *node_ptr;
if (job_ptr == NULL) {
debug("%s: unable to find JobId=%u for node=%s with return_code=%u.",
__func__, job_id, node_name, return_code);
return true;
}
log_flag(TRACE_JOBS, "%s: enter %pJ", __func__, job_ptr);
/*
* There is a potential race condition this handles.
* If slurmctld cold-starts while slurmd keeps running, slurmd could
* notify slurmctld of a job epilog completion before getting synced
* up with slurmctld state. If a new job arrives and the job_id is
* reused, we could try to note the termination of a job that hasn't
* really started. Very rare obviously.
*/
if ((IS_JOB_PENDING(job_ptr) && (!IS_JOB_COMPLETING(job_ptr))) ||
((!job_ptr->node_bitmap_cg) && (!IS_JOB_COMPLETING(job_ptr))) ||
(job_ptr->node_bitmap == NULL)) {
uint32_t base_state = NODE_STATE_UNKNOWN;
node_ptr = find_node_record(node_name);
if (node_ptr)
base_state = node_ptr->node_state & NODE_STATE_BASE;
if (base_state == NODE_STATE_DOWN) {
debug("%s: %pJ complete response from DOWN node %s",
__func__, job_ptr, node_name);
} else if (job_ptr->restart_cnt) {
/*
* Duplicate epilog complete can be due to race
*/
debug("%s: %pJ duplicate epilog complete response",
__func__, job_ptr);
} else {
error("%s: %pJ is non-running slurmctld and slurmd out of sync",
__func__, job_ptr);
}
return false;
}
if (return_code) {
error("%s: %pJ epilog error on %s, draining the node",
__func__, job_ptr, node_name);
drain_nodes(node_name, "Epilog error",
slurm_conf.slurm_user_id);
}
/* Change job from completing to completed */
node_ptr = find_node_record(node_name);
if (node_ptr)
make_node_idle(node_ptr, job_ptr);
/* nodes_completing is out of date, rebuild when next saved */
xfree(job_ptr->nodes_completing);
if (!IS_JOB_COMPLETING(job_ptr)) { /* COMPLETED */
batch_requeue_fini(job_ptr);
return true;
} else
return false;
}
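/*
* Example of the return semantics: for a job allocated three nodes,
* the first two epilog completions leave the job COMPLETING and
* return false; after the last node reports in and make_node_idle()
* clears the completing state, the call returns true and the caller
* may treat the job as COMPLETED.
*/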
/* Complete the batch job requeue logic after all steps complete so that
* subsequent jobs appear in a separate accounting record. */
void batch_requeue_fini(job_record_t *job_ptr)
{
if (IS_JOB_COMPLETING(job_ptr) ||
!IS_JOB_PENDING(job_ptr) || !job_ptr->batch_flag)
return;
info("Requeuing %pJ", job_ptr);
/* Clear everything so this appears to be a new job and then restart
* it in accounting. */
job_ptr->start_time = 0;
job_ptr->end_time_exp = job_ptr->end_time = 0;
job_ptr->total_cpus = 0;
job_ptr->pre_sus_time = 0;
job_ptr->preempt_time = 0;
job_ptr->suspend_time = 0;
job_ptr->tot_sus_time = 0;
job_ptr->next_step_id = 0;
job_ptr->state_reason_prev_db = 0;
job_ptr->node_cnt = 0;
job_ptr->total_nodes = 0;
xfree(job_ptr->alias_list);
xfree(job_ptr->batch_host);
free_job_resources(&job_ptr->job_resrcs);
FREE_NULL_LIST(job_ptr->license_list);
xfree(job_ptr->licenses_allocated);
xfree(job_ptr->nodes);
xfree(job_ptr->node_addrs);
xfree(job_ptr->nodes_completing);
xfree(job_ptr->failed_node);
FREE_NULL_BITMAP(job_ptr->node_bitmap);
FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
FREE_NULL_LIST(job_ptr->gres_list_alloc);
job_resv_clear_magnetic_flag(job_ptr);
if (job_ptr->details) {
time_t now = time(NULL);
/* The time stamp on the new batch launch credential must be
* larger than the time stamp on the revoke request. Also the
* I/O must be all cleared out, the named socket purged and
* the job credential purged by slurmd. */
if (job_ptr->details->begin_time <= now) {
int cred_lifetime = DEFAULT_EXPIRATION_WINDOW;
time_t begin_time;
cred_lifetime = cred_expiration();
begin_time = now + cred_lifetime + 1;
if ((job_ptr->bit_flags & CRON_JOB) &&
job_ptr->details->crontab_entry) {
begin_time = calc_next_cron_start(
job_ptr->details->crontab_entry,
begin_time);
} else if (job_ptr->bit_flags & CRON_JOB) {
/*
* Skip requeuing this instead of crashing.
*/
error("Missing cron details for %pJ. This should never happen. Clearing CRON_JOB flag and skipping requeue.",
job_ptr);
job_ptr->bit_flags &= ~CRON_JOB;
}
job_ptr->details->begin_time = begin_time;
}
/* Since this could happen on a launch, make sure the new submit
 * time differs from the last one; use now + 1 so we get distinct
 * records in the database */
if (now == job_ptr->details->submit_time)
now++;
job_ptr->details->submit_time = now;
/* clear the accrue flag */
job_ptr->bit_flags &= ~JOB_ACCRUE_OVER;
job_ptr->details->accrue_time = 0;
if ((job_ptr->details->whole_node & WHOLE_NODE_REQUIRED) &&
job_ptr->gres_list_req) {
job_details_t *detail_ptr = job_ptr->details;
multi_core_data_t *mc_ptr = detail_ptr->mc_ptr;
gres_job_state_validate_t gres_js_val = {
.cpus_per_tres = job_ptr->cpus_per_tres,
.mem_per_tres = job_ptr->mem_per_tres,
.tres_freq = job_ptr->tres_freq,
.tres_per_job = job_ptr->tres_per_job,
.tres_per_node = job_ptr->tres_per_node,
.tres_per_socket = job_ptr->tres_per_socket,
.tres_per_task = job_ptr->tres_per_task,
.cpus_per_task =
&detail_ptr->orig_cpus_per_task,
.max_nodes = &detail_ptr->max_nodes,
.min_cpus = &detail_ptr->min_cpus,
.min_nodes = &detail_ptr->min_nodes,
.ntasks_per_node = &detail_ptr->ntasks_per_node,
.ntasks_per_socket = &mc_ptr->ntasks_per_socket,
.ntasks_per_tres = &detail_ptr->ntasks_per_tres,
.num_tasks = &detail_ptr->num_tasks,
.sockets_per_node = &mc_ptr->sockets_per_node,
.gres_list = &job_ptr->gres_list_req,
};
/*
* We need to reset the gres_list to what was requested
* instead of what was given exclusively.
*/
FREE_NULL_LIST(job_ptr->gres_list_req);
(void)gres_job_state_validate(&gres_js_val);
}
}
/* Reset the priority (begin and accrue times were reset) */
if (job_ptr->priority != 0)
set_job_prio(job_ptr);
/*
* If a reservation ended and was a repeated (e.g., daily, weekly)
* reservation, its ID will be different; make sure
* job->resv_id matches the reservation id.
*/
if (job_ptr->resv_ptr)
job_ptr->resv_id = job_ptr->resv_ptr->resv_id;
/* Reset this after the batch step has finished or the batch step
* information will be attributed to the next run of the job. */
job_record_set_sluid(job_ptr);
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
/* Submit new sibling jobs for fed jobs */
if (fed_mgr_is_origin_job(job_ptr)) {
if (fed_mgr_job_requeue(job_ptr)) {
error("failed to submit requeued sibling jobs for fed %pJ",
job_ptr);
}
}
}
/* job_fini - free all memory associated with job records */
void job_fini (void)
{
FREE_NULL_LIST(job_list);
xfree(job_hash);
xfree(job_array_hash_j);
xfree(job_array_hash_t);
FREE_NULL_LIST(purge_jobs_list);
FREE_NULL_LIST(purge_files_list);
FREE_NULL_BITMAP(requeue_exit);
FREE_NULL_BITMAP(requeue_exit_hold);
}
/* Record the start of one job array task */
extern void job_array_start(job_record_t *job_ptr)
{
job_record_t *base_job_ptr;
if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
base_job_ptr = find_job_record(job_ptr->array_job_id);
if (base_job_ptr && base_job_ptr->array_recs) {
base_job_ptr->array_recs->tot_run_tasks++;
}
}
}
/* Return true if a job array task can be started */
extern bool job_array_start_test(job_record_t *job_ptr)
{
job_record_t *base_job_ptr;
time_t now = time(NULL);
if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
base_job_ptr = find_job_record(job_ptr->array_job_id);
if (base_job_ptr && base_job_ptr->array_recs &&
(base_job_ptr->array_recs->max_run_tasks != 0) &&
(base_job_ptr->array_recs->tot_run_tasks >=
base_job_ptr->array_recs->max_run_tasks)) {
if (job_ptr->details &&
(job_ptr->details->begin_time <= now))
job_ptr->details->begin_time = (time_t) 0;
xfree(job_ptr->state_desc);
job_ptr->state_reason = WAIT_ARRAY_TASK_LIMIT;
return false;
}
}
return true;
}
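/*
* Example: an array submitted with a task throttle such as
* "--array=0-99%5" records max_run_tasks=5 in its array_recs. Once
* five tasks are running, the test above fails for further tasks and
* they wait with state reason WAIT_ARRAY_TASK_LIMIT.
*/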
static void _job_array_comp(job_record_t *job_ptr, bool was_running,
bool requeue)
{
job_record_t *base_job_ptr;
uint32_t status;
if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
status = job_ptr->exit_code;
if ((status == 0) && !IS_JOB_COMPLETE(job_ptr)) {
/* Avoid max_exit_code == 0 if task did not run to
* successful completion (e.g. Cancelled, NodeFail) */
status = 9;
}
base_job_ptr = find_job_record(job_ptr->array_job_id);
if (base_job_ptr && base_job_ptr->array_recs) {
if (requeue) {
base_job_ptr->array_recs->array_flags |=
ARRAY_TASK_REQUEUED;
} else if (!base_job_ptr->array_recs->tot_comp_tasks) {
base_job_ptr->array_recs->min_exit_code =
status;
base_job_ptr->array_recs->max_exit_code =
status;
} else {
base_job_ptr->array_recs->min_exit_code =
MIN(status, base_job_ptr->
array_recs->min_exit_code);
base_job_ptr->array_recs->max_exit_code =
MAX(status, base_job_ptr->
array_recs->max_exit_code);
}
if (was_running &&
base_job_ptr->array_recs->tot_run_tasks)
base_job_ptr->array_recs->tot_run_tasks--;
base_job_ptr->array_recs->tot_comp_tasks++;
}
}
}
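/*
* Example of the aggregation above: array tasks exiting 0 and 1 plus
* one cancelled task (recorded as status 9) leave the base record
* with min_exit_code=0 and max_exit_code=9, and tot_comp_tasks is
* incremented once per completed task.
*/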
/* log the completion of the specified job */
extern void job_completion_logger(job_record_t *job_ptr, bool requeue)
{
int base_state;
bool arr_finished = false, task_failed = false, task_requeued = false;
bool was_running = false;
job_record_t *master_job = NULL;
uint32_t max_exit_code = 0;
xassert(job_ptr);
if (job_ptr->resv_ports)
resv_port_job_free(job_ptr);
acct_policy_remove_job_submit(job_ptr, false);
if (job_ptr->nodes && ((job_ptr->bit_flags & JOB_KILL_HURRY) == 0)
&& !IS_JOB_RESIZING(job_ptr)) {
(void) bb_g_job_start_stage_out(job_ptr);
} else if (job_ptr->nodes && IS_JOB_RESIZING(job_ptr)) {
debug("%s: %pJ resizing, skipping bb stage_out",
__func__, job_ptr);
} else {
/*
* Never allocated compute nodes.
* Unless job ran, there is no data to stage-out
*/
(void) bb_g_job_cancel(job_ptr);
}
if (job_ptr->bit_flags & JOB_WAS_RUNNING) {
job_ptr->bit_flags &= ~JOB_WAS_RUNNING;
was_running = true;
}
_job_array_comp(job_ptr, was_running, requeue);
if (!IS_JOB_RESIZING(job_ptr) &&
(!IS_JOB_PENDING(job_ptr) || requeue) &&
!IS_JOB_REVOKED(job_ptr) &&
((job_ptr->array_task_id == NO_VAL) ||
(job_ptr->mail_type & MAIL_ARRAY_TASKS) ||
(arr_finished = test_job_array_finished(job_ptr->array_job_id)))) {
/* Remove configuring state just to make sure it isn't there
* since it will throw off displays of the job. */
job_state_unset_flag(job_ptr, JOB_CONFIGURING);
/* make sure all parts of the job are notified
* Fed Jobs: only signal the srun from where the job is running
* or from the origin if the job wasn't running. */
if (!job_ptr->fed_details ||
fed_mgr_job_is_self_owned(job_ptr) ||
(fed_mgr_is_origin_job(job_ptr) &&
!fed_mgr_job_is_locked(job_ptr)))
srun_job_complete(job_ptr);
/* mail out notifications of completion */
if (arr_finished) {
/* We need to summarize different tasks states. */
master_job = find_job_record(job_ptr->array_job_id);
if (master_job && master_job->array_recs) {
task_requeued =
(master_job->array_recs->array_flags &
ARRAY_TASK_REQUEUED);
if (task_requeued &&
(job_ptr->mail_type & MAIL_JOB_REQUEUE)) {
/*
* At least 1 task requeued and job
* req. to be notified on requeues.
*/
mail_job_info(master_job,
MAIL_JOB_REQUEUE);
}
max_exit_code =
master_job->array_recs->max_exit_code;
task_failed = (WIFEXITED(max_exit_code) &&
WEXITSTATUS(max_exit_code));
if (task_failed &&
(job_ptr->mail_type & MAIL_JOB_FAIL)) {
/*
* At least 1 task failed and job
* req. to be notified on failures.
*/
mail_job_info(master_job,
MAIL_JOB_FAIL);
} else if (job_ptr->mail_type & MAIL_JOB_END) {
/*
* Job req. to be notified on END.
*/
mail_job_info(job_ptr, MAIL_JOB_END);
}
}
} else {
base_state = job_ptr->job_state & JOB_STATE_BASE;
if ((job_ptr->mail_type & MAIL_JOB_FAIL) &&
(base_state >= JOB_FAILED) &&
((base_state != JOB_PREEMPTED) || !requeue))
mail_job_info(job_ptr, MAIL_JOB_FAIL);
else if ((job_ptr->mail_type & MAIL_JOB_END) &&
(base_state >= JOB_COMPLETE))
mail_job_info(job_ptr, MAIL_JOB_END);
if (requeue &&
(job_ptr->mail_type & MAIL_JOB_REQUEUE))
mail_job_info(job_ptr,
MAIL_JOB_REQUEUE);
}
}
if (!(job_ptr->bit_flags & TRES_STR_CALC) &&
job_ptr->tres_alloc_cnt &&
(job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
assoc_mgr_set_job_tres_alloc_str(job_ptr, false);
jobcomp_g_record_job_end(job_ptr);
jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
}
/*
* job_independent - determine if this job is waiting on a pending
* dependency or if the job's scheduled begin time is in the future
* IN job_ptr - pointer to job being tested
* RET - true if job no longer must be deferred for another job
*/
extern bool job_independent(job_record_t *job_ptr)
{
job_details_t *detail_ptr = job_ptr->details;
time_t now = time(NULL);
int depend_rc;
if ((job_ptr->state_reason == FAIL_BURST_BUFFER_OP) ||
(job_ptr->state_reason == FAIL_ACCOUNT) ||
(job_ptr->state_reason == FAIL_QOS) ||
(job_ptr->state_reason == WAIT_HELD) ||
(job_ptr->state_reason == WAIT_HELD_USER) ||
(job_ptr->state_reason == WAIT_MAX_REQUEUE) ||
(job_ptr->state_reason == WAIT_RESV_DELETED) ||
(job_ptr->state_reason == WAIT_RESV_INVALID) ||
(job_ptr->state_reason == WAIT_DEP_INVALID))
return false;
/* Test dependencies first so we can cancel jobs before dependent
* job records get purged (e.g. afterok, afternotok) */
depend_rc = test_job_dependency(job_ptr, NULL);
if ((depend_rc == LOCAL_DEPEND) || (depend_rc == REMOTE_DEPEND)) {
/* start_time has passed but still has dependency which
* makes it ineligible */
if (detail_ptr->begin_time < now)
detail_ptr->begin_time = 0;
job_ptr->state_reason = WAIT_DEPENDENCY;
xfree(job_ptr->state_desc);
return false;
} else if (depend_rc == FAIL_DEPEND) {
handle_invalid_dependency(job_ptr);
return false;
}
/* Job is eligible to start now */
if (job_ptr->state_reason == WAIT_DEPENDENCY) {
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
/* Submit the job to its siblings. */
if (job_ptr->details) {
fed_mgr_job_requeue(job_ptr);
}
}
/* Check for maximum number of running tasks in a job array */
if (!job_array_start_test(job_ptr))
return false;
if (detail_ptr && (detail_ptr->begin_time > now)) {
job_ptr->state_reason = WAIT_TIME;
xfree(job_ptr->state_desc);
return false; /* not yet time */
}
if (job_test_resv_now(job_ptr) != SLURM_SUCCESS) {
job_ptr->state_reason = WAIT_RESERVATION;
xfree(job_ptr->state_desc);
return false; /* not yet time */
}
if ((detail_ptr && (detail_ptr->begin_time == 0) &&
(job_ptr->priority != 0))) {
detail_ptr->begin_time = now;
/*
* Send begin time to the database if it is already there, or it
* won't get there until the job starts.
*/
if (IS_JOB_IN_DB(job_ptr))
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
} else if (job_ptr->state_reason == WAIT_TIME) {
job_ptr->state_reason = WAIT_NO_REASON;
xfree(job_ptr->state_desc);
}
return true;
}
/*
* determine if job is ready to execute per the node select plugin
* IN job_id - job to test
* OUT ready - 1 if job is ready to execute, 0 otherwise
* RET Slurm error code
*/
extern int job_node_ready(uint32_t job_id, int *ready)
{
int rc;
job_record_t *job_ptr;
xassert(ready);
*ready = 0;
job_ptr = find_job_record(job_id);
if (job_ptr == NULL)
return ESLURM_INVALID_JOB_ID;
/*
* If the job is configuring, the node might be booting, or a script
* such as PrologSlurmctld is running; delay job launch until these
* are finished.
*/
if (IS_JOB_CONFIGURING(job_ptr))
return EAGAIN;
/* Always call select_g_job_ready() so that select/bluegene can
* test and update block state information. */
rc = select_g_job_ready(job_ptr);
if (rc == READY_JOB_FATAL)
return ESLURM_INVALID_PARTITION_NAME;
if (rc == READY_JOB_ERROR)
return EAGAIN;
if (rc)
rc = READY_NODE_STATE;
if (job_ptr->details && !job_ptr->details->prolog_running)
rc |= READY_PROLOG_STATE;
if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
rc |= READY_JOB_STATE;
if ((rc == (READY_NODE_STATE | READY_JOB_STATE | READY_PROLOG_STATE)) &&
job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") &&
job_ptr->node_bitmap &&
(bit_overlap_any(power_down_node_bitmap,
job_ptr->node_bitmap) == 0)) {
last_job_update = time(NULL);
set_job_alias_list(job_ptr);
}
*ready = rc;
return SLURM_SUCCESS;
}
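/*
* Note on the result: *ready is a bitmask, not a boolean. A job is
* fully launchable only when READY_NODE_STATE, READY_PROLOG_STATE
* and READY_JOB_STATE are all set, which is also the condition used
* above before resolving a "TBD" alias list.
*/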
/* Send specified signal to all steps associated with a job */
static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags)
{
node_record_t *node_ptr;
agent_arg_t *agent_args = NULL;
signal_tasks_msg_t *signal_job_msg = NULL;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_SIGNAL_TASKS;
agent_args->retry = 1;
agent_args->hostlist = hostlist_create(NULL);
signal_job_msg = xmalloc(sizeof(signal_tasks_msg_t));
signal_job_msg->step_id.job_id = job_ptr->job_id;
/*
* We don't ever want to kill a step with this message. The flags below
* will make sure that doesn't happen. Just in case though, set the
* step_id to an impossible number.
*/
signal_job_msg->step_id.step_id = slurm_conf.max_step_cnt + 1;
signal_job_msg->step_id.step_het_comp = NO_VAL;
/*
* Encode the flags so slurmstepd knows which steps get signaled.
* If we aren't signaling the full job, we only want to signal
* all other (non-batch) steps.
*/
if ((flags & KILL_FULL_JOB) ||
(flags & KILL_JOB_BATCH) ||
(flags & KILL_STEPS_ONLY))
signal_job_msg->flags = flags;
else
signal_job_msg->flags = KILL_STEPS_ONLY;
signal_job_msg->signal = signal;
agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
if (agent_args->protocol_version > node_ptr->protocol_version)
agent_args->protocol_version =
node_ptr->protocol_version;
hostlist_push_host(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
if (PACK_FANOUT_ADDRS(node_ptr))
agent_args->msg_flags |= SLURM_PACK_ADDRS;
}
if (agent_args->node_count == 0) {
xfree(signal_job_msg);
xfree(agent_args);
return;
}
agent_args->msg_args = signal_job_msg;
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_args);
}
/* Send suspend request to slurmd on all nodes associated with a job
* job_ptr IN - job to be suspended or resumed
* op IN - SUSPEND_JOB or RESUME_JOB
*/
static void _suspend_job(job_record_t *job_ptr, uint16_t op)
{
node_record_t *node_ptr;
agent_arg_t *agent_args;
suspend_int_msg_t *sus_ptr;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_SUSPEND_INT;
agent_args->retry = 0; /* don't resend, gang scheduler can
* quickly induce huge backlog
* of agent.c RPCs */
agent_args->hostlist = hostlist_create(NULL);
sus_ptr = xmalloc(sizeof(suspend_int_msg_t));
sus_ptr->job_id = job_ptr->job_id;
sus_ptr->op = op;
agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
if (agent_args->protocol_version > node_ptr->protocol_version)
agent_args->protocol_version =
node_ptr->protocol_version;
hostlist_push_host(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
if (PACK_FANOUT_ADDRS(node_ptr))
agent_args->msg_flags |= SLURM_PACK_ADDRS;
}
if (agent_args->node_count == 0) {
slurm_free_suspend_int_msg(sus_ptr);
xfree(agent_args);
return;
}
agent_args->msg_args = sus_ptr;
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
agent_queue_request(agent_args);
}
/*
* Specified job is being suspended, release allocated nodes
* job_ptr IN - job to be suspended
* indf_susp IN - set if job is being suspended indefinitely by user
* or admin, otherwise suspended for gang scheduling
*/
static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp)
{
int rc = SLURM_SUCCESS;
node_record_t *node_ptr;
uint32_t node_flags;
time_t now = time(NULL);
if ((rc = select_g_job_suspend(job_ptr, indf_susp)) != SLURM_SUCCESS)
return rc;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
node_ptr->sus_job_cnt++;
if (node_ptr->run_job_cnt)
(node_ptr->run_job_cnt)--;
else {
error("%s: %pJ node %s run_job_cnt underflow",
__func__, job_ptr, node_ptr->name);
}
if (job_ptr->details && (job_ptr->details->share_res == 0)) {
if (node_ptr->no_share_job_cnt)
(node_ptr->no_share_job_cnt)--;
else {
error("%s: %pJ node %s no_share_job_cnt underflow",
__func__, job_ptr, node_ptr->name);
}
if (node_ptr->no_share_job_cnt == 0)
bit_set(share_node_bitmap, i);
}
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
if ((node_ptr->run_job_cnt == 0) &&
(node_ptr->comp_job_cnt == 0)) {
bit_set(idle_node_bitmap, i);
}
if (IS_NODE_DOWN(node_ptr)) {
debug3("%s: %pJ node %s left DOWN",
__func__, job_ptr, node_ptr->name);
} else if (node_ptr->run_job_cnt) {
node_ptr->node_state =
NODE_STATE_ALLOCATED | node_flags;
} else {
node_ptr->node_state = NODE_STATE_IDLE | node_flags;
node_ptr->last_busy = now;
}
}
last_job_update = last_node_update = now;
return rc;
}
/*
* Specified job is being resumed, re-allocate the nodes
* job_ptr IN - job to be resumed
* indf_susp IN - set if job is being resumed from indefinite suspend by user
* or admin, otherwise resume from gang scheduling
*/
static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp)
{
int rc = SLURM_SUCCESS;
node_record_t *node_ptr;
uint32_t node_flags;
if ((rc = select_g_job_resume(job_ptr, indf_susp)) != SLURM_SUCCESS)
return rc;
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
if (IS_NODE_DOWN(node_ptr))
return SLURM_ERROR;
}
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
if (node_ptr->sus_job_cnt)
(node_ptr->sus_job_cnt)--;
else {
error("Node %s sus_job_cnt underflow",
node_ptr->name);
}
node_ptr->run_job_cnt++;
if (job_ptr->details &&
(job_ptr->details->share_res == 0)) {
node_ptr->no_share_job_cnt++;
if (node_ptr->no_share_job_cnt)
bit_clear(share_node_bitmap, i);
}
if (slurm_mcs_get_select(job_ptr) == 1) {
xfree(node_ptr->mcs_label);
node_ptr->mcs_label = xstrdup(job_ptr->mcs_label);
}
bit_clear(idle_node_bitmap, i);
node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
}
last_job_update = last_node_update = time(NULL);
return rc;
}
static int _foreach_job_resume_test(void *x, void *arg)
{
job_record_t *test_job_ptr = x;
job_record_t *job_ptr = arg;
if (test_job_ptr->details &&
(test_job_ptr->details->core_spec != NO_VAL16) &&
IS_JOB_RUNNING(test_job_ptr) &&
test_job_ptr->node_bitmap &&
bit_overlap_any(test_job_ptr->node_bitmap, job_ptr->node_bitmap)) {
return -1;
}
/* FIXME: Also test for ESLURM_INTERCONNECT_BUSY */
return 0;
}
/*
* Determine if a job can be resumed.
* Check for multiple jobs on the same nodes with core specialization.
* RET 0 on success, otherwise ESLURM error code
*/
static int _job_resume_test(job_record_t *job_ptr)
{
int rc = SLURM_SUCCESS;
if ((job_ptr->details == NULL) ||
(job_ptr->details->core_spec == NO_VAL16) ||
(job_ptr->node_bitmap == NULL))
return rc;
if (list_find_first(job_list, _foreach_job_resume_test, job_ptr))
rc = ESLURM_NODES_BUSY;
return rc;
}
/*
* _job_suspend_op - perform some suspend/resume operation on a job
* op IN - operation: suspend/resume
* indf_susp IN - set if job is being suspended indefinitely by user or admin
* and we should clear its priority, otherwise suspended
* temporarily for gang scheduling
* RET 0 on success, otherwise ESLURM error code
*/
static int _job_suspend_op(job_record_t *job_ptr, uint16_t op, bool indf_susp)
{
int rc = SLURM_SUCCESS;
time_t now = time(NULL);
if (IS_JOB_PENDING(job_ptr))
return ESLURM_JOB_PENDING;
if (IS_JOB_FINISHED(job_ptr))
return ESLURM_ALREADY_DONE;
if ((op == RESUME_JOB) && (rc = _job_resume_test(job_ptr)))
return rc;
/* perform the operation */
if (op == SUSPEND_JOB) {
if (IS_JOB_SUSPENDED(job_ptr) && indf_susp) {
debug("%s: Holding %pJ, re-suspend operation",
__func__, job_ptr);
job_ptr->priority = 0; /* Prevent gang sched resume */
return SLURM_SUCCESS;
}
if (!IS_JOB_RUNNING(job_ptr))
return ESLURM_JOB_NOT_RUNNING;
rc = _suspend_job_nodes(job_ptr, indf_susp);
if (rc != SLURM_SUCCESS)
return rc;
_suspend_job(job_ptr, op);
job_state_set(job_ptr, JOB_SUSPENDED);
if (indf_susp) { /* Job being manually suspended, not gang */
debug("%s: Holding %pJ, suspend operation",
__func__, job_ptr);
job_ptr->priority = 0;
(void) gs_job_fini(job_ptr);
}
if (job_ptr->suspend_time) {
job_ptr->pre_sus_time +=
difftime(now, job_ptr->suspend_time);
} else {
job_ptr->pre_sus_time +=
difftime(now, job_ptr->start_time);
}
suspend_job_step(job_ptr);
} else if (op == RESUME_JOB) {
if (!IS_JOB_SUSPENDED(job_ptr))
return ESLURM_JOB_NOT_SUSPENDED;
rc = _resume_job_nodes(job_ptr, indf_susp);
if (rc != SLURM_SUCCESS)
return rc;
_suspend_job(job_ptr, op);
if (job_ptr->priority == 0) {
/* Job was manually suspended, not gang */
set_job_prio(job_ptr);
(void) gs_job_start(job_ptr);
}
job_state_set(job_ptr, JOB_RUNNING);
job_ptr->tot_sus_time +=
difftime(now, job_ptr->suspend_time);
if ((job_ptr->time_limit != INFINITE) &&
(!job_ptr->preempt_time)) {
debug3("%pJ resumed, updating end_time", job_ptr);
job_ptr->end_time_exp = job_ptr->end_time =
now + (job_ptr->time_limit * 60)
- job_ptr->pre_sus_time;
}
resume_job_step(job_ptr);
}
job_ptr->time_last_active = now;
job_ptr->suspend_time = now;
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
return rc;
}
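/*
* Worked example of the time accounting above (hypothetical times):
* a job with time_limit=60 minutes runs for 20 minutes, is suspended
* (pre_sus_time grows by 1200 seconds), then resumes at time t. Its
* new end_time becomes t + 3600 - 1200, i.e. the 40 minutes of wall
* time it had not yet consumed.
*/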
static int _foreach_hetjob_suspend(void *x, void *arg)
{
job_record_t *het_job = x;
foreach_sus_hetjob_t *sus_hetjob = arg;
int rc = SLURM_SUCCESS;
if (sus_hetjob->het_leader->het_job_id != het_job->het_job_id) {
error("%s: Bad het_job_list for %pJ",
__func__, sus_hetjob->het_leader);
return 0;
}
rc = _job_suspend_op(het_job, sus_hetjob->op, sus_hetjob->indf_susp);
if (rc != SLURM_SUCCESS)
sus_hetjob->rc = rc;
return 0;
}
/*
* _job_suspend - perform some suspend/resume operation, if the specified
* job records is a hetjob leader, perform the operation on all
* components of the hetjob
* job_ptr - job to operate upon
* op IN - operation: suspend/resume
* indf_susp IN - set if job is being suspended indefinitely by user or admin
* and we should clear its priority, otherwise suspended
* temporarily for gang scheduling
* RET 0 on success, otherwise ESLURM error code
*/
static int _job_suspend(job_record_t *job_ptr, uint16_t op, bool indf_susp)
{
int rc = SLURM_SUCCESS;
if (job_ptr->het_job_id && !job_ptr->het_job_list)
return ESLURM_NOT_WHOLE_HET_JOB;
/* Notify salloc/srun of suspend/resume */
srun_job_suspend(job_ptr, op);
if (job_ptr->het_job_list) {
foreach_sus_hetjob_t sus_hetjob = {
.het_leader = job_ptr,
.indf_susp = indf_susp,
.op = op,
.rc = SLURM_SUCCESS,
};
(void) list_for_each(job_ptr->het_job_list,
_foreach_hetjob_suspend,
&sus_hetjob);
rc = sus_hetjob.rc;
} else {
rc = _job_suspend_op(job_ptr, op, indf_susp);
}
return rc;
}
/*
* job_suspend - perform some suspend/resume operation
* NOTE: job_suspend - Uses the job_id field and ignores job_id_str
*
* IN msg - original msg
* IN sus_ptr - suspend/resume request message
* IN uid - user id of the user issuing the RPC
* indf_susp IN - set if job is being suspended indefinitely by user or admin
* and we should clear its priority, otherwise suspended
* temporarily for gang scheduling
* IN protocol_version - slurm protocol version of client
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_suspend(slurm_msg_t *msg, suspend_msg_t *sus_ptr, uid_t uid,
bool indf_susp, uint16_t protocol_version)
{
int rc = SLURM_SUCCESS;
job_record_t *job_ptr = NULL;
xfree(sus_ptr->job_id_str);
xstrfmtcat(sus_ptr->job_id_str, "%u", sus_ptr->job_id);
/* find the job */
job_ptr = find_job_record (sus_ptr->job_id);
if (job_ptr == NULL) {
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
/* validate the request */
if (!validate_operator(uid) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account, false)) {
error("SECURITY VIOLATION: Attempt to suspend job from user %u",
uid);
rc = ESLURM_ACCESS_DENIED;
goto reply;
}
rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
reply:
/* Since we have already used it, let's make sure we don't leak
* memory */
xfree(sus_ptr->job_id_str);
if (msg)
slurm_send_rc_msg(msg, rc);
return rc;
}
/*
* job_suspend2 - perform some suspend/resume operation
* NOTE: job_suspend2 - Ignores the job_id field and uses job_id_str
*
* IN msg - original msg
* IN sus_ptr - suspend/resume request message
* IN uid - user id of the user issuing the RPC
* indf_susp IN - set if job is being suspended indefinitely by user or admin
* and we should clear its priority, otherwise suspended
* temporarily for gang scheduling
* IN protocol_version - slurm protocol version of client
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_suspend2(slurm_msg_t *msg, suspend_msg_t *sus_ptr, uid_t uid,
bool indf_susp, uint16_t protocol_version)
{
int rc = SLURM_SUCCESS, rc2;
job_record_t *job_ptr = NULL;
long int long_id;
uint32_t job_id = 0;
char *end_ptr = NULL;
bitstr_t *array_bitmap = NULL;
resp_array_struct_t *resp_array = NULL;
if (max_array_size == NO_VAL) {
max_array_size = slurm_conf.max_array_sz;
}
long_id = strtol(sus_ptr->job_id_str, &end_ptr, 10);
if (end_ptr[0] == '+')
rc = ESLURM_NOT_WHOLE_HET_JOB;
else if ((long_id <= 0) || (long_id == LONG_MAX) ||
((end_ptr[0] != '\0') && (end_ptr[0] != '_')))
rc = ESLURM_INVALID_JOB_ID;
else {
job_id = (uint32_t) long_id;
job_ptr = find_job_record(job_id);
if (job_ptr == NULL)
rc = ESLURM_INVALID_JOB_ID;
}
if (rc != SLURM_SUCCESS) {
info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str);
goto reply;
}
/* validate the request */
if (!validate_operator(uid) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account, false)) {
error("SECURITY VIOLATION: Attempt to suspend job from user %u",
uid);
rc = ESLURM_ACCESS_DENIED;
goto reply;
}
if (end_ptr[0] == '\0') { /* Single job (or full job array) */
job_record_t *job_ptr_done = NULL;
if (job_ptr &&
(((job_ptr->array_task_id == NO_VAL) &&
(job_ptr->array_recs == NULL)) ||
((job_ptr->array_task_id != NO_VAL) &&
(job_ptr->array_job_id != job_id)))) {
/* This is a regular job or single task of job array */
rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
goto reply;
}
if (job_ptr && job_ptr->array_recs) {
/* This is a job array */
job_ptr_done = job_ptr;
rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
_resp_array_add(&resp_array, job_ptr, rc2, NULL);
}
/* Suspend all tasks of this job array */
job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
if (!job_ptr && !job_ptr_done) {
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
while (job_ptr) {
if ((job_ptr->array_job_id == job_id) &&
(job_ptr != job_ptr_done)) {
rc2 = _job_suspend(job_ptr, sus_ptr->op,
indf_susp);
_resp_array_add(&resp_array, job_ptr, rc2,
NULL);
}
job_ptr = job_ptr->job_array_next_j;
}
goto reply;
}
array_bitmap = slurm_array_str2bitmap(end_ptr + 1, max_array_size,
NULL);
if (!array_bitmap) {
info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
for (int i = 0; (i = bit_ffs_from_bit(array_bitmap, i)) >= 0; i++) {
job_ptr = find_job_array_rec(job_id, i);
if (job_ptr == NULL) {
info("%s: invalid JobId=%u_%d", __func__, job_id, i);
_resp_array_add_id(&resp_array, job_id, i,
ESLURM_INVALID_JOB_ID);
continue;
}
rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp);
_resp_array_add(&resp_array, job_ptr, rc2, NULL);
}
reply:
if (resp_array) {
job_array_resp_msg_t *resp_array_msg =
_resp_array_xlate(resp_array, job_id);
(void) send_msg_response(msg, RESPONSE_JOB_ARRAY_ERRORS,
resp_array_msg);
slurm_free_job_array_resp(resp_array_msg);
} else
slurm_send_rc_msg(msg, rc);
_resp_array_free(resp_array);
FREE_NULL_BITMAP(array_bitmap);
return rc;
}
/*
* _job_requeue_op - Requeue a running or pending batch job
* IN uid - user id of user issuing the RPC
* IN job_ptr - job to be requeued
* IN preempt - true if job being preempted
* RET 0 on success, otherwise ESLURM error code
*/
static int _job_requeue_op(uid_t uid, job_record_t *job_ptr, bool preempt,
uint32_t flags)
{
static time_t config_update = 0;
static bool requeue_nohold_prolog = true;
bool is_running = false, is_suspended = false, is_completed = false;
bool is_completing = false;
bool force_requeue = false;
time_t now = time(NULL);
uint32_t completing_flags = 0;
if (config_update != slurm_conf.last_update) {
requeue_nohold_prolog = (xstrcasestr(slurm_conf.sched_params,
"nohold_on_prolog_fail"));
config_update = slurm_conf.last_update;
}
/* validate the request */
if ((uid != job_ptr->user_id) && !validate_operator(uid) &&
!assoc_mgr_is_user_acct_coord(acct_db_conn, uid,
job_ptr->account, false)) {
return ESLURM_ACCESS_DENIED;
}
if (((flags & JOB_STATE_BASE) == JOB_RUNNING) &&
!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) {
return SLURM_SUCCESS;
}
if (flags & JOB_RECONFIG_FAIL)
node_features_g_get_node(job_ptr->nodes);
/*
* If the partition was removed, don't allow the job to be
* requeued. If the job has no details, something is very wrong.
* And if the job doesn't want to be requeued, don't requeue it
* unless it's being forced to after a launch failure.
*/
if ((flags & JOB_LAUNCH_FAILED) &&
(slurm_conf.prolog_flags & PROLOG_FLAG_FORCE_REQUEUE_ON_FAIL))
force_requeue = true;
if (!job_ptr->part_ptr || !job_ptr->details
|| (!job_ptr->details->requeue && !force_requeue)) {
if (flags & JOB_RECONFIG_FAIL)
(void) _job_fail(job_ptr, JOB_BOOT_FAIL);
return ESLURM_DISABLED;
}
if (job_ptr->batch_flag == 0) {
debug("Job-requeue can only be done for batch jobs");
if (flags & JOB_RECONFIG_FAIL)
(void) _job_fail(job_ptr, JOB_BOOT_FAIL);
return ESLURM_BATCH_ONLY;
}
/*
* If the job is already pending, just return an error.
* A federated origin job can be pending and revoked with a sibling job
* on another cluster.
*/
if (IS_JOB_PENDING(job_ptr) &&
(!job_ptr->fed_details || !job_ptr->fed_details->cluster_lock))
return ESLURM_JOB_PENDING;
if ((flags & JOB_RECONFIG_FAIL) && IS_JOB_CANCELLED(job_ptr)) {
/*
* Job was cancelled (likely by the user) while node
* reconfiguration was in progress, so don't requeue it
* if the node reconfiguration failed.
*/
return ESLURM_DISABLED;
}
if (job_ptr->fed_details) {
int rc;
if ((rc = fed_mgr_job_requeue_test(job_ptr, flags)))
return rc;
/* Sent requeue request to origin cluster */
if (job_ptr->job_state & JOB_REQUEUE_FED)
return SLURM_SUCCESS;
}
last_job_update = now;
/*
* If the job is in the process of completing, return
* SLURM_SUCCESS and set the state to JOB_PENDING, since we
* support requeue of done/exiting jobs.
*/
if (IS_JOB_COMPLETING(job_ptr)) {
completing_flags = job_ptr->job_state & JOB_STATE_FLAGS;
is_completing = true;
}
if (IS_JOB_SUSPENDED(job_ptr)) {
uint32_t suspend_job_state = job_ptr->job_state;
/*
* we can't have it as suspended when we call the
* accounting stuff.
*/
job_state_set(job_ptr, JOB_REQUEUE);
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
job_state_set(job_ptr, suspend_job_state);
is_suspended = true;
}
job_ptr->time_last_active = now;
if (is_suspended)
job_ptr->end_time = job_ptr->suspend_time;
else if (!is_completing)
job_ptr->end_time = now;
/*
* Save the state of the job so that we
* deallocate the nodes if it is in a
* running state.
*/
if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr))
is_running = true;
else if (IS_JOB_COMPLETED(job_ptr))
is_completed = true;
/* Only change state to requeue for local jobs */
if (fed_mgr_is_origin_job(job_ptr) &&
!fed_mgr_is_tracker_only_job(job_ptr)) {
/*
* We want this job to have the requeued/preempted state in the
* accounting logs. Set a new submit time so the restarted
* job looks like a new job.
*/
if (preempt) {
job_state_set(job_ptr, JOB_PREEMPTED);
build_cg_bitmap(job_ptr);
job_completion_logger(job_ptr, true);
job_state_set(job_ptr, JOB_REQUEUE);
} else {
job_state_set(job_ptr, JOB_REQUEUE);
build_cg_bitmap(job_ptr);
job_completion_logger(job_ptr, true);
}
}
/*
* Increment restart counter before completing reply so that completing
* jobs get counted and so that fed jobs get counted before submitting
* new siblings in batch_requeue_fini()
*/
job_ptr->restart_cnt++;
if (is_completing) {
job_state_set(job_ptr, (JOB_PENDING | completing_flags));
goto reply;
}
/*
* Deallocate resources only if the job has some.
* JOB_COMPLETING is needed to properly clean up steps.
*/
if (is_running) {
job_state_set_flag(job_ptr, JOB_COMPLETING);
deallocate_nodes(job_ptr, false, is_suspended, preempt);
if (!IS_JOB_COMPLETING(job_ptr) && !job_ptr->fed_details)
is_completed = true;
else
job_state_unset_flag(job_ptr, JOB_COMPLETING);
}
_set_requeued_job_pending_completing(job_ptr);
/*
* Mark the origin job as requeuing. Will finish requeuing fed job
* after job has completed.
* If it's completed, batch_requeue_fini is called below and will call
* fed_mgr_job_requeue() to submit new siblings.
* If it's not completed, batch_requeue_fini will either be called when
* the running origin job finishes or the running remote sibling job
* reports that the job is finished.
*/
if (job_ptr->fed_details && !is_completed) {
job_state_set_flag(job_ptr, (JOB_COMPLETING | JOB_REQUEUE_FED));
}
/*
* If we (not the user) set the time limit, reset it here or we
* could bust some limit when we try again
*/
if (job_ptr->limit_set.time == 1) {
job_ptr->time_limit = NO_VAL;
job_ptr->limit_set.time = 0;
}
reply:
job_ptr->pre_sus_time = (time_t) 0;
job_ptr->suspend_time = (time_t) 0;
job_ptr->tot_sus_time = (time_t) 0;
job_ptr->db_flags = 0;
/* clear signal sent flag on requeue */
job_ptr->warn_flags &= ~WARN_SENT;
/*
* Since the job completion logger removes the submit we need
* to add it again.
*/
acct_policy_add_job_submit(job_ptr, false);
acct_policy_update_pending_job(job_ptr);
if (flags & JOB_SPECIAL_EXIT) {
job_state_set_flag(job_ptr, JOB_SPECIAL_EXIT);
job_ptr->state_reason = WAIT_HELD_USER;
xfree(job_ptr->state_desc);
job_ptr->state_desc =
xstrdup("job requeued in special exit state");
debug("%s: Holding %pJ, special exit", __func__, job_ptr);
job_ptr->priority = 0;
}
if (flags & JOB_REQUEUE_HOLD) {
job_ptr->state_reason = WAIT_HELD_USER;
xfree(job_ptr->state_desc);
job_ptr->state_desc = xstrdup("job requeued in held state");
debug("%s: Holding %pJ, requeue-hold exit", __func__, job_ptr);
job_ptr->priority = 0;
}
if (flags & JOB_LAUNCH_FAILED) {
job_ptr->batch_flag++;
_handle_requeue_limit(job_ptr, __func__);
/* If job not already held, make it so if needed. */
if (!(job_ptr->job_state & JOB_REQUEUE_HOLD) &&
((!requeue_nohold_prolog || (flags & JOB_GETENV_FAILED)))) {
job_ptr->state_reason = WAIT_HELD_USER;
xfree(job_ptr->state_desc);
if (flags & JOB_GETENV_FAILED) {
job_ptr->state_desc =
xstrdup("user env retrieval failed requeued held");
debug("%s: Holding %pJ due to user environment retrieval failure or timeout",
__func__, job_ptr);
} else {
job_ptr->state_desc =
xstrdup("launch failed requeued held");
debug("%s: Holding %pJ due to prolog failure",
__func__, job_ptr);
}
job_ptr->priority = 0;
}
}
/*
* When jobs are requeued while running/completing batch_requeue_fini is
* called after the job is completely finished. If the job is already
* finished it needs to be called to clear out states (especially the
* db_index or we will just write over the last job in the database).
* Call batch_requeue_fini after setting priority to 0 for requeue_hold
* and special_exit so federation doesn't submit siblings for held job.
*/
if (is_completed)
batch_requeue_fini(job_ptr);
debug("%s: %pJ state 0x%x reason %u priority %d",
__func__, job_ptr, job_ptr->job_state,
job_ptr->state_reason, job_ptr->priority);
return SLURM_SUCCESS;
}
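/*
* Usage note (assuming standard tooling): the JOB_REQUEUE_HOLD and
* JOB_SPECIAL_EXIT flags handled above correspond to operations such
* as "scontrol requeuehold <jobid>" and requeuing into the special
* exit state; either way the job is left held (priority 0) with
* reason WAIT_HELD_USER until explicitly released.
*/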
static int _foreach_hetjob_requeue(void *x, void *arg)
{
job_record_t *het_job = x;
foreach_requeue_hetjob_t *requeue_hetjob = arg;
int rc;
if (requeue_hetjob->het_leader->het_job_id != het_job->het_job_id) {
error("%s: Bad het_job_list for %pJ",
__func__, requeue_hetjob->het_leader);
return 0;
}
rc = _job_requeue_op(requeue_hetjob->uid,
het_job,
requeue_hetjob->preempt,
requeue_hetjob->flags);
if (rc != SLURM_SUCCESS)
requeue_hetjob->rc = rc;
return 0;
}
/*
* _job_requeue - Requeue a running or pending batch job, if the specified
* job records is a hetjob leader, perform the operation on all
* components of the hetjob
* IN uid - user id of user issuing the RPC
* IN job_ptr - job to be requeued
* IN preempt - true if job being preempted
* RET 0 on success, otherwise ESLURM error code
*/
static int _job_requeue(uid_t uid, job_record_t *job_ptr, bool preempt,
uint32_t flags)
{
int rc = SLURM_SUCCESS;
if (job_ptr->het_job_id && !job_ptr->het_job_list)
return ESLURM_NOT_HET_JOB_LEADER;
if (job_ptr->het_job_list) {
foreach_requeue_hetjob_t requeue_hetjob = {
.flags = flags,
.het_leader = job_ptr,
.preempt = preempt,
.rc = SLURM_SUCCESS,
.uid = uid,
};
(void) list_for_each(job_ptr->het_job_list,
_foreach_hetjob_requeue,
&requeue_hetjob);
rc = requeue_hetjob.rc;
} else {
rc = _job_requeue_op(uid, job_ptr, preempt, flags);
}
return rc;
}
/*
* job_requeue - Requeue a running or pending batch job
* IN uid - user id of user issuing the RPC
* IN job_id - id of the job to be requeued
* IN msg - slurm_msg to send response back on
* IN preempt - true if job being preempted
* IN flags - JobExitRequeue | Hold | JobFailed | etc.
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_requeue(uid_t uid, uint32_t job_id, slurm_msg_t *msg,
bool preempt, uint32_t flags)
{
int rc = SLURM_SUCCESS;
job_record_t *job_ptr = NULL;
/* find the job */
job_ptr = find_job_record(job_id);
if (job_ptr == NULL) {
rc = ESLURM_INVALID_JOB_ID;
} else {
/* _job_requeue already handles het jobs */
rc = _job_requeue(uid, job_ptr, preempt, flags);
}
if (msg) {
slurm_send_rc_msg(msg, rc);
}
return rc;
}
/*
* job_requeue2 - Requeue a running or pending batch job
* IN uid - user id of user issuing the RPC
* IN req_ptr - request including ID of the job to be requeued
* IN msg - slurm_msg to send response back on
* IN preempt - true if job being preempted
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, slurm_msg_t *msg,
bool preempt)
{
int rc = SLURM_SUCCESS, rc2;
job_record_t *job_ptr = NULL;
long int long_id;
uint32_t job_id = 0;
char *end_ptr = NULL;
bitstr_t *array_bitmap = NULL;
uint32_t flags = req_ptr->flags;
char *job_id_str = req_ptr->job_id_str;
resp_array_struct_t *resp_array = NULL;
job_array_resp_msg_t *resp_array_msg = NULL;
if (max_array_size == NO_VAL) {
max_array_size = slurm_conf.max_array_sz;
}
long_id = strtol(job_id_str, &end_ptr, 10);
if ((long_id <= 0) || (long_id == LONG_MAX) ||
((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) {
info("%s: invalid JobId=%s", __func__, job_id_str);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
if ((end_ptr[0] == '_') && (end_ptr[1] == '*'))
end_ptr += 2; /* Defaults to full job array */
job_id = (uint32_t) long_id;
if (end_ptr[0] == '\0') { /* Single job (or full job array) */
job_record_t *job_ptr_done = NULL;
job_ptr = find_job_record(job_id);
if (job_ptr &&
(((job_ptr->array_task_id == NO_VAL) &&
(job_ptr->array_recs == NULL)) ||
((job_ptr->array_task_id != NO_VAL) &&
(job_ptr->array_job_id != job_id)))) {
/* This is a regular job or single task of job array */
rc = _job_requeue(uid, job_ptr, preempt, flags);
goto reply;
}
if (job_ptr && job_ptr->array_recs) {
/* This is a job array */
job_ptr_done = job_ptr;
rc2 = _job_requeue(uid, job_ptr, preempt, flags);
_resp_array_add(&resp_array, job_ptr, rc2, NULL);
}
/* Requeue all tasks of this job array */
job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)];
if (!job_ptr && !job_ptr_done) {
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
while (job_ptr) {
if ((job_ptr->array_job_id == job_id) &&
(job_ptr != job_ptr_done)) {
rc2 = _job_requeue(uid, job_ptr, preempt,flags);
_resp_array_add(&resp_array, job_ptr, rc2,
NULL);
}
job_ptr = job_ptr->job_array_next_j;
}
goto reply;
}
array_bitmap = slurm_array_str2bitmap(end_ptr + 1, max_array_size,
NULL);
if (!array_bitmap) {
info("%s: invalid JobId=%s", __func__, job_id_str);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
for (int i = 0; (i = bit_ffs_from_bit(array_bitmap, i)) >= 0; i++) {
job_ptr = find_job_array_rec(job_id, i);
if (job_ptr == NULL) {
info("%s: invalid JobId=%u_%d", __func__, job_id, i);
_resp_array_add_id(&resp_array, job_id, i,
ESLURM_INVALID_JOB_ID);
continue;
}
rc2 = _job_requeue(uid, job_ptr, preempt, flags);
_resp_array_add(&resp_array, job_ptr, rc2, NULL);
}
reply:
if (msg) {
if (resp_array) {
resp_array_msg = _resp_array_xlate(resp_array, job_id);
(void) send_msg_response(msg, RESPONSE_JOB_ARRAY_ERRORS,
resp_array_msg);
slurm_free_job_array_resp(resp_array_msg);
} else {
slurm_send_rc_msg(msg, rc);
}
}
_resp_array_free(resp_array);
FREE_NULL_BITMAP(array_bitmap);
return rc;
}
static int _top_job_flag_clear(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *) x;
job_ptr->bit_flags &= (~TOP_PRIO_TMP);
return 0;
}
/* This sorts so the highest priorities come off the list first */
static int _top_job_prio_sort(void *x, void *y)
{
uint32_t *prio1, *prio2;
prio1 = *(uint32_t **) x;
prio2 = *(uint32_t **) y;
if (*prio1 < *prio2)
return 1;
if (*prio1 > *prio2)
return -1;
return 0;
}
static int _set_top(list_t *top_job_list, uid_t uid)
{
list_t *prio_list, *other_job_list;
list_itr_t *iter;
job_record_t *job_ptr, *first_job_ptr = NULL;
int rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS;
uint32_t last_prio = NO_VAL, next_prio;
int64_t delta_prio, delta_nice, total_delta = 0;
int other_job_cnt = 0;
uint32_t *prio_elem;
xassert(job_list);
xassert(top_job_list);
prio_list = list_create(xfree_ptr);
(void) list_for_each(job_list, _top_job_flag_clear, NULL);
/*
* These list iterators are being kept for now;
* we want to rewrite how job_set_top works.
*/
/* Validate the jobs in our "top" list */
iter = list_iterator_create(top_job_list);
while ((job_ptr = list_next(iter))) {
if ((job_ptr->user_id != uid) && (uid != 0)) {
error("Security violation: REQUEST_TOP_JOB for %pJ from uid=%u",
job_ptr, uid);
rc = ESLURM_ACCESS_DENIED;
break;
}
if (!IS_JOB_PENDING(job_ptr) || (job_ptr->details == NULL)) {
debug("%s: %pJ not pending", __func__, job_ptr);
list_remove(iter);
rc2 = ESLURM_JOB_NOT_PENDING;
continue;
}
if (job_ptr->part_ptr_list) {
debug("%s: %pJ in partition list", __func__, job_ptr);
list_remove(iter);
rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
break;
}
if (job_ptr->priority == 0) {
debug("%s: %pJ is held", __func__, job_ptr);
list_remove(iter);
rc2 = ESLURM_JOB_HELD;
continue;
}
if (job_ptr->bit_flags & TOP_PRIO_TMP) {
/* Duplicate job ID */
list_remove(iter);
continue;
}
if (!first_job_ptr)
first_job_ptr = job_ptr;
job_ptr->bit_flags |= TOP_PRIO_TMP;
prio_elem = xmalloc(sizeof(uint32_t));
*prio_elem = job_ptr->priority;
list_append(prio_list, prio_elem);
}
list_iterator_destroy(iter);
if (rc != SLURM_SUCCESS) {
FREE_NULL_LIST(prio_list);
return rc;
}
if (!first_job_ptr) {
FREE_NULL_LIST(prio_list);
return rc2;
}
/* Identify other jobs which we can adjust the nice value of */
other_job_list = list_create(NULL);
iter = list_iterator_create(job_list);
while ((job_ptr = list_next(iter))) {
/*
* Do not select jobs with priority 0 (held), or
* priority 1 (would be held if we lowered the priority).
*/
if ((job_ptr->bit_flags & TOP_PRIO_TMP) ||
(job_ptr->details == NULL) ||
(job_ptr->part_ptr_list) ||
(job_ptr->priority <= 1) ||
(job_ptr->assoc_ptr != first_job_ptr->assoc_ptr) ||
(job_ptr->part_ptr != first_job_ptr->part_ptr) ||
(job_ptr->qos_ptr != first_job_ptr->qos_ptr) ||
(job_ptr->user_id != first_job_ptr->user_id) ||
(!IS_JOB_PENDING(job_ptr)))
continue;
other_job_cnt++;
job_ptr->bit_flags |= TOP_PRIO_TMP;
prio_elem = xmalloc(sizeof(uint32_t));
*prio_elem = job_ptr->priority;
list_append(prio_list, prio_elem);
list_append(other_job_list, job_ptr);
}
list_iterator_destroy(iter);
/* Now adjust nice values and priorities of the listed "top" jobs */
list_sort(prio_list, _top_job_prio_sort);
iter = list_iterator_create(top_job_list);
while ((job_ptr = list_next(iter))) {
prio_elem = list_pop(prio_list);
next_prio = *prio_elem;
xfree(prio_elem);
if ((last_prio != NO_VAL) && (next_prio == last_prio) &&
(last_prio > 2))
/*
* We don't want to set job priority lower than 1, so
* last_prio cannot be smaller than 2, since we will
* later use last_prio - 1 for the new job priority.
*/
next_prio = last_prio - 1;
last_prio = next_prio;
delta_prio = (int64_t) next_prio - job_ptr->priority;
delta_nice = MIN(job_ptr->details->nice, delta_prio);
total_delta += delta_nice;
job_ptr->priority = next_prio;
job_ptr->details->nice -= delta_nice;
job_ptr->bit_flags &= (~TOP_PRIO_TMP);
}
list_iterator_destroy(iter);
FREE_NULL_LIST(prio_list);
/* Now adjust nice values and priorities of remaining affected jobs */
if (other_job_cnt) {
iter = list_iterator_create(other_job_list);
while ((job_ptr = list_next(iter))) {
delta_prio = total_delta / other_job_cnt;
next_prio = job_ptr->priority - delta_prio;
if (next_prio >= last_prio) {
next_prio = last_prio - 1;
delta_prio = job_ptr->priority - next_prio;
}
delta_nice = delta_prio;
job_ptr->priority = next_prio;
job_ptr->details->nice += delta_nice;
job_ptr->bit_flags &= (~TOP_PRIO_TMP);
total_delta -= delta_nice;
if (--other_job_cnt == 0)
break; /* Count will match list size anyway */
}
list_iterator_destroy(iter);
}
FREE_NULL_LIST(other_job_list);
last_job_update = time(NULL);
return rc;
}
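/*
* Sketch of the redistribution above (hypothetical priorities): with
* one "top" job at priority 300 and comparable peer jobs at 500 and
* 400, the pooled priorities {500, 400, 300} are handed out highest
* first to the top jobs, and the aggregate boost is paid back by
* raising the nice values (lowering the priorities) of the peers.
*/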
/*
* job_set_top - Move the specified jobs to the top of the queue (at least
* for that user ID, partition, account, and QOS).
*
* IN msg - original request msg
* IN top_ptr - user request
* IN uid - user id of the user issuing the RPC
* IN protocol_version - slurm protocol version of client
* RET 0 on success, otherwise ESLURM error code
*/
extern int job_set_top(slurm_msg_t *msg, top_job_msg_t *top_ptr, uid_t uid,
uint16_t protocol_version)
{
int rc = SLURM_SUCCESS;
list_t *top_job_list = NULL;
char *job_str_tmp = NULL, *tok, *save_ptr = NULL, *end_ptr = NULL;
job_record_t *job_ptr = NULL;
long int long_id;
uint32_t job_id = 0, task_id = 0;
uid_t job_uid = uid;
if (validate_operator(uid)) {
job_uid = 0;
} else {
bool disable_user_top = true;
if (xstrcasestr(slurm_conf.sched_params, "enable_user_top"))
disable_user_top = false;
if (disable_user_top) {
rc = ESLURM_ACCESS_DENIED;
goto reply;
}
}
top_job_list = list_create(NULL);
job_str_tmp = xstrdup(top_ptr->job_id_str);
tok = strtok_r(job_str_tmp, ",", &save_ptr);
while (tok) {
long_id = strtol(tok, &end_ptr, 10);
if ((long_id <= 0) || (long_id == LONG_MAX) ||
((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) {
info("%s: invalid job id %s", __func__, tok);
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
job_id = (uint32_t) long_id;
if ((end_ptr[0] == '\0') || /* Single job (or full job array) */
((end_ptr[0] == '_') && (end_ptr[1] == '*') &&
(end_ptr[2] == '\0'))) {
job_ptr = find_job_record(job_id);
if (!job_ptr) {
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
list_append(top_job_list, job_ptr);
} else if (end_ptr[0] != '_') { /* Invalid job ID spec */
rc = ESLURM_INVALID_JOB_ID;
goto reply;
} else { /* Single task of a job array */
task_id = strtol(end_ptr + 1, &end_ptr, 10);
if (end_ptr[0] != '\0') { /* Invalid job ID spec */
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
job_ptr = find_job_array_rec(job_id, task_id);
if (!job_ptr) {
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
list_append(top_job_list, job_ptr);
}
tok = strtok_r(NULL, ",", &save_ptr);
}
if (list_count(top_job_list) == 0) {
rc = ESLURM_INVALID_JOB_ID;
goto reply;
}
rc = _set_top(top_job_list, job_uid);
reply: FREE_NULL_LIST(top_job_list);
xfree(job_str_tmp);
slurm_send_rc_msg(msg, rc);
return rc;
}
/*
* job_end_time - Process JOB_END_TIME
* IN time_req_msg - job end time request
* OUT timeout_msg - job timeout response to be sent
* RET SLURM_SUCCESS or an error code
*/
extern int job_end_time(job_alloc_info_msg_t *time_req_msg,
srun_timeout_msg_t *timeout_msg)
{
job_record_t *job_ptr;
xassert(timeout_msg);
job_ptr = find_job_record(time_req_msg->job_id);
if (!job_ptr)
return ESLURM_INVALID_JOB_ID;
memset(timeout_msg, 0, sizeof(srun_timeout_msg_t));
timeout_msg->step_id.job_id = time_req_msg->job_id;
timeout_msg->step_id.step_id = NO_VAL;
timeout_msg->step_id.step_het_comp = NO_VAL;
timeout_msg->timeout = job_ptr->end_time;
return SLURM_SUCCESS;
}
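/*
 * Rebuild the cached node name strings from the job's node bitmaps:
 * nodes_completing while the job is completing and nodes_pr while the
 * job is waiting for its prolog to finish.
 */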
static int _update_job_nodes_str(job_record_t *job_ptr)
{
xfree(job_ptr->nodes_completing);
xfree(job_ptr->nodes_pr);
if (!job_ptr->node_bitmap)
return 0;
if (IS_JOB_COMPLETING(job_ptr)) {
if (job_ptr->node_bitmap_cg) {
job_ptr->nodes_completing =
bitmap2node_name(job_ptr->node_bitmap_cg);
} else {
job_ptr->nodes_completing =
bitmap2node_name(job_ptr->node_bitmap);
}
}
if (job_ptr->state_reason == WAIT_PROLOG) {
if (job_ptr->node_bitmap_pr) {
job_ptr->nodes_pr =
bitmap2node_name(job_ptr->node_bitmap_pr);
} else {
job_ptr->nodes_pr =
bitmap2node_name(job_ptr->node_bitmap);
}
}
return 0;
}
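/* list_for_each() callback: fail/hold jobs using the given association */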
static int _foreach_hold_by_assoc(void *x, void *arg)
{
job_record_t *job_ptr = x;
foreach_hold_by_id_t *hold_by_id = arg;
if (job_ptr->assoc_id == hold_by_id->id)
hold_by_id->cnt += _job_fail_account(job_ptr, __func__, false);
return 0;
}
/*
* job_hold_by_assoc_id - Hold all pending jobs with a given
* association ID. This happens when an association is deleted (e.g. when
* a user is removed from the association database).
* RET count of held jobs
*/
extern int job_hold_by_assoc_id(uint32_t assoc_id)
{
/* Write lock on jobs */
slurmctld_lock_t job_write_lock =
{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
foreach_hold_by_id_t hold_by_id = {
.id = assoc_id,
.cnt = 0,
};
if (!job_list)
return 0;
lock_slurmctld(job_write_lock);
(void) list_for_each(job_list, _foreach_hold_by_assoc, &hold_by_id);
unlock_slurmctld(job_write_lock);
return hold_by_id.cnt;
}
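/* list_for_each() callback: fail/hold jobs using the given QOS */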
static int _foreach_hold_by_qos(void *x, void *arg)
{
job_record_t *job_ptr = x;
foreach_hold_by_id_t *hold_by_id = arg;
if (job_ptr->qos_blocking_ptr &&
(job_ptr->qos_blocking_ptr->id == hold_by_id->id))
job_ptr->qos_blocking_ptr = NULL;
if (job_ptr->qos_list) {
if (!list_find_first(job_ptr->qos_list,
slurmdb_find_qos_in_list,
&hold_by_id->id))
return 0;
} else if (job_ptr->qos_id != hold_by_id->id)
return 0;
hold_by_id->cnt += job_fail_qos(job_ptr, __func__, false);
return 0;
}
/*
* job_hold_by_qos_id - Hold all pending jobs with a given
 * QOS ID. This happens when a QOS is deleted (e.g. removed
 * from the accounting database).
* RET count of held jobs
*/
extern int job_hold_by_qos_id(uint32_t qos_id)
{
/* Write lock on jobs */
slurmctld_lock_t job_write_lock =
{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
foreach_hold_by_id_t hold_by_id = {
.id = qos_id,
.cnt = 0,
};
if (!job_list)
return 0;
lock_slurmctld(job_write_lock);
(void) list_for_each(job_list, _foreach_hold_by_qos, &hold_by_id);
unlock_slurmctld(job_write_lock);
return hold_by_id.cnt;
}
/*
 * Modify the wckey associated with a pending job
* IN module - where this is called from
* IN job_ptr - pointer to job which should be modified
* IN new_wckey - desired wckey name
* RET SLURM_SUCCESS or error code
*/
extern int update_job_wckey(char *module, job_record_t *job_ptr,
char *new_wckey)
{
slurmdb_wckey_rec_t wckey_rec, *wckey_ptr;
if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) {
info("%s: attempt to modify account for non-pending %pJ",
module, job_ptr);
return ESLURM_JOB_NOT_PENDING;
}
memset(&wckey_rec, 0, sizeof(wckey_rec));
wckey_rec.uid = job_ptr->user_id;
wckey_rec.name = new_wckey;
if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
accounting_enforce, &wckey_ptr, false)) {
info("%s: invalid wckey %s for %pJ",
module, new_wckey, job_ptr);
return ESLURM_INVALID_WCKEY;
} else if (slurm_with_slurmdbd() &&
!wckey_ptr &&
!(accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS)) {
		/*
		 * If not enforcing wckeys, look for the default wckey
		 * and use it to avoid getting trash in the accounting
		 * records.
		 */
wckey_rec.name = NULL;
assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec,
accounting_enforce, &wckey_ptr, false);
if (!wckey_ptr) {
debug("%s: we didn't have a wckey record for wckey "
"'%s' and user '%u', and we can't seem to find "
"a default one either. Setting it anyway. "
"This will produce trash in accounting. "
"If this is not what you desire please put "
"AccountStorageEnforce=wckeys in your slurm.conf "
"file.", module, new_wckey,
job_ptr->user_id);
wckey_rec.name = new_wckey;
}
}
xfree(job_ptr->wckey);
if (wckey_rec.name && wckey_rec.name[0] != '\0') {
job_ptr->wckey = xstrdup(wckey_rec.name);
info("%s: setting wckey to %s for %pJ",
module, wckey_rec.name, job_ptr);
} else {
info("%s: cleared wckey for %pJ", module, job_ptr);
}
last_job_update = time(NULL);
return SLURM_SUCCESS;
}
static int _foreach_send_jobs_to_accounting(void *x, void *arg)
{
job_record_t *job_ptr = x;
if (!job_ptr->assoc_id) {
slurmdb_assoc_rec_t assoc_rec = {
.acct = job_ptr->account,
.partition = job_ptr->part_ptr ?
job_ptr->part_ptr->name : NULL,
.uid = job_ptr->user_id,
};
if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
accounting_enforce,
&job_ptr->assoc_ptr, false) &&
(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) {
_job_fail_account(job_ptr, __func__, false);
return 0;
} else
job_ptr->assoc_id = assoc_rec.id;
}
	/* We only want active jobs not yet accounted for */
if (IS_JOB_IN_DB(job_ptr) || IS_JOB_FINISHED(job_ptr))
return 0;
debug("first reg: starting %pJ in accounting", job_ptr);
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
if (IS_JOB_SUSPENDED(job_ptr))
jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
return 0;
}
/*
 * Currently only sends active and suspended jobs not already in the database.
*
* On node changes, we opt not to send updated node_inx's due to the heavy cost
* of doing so. If we were to update the job's node_inx's, this could be done by
* resizing the job which will create a new db record for the job with the
* changed node_inx's -- like how reservations are done.
* e.g.
* job_pre_resize_acctg(job_ptr);
* job_post_resize_acctg(job_ptr);
*/
extern int send_jobs_to_accounting(void)
{
slurmctld_lock_t job_write_lock = {
NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
/* send jobs in pending or running state */
lock_slurmctld(job_write_lock);
(void) list_for_each(job_list, _foreach_send_jobs_to_accounting, NULL);
unlock_slurmctld(job_write_lock);
return SLURM_SUCCESS;
}
/*
* copy_job_record_to_job_desc - construct a job_desc_msg_t for a job.
* IN job_ptr - the job record
* RET the job_desc_msg_t, NULL on error
*/
extern job_desc_msg_t *copy_job_record_to_job_desc(job_record_t *job_ptr)
{
job_desc_msg_t *job_desc;
job_details_t *details = job_ptr->details;
multi_core_data_t *mc_ptr = details->mc_ptr;
int i;
/* construct a job_desc_msg_t from job */
job_desc = xmalloc(sizeof(job_desc_msg_t));
job_desc->account = xstrdup(job_ptr->account);
job_desc->acctg_freq = xstrdup(details->acctg_freq);
job_desc->alloc_node = xstrdup(job_ptr->alloc_node);
/* Since the allocating salloc or srun is not expected to exist
* when this checkpointed job is restarted, do not save these:
*
* job_desc->alloc_resp_port = job_ptr->alloc_resp_port;
* job_desc->alloc_sid = job_ptr->alloc_sid;
*/
job_desc->argc = details->argc;
job_desc->argv = xcalloc(job_desc->argc, sizeof(char *));
for (i = 0; i < job_desc->argc; i ++)
job_desc->argv[i] = xstrdup(details->argv[i]);
job_desc->begin_time = details->begin_time;
job_desc->bitflags = job_ptr->bit_flags;
job_desc->clusters = xstrdup(job_ptr->clusters);
job_desc->comment = xstrdup(job_ptr->comment);
job_desc->container = xstrdup(job_ptr->container);
job_desc->container_id = xstrdup(job_ptr->container_id);
job_desc->contiguous = details->contiguous;
job_desc->core_spec = details->core_spec;
job_desc->cpu_bind = xstrdup(details->cpu_bind);
job_desc->cpu_bind_type = details->cpu_bind_type;
job_desc->cpu_freq_min = details->cpu_freq_min;
job_desc->cpu_freq_max = details->cpu_freq_max;
job_desc->cpu_freq_gov = details->cpu_freq_gov;
job_desc->deadline = job_ptr->deadline;
job_desc->dependency = xstrdup(details->dependency);
job_desc->end_time = 0; /* Unused today */
job_desc->environment = get_job_env(job_ptr,
&job_desc->env_size);
job_desc->exc_nodes = xstrdup(details->exc_nodes);
job_desc->extra = xstrdup(job_ptr->extra);
job_desc->features = xstrdup(details->features);
job_desc->cluster_features = xstrdup(details->cluster_features);
job_desc->group_id = job_ptr->group_id;
job_desc->immediate = 0; /* nowhere to get this value */
job_desc->job_id = job_ptr->job_id;
job_desc->kill_on_node_fail = job_ptr->kill_on_node_fail;
job_desc->licenses = xstrdup(job_ptr->lic_req);
job_desc->mail_type = job_ptr->mail_type;
job_desc->mail_user = xstrdup(job_ptr->mail_user);
job_desc->mcs_label = xstrdup(job_ptr->mcs_label);
job_desc->mem_bind = xstrdup(details->mem_bind);
job_desc->mem_bind_type = details->mem_bind_type;
job_desc->name = xstrdup(job_ptr->name);
job_desc->network = xstrdup(job_ptr->network);
job_desc->nice = details->nice;
job_desc->num_tasks = details->num_tasks;
job_desc->open_mode = details->open_mode;
job_desc->origin_cluster = xstrdup(job_ptr->origin_cluster);
job_desc->other_port = job_ptr->other_port;
job_desc->overcommit = details->overcommit;
job_desc->partition = xstrdup(job_ptr->partition);
job_desc->plane_size = mc_ptr->plane_size;
job_desc->prefer = xstrdup(details->prefer);
job_desc->priority = job_ptr->priority;
if (job_ptr->qos_ptr)
job_desc->qos = xstrdup(job_ptr->qos_ptr->name);
job_desc->resp_host = xstrdup(job_ptr->resp_host);
job_desc->req_nodes = xstrdup(details->req_nodes);
job_desc->requeue = details->requeue;
job_desc->reservation = xstrdup(job_ptr->resv_name);
job_desc->restart_cnt = job_ptr->restart_cnt;
job_desc->segment_size = details->segment_size;
job_desc->script_buf = get_job_script(job_ptr);
if (details->share_res == 1)
job_desc->shared = JOB_SHARED_OK;
else if (details->whole_node & WHOLE_NODE_REQUIRED)
job_desc->shared = JOB_SHARED_NONE;
else if (details->whole_node & WHOLE_NODE_USER)
job_desc->shared = JOB_SHARED_USER;
else if (details->whole_node & WHOLE_NODE_MCS)
job_desc->shared = JOB_SHARED_MCS;
else
job_desc->shared = NO_VAL16;
job_desc->spank_job_env_size = job_ptr->spank_job_env_size;
job_desc->spank_job_env = xcalloc(job_desc->spank_job_env_size,
sizeof(char *));
for (i = 0; i < job_desc->spank_job_env_size; i ++)
job_desc->spank_job_env[i]= xstrdup(job_ptr->spank_job_env[i]);
job_desc->std_err = xstrdup(details->std_err);
job_desc->std_in = xstrdup(details->std_in);
job_desc->std_out = xstrdup(details->std_out);
job_desc->submit_line = xstrdup(details->submit_line);
job_desc->task_dist = details->task_dist;
job_desc->time_limit = job_ptr->time_limit;
job_desc->time_min = job_ptr->time_min;
job_desc->user_id = job_ptr->user_id;
job_desc->wait_all_nodes = job_ptr->wait_all_nodes;
job_desc->warn_flags = job_ptr->warn_flags;
job_desc->warn_signal = job_ptr->warn_signal;
job_desc->warn_time = job_ptr->warn_time;
job_desc->wckey = xstrdup(job_ptr->wckey);
job_desc->work_dir = xstrdup(details->work_dir);
job_desc->pn_min_cpus = details->pn_min_cpus;
job_desc->pn_min_memory = details->pn_min_memory;
job_desc->oom_kill_step = details->oom_kill_step;
job_desc->pn_min_tmp_disk = details->pn_min_tmp_disk;
job_desc->min_cpus = details->min_cpus;
job_desc->max_cpus = details->max_cpus;
job_desc->min_nodes = details->min_nodes;
job_desc->max_nodes = details->max_nodes;
if (job_desc->max_nodes == 0) /* set 0 in _job_create() */
job_desc->max_nodes = NO_VAL;
job_desc->sockets_per_node = mc_ptr->sockets_per_node;
job_desc->cores_per_socket = mc_ptr->cores_per_socket;
job_desc->threads_per_core = mc_ptr->threads_per_core;
job_desc->cpus_per_task = details->cpus_per_task;
job_desc->ntasks_per_node = details->ntasks_per_node;
job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket;
job_desc->ntasks_per_core = mc_ptr->ntasks_per_core;
job_desc->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres);
job_desc->mem_per_tres = xstrdup(job_ptr->mem_per_tres);
job_desc->tres_bind = xstrdup(job_ptr->tres_bind);
job_desc->tres_freq = xstrdup(job_ptr->tres_freq);
job_desc->tres_per_job = xstrdup(job_ptr->tres_per_job);
job_desc->tres_per_node = xstrdup(job_ptr->tres_per_node);
job_desc->tres_per_socket = xstrdup(job_ptr->tres_per_socket);
job_desc->tres_per_task = xstrdup(job_ptr->tres_per_task);
if (job_ptr->fed_details) {
job_desc->fed_siblings_active =
job_ptr->fed_details->siblings_active;
job_desc->fed_siblings_viable =
job_ptr->fed_details->siblings_viable;
}
return job_desc;
}
/* Build a bitmap of nodes completing this job */
extern void build_cg_bitmap(job_record_t *job_ptr)
{
FREE_NULL_BITMAP(job_ptr->node_bitmap_cg);
if (job_ptr->node_bitmap) {
job_ptr->node_bitmap_cg = bit_copy(job_ptr->node_bitmap);
if (bit_ffs(job_ptr->node_bitmap_cg) == -1)
job_state_unset_flag(job_ptr, JOB_COMPLETING);
} else {
error("build_cg_bitmap: node_bitmap is NULL");
job_ptr->node_bitmap_cg = bit_alloc(node_record_count);
job_state_unset_flag(job_ptr, JOB_COMPLETING);
}
}
/* job_hold_requeue()
*
* Requeue the job based upon its current state.
* If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state.
* If JOB_REQUEUE_HOLD then requeue and hold.
* If JOB_REQUEUE then requeue and let it run again.
* The requeue can happen directly from job_requeue() or from
* job_epilog_complete() after the last component has finished.
*
 * RET true if the job was requeued
*/
extern bool job_hold_requeue(job_record_t *job_ptr)
{
uint32_t state;
uint32_t flags;
job_record_t *base_job_ptr = NULL;
xassert(job_ptr);
	/* If the job is already pending, it was requeued
	 * somewhere else.
	 */
if (IS_JOB_PENDING(job_ptr) && !IS_JOB_REVOKED(job_ptr))
return false;
	/* If the job is not on the origin cluster, then don't worry about
	 * requeuing the job here. The exit code will be sent to the origin
	 * cluster and the origin cluster will decide if the job should be
	 * requeued or not. */
if (!fed_mgr_is_origin_job(job_ptr))
return false;
/*
* A job may be canceled during its epilog in which case we need to
* check that the job (or base job in the case of an array) was not
* canceled before attempting to requeue.
*/
if (IS_JOB_CANCELLED(job_ptr) ||
(((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) &&
(base_job_ptr = find_job_record(job_ptr->array_job_id)) &&
base_job_ptr->array_recs && IS_JOB_CANCELLED(base_job_ptr)))
return false;
	/* Check if the job exited with one of the
	 * configured requeue exit values. */
_set_job_requeue_exit_value(job_ptr);
/* handle crontab jobs */
if ((job_ptr->bit_flags & CRON_JOB) &&
job_ptr->details->crontab_entry) {
job_state_set_flag(job_ptr, JOB_REQUEUE);
job_ptr->details->begin_time =
calc_next_cron_start(job_ptr->details->crontab_entry,
0);
} else if (job_ptr->bit_flags & CRON_JOB) {
/*
* Skip requeuing this instead of crashing.
*/
error("Missing cron details for %pJ. This should never happen. Clearing CRON_JOB flag and skipping requeue.",
job_ptr);
job_ptr->bit_flags &= ~CRON_JOB;
}
state = job_ptr->job_state;
	if (!(state & JOB_REQUEUE))
return false;
	/* Send the requeue event to the database. */
if (!(job_ptr->bit_flags & TRES_STR_CALC) &&
job_ptr->tres_alloc_cnt &&
(job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
assoc_mgr_set_job_tres_alloc_str(job_ptr, false);
jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
debug("%s: %pJ state 0x%x", __func__, job_ptr, state);
/* Set the job pending */
flags = job_ptr->job_state & JOB_STATE_FLAGS;
job_state_set(job_ptr, (JOB_PENDING | flags));
job_ptr->restart_cnt++;
/* clear signal sent flag on requeue */
job_ptr->warn_flags &= ~WARN_SENT;
/*
* Test if user wants to requeue the job
* in hold or with a special exit value.
*/
if (state & JOB_SPECIAL_EXIT) {
/*
* JOB_SPECIAL_EXIT means requeue the job,
* put it on hold and display state as JOB_SPECIAL_EXIT.
*/
job_state_set_flag(job_ptr, JOB_SPECIAL_EXIT);
job_ptr->state_reason = WAIT_HELD_USER;
debug("%s: Holding %pJ, special exit", __func__, job_ptr);
job_ptr->priority = 0;
}
job_state_unset_flag(job_ptr, JOB_REQUEUE);
/*
* Mark array as requeued. Exit codes have already been handled in
* _job_array_comp()
*/
if (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) &&
(base_job_ptr = find_job_record(job_ptr->array_job_id)) &&
base_job_ptr->array_recs) {
base_job_ptr->array_recs->array_flags |= ARRAY_TASK_REQUEUED;
}
debug("%s: %pJ state 0x%x reason %u priority %d",
__func__, job_ptr, job_ptr->job_state,
job_ptr->state_reason, job_ptr->priority);
return true;
}
static void _parse_max_depend_depth(char *str)
{
int i = atoi(str);
if (i < 0)
error("ignoring max_depend_depth value of %d", i);
else
max_depend_depth = i;
}
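/*
 * Parse dependency handling options from DependencyParameters.
 *
 * Illustrative example (slurm.conf):
 *   DependencyParameters=kill_invalid_depend,max_depend_depth=5
 * sets kill_invalid_dep = true and max_depend_depth = 5;
 * max_depend_depth defaults to 10 when unset.
 */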
extern void init_depend_policy(void)
{
char *tmp_ptr;
disable_remote_singleton =
(xstrcasestr(slurm_conf.dependency_params,
"disable_remote_singleton")) ?
true : false;
kill_invalid_dep =
(xstrcasestr(slurm_conf.dependency_params,
"kill_invalid_depend")) ?
true : false;
/* 01234567890123456 */
if ((tmp_ptr = xstrcasestr(slurm_conf.dependency_params,
"max_depend_depth=")))
_parse_max_depend_depth(tmp_ptr + 17);
else
max_depend_depth = 10;
log_flag(DEPENDENCY, "%s: kill_invalid_depend is set to %d; disable_remote_singleton is set to %d; max_depend_depth is set to %d",
__func__, kill_invalid_dep, disable_remote_singleton,
max_depend_depth);
}
/* init_requeue_policy()
* Initialize the requeue exit/hold bitmaps.
*/
extern void init_requeue_policy(void)
{
/* clean first as we can be reconfiguring */
FREE_NULL_BITMAP(requeue_exit);
FREE_NULL_BITMAP(requeue_exit_hold);
requeue_exit = _make_requeue_array(slurm_conf.requeue_exit);
requeue_exit_hold = _make_requeue_array(slurm_conf.requeue_exit_hold);
}
/* _make_requeue_array()
*
 * Process a RequeueExit or RequeueExitHold configuration
 * parameter, creating a bitmap of the exit values for
 * which jobs have to be requeued.
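 *
 * Illustrative example: RequeueExit=1-9,64 in slurm.conf is wrapped
 * into "[1-9,64]", expanded via hostset, and sets bits 1 through 9
 * and bit 64 in the resulting bitmap.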
*/
static bitstr_t *_make_requeue_array(char *conf_buf)
{
hostset_t *hs;
bitstr_t *bs = NULL;
char *tok = NULL, *end_ptr = NULL;
long val;
if (conf_buf == NULL)
return bs;
xstrfmtcat(tok, "[%s]", conf_buf);
hs = hostset_create(tok);
xfree(tok);
if (!hs) {
error("%s: exit values: %s", __func__, conf_buf);
return bs;
}
debug("%s: exit values: %s", __func__, conf_buf);
bs = bit_alloc(MAX_EXIT_VAL + 1);
while ((tok = hostset_shift(hs))) {
val = strtol(tok, &end_ptr, 10);
if ((end_ptr[0] == '\0') &&
(val >= 0) && (val <= MAX_EXIT_VAL)) {
bit_set(bs, val);
} else {
error("%s: exit values: %s (%s)",
__func__, conf_buf, tok);
}
free(tok);
}
hostset_destroy(hs);
return bs;
}
/* _set_job_requeue_exit_value()
 *
 * Compare the job exit value with the configured
 * RequeueExit and RequeueExitHold values and, if a match is
 * found, set the appropriate state for job_hold_requeue().
 */
static void _set_job_requeue_exit_value(job_record_t *job_ptr)
{
int exit_code;
/* --no-requeue option supersedes config for RequeueExit &
* RequeueExitHold
*/
if (job_ptr->details && !job_ptr->details->requeue)
return;
exit_code = WEXITSTATUS(job_ptr->exit_code);
if (requeue_exit && bit_test(requeue_exit, exit_code)) {
debug2("%s: %pJ exit code %d state JOB_REQUEUE",
__func__, job_ptr, exit_code);
job_state_set_flag(job_ptr, JOB_REQUEUE);
return;
}
if (requeue_exit_hold && bit_test(requeue_exit_hold, exit_code)) {
/* Not sure if want to set special exit state in this case */
debug2("%s: %pJ exit code %d state JOB_SPECIAL_EXIT",
__func__, job_ptr, exit_code);
job_state_set_flag(job_ptr, (JOB_REQUEUE | JOB_SPECIAL_EXIT));
return;
}
}
/*
 * Reset a job's end_time based upon its start_time and time_limit.
* NOTE: Do not reset the end_time if already being preempted
*/
extern void job_end_time_reset(job_record_t *job_ptr)
{
if (job_ptr->preempt_time)
return; /* Preemption in progress */
if (job_ptr->time_limit == INFINITE) {
job_ptr->end_time = job_ptr->start_time +
(365 * 24 * 60 * 60); /* secs in year */
} else {
job_ptr->end_time = job_ptr->start_time +
(job_ptr->time_limit * 60); /* secs */
}
job_ptr->end_time_exp = job_ptr->end_time;
}
/* If this is a job array meta-job, prepare it for being scheduled */
extern void job_array_pre_sched(job_record_t *job_ptr)
{
int32_t i;
if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap)
return;
i = bit_ffs(job_ptr->array_recs->task_id_bitmap);
if (i < 0) {
/* This happens if the final task in a meta-job is requeued */
if (job_ptr->restart_cnt == 0) {
error("%pJ has empty task_id_bitmap", job_ptr);
}
FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
return;
}
job_ptr->array_job_id = job_ptr->job_id;
job_ptr->array_task_id = i;
}
/* If this is a job array meta-job, clean up after scheduling attempt */
extern job_record_t *job_array_post_sched(job_record_t *job_ptr, bool list_add)
{
job_record_t *new_job_ptr = NULL;
if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap)
return job_ptr;
if (job_ptr->array_recs->task_cnt <= 1) {
/* Preserve array_recs for min/max exit codes for job array */
if (job_ptr->array_recs->task_cnt) {
job_ptr->array_recs->task_cnt--;
} else if (job_ptr->restart_cnt) {
/* Last task of a job array has been requeued */
} else {
error("job %pJ array_recs task count underflow",
job_ptr);
}
xfree(job_ptr->array_recs->task_id_str);
if (job_ptr->array_recs->task_cnt == 0)
FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap);
/* Update the job in the database. */
jobacct_storage_g_job_start(acct_db_conn, job_ptr);
/* If job is requeued, it will already be in the hash table */
if (!find_job_array_rec(job_ptr->array_job_id,
job_ptr->array_task_id)) {
_add_job_array_hash(job_ptr);
}
new_job_ptr = job_ptr;
} else {
new_job_ptr = job_array_split(job_ptr, list_add);
job_state_set(new_job_ptr, JOB_PENDING);
new_job_ptr->start_time = (time_t) 0;
}
return new_job_ptr;
}
/* _kill_dependent()
*
 * Kill the job that has an invalid dependency
 * condition.
*/
static void _kill_dependent(job_record_t *job_ptr)
{
time_t now = time(NULL);
info("%s: Job dependency can't be satisfied, cancelling %pJ",
__func__, job_ptr);
job_state_set(job_ptr, JOB_CANCELLED);
job_ptr->start_time = now;
job_ptr->end_time = now;
job_completion_logger(job_ptr, false);
last_job_update = now;
srun_allocate_abort(job_ptr);
}
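/* Duplicate a job's federation details, deep copying the strings */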
static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src)
{
job_fed_details_t *dst = NULL;
if (!src)
return NULL;
dst = xmalloc(sizeof(job_fed_details_t));
memcpy(dst, src, sizeof(job_fed_details_t));
dst->origin_str = xstrdup(src->origin_str);
dst->siblings_active_str = xstrdup(src->siblings_active_str);
dst->siblings_viable_str = xstrdup(src->siblings_viable_str);
return dst;
}
/* Set federated job's sibling strings. */
extern void update_job_fed_details(job_record_t *job_ptr)
{
xassert(job_ptr);
xassert(job_ptr->fed_details);
xfree(job_ptr->fed_details->siblings_active_str);
xfree(job_ptr->fed_details->siblings_viable_str);
job_ptr->fed_details->siblings_active_str =
fed_mgr_cluster_ids_to_names(
job_ptr->fed_details->siblings_active);
job_ptr->fed_details->siblings_viable_str =
fed_mgr_cluster_ids_to_names(
job_ptr->fed_details->siblings_viable);
/* only set once */
if (!job_ptr->fed_details->origin_str)
job_ptr->fed_details->origin_str =
fed_mgr_get_cluster_name(
fed_mgr_get_cluster_id(job_ptr->job_id));
}
/*
* Set the allocation response with the current cluster's information and the
* job's allocated node's addr's if the allocation is being filled by a cluster
* other than the cluster that submitted the job
*
* Note: make sure that the resp's working_cluster_rec is NULL'ed out before the
* resp is free'd since it points to global memory.
*
* IN resp - allocation response being sent back to client.
* IN job_ptr - allocated job
* IN req_cluster - the cluster requesting the allocation info.
*/
extern void set_remote_working_response(
resource_allocation_response_msg_t *resp,
job_record_t *job_ptr, const char *req_cluster)
{
xassert(resp);
xassert(job_ptr);
if (job_ptr->node_cnt && req_cluster &&
xstrcmp(slurm_conf.cluster_name, req_cluster)) {
if (job_ptr->fed_details &&
fed_mgr_cluster_rec) {
resp->working_cluster_rec = fed_mgr_cluster_rec;
} else {
resp->working_cluster_rec = response_cluster_rec;
}
if (!job_ptr->node_addrs) {
/*
* The job may be owned by the local cluster but a
* remote srun might be trying to launch a job in the
* allocation.
*/
set_job_node_addrs(job_ptr, req_cluster);
}
}
}
/*
* Calculate billable TRES based on partition's defined BillingWeights. If none
* is defined, return total_cpus. This is cached on job_ptr->billable_tres and
* is updated if the job was resized since the last iteration.
*
* IN job_ptr - job to calc billable tres on
 * IN start_time - time the job started or was resized
* IN assoc_mgr_locked - whether the tres assoc lock is set or not
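 *
 * Illustrative example: with TRESBillingWeights="CPU=1.0,Mem=0.25G"
 * on the partition, a job allocating 4 CPUs and 16G of memory yields
 * SUM(TRES) = 4*1.0 + 16*0.25 = 8.0 (MAX variants per PriorityFlags).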
*/
extern double calc_job_billable_tres(job_record_t *job_ptr, time_t start_time,
bool assoc_mgr_locked)
{
xassert(job_ptr);
part_record_t *part_ptr = job_ptr->part_ptr;
/* We don't have any resources allocated, just return 0. */
if (!job_ptr->tres_alloc_cnt)
return 0;
/* Don't recalculate unless the job is new or resized */
if ((!fuzzy_equal(job_ptr->billable_tres, NO_VAL)) &&
difftime(job_ptr->resize_time, start_time) < 0.0)
return job_ptr->billable_tres;
log_flag(PRIO, "BillingWeight: %pJ is either new or it was resized",
job_ptr);
/* No billing weights defined. Return CPU count */
if (!part_ptr || !part_ptr->billing_weights) {
job_ptr->billable_tres = job_ptr->total_cpus;
return job_ptr->billable_tres;
}
log_flag(PRIO, "BillingWeight: %pJ using \"%s\" from partition %s",
job_ptr, part_ptr->billing_weights_str,
job_ptr->part_ptr->name);
job_ptr->billable_tres =
assoc_mgr_tres_weighted(job_ptr->tres_alloc_cnt,
part_ptr->billing_weights,
slurm_conf.priority_flags,
assoc_mgr_locked);
log_flag(PRIO, "BillingWeight: %pJ %s = %f",
job_ptr,
(slurm_conf.priority_flags & PRIORITY_FLAGS_MAX_TRES) ?
"MAX(node TRES) + SUM(Global TRES)" :
(slurm_conf.priority_flags & PRIORITY_FLAGS_MAX_TRES_GRES) ?
"MAX(node TRES) + node GRES + SUM(Global TRES)" : "SUM(TRES)",
job_ptr->billable_tres);
return job_ptr->billable_tres;
}
/*
* Send warning signal to job before end time.
*
* IN job_ptr - job to send warn signal to.
* IN ignore_time - If set, ignore the warn time and just send it.
*/
extern void send_job_warn_signal(job_record_t *job_ptr, bool ignore_time)
{
if (job_ptr->warn_signal &&
!(job_ptr->warn_flags & WARN_SENT) &&
(ignore_time ||
(job_ptr->warn_time &&
((job_ptr->warn_time + PERIODIC_TIMEOUT + time(NULL)) >=
job_ptr->end_time)))) {
/*
* If --signal B option was not specified,
* signal only the steps but not the batch step.
*/
if (!(job_ptr->warn_flags & KILL_JOB_BATCH))
job_ptr->warn_flags |= KILL_STEPS_ONLY;
/* send SIGCONT first */
job_signal(job_ptr, SIGCONT, job_ptr->warn_flags, 0, false);
debug("%s: warning signal %u to %pJ",
__func__, job_ptr->warn_signal, job_ptr);
job_signal(job_ptr, job_ptr->warn_signal,
job_ptr->warn_flags, 0, false);
/* mark job as signaled */
job_ptr->warn_flags |= WARN_SENT;
}
}
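/*
 * list_for_each() callback for job_overlap_and_running(): clear rc and
 * stop scanning if any component is not running or suspended; set rc
 * if a component overlaps the given nodes or licenses.
 */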
static int _overlap_and_running_internal(void *x, void *arg)
{
job_record_t *job_ptr = (job_record_t *)x;
job_overlap_args_t *overlap_args = (job_overlap_args_t *)arg;
/* We always break if we find something not running */
if ((!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
overlap_args->rc = 0;
return -1;
}
/*
* We are just looking for something overlapping. On a hetjob we need
* to check everything.
*/
if (license_list_overlap(overlap_args->license_list,
job_ptr->license_list) ||
(job_ptr->node_bitmap &&
bit_overlap_any(overlap_args->node_map, job_ptr->node_bitmap)))
overlap_args->rc = 1;
return 0;
}
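/*
 * Return true if the job (every hetjob component, if any) is running
 * or suspended and at least one component overlaps the given node
 * bitmap or license list.
 */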
extern bool job_overlap_and_running(bitstr_t *node_map, list_t *license_list,
job_record_t *job_ptr)
{
job_overlap_args_t overlap_args = {
.node_map = node_map,
.license_list = license_list,
};
if (!job_ptr->het_job_list)
(void)_overlap_and_running_internal(job_ptr, &overlap_args);
else
(void)list_for_each(job_ptr->het_job_list,
_overlap_and_running_internal,
&overlap_args);
return overlap_args.rc;
}
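/*
 * list_for_each() callback: accumulate each hetjob component's node
 * names into a hostset, creating the hostset on first use.
 */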
static int _add_hetcomp_hostset(void *x, void *arg)
{
job_record_t *het_job = x;
foreach_hetcomp_args_t *args = arg;
if (args->job_ptr->het_job_id != het_job->het_job_id) {
error("%s: Bad het_job_list for %pJ", __func__, args->job_ptr);
return 0;
}
if (!het_job->nodes) {
debug("%s: %pJ het_job->nodes == NULL. Usually this means the job was canceled while it was starting and shouldn't be a real issue.",
__func__, args->job_ptr);
return 0;
}
if (args->hs)
(void) hostset_insert(args->hs, het_job->nodes);
else
args->hs = hostset_create(het_job->nodes);
return 0;
}
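/*
 * Build a NULL-terminated environment array of SLURM_* variables
 * describing the job (SLURM_JOB_ID, SLURM_JOB_NODELIST, etc.).
 * IN is_complete - if set, also export the job's exit code variables.
 */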
extern char **job_common_env_vars(job_record_t *job_ptr, bool is_complete)
{
char **my_env, *name, *eq, buf[32];
int exit_code, i, signal;
my_env = xmalloc(sizeof(char *));
my_env[0] = NULL;
/* Set SPANK env vars first so that we can overwrite as needed
* below. Prevent user hacking from setting SLURM_JOB_ID etc. */
if (job_ptr->spank_job_env_size) {
env_array_merge(&my_env,
(const char **) job_ptr->spank_job_env);
valid_spank_job_env(my_env, job_ptr->spank_job_env_size,
job_ptr->user_id);
}
setenvf(&my_env, "SLURM_JOB_ACCOUNT", "%s", job_ptr->account);
if (is_complete) {
exit_code = signal = 0;
if (WIFEXITED(job_ptr->exit_code)) {
exit_code = WEXITSTATUS(job_ptr->exit_code);
}
if (WIFSIGNALED(job_ptr->exit_code)) {
signal = WTERMSIG(job_ptr->exit_code);
}
snprintf(buf, sizeof(buf), "%d:%d", exit_code, signal);
setenvf(&my_env, "SLURM_JOB_DERIVED_EC", "%u",
job_ptr->derived_ec);
setenvf(&my_env, "SLURM_JOB_EXIT_CODE2", "%s", buf);
setenvf(&my_env, "SLURM_JOB_EXIT_CODE", "%u",
job_ptr->exit_code);
}
if (job_ptr->array_task_id != NO_VAL) {
setenvf(&my_env, "SLURM_ARRAY_JOB_ID", "%u",
job_ptr->array_job_id);
setenvf(&my_env, "SLURM_ARRAY_TASK_ID", "%u",
job_ptr->array_task_id);
if (job_ptr->details && job_ptr->details->env_sup &&
job_ptr->details->env_cnt) {
for (i = 0; i < job_ptr->details->env_cnt; i++) {
if (xstrncmp(job_ptr->details->env_sup[i],
"SLURM_ARRAY_TASK", 16))
continue;
eq = strchr(job_ptr->details->env_sup[i], '=');
if (!eq)
continue;
eq[0] = '\0';
setenvf(&my_env,
job_ptr->details->env_sup[i],
"%s", eq + 1);
eq[0] = '=';
}
}
}
if (slurm_conf.cluster_name) {
setenvf(&my_env, "SLURM_CLUSTER_NAME", "%s",
slurm_conf.cluster_name);
}
if (job_ptr->comment)
setenvf(&my_env, "SLURM_JOB_COMMENT", "%s", job_ptr->comment);
setenvf(&my_env, "SLURM_JOB_END_TIME", "%lu", job_ptr->end_time);
if (job_ptr->extra)
setenvf(&my_env, "SLURM_JOB_EXTRA", "%s", job_ptr->extra);
if (job_ptr->het_job_id) {
/* Continue support for old hetjob terminology. */
setenvf(&my_env, "SLURM_PACK_JOB_ID", "%u",
job_ptr->het_job_id);
setenvf(&my_env, "SLURM_PACK_JOB_OFFSET", "%u",
job_ptr->het_job_offset);
setenvf(&my_env, "SLURM_HET_JOB_ID", "%u",
job_ptr->het_job_id);
setenvf(&my_env, "SLURM_HET_JOB_OFFSET", "%u",
job_ptr->het_job_offset);
if ((job_ptr->het_job_offset == 0) && job_ptr->het_job_list) {
foreach_hetcomp_args_t args = {
.job_ptr = job_ptr,
};
list_for_each(job_ptr->het_job_list,
_add_hetcomp_hostset, &args);
if (args.hs) {
char *buf = hostset_ranged_string_xmalloc(
args.hs);
/* Support for old hetjob terminology. */
setenvf(&my_env, "SLURM_PACK_JOB_NODELIST",
"%s", buf);
setenvf(&my_env, "SLURM_HET_JOB_NODELIST",
"%s", buf);
xfree(buf);
hostset_destroy(args.hs);
}
}
}
setenvf(&my_env, "SLURM_JOB_GID", "%u", job_ptr->group_id);
name = group_from_job(job_ptr);
setenvf(&my_env, "SLURM_JOB_GROUP", "%s", name);
xfree(name);
setenvf(&my_env, "SLURM_JOBID", "%u", job_ptr->job_id);
setenvf(&my_env, "SLURM_JOB_ID", "%u", job_ptr->job_id);
if (job_ptr->licenses)
setenvf(&my_env, "SLURM_JOB_LICENSES", "%s", job_ptr->licenses);
setenvf(&my_env, "SLURM_JOB_NAME", "%s", job_ptr->name);
setenvf(&my_env, "SLURM_JOB_NODELIST", "%s", job_ptr->nodes);
if (job_ptr->job_resrcs) {
char *tmp;
tmp = uint32_compressed_to_str(
job_ptr->job_resrcs->cpu_array_cnt,
job_ptr->job_resrcs->cpu_array_value,
job_ptr->job_resrcs->cpu_array_reps);
setenvf(&my_env, "SLURM_JOB_CPUS_PER_NODE", "%s", tmp);
xfree(tmp);
setenvf(&my_env, "SLURM_JOB_NUM_NODES", "%u",
job_ptr->job_resrcs->nhosts);
}
if (job_ptr->part_ptr) {
setenvf(&my_env, "SLURM_JOB_PARTITION", "%s",
job_ptr->part_ptr->name);
} else {
setenvf(&my_env, "SLURM_JOB_PARTITION", "%s",
job_ptr->partition);
}
if (job_ptr->resv_ptr)
setenvf(&my_env, "SLURM_JOB_RESERVATION", "%s",
job_ptr->resv_ptr->name);
setenvf(&my_env, "SLURM_JOB_RESTART_COUNT", "%d", job_ptr->restart_cnt);
setenvf(&my_env, "SLURM_JOB_START_TIME", "%lu", job_ptr->start_time);
setenvf(&my_env, "SLURM_JOB_UID", "%u", job_ptr->user_id);
name = user_from_job(job_ptr);
setenvf(&my_env, "SLURM_JOB_USER", "%s", name);
xfree(name);
if (job_ptr->wckey) {
setenvf(&my_env, "SLURM_WCKEY", "%s", job_ptr->wckey);
}
if (job_ptr->details) {
if (job_ptr->details->features_use)
setenvf(&my_env, "SLURM_JOB_CONSTRAINTS", "%s",
job_ptr->details->features_use);
setenvf(&my_env, "SLURM_JOB_OVERSUBSCRIBE", "%s",
job_share_string(get_job_share_value(job_ptr)));
if (job_ptr->details->std_err)
setenvf(&my_env, "SLURM_JOB_STDERR", "%s",
job_ptr->details->std_err);
if (job_ptr->details->std_in)
setenvf(&my_env, "SLURM_JOB_STDIN", "%s",
job_ptr->details->std_in);
if (job_ptr->details->std_out)
setenvf(&my_env, "SLURM_JOB_STDOUT", "%s",
job_ptr->details->std_out);
if (job_ptr->details->work_dir)
setenvf(&my_env, "SLURM_JOB_WORK_DIR", "%s",
job_ptr->details->work_dir);
}
return my_env;
}
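/*
 * Build a job record from a reservation request (resv_desc_msg_t) so
 * the reservation's resources can be selected with the job scheduling
 * logic. Reservations are core based: one thread and one task per core.
 */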
extern job_record_t *job_mgr_copy_resv_desc_to_job_record(
resv_desc_msg_t *resv_desc_ptr)
{
job_record_t *job_ptr;
job_details_t *detail_ptr;
part_record_t *part_ptr = NULL;
job_ptr = _create_job_record(1, false);
detail_ptr = job_ptr->details;
job_ptr->partition = xstrdup(resv_desc_ptr->partition);
if (job_ptr->partition)
part_ptr = find_part_record(job_ptr->partition);
detail_ptr->pn_min_memory =
_get_def_mem(part_ptr, job_ptr->tres_req_cnt);
job_ptr->time_limit = resv_desc_ptr->duration;
detail_ptr->begin_time = resv_desc_ptr->start_time;
if (resv_desc_ptr->node_cnt != NO_VAL) {
detail_ptr->max_nodes = detail_ptr->min_nodes =
resv_desc_ptr->node_cnt;
} else {
detail_ptr->min_nodes = 1;
/* 500000 comes from job_scheduler.c job_start_data() */
detail_ptr->max_nodes = 500000;
}
if (resv_desc_ptr->node_list) {
hostlist_t *hl = hostlist_create(resv_desc_ptr->node_list);
hostlist_uniq(hl);
detail_ptr->req_nodes = hostlist_ranged_string_xmalloc(hl);
detail_ptr->max_nodes = detail_ptr->min_nodes =
hostlist_count(hl);
hostlist_destroy(hl);
(void) node_name2bitmap(detail_ptr->req_nodes, true,
&detail_ptr->req_node_bitmap, NULL);
}
if (resv_desc_ptr->tres_str || resv_desc_ptr->core_cnt != NO_VAL) {
detail_ptr->mc_ptr = job_record_create_mc();
/*
* Since reservations are core based we need to request it that
* way with one thread per core and one task per core.
*/
detail_ptr->mc_ptr->ntasks_per_core = 1;
detail_ptr->mc_ptr->threads_per_core = 1;
detail_ptr->num_tasks = detail_ptr->min_cpus =
resv_desc_ptr->core_cnt;
if (detail_ptr->min_cpus == NO_VAL)
detail_ptr->min_cpus = detail_ptr->min_nodes;
} else {
detail_ptr->num_tasks = detail_ptr->min_cpus =
detail_ptr->min_nodes;
detail_ptr->whole_node = WHOLE_NODE_REQUIRED;
}
detail_ptr->core_spec = NO_VAL16;
detail_ptr->cpus_per_task = 1;
detail_ptr->orig_min_cpus = detail_ptr->min_cpus;
detail_ptr->orig_max_cpus = detail_ptr->max_cpus = NO_VAL;
if ((resv_desc_ptr->flags & RESERVE_TRES_PER_NODE) &&
(resv_desc_ptr->core_cnt != NO_VAL) &&
(resv_desc_ptr->node_cnt != NO_VAL)) {
detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus =
resv_desc_ptr->core_cnt / resv_desc_ptr->node_cnt;
} else
detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus = 1;
detail_ptr->features = xstrdup(resv_desc_ptr->features);
if (build_feature_list(job_ptr, false, true)) {
error("%s: invalid features(%s) for reservation given",
__func__, detail_ptr->features);
}
detail_ptr->task_dist = SLURM_DIST_BLOCK;
job_ptr->best_switch = true;
if (resv_desc_ptr->tres_str) {
gres_job_state_validate_t gres_js_val = {
.cpus_per_tres = NULL,
.mem_per_tres = NULL,
.tres_freq = NULL,
.tres_per_socket = NULL,
.tres_per_task = NULL,
.cpus_per_task = &detail_ptr->orig_cpus_per_task,
.max_nodes = &detail_ptr->max_nodes,
.min_cpus = &detail_ptr->min_cpus,
.min_nodes = &detail_ptr->min_nodes,
.ntasks_per_node = &detail_ptr->ntasks_per_node,
.ntasks_per_socket =
&detail_ptr->mc_ptr->ntasks_per_socket,
.ntasks_per_tres = &detail_ptr->ntasks_per_tres,
.num_tasks = &detail_ptr->num_tasks,
.sockets_per_node =
&detail_ptr->mc_ptr->sockets_per_node,
.gres_list = &job_ptr->gres_list_req,
};
detail_ptr->mc_ptr->ntasks_per_socket = NO_VAL16;
detail_ptr->mc_ptr->sockets_per_node = NO_VAL16;
detail_ptr->orig_cpus_per_task = NO_VAL16;
detail_ptr->ntasks_per_tres = NO_VAL16;
job_ptr->tres_req_str = xstrdup(resv_desc_ptr->tres_str);
if (resv_desc_ptr->flags & RESERVE_TRES_PER_NODE)
job_ptr->tres_per_node = xstrdup(job_ptr->tres_req_str);
else
job_ptr->tres_per_job = xstrdup(job_ptr->tres_req_str);
gres_js_val.tres_per_job = job_ptr->tres_per_job;
gres_js_val.tres_per_node = job_ptr->tres_per_node;
(void)gres_job_state_validate(&gres_js_val);
if (detail_ptr->num_tasks == NO_VAL)
detail_ptr->num_tasks = 0;
if (detail_ptr->min_cpus == NO_VAL)
detail_ptr->min_cpus = 1;
if (resv_desc_ptr->flags & RESERVE_TRES_PER_NODE)
detail_ptr->ntasks_per_node = detail_ptr->pn_min_cpus;
else if (detail_ptr->ntasks_per_node == NO_VAL16)
detail_ptr->ntasks_per_node = 0;
if (detail_ptr->mc_ptr->ntasks_per_socket == NO_VAL16)
detail_ptr->mc_ptr->ntasks_per_socket = INFINITE16;
if (job_ptr->gres_list_req)
job_ptr->bit_flags |= GRES_ENFORCE_BIND;
gres_job_state_log(job_ptr->gres_list_req, job_ptr->job_id);
}
return job_ptr;
}
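/*
 * Determine the number of usable CPUs (threads) per core for this job
 * on the given node, bounded by the node's threads-per-core and the
 * job's ntasks-per-core and threads-per-core requests.
 */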
extern uint16_t job_mgr_determine_cpus_per_core(
job_details_t *details, int node_inx)
{
uint16_t ncpus_per_core = INFINITE16; /* Usable CPUs per core */
uint16_t threads_per_core = node_record_table_ptr[node_inx]->tpc;
if ((slurm_conf.select_type_param & SELECT_ONE_TASK_PER_CORE) &&
(details->min_gres_cpu > 0)) {
/* May override default of 1 CPU per core */
return node_record_table_ptr[node_inx]->tpc;
}
if (details && details->mc_ptr) {
multi_core_data_t *mc_ptr = details->mc_ptr;
if ((mc_ptr->ntasks_per_core != INFINITE16) &&
(mc_ptr->ntasks_per_core)) {
ncpus_per_core = MIN(threads_per_core,
(mc_ptr->ntasks_per_core *
details->cpus_per_task));
}
if ((mc_ptr->threads_per_core != NO_VAL16) &&
(mc_ptr->threads_per_core < ncpus_per_core)) {
ncpus_per_core = mc_ptr->threads_per_core;
}
}
threads_per_core = MIN(threads_per_core, ncpus_per_core);
return threads_per_core;
}
static int _sort_part_lists(void *x, void *none)
{
job_record_t *job_ptr = x;
if (job_ptr && job_ptr->part_ptr_list)
list_sort(job_ptr->part_ptr_list, priority_sort_part_tier);
return SLURM_SUCCESS;
}
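/* Sort every job's partition list by partition priority tier */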
extern void sort_all_jobs_partition_lists(void)
{
list_for_each(job_list, _sort_part_lists, NULL);
}
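/*
 * Hold a job whose launch credential could not be created, note the
 * failure in the system comment, and requeue it via job_complete().
 */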
extern void job_mgr_handle_cred_failure(job_record_t *job_ptr)
{
job_ptr->priority = 0; /* Hold job */
xfree(job_ptr->system_comment);
job_ptr->system_comment =
xstrdup("slurm_cred_create failure, holding job.");
job_complete(job_ptr->job_id, slurm_conf.slurm_user_id, true, false, 0);
}