/*****************************************************************************\
* stepmgr.c - manage the job step information of slurm
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Copyright (C) SchedMD LLC.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>, et. al.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <ctype.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include "stepmgr.h"
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/common/bitstring.h"
#include "src/common/forward.h"
#include "src/common/node_features.h"
#include "src/common/port_mgr.h"
#include "src/common/slurm_protocol_pack.h"
#include "src/common/slurm_protocol_socket.h"
#include "src/common/slurm_resource_info.h"
#include "src/common/stepd_proxy.h"
#include "src/common/tres_bind.h"
#include "src/common/tres_frequency.h"
#include "src/common/xstring.h"
#include "src/interfaces/accounting_storage.h"
#include "src/interfaces/gres.h"
#include "src/interfaces/jobacct_gather.h"
#include "src/interfaces/mcs.h"
#include "src/interfaces/select.h"
#include "src/interfaces/switch.h"
#include "src/stepmgr/gres_stepmgr.h"
#include "src/stepmgr/srun_comm.h"
typedef struct {
uint16_t flags;
bool found;
int rc_in;
uint16_t signal;
slurm_step_id_t step_id;
uid_t uid;
} step_signal_t;
typedef struct {
bitstr_t *all_gres_core_bitmap;
bitstr_t *any_gres_core_bitmap;
int core_end_bit;
int core_start_bit;
int job_node_inx;
list_t *node_gres_list;
} foreach_gres_filter_t;
static void _build_pending_step(job_record_t *job_ptr,
job_step_create_request_msg_t *step_specs);
static int _step_partial_comp(step_record_t *step_ptr,
step_complete_msg_t *req, bool finish,
int *rem, uint32_t *max_rc);
static int _count_cpus(job_record_t *job_ptr, bitstr_t *bitmap,
uint32_t *usable_cpu_cnt);
static void _dump_step_layout(step_record_t *step_ptr);
static bool _is_mem_resv(void);
static int _opt_cpu_cnt(uint32_t step_min_cpus, bitstr_t *node_bitmap,
uint32_t *usable_cpu_cnt);
static int _opt_node_cnt(uint32_t step_min_nodes, uint32_t step_max_nodes,
int nodes_avail, int nodes_picked_cnt);
static bitstr_t *_pick_step_nodes(job_record_t *job_ptr,
job_step_create_request_msg_t *step_spec,
list_t *step_gres_list, int cpus_per_task,
uint32_t node_count, int *return_code);
static bitstr_t *_pick_step_nodes_cpus(job_record_t *job_ptr,
bitstr_t *nodes_bitmap, int node_cnt,
int cpu_cnt, uint32_t *usable_cpu_cnt);
static void _step_dealloc_lps(step_record_t *step_ptr);
static step_record_t *_build_interactive_step(
job_record_t *job_ptr_in,
job_step_create_request_msg_t *step_specs,
uint16_t protocol_version);
static int _build_ext_launcher_step(step_record_t **new_step_record,
job_record_t *job_ptr,
job_step_create_request_msg_t *step_specs,
uint16_t protocol_version);
static void _wake_pending_steps(job_record_t *job_ptr);
stepmgr_ops_t *stepmgr_ops = NULL;
extern void stepmgr_init(stepmgr_ops_t *ops)
{
/*
* Just keep the pointer so that the structure's members can still be
* assigned after init is called.
*/
stepmgr_ops = ops;
}
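/*
 * Illustrative sketch (not part of this file's logic; the field list is
 * inferred from the members dereferenced below, see stepmgr.h for the real
 * definition): a caller wires up its callbacks and shared state once, then
 * hands the structure to stepmgr_init() so this file can reach back into the
 * daemon that loaded it.
 *
 *	static stepmgr_ops_t ops = {
 *		.acct_db_conn = acct_db_conn,
 *		.last_job_update = &last_job_update,
 *		.up_node_bitmap = up_node_bitmap,
 *		.find_job_record = find_job_record,
 *		.agent_queue_request = agent_queue_request,
 *		.job_config_fini = job_config_fini,
 *	};
 *	stepmgr_init(&ops);
 */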
/* Determine how many more CPUs are required for a job step */
static int _opt_cpu_cnt(uint32_t step_min_cpus, bitstr_t *node_bitmap,
uint32_t *usable_cpu_cnt)
{
int rem_cpus = step_min_cpus;
if (!node_bitmap)
return rem_cpus;
xassert(usable_cpu_cnt);
for (int i = 0; next_node_bitmap(node_bitmap, &i); i++) {
if (usable_cpu_cnt[i] >= rem_cpus)
return 0;
rem_cpus -= usable_cpu_cnt[i];
}
return rem_cpus;
}
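/*
 * Illustrative example for _opt_cpu_cnt(): with step_min_cpus=10 and three
 * usable nodes offering 4, 4 and 4 CPUs, the remainder drops 10 -> 6 -> 2
 * and the third node satisfies it (4 >= 2), so 0 is returned; with only two
 * such nodes the function would return the 2 CPUs still required.
 */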
/* Select the optimal node count for a job step based upon its min and
* max target, available resources, and nodes already picked */
static int _opt_node_cnt(uint32_t step_min_nodes, uint32_t step_max_nodes,
int nodes_avail, int nodes_picked_cnt)
{
int target_node_cnt;
if ((step_max_nodes > step_min_nodes) && (step_max_nodes != NO_VAL))
target_node_cnt = step_max_nodes;
else
target_node_cnt = step_min_nodes;
if (target_node_cnt > nodes_picked_cnt)
target_node_cnt -= nodes_picked_cnt;
else
target_node_cnt = 0;
if (nodes_avail < target_node_cnt)
target_node_cnt = nodes_avail;
return target_node_cnt;
}
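/*
 * Illustrative example for _opt_node_cnt(): step_min_nodes=2,
 * step_max_nodes=4, 1 node already picked and 3 nodes still available
 * targets the max (4), subtracts the node already picked (3), and is then
 * capped by availability, so 3 more nodes are requested.
 */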
/* Purge any duplicate job steps for this PID */
static int _purge_duplicate_steps(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
job_step_create_request_msg_t *step_specs =
(job_step_create_request_msg_t *) arg;
if ((step_ptr->step_id.step_id == SLURM_PENDING_STEP) &&
(step_ptr->state == JOB_PENDING) &&
(step_ptr->srun_pid == step_specs->srun_pid) &&
(!xstrcmp(step_ptr->host, step_specs->host))) {
return 1;
}
/*
* See if we have the same step id. If we do, check to see if we
* have the same step_het_comp or if the step's step_het_comp is NO_VAL,
* meaning this step is not a het step.
*/
if ((step_specs->step_id.step_id == step_ptr->step_id.step_id) &&
((step_specs->step_id.step_het_comp ==
step_ptr->step_id.step_het_comp) ||
(step_ptr->step_id.step_het_comp == NO_VAL)))
return -1;
return 0;
}
/* The step with a state of PENDING is used as a placeholder for a host and
* port that can be used to wake a pending srun as soon as another step ends */
static void _build_pending_step(job_record_t *job_ptr,
job_step_create_request_msg_t *step_specs)
{
step_record_t *step_ptr;
if ((step_specs->host == NULL) || (step_specs->port == 0))
return;
step_ptr = create_step_record(job_ptr, 0);
if (step_ptr == NULL)
return;
*stepmgr_ops->last_job_update = time(NULL);
step_ptr->cpu_count = step_specs->num_tasks;
step_ptr->port = step_specs->port;
step_ptr->srun_pid = step_specs->srun_pid;
step_ptr->host = xstrdup(step_specs->host);
step_ptr->state = JOB_PENDING;
step_ptr->step_id.job_id = job_ptr->job_id;
step_ptr->step_id.step_id = SLURM_PENDING_STEP;
step_ptr->step_id.step_het_comp = NO_VAL;
step_ptr->cwd = xstrdup(step_specs->cwd);
step_ptr->std_err = xstrdup(step_specs->std_err);
step_ptr->std_in = xstrdup(step_specs->std_in);
step_ptr->std_out = xstrdup(step_specs->std_out);
step_ptr->submit_line = xstrdup(step_specs->submit_line);
if (job_ptr->node_bitmap)
step_ptr->step_node_bitmap = bit_copy(job_ptr->node_bitmap);
step_ptr->time_last_active = time(NULL);
}
static void _internal_step_complete(step_record_t *step_ptr, int remaining)
{
struct jobacctinfo *jobacct = (struct jobacctinfo *)step_ptr->jobacct;
job_record_t *job_ptr = step_ptr->job_ptr;
bool add_energy = true;
if ((slurm_conf.prolog_flags & PROLOG_FLAG_CONTAIN) &&
(step_ptr->step_id.step_id != SLURM_EXTERN_CONT))
add_energy = false;
if (add_energy && jobacct && job_ptr->tres_alloc_cnt &&
(jobacct->energy.consumed_energy != NO_VAL64)) {
if (job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] == NO_VAL64)
job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] = 0;
job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] +=
jobacct->energy.consumed_energy;
}
if (IS_JOB_FINISHED(job_ptr) &&
job_ptr->tres_alloc_cnt &&
(job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64) &&
(remaining == 1)) {
assoc_mgr_set_job_tres_alloc_str(job_ptr, false);
/* This flag says we have processed the tres alloc including
* energy from all steps, so don't process or handle it again
* with the job. It also tells the slurmdbd plugin to send it
* to the DBD.
*/
job_ptr->bit_flags |= TRES_STR_CALC;
}
jobacct_storage_g_step_complete(stepmgr_ops->acct_db_conn, step_ptr);
if (step_ptr->step_id.step_id == SLURM_PENDING_STEP)
return;
/*
* Derived exit code is the highest exit code of srun steps, so we
* exclude the batch and extern steps.
*
* Sync with _get_derived_ec_update_str() for setting derived_ec on the
* dbd side.
*/
if ((step_ptr->step_id.step_id != SLURM_EXTERN_CONT) &&
(step_ptr->step_id.step_id != SLURM_BATCH_SCRIPT) &&
((step_ptr->exit_code == SIG_OOM) ||
(step_ptr->exit_code > job_ptr->derived_ec)))
job_ptr->derived_ec = step_ptr->exit_code;
step_ptr->state |= JOB_COMPLETING;
_step_dealloc_lps(step_ptr);
/* Don't need to set state. Will be destroyed in next steps. */
/* step_ptr->state = JOB_COMPLETE; */
}
static int _step_signal(void *object, void *arg)
{
step_record_t *step_ptr = (step_record_t *)object;
step_signal_t *step_signal = (step_signal_t *)arg;
uint16_t signal;
int rc;
if (!(step_signal->flags & KILL_FULL_JOB) &&
!find_step_id(step_ptr, &step_signal->step_id))
return SLURM_SUCCESS;
step_signal->found = true;
signal = step_signal->signal;
/*
* If step_het_comp is NO_VAL means it is a non-het step, so return
* SLURM_ERROR to break out of the list_for_each.
*/
rc = (step_ptr->step_id.step_het_comp == NO_VAL) ?
SLURM_ERROR : SLURM_SUCCESS;
if (step_signal->flags & KILL_OOM)
step_ptr->exit_code = SIG_OOM;
if (step_signal->flags & KILL_NO_SIG_FAIL) {
debug("%s: setting SSF_NO_SIG_FAIL for %pS",
__func__, step_ptr);
step_ptr->flags |= SSF_NO_SIG_FAIL;
}
/*
* If SIG_NODE_FAIL comes through, it means we had nodes fail,
* so handle that in the select plugin and switch the signal
* to KILL afterwards.
*/
if (signal == SIG_NODE_FAIL) {
if (step_signal->rc_in != SLURM_SUCCESS)
return rc;
signal = SIGKILL;
}
/* save user ID of the one who requested the job be cancelled */
if (signal == SIGKILL) {
step_ptr->requid = step_signal->uid;
srun_step_complete(step_ptr);
}
signal_step_tasks(step_ptr, signal, REQUEST_SIGNAL_TASKS);
return rc;
}
static int _step_not_cleaning(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
int *remaining = (int *) arg;
if (step_ptr->step_id.step_id == SLURM_PENDING_STEP)
srun_step_signal(step_ptr, 0);
_internal_step_complete(step_ptr, *remaining);
(*remaining)--;
return 1;
}
/*
* _finish_step_comp - Finish deallocating and delete a non-pending step.
*/
static int _finish_step_comp(void *x, void *arg)
{
int remaining;
step_record_t *step_ptr = x;
job_record_t *job_ptr = step_ptr->job_ptr;
if (step_ptr->step_id.step_id == SLURM_PENDING_STEP)
return 0;
remaining = list_count(job_ptr->step_list);
_internal_step_complete(step_ptr, remaining);
delete_step_record(job_ptr, step_ptr);
_wake_pending_steps(job_ptr);
*stepmgr_ops->last_job_update = time(NULL);
return 1;
}
/*
* delete_step_records - Delete all step records for the specified job_ptr.
* This function is called when a step fails to run to completion. For example,
* when the job is killed due to reaching its time limit or allocated nodes
* go DOWN.
* IN job_ptr - pointer to job table entry to have step records removed
*/
extern void delete_step_records(job_record_t *job_ptr)
{
int remaining;
xassert(job_ptr);
remaining = list_count(job_ptr->step_list);
*stepmgr_ops->last_job_update = time(NULL);
list_delete_all(job_ptr->step_list, _step_not_cleaning, &remaining);
}
/*
* delete_step_record - delete the record for the job step identified by the
* specified job_ptr and step_ptr
* IN job_ptr - pointer to job table entry to have step record removed
* IN step_ptr - pointer to step table entry of the desired job step
*/
void delete_step_record(job_record_t *job_ptr, step_record_t *step_ptr)
{
xassert(job_ptr);
xassert(job_ptr->step_list);
xassert(step_ptr);
*stepmgr_ops->last_job_update = time(NULL);
list_delete_ptr(job_ptr->step_list, step_ptr);
}
/*
* dump_step_desc - dump the incoming step initiate request message
* IN step_spec - job step request specification from RPC
*/
void
dump_step_desc(job_step_create_request_msg_t *step_spec)
{
uint64_t mem_value = step_spec->pn_min_memory;
char *mem_type = "node";
if (mem_value & MEM_PER_CPU) {
mem_value &= (~MEM_PER_CPU);
mem_type = "cpu";
}
log_flag(CPU_FREQ, "StepDesc: user_id=%u JobId=%u cpu_freq_gov=%u cpu_freq_max=%u cpu_freq_min=%u",
step_spec->user_id, step_spec->step_id.job_id,
step_spec->cpu_freq_gov,
step_spec->cpu_freq_max, step_spec->cpu_freq_min);
debug3("StepDesc: user_id=%u %ps node_count=%u-%u cpu_count=%u num_tasks=%u",
step_spec->user_id, &step_spec->step_id,
step_spec->min_nodes, step_spec->max_nodes,
step_spec->cpu_count, step_spec->num_tasks);
debug3(" cpu_freq_gov=%u cpu_freq_max=%u cpu_freq_min=%u "
"relative=%u task_dist=0x%X plane=%u",
step_spec->cpu_freq_gov, step_spec->cpu_freq_max,
step_spec->cpu_freq_min, step_spec->relative,
step_spec->task_dist, step_spec->plane_size);
debug3(" node_list=%s constraints=%s",
step_spec->node_list, step_spec->features);
debug3(" host=%s port=%u srun_pid=%u name=%s network=%s exclusive=%s",
step_spec->host, step_spec->port, step_spec->srun_pid,
step_spec->name, step_spec->network,
(step_spec->flags & SSF_EXCLUSIVE) ? "yes" : "no");
debug3(" mem_per_%s=%"PRIu64" resv_port_cnt=%u immediate=%u no_kill=%s",
mem_type, mem_value, step_spec->resv_port_cnt,
step_spec->immediate,
(step_spec->flags & SSF_NO_KILL) ? "yes" : "no");
debug3(" overcommit=%s time_limit=%u",
(step_spec->flags & SSF_OVERCOMMIT) ? "yes" : "no",
step_spec->time_limit);
if (step_spec->cpus_per_tres)
debug3(" CPUs_per_TRES=%s", step_spec->cpus_per_tres);
if (step_spec->mem_per_tres)
debug3(" Mem_per_TRES=%s", step_spec->mem_per_tres);
if (step_spec->tres_bind)
debug3(" TRES_bind=%s", step_spec->tres_bind);
if (step_spec->tres_freq)
debug3(" TRES_freq=%s", step_spec->tres_freq);
if (step_spec->tres_per_step)
debug3(" TRES_per_step=%s", step_spec->tres_per_step);
if (step_spec->tres_per_node)
debug3(" TRES_per_node=%s", step_spec->tres_per_node);
if (step_spec->tres_per_socket)
debug3(" TRES_per_socket=%s", step_spec->tres_per_socket);
if (step_spec->tres_per_task)
debug3(" TRES_per_task=%s", step_spec->tres_per_task);
if (step_spec->container || step_spec->container_id)
debug3(" Container=%s ContainerID=%s",
step_spec->container, step_spec->container_id);
}
/*
* job_step_signal - signal the specified job step
* IN step_id - filled in slurm_step_id_t
* IN signal - signal to send to the job step
* IN flags - RPC flags
* IN uid - user id of user issuing the RPC
* RET 0 on success, otherwise ESLURM error code
* global: job_list - pointer to global job list
* last_job_update - time of last job table update
*/
extern int job_step_signal(slurm_step_id_t *step_id,
uint16_t signal, uint16_t flags, uid_t uid)
{
job_record_t *job_ptr;
step_signal_t step_signal = {
.flags = flags,
.found = false,
.rc_in = SLURM_SUCCESS,
.signal = signal,
.uid = uid,
};
memcpy(&step_signal.step_id, step_id, sizeof(step_signal.step_id));
job_ptr = stepmgr_ops->find_job_record(step_id->job_id);
if (job_ptr == NULL) {
error("job_step_signal: invalid JobId=%u", step_id->job_id);
return ESLURM_INVALID_JOB_ID;
}
if (IS_JOB_FINISHED(job_ptr)) {
step_signal.rc_in = ESLURM_ALREADY_DONE;
if (signal != SIG_NODE_FAIL)
return step_signal.rc_in;
} else if (!IS_JOB_RUNNING(job_ptr)) {
verbose("%s: %pJ is in state %s, cannot signal steps",
__func__, job_ptr,
job_state_string(job_ptr->job_state));
if (signal != SIG_NODE_FAIL)
return ESLURM_TRANSITION_STATE_NO_UPDATE;
}
list_for_each(job_ptr->step_list, _step_signal, &step_signal);
if (!step_signal.found && running_in_slurmctld() &&
(job_ptr->bit_flags & STEPMGR_ENABLED)) {
agent_arg_t *agent_args = NULL;
job_step_kill_msg_t *kill_msg = NULL;
node_record_t *node_ptr;
kill_msg = xmalloc(sizeof(*kill_msg));
kill_msg->signal = signal;
kill_msg->flags = flags;
kill_msg->step_id = *step_id;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_CANCEL_JOB_STEP;
agent_args->retry = 1;
agent_args->hostlist = hostlist_create(job_ptr->batch_host);
agent_args->node_count = 1;
if ((node_ptr = find_node_record(job_ptr->batch_host)))
agent_args->protocol_version =
node_ptr->protocol_version;
agent_args->msg_args = kill_msg;
set_agent_arg_r_uid(agent_args, slurm_conf.slurmd_user_id);
stepmgr_ops->agent_queue_request(agent_args);
step_signal.found = true;
step_signal.rc_in = SLURM_SUCCESS;
}
if (!step_signal.found) {
info("%s: %pJ StepId=%u not found",
__func__, job_ptr, step_id->step_id);
return ESLURM_INVALID_JOB_ID;
}
return step_signal.rc_in;
}
/*
* signal_step_tasks - send specific signal to specific job step
* IN step_ptr - step record pointer
* IN signal - signal to send
* IN msg_type - message type to send
*/
void signal_step_tasks(step_record_t *step_ptr, uint16_t signal,
slurm_msg_type_t msg_type)
{
node_record_t *node_ptr;
static bool cloud_dns = false;
static time_t last_update = 0;
signal_tasks_msg_t *signal_tasks_msg;
agent_arg_t *agent_args = NULL;
xassert(step_ptr);
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = msg_type;
agent_args->retry = 1;
agent_args->hostlist = hostlist_create(NULL);
signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t));
memcpy(&signal_tasks_msg->step_id, &step_ptr->step_id,
sizeof(signal_tasks_msg->step_id));
signal_tasks_msg->signal = signal;
if (step_ptr->flags & SSF_NO_SIG_FAIL)
signal_tasks_msg->flags |= KILL_NO_SIG_FAIL;
log_flag(STEPS, "%s: queueing signal %d with flags=0x%x for %pS",
__func__, signal, signal_tasks_msg->flags, step_ptr);
if (last_update != slurm_conf.last_update) {
if (xstrcasestr(slurm_conf.slurmctld_params, "cloud_dns"))
cloud_dns = true;
else
cloud_dns = false;
last_update = slurm_conf.last_update;
}
agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
for (int i = 0;
(node_ptr = next_node_bitmap(step_ptr->step_node_bitmap, &i));
i++) {
if (agent_args->protocol_version > node_ptr->protocol_version)
agent_args->protocol_version =
node_ptr->protocol_version;
hostlist_push_host(agent_args->hostlist, node_ptr->name);
agent_args->node_count++;
if (PACK_FANOUT_ADDRS(node_ptr))
agent_args->msg_flags |= SLURM_PACK_ADDRS;
}
if (agent_args->node_count == 0) {
xfree(signal_tasks_msg);
hostlist_destroy(agent_args->hostlist);
xfree(agent_args);
return;
}
agent_args->msg_args = signal_tasks_msg;
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
stepmgr_ops->agent_queue_request(agent_args);
}
/*
* signal_step_tasks_on_node - send specific signal to specific job step
* on a specific node.
* IN node_name - name of node on which to signal tasks
* IN step_ptr - step record pointer
* IN signal - signal to send
* IN msg_type - message type to send
*/
void signal_step_tasks_on_node(char* node_name, step_record_t *step_ptr,
uint16_t signal, slurm_msg_type_t msg_type)
{
signal_tasks_msg_t *signal_tasks_msg;
agent_arg_t *agent_args = NULL;
node_record_t *node_ptr;
xassert(step_ptr);
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = msg_type;
agent_args->retry = 1;
if ((node_ptr = find_node_record(node_name)))
agent_args->protocol_version = node_ptr->protocol_version;
agent_args->node_count++;
agent_args->hostlist = hostlist_create(node_name);
if (!agent_args->hostlist)
fatal("Invalid node_name: %s", node_name);
signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t));
memcpy(&signal_tasks_msg->step_id, &step_ptr->step_id,
sizeof(signal_tasks_msg->step_id));
signal_tasks_msg->signal = signal;
agent_args->msg_args = signal_tasks_msg;
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
stepmgr_ops->agent_queue_request(agent_args);
}
typedef struct {
int config_start_count;
int start_count;
time_t max_age;
} wake_steps_args_t;
static int _wake_steps(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
wake_steps_args_t *args = (wake_steps_args_t *) arg;
if (step_ptr->state != JOB_PENDING)
return 0;
if ((args->start_count < args->config_start_count) ||
(step_ptr->time_last_active <= args->max_age)) {
srun_step_signal(step_ptr, 0);
args->start_count++;
return 1;
}
return 0;
}
/* A step just completed, signal srun processes with pending steps to retry */
static void _wake_pending_steps(job_record_t *job_ptr)
{
static int config_start_count = -1, config_max_age = -1;
wake_steps_args_t args;
if (!IS_JOB_RUNNING(job_ptr))
return;
if (!job_ptr->step_list)
return;
if (config_start_count == -1) {
char *tmp_ptr;
long int param;
config_start_count = 8;
config_max_age = 60;
if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params,
"step_retry_count="))) {
param = strtol(tmp_ptr + 17, NULL, 10);
if ((param >= 1) && (param != LONG_MIN) &&
(param != LONG_MAX))
config_start_count = param;
}
if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params,
"step_retry_time="))) {
param = strtol(tmp_ptr + 16, NULL, 10);
if ((param >= 1) && (param != LONG_MIN) &&
(param != LONG_MAX))
config_max_age = param;
}
}
args.max_age = time(NULL) - config_max_age;
/* We do not know which steps can use currently available resources.
* Try to start a bit more based upon step sizes. Effectiveness
* varies with step sizes, constraints and order. */
args.config_start_count = config_start_count;
args.start_count = 0;
list_delete_all(job_ptr->step_list, _wake_steps, &args);
}
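/*
 * Illustrative example for _wake_pending_steps(): with
 * "SchedulerParameters=step_retry_count=16,step_retry_time=30" in slurm.conf,
 * up to 16 pending steps are woken right away and any pending step idle for
 * more than 30 seconds is also woken, instead of the defaults of 8 steps and
 * 60 seconds used above.
 */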
/* Set cur_inx to the next round-robin node index */
static int _next_node_inx(int *cur_inx, int *check_cnt, int len, int node_cnt,
bitstr_t *nodes_bitmap, bitstr_t **picked_node_bitmap,
int start_inx)
{
bool wrapped = false;
xassert(cur_inx);
xassert(check_cnt);
xassert(nodes_bitmap);
xassert(picked_node_bitmap);
if (*check_cnt == 0) {
*cur_inx = start_inx;
} else {
*cur_inx = (*cur_inx + 1) % len;
wrapped = *cur_inx <= start_inx;
if (*cur_inx == start_inx)
return SLURM_ERROR; /* Normal break case */
}
if (*check_cnt >= node_cnt)
return SLURM_ERROR; /* Normal break case */
*cur_inx = bit_ffs_from_bit(nodes_bitmap, *cur_inx);
if (wrapped && (*cur_inx >= start_inx))
return SLURM_ERROR; /* Normal break case */
if (*cur_inx < 0) {
/* This should never happen */
xassert(false);
FREE_NULL_BITMAP(*picked_node_bitmap);
return SLURM_ERROR;
}
(*check_cnt)++;
return SLURM_SUCCESS;
}
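/*
 * Illustrative example for _next_node_inx(): with start_inx=5 and len=8 the
 * iterator visits set bits 5, 6, 7, 0, 1, ... and stops either when it wraps
 * back around to start_inx or when node_cnt nodes have been checked;
 * SLURM_ERROR is the normal "done" indication in both cases.
 */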
/* Pick nodes to be allocated to a job step. If a CPU count is also specified,
* then select nodes with a sufficient CPU count.
* IN job_ptr - job to contain step allocation
* IN/OUT nodes_bitmap - nodes available (IN), selected for use (OUT)
* IN node_cnt - step node count specification
* IN cpu_cnt - step CPU count specification
* IN usable_cpu_cnt - count of usable CPUs on each node in node_bitmap
*/
static bitstr_t *_pick_step_nodes_cpus(job_record_t *job_ptr,
bitstr_t *nodes_bitmap, int node_cnt,
int cpu_cnt, uint32_t *usable_cpu_cnt)
{
bitstr_t *picked_node_bitmap = NULL;
int *usable_cpu_array;
int cpu_target; /* Target number of CPUs per allocated node */
int rem_nodes, rem_cpus, save_rem_nodes, save_rem_cpus;
int i;
int start_inx, bit_len, check_cnt;
xassert(node_cnt > 0);
xassert(nodes_bitmap);
xassert(usable_cpu_cnt);
picked_node_bitmap = bit_alloc(node_record_count);
start_inx = job_ptr->job_resrcs->next_step_node_inx;
bit_len = bit_fls(nodes_bitmap) + 1;
if (start_inx >= bit_len)
start_inx = 0;
cpu_target = ROUNDUP(cpu_cnt, node_cnt);
if (cpu_target > 1024)
info("%s: high cpu_target (%d)", __func__, cpu_target);
if ((cpu_cnt <= node_cnt) || (cpu_target > 1024)) {
check_cnt = 0;
while (_next_node_inx(&i, &check_cnt, bit_len, node_cnt,
nodes_bitmap, &picked_node_bitmap,
start_inx) == SLURM_SUCCESS)
bit_set(picked_node_bitmap, i);
return picked_node_bitmap;
}
/* Need to satisfy both a node count and a cpu count */
usable_cpu_array = xcalloc(cpu_target, sizeof(int));
rem_nodes = node_cnt;
rem_cpus = cpu_cnt;
check_cnt = 0;
while (_next_node_inx(&i, &check_cnt, bit_len, bit_len, nodes_bitmap,
&picked_node_bitmap, start_inx) ==
SLURM_SUCCESS) {
if (usable_cpu_cnt[i] < cpu_target) {
usable_cpu_array[usable_cpu_cnt[i]]++;
continue;
}
bit_set(picked_node_bitmap, i);
rem_cpus -= usable_cpu_cnt[i];
rem_nodes--;
if ((rem_cpus <= 0) && (rem_nodes <= 0)) {
/* Satisfied request */
xfree(usable_cpu_array);
return picked_node_bitmap;
}
if (rem_nodes == 0) { /* Reached node limit, not CPU limit */
xfree(usable_cpu_array);
FREE_NULL_BITMAP(picked_node_bitmap);
return NULL;
}
}
if (!picked_node_bitmap) {
xfree(usable_cpu_array);
return NULL;
}
/* Need more resources. Determine what CPU counts per node to use */
save_rem_nodes = rem_nodes;
save_rem_cpus = rem_cpus;
usable_cpu_array[0] = 0;
for (i = (cpu_target - 1); i > 0; i--) {
if (usable_cpu_array[i] == 0)
continue;
if (usable_cpu_array[i] > rem_nodes)
usable_cpu_array[i] = rem_nodes;
if (rem_nodes > 0) {
rem_nodes -= usable_cpu_array[i];
rem_cpus -= (usable_cpu_array[i] * i);
}
}
if ((rem_cpus > 0) || (rem_nodes > 0)){ /* Can not satisfy request */
xfree(usable_cpu_array);
FREE_NULL_BITMAP(picked_node_bitmap);
return NULL;
}
rem_nodes = save_rem_nodes;
rem_cpus = save_rem_cpus;
/* Pick nodes with CPU counts below original target */
check_cnt = 0;
while (_next_node_inx(&i, &check_cnt, bit_len, bit_len, nodes_bitmap,
&picked_node_bitmap, start_inx) ==
SLURM_SUCCESS) {
if (usable_cpu_cnt[i] >= cpu_target)
continue; /* already picked */
if (usable_cpu_array[usable_cpu_cnt[i]] == 0)
continue;
usable_cpu_array[usable_cpu_cnt[i]]--;
bit_set(picked_node_bitmap, i);
rem_cpus -= usable_cpu_cnt[i];
rem_nodes--;
if ((rem_cpus <= 0) && (rem_nodes <= 0)) {
/* Satisfied request */
xfree(usable_cpu_array);
return picked_node_bitmap;
}
if (rem_nodes == 0) /* Reached node limit */
break;
}
/* Can not satisfy request */
xfree(usable_cpu_array);
FREE_NULL_BITMAP(picked_node_bitmap);
return NULL;
}
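/*
 * Illustrative example for _pick_step_nodes_cpus(): node_cnt=3 and
 * cpu_cnt=10 gives cpu_target=4 (ROUNDUP(10, 3)). The first pass picks
 * nodes offering at least 4 usable CPUs while tallying the CPU counts of the
 * skipped nodes in usable_cpu_array[]. If, say, one node and two CPUs are
 * still outstanding, the middle loop budgets how many of the skipped nodes
 * (largest counts first) are needed to cover the remainder and the second
 * pass picks exactly those, or the request fails.
 */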
static int _mark_busy_nodes(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
bitstr_t *busy = (bitstr_t *) arg;
if (step_ptr->state < JOB_RUNNING)
return 0;
/*
 * Don't consider the batch, extern, interactive, or external
 * launcher steps when looking for "idle" nodes.
 */
if ((step_ptr->step_id.step_id == SLURM_BATCH_SCRIPT) ||
(step_ptr->step_id.step_id == SLURM_EXTERN_CONT) ||
(step_ptr->step_id.step_id == SLURM_INTERACTIVE_STEP) ||
(step_ptr->flags & SSF_EXT_LAUNCHER))
return 0;
if (!step_ptr->step_node_bitmap) {
error("%s: %pS has no step_node_bitmap",
__func__, step_ptr);
return 0;
}
bit_or(busy, step_ptr->step_node_bitmap);
if (slurm_conf.debug_flags & DEBUG_FLAG_STEPS) {
char *temp;
temp = bitmap2node_name(step_ptr->step_node_bitmap);
log_flag(STEPS, "%s: %pS has nodes %s",
__func__, step_ptr, temp);
xfree(temp);
}
return 0;
}
static void _step_test_gres(job_step_create_request_msg_t *step_spec,
gres_stepmgr_step_test_args_t *gres_test_args,
job_record_t *job_ptr,
uint32_t *node_usable_cpu_cnt,
uint32_t *total_cpus,
uint32_t *avail_cpus,
int *gres_invalid_nodes,
int *fail_mode)
{
uint64_t gres_cpus;
int err_code = SLURM_SUCCESS;
gres_test_args->err_code = &err_code;
/* ignore current step allocations */
gres_test_args->ignore_alloc = true;
gres_cpus = gres_stepmgr_step_test(gres_test_args);
*total_cpus = MIN(*total_cpus, gres_cpus);
/*
* consider current step allocations if
* not --overlap=force
*/
if (!(step_spec->flags & SSF_OVERLAP_FORCE)) {
gres_test_args->ignore_alloc = false;
gres_cpus = gres_stepmgr_step_test(gres_test_args);
}
if (gres_cpus < *avail_cpus) {
log_flag(STEPS, "%s: %pJ Usable CPUs for GRES %"PRIu64" from %d previously available",
__func__, job_ptr, gres_cpus,
*avail_cpus);
*avail_cpus = gres_cpus;
*node_usable_cpu_cnt = *avail_cpus;
if (err_code != SLURM_SUCCESS)
*fail_mode = err_code;
else
*fail_mode = ESLURM_INVALID_GRES;
if (*total_cpus == 0) {
/*
* total_cpus == 0 is set from this:
* MIN(*total_cpus, gres_cpus);
* This means that it is impossible to run this step on
* this node due to GRES.
*/
*gres_invalid_nodes = *gres_invalid_nodes + 1;
}
}
}
/* Returns threads_per_core required by the step or NO_VAL16 if not specified */
static uint16_t _get_threads_per_core(uint16_t step_threads_per_core,
job_record_t *job_ptr)
{
uint16_t tpc = NO_VAL16;
if (step_threads_per_core &&
(step_threads_per_core != NO_VAL16)) {
tpc = step_threads_per_core;
} else if (job_ptr->details->mc_ptr->threads_per_core &&
(job_ptr->details->mc_ptr->threads_per_core != NO_VAL16))
tpc = job_ptr->details->mc_ptr->threads_per_core;
return tpc;
}
static int _cmp_cpu_counts(const void *num1, const void *num2) {
uint16_t cpu1 = *(uint16_t *) num1;
uint16_t cpu2 = *(uint16_t *) num2;
if (cpu1 > cpu2)
return -1;
else if (cpu1 < cpu2)
return 1;
return 0;
}
static void _set_max_num_tasks(job_step_create_request_msg_t *step_spec,
job_record_t *job_ptr,
bitstr_t *node_bitmap,
int cpus_per_task)
{
int j = 0;
int k = 0;
uint32_t avail_cnt, num_nodes;
uint16_t *cpus;
uint32_t num_tasks = 0;
uint16_t tpc = _get_threads_per_core(step_spec->threads_per_core,
job_ptr);
xassert(node_bitmap);
xassert(cpus_per_task);
avail_cnt = bit_set_count(node_bitmap);
num_nodes = MIN(avail_cnt, step_spec->max_nodes);
cpus = xcalloc(avail_cnt, sizeof(*cpus));
for (int i = 0; i < job_ptr->job_resrcs->nhosts; i++) {
j = bit_ffs_from_bit(job_ptr->job_resrcs->node_bitmap, j);
if (j < 0)
break;
if (!bit_test(node_bitmap, j)) {
j++;
continue;
}
if (tpc != NO_VAL16) {
cpus[k] = ROUNDUP(job_ptr->job_resrcs->cpus[i],
node_record_table_ptr[j]->tpc);
cpus[k] *= tpc;
} else
cpus[k] = job_ptr->job_resrcs->cpus[i];
j++;
k++;
}
if (num_nodes < avail_cnt)
qsort(cpus, avail_cnt, sizeof(*cpus), _cmp_cpu_counts);
for (int i = 0; i < num_nodes; i++) {
num_tasks += cpus[i] / cpus_per_task;
}
step_spec->num_tasks = num_tasks;
step_spec->cpu_count = num_tasks * cpus_per_task;
xfree(cpus);
}
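/*
 * Illustrative example for _set_max_num_tasks(): a node allocated 8 CPUs on
 * hardware with tpc=2, with the step limited to one thread per core, yields
 * ROUNDUP(8, 2) * 1 = 4 usable CPUs; with cpus_per_task=2 that node
 * contributes 2 tasks. num_tasks is the sum over the max_nodes largest such
 * nodes and cpu_count is num_tasks * cpus_per_task.
 */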
/*
* _pick_step_nodes - select nodes for a job step that satisfy its requirements
* we satisfy the super-set of constraints.
* IN job_ptr - pointer to job to have new step started
* IN step_spec - job step specification
* IN step_gres_list - job step's gres requirement details
* IN cpus_per_task - NOTE could be zero
* IN node_count - How many real nodes a select plugin should be looking for
* OUT return_code - exit code or SLURM_SUCCESS
* global: node_record_table_ptr - pointer to global node table
* NOTE: returns all of a job's nodes if step_spec->min_nodes == INFINITE
* NOTE: returned bitmap must be freed by the caller using FREE_NULL_BITMAP()
*/
static bitstr_t *_pick_step_nodes(job_record_t *job_ptr,
job_step_create_request_msg_t *step_spec,
list_t *step_gres_list, int cpus_per_task,
uint32_t node_count, int *return_code)
{
node_record_t *node_ptr;
bitstr_t *nodes_avail = NULL, *nodes_idle = NULL;
bitstr_t *select_nodes_avail = NULL;
bitstr_t *nodes_picked = NULL, *node_tmp = NULL;
int error_code, nodes_picked_cnt = 0, cpus_picked_cnt = 0;
int cpu_cnt, i;
int mem_blocked_nodes = 0, mem_blocked_cpus = 0;
int job_blocked_nodes = 0, job_blocked_cpus = 0;
int gres_invalid_nodes = 0;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
uint32_t *usable_cpu_cnt = NULL;
gres_stepmgr_step_test_args_t gres_test_args = {
.cpus_per_task = cpus_per_task,
.first_step_node = true,
.job_gres_list = job_ptr->gres_list_alloc,
.job_id = job_ptr->job_id,
.job_resrcs_ptr = job_resrcs_ptr,
.max_rem_nodes = step_spec->max_nodes,
.step_gres_list = step_gres_list,
.step_id = NO_VAL,
.test_mem = false,
};
xassert(job_resrcs_ptr);
xassert(job_resrcs_ptr->cpus);
xassert(job_resrcs_ptr->cpus_used);
*return_code = SLURM_SUCCESS;
if (job_ptr->node_bitmap == NULL) {
*return_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
return NULL;
}
if (step_spec->max_nodes == 0)
step_spec->max_nodes = job_ptr->node_cnt;
if (step_spec->max_nodes < step_spec->min_nodes) {
*return_code = ESLURM_INVALID_NODE_COUNT;
return NULL;
}
if (!nodes_avail)
nodes_avail = bit_copy (job_ptr->node_bitmap);
bit_and(nodes_avail, stepmgr_ops->up_node_bitmap);
if (step_spec->exc_nodes) {
bitstr_t *exc_bitmap = NULL;
error_code = node_name2bitmap(step_spec->exc_nodes, false,
&exc_bitmap, NULL);
if (error_code)
debug("Assuming invalid nodename requested for exclusion from step is excluded");
bit_and_not(nodes_avail, exc_bitmap);
if (step_spec->node_list) {
bitstr_t *req_nodes = NULL;
error_code = node_name2bitmap(
step_spec->node_list, false,
&req_nodes, NULL);
if (error_code) {
info("%s: invalid requested node list %s",
__func__,
step_spec->node_list);
FREE_NULL_BITMAP(exc_bitmap);
FREE_NULL_BITMAP(req_nodes);
goto cleanup;
}
if (bit_overlap_any(req_nodes, exc_bitmap)) {
info("%s: %ps requested nodes %s is also excluded %s",
__func__, &step_spec->step_id,
step_spec->node_list,
step_spec->exc_nodes);
FREE_NULL_BITMAP(exc_bitmap);
FREE_NULL_BITMAP(req_nodes);
goto cleanup;
}
FREE_NULL_BITMAP(req_nodes);
}
FREE_NULL_BITMAP(exc_bitmap);
}
if (step_spec->features &&
(!job_ptr->details ||
xstrcmp(step_spec->features, job_ptr->details->features_use))) {
/*
* We only select for a single feature name here.
* Ignore step features if equal to job features.
* FIXME: Add support for AND, OR, etc. here if desired
*/
node_feature_t *feat_ptr;
feat_ptr =
list_find_first(active_feature_list, list_find_feature,
(void *) step_spec->features);
if (feat_ptr && feat_ptr->node_bitmap)
bit_and(nodes_avail, feat_ptr->node_bitmap);
else {
bit_clear_all(nodes_avail);
*return_code = ESLURM_INVALID_FEATURE;
goto cleanup;
}
}
if (step_spec->pn_min_memory &&
((job_resrcs_ptr->memory_allocated == NULL) ||
(job_resrcs_ptr->memory_used == NULL))) {
error("%s: job lacks memory allocation details to enforce memory limits for %pJ",
__func__, job_ptr);
step_spec->pn_min_memory = 0;
} else if (step_spec->pn_min_memory == MEM_PER_CPU)
step_spec->pn_min_memory = 0; /* clear MEM_PER_CPU flag */
if (job_ptr->next_step_id == 0) {
for (int i = 0;
(node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
if (IS_NODE_POWERED_DOWN(node_ptr) ||
IS_NODE_FUTURE(node_ptr) ||
IS_NODE_NO_RESPOND(node_ptr)) {
/*
* Node is/was powered down. Need to wait
* for it to start responding again.
*/
FREE_NULL_BITMAP(nodes_avail);
FREE_NULL_BITMAP(select_nodes_avail);
*return_code = ESLURM_NODES_BUSY;
return NULL;
}
}
if (IS_JOB_CONFIGURING(job_ptr)) {
info("%s: Configuration for %pJ is complete",
__func__, job_ptr);
stepmgr_ops->job_config_fini(job_ptr);
}
}
if (_is_mem_resv() && step_spec->pn_min_memory &&
((step_spec->pn_min_memory & MEM_PER_CPU) == 0) &&
job_ptr->details && job_ptr->details->pn_min_memory &&
((job_ptr->details->pn_min_memory & MEM_PER_CPU) == 0) &&
(step_spec->pn_min_memory >
job_ptr->details->pn_min_memory)) {
FREE_NULL_BITMAP(nodes_avail);
FREE_NULL_BITMAP(select_nodes_avail);
*return_code = ESLURM_INVALID_TASK_MEMORY;
return NULL;
}
usable_cpu_cnt = xcalloc(node_record_count, sizeof(uint32_t));
for (int i = 0, node_inx = -1;
(node_ptr = next_node_bitmap(job_resrcs_ptr->node_bitmap, &i));
i++) {
node_inx++;
if (!bit_test(nodes_avail, i))
continue; /* node now DOWN */
usable_cpu_cnt[i] = job_resrcs_ptr->cpus[node_inx];
log_flag(STEPS, "%s: %pJ Currently running steps use %d of allocated %d CPUs on node %s",
__func__, job_ptr,
job_resrcs_ptr->cpus_used[node_inx],
usable_cpu_cnt[i], node_record_table_ptr[i]->name);
/*
* Don't do this test if --overlap=force or
* --external-launcher
*/
if ((!(step_spec->flags & SSF_OVERLAP_FORCE)) &&
(!(step_spec->flags & SSF_EXT_LAUNCHER))) {
/*
 * If --whole is given and
 * job_resrcs_ptr->cpus_used[node_inx] is non-zero,
 * we can't use this node.
 */
if ((step_spec->flags & SSF_WHOLE) &&
job_resrcs_ptr->cpus_used[node_inx]) {
log_flag(STEPS, "%s: %pJ Node requested --whole node while other step running here.",
__func__, job_ptr);
job_blocked_cpus +=
job_resrcs_ptr->cpus_used[node_inx];
job_blocked_nodes++;
usable_cpu_cnt[i] = 0;
} else {
usable_cpu_cnt[i] -=
job_resrcs_ptr->cpus_used[node_inx];
job_blocked_cpus +=
job_resrcs_ptr->cpus_used[node_inx];
if (!usable_cpu_cnt[i]) {
job_blocked_nodes++;
log_flag(STEPS, "%s: %pJ Skipping node %s. Not enough CPUs to run step here.",
__func__,
job_ptr,
node_record_table_ptr[i]->name);
}
}
}
if (!usable_cpu_cnt[i]) {
bit_clear(nodes_avail, i);
continue;
}
if ((step_spec->pn_min_memory && _is_mem_resv()) ||
step_gres_list) {
int fail_mode = ESLURM_NODES_BUSY;
uint64_t tmp_mem;
uint32_t tmp_cpus, avail_cpus, total_cpus;
uint32_t avail_tasks, total_tasks;
gres_test_args.node_offset = node_inx;
gres_test_args.test_mem = false;
avail_cpus = total_cpus = usable_cpu_cnt[i];
if (_is_mem_resv() &&
step_spec->pn_min_memory & MEM_PER_CPU) {
uint64_t mem_use = step_spec->pn_min_memory;
mem_use &= (~MEM_PER_CPU);
/* ignore current step allocations */
tmp_mem = job_resrcs_ptr->
memory_allocated[node_inx];
tmp_cpus = tmp_mem / mem_use;
total_cpus = MIN(total_cpus, tmp_cpus);
/*
* consider current step allocations if
* not --overlap=force
*/
if (!(step_spec->flags & SSF_OVERLAP_FORCE)) {
tmp_mem -= job_resrcs_ptr->
memory_used[node_inx];
tmp_cpus = tmp_mem / mem_use;
}
if (tmp_cpus < avail_cpus) {
avail_cpus = tmp_cpus;
usable_cpu_cnt[i] = avail_cpus;
fail_mode = ESLURM_INVALID_TASK_MEMORY;
}
log_flag(STEPS, "%s: %pJ Based on --mem-per-cpu=%"PRIu64" we have %d/%d usable of available cpus on node %s, usable memory was: %"PRIu64,
__func__, job_ptr, mem_use, tmp_cpus,
avail_cpus, node_ptr->name, tmp_mem);
} else if (_is_mem_resv() && step_spec->pn_min_memory) {
uint64_t mem_use = step_spec->pn_min_memory;
/* ignore current step allocations */
tmp_mem = job_resrcs_ptr->
memory_allocated[node_inx];
if (tmp_mem < mem_use)
total_cpus = 0;
/*
* consider current step allocations if
* not --overlap=force
*/
if (!(step_spec->flags & SSF_OVERLAP_FORCE)) {
tmp_mem -= job_resrcs_ptr->
memory_used[node_inx];
}
if ((tmp_mem < mem_use) && (avail_cpus > 0)) {
log_flag(STEPS, "%s: %pJ Usable memory on node %s: %"PRIu64" is less than requested %"PRIu64" skipping the node",
__func__, job_ptr,
node_ptr->name,
tmp_mem,
mem_use);
avail_cpus = 0;
usable_cpu_cnt[i] = avail_cpus;
fail_mode = ESLURM_INVALID_TASK_MEMORY;
}
} else if (_is_mem_resv())
gres_test_args.test_mem = true;
_step_test_gres(step_spec, &gres_test_args, job_ptr,
&usable_cpu_cnt[i],
&total_cpus, &avail_cpus,
&gres_invalid_nodes,
&fail_mode);
avail_tasks = avail_cpus;
total_tasks = total_cpus;
if (cpus_per_task > 0) {
avail_tasks /= cpus_per_task;
total_tasks /= cpus_per_task;
}
if (avail_tasks == 0) {
log_flag(STEPS, "%s: %pJ No task can start on node %s",
__func__, job_ptr, node_ptr->name);
if ((step_spec->min_nodes == INFINITE) ||
(step_spec->min_nodes ==
job_ptr->node_cnt)) {
log_flag(STEPS, "%s: %pJ All nodes in allocation required, but can't use them now",
__func__, job_ptr);
FREE_NULL_BITMAP(nodes_avail);
FREE_NULL_BITMAP(select_nodes_avail);
xfree(usable_cpu_cnt);
*return_code = ESLURM_NODES_BUSY;
if (total_tasks == 0 &&
(fail_mode !=
ESLURM_INVALID_TASK_MEMORY)) {
*return_code = fail_mode;
log_flag(STEPS, "%s: %pJ Step cannot ever run in the allocation: %s",
__func__,
job_ptr,
slurm_strerror(
fail_mode));
}
return NULL;
}
bit_clear(nodes_avail, i);
mem_blocked_nodes++;
mem_blocked_cpus += (total_cpus - avail_cpus);
} else {
mem_blocked_cpus += (total_cpus - avail_cpus);
gres_test_args.first_step_node = false;
}
}
}
if (gres_invalid_nodes >
(job_resrcs_ptr->nhosts - step_spec->min_nodes)) {
*return_code = ESLURM_INVALID_GRES;
log_flag(STEPS, "%s: Never able to satisfy the GRES request for this step",
__func__);
FREE_NULL_BITMAP(nodes_avail);
FREE_NULL_BITMAP(select_nodes_avail);
xfree(usable_cpu_cnt);
return NULL;
}
if (step_spec->min_nodes == INFINITE) { /* use all nodes */
if ((step_spec->num_tasks == NO_VAL) && nodes_avail &&
!(step_spec->flags & SSF_EXT_LAUNCHER)) {
_set_max_num_tasks(step_spec, job_ptr, nodes_avail,
cpus_per_task);
if (step_spec->num_tasks == 0) {
log_flag(STEPS, "%s: Step requested more processors per task (%d) than can be satisfied.",
__func__, cpus_per_task);
*return_code = ESLURM_TOO_MANY_REQUESTED_CPUS;
goto cleanup;
}
}
job_resrcs_ptr->next_step_node_inx = 0;
xfree(usable_cpu_cnt);
FREE_NULL_BITMAP(select_nodes_avail);
return nodes_avail;
}
if (select_nodes_avail) {
/*
* The select plugin told us these were the only ones we could
* choose from. If it doesn't fit here then defer request
*/
bit_and(nodes_avail, select_nodes_avail);
FREE_NULL_BITMAP(select_nodes_avail);
}
/*
* An allocating srun will send in the same node_list that was already
* used to construct the job allocation. In that case, we can assume
* that the job allocation already satisfies those requirements.
*/
if (step_spec->node_list && xstrcmp(step_spec->node_list,
job_ptr->details->req_nodes)) {
bitstr_t *selected_nodes = NULL;
log_flag(STEPS, "%s: selected nodelist is %s",
__func__, step_spec->node_list);
error_code = node_name2bitmap(step_spec->node_list, false,
&selected_nodes, NULL);
if (error_code) {
log_flag(STEPS, "%s: invalid node list %s", __func__,
step_spec->node_list);
FREE_NULL_BITMAP(selected_nodes);
goto cleanup;
}
if (!bit_super_set(selected_nodes, job_ptr->node_bitmap)) {
log_flag(STEPS, "%s: requested nodes %s not part of %pJ",
__func__, step_spec->node_list, job_ptr);
FREE_NULL_BITMAP(selected_nodes);
goto cleanup;
}
if (!bit_super_set(selected_nodes, nodes_avail)) {
/*
* If some nodes still have some memory or CPUs
* allocated to other steps, just defer the execution
* of the step
*/
if (job_blocked_nodes) {
*return_code = ESLURM_NODES_BUSY;
log_flag(STEPS, "%s: some requested nodes %s still have CPUs used by other steps",
__func__, step_spec->node_list);
} else if (mem_blocked_nodes == 0) {
*return_code = ESLURM_INVALID_TASK_MEMORY;
log_flag(STEPS, "%s: requested nodes %s have inadequate memory",
__func__, step_spec->node_list);
} else {
*return_code = ESLURM_NODES_BUSY;
log_flag(STEPS, "%s: some requested nodes %s still have memory used by other steps",
__func__, step_spec->node_list);
}
FREE_NULL_BITMAP(selected_nodes);
goto cleanup;
}
if ((step_spec->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY) {
step_spec->min_nodes = bit_set_count(selected_nodes);
}
if (selected_nodes) {
int node_cnt = 0;
/*
* Use selected nodes to run the step and
* mark them unavailable for future use
*/
/*
* If we have selected more than we requested
* make the available nodes equal to the
* selected nodes and we will pick from that
* list later on in the function.
* Other than that copy the nodes selected as
* the nodes we want.
*/
node_cnt = bit_set_count(selected_nodes);
if (node_cnt > step_spec->max_nodes) {
log_flag(STEPS, "%s: requested nodes %s exceed max node count for %pJ (%d > %u)",
__func__, step_spec->node_list,
job_ptr, node_cnt,
step_spec->max_nodes);
FREE_NULL_BITMAP(selected_nodes);
goto cleanup;
} else if (step_spec->min_nodes &&
(node_cnt > step_spec->min_nodes)) {
nodes_picked = bit_alloc(bit_size(nodes_avail));
FREE_NULL_BITMAP(nodes_avail);
nodes_avail = selected_nodes;
selected_nodes = NULL;
} else {
nodes_picked = bit_copy(selected_nodes);
bit_and_not(nodes_avail, selected_nodes);
FREE_NULL_BITMAP(selected_nodes);
}
}
} else {
nodes_picked = bit_alloc(bit_size(nodes_avail));
}
/* If gres_per_step then filter nodes_avail to nodes that fill req */
gres_stepmgr_step_test_per_step(step_gres_list, job_ptr,
nodes_avail, step_spec->min_nodes);
/*
* In relative mode, do not look for idle nodes: rather than trying to
* get idle nodes first, try to get the relative node first.
*/
if (step_spec->relative != NO_VAL16) {
/*
* Remove first (step_spec->relative) nodes from
* available list
*/
bitstr_t *relative_nodes = NULL;
relative_nodes = bit_pick_cnt(job_ptr->node_bitmap,
step_spec->relative);
if (relative_nodes == NULL) {
log_flag(STEPS, "%s: Invalid relative value (%u) for %pJ",
__func__, step_spec->relative, job_ptr);
goto cleanup;
}
bit_and_not(nodes_avail, relative_nodes);
FREE_NULL_BITMAP(relative_nodes);
} else {
nodes_idle = bit_alloc (bit_size (nodes_avail) );
list_for_each(job_ptr->step_list, _mark_busy_nodes, nodes_idle);
bit_not(nodes_idle);
bit_and(nodes_idle, nodes_avail);
}
if (slurm_conf.debug_flags & DEBUG_FLAG_STEPS) {
char *temp1, *temp2, *temp3;
temp1 = bitmap2node_name(nodes_avail);
temp2 = bitmap2node_name(nodes_idle);
if (step_spec->node_list)
temp3 = step_spec->node_list;
else
temp3 = "NONE";
log_flag(STEPS, "%s: step pick %u-%u nodes, avail:%s idle:%s picked:%s",
__func__, step_spec->min_nodes, step_spec->max_nodes,
temp1, temp2, temp3);
xfree(temp1);
xfree(temp2);
}
if ((step_spec->num_tasks == NO_VAL) &&
!(step_spec->flags & SSF_EXT_LAUNCHER)) {
uint32_t cnt = 0;
bitstr_t *node_bitmap = NULL;
if ((step_spec->flags & SSF_OVERLAP_FORCE) && nodes_avail) {
cnt = bit_set_count(nodes_avail);
node_bitmap = nodes_avail;
} else if (nodes_idle) {
cnt = bit_set_count(nodes_idle);
node_bitmap = nodes_idle;
}
if (cnt < step_spec->min_nodes) {
log_flag(STEPS, "%s: Step requested more nodes (%u) than are available (%d), deferring step until enough nodes are available.",
__func__, step_spec->min_nodes, cnt);
*return_code = ESLURM_NODES_BUSY;
goto cleanup;
}
_set_max_num_tasks(step_spec, job_ptr, node_bitmap,
cpus_per_task);
if (step_spec->num_tasks == 0) {
log_flag(STEPS, "%s: Step requested more processors per task (%d) than can be satisfied.",
__func__, cpus_per_task);
*return_code = ESLURM_TOO_MANY_REQUESTED_CPUS;
goto cleanup;
}
}
/*
* If user specifies step needs a specific processor count and
* all nodes have the same processor count, just translate this to
* a node count
*/
if (step_spec->cpu_count && job_resrcs_ptr &&
(job_resrcs_ptr->cpu_array_cnt == 1) &&
(job_resrcs_ptr->cpu_array_value)) {
uint32_t cpu_count = step_spec->cpu_count;
uint16_t req_tpc;
/*
* Expand the cpu count to account for blocked/used threads when
* using threads-per-core. See _step_[de]alloc_lps() for similar
* code.
*/
req_tpc = _get_threads_per_core(step_spec->threads_per_core,
job_ptr);
/*
* Only process this differently if the allocation requested
* more threads per core than the step is requesting as
* job_resrcs->cpu_array_value is already processed with the
* threads per core the allocation requested so you don't need
* to do this again. See src/common/job_resources.c
* build_job_resources_cpu_array().
*/
if ((req_tpc != NO_VAL16) &&
(req_tpc < job_resrcs_ptr->threads_per_core)) {
int first_inx = bit_ffs(job_resrcs_ptr->node_bitmap);
if (first_inx == -1) {
error("%s: Job %pJ doesn't have any nodes in it! This should never happen",
__func__, job_ptr);
*return_code = ESLURM_INVALID_NODE_COUNT;
goto cleanup;
}
if (req_tpc < node_record_table_ptr[first_inx]->tpc) {
cpu_count = ROUNDUP(cpu_count, req_tpc);
cpu_count *=
node_record_table_ptr[first_inx]->tpc;
} else if (req_tpc >
node_record_table_ptr[first_inx]->tpc) {
log_flag(STEPS, "%s: requested more threads per core than possible in allocation (%u > %u) for %pJ",
__func__,
req_tpc,
node_record_table_ptr[first_inx]->tpc,
job_ptr);
*return_code = ESLURM_BAD_THREAD_PER_CORE;
goto cleanup;
}
}
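/*
 * Illustrative example: cpu_count=5 with --threads-per-core=2 on nodes
 * that have tpc=4 becomes ROUNDUP(5, 2) * 4 = 12 above, counting the
 * sibling threads that will be blocked, before being converted to a
 * node count just below.
 */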
i = ROUNDUP(cpu_count, job_resrcs_ptr->cpu_array_value[0]);
step_spec->min_nodes = (i > step_spec->min_nodes) ?
i : step_spec->min_nodes ;
/*
* If we are trying to pack the nodes we only want the minimum
* it takes to satisfy the request.
*/
if (step_spec->task_dist & SLURM_DIST_PACK_NODES)
step_spec->max_nodes = step_spec->min_nodes;
if (step_spec->max_nodes < step_spec->min_nodes) {
log_flag(STEPS, "%s: %pJ max node less than min node count (%u < %u)",
__func__, job_ptr, step_spec->max_nodes,
step_spec->min_nodes);
*return_code = ESLURM_TOO_MANY_REQUESTED_CPUS;
goto cleanup;
}
}
if (step_spec->min_nodes) {
int cpus_needed, node_avail_cnt, nodes_needed;
nodes_picked_cnt = bit_set_count(nodes_picked);
log_flag(STEPS, "%s: step picked %d of %u nodes",
__func__, nodes_picked_cnt, step_spec->min_nodes);
/*
* First do a basic test - if there aren't enough nodes for
* this step to run on then we need to defer execution of this
* step. As long as there aren't enough nodes for this
* step we can never test if the step requested too
* many CPUs, too much memory, etc. so we just bail right here.
*/
if (nodes_avail)
node_avail_cnt = bit_set_count(nodes_avail);
else
node_avail_cnt = 0;
if ((node_avail_cnt + nodes_picked_cnt) <
step_spec->min_nodes) {
log_flag(STEPS, "%s: Step requested more nodes (%u) than are available (%d), deferring step until enough nodes are available.",
__func__, step_spec->min_nodes,
node_avail_cnt);
*return_code = ESLURM_NODES_BUSY;
goto cleanup;
}
if (nodes_idle)
node_avail_cnt = bit_set_count(nodes_idle);
else
node_avail_cnt = 0;
nodes_needed = step_spec->min_nodes - nodes_picked_cnt;
if ((nodes_needed > 0) &&
(node_avail_cnt >= nodes_needed)) {
cpus_needed = _opt_cpu_cnt(step_spec->cpu_count,
nodes_picked,
usable_cpu_cnt);
nodes_needed = _opt_node_cnt(step_spec->min_nodes,
step_spec->max_nodes,
node_avail_cnt,
nodes_picked_cnt);
node_tmp = _pick_step_nodes_cpus(job_ptr, nodes_idle,
nodes_needed,
cpus_needed,
usable_cpu_cnt);
if (node_tmp) {
bit_or(nodes_picked, node_tmp);
bit_and_not(nodes_idle, node_tmp);
bit_and_not(nodes_avail, node_tmp);
FREE_NULL_BITMAP(node_tmp);
nodes_picked_cnt = step_spec->min_nodes;
nodes_needed = 0;
}
}
if (nodes_avail)
node_avail_cnt = bit_set_count(nodes_avail);
else
node_avail_cnt = 0;
if ((nodes_needed > 0) &&
(node_avail_cnt >= nodes_needed)) {
cpus_needed = _opt_cpu_cnt(step_spec->cpu_count,
nodes_picked,
usable_cpu_cnt);
nodes_needed = _opt_node_cnt(step_spec->min_nodes,
step_spec->max_nodes,
node_avail_cnt,
nodes_picked_cnt);
node_tmp = _pick_step_nodes_cpus(job_ptr, nodes_avail,
nodes_needed,
cpus_needed,
usable_cpu_cnt);
if (node_tmp == NULL) {
/* Count of nodes already picked for step */
int pick_node_cnt = bit_set_count(nodes_avail);
pick_node_cnt += nodes_picked_cnt;
if ((step_spec->max_nodes <= pick_node_cnt) &&
(mem_blocked_cpus == 0) &&
(job_blocked_cpus == 0)) {
*return_code =
ESLURM_TOO_MANY_REQUESTED_CPUS;
} else if ((mem_blocked_cpus > 0) ||
(step_spec->min_nodes <=
(pick_node_cnt + mem_blocked_nodes +
job_blocked_nodes))) {
*return_code = ESLURM_NODES_BUSY;
} else if (!bit_super_set(job_ptr->node_bitmap,
stepmgr_ops->up_node_bitmap)) {
*return_code = ESLURM_NODE_NOT_AVAIL;
}
goto cleanup;
}
bit_or(nodes_picked, node_tmp);
bit_and_not(nodes_avail, node_tmp);
FREE_NULL_BITMAP(node_tmp);
nodes_picked_cnt = step_spec->min_nodes;
} else if (nodes_needed > 0) {
if ((step_spec->max_nodes <= nodes_picked_cnt) &&
(mem_blocked_cpus == 0) &&
(job_blocked_cpus == 0)) {
*return_code = ESLURM_TOO_MANY_REQUESTED_CPUS;
} else if ((mem_blocked_cpus > 0) ||
(step_spec->min_nodes <=
(nodes_picked_cnt + mem_blocked_nodes +
job_blocked_nodes))) {
*return_code = ESLURM_NODES_BUSY;
} else if (!bit_super_set(job_ptr->node_bitmap,
stepmgr_ops->up_node_bitmap)) {
*return_code = ESLURM_NODE_NOT_AVAIL;
}
goto cleanup;
}
}
if (step_spec->cpu_count) {
/* make sure the selected nodes have enough cpus */
cpus_picked_cnt = _count_cpus(job_ptr, nodes_picked,
usable_cpu_cnt);
if ((step_spec->cpu_count > cpus_picked_cnt) &&
(step_spec->max_nodes > nodes_picked_cnt)) {
/* Attempt to add more nodes to allocation */
nodes_picked_cnt = bit_set_count(nodes_picked);
while (step_spec->cpu_count > cpus_picked_cnt) {
node_tmp = bit_pick_cnt(nodes_avail, 1);
if (node_tmp == NULL)
break;
cpu_cnt = _count_cpus(job_ptr, node_tmp,
usable_cpu_cnt);
if (cpu_cnt == 0) {
/*
* Node not usable (memory insufficient
* to allocate any CPUs, etc.)
*/
bit_and_not(nodes_avail, node_tmp);
FREE_NULL_BITMAP(node_tmp);
continue;
}
bit_or(nodes_picked, node_tmp);
bit_and_not(nodes_avail, node_tmp);
FREE_NULL_BITMAP(node_tmp);
nodes_picked_cnt += 1;
if (step_spec->min_nodes)
step_spec->min_nodes = nodes_picked_cnt;
cpus_picked_cnt += cpu_cnt;
if (nodes_picked_cnt >= step_spec->max_nodes)
break;
}
}
/*
* User is requesting more cpus than we got from the
* picked nodes. We should return with an error
*/
if (step_spec->cpu_count > cpus_picked_cnt) {
if (step_spec->cpu_count &&
(step_spec->cpu_count <=
(cpus_picked_cnt + mem_blocked_cpus +
job_blocked_cpus))) {
*return_code = ESLURM_NODES_BUSY;
} else if (!bit_super_set(job_ptr->node_bitmap,
stepmgr_ops->up_node_bitmap)) {
*return_code = ESLURM_NODE_NOT_AVAIL;
}
log_flag(STEPS, "Have %d nodes with %d cpus which is less than what the user is asking for (%d cpus) aborting.",
nodes_picked_cnt,
cpus_picked_cnt,
step_spec->cpu_count);
goto cleanup;
}
}
job_resrcs_ptr->next_step_node_inx = bit_fls(nodes_picked) + 1;
FREE_NULL_BITMAP(nodes_avail);
FREE_NULL_BITMAP(select_nodes_avail);
FREE_NULL_BITMAP(nodes_idle);
xfree(usable_cpu_cnt);
return nodes_picked;
cleanup:
FREE_NULL_BITMAP(nodes_avail);
FREE_NULL_BITMAP(select_nodes_avail);
FREE_NULL_BITMAP(nodes_idle);
FREE_NULL_BITMAP(nodes_picked);
xfree(usable_cpu_cnt);
if (*return_code == SLURM_SUCCESS) {
*return_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
} else if (*return_code == ESLURM_NODE_NOT_AVAIL) {
/*
* Return ESLURM_NODES_BUSY if the node is not responding.
* The node will eventually either come back UP or go DOWN.
*/
nodes_picked = bit_copy(stepmgr_ops->up_node_bitmap);
bit_not(nodes_picked);
bit_and(nodes_picked, job_ptr->node_bitmap);
for (i = 0; (node_ptr = next_node_bitmap(
job_resrcs_ptr->node_bitmap, &i));
i++) {
if (!IS_NODE_NO_RESPOND(node_ptr)) {
*return_code = ESLURM_NODES_BUSY;
break;
}
}
FREE_NULL_BITMAP(nodes_picked);
}
return NULL;
}
/*
* _count_cpus - report how many cpus are allocated to this job for the
* identified nodes
* IN job_ptr - pointer to job
* IN bitmap - map of nodes to tally
* IN usable_cpu_cnt - count of usable CPUs based upon memory or gres specs
* NULL if not available
* RET cpu count
*/
static int _count_cpus(job_record_t *job_ptr, bitstr_t *bitmap,
uint32_t *usable_cpu_cnt)
{
int i, sum = 0;
node_record_t *node_ptr;
if (job_ptr->job_resrcs && job_ptr->job_resrcs->cpus &&
job_ptr->job_resrcs->node_bitmap) {
int node_inx = -1;
for (i = 0;
(node_ptr = next_node_bitmap(
job_ptr->job_resrcs->node_bitmap, &i));
i++) {
node_inx++;
if (!bit_test(job_ptr->node_bitmap, node_ptr->index) ||
!bit_test(bitmap, node_ptr->index)) {
/* absent from current job or step bitmap */
continue;
}
if (usable_cpu_cnt)
sum += usable_cpu_cnt[node_ptr->index];
else
sum += job_ptr->job_resrcs->cpus[node_inx];
}
} else {
error("%pJ lacks cpus array", job_ptr);
for (i = 0; (node_ptr = next_node_bitmap(bitmap, &i)); i++) {
sum += node_ptr->config_ptr->cpus;
}
}
return sum;
}
/* Clear avail_core_bitmap cores which are not bound to the allocated gres */
static int _gres_filter_avail_cores(void *x, void *arg)
{
gres_state_t *gres_state_step = x;
foreach_gres_filter_t *args = arg;
gres_step_state_t *gres_ss = gres_state_step->gres_data;
bitstr_t *filter_core_bitmap;
gres_state_t *gres_state_node;
gres_node_state_t *gres_ns;
/* Bail early if this GRES isn't used on the node */
if (!gres_ss->gres_cnt_node_alloc ||
!gres_ss->gres_cnt_node_alloc[args->job_node_inx])
return 0;
if (!(gres_state_node = list_find_first(args->node_gres_list,
gres_find_id,
&gres_state_step->plugin_id))) {
error("No node gres when step gres is allocated. This should never happen.");
return 0;
}
gres_ns = gres_state_node->gres_data;
if (!gres_ns->topo_cnt) /* No topology info */
return 0;
filter_core_bitmap = bit_copy(args->all_gres_core_bitmap);
/* Determine which specific cores can be used */
for (int i = 0; i < gres_ns->topo_cnt; i++) {
/* Is this gres allocated to the step? */
if (gres_ss->gres_bit_alloc &&
!bit_overlap_any(
gres_ss->gres_bit_alloc[args->job_node_inx],
gres_ns->topo_gres_bitmap[i]))
continue;
/* Does it specify which cores can use it */
if (!gres_ns->topo_core_bitmap[i]) {
bit_nset(args->any_gres_core_bitmap,
args->core_start_bit, args->core_end_bit);
continue;
}
bit_nclear(filter_core_bitmap, args->core_start_bit,
args->core_end_bit);
for (int j = 0;
j < bit_size(gres_ns->topo_core_bitmap[i]);
j++) {
if (bit_test(gres_ns->topo_core_bitmap[i], j)) {
bit_set(filter_core_bitmap,
args->core_start_bit + j);
}
}
bit_or(args->any_gres_core_bitmap, filter_core_bitmap);
bit_and(args->all_gres_core_bitmap, filter_core_bitmap);
}
FREE_NULL_BITMAP(filter_core_bitmap);
return 0;
}
/* Return true if a core was picked, false if not */
static bool _pick_step_core(step_record_t *step_ptr,
job_resources_t *job_resrcs_ptr,
bitstr_t *avail_core_bitmap, int job_node_inx,
int sock_inx, int core_inx, bool use_all_cores)
{
int bit_offset;
bit_offset = get_job_resources_offset(job_resrcs_ptr,
job_node_inx,
sock_inx,
core_inx);
if (bit_offset < 0)
fatal("get_job_resources_offset");
if (!bit_test(avail_core_bitmap, bit_offset))
return false;
if (bit_test(step_ptr->core_bitmap_job, bit_offset))
return false; /* already taken by this step */
/* Check and set the job's used cores. */
if (!(step_ptr->flags & SSF_OVERLAP_FORCE)) {
if ((use_all_cores == false) &&
bit_test(job_resrcs_ptr->core_bitmap_used, bit_offset))
return false;
bit_set(job_resrcs_ptr->core_bitmap_used, bit_offset);
}
log_flag(STEPS, "%s: alloc Node:%d Socket:%d Core:%d",
__func__, job_node_inx, sock_inx, core_inx);
bit_set(step_ptr->core_bitmap_job, bit_offset);
return true;
}
static bool _handle_core_select(step_record_t *step_ptr,
job_resources_t *job_resrcs_ptr,
bitstr_t *avail_core_bitmap,
int job_node_inx, uint16_t sockets,
uint16_t cores, bool use_all_cores,
int *core_cnt, uint16_t cores_per_task)
{
int core_inx, sock_inx;
xassert(core_cnt);
if (*core_cnt <= 0)
return true;
/*
* Figure out the task distribution. The default is to cyclically
* distribute to sockets.
*/
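/*
* Illustrative note (not part of the original source): with 2 sockets and
* 2 cores per socket, the SLURM_DIST_SOCKBLOCK branch below visits cores in
* the order (s0,c0) (s0,c1) (s1,c0) (s1,c1), while the default
* SLURM_DIST_SOCKCYCLIC branch alternates sockets, e.g.
* (s0,c0) (s1,c0) (s0,c1) (s1,c1) when cores_per_task is 1.
*/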
if (step_ptr->step_layout &&
((step_ptr->step_layout->task_dist & SLURM_DIST_SOCKMASK) ==
SLURM_DIST_SOCKBLOCK)) {
/* Fill sockets before allocating to the next socket */
for (sock_inx=0; sock_inx < sockets; sock_inx++) {
for (core_inx = 0; core_inx < cores; core_inx++) {
if (!_pick_step_core(step_ptr, job_resrcs_ptr,
avail_core_bitmap,
job_node_inx, sock_inx,
core_inx, use_all_cores))
continue;
if (--(*core_cnt) == 0)
return true;
}
}
} else if (step_ptr->step_layout &&
((step_ptr->step_layout->task_dist & SLURM_DIST_SOCKMASK) ==
SLURM_DIST_SOCKCFULL)) {
for (core_inx = 0; core_inx < cores; core_inx++) {
for (sock_inx = 0; sock_inx < sockets; sock_inx++) {
if (!_pick_step_core(step_ptr, job_resrcs_ptr,
avail_core_bitmap,
job_node_inx, sock_inx,
core_inx, use_all_cores)) {
if (sock_inx == sockets)
sock_inx = 0;
continue;
}
if (--(*core_cnt) == 0)
return true;
}
}
} else { /* SLURM_DIST_SOCKCYCLIC */
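/*
* Clarifying note: next_core[] remembers, per socket, where the previous
* pass left off so each pass resumes at the next unexamined core. Once
* cores_per_task cores have been taken from the current socket,
* task_alloc_cores resets and the scan moves to the next socket, so
* consecutive tasks tend to land on different sockets.
*/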
int task_alloc_cores = 0;
int *next_core = xcalloc(sockets, sizeof(int));
bool nothing_allocated = false;
while (!nothing_allocated) {
nothing_allocated = true;
for (sock_inx = 0; sock_inx < sockets; sock_inx++) {
for (core_inx = next_core[sock_inx];
core_inx < cores; core_inx++) {
next_core[sock_inx] = core_inx + 1;
if (!_pick_step_core(
step_ptr,
job_resrcs_ptr,
avail_core_bitmap,
job_node_inx,
sock_inx,
core_inx,
use_all_cores))
continue;
nothing_allocated = false;
if (--(*core_cnt) == 0) {
xfree(next_core);
return true;
}
if (++task_alloc_cores ==
cores_per_task) {
task_alloc_cores = 0;
break;
}
}
}
}
xfree(next_core);
}
return false;
}
/* Update the step's core bitmaps, creating them as needed.
* Allocate cores for the specified task count on a specific node in both
* the job's and the step's allocations */
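/*
* Summary of the selection order implemented below: cores bound to all of
* the step's allocated GRES are tried first, then cores bound to any of the
* allocated GRES, and finally (unless GRES_ENFORCE_BIND is set on the job)
* any remaining idle cores in the job's allocation.
*/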
static int _pick_step_cores(step_record_t *step_ptr,
job_resources_t *job_resrcs_ptr, int job_node_inx,
uint16_t task_cnt, uint16_t cpus_per_core,
int node_inx, int ntasks_per_core,
int gres_cpus_alloc)
{
uint16_t sockets, cores, cores_per_task, tasks_per_node;
int core_cnt = (int) task_cnt;
bool use_all_cores;
bitstr_t *all_gres_core_bitmap = NULL, *any_gres_core_bitmap = NULL;
xassert(task_cnt);
if (!step_ptr->core_bitmap_job)
step_ptr->core_bitmap_job =
bit_alloc(bit_size(job_resrcs_ptr->core_bitmap));
if (get_job_resources_cnt(job_resrcs_ptr, job_node_inx,
&sockets, &cores))
fatal("get_job_resources_cnt");
if (ntasks_per_core != INFINITE16)
tasks_per_node = cores * ntasks_per_core * sockets;
else
tasks_per_node = cores * cpus_per_core * sockets;
if (((step_ptr->flags & SSF_WHOLE) || task_cnt == (cores * sockets)) &&
(task_cnt <= tasks_per_node || (step_ptr->flags & SSF_OVERCOMMIT)))
{
use_all_cores = true;
core_cnt = ROUNDUP(job_resrcs_ptr->cpus[job_node_inx],
cpus_per_core);
} else {
use_all_cores = false;
if (gres_cpus_alloc) {
core_cnt = ROUNDUP(gres_cpus_alloc, cpus_per_core);
} else if (step_ptr->cpus_per_task > 0) {
core_cnt *= step_ptr->cpus_per_task;
core_cnt = ROUNDUP(core_cnt, cpus_per_core);
}
log_flag(STEPS, "%s: step %pS requires %u cores on node %d with cpus_per_core=%u, available cpus from job: %u",
__func__, step_ptr, core_cnt, job_node_inx,
cpus_per_core, job_resrcs_ptr->cpus[job_node_inx]);
if (core_cnt > ROUNDUP(job_resrcs_ptr->cpus[job_node_inx],
cpus_per_core) &&
!(step_ptr->flags & SSF_OVERCOMMIT)) {
/* Node can never fulfill step request */
return ESLURM_TOO_MANY_REQUESTED_CPUS;
}
}
all_gres_core_bitmap = bit_copy(job_resrcs_ptr->core_bitmap);
any_gres_core_bitmap = bit_copy(job_resrcs_ptr->core_bitmap);
if (step_ptr->gres_list_alloc) {
foreach_gres_filter_t args = {
.all_gres_core_bitmap = all_gres_core_bitmap,
.any_gres_core_bitmap = any_gres_core_bitmap,
.core_start_bit = get_job_resources_offset(
job_resrcs_ptr, job_node_inx, 0, 0),
.core_end_bit = get_job_resources_offset(
job_resrcs_ptr, job_node_inx, sockets - 1,
cores - 1),
.job_node_inx = job_node_inx,
.node_gres_list =
node_record_table_ptr[node_inx]->gres_list,
};
if ((args.core_start_bit > bit_size(all_gres_core_bitmap)) ||
(args.core_end_bit > bit_size(all_gres_core_bitmap)))
error("coremap offsets fall outside core_bitmap size. This should never happen.");
else if (!args.node_gres_list)
error("No node gres when step gres is allocated. This should never happen.");
else {
bit_nclear(any_gres_core_bitmap, args.core_start_bit,
args.core_end_bit);
list_for_each(step_ptr->gres_list_alloc,
_gres_filter_avail_cores, &args);
bit_and(any_gres_core_bitmap,
job_resrcs_ptr->core_bitmap);
}
}
cores_per_task = ROUNDUP(core_cnt, task_cnt); /* Round up */
/* select idle cores that fit all gres binding first */
if (_handle_core_select(step_ptr, job_resrcs_ptr,
all_gres_core_bitmap, job_node_inx,
sockets, cores, use_all_cores, &core_cnt,
cores_per_task))
goto cleanup;
/* select idle cores that fit any gres binding second */
if (!bit_equal(all_gres_core_bitmap, any_gres_core_bitmap) &&
_handle_core_select(step_ptr, job_resrcs_ptr,
any_gres_core_bitmap, job_node_inx,
sockets, cores, use_all_cores, &core_cnt,
cores_per_task))
goto cleanup;
/* select any idle cores */
if (!(step_ptr->job_ptr->bit_flags & GRES_ENFORCE_BIND) &&
!bit_equal(any_gres_core_bitmap, job_resrcs_ptr->core_bitmap)) {
log_flag(STEPS, "gres topology sub-optimal for %ps",
&(step_ptr->step_id));
if (_handle_core_select(step_ptr, job_resrcs_ptr,
job_resrcs_ptr->core_bitmap,
job_node_inx, sockets, cores,
use_all_cores, &core_cnt,
cores_per_task))
goto cleanup;
}
/* The test for cores==0 is just to avoid CLANG errors.
* It should never happen */
if (use_all_cores || (cores == 0))
goto cleanup;
if (!(step_ptr->flags & SSF_OVERCOMMIT)) {
FREE_NULL_BITMAP(all_gres_core_bitmap);
FREE_NULL_BITMAP(any_gres_core_bitmap);
return ESLURM_NODES_BUSY;
}
/* We need to overcommit one or more cores. */
log_flag(STEPS, "%s: %pS needs to overcommit cores. Cores still needed:%u Cores assigned to step:%u exclusive:%c overlap:%c",
__func__, step_ptr, core_cnt,
bit_set_count(step_ptr->core_bitmap_job),
((step_ptr->flags & SSF_EXCLUSIVE) ? 'T' : 'F'),
((step_ptr->flags & SSF_OVERLAP_FORCE) ? 'T' : 'F'));
cleanup:
FREE_NULL_BITMAP(all_gres_core_bitmap);
FREE_NULL_BITMAP(any_gres_core_bitmap);
return SLURM_SUCCESS;
}
static bool _use_one_thread_per_core(step_record_t *step_ptr)
{
job_record_t *job_ptr = step_ptr->job_ptr;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
if ((step_ptr->threads_per_core == 1) ||
((step_ptr->threads_per_core == NO_VAL16) &&
(job_ptr->details->mc_ptr->threads_per_core == 1)) ||
(!(job_resrcs_ptr->whole_node & WHOLE_NODE_REQUIRED) &&
(slurm_conf.select_type_param & (SELECT_CORE | SELECT_SOCKET)) &&
(job_ptr->details &&
(job_ptr->details->cpu_bind_type != NO_VAL16) &&
(job_ptr->details->cpu_bind_type &
CPU_BIND_ONE_THREAD_PER_CORE))))
return true;
return false;
}
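/*
* Worked example (illustrative, assuming ROUNDUP() is ceiling division):
* with req_tpc = 2 threads per core requested, vpus = 4 threads per core
* on the node, and *cpus_alloc = 3, the step only uses 2 of each core's
* 4 threads, so ROUNDUP(3, 2) = 2 full cores are needed and *cpus_alloc
* becomes 2 * 4 = 8 CPUs charged against the allocation.
*/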
static void _modify_cpus_alloc_for_tpc(uint16_t cr_type, uint16_t req_tpc,
uint16_t vpus, int *cpus_alloc)
{
xassert(cpus_alloc);
if ((cr_type & (SELECT_CORE | SELECT_SOCKET | SELECT_LINEAR)) &&
(req_tpc != NO_VAL16) && (req_tpc < vpus)) {
*cpus_alloc = ROUNDUP(*cpus_alloc, req_tpc);
*cpus_alloc *= vpus;
}
}
/* Update a job's record of allocated CPUs when a job step gets scheduled */
static int _step_alloc_lps(step_record_t *step_ptr, char **err_msg)
{
job_record_t *job_ptr = step_ptr->job_ptr;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
node_record_t *node_ptr;
slurm_step_layout_t *step_layout = step_ptr->step_layout;
int cpus_alloc, cpus_alloc_mem, cpu_array_inx = 0;
int job_node_inx = -1, step_node_inx = -1, node_cnt = 0;
bool first_step_node = true, pick_step_cores = true;
bool all_job_mem = false;
uint32_t rem_nodes;
int rc = SLURM_SUCCESS, final_rc = SLURM_SUCCESS;
multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr;
uint16_t orig_cpus_per_task = step_ptr->cpus_per_task;
uint16_t *cpus_per_task_array = NULL;
uint16_t *cpus_alloc_pn = NULL;
uint16_t ntasks_per_core = step_ptr->ntasks_per_core;
uint16_t req_tpc = _get_threads_per_core(step_ptr->threads_per_core,
job_ptr);
xassert(job_resrcs_ptr);
xassert(job_resrcs_ptr->cpus);
xassert(job_resrcs_ptr->cpus_used);
if (!step_layout) /* batch step */
return rc;
if (!bit_set_count(job_resrcs_ptr->node_bitmap))
return rc;
xfree(*err_msg);
xassert(job_resrcs_ptr->core_bitmap);
xassert(job_resrcs_ptr->core_bitmap_used);
if (step_ptr->core_bitmap_job) {
/* "scontrol reconfig" of live system */
pick_step_cores = false;
} else if (!(step_ptr->flags & SSF_OVERCOMMIT) &&
(step_ptr->cpu_count == job_ptr->total_cpus) &&
((ntasks_per_core == mc_ptr->threads_per_core) ||
(ntasks_per_core == INFINITE16))) {
/*
* If the step isn't overcommitting and uses all of the job's cores,
* just copy the bitmap to save time
*/
step_ptr->core_bitmap_job = bit_copy(
job_resrcs_ptr->core_bitmap);
pick_step_cores = false;
}
if (step_ptr->pn_min_memory && _is_mem_resv() &&
((job_resrcs_ptr->memory_allocated == NULL) ||
(job_resrcs_ptr->memory_used == NULL))) {
error("%s: lack memory allocation details to enforce memory limits for %pJ",
__func__, job_ptr);
step_ptr->pn_min_memory = 0;
}
if (!step_ptr->pn_min_memory)
all_job_mem = true;
rem_nodes = bit_set_count(step_ptr->step_node_bitmap);
xassert(rem_nodes == step_layout->node_cnt);
cpus_alloc_pn = xcalloc(step_layout->node_cnt, sizeof(*cpus_alloc_pn));
step_ptr->memory_allocated = xcalloc(rem_nodes, sizeof(uint64_t));
for (int i = 0;
(node_ptr = next_node_bitmap(job_resrcs_ptr->node_bitmap, &i));
i++) {
/*
* gres_cpus_alloc - if cpus_per_gres is requested, this is
* cpus_per_gres * gres_alloc on this node
*/
int gres_cpus_alloc = 0;
uint16_t cpus_per_task = orig_cpus_per_task;
uint64_t gres_step_node_mem_alloc = 0;
uint16_t vpus, avail_cpus_per_core, alloc_cpus_per_core;
uint16_t task_cnt;
bitstr_t *unused_core_bitmap;
job_node_inx++;
if (!bit_test(step_ptr->step_node_bitmap, i))
continue;
step_node_inx++;
if (job_node_inx >= job_resrcs_ptr->nhosts)
fatal("%s: node index bad", __func__);
if (!(task_cnt = step_layout->tasks[step_node_inx])) {
/* This should have been caught earlier */
error("Bad step layout: no tasks placed on node %d (%s)",
job_node_inx,
node_ptr->name);
final_rc = ESLURM_BAD_TASK_COUNT;
/*
* Finish allocating resources to all nodes to avoid
* underflow errors in _step_dealloc_lps()
*/
continue;
}
/*
* NOTE: The --overcommit option can result in
* cpus_used[] having a higher value than cpus[]
*/
/*
* If the whole-node flag (SSF_WHOLE) is set, allocate all cpus here
* instead of just the ones requested
*/
if (first_step_node)
step_ptr->cpu_count = 0;
if ((++node_cnt) >
job_resrcs_ptr->cpu_array_reps[cpu_array_inx]) {
cpu_array_inx++;
node_cnt = 0;
}
vpus = node_ptr->tpc;
if (req_tpc != NO_VAL16)
avail_cpus_per_core = req_tpc;
else
avail_cpus_per_core = vpus;
/*
* If the step requested cpus_per_gres, this is mutually
* exclusive with cpus_per_task. We need to calculate total
* gres times cpus_per_gres to get a total cpu count.
*/
unused_core_bitmap = bit_copy(job_resrcs_ptr->core_bitmap);
bit_and_not(unused_core_bitmap,
job_resrcs_ptr->core_bitmap_used);
rc = gres_stepmgr_step_alloc(step_ptr->gres_list_req,
&step_ptr->gres_list_alloc,
job_ptr->gres_list_alloc,
job_node_inx, first_step_node,
task_cnt,
rem_nodes, job_ptr->job_id,
step_ptr->step_id.step_id,
!(step_ptr->flags &
SSF_OVERLAP_FORCE),
&gres_step_node_mem_alloc,
node_ptr->gres_list,
unused_core_bitmap,
&gres_cpus_alloc);
FREE_NULL_BITMAP(unused_core_bitmap);
if (rc != SLURM_SUCCESS) {
log_flag(STEPS, "unable to allocate step GRES for job node %d (%s): %s",
job_node_inx,
node_ptr->name,
slurm_strerror(rc));
/*
* We need to set alloc resources before we continue to
* avoid underflow in _step_dealloc_lps()
*/
final_rc = rc;
}
first_step_node = false;
rem_nodes--;
if (gres_cpus_alloc) {
if (task_cnt > gres_cpus_alloc) {
/*
* Do not error here. If a job requests fewer
* cpus than tasks via cpus_per_gres,
* the job will be allocated one cpu per task.
* Do the same here.
* Use this same logic in _step_dealloc_lps.
*/
cpus_per_task = 1;
log_flag(STEPS, "%s: %pS node %d (%s) gres_cpus_alloc (%d) < tasks (%u), changing gres_cpus_alloc to tasks.",
__func__, step_ptr, job_node_inx,
node_ptr->name, gres_cpus_alloc,
task_cnt);
gres_cpus_alloc = task_cnt;
} else {
cpus_per_task = gres_cpus_alloc / task_cnt;
}
}
/*
* Modify cpus-per-task to request full cores if they can't
* be shared
*/
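/*
* Illustrative example: with avail_cpus_per_core = 4 and
* ntasks_per_core = 1, alloc_cpus_per_core is 4, so a request of
* cpus_per_task = 3 is rounded up to 4 to consume whole cores.
*/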
if ((ntasks_per_core != INFINITE16) && ntasks_per_core) {
alloc_cpus_per_core = avail_cpus_per_core /
ntasks_per_core;
if ((alloc_cpus_per_core > 1) &&
(cpus_per_task % alloc_cpus_per_core)) {
cpus_per_task += alloc_cpus_per_core -
(cpus_per_task % alloc_cpus_per_core);
/*
* Modify gres_cpus_alloc to account for
* ntasks_per_core. If this results in
* requesting more cores than are available,
* then _pick_step_cores() will fail.
*
* Make sure to use this same logic in
* _step_dealloc_lps() to know how many
* cpus were allocated to this step on this
* node.
*/
if (gres_cpus_alloc)
gres_cpus_alloc = task_cnt *
cpus_per_task;
}
}
step_ptr->cpus_per_task = cpus_per_task;
/*
* Only populate cpus_per_task_array if needed: if cpus_per_tres
* was requested, then cpus_per_task may not be the same on all
* nodes. Otherwise, cpus_per_task is the same on all nodes,
* and this per-node array isn't needed.
*/
if (gres_cpus_alloc) {
if (!cpus_per_task_array)
cpus_per_task_array =
xcalloc(step_layout->node_cnt,
sizeof(*cpus_per_task_array));
cpus_per_task_array[step_node_inx] = cpus_per_task;
}
log_flag(STEPS, "%s: %pS node %d (%s) gres_cpus_alloc=%d tasks=%u cpus_per_task=%u",
__func__, step_ptr, job_node_inx, node_ptr->name,
gres_cpus_alloc, task_cnt,
cpus_per_task);
if (step_ptr->flags & SSF_WHOLE) {
cpus_alloc_mem = cpus_alloc =
job_resrcs_ptr->cpus[job_node_inx];
/*
* If we are requesting all the memory in the job
* (--mem=0) we get it all, otherwise we use what was
* requested specifically for the step.
*
* When not using all the job's memory, factor in the tpc so we
* get the correct amount of memory.
*/
if (all_job_mem)
cpus_alloc_mem =
job_resrcs_ptr->
cpu_array_value[cpu_array_inx];
else if ((req_tpc != NO_VAL16) &&
(req_tpc < vpus)) {
cpus_alloc_mem = ROUNDUP(cpus_alloc_mem, vpus);
cpus_alloc_mem *= req_tpc;
}
} else {
if (gres_cpus_alloc)
cpus_alloc = gres_cpus_alloc;
else
cpus_alloc = task_cnt * cpus_per_task;
/*
* If we are requesting all the memory in the job
* (--mem=0) we get it all, otherwise we use what was
* requested specifically for the step.
*/
if (all_job_mem)
cpus_alloc_mem =
job_resrcs_ptr->
cpu_array_value[cpu_array_inx];
else
cpus_alloc_mem = cpus_alloc;
/*
* If we are doing threads per core we need the whole
* core allocated even though we are only using what was
* requested. Don't worry about cpus_alloc_mem, it's
* already correct.
*/
_modify_cpus_alloc_for_tpc(job_resrcs_ptr->cr_type,
req_tpc, vpus, &cpus_alloc);
/*
* TODO: We need ntasks-per-* sent to the ctld to make
* more decisions on allocating cores.
*/
}
step_ptr->cpu_count += cpus_alloc;
cpus_alloc_pn[step_node_inx] = cpus_alloc;
/*
* Don't count this step against the allocation if
* --overlap=force
*/
if (!(step_ptr->flags & SSF_OVERLAP_FORCE)) {
cpus_alloc = ROUNDUP(cpus_alloc, vpus);
cpus_alloc *= vpus;
if ((job_resrcs_ptr->cr_type & SELECT_CPU) &&
(vpus > 1) &&
(job_resrcs_ptr->cpus_used[job_node_inx] +
cpus_alloc) > job_resrcs_ptr->cpus[job_node_inx])
job_resrcs_ptr->cpus_used[job_node_inx] =
job_resrcs_ptr->cpus[job_node_inx];
else
job_resrcs_ptr->cpus_used[job_node_inx] +=
cpus_alloc;
}
if (!step_ptr->pn_min_memory && !gres_step_node_mem_alloc) {
/* If we aren't requesting memory get it from the job */
step_ptr->pn_min_memory =
job_resrcs_ptr->memory_allocated[job_node_inx];
step_ptr->flags |= SSF_MEM_ZERO;
}
if (step_ptr->pn_min_memory && _is_mem_resv()) {
uint64_t mem_use;
if (step_ptr->pn_min_memory & MEM_PER_CPU) {
mem_use = step_ptr->pn_min_memory;
mem_use &= (~MEM_PER_CPU);
mem_use *= cpus_alloc_mem;
} else if (step_ptr->flags & SSF_MEM_ZERO) {
mem_use = job_resrcs_ptr->
memory_allocated[job_node_inx];
} else {
mem_use = step_ptr->pn_min_memory;
}
step_ptr->memory_allocated[step_node_inx] = mem_use;
/*
* Do not count against the job's memory allocation if
* --mem=0 or --overlap=force were requested.
*/
if (!(step_ptr->flags & SSF_MEM_ZERO) &&
!(step_ptr->flags & SSF_OVERLAP_FORCE))
job_resrcs_ptr->memory_used[job_node_inx] +=
mem_use;
} else if (_is_mem_resv()) {
step_ptr->memory_allocated[step_node_inx] =
gres_step_node_mem_alloc;
/*
* Don't count this step against the allocation if
* --overlap=force
*/
if (!(step_ptr->flags & SSF_OVERLAP_FORCE))
job_resrcs_ptr->memory_used[job_node_inx] +=
gres_step_node_mem_alloc;
}
/*
* Now that we have set the cpus and memory used for this node,
* we can check whether there was an error and continue to the
* next node. If any node had an error, we also skip picking
* cores and move on to the next node.
*/
if (final_rc != SLURM_SUCCESS) {
continue;
}
if (pick_step_cores) {
uint16_t cpus_per_core = 1;
/*
* Here we're setting the number of CPUs per core
* if we don't enforce 1 thread per core
*
* TODO: move cpus_per_core to slurm_step_layout_t
*/
if (!_use_one_thread_per_core(step_ptr) &&
(!(node_ptr->cpus == node_ptr->tot_cores))) {
if (step_ptr->threads_per_core != NO_VAL16)
cpus_per_core =
step_ptr->threads_per_core;
else if (mc_ptr->threads_per_core != NO_VAL16)
cpus_per_core =
mc_ptr->threads_per_core;
else {
cpus_per_core = node_ptr->threads;
}
}
if ((rc = _pick_step_cores(step_ptr, job_resrcs_ptr,
job_node_inx,
task_cnt,
cpus_per_core, i,
ntasks_per_core,
gres_cpus_alloc))) {
log_flag(STEPS, "unable to pick step cores for job node %d (%s): %s",
job_node_inx,
node_ptr->name,
slurm_strerror(rc));
final_rc = rc;
/* Finish allocating resources to all nodes */
continue;
}
}
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_BIND)
_dump_step_layout(step_ptr);
if (step_ptr->flags & SSF_OVERLAP_FORCE)
log_flag(STEPS, "step alloc on job node %d (%s); does not count against job allocation",
job_node_inx,
node_ptr->name);
else
log_flag(STEPS, "step alloc on job node %d (%s) used %u of %u CPUs",
job_node_inx,
node_ptr->name,
job_resrcs_ptr->cpus_used[job_node_inx],
job_resrcs_ptr->cpus[job_node_inx]);
if (step_node_inx == (step_layout->node_cnt - 1))
break;
}
slurm_array16_to_value_reps(cpus_per_task_array, step_layout->node_cnt,
&step_layout->cpt_compact_array,
&step_layout->cpt_compact_reps,
&step_layout->cpt_compact_cnt);
xfree(cpus_per_task_array);
slurm_array16_to_value_reps(cpus_alloc_pn, step_layout->node_cnt,
&step_ptr->cpu_alloc_values,
&step_ptr->cpu_alloc_reps,
&step_ptr->cpu_alloc_array_cnt);
xfree(cpus_alloc_pn);
gres_step_state_log(step_ptr->gres_list_req, job_ptr->job_id,
step_ptr->step_id.step_id);
if ((slurm_conf.debug_flags & DEBUG_FLAG_GRES) &&
step_ptr->gres_list_alloc)
info("Step Alloc GRES:");
gres_step_state_log(step_ptr->gres_list_alloc, job_ptr->job_id,
step_ptr->step_id.step_id);
/*
* If we failed to allocate resources on at least one of the nodes, we
* need to deallocate resources.
* Creating a backup of the resources then restoring in case of an
* error does not work - this method leaves cpus allocated to the node
* after the job completes. Instead, we try to allocate resources on
* all nodes in the job even if one of the nodes resulted in a failure.
*/
if (final_rc != SLURM_SUCCESS)
_step_dealloc_lps(step_ptr);
return final_rc;
}
/* Dump a job step's CPU binding information.
* NOTE: The core_bitmap_job and node index are based upon
* the _job_ allocation */
static void _dump_step_layout(step_record_t *step_ptr)
{
job_record_t *job_ptr = step_ptr->job_ptr;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
int i, bit_inx, core_inx, node_inx, rep, sock_inx;
if ((step_ptr->core_bitmap_job == NULL) ||
(job_resrcs_ptr == NULL) ||
(job_resrcs_ptr->cores_per_socket == NULL))
return;
info("====================");
info("%pS", step_ptr);
for (i=0, bit_inx=0, node_inx=0; node_inx<job_resrcs_ptr->nhosts; i++) {
for (rep=0; rep<job_resrcs_ptr->sock_core_rep_count[i]; rep++) {
for (sock_inx=0;
sock_inx<job_resrcs_ptr->sockets_per_node[i];
sock_inx++) {
for (core_inx=0;
core_inx<job_resrcs_ptr->cores_per_socket[i];
core_inx++) {
if (bit_test(step_ptr->
core_bitmap_job,
bit_inx++)) {
info("JobNode[%d] Socket[%d] "
"Core[%d] is allocated",
node_inx, sock_inx,
core_inx);
}
}
}
node_inx++;
}
}
info("====================");
}
static void _step_dealloc_lps(step_record_t *step_ptr)
{
job_record_t *job_ptr = step_ptr->job_ptr;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
int cpus_alloc;
int job_node_inx = -1, step_node_inx = -1;
uint32_t step_id = step_ptr->step_id.step_id;
node_record_t *node_ptr;
xassert(job_resrcs_ptr);
if (!job_resrcs_ptr) {
error("%s: job_resrcs is NULL for %pS; this should never happen",
__func__, step_ptr);
return;
}
xassert(job_resrcs_ptr->cpus);
xassert(job_resrcs_ptr->cpus_used);
/* These special steps do not allocate any resources */
if ((step_id == SLURM_EXTERN_CONT) ||
(step_id == SLURM_BATCH_SCRIPT) ||
(step_id == SLURM_INTERACTIVE_STEP) ||
(step_ptr->flags & SSF_EXT_LAUNCHER)) {
log_flag(STEPS, "Skip %s for %pS", __func__, step_ptr);
return;
}
if (!bit_set_count(job_resrcs_ptr->node_bitmap))
return;
if (step_ptr->memory_allocated && _is_mem_resv() &&
((job_resrcs_ptr->memory_allocated == NULL) ||
(job_resrcs_ptr->memory_used == NULL))) {
error("%s: lack memory allocation details to enforce memory limits for %pJ",
__func__, job_ptr);
}
for (int i = 0;
(node_ptr = next_node_bitmap(job_resrcs_ptr->node_bitmap, &i));
i++) {
int inx;
uint16_t vpus = node_ptr->tpc;
job_node_inx++;
if (!bit_test(step_ptr->step_node_bitmap, i))
continue;
step_node_inx++;
if (job_node_inx >= job_resrcs_ptr->nhosts)
fatal("_step_dealloc_lps: node index bad");
/*
* We need to free GRES structures regardless of overlap.
*/
gres_stepmgr_step_dealloc(step_ptr->gres_list_alloc,
job_ptr->gres_list_alloc, job_ptr->job_id,
step_ptr->step_id.step_id,
job_node_inx,
!(step_ptr->flags & SSF_OVERLAP_FORCE));
if (step_ptr->flags & SSF_OVERLAP_FORCE) {
log_flag(STEPS, "step dealloc on job node %d (%s); did not count against job allocation",
job_node_inx,
node_ptr->name);
continue; /* Next node */
}
/*
* If zero tasks, then _step_alloc_lps() error'd and did not
* allocate any resources, so we should not deallocate anything.
*/
if (!step_ptr->step_layout->tasks[step_node_inx])
continue;
xassert(step_ptr->cpu_alloc_array_cnt);
xassert(step_ptr->cpu_alloc_reps);
xassert(step_ptr->cpu_alloc_values);
inx = slurm_get_rep_count_inx(
step_ptr->cpu_alloc_reps,
step_ptr->cpu_alloc_array_cnt,
step_node_inx);
cpus_alloc = ROUNDUP(step_ptr->cpu_alloc_values[inx], vpus);
cpus_alloc *= vpus;
if ((job_resrcs_ptr->cr_type & SELECT_CPU) &&
(node_ptr->tpc > 1)) {
int core_alloc = ROUNDUP(cpus_alloc, vpus);
int used_cores =
ROUNDUP(job_resrcs_ptr->cpus_used[job_node_inx],
vpus);
/*
* If SELECT_CPU is used with a thread count > 1, the
* cpus recorded as allocated to a job don't have to
* be a multiple of threads per core. Make sure to
* dealloc full cores and not partial cores.
*/
if (used_cores >= core_alloc) {
used_cores -= core_alloc;
job_resrcs_ptr->cpus_used[job_node_inx] =
MIN(used_cores * vpus,
job_resrcs_ptr->cpus[job_node_inx]);
} else {
error("%s: CPU underflow for %pS (%u<%u on job node %d)",
__func__, step_ptr, used_cores * vpus,
core_alloc * vpus, job_node_inx);
job_resrcs_ptr->cpus_used[job_node_inx] = 0;
}
} else if (job_resrcs_ptr->cpus_used[job_node_inx] >=
cpus_alloc) {
job_resrcs_ptr->cpus_used[job_node_inx] -= cpus_alloc;
} else {
error("%s: CPU underflow for %pS (%u<%u on job node %d)",
__func__, step_ptr,
job_resrcs_ptr->cpus_used[job_node_inx],
cpus_alloc, job_node_inx);
job_resrcs_ptr->cpus_used[job_node_inx] = 0;
}
if (step_ptr->memory_allocated && _is_mem_resv() &&
!(step_ptr->flags & SSF_MEM_ZERO)) {
uint64_t mem_use =
step_ptr->memory_allocated[step_node_inx];
if (job_resrcs_ptr->memory_used[job_node_inx] >=
mem_use) {
job_resrcs_ptr->memory_used[job_node_inx] -=
mem_use;
log_flag(STEPS, "Deallocating %"PRIu64"MB of memory on node %d (%s) now used: %"PRIu64" of %"PRIu64,
mem_use,
job_node_inx,
node_ptr->name,
job_resrcs_ptr->
memory_used[job_node_inx],
job_resrcs_ptr->
memory_allocated[job_node_inx]);
} else {
error("%s: Allocated memory underflow for %pS (freed memory=%"PRIu64")",
__func__, step_ptr, mem_use);
job_resrcs_ptr->memory_used[job_node_inx] = 0;
}
}
log_flag(STEPS, "step dealloc on job node %d (%s) used: %u of %u CPUs",
job_node_inx, node_ptr->name,
job_resrcs_ptr->cpus_used[job_node_inx],
job_resrcs_ptr->cpus[job_node_inx]);
if (step_node_inx == (step_ptr->step_layout->node_cnt - 1))
break;
}
xassert(job_resrcs_ptr->core_bitmap);
xassert(job_resrcs_ptr->core_bitmap_used);
if (step_ptr->core_bitmap_job) {
/* Mark the job's cores as no longer in use */
int job_core_size, step_core_size;
job_core_size = bit_size(job_resrcs_ptr->core_bitmap_used);
step_core_size = bit_size(step_ptr->core_bitmap_job);
/*
* Don't remove step's used cores from job core_bitmap_used if
* SSF_OVERLAP_FORCE
*/
if (job_core_size == step_core_size) {
if (!(step_ptr->flags & SSF_OVERLAP_FORCE))
bit_and_not(job_resrcs_ptr->core_bitmap_used,
step_ptr->core_bitmap_job);
} else {
error("%s: %pS core_bitmap size mismatch (%d != %d)",
__func__, step_ptr, job_core_size,
step_core_size);
}
FREE_NULL_BITMAP(step_ptr->core_bitmap_job);
}
}
static int _test_strlen(char *test_str, char *str_name, int max_str_len)
{
int i = 0;
if (test_str)
i = strlen(test_str);
if (i > max_str_len) {
info("step_create_request: strlen(%s) too big (%d > %d)",
str_name, i, max_str_len);
return ESLURM_PATHNAME_TOO_LONG;
}
return SLURM_SUCCESS;
}
/* Calculate a step's cpus_per_task value. Set to zero if we can't distribute
* the tasks evenly over the nodes (heterogeneous job allocation). */
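/*
* Illustrative example: cpu_count = 8 with num_tasks = 4 yields
* cpus_per_task = 2, provided each entry in the job's cpu_array_value can
* hold at least 2 CPUs and the per-node CPU counts can accommodate all 4
* tasks; otherwise 0 is returned.
*/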
static int _calc_cpus_per_task(job_step_create_request_msg_t *step_specs,
job_record_t *job_ptr)
{
int cpus_per_task = 0, i;
int num_tasks;
char *cpt = NULL;
if ((cpt = xstrstr(step_specs->tres_per_task, "cpu:"))) {
cpus_per_task = slurm_atoul(cpt + 4);
if (cpus_per_task < 0)
cpus_per_task = 0;
return cpus_per_task;
}
if (step_specs->cpus_per_tres)
return 0;
if (step_specs->num_tasks == NO_VAL)
return 0;
if ((step_specs->cpu_count == 0) ||
(step_specs->cpu_count % step_specs->num_tasks))
return cpus_per_task;
cpus_per_task = step_specs->cpu_count / step_specs->num_tasks;
if (cpus_per_task < 1)
cpus_per_task = 1;
if (!job_ptr->job_resrcs)
return cpus_per_task;
num_tasks = step_specs->num_tasks;
for (i = 0; i < job_ptr->job_resrcs->cpu_array_cnt; i++) {
if (cpus_per_task > job_ptr->job_resrcs->cpu_array_value[i]) {
cpus_per_task = 0;
break;
}
num_tasks -= (job_ptr->job_resrcs->cpu_array_value[i] /
cpus_per_task) *
job_ptr->job_resrcs->cpu_array_reps[i];
}
if (num_tasks > 0)
return 0;
return cpus_per_task;
}
/*
* Set a job's default cpu_bind_type based upon the configuration of its
* allocated nodes, its partition, or the global TaskPluginParams
*/
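/*
* Precedence implemented below: keep any bind-to-* type already set on the
* job, otherwise use a cpu_bind shared by every allocated node, otherwise
* the partition's cpu_bind, and finally the global TaskPluginParams value.
*/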
static void _set_def_cpu_bind(job_record_t *job_ptr)
{
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
node_record_t *node_ptr;
uint32_t bind_bits, bind_to_bits, node_bind = NO_VAL;
bool node_fail = false;
if (!job_ptr->details || !job_resrcs_ptr ||
!job_resrcs_ptr->node_bitmap)
return; /* No data structure */
bind_to_bits = CPU_BIND_TO_SOCKETS | CPU_BIND_TO_CORES |
CPU_BIND_TO_THREADS | CPU_BIND_TO_LDOMS;
if ((job_ptr->details->cpu_bind_type != NO_VAL16) &&
(job_ptr->details->cpu_bind_type & bind_to_bits)) {
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_BIND) {
char tmp_str[128];
slurm_sprint_cpu_bind_type(
tmp_str, job_ptr->details->cpu_bind_type);
log_flag(CPU_BIND, "%pJ CpuBind='%s' already set for job/allocation using it as a default for new step.",
job_ptr, tmp_str);
}
return; /* Already set */
}
bind_bits = job_ptr->details->cpu_bind_type & CPU_BIND_VERBOSE;
/*
* Set job's cpu_bind to the node's cpu_bind if all of the job's
* allocated nodes have the same cpu_bind (or it is not set)
*/
for (int i = 0;
(node_ptr = next_node_bitmap(job_resrcs_ptr->node_bitmap, &i));
i++) {
if (node_bind == NO_VAL) {
if (node_ptr->cpu_bind != 0)
node_bind = node_ptr->cpu_bind;
} else if ((node_ptr->cpu_bind != 0) &&
(node_bind != node_ptr->cpu_bind)) {
node_fail = true;
break;
}
}
if (!node_fail && (node_bind != NO_VAL)) {
job_ptr->details->cpu_bind_type = bind_bits | node_bind;
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_BIND) {
char tmp_str[128];
slurm_sprint_cpu_bind_type(
tmp_str, job_ptr->details->cpu_bind_type);
log_flag(CPU_BIND, "%pJ setting default CpuBind to nodes default '%s' for new step.",
job_ptr, tmp_str);
}
return;
}
/* Use partition's cpu_bind (if any) */
if (job_ptr->part_ptr && job_ptr->part_ptr->cpu_bind) {
job_ptr->details->cpu_bind_type = bind_bits |
job_ptr->part_ptr->cpu_bind;
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_BIND) {
char tmp_str[128];
slurm_sprint_cpu_bind_type(
tmp_str, job_ptr->details->cpu_bind_type);
log_flag(CPU_BIND, "%pJ setting default CpuBind to partition default '%s' for new step.",
job_ptr, tmp_str);
}
return;
}
/* Use global default from TaskPluginParams */
job_ptr->details->cpu_bind_type = bind_bits |
slurm_conf.task_plugin_param;
if (slurm_conf.debug_flags & DEBUG_FLAG_CPU_BIND) {
char tmp_str[128];
slurm_sprint_cpu_bind_type(tmp_str,
job_ptr->details->cpu_bind_type);
log_flag(CPU_BIND, "%pJ setting default CpuBind to TaskPluginParam '%s' for new step.",
job_ptr, tmp_str);
}
}
/*
* A step may explicitly set a TRES count to zero in order to avoid making use
* of the job's TRES specifications. At this point, clear the records with
* zero counts.
*/
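/*
* Illustrative example: a hypothetical spec of "gres/gpu:2,gres/nic:0"
* would be rewritten as "gres/gpu:2", since the trailing ":0" marks a
* record to drop.
*/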
static void _clear_zero_tres(char **tres_spec)
{
char *new_spec = NULL, *new_sep = "";
char *tmp, *tok, *sep, *end_ptr = NULL, *save_ptr = NULL;
long int cnt;
if (*tres_spec == NULL)
return;
tmp = xstrdup(*tres_spec);
tok = strtok_r(tmp, ",", &save_ptr);
while (tok) {
bool copy_rec = true;
sep = strrchr(tok, ':');
if (sep) {
cnt = strtoll(sep+1, &end_ptr, 10);
if ((cnt == 0) && (end_ptr[0] == '\0'))
copy_rec = false;
}
if (copy_rec) {
xstrfmtcat(new_spec, "%s%s", new_sep, tok);
new_sep = ",";
}
tok = strtok_r(NULL, ",", &save_ptr);
}
xfree(tmp);
xfree(*tres_spec);
*tres_spec = new_spec;
}
/*
* A step may explicitly request --gres=none in order to avoid making use
* of the job's TRES specifications. At this point, clear all GRES records.
*/
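/*
* Illustrative example: a hypothetical spec of "cpu:4,gres/gpu:2" would be
* rewritten as "cpu:4", since every comma-separated token beginning with
* "gres" is dropped.
*/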
static void _clear_gres_tres(char **tres_spec)
{
char *new_spec = NULL, *new_sep = "";
char *tmp, *tok, *save_ptr = NULL;
if (*tres_spec == NULL)
return;
tmp = xstrdup(*tres_spec);
tok = strtok_r(tmp, ",", &save_ptr);
while (tok) {
if (xstrncmp(tok, "gres", 4)) {
xstrfmtcat(new_spec, "%s%s", new_sep, tok);
new_sep = ",";
}
tok = strtok_r(NULL, ",", &save_ptr);
}
xfree(tmp);
xfree(*tres_spec);
*tres_spec = new_spec;
}
/*
* If a job step specification does not include any GRES specification,
* then copy those values from the job record.
* Currently we only want to check if the step lacks a "gres" request.
* "tres_per_[step|task]" has "cpu:<count>" in it, so we need to search for
* "gres" in the strings.
*/
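/*
* Branch summary for the function below: --gres=none (tres_per_node of
* "NONE") strips all GRES from the step, any explicit "gres" request keeps
* the step's values but drops zero-count entries, and a step with no GRES
* request at all inherits the job's tres_per_* values.
*/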
static void _copy_job_tres_to_step(job_step_create_request_msg_t *step_specs,
job_record_t *job_ptr)
{
if (!xstrcasecmp(step_specs->tres_per_node, "NONE")) {
xfree(step_specs->tres_per_node);
_clear_gres_tres(&step_specs->tres_per_step);
_clear_gres_tres(&step_specs->tres_per_socket);
_clear_gres_tres(&step_specs->tres_per_task);
} else if (xstrstr(step_specs->tres_per_step, "gres") ||
xstrstr(step_specs->tres_per_node, "gres") ||
xstrstr(step_specs->tres_per_socket, "gres") ||
xstrstr(step_specs->tres_per_task, "gres")) {
_clear_zero_tres(&step_specs->tres_per_step);
_clear_zero_tres(&step_specs->tres_per_node);
_clear_zero_tres(&step_specs->tres_per_socket);
_clear_zero_tres(&step_specs->tres_per_task);
} else {
xfree(step_specs->tres_per_step);
xfree(step_specs->tres_per_node);
xfree(step_specs->tres_per_socket);
xfree(step_specs->tres_per_task);
step_specs->tres_per_step = xstrdup(job_ptr->tres_per_job);
step_specs->tres_per_node = xstrdup(job_ptr->tres_per_node);
step_specs->tres_per_socket = xstrdup(job_ptr->tres_per_socket);
step_specs->tres_per_task = xstrdup(job_ptr->tres_per_task);
}
}
static int _test_step_desc_fields(job_step_create_request_msg_t *step_specs)
{
static time_t sched_update = 0;
static int max_submit_line = DEFAULT_MAX_SUBMIT_LINE_SIZE;
if (sched_update != slurm_conf.last_update) {
char *tmp_ptr;
sched_update = slurm_conf.last_update;
if ((tmp_ptr = xstrcasestr(slurm_conf.sched_params,
"max_submit_line_size="))) {
max_submit_line = atoi(tmp_ptr + 21);
} else {
max_submit_line = DEFAULT_MAX_SUBMIT_LINE_SIZE;
}
}
if (_test_strlen(step_specs->host, "host", 1024) ||
_test_strlen(step_specs->name, "name", 1024) ||
_test_strlen(step_specs->network, "network", 1024) ||
_test_strlen(step_specs->submit_line, "submit_line",
max_submit_line))
return ESLURM_PATHNAME_TOO_LONG;
return SLURM_SUCCESS;
}
static int _switch_setup(step_record_t *step_ptr)
{
xassert(step_ptr);
if (!step_ptr->step_layout)
return SLURM_SUCCESS;
errno = 0;
if (switch_g_build_stepinfo(&step_ptr->switch_step,
step_ptr->step_layout,
step_ptr) < 0) {
if (errno == ESLURM_INTERCONNECT_BUSY)
return errno;
return ESLURM_INTERCONNECT_FAILURE;
}
return SLURM_SUCCESS;
}
extern int step_create(job_record_t *job_ptr,
job_step_create_request_msg_t *step_specs,
step_record_t** new_step_record,
uint16_t protocol_version, char **err_msg)
{
step_record_t *step_ptr;
bitstr_t *nodeset;
int cpus_per_task, ret_code, i;
uint32_t node_count = 0;
time_t now = time(NULL);
char *step_node_list = NULL;
uint32_t orig_cpu_count;
list_t *step_gres_list = NULL;
uint32_t task_dist;
uint32_t max_tasks;
uint32_t over_time_limit;
bool resv_ports_present = false;
*new_step_record = NULL;
xassert(job_ptr);
/*
* NOTE: We have already confirmed the UID originating
* the request is identical with step_specs->user_id
*/
if (step_specs->user_id != job_ptr->user_id)
return ESLURM_ACCESS_DENIED;
if (step_specs->step_id.step_id != NO_VAL) {
if (list_delete_first(job_ptr->step_list,
_purge_duplicate_steps,
step_specs) < 0)
return ESLURM_DUPLICATE_STEP_ID;
}
if ((job_ptr->details == NULL) || IS_JOB_SUSPENDED(job_ptr))
return ESLURM_DISABLED;
if (IS_JOB_PENDING(job_ptr)) {
/* NOTE: LSF creates a job allocation for batch jobs.
* After the allocation has been made, LSF submits a
* job to run in that allocation (sbatch --jobid= ...).
* If that job is pending either LSF messed up or LSF is
* not being used. We have seen this problem with Moab. */
return ESLURM_DUPLICATE_JOB_ID;
}
/* Get OverTimeLimit from job's partition if set, or globally. */
if (job_ptr->part_ptr &&
(job_ptr->part_ptr->over_time_limit != NO_VAL16))
over_time_limit = job_ptr->part_ptr->over_time_limit;
else
over_time_limit = slurm_conf.over_time_limit;
if (over_time_limit == INFINITE16)
over_time_limit = YEAR_MINUTES;
if (IS_JOB_FINISHED(job_ptr) ||
(((job_ptr->end_time + (over_time_limit * 60)) <= time(NULL)) &&
!IS_JOB_CONFIGURING(job_ptr)))
return ESLURM_ALREADY_DONE;
if (job_ptr->details->prolog_running)
return ESLURM_PROLOG_RUNNING;
if (step_specs->flags & SSF_INTERACTIVE) {
debug("%s: interactive step requested", __func__);
*new_step_record = _build_interactive_step(job_ptr, step_specs,
protocol_version);
if (*new_step_record)
return SLURM_SUCCESS;
else
return ESLURM_DUPLICATE_STEP_ID;
}
if (step_specs->flags & SSF_EXT_LAUNCHER) {
debug("%s: external launcher step requested", __func__);
return _build_ext_launcher_step(new_step_record, job_ptr,
step_specs, protocol_version);
}
/* A step cannot request more threads per core than its allocation. */
if ((step_specs->threads_per_core != NO_VAL16) &&
(step_specs->threads_per_core >
job_ptr->job_resrcs->threads_per_core))
return ESLURM_BAD_THREAD_PER_CORE;
task_dist = step_specs->task_dist & SLURM_DIST_STATE_BASE;
/* Set to block in the case that mem is 0. srun leaves the dist
* set to unknown if mem is 0. */
if ((task_dist == SLURM_DIST_UNKNOWN) &&
(!(step_specs->pn_min_memory &(~MEM_PER_CPU)))) {
step_specs->task_dist &= SLURM_DIST_STATE_FLAGS;
step_specs->task_dist |= SLURM_DIST_BLOCK;
task_dist = SLURM_DIST_BLOCK;
}
if ((task_dist != SLURM_DIST_CYCLIC) &&
(task_dist != SLURM_DIST_BLOCK) &&
(task_dist != SLURM_DIST_CYCLIC_CYCLIC) &&
(task_dist != SLURM_DIST_BLOCK_CYCLIC) &&
(task_dist != SLURM_DIST_CYCLIC_BLOCK) &&
(task_dist != SLURM_DIST_BLOCK_BLOCK) &&
(task_dist != SLURM_DIST_CYCLIC_CFULL) &&
(task_dist != SLURM_DIST_BLOCK_CFULL) &&
(task_dist != SLURM_DIST_CYCLIC_CYCLIC_CYCLIC) &&
(task_dist != SLURM_DIST_CYCLIC_CYCLIC_BLOCK) &&
(task_dist != SLURM_DIST_CYCLIC_CYCLIC_CFULL) &&
(task_dist != SLURM_DIST_CYCLIC_BLOCK_CYCLIC) &&
(task_dist != SLURM_DIST_CYCLIC_BLOCK_BLOCK) &&
(task_dist != SLURM_DIST_CYCLIC_BLOCK_CFULL) &&
(task_dist != SLURM_DIST_CYCLIC_CFULL_CYCLIC) &&
(task_dist != SLURM_DIST_CYCLIC_CFULL_BLOCK) &&
(task_dist != SLURM_DIST_CYCLIC_CFULL_CFULL) &&
(task_dist != SLURM_DIST_BLOCK_CYCLIC_CYCLIC) &&
(task_dist != SLURM_DIST_BLOCK_CYCLIC_BLOCK) &&
(task_dist != SLURM_DIST_BLOCK_CYCLIC_CFULL) &&
(task_dist != SLURM_DIST_BLOCK_BLOCK_CYCLIC) &&
(task_dist != SLURM_DIST_BLOCK_BLOCK_BLOCK) &&
(task_dist != SLURM_DIST_BLOCK_BLOCK_CFULL) &&
(task_dist != SLURM_DIST_BLOCK_CFULL_CYCLIC) &&
(task_dist != SLURM_DIST_BLOCK_CFULL_BLOCK) &&
(task_dist != SLURM_DIST_BLOCK_CFULL_CFULL) &&
(task_dist != SLURM_DIST_PLANE) &&
(task_dist != SLURM_DIST_ARBITRARY))
return ESLURM_BAD_DIST;
if (!assoc_mgr_valid_tres_cnt(step_specs->cpus_per_tres, 0) ||
!assoc_mgr_valid_tres_cnt(step_specs->mem_per_tres, 0) ||
tres_bind_verify_cmdline(step_specs->tres_bind) ||
tres_freq_verify_cmdline(step_specs->tres_freq) ||
!assoc_mgr_valid_tres_cnt(step_specs->tres_per_step, 0) ||
(!assoc_mgr_valid_tres_cnt(step_specs->tres_per_node, 0) &&
xstrcasecmp(step_specs->tres_per_node, "NONE")) ||
!assoc_mgr_valid_tres_cnt(step_specs->tres_per_socket, 0) ||
!assoc_mgr_valid_tres_cnt(step_specs->tres_per_task, 0))
return ESLURM_INVALID_TRES;
if ((ret_code = _test_step_desc_fields(step_specs)) != SLURM_SUCCESS)
return ret_code;
if (job_ptr->next_step_id >= slurm_conf.max_step_cnt)
return ESLURM_STEP_LIMIT;
/*
* If the overcommit flag is set, we set cpu_count=0,
* which skips the check against the available cpus
*/
orig_cpu_count = step_specs->cpu_count;
if (step_specs->flags & SSF_OVERCOMMIT)
step_specs->cpu_count = 0;
if (!step_specs->ntasks_per_tres)
step_specs->ntasks_per_tres = NO_VAL16;
/* determine cpus_per_task value by reversing what srun does */
if (step_specs->num_tasks < 1)
return ESLURM_BAD_TASK_COUNT;
cpus_per_task = _calc_cpus_per_task(step_specs, job_ptr);
_copy_job_tres_to_step(step_specs, job_ptr);
/* If --whole is given we probably need to copy tres_per_* from the job */
i = gres_step_state_validate(step_specs->cpus_per_tres,
step_specs->tres_per_step,
step_specs->tres_per_node,
step_specs->tres_per_socket,
step_specs->tres_per_task,
step_specs->mem_per_tres,
step_specs->ntasks_per_tres,
step_specs->min_nodes,
&step_gres_list,
job_ptr->job_id,
NO_VAL, &step_specs->num_tasks,
&step_specs->cpu_count, err_msg);
if (i != SLURM_SUCCESS) {
FREE_NULL_LIST(step_gres_list);
return i;
}
job_ptr->time_last_active = now;
nodeset = _pick_step_nodes(job_ptr, step_specs, step_gres_list,
cpus_per_task, node_count, &ret_code);
if (nodeset == NULL) {
FREE_NULL_LIST(step_gres_list);
if ((ret_code == ESLURM_NODES_BUSY) ||
(ret_code == ESLURM_PORTS_BUSY) ||
(ret_code == ESLURM_INTERCONNECT_BUSY))
_build_pending_step(job_ptr, step_specs);
return ret_code;
}
_set_def_cpu_bind(job_ptr);
node_count = bit_set_count(nodeset);
xassert(step_specs->num_tasks != NO_VAL);
max_tasks = node_count * slurm_conf.max_tasks_per_node;
if (step_specs->num_tasks > max_tasks) {
error("step has invalid task count: %u max is %u",
step_specs->num_tasks, max_tasks);
FREE_NULL_LIST(step_gres_list);
FREE_NULL_BITMAP(nodeset);
return ESLURM_BAD_TASK_COUNT;
}
step_ptr = create_step_record(job_ptr, protocol_version);
if (step_ptr == NULL) {
FREE_NULL_LIST(step_gres_list);
FREE_NULL_BITMAP(nodeset);
return ESLURMD_TOOMANYSTEPS;
}
*stepmgr_ops->last_job_update = time(NULL);
step_ptr->start_time = time(NULL);
step_ptr->state = JOB_RUNNING;
memcpy(&step_ptr->step_id, &step_specs->step_id,
sizeof(step_ptr->step_id));
if (step_specs->array_task_id != NO_VAL)
step_ptr->step_id.job_id = job_ptr->job_id;
if (step_specs->step_id.step_id != NO_VAL) {
if (step_specs->step_id.step_het_comp == NO_VAL) {
job_ptr->next_step_id =
MAX(job_ptr->next_step_id,
step_specs->step_id.step_id);
job_ptr->next_step_id++;
}
} else if (job_ptr->het_job_id &&
(job_ptr->het_job_id != job_ptr->job_id)) {
job_record_t *het_job;
het_job = stepmgr_ops->find_job_record(job_ptr->het_job_id);
if (het_job)
step_ptr->step_id.step_id = het_job->next_step_id++;
else
step_ptr->step_id.step_id = job_ptr->next_step_id++;
job_ptr->next_step_id = MAX(job_ptr->next_step_id,
step_ptr->step_id.step_id);
} else {
step_ptr->step_id.step_id = job_ptr->next_step_id++;
}
/* Here is where the node list is set for the step */
if (step_specs->node_list &&
((step_specs->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY)) {
step_node_list = xstrdup(step_specs->node_list);
xfree(step_specs->node_list);
step_specs->node_list = bitmap2node_name(nodeset);
} else {
step_node_list = bitmap2node_name_sortable(nodeset, false);
xfree(step_specs->node_list);
step_specs->node_list = xstrdup(step_node_list);
}
log_flag(STEPS, "Picked nodes %s when accumulating from %s",
step_node_list, step_specs->node_list);
step_ptr->step_node_bitmap = nodeset;
switch (step_specs->task_dist & SLURM_DIST_NODESOCKMASK) {
case SLURM_DIST_CYCLIC:
case SLURM_DIST_CYCLIC_CYCLIC:
case SLURM_DIST_CYCLIC_CFULL:
case SLURM_DIST_CYCLIC_BLOCK:
step_ptr->cyclic_alloc = 1;
break;
default:
step_ptr->cyclic_alloc = 0;
break;
}
step_ptr->container = xstrdup(step_specs->container);
step_ptr->container_id = xstrdup(step_specs->container_id);
step_ptr->gres_list_req = step_gres_list;
step_gres_list = NULL;
gres_step_state_log(step_ptr->gres_list_req, job_ptr->job_id,
step_ptr->step_id.step_id);
if ((slurm_conf.debug_flags & DEBUG_FLAG_GRES) &&
step_ptr->gres_list_alloc)
info("Step Alloc GRES:");
gres_step_state_log(step_ptr->gres_list_alloc, job_ptr->job_id,
step_ptr->step_id.step_id);
step_ptr->port = step_specs->port;
step_ptr->srun_pid = step_specs->srun_pid;
step_ptr->host = xstrdup(step_specs->host);
if ((step_specs->cpu_freq_min == NO_VAL) &&
(step_specs->cpu_freq_max == NO_VAL) &&
(step_specs->cpu_freq_gov == NO_VAL)) {
step_ptr->cpu_freq_min = job_ptr->details->cpu_freq_min;
step_ptr->cpu_freq_max = job_ptr->details->cpu_freq_max;
step_ptr->cpu_freq_gov = job_ptr->details->cpu_freq_gov;
} else {
step_ptr->cpu_freq_min = step_specs->cpu_freq_min;
step_ptr->cpu_freq_max = step_specs->cpu_freq_max;
step_ptr->cpu_freq_gov = step_specs->cpu_freq_gov;
}
step_ptr->cpus_per_task = (uint16_t)cpus_per_task;
step_ptr->ntasks_per_core = step_specs->ntasks_per_core;
step_ptr->pn_min_memory = step_specs->pn_min_memory;
/*
* cpu_count can be updated by gres_step_state_validate() if OVERCOMMIT
* is not used. If so, use the updated value.
*/
if (step_specs->flags & SSF_OVERCOMMIT)
step_ptr->cpu_count = orig_cpu_count;
else
step_ptr->cpu_count = step_specs->cpu_count;
step_ptr->exit_code = NO_VAL;
step_ptr->flags = step_specs->flags;
step_ptr->cpus_per_tres = xstrdup(step_specs->cpus_per_tres);
step_ptr->mem_per_tres = xstrdup(step_specs->mem_per_tres);
step_ptr->cwd = xstrdup(step_specs->cwd);
step_ptr->std_err = xstrdup(step_specs->std_err);
step_ptr->std_in = xstrdup(step_specs->std_in);
step_ptr->std_out = xstrdup(step_specs->std_out);
step_ptr->submit_line = xstrdup(step_specs->submit_line);
step_ptr->tres_bind = xstrdup(step_specs->tres_bind);
step_ptr->tres_freq = xstrdup(step_specs->tres_freq);
step_ptr->tres_per_step = xstrdup(step_specs->tres_per_step);
step_ptr->tres_per_node = xstrdup(step_specs->tres_per_node);
step_ptr->tres_per_socket = xstrdup(step_specs->tres_per_socket);
step_ptr->tres_per_task = xstrdup(step_specs->tres_per_task);
step_ptr->threads_per_core = step_specs->threads_per_core;
/*
* step's name and network default to job's values if not
* specified in the step specification
*/
if (step_specs->name && step_specs->name[0])
step_ptr->name = xstrdup(step_specs->name);
else
step_ptr->name = xstrdup(job_ptr->name);
if (step_specs->network && step_specs->network[0])
step_ptr->network = xstrdup(step_specs->network);
else
step_ptr->network = xstrdup(job_ptr->network);
/*
* the step time_limit is recorded as submitted (INFINITE
* or partition->max_time by default), but the allocation
* time limits may cut it short
*/
if (step_specs->time_limit == NO_VAL || step_specs->time_limit == 0 ||
step_specs->time_limit == INFINITE) {
step_ptr->time_limit = INFINITE;
} else {
/* enforce partition limits if necessary */
if ((step_specs->time_limit > job_ptr->part_ptr->max_time) &&
slurm_conf.enforce_part_limits) {
info("%s: %pS time greater than partition's (%u > %u)",
__func__, step_ptr, step_specs->time_limit,
job_ptr->part_ptr->max_time);
delete_step_record(job_ptr, step_ptr);
xfree(step_node_list);
return ESLURM_INVALID_TIME_LIMIT;
}
step_ptr->time_limit = step_specs->time_limit;
}
step_ptr->step_layout =
step_layout_create(step_ptr,
step_node_list, node_count,
step_specs->num_tasks,
(uint16_t)cpus_per_task,
step_specs->task_dist,
step_specs->plane_size);
xfree(step_node_list);
if (!step_ptr->step_layout) {
delete_step_record(job_ptr, step_ptr);
if (step_specs->pn_min_memory)
return ESLURM_INVALID_TASK_MEMORY;
return SLURM_ERROR;
}
if (slurm_conf.mpi_params && xstrstr(slurm_conf.mpi_params, "ports="))
resv_ports_present = true;
if ((step_specs->resv_port_cnt == NO_VAL16) &&
(resv_ports_present || job_ptr->resv_ports)) {
step_specs->resv_port_cnt = 0;
/*
* reserved port count set to maximum task count on
* any node plus one
*/
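/*
* Example: if the largest per-node task count in the step layout is 4,
* resv_port_cnt becomes 5.
*/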
for (i = 0; i < step_ptr->step_layout->node_cnt; i++) {
step_specs->resv_port_cnt =
MAX(step_specs->resv_port_cnt,
step_ptr->step_layout->tasks[i]);
}
step_specs->resv_port_cnt++;
}
if ((step_specs->resv_port_cnt != NO_VAL16) &&
(step_specs->resv_port_cnt != 0)) {
step_ptr->resv_port_cnt = step_specs->resv_port_cnt;
i = resv_port_step_alloc(step_ptr);
if (i != SLURM_SUCCESS) {
delete_step_record(job_ptr, step_ptr);
return i;
}
}
if ((ret_code = _switch_setup(step_ptr))) {
delete_step_record(job_ptr, step_ptr);
return ret_code;
}
if ((ret_code = _step_alloc_lps(step_ptr, err_msg))) {
delete_step_record(job_ptr, step_ptr);
return ret_code;
}
xassert(bit_set_count(step_ptr->core_bitmap_job) != 0);
*new_step_record = step_ptr;
step_set_alloc_tres(step_ptr, node_count, false, true);
jobacct_storage_g_step_start(stepmgr_ops->acct_db_conn, step_ptr);
return SLURM_SUCCESS;
}
extern slurm_step_layout_t *step_layout_create(step_record_t *step_ptr,
char *step_node_list,
uint32_t node_count,
uint32_t num_tasks,
uint16_t cpus_per_task,
uint32_t task_dist,
uint16_t plane_size)
{
slurm_step_layout_t *step_layout = NULL;
uint16_t cpus_per_node[node_count];
uint16_t cpus_per_task_array[node_count];
job_record_t *job_ptr = step_ptr->job_ptr;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
slurm_step_layout_req_t step_layout_req;
uint64_t gres_cpus;
int cpu_inx = -1, cpus_task_inx = -1;
int usable_cpus, usable_mem;
int set_nodes = 0/* , set_tasks = 0 */;
int pos = -1;
uint32_t cpu_count_reps[node_count];
uint32_t cpus_task_reps[node_count];
uint32_t cpus_task = 0;
uint16_t ntasks_per_core = step_ptr->ntasks_per_core;
uint16_t ntasks_per_socket = 0;
node_record_t *node_ptr;
gres_stepmgr_step_test_args_t gres_test_args = {
.cpus_per_task = step_ptr->cpus_per_task,
.first_step_node = true,
.job_gres_list = job_ptr->gres_list_alloc,
.job_id = job_ptr->job_id,
.job_resrcs_ptr = job_resrcs_ptr,
.node_offset = -1,
.step_gres_list = step_ptr->gres_list_req,
.step_id = step_ptr->step_id.step_id,
.test_mem = false,
};
xassert(job_resrcs_ptr);
xassert(job_resrcs_ptr->cpus);
xassert(job_resrcs_ptr->cpus_used);
if (step_ptr->pn_min_memory && _is_mem_resv() &&
((job_resrcs_ptr->memory_allocated == NULL) ||
(job_resrcs_ptr->memory_used == NULL))) {
error("%s: lack memory allocation details to enforce memory limits for %pJ",
__func__, job_ptr);
step_ptr->pn_min_memory = 0;
} else if (step_ptr->pn_min_memory == MEM_PER_CPU)
step_ptr->pn_min_memory = 0; /* clear MEM_PER_CPU flag */
/* build cpus-per-node arrays for the subset of nodes used by step */
gres_test_args.max_rem_nodes =
bit_set_count(step_ptr->step_node_bitmap);
for (int i = 0; (node_ptr = next_node_bitmap(job_ptr->node_bitmap, &i));
i++) {
uint16_t cpus, cpus_used;
int err_code = SLURM_SUCCESS;
node_record_t *node_ptr;
gres_test_args.test_mem = false;
gres_test_args.err_code = &err_code;
gres_test_args.node_offset++;
if (!bit_test(step_ptr->step_node_bitmap, i))
continue;
node_ptr = node_record_table_ptr[i];
if (step_ptr->start_protocol_ver > node_ptr->protocol_version)
step_ptr->start_protocol_ver =
node_ptr->protocol_version;
/* find out the position in the job */
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
return NULL;
pos = bit_set_count_range(job_resrcs_ptr->node_bitmap, 0, i);
if (pos >= job_resrcs_ptr->nhosts)
fatal("%s: node index bad", __func__);
cpus = job_resrcs_ptr->cpus[pos];
cpus_used = job_resrcs_ptr->cpus_used[pos];
/*
* Here we are trying to figure out the number
* of cpus available if we only want to run 1
* thread per core.
*/
if (_use_one_thread_per_core(step_ptr)) {
uint16_t threads;
threads = node_ptr->config_ptr->threads;
cpus /= threads;
cpus_used /= threads;
cpus_per_task_array[0] = cpus_per_task;
cpus_task_reps[0] = node_count;
} else {
/*
* Here we are trying to figure out how many
* CPUs each task really needs. This really
* only becomes an issue if the job requested
* ntasks_per_core|socket=1. We just increase
* the number of cpus_per_task to the thread
* count. Since the system could be
* heterogeneous, we needed to make this an
* array.
*/
uint16_t threads_per_core;
multi_core_data_t *mc_ptr = NULL;
if (job_ptr->details)
mc_ptr = job_ptr->details->mc_ptr;
if (step_ptr->threads_per_core != NO_VAL16)
threads_per_core = step_ptr->threads_per_core;
else if (mc_ptr &&
(mc_ptr->threads_per_core != NO_VAL16))
threads_per_core = mc_ptr->threads_per_core;
else
threads_per_core =
node_ptr->config_ptr->threads;
if (ntasks_per_socket == 1) {
uint16_t threads_per_socket;
threads_per_socket =
node_ptr->config_ptr->cores;
threads_per_socket *= threads_per_core;
if (cpus_per_task < threads_per_socket)
cpus_task = threads_per_socket;
} else if ((ntasks_per_core == 1) &&
(cpus_per_task < threads_per_core))
cpus_task = threads_per_core;
else
cpus_task = cpus_per_task;
if ((cpus_task_inx == -1) ||
(cpus_per_task_array[cpus_task_inx] != cpus_task)) {
cpus_task_inx++;
cpus_per_task_array[cpus_task_inx] = cpus_task;
cpus_task_reps[cpus_task_inx] = 1;
} else
cpus_task_reps[cpus_task_inx]++;
}
if (step_ptr->flags & SSF_OVERLAP_FORCE)
usable_cpus = cpus;
else
usable_cpus = cpus - cpus_used;
if (usable_cpus <= 0)
continue;
if ((step_ptr->pn_min_memory & MEM_PER_CPU) && _is_mem_resv()) {
uint64_t mem_use = step_ptr->pn_min_memory;
mem_use &= (~MEM_PER_CPU);
usable_mem = job_resrcs_ptr->memory_allocated[pos] -
job_resrcs_ptr->memory_used[pos];
usable_mem /= mem_use;
usable_cpus = MIN(usable_cpus, usable_mem);
} else if ((!step_ptr->pn_min_memory) && _is_mem_resv()) {
gres_test_args.test_mem = true;
}
if (step_ptr->flags & SSF_OVERLAP_FORCE)
gres_test_args.ignore_alloc = true;
else
gres_test_args.ignore_alloc = false;
gres_cpus = gres_stepmgr_step_test(&gres_test_args);
if (usable_cpus > gres_cpus)
usable_cpus = gres_cpus;
if (usable_cpus <= 0) {
error("%s: no usable CPUs", __func__);
return NULL;
}
debug3("step_layout cpus = %d pos = %d", usable_cpus, pos);
if ((cpu_inx == -1) ||
(cpus_per_node[cpu_inx] != usable_cpus)) {
cpu_inx++;
cpus_per_node[cpu_inx] = usable_cpus;
cpu_count_reps[cpu_inx] = 1;
} else
cpu_count_reps[cpu_inx]++;
set_nodes++;
gres_test_args.first_step_node = false;
gres_test_args.max_rem_nodes--;
#if 0
/*
* FIXME: on a heterogeneous system running the
* select/linear plugin we could get a node that doesn't
* have as many CPUs as we decided we needed for each
* task. This would result in not getting a task for
* the node we selected. This is usually in error. This
* only happens when the person doesn't specify how many
* cpus_per_task they want, and we have to come up with
* a number, in this case it is wrong.
*/
if (cpus_per_task > 0) {
set_tasks +=
(uint16_t)usable_cpus / cpus_per_task;
} else {
/*
* Since cpus_per_task is 0, we just add the
* count of CPUs available for this job
*/
set_tasks += usable_cpus;
}
info("usable_cpus is %d and set_tasks %d %d",
usable_cpus, set_tasks, cpus_per_task);
#endif
if (set_nodes == node_count)
break;
}
/* if (set_tasks < num_tasks) { */
/* error("Resources only available for %u of %u tasks", */
/* set_tasks, num_tasks); */
/* return NULL; */
/* } */
/* layout the tasks on the nodes */
memset(&step_layout_req, 0, sizeof(slurm_step_layout_req_t));
step_layout_req.node_list = step_node_list;
step_layout_req.cpus_per_node = cpus_per_node;
step_layout_req.cpu_count_reps = cpu_count_reps;
step_layout_req.cpus_per_task = cpus_per_task_array;
step_layout_req.cpus_task_reps = cpus_task_reps;
step_layout_req.num_hosts = node_count;
step_layout_req.num_tasks = num_tasks;
step_layout_req.task_dist = task_dist;
step_layout_req.plane_size = plane_size;
if ((step_layout = slurm_step_layout_create(&step_layout_req))) {
step_layout->start_protocol_ver = step_ptr->start_protocol_ver;
if (job_ptr->node_addrs)
step_layout->alias_addrs = build_alias_addrs(job_ptr);
}
return step_layout;
}
typedef struct {
list_t *dealloc_steps;
node_record_t *node_ptr;
bool node_fail;
} kill_step_on_node_args_t;
static int _kill_step_on_node(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
kill_step_on_node_args_t *args = (kill_step_on_node_args_t *) arg;
int step_node_inx = 0;
int bit_position = args->node_ptr->index;
int rem = 0;
uint32_t step_rc = 0;
step_complete_msg_t req;
if (step_ptr->state != JOB_RUNNING)
return 0;
if (!bit_test(step_ptr->step_node_bitmap, bit_position))
return 0;
/* Remove step allocation from the job's allocation */
step_node_inx = bit_set_count_range(step_ptr->step_node_bitmap, 0,
bit_position);
memset(&req, 0, sizeof(step_complete_msg_t));
memcpy(&req.step_id, &step_ptr->step_id, sizeof(req.step_id));
req.range_first = step_node_inx;
req.range_last = step_node_inx;
req.step_rc = 9;
req.jobacct = NULL; /* No accounting */
(void) _step_partial_comp(step_ptr, &req, false, &rem, &step_rc);
/*
* Do not kill the extern step on all nodes, only on the nodes that
* failed. Otherwise things that rely on the extern step such as x11
* or job_container/tmpfs won't work on the remaining nodes in the
* allocation.
*/
if (args->node_fail && !(step_ptr->flags & SSF_NO_KILL) &&
(step_ptr->step_id.step_id != SLURM_EXTERN_CONT)) {
info("Killing %pS due to failed node %s",
step_ptr, args->node_ptr->name);
signal_step_tasks(step_ptr, SIGKILL, REQUEST_TERMINATE_TASKS);
} else {
info("Killing %pS on failed node %s",
step_ptr, args->node_ptr->name);
signal_step_tasks_on_node(args->node_ptr->name, step_ptr,
SIGKILL, REQUEST_TERMINATE_TASKS);
}
if (!rem) {
if (!args->dealloc_steps)
args->dealloc_steps = list_create(NULL);
list_append(args->dealloc_steps, step_ptr);
}
return 0;
}
/*
* kill_step_on_node - determine if the specified job has any job steps
* allocated to the specified node and kill them unless no_kill flag
* is set on the step
* IN job_ptr - pointer to an active job record
* IN node_ptr - pointer to a node record
* IN node_fail - true if the removed node has failed
*/
extern void kill_step_on_node(job_record_t *job_ptr, node_record_t *node_ptr,
bool node_fail)
{
kill_step_on_node_args_t args = {
.dealloc_steps = NULL,
.node_ptr = node_ptr,
.node_fail = node_fail,
};
if (!job_ptr || !node_ptr)
return;
list_for_each(job_ptr->step_list, _kill_step_on_node, &args);
if (args.dealloc_steps) {
/*
* Because _finish_step_comp() may free the step_ptr, call
* list_delete_all() to delete the list-node when the step_ptr
* is free'd. It doesn't actually matter because we are
* deleting the list immediately afterward, but it is good
* practice to not leave invalid pointer references.
*/
list_delete_all(args.dealloc_steps, _finish_step_comp, NULL);
FREE_NULL_LIST(args.dealloc_steps);
}
}
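/*
* Usage sketch (illustrative only, not called from here): a caller that has
* just detected a node failure for an active job would typically invoke this
* as follows, assuming job_ptr and node_ptr are valid records and the
* appropriate job/step locks are already held:
*
*	kill_step_on_node(job_ptr, node_ptr, true);
*
* With node_fail = false the SIGKILL is limited to the failed node only;
* with true the affected steps are killed on all of their nodes, except the
* extern and SSF_NO_KILL steps, which are only signaled on the failed node
* as described in _kill_step_on_node() above.
*/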
/*
* step_partial_comp - Note the completion of a job step on at least
* some of its nodes
* IN req - step_completion_msg RPC from slurmstepd
* IN uid - UID issuing the request
* IN finish - if true and there is no error and rem reaches 0, finish the step
* OUT rem - count of nodes for which responses are still pending
* OUT max_rc - highest return code for any step thus far
* RET 0 on success, otherwise ESLURM error code
*/
extern int step_partial_comp(step_complete_msg_t *req, uid_t uid, bool finish,
int *rem, uint32_t *max_rc)
{
job_record_t *job_ptr;
step_record_t *step_ptr;
xassert(rem);
/* find the job, step, and validate input */
job_ptr = stepmgr_ops->find_job_record(req->step_id.job_id);
if (job_ptr == NULL) {
info("%s: JobId=%u invalid", __func__, req->step_id.job_id);
return ESLURM_INVALID_JOB_ID;
}
/*
* If we are requeuing the job, the completing flag will be set
* but the base state will be PENDING, so don't use IS_JOB_PENDING,
* which ignores the completing flag.
*/
if (job_ptr->job_state == JOB_PENDING) {
info("%s: %pJ pending", __func__, job_ptr);
return ESLURM_JOB_PENDING;
}
if ((!validate_slurm_user(uid)) && (uid != job_ptr->user_id)) {
/* Normally from slurmstepd, from srun on some failures */
error("Security violation: REQUEST_STEP_COMPLETE RPC for %pJ from uid=%u",
job_ptr, (unsigned int) uid);
return ESLURM_USER_ID_MISSING;
}
step_ptr = find_step_record(job_ptr, &req->step_id);
if (step_ptr == NULL) {
info("step_partial_comp: %pJ StepID=%u invalid; this step may have already completed",
job_ptr, req->step_id.step_id);
return ESLURM_INVALID_JOB_ID;
}
if (req->range_last < req->range_first) {
error("%s: %pS range=%u-%u",
__func__, step_ptr, req->range_first, req->range_last);
return EINVAL;
}
return _step_partial_comp(step_ptr, req, finish, rem, max_rc);
}
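/*
* Illustrative sketch (not part of the original code): an RPC handler for
* REQUEST_STEP_COMPLETE would typically unpack a step_complete_msg_t and
* forward it here, assuming msg->data and msg->auth_uid are already set:
*
*	step_complete_msg_t *req = msg->data;
*	int rem = 0;
*	uint32_t max_rc = 0;
*	int rc = step_partial_comp(req, msg->auth_uid, true, &rem, &max_rc);
*
* On success, rem holds the number of nodes still expected to report and
* max_rc the highest exit code seen for the step so far.
*/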
static int _step_partial_comp(step_record_t *step_ptr,
step_complete_msg_t *req, bool finish,
int *rem, uint32_t *max_rc)
{
int nodes, rem_nodes;
int range_bits, set_bits;
if (step_ptr->step_id.step_id == SLURM_BATCH_SCRIPT) {
error("%s: batch step received for %pJ. This should never happen.",
__func__, step_ptr->job_ptr);
return ESLURM_INVALID_JOB_ID;
}
/*
* We have been adding task average frequencies into
* jobacct->act_cpufreq, so divide by the total number of
* tasks/cpus to get the step average frequency.
*/
if (step_ptr->cpu_count && step_ptr->jobacct)
step_ptr->jobacct->act_cpufreq /= step_ptr->cpu_count;
if (!step_ptr->exit_node_bitmap) {
/* initialize the node bitmap for exited nodes */
nodes = bit_set_count(step_ptr->step_node_bitmap);
step_ptr->exit_node_bitmap = bit_alloc(nodes);
step_ptr->exit_code = req->step_rc;
} else {
nodes = bit_size(step_ptr->exit_node_bitmap);
if ((req->step_rc == SIG_OOM) ||
(req->step_rc > step_ptr->exit_code))
step_ptr->exit_code = req->step_rc;
}
if ((req->range_first >= nodes) || (req->range_last >= nodes) ||
(req->range_first > req->range_last)) {
/* range is zero origin */
error("%s: %pS range=%u-%u nodes=%d",
__func__, step_ptr, req->range_first, req->range_last,
nodes);
return EINVAL;
}
if ((step_ptr->flags & SSF_NO_SIG_FAIL) && WIFSIGNALED(req->step_rc)) {
step_ptr->exit_code = 0;
}
range_bits = req->range_last + 1 - req->range_first;
set_bits = bit_set_count_range(step_ptr->exit_node_bitmap,
req->range_first,
req->range_last + 1);
/* Check if any stepd of the range was already received */
if (set_bits) {
/* If all are already received skip jobacctinfo_aggregate */
if (set_bits == range_bits) {
debug("Step complete from %d to %d was already processed. Probably a RPC was resent from a child.",
req->range_first, req->range_last);
goto no_aggregate;
}
/*
* If partially received, we cannot recover the right gathered
* information. If we don't gather the new one we'll miss some
* information, and if we gather it some of the info will be
* duplicated. We log that error and choose to partially
* duplicate because it's probably the smaller error.
*/
error("Step complete from %d to %d was already processed (%d of %d). Probably an RPC was resent from a child and gathered information is partially duplicated.",
req->range_first, req->range_last,
set_bits, range_bits);
}
bit_nset(step_ptr->exit_node_bitmap,
req->range_first, req->range_last);
jobacctinfo_aggregate(step_ptr->jobacct, req->jobacct);
no_aggregate:
rem_nodes = bit_clear_count(step_ptr->exit_node_bitmap);
*rem = rem_nodes;
if (rem_nodes == 0) {
/* release all switch windows */
if (step_ptr->switch_step) {
debug2("full switch release for %pS, nodes %s",
step_ptr, step_ptr->step_layout->node_list);
switch_g_job_step_complete(
step_ptr->switch_step,
step_ptr->step_layout->node_list);
switch_g_free_stepinfo(step_ptr->switch_step);
step_ptr->switch_step = NULL;
}
}
if (max_rc)
*max_rc = step_ptr->exit_code;
if (req->step_rc == ESLURMD_EXECVE_FAILED)
step_ptr->state = JOB_NODE_FAIL;
/* The step has finished, finish it completely */
if (!*rem && finish) {
(void) _finish_step_comp(step_ptr, NULL);
}
return SLURM_SUCCESS;
}
/*
* step_set_alloc_tres - set the tres up when allocating the step.
* Only set when job is running.
*/
extern void step_set_alloc_tres(step_record_t *step_ptr, uint32_t node_count,
bool assoc_mgr_locked, bool make_formatted)
{
uint64_t cpu_count = 1, mem_count = 0;
char *tmp_tres_str = NULL;
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
job_record_t *job_ptr = step_ptr->job_ptr;
xassert(step_ptr);
xfree(step_ptr->tres_alloc_str);
xfree(step_ptr->tres_fmt_alloc_str);
if (((step_ptr->step_id.step_id == SLURM_EXTERN_CONT) ||
(step_ptr->flags & SSF_EXT_LAUNCHER)) &&
job_ptr->tres_alloc_str) {
/* get the tres from the whole job */
step_ptr->tres_alloc_str =
xstrdup(job_ptr->tres_alloc_str);
if (make_formatted)
step_ptr->tres_fmt_alloc_str =
xstrdup(job_ptr->tres_fmt_alloc_str);
return;
}
if (!assoc_mgr_locked)
assoc_mgr_lock(&locks);
if (((step_ptr->step_id.step_id == SLURM_BATCH_SCRIPT) ||
(step_ptr->step_id.step_id == SLURM_INTERACTIVE_STEP)) &&
job_ptr->job_resrcs) {
int batch_inx = 0;
/*
* Figure out the index for the batch_host in relation to the
* job specific job_resrcs structure.
*/
if (job_ptr->batch_host) {
batch_inx = job_get_node_inx(
job_ptr->batch_host, job_ptr->node_bitmap);
if (batch_inx == -1) {
error("%s: Invalid batch host %s for %pJ; this should never happen",
__func__, job_ptr->batch_host, job_ptr);
batch_inx = 0;
}
}
/* get the cpus and memory on the first node */
if (job_ptr->job_resrcs->cpus)
cpu_count = job_ptr->job_resrcs->cpus[batch_inx];
if (job_ptr->job_resrcs->memory_allocated)
mem_count = job_ptr->job_resrcs->
memory_allocated[batch_inx];
tmp_tres_str = gres_stepmgr_gres_on_node_as_tres(
job_ptr->gres_list_alloc, 0, true);
} else {
if (!step_ptr->step_layout || !step_ptr->step_layout->task_cnt)
cpu_count = (uint64_t)job_ptr->total_cpus;
else
cpu_count = (uint64_t)step_ptr->cpu_count;
for (int i = 0; i < bit_set_count(step_ptr->step_node_bitmap);
i++)
mem_count += step_ptr->memory_allocated[i];
tmp_tres_str = gres_stepmgr_gres_2_tres_str(
step_ptr->gres_list_alloc, true);
}
xstrfmtcat(step_ptr->tres_alloc_str,
"%s%u=%"PRIu64",%u=%"PRIu64",%u=%u",
step_ptr->tres_alloc_str ? "," : "",
TRES_CPU, cpu_count,
TRES_MEM, mem_count,
TRES_NODE, node_count);
if (tmp_tres_str) {
xstrfmtcat(step_ptr->tres_alloc_str, "%s%s",
step_ptr->tres_alloc_str ? "," : "",
tmp_tres_str);
xfree(tmp_tres_str);
}
if (make_formatted)
step_ptr->tres_fmt_alloc_str =
slurmdb_make_tres_string_from_simple(
step_ptr->tres_alloc_str, assoc_mgr_tres_list,
NO_VAL, CONVERT_NUM_UNIT_EXACT, 0, NULL);
if (!assoc_mgr_locked)
assoc_mgr_unlock(&locks);
}
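/*
* For illustration (assumed TRES ids, not guaranteed): with TRES_CPU = 1,
* TRES_MEM = 2 and TRES_NODE = 4, a two-node step with 8 CPUs and 4096 MB
* of allocated memory would produce a simple string such as
*
*	"1=8,2=4096,4=2"
*
* plus any GRES TRES appended from gres_stepmgr_gres_2_tres_str(). The
* formatted variant built by slurmdb_make_tres_string_from_simple() renders
* this in the human readable "cpu=8,mem=4G,node=2" style.
*/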
static int _suspend_job_step(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
job_record_t *job_ptr = step_ptr->job_ptr;
time_t *now = (time_t *) arg;
if (step_ptr->state != JOB_RUNNING)
return 0;
if ((job_ptr->suspend_time) &&
(job_ptr->suspend_time > step_ptr->start_time)) {
step_ptr->pre_sus_time +=
difftime(*now, job_ptr->suspend_time);
} else {
step_ptr->pre_sus_time +=
difftime(*now, step_ptr->start_time);
}
return 0;
}
/* Update time stamps for job step suspend */
extern void suspend_job_step(job_record_t *job_ptr)
{
time_t now = time(NULL);
list_for_each(job_ptr->step_list, _suspend_job_step, &now);
}
static int _resume_job_step(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
job_record_t *job_ptr = (job_record_t *) step_ptr->job_ptr;
time_t *now = (time_t *) arg;
if (step_ptr->state != JOB_RUNNING)
return 0;
if ((job_ptr->suspend_time) &&
(job_ptr->suspend_time < step_ptr->start_time)) {
step_ptr->tot_sus_time +=
difftime(*now, step_ptr->start_time);
} else {
step_ptr->tot_sus_time +=
difftime(*now, job_ptr->suspend_time);
}
return 0;
}
/* Update time stamps for job step resume */
extern void resume_job_step(job_record_t *job_ptr)
{
time_t now = time(NULL);
list_for_each(job_ptr->step_list, _resume_job_step, &now);
}
static void _signal_step_timelimit(step_record_t *step_ptr, time_t now)
{
node_record_t *node_ptr;
static bool cloud_dns = false;
static time_t last_update = 0;
job_record_t *job_ptr = step_ptr->job_ptr;
kill_job_msg_t *kill_step;
agent_arg_t *agent_args = NULL;
xassert(step_ptr);
step_ptr->state = JOB_TIMEOUT;
agent_args = xmalloc(sizeof(agent_arg_t));
agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
agent_args->retry = 1;
agent_args->hostlist = hostlist_create(NULL);
kill_step = xmalloc(sizeof(kill_job_msg_t));
memcpy(&kill_step->step_id, &step_ptr->step_id,
sizeof(kill_step->step_id));
kill_step->het_job_id = job_ptr->het_job_id;
kill_step->job_state = job_ptr->job_state;
kill_step->job_uid = job_ptr->user_id;
kill_step->job_gid = job_ptr->group_id;
kill_step->nodes = xstrdup(job_ptr->nodes);
kill_step->time = now;
kill_step->start_time = job_ptr->start_time;
kill_step->details = xstrdup(job_ptr->state_desc);
if (last_update != slurm_conf.last_update) {
if (xstrcasestr(slurm_conf.slurmctld_params, "cloud_dns"))
cloud_dns = true;
else
cloud_dns = false;
last_update = slurm_conf.last_update;
}
if (step_ptr->step_node_bitmap) {
agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
for (int i = 0;
(node_ptr = next_node_bitmap(step_ptr->step_node_bitmap,
&i));
i++) {
if (agent_args->protocol_version >
node_ptr->protocol_version) {
agent_args->protocol_version =
node_ptr->protocol_version;
}
hostlist_push_host(agent_args->hostlist,
node_ptr->name);
agent_args->node_count++;
if (PACK_FANOUT_ADDRS(node_ptr))
agent_args->msg_flags |= SLURM_PACK_ADDRS;
}
} else {
/* Could happen on node failure */
info("%s: %pJ Step %u has NULL node_bitmap", __func__,
job_ptr, step_ptr->step_id.step_id);
}
if (agent_args->node_count == 0) {
hostlist_destroy(agent_args->hostlist);
xfree(agent_args);
slurm_free_kill_job_msg(kill_step);
return;
}
agent_args->msg_args = kill_step;
set_agent_arg_r_uid(agent_args, SLURM_AUTH_UID_ANY);
stepmgr_ops->agent_queue_request(agent_args);
}
extern int check_job_step_time_limit(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
time_t *now = (time_t *) arg;
uint32_t job_run_mins = 0;
if (step_ptr->state != JOB_RUNNING)
return 0;
if (step_ptr->time_limit == INFINITE || step_ptr->time_limit == NO_VAL)
return 0;
job_run_mins = (uint32_t) (((*now - step_ptr->start_time) -
step_ptr->tot_sus_time) / 60);
if (job_run_mins >= step_ptr->time_limit) {
/* this step has timed out */
info("%s: %pS has timed out (%u)",
__func__, step_ptr, step_ptr->time_limit);
_signal_step_timelimit(step_ptr, *now);
}
return 0;
}
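/*
* Worked example (illustrative numbers): with *now - start_time = 3700
* seconds and tot_sus_time = 100 seconds, job_run_mins = (3700 - 100) / 60
* = 60, so a step with time_limit = 60 minutes is signaled here. The
* function is written as a list_for_each() callback, so a periodic caller
* holding the proper locks might drive it as:
*
*	time_t now = time(NULL);
*	list_for_each(job_ptr->step_list, check_job_step_time_limit, &now);
*/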
/* Return true if memory is a reserved resource, false otherwise */
static bool _is_mem_resv(void)
{
static bool mem_resv_value = false;
static bool mem_resv_tested = false;
if (!mem_resv_tested) {
mem_resv_tested = true;
if (slurm_conf.select_type_param & SELECT_MEMORY)
mem_resv_value = true;
}
return mem_resv_value;
}
typedef struct {
int mod_cnt;
uint32_t time_limit;
} update_step_args_t;
static int _update_step(void *x, void *arg)
{
step_record_t *step_ptr = (step_record_t *) x;
update_step_args_t *args = (update_step_args_t *) arg;
if (step_ptr->state != JOB_RUNNING)
return 0;
step_ptr->time_limit = args->time_limit;
args->mod_cnt++;
jobacct_storage_g_step_start(stepmgr_ops->acct_db_conn, step_ptr);
info("Updating %pS time limit to %u", step_ptr, args->time_limit);
return 0;
}
/*
* Process job step update request from specified user,
* RET - 0 or error code
*/
extern int update_step(step_update_request_msg_t *req, uid_t uid)
{
job_record_t *job_ptr;
step_record_t *step_ptr = NULL;
update_step_args_t args = { .mod_cnt = 0 };
slurm_step_id_t step_id = { 0 };
job_ptr = stepmgr_ops->find_job_record(req->job_id);
if (job_ptr == NULL) {
error("%s: invalid JobId=%u", __func__, req->job_id);
return ESLURM_INVALID_JOB_ID;
}
step_id.job_id = job_ptr->job_id;
step_id.step_id = req->step_id;
step_id.step_het_comp = NO_VAL;
/*
* No need to cap the step time limit here, since the job time limit
* will kill any steps regardless of their own time limit.
*/
if (req->step_id == NO_VAL) {
args.time_limit = req->time_limit;
list_for_each(job_ptr->step_list, _update_step, &args);
} else {
step_ptr = find_step_record(job_ptr, &step_id);
if (!step_ptr && (job_ptr->bit_flags & STEPMGR_ENABLED))
goto stepmgr;
if (!step_ptr)
return ESLURM_INVALID_JOB_ID;
if (req->time_limit) {
step_ptr->time_limit = req->time_limit;
args.mod_cnt++;
jobacct_storage_g_step_start(stepmgr_ops->acct_db_conn,
step_ptr);
info("Updating %pS time limit to %u",
step_ptr, req->time_limit);
}
}
stepmgr:
if (running_in_slurmctld() && !step_ptr &&
(job_ptr->bit_flags & STEPMGR_ENABLED)) {
agent_arg_t *agent_args = NULL;
step_update_request_msg_t *agent_update_msg = NULL;
agent_update_msg = xmalloc(sizeof(*agent_update_msg));
agent_update_msg->job_id = req->job_id;
agent_update_msg->step_id = req->step_id;
agent_update_msg->time_limit = req->time_limit;
agent_args = xmalloc(sizeof(*agent_args));
agent_args->msg_type = REQUEST_UPDATE_JOB_STEP;
agent_args->retry = 1;
agent_args->hostlist = hostlist_create(job_ptr->batch_host);
agent_args->node_count = 1;
agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
agent_args->msg_args = agent_update_msg;
set_agent_arg_r_uid(agent_args, slurm_conf.slurmd_user_id);
stepmgr_ops->agent_queue_request(agent_args);
args.mod_cnt++;
}
if (args.mod_cnt)
*stepmgr_ops->last_job_update = time(NULL);
return SLURM_SUCCESS;
}
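/*
* Illustrative sketch (hypothetical values): to set a 30 minute limit on
* step 5 of job 1234, a caller would fill in a step_update_request_msg_t
* and pass the requesting uid, e.g.:
*
*	step_update_request_msg_t req = {
*		.job_id = 1234,
*		.step_id = 5,
*		.time_limit = 30,
*	};
*	int rc = update_step(&req, uid);
*
* Using step_id = NO_VAL applies the new time limit to every running step
* of the job instead of a single step.
*/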
static int _rebuild_bitmaps(void *x, void *arg)
{
int i_first, i_last, i_size;
int old_core_offset = 0, new_core_offset = 0;
bool old_node_set, new_node_set;
uint32_t step_id;
bitstr_t *orig_step_core_bitmap;
step_record_t *step_ptr = (step_record_t *) x;
bitstr_t *orig_job_node_bitmap = (bitstr_t *) arg;
job_record_t *job_ptr = step_ptr->job_ptr;
if (step_ptr->state < JOB_RUNNING)
return 0;
gres_stepmgr_step_state_rebase(step_ptr->gres_list_alloc,
orig_job_node_bitmap,
job_ptr->job_resrcs->node_bitmap);
if (!step_ptr->core_bitmap_job)
return 0;
step_id = step_ptr->step_id.step_id;
orig_step_core_bitmap = step_ptr->core_bitmap_job;
i_size = bit_size(job_ptr->job_resrcs->core_bitmap);
step_ptr->core_bitmap_job = bit_alloc(i_size);
i_first = MIN(bit_ffs(orig_job_node_bitmap),
bit_ffs(job_ptr->job_resrcs->node_bitmap));
i_last = MAX(bit_fls(orig_job_node_bitmap),
bit_fls(job_ptr->job_resrcs->node_bitmap));
for (int i = i_first; i <= i_last; i++) {
old_node_set = bit_test(orig_job_node_bitmap, i);
new_node_set = bit_test(job_ptr->job_resrcs->node_bitmap, i);
if (!old_node_set && !new_node_set)
continue;
if (old_node_set && new_node_set) {
for (int j = 0; j < node_record_table_ptr[i]->tot_cores;
j++) {
if (!bit_test(orig_step_core_bitmap,
old_core_offset + j))
continue;
bit_set(step_ptr->core_bitmap_job,
new_core_offset + j);
/*
* Only regular, non-overlapping steps should
* set bits in core_bitmap_used
*/
if ((step_id != SLURM_INTERACTIVE_STEP) &&
(step_id != SLURM_EXTERN_CONT) &&
(step_id != SLURM_BATCH_SCRIPT) &&
!(step_ptr->flags & SSF_OVERLAP_FORCE) &&
!(step_ptr->flags & SSF_EXT_LAUNCHER))
bit_set(job_ptr->job_resrcs->
core_bitmap_used,
new_core_offset + j);
}
}
if (old_node_set)
old_core_offset += node_record_table_ptr[i]->tot_cores;
if (new_node_set)
new_core_offset += node_record_table_ptr[i]->tot_cores;
}
FREE_NULL_BITMAP(orig_step_core_bitmap);
return 0;
}
/*
* Rebuild a job step's core_bitmap_job after a job has just changed size
* job_ptr IN - job that was just re-sized
* orig_job_node_bitmap IN - The job's original node bitmap
*/
extern void rebuild_step_bitmaps(job_record_t *job_ptr,
bitstr_t *orig_job_node_bitmap)
{
if (job_ptr->step_list == NULL)
return;
log_flag(STEPS, "Resizing steps of %pJ", job_ptr);
list_for_each(job_ptr->step_list, _rebuild_bitmaps,
orig_job_node_bitmap);
}
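/*
* Illustrative sketch (not from the original code): a resize path that
* shrinks job_ptr->job_resrcs would normally snapshot the node bitmap
* before the change and rebase the steps afterwards, e.g.:
*
*	bitstr_t *orig_node_bitmap = bit_copy(job_ptr->job_resrcs->node_bitmap);
*	... resize the job allocation ...
*	rebuild_step_bitmaps(job_ptr, orig_node_bitmap);
*	FREE_NULL_BITMAP(orig_node_bitmap);
*/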
extern step_record_t *build_extern_step(job_record_t *job_ptr)
{
step_record_t *step_ptr = create_step_record(job_ptr, 0);
char *node_list;
uint32_t node_cnt;
node_list = job_ptr->nodes;
node_cnt = job_ptr->node_cnt;
if (!step_ptr) {
error("%s: Can't create step_record! This should never happen",
__func__);
return NULL;
}
*stepmgr_ops->last_job_update = time(NULL);
step_ptr->step_layout = fake_slurm_step_layout_create(
node_list, NULL, NULL, node_cnt, node_cnt,
SLURM_PROTOCOL_VERSION);
step_ptr->name = xstrdup("extern");
step_ptr->state = JOB_RUNNING;
step_ptr->start_time = job_ptr->start_time;
step_ptr->step_id.job_id = job_ptr->job_id;
step_ptr->step_id.step_id = SLURM_EXTERN_CONT;
step_ptr->step_id.step_het_comp = NO_VAL;
if (job_ptr->node_bitmap)
step_ptr->step_node_bitmap =
bit_copy(job_ptr->node_bitmap);
step_ptr->time_last_active = time(NULL);
step_set_alloc_tres(step_ptr, 1, false, false);
jobacct_storage_g_step_start(stepmgr_ops->acct_db_conn, step_ptr);
return step_ptr;
}
extern step_record_t *build_batch_step(job_record_t *job_ptr_in)
{
job_record_t *job_ptr;
step_record_t *step_ptr;
char *host = NULL;
if (job_ptr_in->het_job_id) {
job_ptr = stepmgr_ops->find_job_record(job_ptr_in->het_job_id);
if (!job_ptr) {
error("%s: hetjob leader is corrupt! This should never happen",
__func__);
job_ptr = job_ptr_in;
}
} else
job_ptr = job_ptr_in;
step_ptr = create_step_record(job_ptr, 0);
if (!step_ptr) {
error("%s: Can't create step_record! This should never happen",
__func__);
return NULL;
}
*stepmgr_ops->last_job_update = time(NULL);
host = job_ptr->batch_host;
step_ptr->step_layout = fake_slurm_step_layout_create(
host, NULL, NULL, 1, 1, SLURM_PROTOCOL_VERSION);
step_ptr->name = xstrdup("batch");
step_ptr->state = JOB_RUNNING;
step_ptr->start_time = job_ptr->start_time;
step_ptr->step_id.job_id = job_ptr->job_id;
step_ptr->step_id.step_id = SLURM_BATCH_SCRIPT;
step_ptr->step_id.step_het_comp = NO_VAL;
step_ptr->container = xstrdup(job_ptr->container);
step_ptr->container_id = xstrdup(job_ptr->container_id);
if (node_name2bitmap(job_ptr->batch_host, false,
&step_ptr->step_node_bitmap, NULL)) {
error("%s: %pJ has invalid node list (%s)",
__func__, job_ptr, job_ptr->batch_host);
}
step_ptr->time_last_active = time(NULL);
step_set_alloc_tres(step_ptr, 1, false, false);
jobacct_storage_g_step_start(stepmgr_ops->acct_db_conn, step_ptr);
return step_ptr;
}
static step_record_t *_build_interactive_step(
job_record_t *job_ptr_in,
job_step_create_request_msg_t *step_specs,
uint16_t protocol_version)
{
job_record_t *job_ptr;
step_record_t *step_ptr;
char *host = NULL;
slurm_step_id_t step_id = {0};
if (job_ptr_in->het_job_id) {
job_ptr = stepmgr_ops->find_job_record(job_ptr_in->het_job_id);
if (!job_ptr) {
error("%s: hetjob leader is corrupt! This should never happen",
__func__);
job_ptr = job_ptr_in;
}
} else
job_ptr = job_ptr_in;
step_id.job_id = job_ptr->job_id;
step_id.step_id = SLURM_INTERACTIVE_STEP;
step_id.step_het_comp = NO_VAL;
step_ptr = find_step_record(job_ptr, &step_id);
if (step_ptr) {
debug("%s: interactive step for %pJ already exists",
__func__, job_ptr);
return NULL;
}
host = job_ptr->batch_host;
if (!host) {
error("%s: %pJ batch_host is NULL! This should never happen",
__func__, job_ptr);
return NULL;
}
step_ptr = create_step_record(job_ptr, protocol_version);
if (!step_ptr) {
error("%s: Can't create step_record! This should never happen",
__func__);
return NULL;
}
*stepmgr_ops->last_job_update = time(NULL);
step_ptr->step_layout = fake_slurm_step_layout_create(
host, NULL, NULL, 1, 1, protocol_version);
step_ptr->name = xstrdup("interactive");
step_ptr->state = JOB_RUNNING;
step_ptr->start_time = job_ptr->start_time;
step_ptr->step_id.job_id = job_ptr->job_id;
step_ptr->step_id.step_id = SLURM_INTERACTIVE_STEP;
step_ptr->step_id.step_het_comp = NO_VAL;
step_ptr->container = xstrdup(job_ptr->container);
step_ptr->container_id = xstrdup(job_ptr->container_id);
step_ptr->port = step_specs->port;
step_ptr->srun_pid = step_specs->srun_pid;
step_ptr->host = xstrdup(step_specs->host);
step_ptr->submit_line = xstrdup(step_specs->submit_line);
step_ptr->core_bitmap_job = bit_copy(job_ptr->job_resrcs->core_bitmap);
if (node_name2bitmap(job_ptr->batch_host, false,
&step_ptr->step_node_bitmap, NULL)) {
error("%s: %pJ has invalid node list (%s)",
__func__, job_ptr, job_ptr->batch_host);
delete_step_record(job_ptr, step_ptr);
return NULL;
}
step_ptr->time_last_active = time(NULL);
step_set_alloc_tres(step_ptr, 1, false, false);
jobacct_storage_g_step_start(stepmgr_ops->acct_db_conn, step_ptr);
return step_ptr;
}
/*
* Build a special step for mpi launchers.
*/
static int _build_ext_launcher_step(step_record_t **step_rec,
job_record_t *job_ptr,
job_step_create_request_msg_t *step_specs,
uint16_t protocol_version)
{
bitstr_t *nodeset;
uint32_t node_count;
int rc;
char *step_node_list;
step_record_t *step_ptr;
if (!step_rec)
return SLURM_ERROR;
if (job_ptr->next_step_id >= slurm_conf.max_step_cnt) {
error("%s: %pJ MaxStepCount limit reached", __func__, job_ptr);
return ESLURM_STEP_LIMIT;
}
/* Reset some fields we're going to ignore in _pick_step_nodes. */
step_specs->flags = SSF_EXT_LAUNCHER;
step_specs->cpu_count = 0;
xfree(step_specs->cpus_per_tres);
step_specs->ntasks_per_core = NO_VAL16;
step_specs->ntasks_per_tres = NO_VAL16;
step_specs->pn_min_memory = 0;
xfree(step_specs->mem_per_tres);
step_specs->threads_per_core = NO_VAL16;
xfree(step_specs->tres_bind);
xfree(step_specs->tres_per_step);
xfree(step_specs->tres_per_node);
xfree(step_specs->tres_per_socket);
xfree(step_specs->tres_per_task);
/* Select the nodes for this job */
nodeset = _pick_step_nodes(job_ptr, step_specs, NULL, 0, 0, &rc);
if (nodeset == NULL) {
return rc;
}
/* Here is where the node list is set for the step */
if (step_specs->node_list &&
((step_specs->task_dist & SLURM_DIST_STATE_BASE) ==
SLURM_DIST_ARBITRARY)) {
step_node_list = xstrdup(step_specs->node_list);
xfree(step_specs->node_list);
step_specs->node_list = bitmap2node_name(nodeset);
} else {
step_node_list = bitmap2node_name_sortable(nodeset, false);
xfree(step_specs->node_list);
step_specs->node_list = xstrdup(step_node_list);
}
log_flag(STEPS, "Picked nodes %s when accumulating from %s",
step_node_list, step_specs->node_list);
step_ptr = *step_rec = create_step_record(job_ptr, protocol_version);
if (!step_ptr) {
error("%s: Can't create step_record! This should never happen",
__func__);
return SLURM_ERROR;
}
*stepmgr_ops->last_job_update = time(NULL);
/* We want 1 task per node. */
step_ptr->step_node_bitmap = nodeset;
node_count = bit_set_count(nodeset);
step_specs->num_tasks = node_count;
/* Create the fake step layout with 1 task per node */
step_ptr->step_layout = fake_slurm_step_layout_create(
step_node_list, NULL, NULL, node_count, node_count,
SLURM_PROTOCOL_VERSION);
xfree(step_node_list);
if (!step_ptr->step_layout) {
delete_step_record(job_ptr, step_ptr);
return SLURM_ERROR;
}
/* Needed so this step is not considered in _mark_busy_nodes() */
step_ptr->flags |= SSF_EXT_LAUNCHER;
/* Set the step id */
memcpy(&step_ptr->step_id, &step_specs->step_id,
sizeof(step_ptr->step_id));
if (step_specs->array_task_id != NO_VAL)
step_ptr->step_id.job_id = job_ptr->job_id;
if (step_specs->step_id.step_id != NO_VAL) {
if (step_specs->step_id.step_het_comp == NO_VAL) {
job_ptr->next_step_id =
MAX(job_ptr->next_step_id,
step_specs->step_id.step_id);
job_ptr->next_step_id++;
}
} else if (job_ptr->het_job_id &&
(job_ptr->het_job_id != job_ptr->job_id)) {
job_record_t *het_job;
het_job = stepmgr_ops->find_job_record(job_ptr->het_job_id);
if (het_job)
step_ptr->step_id.step_id = het_job->next_step_id++;
else
step_ptr->step_id.step_id = job_ptr->next_step_id++;
job_ptr->next_step_id = MAX(job_ptr->next_step_id,
step_ptr->step_id.step_id);
} else {
step_ptr->step_id.step_id = job_ptr->next_step_id++;
}
/* The step needs to run on all the cores. */
step_ptr->core_bitmap_job = bit_copy(job_ptr->job_resrcs->core_bitmap);
step_ptr->name = xstrdup(step_specs->name);
step_ptr->state = JOB_RUNNING;
step_ptr->start_time = job_ptr->start_time;
step_ptr->time_last_active = time(NULL);
step_set_alloc_tres(step_ptr, 1, false, false);
jobacct_storage_g_step_start(stepmgr_ops->acct_db_conn, step_ptr);
if ((rc = _switch_setup(step_ptr))) {
delete_step_record(job_ptr, step_ptr);
return rc;
}
return SLURM_SUCCESS;
}
extern slurm_node_alias_addrs_t *build_alias_addrs(job_record_t *job_ptr)
{
slurm_node_alias_addrs_t *alias_addrs;
if (!job_ptr || !job_ptr->node_addrs)
return NULL;
alias_addrs = xmalloc(sizeof(slurm_node_alias_addrs_t));
alias_addrs->node_cnt = job_ptr->node_cnt;
alias_addrs->node_addrs = xcalloc(job_ptr->node_cnt,
sizeof(slurm_addr_t));
memcpy(alias_addrs->node_addrs, job_ptr->node_addrs,
(sizeof(slurm_addr_t) * job_ptr->node_cnt));
alias_addrs->node_list = xstrdup(job_ptr->nodes);
return alias_addrs;
}
extern int job_get_node_inx(char *node_name, bitstr_t *node_bitmap)
{
int node_inx = -1;
if (!node_name)
return -1;
xassert(node_bitmap);
node_inx = node_name_get_inx(node_name);
if (node_inx == -1)
return -1;
if (!bit_test(node_bitmap, node_inx))
return -1;
return bit_set_count_range(node_bitmap, 0, node_inx);
}
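/*
* Worked example (illustrative): if node_bitmap has bits 3, 5 and 9 set
* and node_name resolves to index 5, the return value is
* bit_set_count_range(node_bitmap, 0, 5) = 1, i.e. the node's zero-based
* position within the set bits of the bitmap. A node outside the bitmap
* yields -1.
*/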
static void _kill_step_on_msg_fail(step_complete_msg_t *req, slurm_msg_t *msg,
void (*lock_func)(bool lock))
{
int rc, rem;
uint32_t step_rc;
DEF_TIMERS;
/* init */
START_TIMER;
error("Step creation timed out: Deallocating %ps nodes %u-%u",
&req->step_id, req->range_first, req->range_last);
if (lock_func)
lock_func(true);
rc = step_partial_comp(req, msg->auth_uid, true, &rem, &step_rc);
if (lock_func)
lock_func(false);
END_TIMER2(__func__);
log_flag(STEPS, "%s: %ps rc:%s %s",
__func__, &req->step_id, slurm_strerror(rc), TIME_STR);
}
/* create a credential for a given job step, return error code */
static int _make_step_cred(step_record_t *step_ptr, slurm_cred_t **slurm_cred,
uint16_t protocol_version)
{
slurm_cred_arg_t cred_arg;
job_record_t *job_ptr = step_ptr->job_ptr;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
xassert(job_resrcs_ptr && job_resrcs_ptr->cpus);
setup_cred_arg(&cred_arg, job_ptr);
memcpy(&cred_arg.step_id, &step_ptr->step_id, sizeof(cred_arg.step_id));
if (job_resrcs_ptr->memory_allocated) {
slurm_array64_to_value_reps(job_resrcs_ptr->memory_allocated,
job_resrcs_ptr->nhosts,
&cred_arg.job_mem_alloc,
&cred_arg.job_mem_alloc_rep_count,
&cred_arg.job_mem_alloc_size);
}
cred_arg.step_gres_list = step_ptr->gres_list_alloc;
cred_arg.step_core_bitmap = step_ptr->core_bitmap_job;
cred_arg.step_hostlist = step_ptr->step_layout->node_list;
if (step_ptr->memory_allocated) {
slurm_array64_to_value_reps(step_ptr->memory_allocated,
step_ptr->step_layout->node_cnt,
&cred_arg.step_mem_alloc,
&cred_arg.step_mem_alloc_rep_count,
&cred_arg.step_mem_alloc_size);
}
cred_arg.switch_step = step_ptr->switch_step;
*slurm_cred = slurm_cred_create(&cred_arg, true, protocol_version);
xfree(cred_arg.job_mem_alloc);
xfree(cred_arg.job_mem_alloc_rep_count);
xfree(cred_arg.step_mem_alloc);
xfree(cred_arg.step_mem_alloc_rep_count);
if (*slurm_cred == NULL) {
error("slurm_cred_create error");
return ESLURM_INVALID_JOB_CREDENTIAL;
}
return SLURM_SUCCESS;
}
static int _send_msg(slurm_msg_t *msg, int slurmd_fd, slurm_msg_type_t type,
void *data)
{
xassert(running_in_slurmctld() || running_in_slurmstepd());
int rc;
if (running_in_slurmctld()) {
if ((rc = send_msg_response(msg, type, data))) {
errno = rc;
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
if (running_in_slurmstepd()) {
if ((stepd_proxy_send_resp_to_slurmd(slurmd_fd, msg, type,
data))) {
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
return SLURM_ERROR;
}
extern int step_create_from_msg(slurm_msg_t *msg, int slurmd_fd,
void (*lock_func)(bool lock),
void (*fail_lock_func)(bool lock))
{
char *err_msg = NULL;
int error_code = SLURM_SUCCESS;
DEF_TIMERS;
step_record_t *step_rec;
job_step_create_response_msg_t job_step_resp;
job_step_create_request_msg_t *req_step_msg = msg->data;
slurm_cred_t *slurm_cred = NULL;
job_record_t *job_ptr = NULL;
START_TIMER;
xassert(msg->auth_ids_set);
if (req_step_msg->user_id == SLURM_AUTH_NOBODY) {
req_step_msg->user_id = msg->auth_uid;
if (get_log_level() >= LOG_LEVEL_DEBUG3) {
char *host = auth_g_get_host(msg);
debug3("%s: [%s] set RPC user_id to %d",
__func__, host, msg->auth_uid);
xfree(host);
}
} else if (msg->auth_uid != req_step_msg->user_id) {
return_code_msg_t rc_msg = {
.return_code = ESLURM_USER_ID_MISSING,
};
error("Security violation, JOB_STEP_CREATE RPC from uid=%u to run as uid %u",
msg->auth_uid, req_step_msg->user_id);
_send_msg(msg, slurmd_fd, RESPONSE_SLURM_RC, &rc_msg);
return ESLURM_USER_ID_MISSING;
}
dump_step_desc(req_step_msg);
if (lock_func) {
lock_func(true);
}
if (req_step_msg->array_task_id != NO_VAL)
job_ptr = stepmgr_ops->find_job_array_rec(
req_step_msg->step_id.job_id,
req_step_msg->array_task_id);
else
job_ptr = stepmgr_ops->find_job_record(
req_step_msg->step_id.job_id);
if (job_ptr == NULL) {
error_code = ESLURM_INVALID_JOB_ID;
goto end_it;
}
if (job_ptr->bit_flags & EXTERNAL_JOB) {
error("%s: step creation disabled for external jobs", __func__);
error_code = ESLURM_NOT_SUPPORTED;
goto end_it;
}
if (running_in_slurmctld() &&
(job_ptr->bit_flags & STEPMGR_ENABLED)) {
reroute_msg_t reroute_msg = {
.stepmgr = job_ptr->batch_host,
};
_send_msg(msg, slurmd_fd, RESPONSE_SLURM_REROUTE_MSG,
&reroute_msg);
if (lock_func)
lock_func(false);
return SLURM_SUCCESS;
}
error_code = step_create(job_ptr, req_step_msg, &step_rec,
msg->protocol_version, &err_msg);
if (error_code == SLURM_SUCCESS) {
error_code = _make_step_cred(step_rec, &slurm_cred,
step_rec->start_protocol_ver);
}
END_TIMER2(__func__);
end_it:
/* return result */
if (error_code) {
if (lock_func)
lock_func(false);
if (error_code == ESLURM_PROLOG_RUNNING)
log_flag(STEPS, "%s for configuring %ps: %s",
__func__, &req_step_msg->step_id,
slurm_strerror(error_code));
else if (error_code == ESLURM_DISABLED)
log_flag(STEPS, "%s for suspended %ps: %s",
__func__, &req_step_msg->step_id,
slurm_strerror(error_code));
else
log_flag(STEPS, "%s for %ps: %s",
__func__, &req_step_msg->step_id,
slurm_strerror(error_code));
if (err_msg) {
return_code2_msg_t rc_msg = {
.return_code = error_code,
.err_msg = err_msg,
};
_send_msg(msg, slurmd_fd, RESPONSE_SLURM_RC_MSG,
&rc_msg);
} else {
return_code_msg_t rc_msg = {
.return_code = error_code,
};
_send_msg(msg, slurmd_fd, RESPONSE_SLURM_RC, &rc_msg);
}
} else {
slurm_step_layout_t *step_layout = NULL;
dynamic_plugin_data_t *switch_step = NULL;
log_flag(STEPS, "%s: %pS %s %s",
__func__, step_rec, req_step_msg->node_list, TIME_STR);
memset(&job_step_resp, 0, sizeof(job_step_resp));
job_step_resp.job_id = step_rec->step_id.job_id;
job_step_resp.job_step_id = step_rec->step_id.step_id;
job_step_resp.resv_ports = step_rec->resv_ports;
step_layout = slurm_step_layout_copy(step_rec->step_layout);
job_step_resp.step_layout = step_layout;
if (step_rec->job_ptr && step_rec->job_ptr->details &&
(step_rec->job_ptr->details->cpu_bind_type != NO_VAL16)) {
job_step_resp.def_cpu_bind_type =
step_rec->job_ptr->details->cpu_bind_type;
}
job_step_resp.cred = slurm_cred;
job_step_resp.use_protocol_ver = step_rec->start_protocol_ver;
if (step_rec->switch_step)
switch_g_duplicate_stepinfo(step_rec->switch_step,
&switch_step);
job_step_resp.switch_step = switch_step;
if (job_ptr->bit_flags & STEPMGR_ENABLED)
job_step_resp.stepmgr = job_ptr->batch_host;
if (lock_func)
lock_func(false);
if (msg->protocol_version != step_rec->start_protocol_ver) {
log_flag(NET, "%s: responding with non-matching msg 0x%x to step 0x%x protocol version",
__func__, msg->protocol_version,
step_rec->start_protocol_ver);
msg->protocol_version = step_rec->start_protocol_ver;
}
if (_send_msg(msg, slurmd_fd, RESPONSE_JOB_STEP_CREATE,
&job_step_resp)) {
step_complete_msg_t req;
memset(&req, 0, sizeof(req));
req.step_id = step_rec->step_id;
req.jobacct = step_rec->jobacct;
req.step_rc = SIGKILL;
req.range_first = 0;
req.range_last = step_layout->node_cnt - 1;
_kill_step_on_msg_fail(&req, msg, fail_lock_func);
}
slurm_cred_destroy(slurm_cred);
slurm_step_layout_destroy(step_layout);
switch_g_free_stepinfo(switch_step);
}
xfree(err_msg);
return error_code;
}
/*
* pack_job_step_info_response_msg - packs job step info
* IN step_id - specific job/step id, or NO_VAL for both to pack all steps
* IN uid - user issuing request
* IN show_flags - job step filtering options
* OUT buffer - location to store data, pointers automatically advanced
* RET - 0 or error code
* NOTE: the buffer MUST be released with free_buf()
*/
extern int pack_job_step_info_response_msg(pack_step_args_t *args)
{
int error_code = 0;
uint32_t tmp_offset;
time_t now = time(NULL);
if (args->proto_version >= SLURM_MIN_PROTOCOL_VERSION) {
/* steps_packed placeholder */
pack32(args->steps_packed, args->buffer);
pack_time(now, args->buffer);
list_for_each_ro(args->job_step_list,
args->pack_job_step_list_func, args);
if (list_count(job_list) && !args->valid_job &&
!args->steps_packed)
error_code = ESLURM_INVALID_JOB_ID;
slurm_pack_list(args->stepmgr_jobs,
slurm_pack_stepmgr_job_info, args->buffer,
args->proto_version);
/* put the real record count in the message body header */
tmp_offset = get_buf_offset(args->buffer);
set_buf_offset(args->buffer, 0);
pack32(args->steps_packed, args->buffer);
set_buf_offset(args->buffer, tmp_offset);
}
xfree(args->visible_parts);
return error_code;
}
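/*
* The count-patching pattern above, shown as a standalone sketch (the buf
* helpers are assumed to behave as in src/common/pack.h): pack a
* placeholder, pack the records while counting them, then rewind the
* offset just long enough to overwrite the placeholder with the real count.
*
*	pack32(0, buffer);                    // placeholder
*	... pack records, incrementing count ...
*	uint32_t offset = get_buf_offset(buffer);
*	set_buf_offset(buffer, 0);
*	pack32(count, buffer);                // real count
*	set_buf_offset(buffer, offset);
*/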
extern int stepmgr_get_step_layouts(job_record_t *job_ptr,
slurm_step_id_t *step_id,
slurm_step_layout_t **out_step_layout)
{
list_itr_t *itr;
step_record_t *step_ptr = NULL;
slurm_step_layout_t *step_layout = NULL;
/* We can't call find_step_record here since we may need more than 1 */
itr = list_iterator_create(job_ptr->step_list);
while ((step_ptr = list_next(itr))) {
if (!verify_step_id(&step_ptr->step_id, step_id))
continue;
/*
* Rebuild alias_addrs if needed after a restart of slurmctld
*/
if (job_ptr->node_addrs &&
!step_ptr->step_layout->alias_addrs) {
step_ptr->step_layout->alias_addrs =
build_alias_addrs(job_ptr);
}
if (step_layout)
slurm_step_layout_merge(step_layout,
step_ptr->step_layout);
else
step_layout = slurm_step_layout_copy(
step_ptr->step_layout);
/* break if we don't need to look for further het step components */
if (step_ptr->step_id.step_het_comp == NO_VAL)
break;
/*
* If we are looking for a specific het step we can break here
* as well.
*/
if (step_id->step_het_comp != NO_VAL)
break;
}
list_iterator_destroy(itr);
if (!step_layout) {
log_flag(STEPS, "%s: %pJ StepId=%u Not Found",
__func__, job_ptr, step_id->step_id);
return ESLURM_INVALID_JOB_ID;
}
/*
* The cpt_compact* fields don't go to the client because they are not
* handled in slurm_step_layout_merge(). Free them so the client does
* not get bad data.
*/
xfree(step_layout->cpt_compact_array);
xfree(step_layout->cpt_compact_reps);
step_layout->cpt_compact_cnt = 0;
*out_step_layout = step_layout;
return SLURM_SUCCESS;
}
extern int stepmgr_get_job_sbcast_cred_msg(job_record_t *job_ptr,
slurm_step_id_t *step_id,
uint16_t protocol_version,
job_sbcast_cred_msg_t **out_sbcast_cred_msg)
{
sbcast_cred_t *sbcast_cred;
sbcast_cred_arg_t sbcast_arg;
step_record_t *step_ptr = NULL;
char *node_list = NULL;
job_sbcast_cred_msg_t *job_info_resp_msg;
xassert(job_ptr);
if (step_id->step_id != NO_VAL) {
step_ptr = find_step_record(job_ptr, step_id);
if (!step_ptr) {
return ESLURM_INVALID_JOB_ID;
} else if (step_ptr->step_layout &&
(step_ptr->step_layout->node_cnt !=
job_ptr->node_cnt)) {
node_list = step_ptr->step_layout->node_list;
}
}
if (!node_list)
node_list = job_ptr->nodes;
/*
* Note - using pointers to other xmalloc'd elements owned by other
* structures to avoid copy overhead. Do not free them!
*/
memset(&sbcast_arg, 0, sizeof(sbcast_arg));
sbcast_arg.job_id = job_ptr->job_id;
sbcast_arg.het_job_id = job_ptr->het_job_id;
if (step_ptr)
sbcast_arg.step_id = step_ptr->step_id.step_id;
else
sbcast_arg.step_id = job_ptr->next_step_id;
sbcast_arg.nodes = node_list; /* avoid extra copy */
sbcast_arg.expiration = job_ptr->end_time;
if (!(sbcast_cred = create_sbcast_cred(&sbcast_arg, job_ptr->user_id,
job_ptr->group_id,
protocol_version))) {
error("%s %pJ cred create error", __func__, job_ptr);
return SLURM_ERROR;
}
job_info_resp_msg = xmalloc(sizeof(*job_info_resp_msg));
job_info_resp_msg->job_id = job_ptr->job_id;
job_info_resp_msg->node_list = xstrdup(node_list);
job_info_resp_msg->sbcast_cred = sbcast_cred;
*out_sbcast_cred_msg = job_info_resp_msg;
return SLURM_SUCCESS;
}
/* Build structure with job allocation details */
extern resource_allocation_response_msg_t *build_job_info_resp(
job_record_t *job_ptr)
{
resource_allocation_response_msg_t *job_info_resp_msg;
int i, j;
job_info_resp_msg = xmalloc(sizeof(resource_allocation_response_msg_t));
if (!job_ptr->job_resrcs) {
;
} else if (bit_equal(job_ptr->node_bitmap,
job_ptr->job_resrcs->node_bitmap)) {
job_info_resp_msg->num_cpu_groups =
job_ptr->job_resrcs->cpu_array_cnt;
job_info_resp_msg->cpu_count_reps =
xcalloc(job_ptr->job_resrcs->cpu_array_cnt,
sizeof(uint32_t));
memcpy(job_info_resp_msg->cpu_count_reps,
job_ptr->job_resrcs->cpu_array_reps,
(sizeof(uint32_t) * job_ptr->job_resrcs->cpu_array_cnt));
job_info_resp_msg->cpus_per_node =
xcalloc(job_ptr->job_resrcs->cpu_array_cnt,
sizeof(uint16_t));
memcpy(job_info_resp_msg->cpus_per_node,
job_ptr->job_resrcs->cpu_array_value,
(sizeof(uint16_t) * job_ptr->job_resrcs->cpu_array_cnt));
} else {
/* Job has changed size, rebuild CPU count info */
job_info_resp_msg->num_cpu_groups = job_ptr->node_cnt;
job_info_resp_msg->cpu_count_reps = xcalloc(job_ptr->node_cnt,
sizeof(uint32_t));
job_info_resp_msg->cpus_per_node = xcalloc(job_ptr->node_cnt,
sizeof(uint32_t));
for (i = 0, j = -1; i < job_ptr->job_resrcs->nhosts; i++) {
if (job_ptr->job_resrcs->cpus[i] == 0)
continue;
if ((j == -1) ||
(job_info_resp_msg->cpus_per_node[j] !=
job_ptr->job_resrcs->cpus[i])) {
j++;
job_info_resp_msg->cpus_per_node[j] =
job_ptr->job_resrcs->cpus[i];
job_info_resp_msg->cpu_count_reps[j] = 1;
} else {
job_info_resp_msg->cpu_count_reps[j]++;
}
}
job_info_resp_msg->num_cpu_groups = j + 1;
}
job_info_resp_msg->account = xstrdup(job_ptr->account);
job_info_resp_msg->batch_host = xstrdup(job_ptr->batch_host);
job_info_resp_msg->job_id = job_ptr->job_id;
job_info_resp_msg->node_cnt = job_ptr->node_cnt;
job_info_resp_msg->node_list = xstrdup(job_ptr->nodes);
if (job_ptr->part_ptr)
job_info_resp_msg->partition = xstrdup(job_ptr->part_ptr->name);
else
job_info_resp_msg->partition = xstrdup(job_ptr->partition);
if (job_ptr->qos_ptr) {
slurmdb_qos_rec_t *qos;
qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr;
job_info_resp_msg->qos = xstrdup(qos->name);
}
job_info_resp_msg->resv_name = xstrdup(job_ptr->resv_name);
if (job_ptr->details) {
if (job_ptr->bit_flags & JOB_MEM_SET) {
job_info_resp_msg->pn_min_memory =
job_ptr->details->pn_min_memory;
}
if (job_ptr->details->mc_ptr) {
job_info_resp_msg->ntasks_per_board =
job_ptr->details->mc_ptr->ntasks_per_board;
job_info_resp_msg->ntasks_per_core =
job_ptr->details->mc_ptr->ntasks_per_core;
job_info_resp_msg->ntasks_per_socket =
job_ptr->details->mc_ptr->ntasks_per_socket;
}
} else {
/* job_info_resp_msg->pn_min_memory = 0; */
job_info_resp_msg->ntasks_per_board = NO_VAL16;
job_info_resp_msg->ntasks_per_core = NO_VAL16;
job_info_resp_msg->ntasks_per_socket = NO_VAL16;
}
if (job_ptr->details && job_ptr->details->env_cnt) {
job_info_resp_msg->env_size = job_ptr->details->env_cnt;
job_info_resp_msg->environment =
xcalloc(job_info_resp_msg->env_size + 1,
sizeof(char *));
for (i = 0; i < job_info_resp_msg->env_size; i++) {
job_info_resp_msg->environment[i] =
xstrdup(job_ptr->details->env_sup[i]);
}
job_info_resp_msg->environment[i] = NULL;
}
job_info_resp_msg->uid = job_ptr->user_id;
job_info_resp_msg->user_name = user_from_job(job_ptr);
job_info_resp_msg->gid = job_ptr->group_id;
job_info_resp_msg->group_name = group_from_job(job_ptr);
return job_info_resp_msg;
}
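/*
* Worked example for the run-length encoded CPU counts above (illustrative
* numbers): a resized job whose per-node CPU counts are {4, 4, 2, 4} is
* compressed into cpus_per_node = {4, 2, 4} with cpu_count_reps = {2, 1, 1}
* and num_cpu_groups = 3. Consumers expand the pairs back into a per-node
* list in order.
*/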