/*****************************************************************************\
* Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P.
* Copyright (C) 2008-2009 Lawrence Livermore National Security.
* Written by Susanne M. Balle, <susanne.balle@hp.com>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <http://www.schedmd.com/slurmdocs/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "affinity.h"
#include "dist_tasks.h"
#include "src/common/bitstring.h"
#include "src/common/log.h"
#include "src/common/slurm_cred.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_resource_info.h"
#include "src/common/xmalloc.h"
#include "src/slurmd/slurmd/slurmd.h"
#ifdef HAVE_NUMA
#include <numa.h>
#endif
static char *_alloc_mask(launch_tasks_request_msg_t *req,
int *whole_node_cnt, int *whole_socket_cnt,
int *whole_core_cnt, int *whole_thread_cnt,
int *part_socket_cnt, int *part_core_cnt);
static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
uint16_t *hw_sockets, uint16_t *hw_cores,
uint16_t *hw_threads);
static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
uint16_t *sockets, uint16_t *cores);
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
uint32_t node_id, bitstr_t ***masks_p);
static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
uint32_t node_id, bitstr_t ***masks_p);
static int _task_layout_lllp_multi(launch_tasks_request_msg_t *req,
uint32_t node_id, bitstr_t ***masks_p);
static void _lllp_map_abstract_masks(const uint32_t maxtasks,
bitstr_t **masks);
static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
const uint32_t maxtasks,
bitstr_t **masks);
/* BLOCK_MAP     abstract block LLLP index to physical machine LLLP index
* BLOCK_MAP_INV physical machine LLLP index to abstract block LLLP index
*/
#define BLOCK_MAP(index) _block_map(index, conf->block_map)
#define BLOCK_MAP_INV(index) _block_map(index, conf->block_map_inv)
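/* Example (hypothetical map): with conf->block_map = {0, 2, 1, 3},
* BLOCK_MAP(1) returns physical CPU 2, and BLOCK_MAP_INV performs the
* reverse (physical to abstract) translation via conf->block_map_inv. */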
/* _block_map
*
* safely returns a mapped index using a provided block map
*
* IN - index to map
* IN - map to use
*/
static uint16_t _block_map(uint16_t index, uint16_t *map)
{
if (map == NULL) {
return index;
}
/* make sure bit falls in map */
if (index >= conf->block_map_size) {
debug3("wrapping index %u into block_map_size of %u",
index, conf->block_map_size);
index = index % conf->block_map_size;
}
index = map[index];
return(index);
}
static void _task_layout_display_masks(launch_tasks_request_msg_t *req,
const uint32_t *gtid,
const uint32_t maxtasks,
bitstr_t **masks)
{
int i;
char *str = NULL;
for(i = 0; i < maxtasks; i++) {
str = (char *)bit_fmt_hexmask(masks[i]);
debug3("_task_layout_display_masks jobid [%u:%d] %s",
req->job_id, gtid[i], str);
xfree(str);
}
}
static void _lllp_free_masks(const uint32_t maxtasks, bitstr_t **masks)
{
int i;
bitstr_t *bitmask;
for (i = 0; i < maxtasks; i++) {
bitmask = masks[i];
FREE_NULL_BITMAP(bitmask);
}
xfree(masks);
}
#ifdef HAVE_NUMA
/* _match_masks_to_ldom
*
* expand each mask to encompass the whole locality domain
* within which it currently exists
* NOTE: this assumes that the masks are already in logical
* (and not abstract) CPU order.
*/
static void _match_masks_to_ldom(const uint32_t maxtasks, bitstr_t **masks)
{
uint32_t i, b, size;
if (!masks || !masks[0])
return;
size = bit_size(masks[0]);
for(i = 0; i < maxtasks; i++) {
for (b = 0; b < size; b++) {
if (bit_test(masks[i], b)) {
/* get the NUMA node for this CPU, and then
* set all CPUs in the mask that belong to
* the same NUMA node */
int c;
uint16_t nnid = slurm_get_numa_node(b);
for (c = 0; c < size; c++) {
if (slurm_get_numa_node(c) == nnid)
bit_set(masks[i], c);
}
}
}
}
}
#endif
/*
* batch_bind - Set the batch request message so as to bind the shell to the
* proper resources
*/
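/*
 * Outline: read the job credential, copy node 0's portion of
 * job_core_bitmap into a local core map, expand each allocated core to
 * all of its hardware threads, translate the abstract mask to the
 * physical layout, and store the result in req->cpu_bind as a hex mask
 * with CPU_BIND_MASK set. (On HAVE_FRONT_END builds the batch shell is
 * instead pinned to a single CPU chosen round-robin.)
 */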
void batch_bind(batch_job_launch_msg_t *req)
{
bitstr_t *req_map, *hw_map;
slurm_cred_arg_t arg;
uint16_t sockets=0, cores=0, num_cpus;
int start, task_cnt=0;
if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
error("task/affinity: job lacks a credential");
return;
}
start = _get_local_node_info(&arg, 0, &sockets, &cores);
if (start != 0) {
error("task/affinity: missing node 0 in job credential");
slurm_cred_free_args(&arg);
return;
}
if ((sockets * cores) == 0) {
error("task/affinity: socket and core count both zero");
slurm_cred_free_args(&arg);
return;
}
num_cpus = MIN((sockets * cores),
(conf->sockets * conf->cores));
req_map = (bitstr_t *) bit_alloc(num_cpus);
hw_map = (bitstr_t *) bit_alloc(conf->block_map_size);
if (!req_map || !hw_map) {
error("task/affinity: malloc error");
FREE_NULL_BITMAP(req_map);
FREE_NULL_BITMAP(hw_map);
slurm_cred_free_args(&arg);
return;
}
#ifdef HAVE_FRONT_END
{
/* Since the front-end nodes are a shared resource, we limit each job
* to one CPU, selected by a monotonically increasing sequence number */
static int last_id = 0;
bit_set(hw_map, ((last_id++) % conf->block_map_size));
task_cnt = 1;
}
#else
{
char *str;
int t, p;
/* Transfer core_bitmap data to local req_map.
* The MOD function handles the case where fewer processors
* physically exist than are configured (slurmd is out of
* sync with the slurmctld daemon). */
for (p = 0; p < (sockets * cores); p++) {
if (bit_test(arg.job_core_bitmap, p))
bit_set(req_map, (p % num_cpus));
}
str = (char *)bit_fmt_hexmask(req_map);
debug3("task/affinity: job %u CPU mask from slurmctld: %s",
req->job_id, str);
xfree(str);
for (p = 0; p < num_cpus; p++) {
if (bit_test(req_map, p) == 0)
continue;
/* core_bitmap is expressed in cores, not threads,
* so add each hardware thread of the allocated
* core here */
for (t = 0; t < conf->threads; t++) {
uint16_t pos = p * conf->threads + t;
if (pos >= conf->block_map_size) {
info("more resources configured than exist");
p = num_cpus;
break;
}
bit_set(hw_map, pos);
task_cnt++;
}
}
}
#endif
if (task_cnt) {
req->cpu_bind_type = CPU_BIND_MASK;
if (conf->task_plugin_param & CPU_BIND_VERBOSE)
req->cpu_bind_type |= CPU_BIND_VERBOSE;
req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
info("task/affinity: job %u CPU input mask for node: %s",
req->job_id, req->cpu_bind);
/* translate abstract masks to actual hardware layout */
_lllp_map_abstract_masks(1, &hw_map);
#ifdef HAVE_NUMA
if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
_match_masks_to_ldom(1, &hw_map);
}
#endif
xfree(req->cpu_bind);
req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
info("task/affinity: job %u CPU final HW mask for node: %s",
req->job_id, req->cpu_bind);
} else {
error("task/affinity: job %u allocated no CPUs",
req->job_id);
}
FREE_NULL_BITMAP(hw_map);
FREE_NULL_BITMAP(req_map);
slurm_cred_free_args(&arg);
}
/*
* lllp_distribution
*
* Note: lllp stands for Lowest Level of Logical Processors.
*
* Automatic binding is used when:
* - no binding flags (>= CPU_BIND_NONE) are set, and
* - an auto binding level (CPU_BIND_TO_{SOCKETS,CORES,THREADS}) is selected.
* Otherwise the job step is limited to its allocated CPUs.
*
* generate the appropriate cpu_bind type and string which results in
* the specified lllp distribution.
*
* IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
* IN- global task id array
*/
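/*
 * Outline of the logic below:
 * 1. If the user supplied an explicit bind mode (none/mask/map/rank/...),
 *    honor it; a step without whole-node access is instead constrained
 *    to the mask of CPUs actually allocated to it.
 * 2. If no bind entity was requested, auto-select sockets, cores or
 *    threads by comparing the task count with the allocation.
 * 3. Dispatch to a block or cyclic lllp layout according to
 *    req->task_dist and convert the per-task masks into a cpu_bind
 *    string; on failure, fall back to the available-CPU mask.
 */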
void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
{
int rc = SLURM_SUCCESS;
bitstr_t **masks = NULL;
char buf_type[100];
int maxtasks = req->tasks_to_launch[(int)node_id];
int whole_nodes, whole_sockets, whole_cores, whole_threads;
int part_sockets, part_cores;
const uint32_t *gtid = req->global_task_ids[(int)node_id];
static uint16_t bind_entity = CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES |
CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS;
static uint16_t bind_mode = CPU_BIND_NONE | CPU_BIND_MASK |
CPU_BIND_RANK | CPU_BIND_MAP |
CPU_BIND_LDMASK | CPU_BIND_LDRANK |
CPU_BIND_LDMAP;
if (req->cpu_bind_type & bind_mode) {
/* Explicit step binding specified by user */
char *avail_mask = _alloc_mask(req,
&whole_nodes, &whole_sockets,
&whole_cores, &whole_threads,
&part_sockets, &part_cores);
if ((whole_nodes == 0) && avail_mask) {
/* Step does NOT have access to whole node,
* bind to full mask of available processors */
xfree(req->cpu_bind);
req->cpu_bind = avail_mask;
req->cpu_bind_type &= (~bind_mode);
req->cpu_bind_type |= CPU_BIND_MASK;
} else {
/* Step does have access to whole node,
* bind to whatever step wants */
xfree(avail_mask);
}
slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
info("lllp_distribution jobid [%u] manual binding: %s",
req->job_id, buf_type);
return;
}
if (!(req->cpu_bind_type & bind_entity)) {
/* No bind unit (sockets, cores) specified by user,
* pick something reasonable */
int max_tasks = req->tasks_to_launch[(int)node_id];
char *avail_mask = _alloc_mask(req,
&whole_nodes, &whole_sockets,
&whole_cores, &whole_threads,
&part_sockets, &part_cores);
debug("binding tasks:%d to "
"nodes:%d sockets:%d:%d cores:%d:%d threads:%d",
max_tasks, whole_nodes, whole_sockets, part_sockets,
whole_cores, part_cores, whole_threads);
if ((max_tasks == whole_sockets) && (part_sockets == 0)) {
req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
goto make_auto;
}
if ((max_tasks == whole_cores) && (part_cores == 0)) {
req->cpu_bind_type |= CPU_BIND_TO_CORES;
goto make_auto;
}
if (max_tasks == whole_threads) {
req->cpu_bind_type |= CPU_BIND_TO_THREADS;
goto make_auto;
}
if (avail_mask) {
xfree(req->cpu_bind);
req->cpu_bind = avail_mask;
req->cpu_bind_type |= CPU_BIND_MASK;
}
slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
info("lllp_distribution jobid [%u] auto binding off: %s",
req->job_id, buf_type);
return;
make_auto: xfree(avail_mask);
slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
info("lllp_distribution jobid [%u] implicit auto binding: "
"%s, dist %d", req->job_id, buf_type, req->task_dist);
} else {
/* Explicit bind unit (sockets, cores) specified by user */
slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
info("lllp_distribution jobid [%u] binding: %s, dist %d",
req->job_id, buf_type, req->task_dist);
}
switch (req->task_dist) {
case SLURM_DIST_BLOCK_BLOCK:
case SLURM_DIST_CYCLIC_BLOCK:
case SLURM_DIST_PLANE:
/* tasks are distributed in blocks within a plane */
rc = _task_layout_lllp_block(req, node_id, &masks);
break;
case SLURM_DIST_CYCLIC:
case SLURM_DIST_BLOCK:
case SLURM_DIST_CYCLIC_CYCLIC:
case SLURM_DIST_BLOCK_CYCLIC:
rc = _task_layout_lllp_cyclic(req, node_id, &masks);
break;
default:
if (req->cpus_per_task > 1)
rc = _task_layout_lllp_multi(req, node_id, &masks);
else
rc = _task_layout_lllp_cyclic(req, node_id, &masks);
req->task_dist = SLURM_DIST_BLOCK_CYCLIC;
break;
}
/* FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS &
* max_cores - does select/cons_res plugin allocate whole
* socket??? Maybe not. Check srun man page.
*/
if (rc == SLURM_SUCCESS) {
_task_layout_display_masks(req, gtid, maxtasks, masks);
/* translate abstract masks to actual hardware layout */
_lllp_map_abstract_masks(maxtasks, masks);
_task_layout_display_masks(req, gtid, maxtasks, masks);
#ifdef HAVE_NUMA
if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
_match_masks_to_ldom(maxtasks, masks);
_task_layout_display_masks(req, gtid, maxtasks, masks);
}
#endif
/* convert masks into cpu_bind mask string */
_lllp_generate_cpu_bind(req, maxtasks, masks);
} else {
char *avail_mask = _alloc_mask(req,
&whole_nodes, &whole_sockets,
&whole_cores, &whole_threads,
&part_sockets, &part_cores);
if (avail_mask) {
xfree(req->cpu_bind);
req->cpu_bind = avail_mask;
req->cpu_bind_type &= (~bind_mode);
req->cpu_bind_type |= CPU_BIND_MASK;
}
slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
error("lllp_distribution jobid [%u] overriding binding: %s",
req->job_id, buf_type);
error("Verify socket/core/thread counts in configuration");
}
if (masks)
_lllp_free_masks(maxtasks, masks);
}
/*
* _get_local_node_info - get job allocation details for this node
* IN: arg - job credential arguments for this job
* IN: job_node_id - index of the local node in the job allocation
* OUT: sockets - pointer to socket count variable
* OUT: cores - pointer to cores_per_socket count variable
* OUT: returns the core_bitmap index of the first core for this node
*/
static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
uint16_t *sockets, uint16_t *cores)
{
int bit_start = 0, bit_finish = 0;
int i, index = -1, cur_node_id = -1;
do {
index++;
for (i = 0; i < arg->sock_core_rep_count[index] &&
cur_node_id < job_node_id; i++) {
bit_start = bit_finish;
bit_finish += arg->sockets_per_node[index] *
arg->cores_per_socket[index];
cur_node_id++;
}
} while (cur_node_id < job_node_id);
*sockets = arg->sockets_per_node[index];
*cores = arg->cores_per_socket[index];
return bit_start;
}
/* Determine which CPUs a job step can use.
* OUT whole_<entity>_count - returns count of whole <entities> in this
* allocation for this node
* OUT part_<entity>_count - returns count of partial <entities> in this
* allocation for this node
* RET - a string representation of the available mask or NULL on error
* NOTE: Caller must xfree() the return value. */
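/* Example (hypothetical layout): on a node with 2 sockets, 4 cores per
* socket and 2 threads per core, a step allocated all of socket 0 and
* nothing on socket 1 yields whole_thread_cnt=8, whole_core_cnt=4,
* whole_socket_cnt=1, part_core_cnt=0, part_socket_cnt=0 and
* whole_node_cnt=0. */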
static char *_alloc_mask(launch_tasks_request_msg_t *req,
int *whole_node_cnt, int *whole_socket_cnt,
int *whole_core_cnt, int *whole_thread_cnt,
int *part_socket_cnt, int *part_core_cnt)
{
uint16_t sockets, cores, threads;
int c, s, t, i;
int c_miss, s_miss, t_miss, c_hit, t_hit;
bitstr_t *alloc_bitmap;
char *str_mask;
bitstr_t *alloc_mask;
*whole_node_cnt = 0;
*whole_socket_cnt = 0;
*whole_core_cnt = 0;
*whole_thread_cnt = 0;
*part_socket_cnt = 0;
*part_core_cnt = 0;
alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads);
if (!alloc_bitmap)
return NULL;
alloc_mask = bit_alloc(bit_size(alloc_bitmap));
if (!alloc_mask) {
error("malloc error");
FREE_NULL_BITMAP(alloc_bitmap);
return NULL;
}
i = 0;
for (s=0, s_miss=false; s<sockets; s++) {
for (c=0, c_hit=c_miss=false; c<cores; c++) {
for (t=0, t_hit=t_miss=false; t<threads; t++) {
/* If the configuration claims a
larger system than we really have,
wrap the index so it stays within
the allocation bitmap.
*/
if (i >= bit_size(alloc_bitmap))
i = 0;
if (bit_test(alloc_bitmap, i)) {
bit_set(alloc_mask, i);
(*whole_thread_cnt)++;
t_hit = true;
c_hit = true;
} else
t_miss = true;
i++;
}
if (!t_miss)
(*whole_core_cnt)++;
else {
if (t_hit)
(*part_core_cnt)++;
c_miss = true;
}
}
if (!c_miss)
(*whole_socket_cnt)++;
else {
if (c_hit)
(*part_socket_cnt)++;
s_miss = true;
}
}
if (!s_miss)
(*whole_node_cnt)++;
FREE_NULL_BITMAP(alloc_bitmap);
/* translate abstract masks to actual hardware layout */
_lllp_map_abstract_masks(1, &alloc_mask);
#ifdef HAVE_NUMA
if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
_match_masks_to_ldom(1, &alloc_mask);
}
#endif
str_mask = bit_fmt_hexmask(alloc_mask);
FREE_NULL_BITMAP(alloc_mask);
return str_mask;
}
/*
* Given a job step request, return an equivalent local bitmap for this node
* IN req - The job step launch request
* OUT hw_sockets - number of actual sockets on this node
* OUT hw_cores - number of actual cores per socket on this node
* OUT hw_threads - number of actual threads per core on this node
* RET: bitmap of processors available to this job step on this node
* OR NULL on error
*/
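/*
 * Note: the credential's step_core_bitmap is expressed in cores, while
 * the bitmap returned here is expressed in this node's abstract block
 * order, includes every hardware thread of each allocated core, and is
 * sized to conf->block_map_size.
 */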
static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
uint16_t *hw_sockets, uint16_t *hw_cores,
uint16_t *hw_threads)
{
bitstr_t *req_map, *hw_map;
slurm_cred_arg_t arg;
uint16_t p, t, new_p, num_cpus, sockets, cores;
int job_node_id;
int start;
char *str;
*hw_sockets = conf->sockets;
*hw_cores = conf->cores;
*hw_threads = conf->threads;
if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
error("task/affinity: job lacks a credential");
return NULL;
}
/* we need this node's ID in relation to the whole
* job allocation, not just this job step */
job_node_id = nodelist_find(arg.job_hostlist, conf->node_name);
start = _get_local_node_info(&arg, job_node_id, &sockets, &cores);
if (start < 0) {
error("task/affinity: missing node %d in job credential",
job_node_id);
slurm_cred_free_args(&arg);
return NULL;
}
debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u",
sockets, cores, *hw_sockets, *hw_cores, *hw_threads);
num_cpus = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores)));
req_map = (bitstr_t *) bit_alloc(num_cpus);
hw_map = (bitstr_t *) bit_alloc(conf->block_map_size);
if (!req_map || !hw_map) {
error("task/affinity: malloc error");
FREE_NULL_BITMAP(req_map);
FREE_NULL_BITMAP(hw_map);
slurm_cred_free_args(&arg);
return NULL;
}
/* Transfer core_bitmap data to local req_map.
* The MOD function handles the case where fewer processors
* physically exist than are configured (slurmd is out of
* sync with the slurmctld daemon). */
for (p = 0; p < (sockets * cores); p++) {
if (bit_test(arg.step_core_bitmap, start+p))
bit_set(req_map, (p % num_cpus));
}
str = (char *)bit_fmt_hexmask(req_map);
debug3("task/affinity: job %u.%u CPU mask from slurmctld: %s",
req->job_id, req->job_step_id, str);
xfree(str);
for (p = 0; p < num_cpus; p++) {
if (bit_test(req_map, p) == 0)
continue;
/* If the configuration claims a larger system than
we really have, wrap the index so it stays within
the hardware block map.
*/
new_p = p % conf->block_map_size;
/* core_bitmap is expressed in cores, not threads,
* so add each hardware thread of the allocated
* core here */
for (t = 0; t < (*hw_threads); t++) {
uint16_t bit = new_p * (*hw_threads) + t;
bit %= conf->block_map_size;
bit_set(hw_map, bit);
}
}
str = (char *)bit_fmt_hexmask(hw_map);
debug3("task/affinity: job %u.%u CPU final mask for local node: %s",
req->job_id, req->job_step_id, str);
xfree(str);
FREE_NULL_BITMAP(req_map);
slurm_cred_free_args(&arg);
return hw_map;
}
/* helper function for _expand_masks() */
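/* For example, with blot = 2 (threads per core), a mask in which only
* bit 5 is set is widened to bits 4-5, i.e. both threads of that core. */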
static void _blot_mask(bitstr_t *mask, uint16_t blot)
{
uint16_t i, size = 0;
int prev = -1;
if (!mask)
return;
size = bit_size(mask);
for (i = 0; i < size; i++) {
if (bit_test(mask, i)) {
/* fill in this blot */
uint16_t start = (i / blot) * blot;
if (start != prev) {
bit_nset(mask, start, start+blot-1);
prev = start;
}
}
}
}
/* helper function for _expand_masks()
* for each task, consider which other bits are set in avail_map
* on the same socket */
static void _blot_mask_sockets(const uint32_t maxtasks, const uint32_t task,
bitstr_t **masks, uint16_t blot,
bitstr_t *avail_map)
{
uint16_t i, j, size = 0;
if (!masks[task])
return;
size = bit_size(masks[task]);
for (i = 0; i < size; i++) {
if (bit_test(masks[task], i)) {
/* check if other bits are set in avail_map on this
* socket and set each corresponding bit in masks */
uint16_t start = (i / blot) * blot;
for (j = start; j < start+blot; j++) {
if (bit_test(avail_map, j))
bit_set(masks[task], j);
}
}
}
}
/* for each mask, expand the mask around the set bits to include the
* complete resource to which the set bits are to be bound */
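/* For example, with CPU_BIND_TO_CORES and 2 threads per core, a mask
* holding one thread of a core grows to cover both of its threads; with
* CPU_BIND_TO_SOCKETS it grows to every available CPU on that socket. */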
static void _expand_masks(uint16_t cpu_bind_type, const uint32_t maxtasks,
bitstr_t **masks, uint16_t hw_sockets,
uint16_t hw_cores, uint16_t hw_threads,
bitstr_t *avail_map)
{
uint32_t i;
if (cpu_bind_type & CPU_BIND_TO_THREADS)
return;
if (cpu_bind_type & CPU_BIND_TO_CORES) {
if (hw_threads < 2)
return;
for (i = 0; i < maxtasks; i++) {
_blot_mask(masks[i], hw_threads);
}
return;
}
if (cpu_bind_type & CPU_BIND_TO_SOCKETS) {
if (hw_threads*hw_cores < 2)
return;
for (i = 0; i < maxtasks; i++) {
_blot_mask_sockets(maxtasks, i, masks,
hw_threads*hw_cores, avail_map);
}
return;
}
}
/*
* _task_layout_lllp_multi
*
* A variant of _task_layout_lllp_cyclic for use with allocations having
* more than one CPU per task: tasks are packed as close together as
* possible (a core is filled before moving to the next socket for the
* extra CPUs).
*
*/
static int _task_layout_lllp_multi(launch_tasks_request_msg_t *req,
uint32_t node_id, bitstr_t ***masks_p)
{
int last_taskcount = -1, taskcount = 0;
uint16_t c, i, s, t, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
int size, max_tasks = req->tasks_to_launch[(int)node_id];
int max_cpus = max_tasks * req->cpus_per_task;
bitstr_t *avail_map;
bitstr_t **masks = NULL;
info ("_task_layout_lllp_multi ");
avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
if (!avail_map)
return SLURM_ERROR;
*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
masks = *masks_p;
size = bit_set_count(avail_map);
if (size < max_tasks) {
error("task/affinity: only %d bits in avail_map for %d tasks!",
size, max_tasks);
FREE_NULL_BITMAP(avail_map);
return SLURM_ERROR;
}
if (size < max_cpus) {
/* Possible result of overcommit */
i = size / max_tasks;
info("task/affinity: reset cpus_per_task from %d to %d",
req->cpus_per_task, i);
req->cpus_per_task = i;
}
size = bit_size(avail_map);
i = 0;
while (taskcount < max_tasks) {
if (taskcount == last_taskcount)
fatal("_task_layout_lllp_multi failure");
last_taskcount = taskcount;
for (s = 0; s < hw_sockets; s++) {
for (c = 0; c < hw_cores; c++) {
for (t = 0; t < hw_threads; t++) {
uint16_t bit = s*(hw_cores*hw_threads) +
c*(hw_threads) + t;
if (bit_test(avail_map, bit) == 0)
continue;
if (masks[taskcount] == NULL) {
masks[taskcount] =
bit_alloc(conf->block_map_size);
}
bit_set(masks[taskcount], bit);
if (++i < req->cpus_per_task)
continue;
i = 0;
if (++taskcount >= max_tasks)
break;
}
if (taskcount >= max_tasks)
break;
}
if (taskcount >= max_tasks)
break;
}
}
/* last step: expand the masks to bind each task
* to the requested resource */
_expand_masks(req->cpu_bind_type, max_tasks, masks,
hw_sockets, hw_cores, hw_threads, avail_map);
FREE_NULL_BITMAP(avail_map);
return SLURM_SUCCESS;
}
/*
* _task_layout_lllp_cyclic
*
* task_layout_lllp_cyclic creates a cyclic distribution at the
* lowest level of logical processor which is either socket, core or
* thread depending on the system architecture. The Cyclic algorithm
* is the same as the Cyclic distribution performed in srun.
*
* Distribution at the lllp:
* -m hostfile|plane|block|cyclic:block|cyclic
*
* The first distribution "hostfile|plane|block|cyclic" is computed
* in srun. The second distribution "plane|block|cyclic" is computed
* locally by each slurmd.
*
* The input to the lllp distribution algorithms is the gids (task
* ids) generated for the local node.
*
* The output is a mapping of the gids onto logical processors
* (thread/core/socket), which is expressed as cpu_bind masks.
*
*/
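/*
 * Worked example (assuming every CPU is available): on a node with
 * 2 sockets x 2 cores x 1 thread, 4 tasks and cpus_per_task=1, the
 * socket-first loops below bind tasks 0,1,2,3 to abstract CPUs 0,2,1,3
 * respectively.
 */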
static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
uint32_t node_id, bitstr_t ***masks_p)
{
int last_taskcount = -1, taskcount = 0;
uint16_t c, i, s, t, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
int size, max_tasks = req->tasks_to_launch[(int)node_id];
int max_cpus = max_tasks * req->cpus_per_task;
int avail_size;
bitstr_t *avail_map;
bitstr_t **masks = NULL;
info ("_task_layout_lllp_cyclic ");
avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
if (!avail_map)
return SLURM_ERROR;
avail_size = bit_size(avail_map);
*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
masks = *masks_p;
size = bit_set_count(avail_map);
if (size < max_tasks) {
error("task/affinity: only %d bits in avail_map for %d tasks!",
size, max_tasks);
FREE_NULL_BITMAP(avail_map);
return SLURM_ERROR;
}
if (size < max_cpus) {
/* Possible result of overcommit */
i = size / max_tasks;
info("task/affinity: reset cpus_per_task from %d to %d",
req->cpus_per_task, i);
req->cpus_per_task = i;
}
size = bit_size(avail_map);
i = 0;
while (taskcount < max_tasks) {
if (taskcount == last_taskcount)
fatal("_task_layout_lllp_cyclic failure");
last_taskcount = taskcount;
for (t = 0; t < hw_threads; t++) {
for (c = 0; c < hw_cores; c++) {
for (s = 0; s < hw_sockets; s++) {
uint16_t bit = s*(hw_cores*hw_threads) +
c*(hw_threads) + t;
/* In case hardware and config differ */
bit %= avail_size;
if (bit_test(avail_map, bit) == 0)
continue;
if (masks[taskcount] == NULL) {
masks[taskcount] =
(bitstr_t *)
bit_alloc(conf->
block_map_size);
}
bit_set(masks[taskcount], bit);
if (++i < req->cpus_per_task)
continue;
i = 0;
if (++taskcount >= max_tasks)
break;
}
if (taskcount >= max_tasks)
break;
}
if (taskcount >= max_tasks)
break;
}
}
/* last step: expand the masks to bind each task
* to the requested resource */
_expand_masks(req->cpu_bind_type, max_tasks, masks,
hw_sockets, hw_cores, hw_threads, avail_map);
FREE_NULL_BITMAP(avail_map);
return SLURM_SUCCESS;
}
/*
* _task_layout_lllp_block
*
* task_layout_lllp_block will create a block distribution at the
* lowest level of logical processor which is either socket, core or
* thread depending on the system architecture. The Block algorithm
* is the same as the Block distribution performed in srun.
*
* Distribution at the lllp:
* -m hostfile|plane|block|cyclic:block|cyclic
*
* The first distribution "hostfile|plane|block|cyclic" is computed
* in srun. The second distribution "plane|block|cyclic" is computed
* locally by each slurmd.
*
* The input to the lllp distribution algorithms is the gids (task
* ids) generated for the local node.
*
* The output is a mapping of the gids onto logical processors
* (thread/core/socket), which is expressed as cpu_bind masks.
*
*/
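/*
 * Worked example (assuming every CPU is available): on the same
 * 2 sockets x 2 cores x 1 thread node with 4 tasks and cpus_per_task=1,
 * the pass below walks the abstract map in order and binds tasks
 * 0,1,2,3 to abstract CPUs 0,1,2,3.
 */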
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
uint32_t node_id, bitstr_t ***masks_p)
{
int c, i, j, t, size, last_taskcount = -1, taskcount = 0;
uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
int max_tasks = req->tasks_to_launch[(int)node_id];
int max_cpus = max_tasks * req->cpus_per_task;
int *task_array;
bitstr_t *avail_map;
bitstr_t **masks = NULL;
info("_task_layout_lllp_block ");
avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
if (!avail_map) {
return SLURM_ERROR;
}
size = bit_set_count(avail_map);
if (size < max_tasks) {
error("task/affinity: only %d bits in avail_map for %d tasks!",
size, max_tasks);
FREE_NULL_BITMAP(avail_map);
return SLURM_ERROR;
}
if (size < max_cpus) {
/* Possible result of overcommit */
i = size / max_tasks;
info("task/affinity: reset cpus_per_task from %d to %d",
req->cpus_per_task, i);
req->cpus_per_task = i;
}
size = bit_size(avail_map);
*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
masks = *masks_p;
task_array = xmalloc(size * sizeof(int));
if (!task_array) {
error("In lllp_block: task_array memory error");
FREE_NULL_BITMAP(avail_map);
return SLURM_ERROR;
}
/* block distribution with oversubscription */
c = 0;
while(taskcount < max_tasks) {
if (taskcount == last_taskcount) {
fatal("_task_layout_lllp_block infinite loop");
}
last_taskcount = taskcount;
/* the abstract map is already laid out in block order,
* so just iterate over it
*/
for (i = 0; i < size; i++) {
/* skip unrequested threads */
if (i%hw_threads >= hw_threads)
continue;
/* skip unavailable resources */
if (bit_test(avail_map, i) == 0)
continue;
/* if multiple CPUs per task, only
* count the task on the first CPU */
if (c == 0)
task_array[i] += 1;
if (++c < req->cpus_per_task)
continue;
c = 0;
if (++taskcount >= max_tasks)
break;
}
}
/* Distribute the tasks and create per-task masks that only
* contain the first CPU. Note that unused resources
* (task_array[i] == 0) will get skipped */
taskcount = 0;
for (i = 0; i < size; i++) {
for (t = 0; t < task_array[i]; t++) {
if (masks[taskcount] == NULL)
masks[taskcount] = (bitstr_t *)bit_alloc(conf->block_map_size);
bit_set(masks[taskcount++], i);
}
}
/* now set additional CPUs for cpus_per_task > 1 */
for (t=0; t<max_tasks && req->cpus_per_task>1; t++) {
if (!masks[t])
continue;
c = 0;
for (i = 0; i < size && c<req->cpus_per_task; i++) {
if (bit_test(masks[t], i) == 0)
continue;
for (j=i+1,c=1; j<size && c<req->cpus_per_task;j++) {
if (bit_test(avail_map, j) == 0)
continue;
bit_set(masks[t], j);
c++;
}
if (c < req->cpus_per_task) {
/* we haven't found all of the CPUs for this
* task, so we'll wrap the search to cover the
* whole node */
for (j=0; j<i && c<req->cpus_per_task; j++) {
if (bit_test(avail_map, j) == 0)
continue;
bit_set(masks[t], j);
c++;
}
}
}
}
xfree(task_array);
/* last step: expand the masks to bind each task
* to the requested resource */
_expand_masks(req->cpu_bind_type, max_tasks, masks,
hw_sockets, hw_cores, hw_threads, avail_map);
FREE_NULL_BITMAP(avail_map);
return SLURM_SUCCESS;
}
/*
* _lllp_map_abstract_mask
*
* Map one abstract block mask to a physical machine mask
*
* IN - mask to map
* OUT - mapped mask (storage allocated in this routine)
*/
static bitstr_t *_lllp_map_abstract_mask(bitstr_t *bitmask)
{
int i, bit;
int num_bits = bit_size(bitmask);
bitstr_t *newmask = NULL;
newmask = (bitstr_t *) bit_alloc(num_bits);
/* remap to physical machine */
for (i = 0; i < num_bits; i++) {
if (bit_test(bitmask,i)) {
bit = BLOCK_MAP(i);
if(bit < bit_size(newmask))
bit_set(newmask, bit);
else
error("_lllp_map_abstract_mask: can't go from "
"%d -> %d since we only have %d bits",
i, bit, bit_size(newmask));
}
}
return newmask;
}
/*
* _lllp_map_abstract_masks
*
* Map an array of abstract block masks to physical machine masks
*
* IN- maximum number of tasks
* IN/OUT- array of masks
*/
static void _lllp_map_abstract_masks(const uint32_t maxtasks, bitstr_t **masks)
{
int i;
debug3("_lllp_map_abstract_masks");
for (i = 0; i < maxtasks; i++) {
bitstr_t *bitmask = masks[i];
if (bitmask) {
bitstr_t *newmask = _lllp_map_abstract_mask(bitmask);
FREE_NULL_BITMAP(bitmask);
masks[i] = newmask;
}
}
}
/*
* _lllp_generate_cpu_bind
*
* Generate the cpu_bind type and string given an array of bitstr_t masks
*
* IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
* IN- maximum number of tasks
* IN- array of masks
*/
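/*
 * The generated cpu_bind string is one hex mask per task, comma
 * separated, e.g. something like "0x1,0x4,0x2,0x8" for four single-CPU
 * tasks (values illustrative only).
 */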
static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
const uint32_t maxtasks, bitstr_t **masks)
{
int i, num_bits=0, masks_len;
bitstr_t *bitmask;
bitoff_t charsize;
char *masks_str = NULL;
char buf_type[100];
for (i = 0; i < maxtasks; i++) {
bitmask = masks[i];
if (bitmask) {
num_bits = bit_size(bitmask);
break;
}
}
charsize = (num_bits + 3) / 4; /* ASCII hex digits */
charsize += 3; /* "0x" and trailing "," */
masks_len = maxtasks * charsize + 1; /* all mask strings + terminating null */
debug3("_lllp_generate_cpu_bind %d %d %d", maxtasks, charsize,
masks_len);
masks_str = xmalloc(masks_len);
masks_len = 0;
for (i = 0; i < maxtasks; i++) {
char *str;
int curlen;
bitmask = masks[i];
if (bitmask == NULL) {
continue;
}
str = (char *)bit_fmt_hexmask(bitmask);
curlen = strlen(str) + 1;
if (masks_len > 0)
masks_str[masks_len-1]=',';
strncpy(&masks_str[masks_len], str, curlen);
masks_len += curlen;
xassert(masks_str[masks_len] == '\0');
xfree(str);
}
if (req->cpu_bind) {
xfree(req->cpu_bind);
}
if (masks_str[0] != '\0') {
req->cpu_bind = masks_str;
req->cpu_bind_type |= CPU_BIND_MASK;
} else {
req->cpu_bind = NULL;
req->cpu_bind_type &= ~CPU_BIND_VERBOSE;
}
/* clear mask generation bits */
req->cpu_bind_type &= ~CPU_BIND_TO_THREADS;
req->cpu_bind_type &= ~CPU_BIND_TO_CORES;
req->cpu_bind_type &= ~CPU_BIND_TO_SOCKETS;
req->cpu_bind_type &= ~CPU_BIND_TO_LDOMS;
slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
info("_lllp_generate_cpu_bind jobid [%u]: %s, %s",
req->job_id, buf_type, masks_str);
}