/*****************************************************************************\
* select_linear.c - node selection plugin for simple one-dimensional
* address space. Selects nodes for a job so as to minimize the number
* of sets of consecutive nodes using a best-fit algorithm.
*****************************************************************************
* Copyright (C) 2004-2007 The Regents of the University of California.
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Portions Copyright (C) 2010 SchedMD <http://www.schedmd.com>.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of SLURM, a resource management program.
* For details, see <https://computing.llnl.gov/linux/slurm/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
# if HAVE_STDINT_H
# include <stdint.h>
# endif
# if HAVE_INTTYPES_H
# include <inttypes.h>
# endif
#endif
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>
#include "src/common/slurm_xlator.h" /* Must be first */
#include "src/common/gres.h"
#include "src/common/job_resources.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_resource_info.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/preempt.h"
#include "src/slurmctld/proc_req.h"
#include "src/plugins/select/linear/select_linear.h"
#define NO_SHARE_LIMIT 0xfffe /* job count meaning no effective share limit */
#define NODEINFO_MAGIC 0x82ad /* magic value for select_nodeinfo records */
#define RUN_JOB_INCR 16 /* growth increment for the job id arrays */
#define SELECT_DEBUG 0 /* set to 1 for verbose debug logging */
/* These are defined here so that, when we link with something other than
* the slurmctld, we will have these symbols defined. They will get
* overwritten when linking with the slurmctld.
*/
#if defined (__APPLE__)
slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
struct node_record *node_record_table_ptr __attribute__((weak_import));
List part_list __attribute__((weak_import));
List job_list __attribute__((weak_import));
int node_record_count __attribute__((weak_import));
time_t last_node_update __attribute__((weak_import));
struct switch_record *switch_record_table __attribute__((weak_import));
int switch_record_cnt __attribute__((weak_import));
#else
slurm_ctl_conf_t slurmctld_conf;
struct node_record *node_record_table_ptr;
List part_list;
List job_list;
int node_record_count;
time_t last_node_update;
struct switch_record *switch_record_table;
int switch_record_cnt;
#endif
struct select_nodeinfo {
uint16_t magic; /* magic number */
uint16_t alloc_cpus; /* count of allocated CPUs on this node */
};
static int _add_job_to_nodes(struct cr_record *cr_ptr,
struct job_record *job_ptr, char *pre_err,
int suspended);
static void _add_run_job(struct cr_record *cr_ptr, uint32_t job_id);
static void _add_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap);
static int _cr_job_list_sort(void *x, void *y);
static void _dump_node_cr(struct cr_record *cr_ptr);
static struct cr_record *_dup_cr(struct cr_record *cr_ptr);
static int _find_job_mate(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes);
static void _free_cr(struct cr_record *cr_ptr);
static uint16_t _get_avail_cpus(struct job_record *job_ptr, int index);
static uint16_t _get_total_cpus(int index);
static void _init_node_cr(void);
static int _job_count_bitmap(struct cr_record *cr_ptr,
struct job_record *job_ptr,
bitstr_t * bitmap, bitstr_t * jobmap,
int run_job_cnt, int tot_job_cnt, uint16_t mode);
static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes);
static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes);
static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id);
static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
static int _rm_job_from_nodes(struct cr_record *cr_ptr,
struct job_record *job_ptr, char *pre_err,
bool remove_all);
static int _rm_job_from_one_node(struct job_record *job_ptr,
struct node_record *node_ptr, char *pre_err);
static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
int max_share, uint32_t req_nodes,
List preemptee_candidates,
List *preemptee_job_list);
static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id);
static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id);
static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, int max_share);
static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
int max_share, uint32_t req_nodes,
List preemptee_candidates,
List *preemptee_job_list);
extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(uint32_t size);
extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo);
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore the plugin.
*
* plugin_name - a string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - a string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. SLURM uses the higher-level plugin
* interface which requires this string to be of the form
*
* <application>/<method>
*
* where <application> is a description of the intended application of
* the plugin (e.g., "select" for SLURM node selection) and <method>
* is a description of how this plugin satisfies that application. SLURM will
* only load select plugins if the plugin_type string has a
* prefix of "select/".
*
* plugin_version - an unsigned 32-bit integer giving the version number
* of the plugin. If major and minor revisions are desired, the major
* version number may be multiplied by a suitable magnitude constant such
* as 100 or 1000. Various SLURM versions will likely require a certain
* minimum version for their plugins as the node selection API matures.
*/
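/* For example, using a magnitude constant of 100, major version 1 and
* minor version 20 could be encoded as plugin_version = 120. */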
const char plugin_name[] = "Linear node selection plugin";
const char plugin_type[] = "select/linear";
const uint32_t plugin_id = 102;
const uint32_t plugin_version = 90;
static struct node_record *select_node_ptr = NULL;
static int select_node_cnt = 0;
static uint16_t select_fast_schedule;
static uint16_t cr_type;
/* Record of resources consumed on each node including job details */
static struct cr_record *cr_ptr = NULL;
static pthread_mutex_t cr_mutex = PTHREAD_MUTEX_INITIALIZER;
#ifdef HAVE_XCPU
#define XCPU_POLL_TIME 120
static pthread_t xcpu_thread = 0;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
static int agent_fini = 0;
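/* XCPU monitoring agent, compiled only when HAVE_XCPU is defined: every
* XCPU_POLL_TIME seconds, stat() each node's xcpu/clone file under XCPU_DIR
* and drain (via slurm_drain_nodes) any node whose clone file cannot be
* accessed. */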
static void *xcpu_agent(void *args)
{
int i;
static time_t last_xcpu_test;
char clone_path[128], down_node_list[512];
struct stat buf;
time_t now;
last_xcpu_test = time(NULL) + XCPU_POLL_TIME;
while (!agent_fini) {
now = time(NULL);
if (difftime(now, last_xcpu_test) >= XCPU_POLL_TIME) {
debug3("Running XCPU node state test");
down_node_list[0] = '\0';
for (i=0; i<select_node_cnt; i++) {
snprintf(clone_path, sizeof(clone_path),
"%s/%s/xcpu/clone", XCPU_DIR,
select_node_ptr[i].name);
if (stat(clone_path, &buf) == 0)
continue;
error("stat %s: %m", clone_path);
if ((strlen(select_node_ptr[i].name) +
strlen(down_node_list) + 2) <
sizeof(down_node_list)) {
if (down_node_list[0] != '\0')
strcat(down_node_list,",");
strcat(down_node_list,
select_node_ptr[i].name);
} else
error("down_node_list overflow");
}
if (down_node_list[0]) {
slurm_drain_nodes(
down_node_list,
"select_linear: Can not stat XCPU ",
slurm_get_slurm_user_id());
}
last_xcpu_test = now;
}
sleep(1);
}
return NULL;
}
static int _init_status_pthread(void)
{
pthread_attr_t attr;
slurm_mutex_lock( &thread_flag_mutex );
if ( xcpu_thread ) {
debug2("XCPU thread already running, not starting another");
slurm_mutex_unlock( &thread_flag_mutex );
return SLURM_ERROR;
}
slurm_attr_init( &attr );
pthread_attr_setdetachstate( &attr, PTHREAD_CREATE_DETACHED );
pthread_create( &xcpu_thread, &attr, xcpu_agent, NULL);
slurm_mutex_unlock( &thread_flag_mutex );
slurm_attr_destroy( &attr );
return SLURM_SUCCESS;
}
static int _fini_status_pthread(void)
{
int i, rc = SLURM_SUCCESS;
slurm_mutex_lock( &thread_flag_mutex );
if ( xcpu_thread ) {
agent_fini = 1;
for (i=0; i<4; i++) {
sleep(1);
if (pthread_kill(xcpu_thread, 0)) {
xcpu_thread = 0;
break;
}
}
if ( xcpu_thread ) {
error("could not kill XCPU agent thread");
rc = SLURM_ERROR;
}
}
slurm_mutex_unlock( &thread_flag_mutex );
return rc;
}
#endif
/* Add job id to the record of running jobs (run_job_ids) */
static void _add_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
int i;
if (cr_ptr->run_job_ids == NULL) { /* create new array */
cr_ptr->run_job_len = RUN_JOB_INCR;
cr_ptr->run_job_ids = xmalloc(sizeof(uint32_t) *
cr_ptr->run_job_len);
cr_ptr->run_job_ids[0] = job_id;
return;
}
for (i=0; i<cr_ptr->run_job_len; i++) {
if (cr_ptr->run_job_ids[i])
continue;
/* fill in hole */
cr_ptr->run_job_ids[i] = job_id;
return;
}
/* expand array and add to end */
cr_ptr->run_job_len += RUN_JOB_INCR;
xrealloc(cr_ptr->run_job_ids, sizeof(uint32_t) * cr_ptr->run_job_len);
cr_ptr->run_job_ids[i] = job_id;
}
/* Add job id to the record of jobs running or suspended (tot_job_ids) */
static void _add_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
int i;
if (cr_ptr->tot_job_ids == NULL) { /* create new array */
cr_ptr->tot_job_len = RUN_JOB_INCR;
cr_ptr->tot_job_ids = xmalloc(sizeof(uint32_t) *
cr_ptr->tot_job_len);
cr_ptr->tot_job_ids[0] = job_id;
return;
}
for (i=0; i<cr_ptr->tot_job_len; i++) {
if (cr_ptr->tot_job_ids[i])
continue;
/* fill in hole */
cr_ptr->tot_job_ids[i] = job_id;
return;
}
/* expand array and add to end */
cr_ptr->tot_job_len += RUN_JOB_INCR;
xrealloc(cr_ptr->tot_job_ids, sizeof(uint32_t) * cr_ptr->tot_job_len);
cr_ptr->tot_job_ids[i] = job_id;
}
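/* Common helper for _rem_run_job() and _test_run_job(): scan the
* run_job_ids array for job_id, optionally zeroing each matching entry.
* RET true if the job id was found */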
static bool _ck_run_job(struct cr_record *cr_ptr, uint32_t job_id,
bool clear_it)
{
int i;
bool rc = false;
if ((cr_ptr->run_job_ids == NULL) || (cr_ptr->run_job_len == 0))
return rc;
for (i=0; i<cr_ptr->run_job_len; i++) {
if (cr_ptr->run_job_ids[i] != job_id)
continue;
if (clear_it)
cr_ptr->run_job_ids[i] = 0;
rc = true;
}
return rc;
}
/* Remove job id from record of jobs running,
* RET true if successful, false if the job was not running */
static bool _rem_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
return _ck_run_job(cr_ptr, job_id, true);
}
/* Test for job id in record of jobs running,
* RET true if successful, false if the job was not running */
static bool _test_run_job(struct cr_record *cr_ptr, uint32_t job_id)
{
return _ck_run_job(cr_ptr, job_id, false);
}
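/* Common helper for _rem_tot_job() and _test_tot_job(): scan the
* tot_job_ids array for job_id, optionally zeroing each matching entry.
* RET true if the job id was found */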
static bool _ck_tot_job(struct cr_record *cr_ptr, uint32_t job_id,
bool clear_it)
{
int i;
bool rc = false;
if ((cr_ptr->tot_job_ids == NULL) || (cr_ptr->tot_job_len == 0))
return rc;
for (i=0; i<cr_ptr->tot_job_len; i++) {
if (cr_ptr->tot_job_ids[i] != job_id)
continue;
if (clear_it)
cr_ptr->tot_job_ids[i] = 0;
rc = true;
}
return rc;
}
/* Remove job id from record of jobs running or suspended,
* RET true if successful, false if the job was not found */
static bool _rem_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
return _ck_tot_job(cr_ptr, job_id, true);
}
/* Test for job id in record of jobs running or suspended,
* RET true if successful, false if the job was not found */
static bool _test_tot_job(struct cr_record *cr_ptr, uint32_t job_id)
{
return _ck_tot_job(cr_ptr, job_id, false);
}
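/* Test if enough nodes remain to satisfy the request. When req_nodes
* exceeds min_nodes, only (rem_nodes + min_nodes - req_nodes) additional
* nodes are strictly required. For example, with min_nodes=2, req_nodes=4
* and rem_nodes=4, two available nodes are sufficient. */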
static bool _enough_nodes(int avail_nodes, int rem_nodes,
uint32_t min_nodes, uint32_t req_nodes)
{
int needed_nodes;
if (req_nodes > min_nodes)
needed_nodes = rem_nodes + min_nodes - req_nodes;
else
needed_nodes = rem_nodes;
return(avail_nodes >= needed_nodes);
}
/*
* _get_avail_cpus - Get the number of "available" cpus on a node
* given the number of cpus_per_task and the maximum number of
* sockets, cores, and threads. Note that the value of
* cpus is the lowest-level logical processor (LLLP).
* IN job_ptr - pointer to job being scheduled
* IN index - index of node's configuration information in select_node_ptr
*/
static uint16_t _get_avail_cpus(struct job_record *job_ptr, int index)
{
struct node_record *node_ptr;
uint16_t avail_cpus;
uint16_t cpus, sockets, cores, threads;
uint16_t cpus_per_task = 1;
uint16_t ntasks_per_node = 0, ntasks_per_socket, ntasks_per_core;
uint16_t min_sockets, min_cores, min_threads;
multi_core_data_t *mc_ptr = NULL;
if (job_ptr->details == NULL)
return (uint16_t) 0;
if (job_ptr->details->cpus_per_task)
cpus_per_task = job_ptr->details->cpus_per_task;
if (job_ptr->details->ntasks_per_node)
ntasks_per_node = job_ptr->details->ntasks_per_node;
if ((mc_ptr = job_ptr->details->mc_ptr)) {
ntasks_per_socket = mc_ptr->ntasks_per_socket;
ntasks_per_core = mc_ptr->ntasks_per_core;
min_sockets = mc_ptr->sockets_per_node;
min_cores = mc_ptr->cores_per_socket;
min_threads = mc_ptr->threads_per_core;
} else {
ntasks_per_socket = 0;
ntasks_per_core = 0;
min_sockets = (uint16_t) NO_VAL;
min_cores = (uint16_t) NO_VAL;
min_threads = (uint16_t) NO_VAL;
}
node_ptr = select_node_ptr + index;
if (select_fast_schedule) { /* don't bother checking each node */
cpus = node_ptr->config_ptr->cpus;
sockets = node_ptr->config_ptr->sockets;
cores = node_ptr->config_ptr->cores;
threads = node_ptr->config_ptr->threads;
} else {
cpus = node_ptr->cpus;
sockets = node_ptr->sockets;
cores = node_ptr->cores;
threads = node_ptr->threads;
}
#if SELECT_DEBUG
info("host %s HW_ cpus %u sockets %u cores %u threads %u ",
node_ptr->name, cpus, sockets, cores, threads);
#endif
avail_cpus = slurm_get_avail_procs(
min_sockets, min_cores, min_threads, cpus_per_task,
ntasks_per_node, ntasks_per_socket, ntasks_per_core,
&cpus, &sockets, &cores, &threads, NULL,
CR_CPU, job_ptr->job_id, node_ptr->name);
#if SELECT_DEBUG
debug("avail_cpus index %d = %d (out of %d %d %d %d)",
index, avail_cpus, cpus, sockets, cores, threads);
#endif
return(avail_cpus);
}
/*
* _get_total_cpus - Get the total number of cpus on a node
* Note that the value of cpus is the lowest-level logical
* processor (LLLP).
* IN index - index of node's configuration information in select_node_ptr
*/
static uint16_t _get_total_cpus(int index)
{
struct node_record *node_ptr = &(select_node_ptr[index]);
if (select_fast_schedule)
return node_ptr->config_ptr->cpus;
else
return node_ptr->cpus;
}
/* Build the full job_resources_t structure for a job based upon the nodes
* allocated to it (the bitmap) and the job's memory requirement */
static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap)
{
int i, j, k;
int first_bit, last_bit;
uint32_t node_cpus, total_cpus = 0, node_cnt;
struct node_record *node_ptr;
uint32_t job_memory_cpu = 0, job_memory_node = 0;
job_resources_t *job_resrcs_ptr;
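/* The MEM_PER_CPU flag bit in pn_min_memory distinguishes a per-CPU
* memory limit from a per-node limit; mask the flag off to recover the
* actual value */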
if (job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU)
job_memory_cpu = job_ptr->details->pn_min_memory &
(~MEM_PER_CPU);
else
job_memory_node = job_ptr->details->pn_min_memory;
}
if (job_ptr->job_resrcs) /* Old struct due to job requeue */
free_job_resources(&job_ptr->job_resrcs);
node_cnt = bit_set_count(bitmap);
job_ptr->job_resrcs = job_resrcs_ptr = create_job_resources();
job_resrcs_ptr->cpu_array_reps = xmalloc(sizeof(uint32_t) * node_cnt);
job_resrcs_ptr->cpu_array_value = xmalloc(sizeof(uint16_t) * node_cnt);
job_resrcs_ptr->cpus = xmalloc(sizeof(uint16_t) * node_cnt);
job_resrcs_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt);
job_resrcs_ptr->memory_allocated = xmalloc(sizeof(uint32_t) * node_cnt);
job_resrcs_ptr->memory_used = xmalloc(sizeof(uint32_t) * node_cnt);
job_resrcs_ptr->nhosts = node_cnt;
job_resrcs_ptr->node_bitmap = bit_copy(bitmap);
job_resrcs_ptr->nodes = bitmap2node_name(bitmap);
if (job_resrcs_ptr->node_bitmap == NULL)
fatal("bit_copy malloc failure");
job_resrcs_ptr->ncpus = job_ptr->total_cpus;
if (build_job_resources(job_resrcs_ptr, (void *)select_node_ptr,
select_fast_schedule))
error("_build_select_struct: build_job_resources: %m");
first_bit = bit_ffs(bitmap);
last_bit = bit_fls(bitmap);
if (last_bit == -1)
last_bit = -2; /* no bits set */
for (i=first_bit, j=0, k=-1; i<=last_bit; i++) {
if (!bit_test(bitmap, i))
continue;
node_ptr = &(select_node_ptr[i]);
if (select_fast_schedule)
node_cpus = node_ptr->config_ptr->cpus;
else
node_cpus = node_ptr->cpus;
job_resrcs_ptr->cpus[j] = node_cpus;
if ((k == -1) ||
(job_resrcs_ptr->cpu_array_value[k] != node_cpus)) {
job_resrcs_ptr->cpu_array_cnt++;
job_resrcs_ptr->cpu_array_reps[++k] = 1;
job_resrcs_ptr->cpu_array_value[k] = node_cpus;
} else
job_resrcs_ptr->cpu_array_reps[k]++;
total_cpus += node_cpus;
if (job_memory_node) {
job_resrcs_ptr->memory_allocated[j] = job_memory_node;
} else if (job_memory_cpu) {
job_resrcs_ptr->memory_allocated[j] =
job_memory_cpu * node_cpus;
}
if (set_job_resources_node(job_resrcs_ptr, j)) {
error("_build_select_struct: set_job_resources_node: "
"%m");
}
j++;
}
if (job_resrcs_ptr->ncpus != total_cpus) {
error("_build_select_struct: ncpus mismatch %u != %u",
job_resrcs_ptr->ncpus, total_cpus);
}
}
/*
* Set the bits in 'jobmap' that correspond to bits in the 'bitmap'
* for nodes running no more than 'run_job_cnt' jobs and having no more
* than 'tot_job_cnt' total (running plus suspended) jobs; clear the
* remaining bits. RET the count of bits set in 'jobmap'.
*/
static int _job_count_bitmap(struct cr_record *cr_ptr,
struct job_record *job_ptr,
bitstr_t * bitmap, bitstr_t * jobmap,
int run_job_cnt, int tot_job_cnt, uint16_t mode)
{
int i, i_first, i_last;
int count = 0, total_jobs, total_run_jobs;
struct part_cr_record *part_cr_ptr;
struct node_record *node_ptr;
uint32_t job_memory_cpu = 0, job_memory_node = 0;
uint32_t alloc_mem = 0, job_mem = 0, avail_mem = 0;
uint32_t cpu_cnt, gres_cpus;
List gres_list;
bool use_total_gres = true;
xassert(cr_ptr);
xassert(cr_ptr->nodes);
if (mode != SELECT_MODE_TEST_ONLY) {
use_total_gres = false;
if (job_ptr->details->pn_min_memory &&
(cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->pn_min_memory
& (~MEM_PER_CPU);
} else {
job_memory_node = job_ptr->details->
pn_min_memory;
}
}
}
i_first = bit_ffs(bitmap);
i_last = bit_fls(bitmap);
if (i_first == -1) /* job has no nodes */
i_last = -2;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(bitmap, i)) {
bit_clear(jobmap, i);
continue;
}
node_ptr = node_record_table_ptr + i;
if (select_fast_schedule)
cpu_cnt = node_ptr->config_ptr->cpus;
else
cpu_cnt = node_ptr->cpus;
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
gres_cpus = gres_plugin_job_test(job_ptr->gres_list,
gres_list, use_total_gres,
NULL, 0, 0, job_ptr->job_id,
node_ptr->name);
if ((gres_cpus != NO_VAL) && (gres_cpus < cpu_cnt)) {
bit_clear(jobmap, i);
continue;
}
if (mode == SELECT_MODE_TEST_ONLY) {
bit_set(jobmap, i);
count++;
continue; /* No need to test other resources */
}
if (job_memory_cpu || job_memory_node) {
alloc_mem = cr_ptr->nodes[i].alloc_memory;
if (select_fast_schedule) {
avail_mem = node_ptr->config_ptr->real_memory;
if (job_memory_cpu)
job_mem = job_memory_cpu * cpu_cnt;
else
job_mem = job_memory_node;
} else {
avail_mem = node_ptr->real_memory;
if (job_memory_cpu)
job_mem = job_memory_cpu * cpu_cnt;
else
job_mem = job_memory_node;
}
if ((alloc_mem + job_mem) > avail_mem) {
bit_clear(jobmap, i);
continue;
}
}
if ((mode != SELECT_MODE_TEST_ONLY) &&
(cr_ptr->nodes[i].exclusive_cnt != 0)) {
/* already reserved by some exclusive job */
bit_clear(jobmap, i);
continue;
}
total_jobs = 0;
total_run_jobs = 0;
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
total_run_jobs += part_cr_ptr->run_job_cnt;
total_jobs += part_cr_ptr->tot_job_cnt;
part_cr_ptr = part_cr_ptr->next;
}
if ((total_run_jobs <= run_job_cnt) &&
(total_jobs <= tot_job_cnt)) {
bit_set(jobmap, i);
count++;
} else {
bit_clear(jobmap, i);
}
}
return count;
}
/* _find_job_mate - does most of the real work for select_p_job_test(),
* in trying to find a suitable job to mate this one with. This is
* a pretty simple algorithm now, but could try to match the job
* with multiple jobs that add up to the proper size or a single
* job plus a few idle nodes. */
static int _find_job_mate(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes)
{
ListIterator job_iterator;
struct job_record *job_scan_ptr;
int rc = EINVAL;
job_iterator = list_iterator_create(job_list);
while ((job_scan_ptr = (struct job_record *) list_next(job_iterator))) {
if ((!IS_JOB_RUNNING(job_scan_ptr)) ||
(job_scan_ptr->node_cnt != req_nodes) ||
(job_scan_ptr->total_cpus <
job_ptr->details->min_cpus) ||
(!bit_super_set(job_scan_ptr->node_bitmap, bitmap)))
continue;
if (job_scan_ptr->details && job_ptr->details &&
(job_scan_ptr->details->contiguous !=
job_ptr->details->contiguous))
continue;
if (job_ptr->details->req_node_bitmap &&
(!bit_super_set(job_ptr->details->req_node_bitmap,
job_scan_ptr->node_bitmap)))
continue; /* Required nodes missing from job */
if (job_ptr->details->exc_node_bitmap &&
(bit_overlap(job_ptr->details->exc_node_bitmap,
job_scan_ptr->node_bitmap) != 0))
continue; /* Excluded nodes in this job */
bit_and(bitmap, job_scan_ptr->node_bitmap);
job_ptr->total_cpus = job_scan_ptr->total_cpus;
rc = SLURM_SUCCESS;
break;
}
list_iterator_destroy(job_iterator);
return rc;
}
/* _job_test - does most of the real work for select_p_job_test(), which
* pretty much just handles load-leveling and max_share logic */
static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes)
{
int i, index, error_code = EINVAL, sufficient;
int *consec_nodes; /* how many nodes we can add from this
* consecutive set of nodes */
int *consec_cpus; /* how many CPUs we can add from this
* consecutive set of nodes */
int *consec_start; /* where this consecutive set starts (index) */
int *consec_end; /* where this consecutive set ends (index) */
int *consec_req; /* are nodes from this set required
* (in req_bitmap) */
int consec_index, consec_size;
int rem_cpus, rem_nodes; /* remaining resources desired */
int best_fit_nodes, best_fit_cpus, best_fit_req;
int best_fit_location = 0, best_fit_sufficient;
int avail_cpus, alloc_cpus = 0, total_cpus = 0;
if (bit_set_count(bitmap) < min_nodes)
return error_code;
if ((job_ptr->details->req_node_bitmap) &&
(!bit_super_set(job_ptr->details->req_node_bitmap, bitmap)))
return error_code;
if (switch_record_cnt && switch_record_table) {
/* Perform optimized resource selection based upon topology */
return _job_test_topo(job_ptr, bitmap,
min_nodes, max_nodes, req_nodes);
}
consec_index = 0;
consec_size = 50; /* start allocation for 50 sets of
* consecutive nodes */
consec_cpus = xmalloc(sizeof(int) * consec_size);
consec_nodes = xmalloc(sizeof(int) * consec_size);
consec_start = xmalloc(sizeof(int) * consec_size);
consec_end = xmalloc(sizeof(int) * consec_size);
consec_req = xmalloc(sizeof(int) * consec_size);
/* Build table with information about sets of consecutive nodes */
consec_cpus[consec_index] = consec_nodes[consec_index] = 0;
consec_req[consec_index] = -1; /* no required nodes here by default */
rem_cpus = job_ptr->details->min_cpus;
if (req_nodes > min_nodes)
rem_nodes = req_nodes;
else
rem_nodes = min_nodes;
for (index = 0; index < select_node_cnt; index++) {
if (bit_test(bitmap, index)) {
if (consec_nodes[consec_index] == 0)
consec_start[consec_index] = index;
avail_cpus = _get_avail_cpus(job_ptr, index);
if (job_ptr->details->req_node_bitmap &&
(max_nodes > 0) &&
bit_test(job_ptr->details->req_node_bitmap,index)){
if (consec_req[consec_index] == -1) {
/* first required node in set */
consec_req[consec_index] = index;
}
rem_nodes--;
max_nodes--;
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(index);
} else { /* node not required (yet) */
bit_clear(bitmap, index);
consec_cpus[consec_index] += avail_cpus;
consec_nodes[consec_index]++;
}
} else if (consec_nodes[consec_index] == 0) {
consec_req[consec_index] = -1;
/* already picked up any required nodes */
/* re-use this record */
} else {
consec_end[consec_index] = index - 1;
if (++consec_index >= consec_size) {
consec_size *= 2;
xrealloc(consec_cpus,
sizeof(int) * consec_size);
xrealloc(consec_nodes,
sizeof(int) * consec_size);
xrealloc(consec_start,
sizeof(int) * consec_size);
xrealloc(consec_end,
sizeof(int) * consec_size);
xrealloc(consec_req,
sizeof(int) * consec_size);
}
consec_cpus[consec_index] = 0;
consec_nodes[consec_index] = 0;
consec_req[consec_index] = -1;
}
}
if (consec_nodes[consec_index] != 0)
consec_end[consec_index++] = index - 1;
#if SELECT_DEBUG
/* don't compile this, it slows things down too much */
debug3("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes);
for (i = 0; i < consec_index; i++) {
if (consec_req[i] != -1)
debug3("start=%s, end=%s, nodes=%d, cpus=%d, req=%s",
select_node_ptr[consec_start[i]].name,
select_node_ptr[consec_end[i]].name,
consec_nodes[i], consec_cpus[i],
select_node_ptr[consec_req[i]].name);
else
debug3("start=%s, end=%s, nodes=%d, cpus=%d",
select_node_ptr[consec_start[i]].name,
select_node_ptr[consec_end[i]].name,
consec_nodes[i], consec_cpus[i]);
}
#endif
/* accumulate nodes from these sets of consecutive nodes until */
/* sufficient resources have been accumulated */
while (consec_index && (max_nodes > 0)) {
best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
best_fit_req = -1; /* first required node, -1 if none */
for (i = 0; i < consec_index; i++) {
if (consec_nodes[i] == 0)
continue; /* no usable nodes here */
if (job_ptr->details->contiguous &&
job_ptr->details->req_node_bitmap &&
(consec_req[i] == -1))
continue; /* no required nodes here */
sufficient = (consec_cpus[i] >= rem_cpus) &&
_enough_nodes(consec_nodes[i], rem_nodes,
min_nodes, req_nodes);
/* if first possibility OR */
/* contains required nodes OR */
/* first set large enough for request OR */
/* tightest fit (less resource waste) OR */
/* nothing yet large enough, but this is biggest */
if ((best_fit_nodes == 0) ||
((best_fit_req == -1) && (consec_req[i] != -1)) ||
(sufficient && (best_fit_sufficient == 0)) ||
(sufficient && (consec_cpus[i] < best_fit_cpus)) ||
((sufficient == 0) &&
(consec_cpus[i] > best_fit_cpus))) {
best_fit_cpus = consec_cpus[i];
best_fit_nodes = consec_nodes[i];
best_fit_location = i;
best_fit_req = consec_req[i];
best_fit_sufficient = sufficient;
}
if (job_ptr->details->contiguous &&
job_ptr->details->req_node_bitmap) {
/* Must wait for all required nodes to be
* in a single consecutive block */
int j, other_blocks = 0;
for (j = (i+1); j < consec_index; j++) {
if (consec_req[j] != -1) {
other_blocks = 1;
break;
}
}
if (other_blocks) {
best_fit_nodes = 0;
break;
}
}
}
if (best_fit_nodes == 0)
break;
if (job_ptr->details->contiguous &&
((best_fit_cpus < rem_cpus) ||
(!_enough_nodes(best_fit_nodes, rem_nodes,
min_nodes, req_nodes))))
break; /* no hole large enough */
if (best_fit_req != -1) {
/* This collection of nodes includes required ones;
* select nodes from this set, first working up
* then down from the required nodes */
for (i = best_fit_req;
i <= consec_end[best_fit_location]; i++) {
if ((max_nodes <= 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0)))
break;
if (bit_test(bitmap, i))
continue;
bit_set(bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
}
for (i = (best_fit_req - 1);
i >= consec_start[best_fit_location]; i--) {
if ((max_nodes <= 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0)))
break;
if (bit_test(bitmap, i))
continue;
bit_set(bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
}
} else {
for (i = consec_start[best_fit_location];
i <= consec_end[best_fit_location]; i++) {
if ((max_nodes <= 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0)))
break;
if (bit_test(bitmap, i))
continue;
bit_set(bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
}
}
if (job_ptr->details->contiguous ||
((rem_nodes <= 0) && (rem_cpus <= 0))) {
error_code = SLURM_SUCCESS;
break;
}
consec_cpus[best_fit_location] = 0;
consec_nodes[best_fit_location] = 0;
}
if (error_code && (rem_cpus <= 0) &&
_enough_nodes(0, rem_nodes, min_nodes, req_nodes)) {
error_code = SLURM_SUCCESS;
}
if (error_code == SLURM_SUCCESS) {
/* job's total_cpus is needed for SELECT_MODE_WILL_RUN */
job_ptr->total_cpus = total_cpus;
}
xfree(consec_cpus);
xfree(consec_nodes);
xfree(consec_start);
xfree(consec_end);
xfree(consec_req);
return error_code;
}
/*
* _job_test_topo - A topology aware version of _job_test()
* NOTE: The logic here is almost identical to that of _eval_nodes_topo() in
* select/cons_res/job_test.c. Any bug found here is probably also there.
*/
static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes)
{
bitstr_t **switches_bitmap; /* nodes on this switch */
int *switches_cpu_cnt; /* total CPUs on switch */
int *switches_node_cnt; /* total nodes on switch */
int *switches_required; /* set if has required node */
bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */
bitstr_t *req_nodes_bitmap = NULL;
int rem_cpus, rem_nodes; /* remaining resources desired */
int avail_cpus, alloc_cpus = 0, total_cpus = 0;
int i, j, rc = SLURM_SUCCESS;
int best_fit_inx, first, last;
int best_fit_nodes, best_fit_cpus;
int best_fit_location = 0, best_fit_sufficient;
bool sufficient;
rem_cpus = job_ptr->details->min_cpus;
if (req_nodes > min_nodes)
rem_nodes = req_nodes;
else
rem_nodes = min_nodes;
if (job_ptr->details->req_node_bitmap) {
req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap);
i = bit_set_count(req_nodes_bitmap);
if (i > max_nodes) {
info("job %u requires more nodes than currently "
"available (%u>%u)",
job_ptr->job_id, i, max_nodes);
rc = EINVAL;
goto fini;
}
}
/* Construct a set of switch array entries,
* use the same indexes as switch_record_table in slurmctld */
switches_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt);
switches_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt);
switches_node_cnt = xmalloc(sizeof(int) * switch_record_cnt);
switches_required = xmalloc(sizeof(int) * switch_record_cnt);
avail_nodes_bitmap = bit_alloc(node_record_count);
for (i=0; i<switch_record_cnt; i++) {
switches_bitmap[i] = bit_copy(switch_record_table[i].
node_bitmap);
bit_and(switches_bitmap[i], bitmap);
bit_or(avail_nodes_bitmap, switches_bitmap[i]);
switches_node_cnt[i] = bit_set_count(switches_bitmap[i]);
if (req_nodes_bitmap &&
bit_overlap(req_nodes_bitmap, switches_bitmap[i])) {
switches_required[i] = 1;
}
}
bit_nclear(bitmap, 0, node_record_count - 1);
#if SELECT_DEBUG
/* Don't compile this, it slows things down too much */
for (i=0; i<switch_record_cnt; i++) {
char *node_names = NULL;
if (switches_node_cnt[i])
node_names = bitmap2node_name(switches_bitmap[i]);
debug("switch=%s nodes=%u:%s required:%u speed=%u",
switch_record_table[i].name,
switches_node_cnt[i], node_names,
switches_required[i],
switch_record_table[i].link_speed);
xfree(node_names);
}
#endif
if (req_nodes_bitmap &&
(!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) {
info("job %u requires nodes not available on any switch",
job_ptr->job_id);
rc = EINVAL;
goto fini;
}
if (req_nodes_bitmap) {
/* Accumulate specific required resources, if any */
first = bit_ffs(req_nodes_bitmap);
last = bit_fls(req_nodes_bitmap);
for (i=first; ((i<=last) && (first>=0)); i++) {
if (!bit_test(req_nodes_bitmap, i))
continue;
if (max_nodes <= 0) {
info("job %u requires nodes than allowed",
job_ptr->job_id);
rc = EINVAL;
goto fini;
}
bit_set(bitmap, i);
bit_clear(avail_nodes_bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
for (j=0; j<switch_record_cnt; j++) {
if (!bit_test(switches_bitmap[j], i))
continue;
bit_clear(switches_bitmap[j], i);
switches_node_cnt[j]--;
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0))
goto fini;
/* Accumulate additional resources from leafs that
* contain required nodes */
for (j=0; j<switch_record_cnt; j++) {
if ((switch_record_table[j].level != 0) ||
(switches_node_cnt[j] == 0) ||
(switches_required[j] == 0)) {
continue;
}
while ((max_nodes > 0) &&
((rem_nodes > 0) || (rem_cpus > 0))) {
i = bit_ffs(switches_bitmap[j]);
if (i == -1)
break;
bit_clear(switches_bitmap[j], i);
switches_node_cnt[j]--;
if (bit_test(bitmap, i)) {
/* node on multiple leaf switches
* and already selected */
continue;
}
bit_set(bitmap, i);
bit_clear(avail_nodes_bitmap, i);
rem_nodes--;
max_nodes--;
avail_cpus = _get_avail_cpus(job_ptr, i);
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0))
goto fini;
/* Update bitmaps and node counts for higher-level switches */
for (j=0; j<switch_record_cnt; j++) {
if (switches_node_cnt[j] == 0)
continue;
first = bit_ffs(switches_bitmap[j]);
if (first < 0)
continue;
last = bit_fls(switches_bitmap[j]);
for (i=first; i<=last; i++) {
if (!bit_test(switches_bitmap[j], i))
continue;
if (!bit_test(avail_nodes_bitmap, i)) {
/* cleared from lower level */
bit_clear(switches_bitmap[j], i);
switches_node_cnt[j]--;
} else {
switches_cpu_cnt[j] +=
_get_avail_cpus(job_ptr, i);
}
}
}
} else {
/* No specific required nodes, calculate CPU counts */
for (j=0; j<switch_record_cnt; j++) {
first = bit_ffs(switches_bitmap[j]);
if (first < 0)
continue;
last = bit_fls(switches_bitmap[j]);
for (i=first; i<=last; i++) {
if (!bit_test(switches_bitmap[j], i))
continue;
switches_cpu_cnt[j] +=
_get_avail_cpus(job_ptr, i);
}
}
}
/* Determine lowest level switch satisfying request with best fit */
best_fit_inx = -1;
for (j=0; j<switch_record_cnt; j++) {
if ((switches_cpu_cnt[j] < rem_cpus) ||
(!_enough_nodes(switches_node_cnt[j], rem_nodes,
min_nodes, req_nodes)))
continue;
if ((best_fit_inx == -1) ||
(switch_record_table[j].level <
switch_record_table[best_fit_inx].level) ||
((switch_record_table[j].level ==
switch_record_table[best_fit_inx].level) &&
(switches_node_cnt[j] < switches_node_cnt[best_fit_inx])))
best_fit_inx = j;
}
if (best_fit_inx == -1) {
debug("_job_test_topo: could not find resources for job %u",
job_ptr->job_id);
rc = EINVAL;
goto fini;
}
bit_and(avail_nodes_bitmap, switches_bitmap[best_fit_inx]);
/* Identify usable leafs (within higher switch having best fit) */
for (j=0; j<switch_record_cnt; j++) {
if ((switch_record_table[j].level != 0) ||
(!bit_super_set(switches_bitmap[j],
switches_bitmap[best_fit_inx]))) {
switches_node_cnt[j] = 0;
}
}
/* Select resources from these leafs on a best-fit basis */
while ((max_nodes > 0) && ((rem_nodes > 0) || (rem_cpus > 0))) {
best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
for (j=0; j<switch_record_cnt; j++) {
if (switches_node_cnt[j] == 0)
continue;
sufficient = (switches_cpu_cnt[j] >= rem_cpus) &&
_enough_nodes(switches_node_cnt[j],
rem_nodes, min_nodes,
req_nodes);
/* If first possibility OR */
/* first set large enough for request OR */
/* tightest fit (less resource waste) OR */
/* nothing yet large enough, but this is biggest */
if ((best_fit_nodes == 0) ||
(sufficient && (best_fit_sufficient == 0)) ||
(sufficient &&
(switches_cpu_cnt[j] < best_fit_cpus)) ||
((sufficient == 0) &&
(switches_cpu_cnt[j] > best_fit_cpus))) {
best_fit_cpus = switches_cpu_cnt[j];
best_fit_nodes = switches_node_cnt[j];
best_fit_location = j;
best_fit_sufficient = sufficient;
}
}
if (best_fit_nodes == 0)
break;
/* Select the nodes to use from this leaf */
first = bit_ffs(switches_bitmap[best_fit_location]);
last = bit_fls(switches_bitmap[best_fit_location]);
for (i=first; ((i<=last) && (first>=0)); i++) {
if (!bit_test(switches_bitmap[best_fit_location], i))
continue;
bit_clear(switches_bitmap[best_fit_location], i);
switches_node_cnt[best_fit_location]--;
avail_cpus = _get_avail_cpus(job_ptr, i);
switches_cpu_cnt[best_fit_location] -= avail_cpus;
if (bit_test(bitmap, i)) {
/* node on multiple leaf switches
* and already selected */
continue;
}
bit_set(bitmap, i);
rem_nodes--;
max_nodes--;
rem_cpus -= avail_cpus;
alloc_cpus += avail_cpus;
total_cpus += _get_total_cpus(i);
if ((max_nodes <= 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0)))
break;
}
switches_node_cnt[best_fit_location] = 0;
}
if ((rem_cpus <= 0) &&
_enough_nodes(0, rem_nodes, min_nodes, req_nodes)) {
rc = SLURM_SUCCESS;
} else
rc = EINVAL;
fini: if (rc == SLURM_SUCCESS) {
/* Job's total_cpus is needed for SELECT_MODE_WILL_RUN */
job_ptr->total_cpus = total_cpus;
}
FREE_NULL_BITMAP(avail_nodes_bitmap);
FREE_NULL_BITMAP(req_nodes_bitmap);
for (i=0; i<switch_record_cnt; i++)
FREE_NULL_BITMAP(switches_bitmap[i]);
xfree(switches_bitmap);
xfree(switches_cpu_cnt);
xfree(switches_node_cnt);
xfree(switches_required);
return rc;
}
/*
* deallocate resources that were assigned to this job
*
* if remove_all = false: the job has been suspended, so just deallocate CPUs
* if remove_all = true: deallocate all resources
*/
static int _rm_job_from_nodes(struct cr_record *cr_ptr,
struct job_record *job_ptr, char *pre_err,
bool remove_all)
{
int i, i_first, i_last, node_offset, rc = SLURM_SUCCESS;
struct part_cr_record *part_cr_ptr;
job_resources_t *job_resrcs_ptr;
uint32_t job_memory, job_memory_cpu = 0, job_memory_node = 0;
bool exclusive, is_job_running;
uint16_t cpu_cnt;
struct node_record *node_ptr;
List gres_list;
if (cr_ptr == NULL) {
error("%s: cr_ptr not initialized", pre_err);
return SLURM_ERROR;
}
if (_rem_tot_job(cr_ptr, job_ptr->job_id) == 0) {
info("select/linear: job %u has no resources allocated",
job_ptr->job_id);
return SLURM_ERROR;
}
if (remove_all && job_ptr->details &&
job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->pn_min_memory &
(~MEM_PER_CPU);
} else
job_memory_node = job_ptr->details->pn_min_memory;
}
if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) {
error("job %u lacks a job_resources struct", job_ptr->job_id);
return SLURM_ERROR;
}
is_job_running = _rem_run_job(cr_ptr, job_ptr->job_id);
exclusive = (job_ptr->details->shared == 0);
i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
i_last = bit_fls(job_resrcs_ptr->node_bitmap);
if (i_first == -1) /* job has no nodes */
i_last = -2;
node_offset = -1;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
continue;
node_offset++;
if (!bit_test(job_ptr->node_bitmap, i))
continue;
node_ptr = node_record_table_ptr + i;
if (select_fast_schedule)
cpu_cnt = node_ptr->config_ptr->cpus;
else
cpu_cnt = node_ptr->cpus;
if (job_memory_cpu)
job_memory = job_memory_cpu * cpu_cnt;
else
job_memory = job_memory_node;
if (cr_ptr->nodes[i].alloc_memory >= job_memory)
cr_ptr->nodes[i].alloc_memory -= job_memory;
else {
/* This can be the result of FastSchedule=0 and
* the node being configured with fewer CPUs than
* actually exist. The job allocation set when
* slurmctld restarts may be based upon a lower CPU
* count than when the job gets deallocated. */
if (select_fast_schedule ||
(node_ptr->config_ptr->cpus == node_ptr->cpus)) {
error("%s: memory underflow for node %s",
pre_err, node_ptr->name);
} else {
debug("%s: memory underflow for node %s",
pre_err, node_ptr->name);
}
cr_ptr->nodes[i].alloc_memory = 0;
}
if (remove_all) {
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
gres_plugin_job_dealloc(job_ptr->gres_list, gres_list,
node_offset, job_ptr->job_id,
node_ptr->name);
gres_plugin_node_state_log(gres_list, node_ptr->name);
}
if (exclusive) {
if (cr_ptr->nodes[i].exclusive_cnt)
cr_ptr->nodes[i].exclusive_cnt--;
else {
error("%s: exclusive_cnt underflow for "
"node %s", pre_err, node_ptr->name);
}
}
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
part_cr_ptr = part_cr_ptr->next;
continue;
}
if (!is_job_running)
/* cancelled job already suspended */;
else if (part_cr_ptr->run_job_cnt > 0)
part_cr_ptr->run_job_cnt--;
else {
error("%s: run_job_cnt underflow for node %s",
pre_err, node_ptr->name);
}
if (remove_all) {
if (part_cr_ptr->tot_job_cnt > 0)
part_cr_ptr->tot_job_cnt--;
else {
error("%s: tot_job_cnt underflow "
"for node %s",
pre_err, node_ptr->name);
}
if ((part_cr_ptr->tot_job_cnt == 0) &&
(part_cr_ptr->run_job_cnt)) {
part_cr_ptr->run_job_cnt = 0;
error("%s: run_job_cnt out of sync "
"for node %s",
pre_err, node_ptr->name);
}
}
break;
}
if (part_cr_ptr == NULL) {
if (job_ptr->part_nodes_missing) {
;
} else if (job_ptr->part_ptr) {
info("%s: job %u and its partition %s "
"no longer contain node %s",
pre_err, job_ptr->job_id,
job_ptr->partition, node_ptr->name);
} else {
info("%s: job %u has no pointer to partition "
"%s and node %s",
pre_err, job_ptr->job_id,
job_ptr->partition, node_ptr->name);
}
job_ptr->part_nodes_missing = true;
rc = SLURM_ERROR;
}
}
return rc;
}
/*
* deallocate resources that were assigned to this job on one node
*/
static int _rm_job_from_one_node(struct job_record *job_ptr,
struct node_record *node_ptr, char *pre_err)
{
int i, node_inx, node_offset, rc = SLURM_SUCCESS;
struct part_cr_record *part_cr_ptr;
job_resources_t *job_resrcs_ptr;
uint32_t job_memory, job_memory_cpu = 0, job_memory_node = 0;
bool exclusive, is_job_running;
int first_bit, last_bit;
uint16_t cpu_cnt;
List gres_list;
if (cr_ptr == NULL) {
error("%s: cr_ptr not initialized", pre_err);
return SLURM_ERROR;
}
if (_test_tot_job(cr_ptr, job_ptr->job_id) == 0) {
info("select/linear: job %u has no resources allocated",
job_ptr->job_id);
return SLURM_ERROR;
}
if (job_ptr->details &&
job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->pn_min_memory &
(~MEM_PER_CPU);
} else
job_memory_node = job_ptr->details->pn_min_memory;
}
if ((job_ptr->job_resrcs == NULL) ||
(job_ptr->job_resrcs->cpus == NULL)) {
error("job %u lacks a job_resources struct", job_ptr->job_id);
return SLURM_ERROR;
}
job_resrcs_ptr = job_ptr->job_resrcs;
node_inx = node_ptr - node_record_table_ptr;
if (!bit_test(job_resrcs_ptr->node_bitmap, node_inx)) {
error("job %u allocated nodes (%s) which have been removed "
"from slurm.conf",
job_ptr->job_id, node_ptr->name);
return SLURM_ERROR;
}
first_bit = bit_ffs(job_resrcs_ptr->node_bitmap);
last_bit = node_inx;
node_offset = -1;
for (i = first_bit; i <= node_inx; i++) {
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
continue;
node_offset++;
}
if (job_resrcs_ptr->cpus[node_offset] == 0) {
error("duplicate relinquish of node %s by job %u",
node_ptr->name, job_ptr->job_id);
return SLURM_ERROR;
}
job_resrcs_ptr->cpus[node_offset] = 0;
build_job_resources_cpu_array(job_resrcs_ptr);
is_job_running = _test_run_job(cr_ptr, job_ptr->job_id);
if (select_fast_schedule)
cpu_cnt = node_ptr->config_ptr->cpus;
else
cpu_cnt = node_ptr->cpus;
if (job_memory_cpu)
job_memory = job_memory_cpu * cpu_cnt;
else
job_memory = job_memory_node;
if (cr_ptr->nodes[node_inx].alloc_memory >= job_memory)
cr_ptr->nodes[node_inx].alloc_memory -= job_memory;
else {
cr_ptr->nodes[node_inx].alloc_memory = 0;
error("%s: memory underflow for node %s",
pre_err, node_ptr->name);
}
if (cr_ptr->nodes[node_inx].gres_list)
gres_list = cr_ptr->nodes[node_inx].gres_list;
else
gres_list = node_ptr->gres_list;
gres_plugin_job_dealloc(job_ptr->gres_list, gres_list, node_offset,
job_ptr->job_id, node_ptr->name);
gres_plugin_node_state_log(gres_list, node_ptr->name);
exclusive = (job_ptr->details->shared == 0);
if (exclusive) {
if (cr_ptr->nodes[node_inx].exclusive_cnt)
cr_ptr->nodes[node_inx].exclusive_cnt--;
else {
error("%s: exclusive_cnt underflow for node %s",
pre_err, node_ptr->name);
}
}
part_cr_ptr = cr_ptr->nodes[node_inx].parts;
while (part_cr_ptr) {
if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
part_cr_ptr = part_cr_ptr->next;
continue;
}
if (!is_job_running)
/* cancelled job already suspended */;
else if (part_cr_ptr->run_job_cnt > 0)
part_cr_ptr->run_job_cnt--;
else {
error("%s: run_job_cnt underflow for node %s",
pre_err, node_ptr->name);
}
if (part_cr_ptr->tot_job_cnt > 0)
part_cr_ptr->tot_job_cnt--;
else {
error("%s: tot_job_cnt underflow for node %s",
pre_err, node_ptr->name);
}
if ((part_cr_ptr->tot_job_cnt == 0) &&
(part_cr_ptr->run_job_cnt)) {
part_cr_ptr->run_job_cnt = 0;
error("%s: run_job_cnt out of sync for node %s",
pre_err, node_ptr->name);
}
break;
}
if (part_cr_ptr == NULL) {
if (job_ptr->part_ptr) {
error("%s: Could not find partition %s for node %s",
pre_err, job_ptr->part_ptr->name, node_ptr->name);
} else {
error("%s: no partition ptr given for job %u and node %s",
pre_err, job_ptr->job_id, node_ptr->name);
}
rc = SLURM_ERROR;
}
return rc;
}
/*
* allocate resources to the given job
*
* if alloc_all = 0: the job has been suspended, so just re-allocate CPUs
* if alloc_all = 1: allocate all resources (CPUs and memory)
*/
static int _add_job_to_nodes(struct cr_record *cr_ptr,
struct job_record *job_ptr, char *pre_err,
int alloc_all)
{
int i, i_first, i_last, node_cnt, node_offset, rc = SLURM_SUCCESS;
bool exclusive;
struct part_cr_record *part_cr_ptr;
job_resources_t *job_resrcs_ptr;
uint32_t job_memory_cpu = 0, job_memory_node = 0;
uint16_t cpu_cnt;
struct node_record *node_ptr;
List gres_list;
if (cr_ptr == NULL) {
error("%s: cr_ptr not initialized", pre_err);
return SLURM_ERROR;
}
if (alloc_all && job_ptr->details &&
job_ptr->details->pn_min_memory && (cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->pn_min_memory &
(~MEM_PER_CPU);
} else
job_memory_node = job_ptr->details->pn_min_memory;
}
if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) {
error("job %u lacks a job_resources struct", job_ptr->job_id);
return SLURM_ERROR;
}
exclusive = (job_ptr->details->shared == 0);
if (alloc_all)
_add_run_job(cr_ptr, job_ptr->job_id);
_add_tot_job(cr_ptr, job_ptr->job_id);
i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
i_last = bit_fls(job_resrcs_ptr->node_bitmap);
node_cnt = bit_set_count(job_resrcs_ptr->node_bitmap);
if (i_first == -1) /* job has no nodes */
i_last = -2;
node_offset = -1;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
continue;
node_offset++;
if (!bit_test(job_ptr->node_bitmap, i))
continue;
node_ptr = node_record_table_ptr + i;
if (select_fast_schedule)
cpu_cnt = node_ptr->config_ptr->cpus;
else
cpu_cnt = node_ptr->cpus;
if (job_memory_cpu) {
cr_ptr->nodes[i].alloc_memory += job_memory_cpu *
cpu_cnt;
} else
cr_ptr->nodes[i].alloc_memory += job_memory_node;
if (alloc_all) {
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
gres_plugin_job_alloc(job_ptr->gres_list, gres_list,
node_cnt, node_offset, cpu_cnt,
job_ptr->job_id, node_ptr->name);
gres_plugin_node_state_log(gres_list, node_ptr->name);
}
if (exclusive)
cr_ptr->nodes[i].exclusive_cnt++;
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
if (part_cr_ptr->part_ptr != job_ptr->part_ptr) {
part_cr_ptr = part_cr_ptr->next;
continue;
}
if (alloc_all)
part_cr_ptr->run_job_cnt++;
part_cr_ptr->tot_job_cnt++;
break;
}
if (part_cr_ptr == NULL) {
info("%s: job %u could not find partition %s for "
"node %s",
pre_err, job_ptr->job_id, job_ptr->partition,
node_ptr->name);
job_ptr->part_nodes_missing = true;
rc = SLURM_ERROR;
}
}
return rc;
}
static void _free_cr(struct cr_record *cr_ptr)
{
int i;
struct part_cr_record *part_cr_ptr1, *part_cr_ptr2;
if (cr_ptr == NULL)
return;
for (i = 0; i < select_node_cnt; i++) {
part_cr_ptr1 = cr_ptr->nodes[i].parts;
while (part_cr_ptr1) {
part_cr_ptr2 = part_cr_ptr1->next;
xfree(part_cr_ptr1);
part_cr_ptr1 = part_cr_ptr2;
}
if (cr_ptr->nodes[i].gres_list)
list_destroy(cr_ptr->nodes[i].gres_list);
}
xfree(cr_ptr->nodes);
xfree(cr_ptr->run_job_ids);
xfree(cr_ptr->tot_job_ids);
xfree(cr_ptr);
}
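/* Log the contents of the consumable resource record: running and
* allocated job ids plus each node's allocated memory, exclusive count,
* per-partition job counts and GRES state. Produces output only when
* compiled with SELECT_DEBUG set to a non-zero value. */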
static void _dump_node_cr(struct cr_record *cr_ptr)
{
#if SELECT_DEBUG
int i;
struct part_cr_record *part_cr_ptr;
struct node_record *node_ptr;
List gres_list;
if ((cr_ptr == NULL) || (cr_ptr->nodes == NULL))
return;
for (i = 0; i < cr_ptr->run_job_len; i++) {
if (cr_ptr->run_job_ids[i])
info("Running job:%u", cr_ptr->run_job_ids[i]);
}
for (i = 0; i < cr_ptr->tot_job_len; i++) {
if (cr_ptr->tot_job_ids[i])
info("Alloc job:%u", cr_ptr->tot_job_ids[i]);
}
for (i = 0; i < select_node_cnt; i++) {
node_ptr = node_record_table_ptr + i;
info("Node:%s exclusive_cnt:%u alloc_mem:%u",
node_ptr->name, cr_ptr->nodes[i].exclusive_cnt,
cr_ptr->nodes[i].alloc_memory);
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
info(" Part:%s run:%u tot:%u",
part_cr_ptr->part_ptr->name,
part_cr_ptr->run_job_cnt,
part_cr_ptr->tot_job_cnt);
part_cr_ptr = part_cr_ptr->next;
}
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
if (gres_list)
gres_plugin_node_state_log(gres_list, node_ptr->name);
}
#endif
}
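/* Duplicate a cr_record, including each node's partition records and
* GRES state. Used to build a scratch copy of the allocation state,
* e.g. for simulating preemption in _run_now(). */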
static struct cr_record *_dup_cr(struct cr_record *cr_ptr)
{
int i;
struct cr_record *new_cr_ptr;
struct part_cr_record *part_cr_ptr, *new_part_cr_ptr;
struct node_record *node_ptr;
List gres_list;
if (cr_ptr == NULL)
return NULL;
new_cr_ptr = xmalloc(sizeof(struct cr_record));
new_cr_ptr->run_job_len = cr_ptr->run_job_len;
i = sizeof(uint32_t) * cr_ptr->run_job_len;
new_cr_ptr->run_job_ids = xmalloc(i);
memcpy(new_cr_ptr->run_job_ids, cr_ptr->run_job_ids, i);
new_cr_ptr->tot_job_len = cr_ptr->tot_job_len;
i = sizeof(uint32_t) * cr_ptr->tot_job_len;
new_cr_ptr->tot_job_ids = xmalloc(i);
memcpy(new_cr_ptr->tot_job_ids, cr_ptr->tot_job_ids, i);
new_cr_ptr->nodes = xmalloc(select_node_cnt *
sizeof(struct node_cr_record));
for (i = 0; i < select_node_cnt; i++) {
node_ptr = node_record_table_ptr + i;
new_cr_ptr->nodes[i].alloc_memory = cr_ptr->nodes[i].
alloc_memory;
new_cr_ptr->nodes[i].exclusive_cnt = cr_ptr->nodes[i].
exclusive_cnt;
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
new_part_cr_ptr =
xmalloc(sizeof(struct part_cr_record));
new_part_cr_ptr->part_ptr = part_cr_ptr->part_ptr;
new_part_cr_ptr->run_job_cnt = part_cr_ptr->run_job_cnt;
new_part_cr_ptr->tot_job_cnt = part_cr_ptr->tot_job_cnt;
new_part_cr_ptr->next = new_cr_ptr->nodes[i].
parts;
new_cr_ptr->nodes[i].parts = new_part_cr_ptr;
part_cr_ptr = part_cr_ptr->next;
}
if (cr_ptr->nodes[i].gres_list)
gres_list = cr_ptr->nodes[i].gres_list;
else
gres_list = node_ptr->gres_list;
new_cr_ptr->nodes[i].gres_list =
gres_plugin_node_state_dup(gres_list);
}
return new_cr_ptr;
}
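/* Initialize the global cr_ptr record: build per-node partition records
* from part_list, then replay every running and suspended job from
* job_list to restore memory, GRES, exclusive and per-partition job
* count state. */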
static void _init_node_cr(void)
{
struct part_record *part_ptr;
struct part_cr_record *part_cr_ptr;
job_resources_t *job_resrcs_ptr;
struct node_record *node_ptr;
ListIterator part_iterator;
struct job_record *job_ptr;
ListIterator job_iterator;
uint32_t job_memory_cpu, job_memory_node;
int exclusive, i, i_first, i_last, node_offset;
if (cr_ptr)
return;
cr_ptr = xmalloc(sizeof(struct cr_record));
cr_ptr->nodes = xmalloc(select_node_cnt
* sizeof(struct node_cr_record));
/* build partition records */
part_iterator = list_iterator_create(part_list);
while ((part_ptr = (struct part_record *) list_next(part_iterator))) {
for (i = 0; i < select_node_cnt; i++) {
if (part_ptr->node_bitmap == NULL)
break;
if (!bit_test(part_ptr->node_bitmap, i))
continue;
part_cr_ptr = xmalloc(sizeof(struct part_cr_record));
part_cr_ptr->next = cr_ptr->nodes[i].parts;
part_cr_ptr->part_ptr = part_ptr;
cr_ptr->nodes[i].parts = part_cr_ptr;
}
}
list_iterator_destroy(part_iterator);
/* Clear existing node Gres allocations */
for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
i++, node_ptr++) {
gres_plugin_node_state_dealloc_all(node_ptr->gres_list);
}
/* record running and suspended jobs in node_cr_records */
job_iterator = list_iterator_create(job_list);
while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))
continue;
if ((job_resrcs_ptr = job_ptr->job_resrcs) == NULL) {
error("job %u lacks a job_resources struct",
job_ptr->job_id);
continue;
}
if (IS_JOB_RUNNING(job_ptr) ||
(IS_JOB_SUSPENDED(job_ptr) && (job_ptr->priority != 0)))
_add_run_job(cr_ptr, job_ptr->job_id);
_add_tot_job(cr_ptr, job_ptr->job_id);
job_memory_cpu = 0;
job_memory_node = 0;
if (job_ptr->details && job_ptr->details->pn_min_memory &&
(cr_type == CR_MEMORY)) {
if (job_ptr->details->pn_min_memory & MEM_PER_CPU) {
job_memory_cpu = job_ptr->details->
pn_min_memory &
(~MEM_PER_CPU);
} else {
job_memory_node = job_ptr->details->
pn_min_memory;
}
}
/* Use job_resrcs_ptr->node_bitmap rather than
* job_ptr->node_bitmap which can have DOWN nodes
* cleared from the bitmap */
if (job_resrcs_ptr->node_bitmap == NULL)
continue;
exclusive = (job_ptr->details->shared == 0);
node_offset = -1;
i_first = bit_ffs(job_resrcs_ptr->node_bitmap);
i_last = bit_fls(job_resrcs_ptr->node_bitmap);
if (i_first == -1)
i_last = -2;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_resrcs_ptr->node_bitmap, i))
continue;
node_offset++;
node_ptr = node_record_table_ptr + i;
if (exclusive)
cr_ptr->nodes[i].exclusive_cnt++;
if (job_memory_cpu == 0) {
cr_ptr->nodes[i].alloc_memory +=
job_memory_node;
} else if (select_fast_schedule) {
cr_ptr->nodes[i].alloc_memory +=
job_memory_cpu *
node_record_table_ptr[i].
config_ptr->cpus;
} else {
cr_ptr->nodes[i].alloc_memory +=
job_memory_cpu *
node_record_table_ptr[i].cpus;
}
if (bit_test(job_ptr->node_bitmap, i)) {
gres_plugin_job_alloc(job_ptr->gres_list,
node_ptr->gres_list,
job_resrcs_ptr->nhosts,
node_offset,
job_resrcs_ptr->
cpus[node_offset],
job_ptr->job_id,
node_ptr->name);
}
part_cr_ptr = cr_ptr->nodes[i].parts;
while (part_cr_ptr) {
if (part_cr_ptr->part_ptr !=
job_ptr->part_ptr) {
part_cr_ptr = part_cr_ptr->next;
continue;
}
if (IS_JOB_RUNNING(job_ptr) ||
(IS_JOB_SUSPENDED(job_ptr) &&
(job_ptr->priority != 0))) {
/* Running or being gang scheduled */
part_cr_ptr->run_job_cnt++;
}
part_cr_ptr->tot_job_cnt++;
break;
}
if (part_cr_ptr == NULL) {
info("_init_node_cr: job %u could not find "
"partition %s for node %s",
job_ptr->job_id, job_ptr->partition,
node_ptr->name);
job_ptr->part_nodes_missing = true;
}
}
}
list_iterator_destroy(job_iterator);
_dump_node_cr(cr_ptr);
}
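/* List find function: return 1 if the list entry x is the job record
* passed as key, otherwise 0 */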
static int _find_job (void *x, void *key)
{
struct job_record *job_ptr = (struct job_record *) x;
if (job_ptr == (struct job_record *) key)
return 1;
return 0;
}
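/* Return true if job_ptr appears in the list of preemptable job
* candidates */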
static bool _is_preemptable(struct job_record *job_ptr,
List preemptee_candidates)
{
if (!preemptee_candidates)
return false;
if (list_find_first(preemptee_candidates, _find_job, job_ptr))
return true;
return false;
}
/* Determine if a job can ever run */
static int _test_only(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, int max_share)
{
bitstr_t *orig_map;
int i, rc = SLURM_ERROR;
uint32_t save_mem;
orig_map = bit_copy(bitmap);
if (!orig_map)
fatal("bit_copy: malloc failure");
/* Try to run with currently available nodes */
i = _job_count_bitmap(cr_ptr, job_ptr, orig_map, bitmap,
NO_SHARE_LIMIT, NO_SHARE_LIMIT,
SELECT_MODE_TEST_ONLY);
if (i >= min_nodes) {
save_mem = job_ptr->details->pn_min_memory;
job_ptr->details->pn_min_memory = 0;
rc = _job_test(job_ptr, bitmap, min_nodes,
max_nodes, req_nodes);
job_ptr->details->pn_min_memory = save_mem;
}
FREE_NULL_BITMAP(orig_map);
return rc;
}
/* Allocate resources for a job now, if possible */
static int _run_now(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
int max_share, uint32_t req_nodes,
List preemptee_candidates,
List *preemptee_job_list)
{
bitstr_t *orig_map;
int max_run_job, j, sus_jobs, rc = EINVAL, prev_cnt = -1;
struct job_record *tmp_job_ptr;
ListIterator job_iterator, preemptee_iterator;
struct cr_record *exp_cr;
orig_map = bit_copy(bitmap);
if (!orig_map)
fatal("bit_copy: malloc failure");
for (max_run_job=0; ((max_run_job<max_share) && (rc != SLURM_SUCCESS));
max_run_job++) {
bool last_iteration = (max_run_job == (max_share - 1));
		/* Up to two passes over the suspended-job limit
		 * (sus_jobs = 0, then 4); on the last max_run_job
		 * iteration the limit is raised to NO_SHARE_LIMIT below */
		for (sus_jobs=0; ((sus_jobs<5) && (rc != SLURM_SUCCESS));
		     sus_jobs+=4) {
if (last_iteration)
sus_jobs = NO_SHARE_LIMIT;
j = _job_count_bitmap(cr_ptr, job_ptr,
orig_map, bitmap,
max_run_job,
max_run_job + sus_jobs,
SELECT_MODE_RUN_NOW);
#if SELECT_DEBUG
{
char *node_list = bitmap2node_name(bitmap);
info("_run_job %u iter:%d cnt:%d nodes:%s",
job_ptr->job_id, max_run_job, j,
node_list);
xfree(node_list);
}
#endif
if ((j == prev_cnt) || (j < min_nodes))
continue;
prev_cnt = j;
if (max_run_job > 0) {
				/* We need to share. Try to find a
				 * suitable job to share nodes with */
rc = _find_job_mate(job_ptr, bitmap,
min_nodes,
max_nodes, req_nodes);
if (rc == SLURM_SUCCESS)
break;
}
rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes);
}
}
if ((rc != SLURM_SUCCESS) && preemptee_candidates &&
(exp_cr = _dup_cr(cr_ptr))) {
/* Remove all preemptable jobs from simulated environment */
job_iterator = list_iterator_create(job_list);
while ((tmp_job_ptr = (struct job_record *)
list_next(job_iterator))) {
if (!IS_JOB_RUNNING(tmp_job_ptr) &&
!IS_JOB_SUSPENDED(tmp_job_ptr))
continue;
if (_is_preemptable(tmp_job_ptr,
preemptee_candidates)) {
bool remove_all = false;
uint16_t mode;
mode = slurm_job_preempt_mode(tmp_job_ptr);
if ((mode == PREEMPT_MODE_REQUEUE) ||
(mode == PREEMPT_MODE_CHECKPOINT) ||
(mode == PREEMPT_MODE_CANCEL))
remove_all = true;
/* Remove preemptable job now */
_rm_job_from_nodes(exp_cr, tmp_job_ptr,
"_run_now",
remove_all);
j = _job_count_bitmap(exp_cr, job_ptr,
orig_map, bitmap,
(max_share - 1),
NO_SHARE_LIMIT,
SELECT_MODE_RUN_NOW);
if (j < min_nodes)
continue;
rc = _job_test(job_ptr, bitmap, min_nodes,
max_nodes, req_nodes);
if (rc == SLURM_SUCCESS)
break;
}
}
list_iterator_destroy(job_iterator);
if ((rc == SLURM_SUCCESS) && preemptee_job_list &&
preemptee_candidates) {
/* Build list of preemptee jobs whose resources are
* actually used */
if (*preemptee_job_list == NULL) {
*preemptee_job_list = list_create(NULL);
if (*preemptee_job_list == NULL)
fatal("list_create malloc failure");
}
preemptee_iterator = list_iterator_create(
preemptee_candidates);
while ((tmp_job_ptr = (struct job_record *)
list_next(preemptee_iterator))) {
if (bit_overlap(bitmap,
tmp_job_ptr->node_bitmap) == 0)
continue;
list_append(*preemptee_job_list,
tmp_job_ptr);
}
list_iterator_destroy(preemptee_iterator);
}
_free_cr(exp_cr);
}
if (rc == SLURM_SUCCESS)
_build_select_struct(job_ptr, bitmap);
FREE_NULL_BITMAP(orig_map);
return rc;
}
/* Determine where and when the job at job_ptr can begin execution.
 * A scratch cr_record structure is updated to reflect each running job
 * terminating at the end of its time limit, and the pending job is
 * tested against it after each removal. Used by SLURM's sched/backfill
 * plugin and Moab. */
static int _will_run_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
int max_share, uint32_t req_nodes,
List preemptee_candidates,
List *preemptee_job_list)
{
struct cr_record *exp_cr;
struct job_record *tmp_job_ptr;
List cr_job_list;
ListIterator job_iterator, preemptee_iterator;
bitstr_t *orig_map;
int i, max_run_jobs, rc = SLURM_ERROR;
time_t now = time(NULL);
max_run_jobs = MAX((max_share - 1), 1); /* exclude this job */
orig_map = bit_copy(bitmap);
if (!orig_map)
fatal("bit_copy: malloc failure");
/* Try to run with currently available nodes */
i = _job_count_bitmap(cr_ptr, job_ptr, orig_map, bitmap,
max_run_jobs, NO_SHARE_LIMIT,
SELECT_MODE_WILL_RUN);
if (i >= min_nodes) {
rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes);
if (rc == SLURM_SUCCESS) {
FREE_NULL_BITMAP(orig_map);
job_ptr->start_time = time(NULL);
return SLURM_SUCCESS;
}
}
/* Job is still pending. Simulate termination of jobs one at a time
* to determine when and where the job can start. */
exp_cr = _dup_cr(cr_ptr);
if (exp_cr == NULL) {
FREE_NULL_BITMAP(orig_map);
return SLURM_ERROR;
}
/* Build list of running and suspended jobs */
cr_job_list = list_create(NULL);
if (!cr_job_list)
fatal("list_create: memory allocation failure");
job_iterator = list_iterator_create(job_list);
while ((tmp_job_ptr = (struct job_record *) list_next(job_iterator))) {
if (!IS_JOB_RUNNING(tmp_job_ptr) &&
!IS_JOB_SUSPENDED(tmp_job_ptr))
continue;
if (tmp_job_ptr->end_time == 0) {
error("Job %u has zero end_time", tmp_job_ptr->job_id);
continue;
}
if (_is_preemptable(tmp_job_ptr, preemptee_candidates)) {
uint16_t mode = slurm_job_preempt_mode(tmp_job_ptr);
bool remove_all = false;
if ((mode == PREEMPT_MODE_REQUEUE) ||
(mode == PREEMPT_MODE_CHECKPOINT) ||
(mode == PREEMPT_MODE_CANCEL))
remove_all = true;
/* Remove preemptable job now */
_rm_job_from_nodes(exp_cr, tmp_job_ptr,
"_will_run_test", remove_all);
} else
list_append(cr_job_list, tmp_job_ptr);
}
list_iterator_destroy(job_iterator);
/* Test with all preemptable jobs gone */
if (preemptee_candidates) {
i = _job_count_bitmap(exp_cr, job_ptr, orig_map, bitmap,
max_run_jobs, NO_SHARE_LIMIT,
SELECT_MODE_RUN_NOW);
if (i >= min_nodes) {
rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes);
if (rc == SLURM_SUCCESS)
job_ptr->start_time = now + 1;
}
}
	/* Remove the running jobs one at a time from exp_cr and try
	 * scheduling the pending job after each removal */
if (rc != SLURM_SUCCESS) {
list_sort(cr_job_list, _cr_job_list_sort);
job_iterator = list_iterator_create(cr_job_list);
while ((tmp_job_ptr = (struct job_record *)
list_next(job_iterator))) {
_rm_job_from_nodes(exp_cr, tmp_job_ptr,
"_will_run_test", true);
i = _job_count_bitmap(exp_cr, job_ptr, orig_map,
bitmap, max_run_jobs,
NO_SHARE_LIMIT,
SELECT_MODE_RUN_NOW);
if (i < min_nodes)
continue;
rc = _job_test(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes);
if (rc != SLURM_SUCCESS)
continue;
if (tmp_job_ptr->end_time <= now)
job_ptr->start_time = now + 1;
else
job_ptr->start_time = tmp_job_ptr->end_time;
break;
}
list_iterator_destroy(job_iterator);
}
if ((rc == SLURM_SUCCESS) && preemptee_job_list &&
preemptee_candidates) {
/* Build list of preemptee jobs whose resources are
		 * actually used. The list is returned even if the jobs
		 * are not killed by this select plugin, but by Moab or
		 * some other external scheduler. */
if (*preemptee_job_list == NULL) {
*preemptee_job_list = list_create(NULL);
if (*preemptee_job_list == NULL)
fatal("list_create malloc failure");
}
		preemptee_iterator =
			list_iterator_create(preemptee_candidates);
while ((tmp_job_ptr = (struct job_record *)
list_next(preemptee_iterator))) {
if (bit_overlap(bitmap, tmp_job_ptr->node_bitmap) == 0)
continue;
list_append(*preemptee_job_list, tmp_job_ptr);
}
list_iterator_destroy(preemptee_iterator);
}
list_destroy(cr_job_list);
_free_cr(exp_cr);
FREE_NULL_BITMAP(orig_map);
return rc;
}
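/* Usage sketch (illustrative only, assuming a caller that already holds
 * the appropriate slurmctld locks and a usable availability bitmap): a
 * backfill-style test asks for SELECT_MODE_WILL_RUN and, on success,
 * reads the projected start time from job_ptr->start_time.
 *
 *	List preemptees = NULL;
 *	int rc = select_p_job_test(job_ptr, avail_bitmap,
 *				   min_nodes, max_nodes, req_nodes,
 *				   SELECT_MODE_WILL_RUN,
 *				   preemptee_candidates, &preemptees);
 *	if (rc == SLURM_SUCCESS)
 *		info("job %u could start at %ld",
 *		     job_ptr->job_id, (long) job_ptr->start_time);
 *	if (preemptees)
 *		list_destroy(preemptees);
 *
 * In the live system this path is reached through the generic
 * node_select API (select_g_job_test), not by calling the plugin
 * symbol directly.
 */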
static int _cr_job_list_sort(void *x, void *y)
{
struct job_record *job1_ptr = (struct job_record *) x;
struct job_record *job2_ptr = (struct job_record *) y;
return (int) difftime(job1_ptr->end_time, job2_ptr->end_time);
}
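/* Note: _will_run_test() above sorts its simulated job list with this
 * comparator, e.g. list_sort(cr_job_list, _cr_job_list_sort), so jobs
 * are released in order of ascending end_time and the earliest feasible
 * start time for the pending job is found first. */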
/*
* init() is called when the plugin is loaded, before any other functions
* are called. Put global initialization here.
*/
extern int init ( void )
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
rc = _init_status_pthread();
#endif
cr_type = slurmctld_conf.select_type_param;
return rc;
}
extern int fini ( void )
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
rc = _fini_status_pthread();
#endif
slurm_mutex_lock(&cr_mutex);
_free_cr(cr_ptr);
cr_ptr = NULL;
slurm_mutex_unlock(&cr_mutex);
return rc;
}
/*
* The remainder of this file implements the standard SLURM
* node selection API.
*/
extern int select_p_state_save(char *dir_name)
{
return SLURM_SUCCESS;
}
extern int select_p_state_restore(char *dir_name)
{
return SLURM_SUCCESS;
}
extern int select_p_job_init(List job_list)
{
return SLURM_SUCCESS;
}
extern int select_p_node_init(struct node_record *node_ptr, int node_cnt)
{
if (node_ptr == NULL) {
error("select_p_node_init: node_ptr == NULL");
return SLURM_ERROR;
}
if (node_cnt < 0) {
error("select_p_node_init: node_cnt < 0");
return SLURM_ERROR;
}
/* NOTE: We free the consumable resources info here, but
* can't rebuild it since the partition and node structures
* have not yet had node bitmaps reset. */
slurm_mutex_lock(&cr_mutex);
_free_cr(cr_ptr);
cr_ptr = NULL;
slurm_mutex_unlock(&cr_mutex);
select_node_ptr = node_ptr;
select_node_cnt = node_cnt;
select_fast_schedule = slurm_get_fast_schedule();
return SLURM_SUCCESS;
}
extern int select_p_block_init(List part_list)
{
return SLURM_SUCCESS;
}
/*
* select_p_job_test - Given a specification of scheduling requirements,
* identify the nodes which "best" satisfy the request.
* "best" is defined as either single set of consecutive nodes satisfying
* the request and leaving the minimum number of unused nodes OR
* the fewest number of consecutive node sets
* IN/OUT job_ptr - pointer to job being considered for initiation,
* set's start_time when job expected to start
* IN/OUT bitmap - usable nodes are set on input, nodes not required to
* satisfy the request are cleared, other left set
* IN min_nodes - minimum count of nodes
 * IN max_nodes - maximum count of nodes
 * IN req_nodes - requested (or desired) count of nodes
* IN mode - SELECT_MODE_RUN_NOW: try to schedule job now
* SELECT_MODE_TEST_ONLY: test if job can ever run
* SELECT_MODE_WILL_RUN: determine when and where job can run
* IN preemptee_candidates - List of pointers to jobs which can be preempted.
* IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the
* jobs to be preempted to initiate the pending job. Not set
* if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL.
* RET zero on success, EINVAL otherwise
* globals (passed via select_p_node_init):
* node_record_count - count of nodes configured
* node_record_table_ptr - pointer to global node table
* NOTE: the job information that is considered for scheduling includes:
* req_node_bitmap: bitmap of specific nodes required by the job
* contiguous: allocated nodes must be sequentially located
* num_cpus: minimum number of processors required by the job
 * NOTE: bitmap must be a superset of the job's required nodes at the time that
* select_p_job_test is called
*/
extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap,
uint32_t min_nodes, uint32_t max_nodes,
uint32_t req_nodes, uint16_t mode,
List preemptee_candidates,
List *preemptee_job_list)
{
int max_share = 0, rc = EINVAL;
xassert(bitmap);
if (job_ptr->details == NULL)
return EINVAL;
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL) {
_init_node_cr();
if (cr_ptr == NULL) {
slurm_mutex_unlock(&cr_mutex);
error("select_p_job_test: cr_ptr not initialized");
return SLURM_ERROR;
}
}
if (bit_set_count(bitmap) < min_nodes) {
slurm_mutex_unlock(&cr_mutex);
return EINVAL;
}
if (job_ptr->details->shared)
max_share = job_ptr->part_ptr->max_share & ~SHARED_FORCE;
else /* ((shared == 0) || (shared == (uint16_t) NO_VAL)) */
max_share = 1;
if (mode == SELECT_MODE_WILL_RUN) {
rc = _will_run_test(job_ptr, bitmap, min_nodes, max_nodes,
max_share, req_nodes,
preemptee_candidates, preemptee_job_list);
} else if (mode == SELECT_MODE_TEST_ONLY) {
rc = _test_only(job_ptr, bitmap, min_nodes, max_nodes,
req_nodes, max_share);
} else if (mode == SELECT_MODE_RUN_NOW) {
rc = _run_now(job_ptr, bitmap, min_nodes, max_nodes,
max_share, req_nodes,
preemptee_candidates, preemptee_job_list);
} else
fatal("select_p_job_test: Mode %d is invalid", mode);
slurm_mutex_unlock(&cr_mutex);
return rc;
}
extern int select_p_job_begin(struct job_record *job_ptr)
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
int i;
char clone_path[128];
xassert(job_ptr);
xassert(job_ptr->node_bitmap);
for (i=0; i<select_node_cnt; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
snprintf(clone_path, sizeof(clone_path),
"%s/%s/xcpu/clone", XCPU_DIR,
select_node_ptr[i].name);
if (chown(clone_path, (uid_t)job_ptr->user_id,
(gid_t)job_ptr->group_id)) {
error("chown %s: %m", clone_path);
rc = SLURM_ERROR;
} else {
debug("chown %s to %u", clone_path,
job_ptr->user_id);
}
}
#endif
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_add_job_to_nodes(cr_ptr, job_ptr, "select_p_job_begin", 1);
gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id);
slurm_mutex_unlock(&cr_mutex);
return rc;
}
/* Determine if allocated nodes are usable (powered up) */
extern int select_p_job_ready(struct job_record *job_ptr)
{
int i, i_first, i_last;
struct node_record *node_ptr;
if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) {
/* Gang scheduling might suspend job immediately */
return 0;
}
if ((job_ptr->node_bitmap == NULL) ||
((i_first = bit_ffs(job_ptr->node_bitmap)) == -1))
return READY_NODE_STATE;
i_last = bit_fls(job_ptr->node_bitmap);
for (i = i_first; i <= i_last; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
node_ptr = node_record_table_ptr + i;
if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr))
return 0;
}
return READY_NODE_STATE;
}
extern int select_p_job_resized(struct job_record *job_ptr,
struct node_record *node_ptr)
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
int i = node_ptr - node_record_table_ptr;
char clone_path[128];
	/* Reset ownership of the node's xcpu clone file only if the node
	 * was part of the job's allocation */
	if (bit_test(job_ptr->node_bitmap, i)) {
		snprintf(clone_path, sizeof(clone_path), "%s/%s/xcpu/clone",
			 XCPU_DIR, node_ptr->name);
		if (chown(clone_path, (uid_t)0, (gid_t)0)) {
			error("chown %s: %m", clone_path);
			rc = SLURM_ERROR;
		} else
			debug("chown %s to 0", clone_path);
	}
#endif
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_rm_job_from_one_node(job_ptr, node_ptr, "select_p_job_resized");
slurm_mutex_unlock(&cr_mutex);
return rc;
}
extern int select_p_job_fini(struct job_record *job_ptr)
{
int rc = SLURM_SUCCESS;
#ifdef HAVE_XCPU
int i;
char clone_path[128];
for (i=0; i<select_node_cnt; i++) {
if (bit_test(job_ptr->node_bitmap, i) == 0)
continue;
snprintf(clone_path, sizeof(clone_path), "%s/%s/xcpu/clone",
XCPU_DIR, select_node_ptr[i].name);
if (chown(clone_path, (uid_t)0, (gid_t)0)) {
error("chown %s: %m", clone_path);
rc = SLURM_ERROR;
} else {
debug("chown %s to 0", clone_path);
}
}
#endif
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_rm_job_from_nodes(cr_ptr, job_ptr, "select_p_job_fini", true);
slurm_mutex_unlock(&cr_mutex);
return rc;
}
extern int select_p_job_suspend(struct job_record *job_ptr)
{
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_rm_job_from_nodes(cr_ptr, job_ptr, "select_p_job_suspend", false);
slurm_mutex_unlock(&cr_mutex);
return SLURM_SUCCESS;
}
extern int select_p_job_resume(struct job_record *job_ptr)
{
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
_add_job_to_nodes(cr_ptr, job_ptr, "select_p_job_resume", 0);
slurm_mutex_unlock(&cr_mutex);
return SLURM_SUCCESS;
}
extern int select_p_pack_select_info(time_t last_query_time,
uint16_t show_flags, Buf *buffer_ptr,
uint16_t protocol_version)
{
/* This function is always invalid on normal Linux clusters */
return SLURM_ERROR;
}
extern int select_p_select_nodeinfo_pack(select_nodeinfo_t *nodeinfo,
Buf buffer,
uint16_t protocol_version)
{
pack16(nodeinfo->alloc_cpus, buffer);
return SLURM_SUCCESS;
}
extern int select_p_select_nodeinfo_unpack(select_nodeinfo_t **nodeinfo,
Buf buffer,
uint16_t protocol_version)
{
select_nodeinfo_t *nodeinfo_ptr = NULL;
nodeinfo_ptr = select_p_select_nodeinfo_alloc(NO_VAL);
*nodeinfo = nodeinfo_ptr;
safe_unpack16(&nodeinfo_ptr->alloc_cpus, buffer);
return SLURM_SUCCESS;
unpack_error:
error("select_nodeinfo_unpack: error unpacking here");
select_p_select_nodeinfo_free(nodeinfo_ptr);
*nodeinfo = NULL;
return SLURM_ERROR;
}
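/* Round-trip sketch (illustrative, not called by the plugin): the pack
 * and unpack routines above are symmetric, so a nodeinfo record can be
 * serialized into a Buf and recovered from it. Buffer helpers
 * (init_buf, set_buf_offset, free_buf) come from src/common/pack.h.
 *
 *	select_nodeinfo_t *in, *out = NULL;
 *	Buf buf = init_buf(64);
 *	in = select_p_select_nodeinfo_alloc(NO_VAL);
 *	in->alloc_cpus = 8;
 *	select_p_select_nodeinfo_pack(in, buf, SLURM_PROTOCOL_VERSION);
 *	set_buf_offset(buf, 0);		(rewind before unpacking)
 *	select_p_select_nodeinfo_unpack(&out, buf, SLURM_PROTOCOL_VERSION);
 *	(out->alloc_cpus is now 8)
 *	select_p_select_nodeinfo_free(in);
 *	select_p_select_nodeinfo_free(out);
 *	free_buf(buf);
 */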
extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(uint32_t size)
{
select_nodeinfo_t *nodeinfo = xmalloc(sizeof(struct select_nodeinfo));
nodeinfo->magic = NODEINFO_MAGIC;
return nodeinfo;
}
extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo)
{
	if (nodeinfo) {
if (nodeinfo->magic != NODEINFO_MAGIC) {
error("select_p_select_nodeinfo_free: "
"nodeinfo magic bad");
return EINVAL;
}
nodeinfo->magic = 0;
xfree(nodeinfo);
}
return SLURM_SUCCESS;
}
extern int select_p_select_nodeinfo_set_all(time_t last_query_time)
{
struct node_record *node_ptr = NULL;
int i=0;
static time_t last_set_all = 0;
	/* Only refresh this data when last_node_update is newer than
	 * the last time we set it up. */
	if (last_set_all && (last_node_update < last_set_all)) {
debug2("Node select info for set all hasn't "
"changed since %ld",
(long)last_set_all);
return SLURM_NO_CHANGE_IN_DATA;
}
last_set_all = last_node_update;
for (i=0; i<node_record_count; i++) {
select_nodeinfo_t *nodeinfo = NULL;
node_ptr = node_record_table_ptr + i;
		/* We have to use the '_g_' call here to make sure we get
		 * the correct data to work on, i.e. Cray systems call this
		 * plugin from within select/cray, which has its own
		 * struct. */
select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
SELECT_NODEDATA_PTR, 0,
(void *)&nodeinfo);
		if (!nodeinfo) {
error("no nodeinfo returned from structure");
continue;
}
if ((node_ptr->node_state & NODE_STATE_COMPLETING) ||
(node_ptr->node_state == NODE_STATE_ALLOCATED)) {
if (slurmctld_conf.fast_schedule)
nodeinfo->alloc_cpus =
node_ptr->config_ptr->cpus;
else
nodeinfo->alloc_cpus = node_ptr->cpus;
} else
nodeinfo->alloc_cpus = 0;
}
return SLURM_SUCCESS;
}
extern int select_p_select_nodeinfo_set(struct job_record *job_ptr)
{
xassert(job_ptr);
slurm_mutex_lock(&cr_mutex);
if (cr_ptr == NULL)
_init_node_cr();
slurm_mutex_unlock(&cr_mutex);
return SLURM_SUCCESS;
}
extern int select_p_select_nodeinfo_get(select_nodeinfo_t *nodeinfo,
enum select_nodedata_type dinfo,
enum node_states state,
void *data)
{
int rc = SLURM_SUCCESS;
uint16_t *uint16 = (uint16_t *) data;
select_nodeinfo_t **select_nodeinfo = (select_nodeinfo_t **) data;
if (nodeinfo == NULL) {
error("get_nodeinfo: nodeinfo not set");
return SLURM_ERROR;
}
if (nodeinfo->magic != NODEINFO_MAGIC) {
error("get_nodeinfo: nodeinfo magic bad");
return SLURM_ERROR;
}
switch (dinfo) {
case SELECT_NODEDATA_SUBGRP_SIZE:
*uint16 = 0;
break;
case SELECT_NODEDATA_SUBCNT:
if (state == NODE_STATE_ALLOCATED)
*uint16 = nodeinfo->alloc_cpus;
else
*uint16 = 0;
break;
case SELECT_NODEDATA_PTR:
*select_nodeinfo = nodeinfo;
break;
default:
error("Unsupported option %d for get_nodeinfo.", dinfo);
rc = SLURM_ERROR;
break;
}
return rc;
}
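/* Query sketch (illustrative): select_p_select_nodeinfo_set_all() above
 * fills in alloc_cpus, and a caller can read it back through the
 * SELECT_NODEDATA_SUBCNT case of select_p_select_nodeinfo_get(), which
 * only reports a non-zero count for NODE_STATE_ALLOCATED.
 *
 *	uint16_t alloc_cpus = 0;
 *	if (select_p_select_nodeinfo_get(nodeinfo,
 *					 SELECT_NODEDATA_SUBCNT,
 *					 NODE_STATE_ALLOCATED,
 *					 &alloc_cpus) == SLURM_SUCCESS)
 *		debug2("node has %u allocated CPUs", alloc_cpus);
 *
 * In slurmctld this is normally reached through the generic
 * select_g_select_nodeinfo_get() wrapper rather than by calling the
 * plugin symbol directly.
 */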
extern select_jobinfo_t *select_p_select_jobinfo_alloc(void)
{
	return NULL;
}
extern int select_p_select_jobinfo_set(select_jobinfo_t *jobinfo,
enum select_jobdata_type data_type,
void *data)
{
return SLURM_SUCCESS;
}
extern int select_p_select_jobinfo_get (select_jobinfo_t *jobinfo,
enum select_jobdata_type data_type,
void *data)
{
return SLURM_ERROR;
}
extern select_jobinfo_t *select_p_select_jobinfo_copy(
select_jobinfo_t *jobinfo)
{
return NULL;
}
extern int select_p_select_jobinfo_free (select_jobinfo_t *jobinfo)
{
return SLURM_SUCCESS;
}
extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer,
uint16_t protocol_version)
{
return SLURM_SUCCESS;
}
extern int select_p_select_jobinfo_unpack(select_jobinfo_t **jobinfo,
Buf buffer,
uint16_t protocol_version)
{
return SLURM_SUCCESS;
}
extern char *select_p_select_jobinfo_sprint(select_jobinfo_t *jobinfo,
char *buf, size_t size, int mode)
{
if (buf && size) {
buf[0] = '\0';
return buf;
} else
return NULL;
}
extern char *select_p_select_jobinfo_xstrdup(select_jobinfo_t *jobinfo,
int mode)
{
return NULL;
}
extern int select_p_update_block (update_part_msg_t *part_desc_ptr)
{
return SLURM_SUCCESS;
}
extern int select_p_update_sub_node (update_part_msg_t *part_desc_ptr)
{
return SLURM_SUCCESS;
}
extern int select_p_get_info_from_plugin (enum select_jobdata_type info,
struct job_record *job_ptr,
void *data)
{
return SLURM_SUCCESS;
}
extern int select_p_update_node_config (int index)
{
return SLURM_SUCCESS;
}
extern int select_p_update_node_state (int index, uint16_t state)
{
return SLURM_SUCCESS;
}
extern int select_p_alter_node_cnt(enum select_node_cnt type, void *data)
{
return SLURM_SUCCESS;
}
extern int select_p_reconfigure(void)
{
slurm_mutex_lock(&cr_mutex);
_free_cr(cr_ptr);
cr_ptr = NULL;
_init_node_cr();
slurm_mutex_unlock(&cr_mutex);
return SLURM_SUCCESS;
}