| /*****************************************************************************\ |
| *  eval_nodes_tree.c - Determine node ordering for a job using the tree algorithm. |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "eval_nodes_tree.h" |
| |
| #include "../common/eval_nodes.h" |
| #include "../common/gres_sched.h" |
| |
| #include "src/common/xstring.h" |
| |
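| /* |
|  * Add the distance from switch "inx" to every other switch into dist[], |
|  * saturating at INFINITE when either side of the pair is unreachable. |
|  */ |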
| static void _topo_add_dist(uint32_t *dist, int inx, tree_context_t *ctx) |
| { |
| for (int i = 0; i < ctx->switch_count; i++) { |
| if (ctx->switch_table[inx].switches_dist[i] == INFINITE || |
| dist[i] == INFINITE) { |
| dist[i] = INFINITE; |
| } else { |
| dist[i] += ctx->switch_table[inx].switches_dist[i]; |
| } |
| } |
| } |
| |
| /* |
|  * Returns 1 if switch "i" is a better fit, |
|  * returns -1 if switch "j" is a better fit, |
|  * returns 0 if neither is a better fit. |
|  */ |
| static int _topo_compare_switches(int i, int j, int rem_nodes, |
| int *switch_node_cnt, int rem_cpus, |
| uint32_t *switch_cpu_cnt, bool *i_fit_out, |
| tree_context_t *ctx) |
| { |
| while (1) { |
| bool i_fit = ((switch_node_cnt[i] >= rem_nodes) && |
| ((int) switch_cpu_cnt[i] >= rem_cpus)); |
| bool j_fit = ((switch_node_cnt[j] >= rem_nodes) && |
| ((int) switch_cpu_cnt[j] >= rem_cpus)); |
| *i_fit_out = i_fit; |
| |
| if (i_fit && j_fit) { |
| if (switch_node_cnt[i] < switch_node_cnt[j]) |
| return 1; |
| if (switch_node_cnt[i] > switch_node_cnt[j]) |
| return -1; |
| break; |
| } else if (i_fit) { |
| return 1; |
| } else if (j_fit) { |
| return -1; |
| } |
| |
| if (((ctx->switch_table[i].parent != i) || |
| (ctx->switch_table[j].parent != j)) && |
| (ctx->switch_table[i].parent != |
| ctx->switch_table[j].parent)) { |
| i = ctx->switch_table[i].parent; |
| j = ctx->switch_table[j].parent; |
| continue; |
| } |
| |
| break; |
| } |
| |
| if (switch_node_cnt[i] > switch_node_cnt[j]) |
| return 1; |
| if (switch_node_cnt[i] < switch_node_cnt[j]) |
| return -1; |
| if (ctx->switch_table[i].level < ctx->switch_table[j].level) |
| return 1; |
| if (ctx->switch_table[i].level > ctx->switch_table[j].level) |
| return -1; |
| 	return 0; |
| } |
| |
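| /* |
|  * Evaluate leaf switch "i" against the current *best_switch and update |
|  * *best_switch when "i" is preferable: the first usable candidate, a closer |
|  * switch that still fits the remaining request, or an equally distant switch |
|  * that is a tighter fit. |
|  */ |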
| static void _topo_choose_best_switch(uint32_t *dist, int *switch_node_cnt, |
| int rem_nodes, uint32_t *switch_cpu_cnt, |
| int rem_cpus, int i, int *best_switch, |
| tree_context_t *ctx) |
| { |
| int tcs = 0; |
| bool i_fit = false; |
| |
| if (*best_switch == -1 || dist[i] == INFINITE || !switch_node_cnt[i]) { |
| 		/* |
| 		 * First candidate, or switch "i" is unreachable or has no |
| 		 * usable nodes: only a reachable switch with nodes available |
| 		 * can become the best switch. |
| 		 */ |
| if (switch_node_cnt[i] && dist[i] < INFINITE) |
| *best_switch = i; |
| return; |
| } |
| |
| tcs = _topo_compare_switches(i, *best_switch, rem_nodes, |
| switch_node_cnt, rem_cpus, switch_cpu_cnt, |
| &i_fit, ctx); |
| if (((dist[i] < dist[*best_switch]) && i_fit) || |
| ((dist[i] == dist[*best_switch]) && (tcs > 0))) { |
| 		/* |
| 		 * Closer and fits the request, OR |
| 		 * same distance and a tighter fit (less resource waste) |
| 		 */ |
| *best_switch = i; |
| } |
| } |
| |
| /* |
| * Allocate resources to the job on one leaf switch if possible, |
| * otherwise distribute the job allocation over many leaf switches. |
| */ |
| static int _eval_nodes_dfly(topology_eval_t *topo_eval) |
| { |
| list_t **switch_gres = NULL; /* available GRES on switch */ |
| bitstr_t **switch_node_bitmap = NULL; /* nodes on this switch */ |
| int *switch_node_cnt = NULL; /* total nodes on switch */ |
| int *switch_required = NULL; /* set if has required node */ |
| bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */ |
| bitstr_t *req_nodes_bitmap = NULL; /* required node bitmap */ |
| bitstr_t *req2_nodes_bitmap = NULL; /* required+lowest prio nodes */ |
| bitstr_t *best_nodes_bitmap = NULL; /* required+low prio nodes */ |
| int i, j, rc = SLURM_SUCCESS; |
| int best_cpu_cnt = 0, best_node_cnt = 0, req_node_cnt = 0; |
| list_t *best_gres = NULL; |
| switch_record_t *switch_ptr; |
| list_t *node_weight_list = NULL; |
| topo_weight_info_t *nw = NULL; |
| list_itr_t *iter; |
| node_record_t *node_ptr; |
| int64_t rem_max_cpus; |
| int rem_cpus, rem_nodes; /* remaining resources desired */ |
| 	int min_rem_nodes; /* minimum nodes still needed */ |
| job_record_t *job_ptr = topo_eval->job_ptr; |
| job_details_t *details_ptr = job_ptr->details; |
| bool sufficient = false; |
| uint16_t *avail_cpu_per_node = NULL; |
| time_t time_waiting = 0; |
| int leaf_switch_count = 0; |
| int top_switch_inx = -1; |
| int prev_rem_nodes; |
| avail_res_t **avail_res_array = topo_eval->avail_res_array; |
| uint32_t min_nodes = topo_eval->min_nodes; |
| uint32_t req_nodes = topo_eval->req_nodes; |
| uint64_t maxtasks; |
| tree_context_t *ctx = topo_eval->tctx->plugin_ctx; |
| |
| topo_eval->avail_cpus = 0; |
| |
| if (job_ptr->req_switch > 1) { |
| /* Maximum leaf switch count >1 probably makes no sense */ |
| info("Resetting %pJ leaf switch count from %u to 0", |
| job_ptr, job_ptr->req_switch); |
| job_ptr->req_switch = 0; |
| } |
| if (job_ptr->req_switch) { |
| time_t time_now; |
| time_now = time(NULL); |
| if (job_ptr->wait4switch_start == 0) |
| job_ptr->wait4switch_start = time_now; |
| time_waiting = time_now - job_ptr->wait4switch_start; |
| } |
| |
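| 	/* |
| 	 * Initialize the remaining CPU and node requirements. With per-job |
| 	 * GRES requests the lower of min_nodes and req_nodes is targeted, |
| 	 * otherwise the higher of the two. |
| 	 */ |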
| rem_cpus = details_ptr->min_cpus; |
| min_rem_nodes = min_nodes; |
| if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req))) |
| rem_nodes = MIN(min_nodes, req_nodes); |
| else |
| rem_nodes = MAX(min_nodes, req_nodes); |
| rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes); |
| maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus, |
| topo_eval->max_nodes); |
| |
| /* Validate availability of required nodes */ |
| if (job_ptr->details->req_node_bitmap) { |
| if (!bit_super_set(job_ptr->details->req_node_bitmap, |
| topo_eval->node_map)) { |
| info("%pJ requires nodes which are not currently available", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| |
| req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap); |
| if (req_node_cnt == 0) { |
| info("%pJ required node list has no nodes", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| if (req_node_cnt > topo_eval->max_nodes) { |
| info("%pJ requires more nodes than currently available (%u>%u)", |
| job_ptr, req_node_cnt, |
| topo_eval->max_nodes); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap); |
| } |
| |
| /* |
| * Add required nodes to job allocation and |
| * build list of node bitmaps, sorted by weight |
| */ |
| if (!bit_set_count(topo_eval->node_map)) { |
| debug("%pJ node_map is empty", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| avail_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t)); |
| node_weight_list = list_create(eval_nodes_topo_weight_free); |
| for (i = 0; |
| (node_ptr = next_node_bitmap(topo_eval->node_map, &i)); |
| i++) { |
| topo_weight_info_t nw_static; |
| if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) { |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| (void) eval_nodes_cpus_to_use( |
| topo_eval, i, rem_max_cpus, min_rem_nodes, |
| &maxtasks, true); |
| if (topo_eval->avail_cpus == 0) { |
| log_flag(SELECT_TYPE, "%pJ insufficient resources on required node", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| avail_cpu_per_node[i] = topo_eval->avail_cpus; |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| } |
| |
| nw_static.weight = node_ptr->sched_weight; |
| nw = list_find_first(node_weight_list, |
| eval_nodes_topo_weight_find, |
| &nw_static); |
| if (!nw) { /* New node weight to add */ |
| nw = xmalloc(sizeof(topo_weight_info_t)); |
| nw->node_bitmap = bit_alloc(node_record_count); |
| nw->weight = node_ptr->sched_weight; |
| list_append(node_weight_list, nw); |
| } |
| bit_set(nw->node_bitmap, i); |
| nw->node_cnt++; |
| } |
| |
| if (req_nodes_bitmap) { |
| bit_and(topo_eval->node_map, req_nodes_bitmap); |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) { |
| /* Required nodes completely satisfied the request */ |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| rc = SLURM_ERROR; |
| log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| } else { |
| bit_clear_all(topo_eval->node_map); |
| } |
| |
| list_sort(node_weight_list, eval_nodes_topo_weight_sort); |
| if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) |
| (void) list_for_each(node_weight_list, |
| eval_nodes_topo_weight_log, NULL); |
| |
| /* |
| * Identify the highest level switch to be used. |
| * Note that nodes can be on multiple non-overlapping switches. |
| */ |
| switch_gres = xcalloc(ctx->switch_count, sizeof(list_t *)); |
| switch_node_bitmap = xcalloc(ctx->switch_count, sizeof(bitstr_t *)); |
| switch_node_cnt = xcalloc(ctx->switch_count, sizeof(int)); |
| switch_required = xcalloc(ctx->switch_count, sizeof(int)); |
| |
| if (!req_nodes_bitmap) |
| nw = list_peek(node_weight_list); |
| for (i = 0, switch_ptr = ctx->switch_table; i < ctx->switch_count; |
| i++, switch_ptr++) { |
| switch_node_bitmap[i] = bit_copy(switch_ptr->node_bitmap); |
| if (req_nodes_bitmap && |
| bit_overlap_any(req_nodes_bitmap, switch_node_bitmap[i])) { |
| switch_required[i] = 1; |
| if (ctx->switch_table[i].level == 0) { |
| leaf_switch_count++; |
| } |
| if ((top_switch_inx == -1) || |
| (ctx->switch_table[i].level > |
| ctx->switch_table[top_switch_inx].level)) { |
| top_switch_inx = i; |
| } |
| } |
| if (!req_nodes_bitmap && |
| (list_find_first(node_weight_list, |
| eval_nodes_topo_node_find, |
| switch_node_bitmap[i]))) { |
| if ((top_switch_inx == -1) || |
| (ctx->switch_table[i].level > |
| ctx->switch_table[top_switch_inx].level)) { |
| top_switch_inx = i; |
| } |
| } |
| } |
| |
| 	/* |
| 	 * The top switch is the highest-level switch containing all required |
| 	 * nodes OR all nodes of the lowest scheduling weight, |
| 	 * OR -1 if no top-level switch can be identified |
| 	 */ |
| if (top_switch_inx == -1) { |
| error("%pJ unable to identify top level switch", |
| job_ptr); |
| rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE; |
| goto fini; |
| } |
| |
| 	/* Check that all specifically required nodes are on a shared network */ |
| if (req_nodes_bitmap && |
| !bit_super_set(req_nodes_bitmap, |
| switch_node_bitmap[top_switch_inx])) { |
| rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE; |
| info("%pJ requires nodes that do not have shared network", |
| job_ptr); |
| goto fini; |
| } |
| |
| 	/* |
| 	 * Remove nodes from consideration that cannot be reached from this |
| 	 * top-level switch |
| 	 */ |
| for (i = 0; i < ctx->switch_count; i++) { |
| if (top_switch_inx != i) { |
| bit_and(switch_node_bitmap[i], |
| switch_node_bitmap[top_switch_inx]); |
| } |
| } |
| |
| /* |
| * Identify the best set of nodes (i.e. nodes with the lowest weight, |
| * in addition to the required nodes) that can be used to satisfy the |
| * job request. All nodes must be on a common top-level switch. The |
| * logic here adds groups of nodes, all with the same weight, so we |
| * usually identify more nodes than required to satisfy the request. |
| * Later logic selects from those nodes to get the best topology. |
| */ |
| best_nodes_bitmap = bit_alloc(node_record_count); |
| iter = list_iterator_create(node_weight_list); |
| while (!sufficient && (nw = list_next(iter))) { |
| if (best_node_cnt > 0) { |
| /* |
| * All of the lower priority nodes should be included |
| * in the job's allocation. Nodes from the next highest |
| * weight nodes are included only as needed. |
| */ |
| if (req2_nodes_bitmap) |
| bit_or(req2_nodes_bitmap, best_nodes_bitmap); |
| else |
| req2_nodes_bitmap = bit_copy(best_nodes_bitmap); |
| } |
| for (i = 0; next_node_bitmap(nw->node_bitmap, &i); i++) { |
| if (avail_cpu_per_node[i]) |
| continue; /* Required node */ |
| if (!bit_test(switch_node_bitmap[top_switch_inx], i)) |
| continue; |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| if (topo_eval->avail_cpus == 0) { |
| bit_clear(nw->node_bitmap, i); |
| continue; |
| } |
| bit_set(best_nodes_bitmap, i); |
| avail_cpu_per_node[i] = topo_eval->avail_cpus; |
| best_cpu_cnt += topo_eval->avail_cpus; |
| best_node_cnt++; |
| if (topo_eval->gres_per_job) { |
| gres_sched_consec( |
| &best_gres, job_ptr->gres_list_req, |
| avail_res_array[i]->sock_gres_list); |
| } |
| } |
| |
| sufficient = (best_cpu_cnt >= rem_cpus) && |
| eval_nodes_enough_nodes(best_node_cnt, rem_nodes, |
| min_nodes, req_nodes); |
| if (sufficient && topo_eval->gres_per_job) { |
| sufficient = gres_sched_sufficient( |
| job_ptr->gres_list_req, best_gres); |
| } |
| } |
| list_iterator_destroy(iter); |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) { |
| char *gres_str = NULL, *gres_print = ""; |
| char *node_names; |
| if (req_nodes_bitmap) { |
| node_names = bitmap2node_name(req_nodes_bitmap); |
| info("Required nodes:%s", node_names); |
| xfree(node_names); |
| } |
| node_names = bitmap2node_name(best_nodes_bitmap); |
| if (topo_eval->gres_per_job) { |
| gres_str = gres_sched_str(best_gres); |
| if (gres_str) |
| gres_print = gres_str; |
| } |
| info("Best nodes:%s node_cnt:%d cpu_cnt:%d %s", |
| node_names, best_node_cnt, best_cpu_cnt, gres_print); |
| xfree(node_names); |
| xfree(gres_str); |
| } |
| if (!sufficient) { |
| log_flag(SELECT_TYPE, "insufficient resources currently available for %pJ", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| |
| 	/* |
| 	 * Add the lowest-weight nodes and treat them like required nodes for |
| 	 * the job. The job may still need to add some higher-weight nodes |
| 	 * later. |
| 	 */ |
| if (req2_nodes_bitmap) { |
| for (i = 0; |
| next_node_bitmap(req2_nodes_bitmap, &i) && (topo_eval->max_nodes > 0); |
| i++) { |
| topo_eval->avail_cpus = avail_cpu_per_node[i]; |
| if (!eval_nodes_cpus_to_use( |
| topo_eval, i, rem_max_cpus, min_rem_nodes, |
| &maxtasks, true)) { |
| /* |
| * Too many restricted cores removed due to |
| * gres layout. Skip node |
| */ |
| bit_clear(req2_nodes_bitmap, i); |
| continue; |
| } |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| } |
| |
| for (i = 0, switch_ptr = ctx->switch_table; |
| i < ctx->switch_count; i++, switch_ptr++) { |
| if (switch_required[i]) |
| continue; |
| if (bit_overlap_any(req2_nodes_bitmap, |
| switch_node_bitmap[i])) { |
| switch_required[i] = 1; |
| if (ctx->switch_table[i].level == 0) { |
| leaf_switch_count++; |
| } |
| } |
| } |
| bit_or(topo_eval->node_map, req2_nodes_bitmap); |
| if (topo_eval->max_nodes <= 0) { |
| rc = SLURM_ERROR; |
| log_flag(SELECT_TYPE, "%pJ reached maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id))) { |
| /* Required nodes completely satisfied the request */ |
| error("Scheduling anomaly for %pJ", |
| job_ptr); |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| } |
| |
| /* |
| * Construct a set of switch array entries. |
| * Use the same indexes as ctx->switch_table in slurmctld. |
| */ |
| bit_or(best_nodes_bitmap, topo_eval->node_map); |
| avail_nodes_bitmap = bit_alloc(node_record_count); |
| for (i = 0, switch_ptr = ctx->switch_table; i < ctx->switch_count; |
| i++, switch_ptr++) { |
| bit_and(switch_node_bitmap[i], best_nodes_bitmap); |
| bit_or(avail_nodes_bitmap, switch_node_bitmap[i]); |
| switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]); |
| } |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) { |
| for (i = 0; i < ctx->switch_count; i++) { |
| char *node_names = NULL; |
| if (switch_node_cnt[i]) { |
| node_names = |
| bitmap2node_name(switch_node_bitmap[i]); |
| } |
| info("switch=%s level=%d nodes=%u:%s required:%u speed:%u", |
| ctx->switch_table[i].name, |
| ctx->switch_table[i].level, |
| switch_node_cnt[i], node_names, |
| switch_required[i], |
| ctx->switch_table[i].link_speed); |
| xfree(node_names); |
| } |
| } |
| |
| if (req_nodes_bitmap && |
| (!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) { |
| info("%pJ requires nodes not available on any switch", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| |
| /* |
| * If no resources have yet been selected, |
| * then pick one leaf switch with the most available nodes. |
| */ |
| if (leaf_switch_count == 0) { |
| int best_switch_inx = -1; |
| for (i = 0; i < ctx->switch_count; i++) { |
| if (ctx->switch_table[i].level != 0) |
| continue; |
| if ((best_switch_inx == -1) || |
| (switch_node_cnt[i] > |
| switch_node_cnt[best_switch_inx])) |
| best_switch_inx = i; |
| } |
| if (best_switch_inx != -1) { |
| leaf_switch_count = 1; |
| switch_required[best_switch_inx] = 1; |
| } |
| } |
| |
| /* |
| * All required resources currently on one leaf switch. Determine if |
| * the entire job request can be satisfied using just that one switch. |
| */ |
| if (leaf_switch_count == 1) { |
| best_cpu_cnt = 0; |
| best_node_cnt = 0; |
| FREE_NULL_LIST(best_gres); |
| for (i = 0; i < ctx->switch_count; i++) { |
| if (!switch_required[i] || !switch_node_bitmap[i] || |
| (ctx->switch_table[i].level != 0)) |
| continue; |
| for (j = 0; next_node_bitmap(switch_node_bitmap[i], &j); |
| j++) { |
| if (bit_test(topo_eval->node_map, j) || |
| !avail_cpu_per_node[j]) |
| continue; |
| topo_eval->avail_cpus = avail_cpu_per_node[j]; |
| best_cpu_cnt += topo_eval->avail_cpus; |
| best_node_cnt++; |
| if (topo_eval->gres_per_job) { |
| gres_sched_consec( |
| &best_gres, |
| job_ptr->gres_list_req, |
| avail_res_array[j]-> |
| sock_gres_list); |
| } |
| } |
| break; |
| } |
| sufficient = (best_cpu_cnt >= rem_cpus) && |
| eval_nodes_enough_nodes(best_node_cnt, rem_nodes, |
| min_nodes, req_nodes); |
| if (sufficient && topo_eval->gres_per_job) { |
| sufficient = gres_sched_sufficient( |
| job_ptr->gres_list_req, best_gres); |
| } |
| if (sufficient && (i < ctx->switch_count)) { |
| /* Complete request using this one leaf switch */ |
| for (j = 0; next_node_bitmap(switch_node_bitmap[i], &j); |
| j++) { |
| if (bit_test(topo_eval->node_map, j) || |
| !avail_cpu_per_node[j]) |
| continue; |
| topo_eval->avail_cpus = avail_cpu_per_node[j]; |
| if (!eval_nodes_cpus_to_use(topo_eval, j, |
| rem_max_cpus, |
| min_rem_nodes, |
| &maxtasks, true)) { |
| avail_cpu_per_node[j] = 0; |
| continue; |
| } |
| |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| bit_set(topo_eval->node_map, j); |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id))) { |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| rc = SLURM_ERROR; |
| log_flag(SELECT_TYPE, "%pJ reached maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| } |
| } |
| } |
| |
| /* |
| * Add additional resources as required from additional leaf switches |
| * on a round-robin basis |
| */ |
| prev_rem_nodes = rem_nodes + 1; |
| while (1) { |
| if (prev_rem_nodes == rem_nodes) |
| break; /* Stalled */ |
| prev_rem_nodes = rem_nodes; |
| for (i = 0; i < ctx->switch_count; i++) { |
| if (!switch_node_bitmap[i] || |
| (ctx->switch_table[i].level != 0)) |
| continue; |
| for (j = 0; next_node_bitmap(switch_node_bitmap[i], &j); |
| j++) { |
| if (bit_test(topo_eval->node_map, j) || |
| !avail_cpu_per_node[j]) |
| continue; |
| topo_eval->avail_cpus = avail_cpu_per_node[j]; |
| if (!eval_nodes_cpus_to_use(topo_eval, j, |
| rem_max_cpus, |
| min_rem_nodes, |
| &maxtasks, true)) { |
| avail_cpu_per_node[j] = 0; |
| continue; |
| } |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| bit_set(topo_eval->node_map, j); |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id))) { |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| rc = SLURM_ERROR; |
| log_flag(SELECT_TYPE, "%pJ reached maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| break; /* Move to next switch */ |
| } |
| } |
| } |
| if ((min_rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id))) { |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| rc = SLURM_ERROR; |
| |
| fini: |
| if (rc == SLURM_SUCCESS) |
| eval_nodes_clip_socket_cores(topo_eval); |
| |
| if ((job_ptr->req_switch > 0) && (rc == SLURM_SUCCESS) && |
| switch_node_bitmap) { |
| /* req_switch == 1 here; enforced at the top of the function. */ |
| leaf_switch_count = 0; |
| |
| 		/* Count up leaf switches */ |
| for (i = 0, switch_ptr = ctx->switch_table; |
| i < ctx->switch_count; i++, switch_ptr++) { |
| if (ctx->switch_table[i].level != 0) |
| continue; |
| if (bit_overlap_any(switch_node_bitmap[i], topo_eval->node_map)) |
| leaf_switch_count++; |
| } |
| if (time_waiting >= job_ptr->wait4switch) { |
| job_ptr->best_switch = true; |
| debug3("%pJ waited %ld sec for switches use=%d", |
| job_ptr, time_waiting, leaf_switch_count); |
| } else if (leaf_switch_count > job_ptr->req_switch) { |
| /* |
| * Allocation is for more than requested number of |
| * switches. |
| */ |
| job_ptr->best_switch = false; |
| debug3("%pJ waited %ld sec for switches=%u found=%d wait %u", |
| job_ptr, time_waiting, job_ptr->req_switch, |
| leaf_switch_count, job_ptr->wait4switch); |
| } else { |
| job_ptr->best_switch = true; |
| } |
| } |
| |
| FREE_NULL_LIST(best_gres); |
| FREE_NULL_LIST(node_weight_list); |
| FREE_NULL_BITMAP(avail_nodes_bitmap); |
| FREE_NULL_BITMAP(req_nodes_bitmap); |
| FREE_NULL_BITMAP(req2_nodes_bitmap); |
| FREE_NULL_BITMAP(best_nodes_bitmap); |
| xfree(avail_cpu_per_node); |
| xfree(switch_gres); |
| if (switch_node_bitmap) { |
| for (i = 0; i < ctx->switch_count; i++) |
| FREE_NULL_BITMAP(switch_node_bitmap[i]); |
| xfree(switch_node_bitmap); |
| } |
| xfree(switch_node_cnt); |
| xfree(switch_required); |
| return rc; |
| } |
| |
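| /* |
|  * Subtract the number of nodes just taken from the available-node count of |
|  * switch "switch_index" and of every ancestor switch up to the root. |
|  */ |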
| static void _decrement_node_cnt(int num_nodes_taken, int switch_index, |
| int *switch_node_cnt, tree_context_t *ctx) |
| { |
| for (int i = switch_index; i >= 0; i = ctx->switch_table[i].parent) { |
| if (switch_node_cnt[i] <= num_nodes_taken) { |
| switch_node_cnt[i] = 0; |
| } else { |
| switch_node_cnt[i] -= num_nodes_taken; |
| } |
| |
| /* end once we've reached root switch */ |
| if (ctx->switch_table[i].parent == SWITCH_NO_PARENT) |
| break; |
| } |
| } |
| |
| /* Allocate resources to job using a minimal leaf switch count */ |
| static int _eval_nodes_topo(topology_eval_t *topo_eval) |
| { |
| uint32_t *switch_cpu_cnt = NULL; /* total CPUs on switch */ |
| bitstr_t **switch_node_bitmap = NULL; /* nodes on this switch */ |
| bitstr_t **start_switch_node_bitmap = NULL; |
| int *switch_node_cnt = NULL; /* total nodes on switch */ |
| int *switch_required = NULL; /* set if has required node */ |
| int *req_switch_required = NULL; |
| bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */ |
| bitstr_t *req_nodes_bitmap = NULL; /* required node bitmap */ |
| bitstr_t *req2_nodes_bitmap = NULL; /* required+lowest prio nodes */ |
| bitstr_t *best_nodes_bitmap = NULL; /* required+low prio nodes */ |
| bitstr_t *start_node_map = NULL; |
| int i, j, rc = SLURM_SUCCESS; |
| int best_cpu_cnt, best_node_cnt, req_node_cnt = 0; |
| list_t *best_gres = NULL; |
| switch_record_t *switch_ptr; |
| list_t *node_weight_list = NULL; |
| topo_weight_info_t *nw = NULL; |
| list_itr_t *iter; |
| node_record_t *node_ptr; |
| int64_t rem_max_cpus, start_rem_max_cpus = 0; |
| int rem_cpus, start_rem_cpus = 0, rem_nodes; /* remaining resources desired */ |
| 	int min_rem_nodes; /* minimum nodes still needed */ |
| job_record_t *job_ptr = topo_eval->job_ptr; |
| job_details_t *details_ptr = job_ptr->details; |
| bool requested, sufficient = false; |
| uint16_t *avail_cpu_per_node = NULL; |
| 	uint32_t *switches_dist = NULL; |
| time_t time_waiting = 0; |
| int top_switch_inx = -1; |
| uint64_t top_switch_lowest_weight = 0; |
| int prev_rem_nodes; |
| avail_res_t **avail_res_array = topo_eval->avail_res_array; |
| uint32_t min_nodes = topo_eval->min_nodes; |
| uint32_t req_nodes = topo_eval->req_nodes; |
| uint32_t org_max_nodes = topo_eval->max_nodes; |
| uint64_t maxtasks; |
| tree_context_t *ctx = topo_eval->tctx->plugin_ctx; |
| |
| topo_eval->avail_cpus = 0; |
| |
| if (job_ptr->req_switch) { |
| time_t time_now; |
| time_now = time(NULL); |
| if (job_ptr->wait4switch_start == 0) |
| job_ptr->wait4switch_start = time_now; |
| time_waiting = time_now - job_ptr->wait4switch_start; |
| } |
| |
| rem_cpus = details_ptr->min_cpus; |
| min_rem_nodes = min_nodes; |
| if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req))) |
| rem_nodes = MIN(min_nodes, req_nodes); |
| else |
| rem_nodes = MAX(min_nodes, req_nodes); |
| |
| rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes); |
| maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus, |
| topo_eval->max_nodes); |
| |
| /* Validate availability of required nodes */ |
| if (job_ptr->details->req_node_bitmap) { |
| if (!bit_super_set(job_ptr->details->req_node_bitmap, |
| topo_eval->node_map)) { |
| info("%pJ requires nodes which are not currently available", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| |
| req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap); |
| if (req_node_cnt == 0) { |
| info("%pJ required node list has no nodes", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| if (req_node_cnt > topo_eval->max_nodes) { |
| info("%pJ requires more nodes than currently available (%u>%u)", |
| job_ptr, req_node_cnt, |
| topo_eval->max_nodes); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| req_nodes_bitmap = job_ptr->details->req_node_bitmap; |
| } |
| |
| /* |
| * Add required nodes to job allocation and |
| * build list of node bitmaps, sorted by weight |
| */ |
| if (!bit_set_count(topo_eval->node_map)) { |
| debug("%pJ node_map is empty", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| avail_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t)); |
| node_weight_list = list_create(eval_nodes_topo_weight_free); |
| for (i = 0; |
| (node_ptr = next_node_bitmap(topo_eval->node_map, &i)); |
| i++) { |
| topo_weight_info_t nw_static; |
| if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) { |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| (void) eval_nodes_cpus_to_use(topo_eval, i, |
| rem_max_cpus, |
| min_rem_nodes, |
| &maxtasks, true); |
| if (topo_eval->avail_cpus == 0) { |
| debug2("%pJ insufficient resources on required node", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| avail_cpu_per_node[i] = topo_eval->avail_cpus; |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| } |
| |
| nw_static.weight = node_ptr->sched_weight; |
| nw = list_find_first(node_weight_list, |
| eval_nodes_topo_weight_find, |
| &nw_static); |
| if (!nw) { /* New node weight to add */ |
| nw = xmalloc(sizeof(topo_weight_info_t)); |
| nw->node_bitmap = bit_alloc(node_record_count); |
| nw->weight = node_ptr->sched_weight; |
| list_append(node_weight_list, nw); |
| } |
| bit_set(nw->node_bitmap, i); |
| nw->node_cnt++; |
| } |
| |
| list_sort(node_weight_list, eval_nodes_topo_weight_sort); |
| if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) |
| (void) list_for_each(node_weight_list, |
| eval_nodes_topo_weight_log, NULL); |
| |
| /* |
| * Identify the highest level switch to be used. |
| * Note that nodes can be on multiple non-overlapping switches. |
| */ |
| switch_cpu_cnt = xcalloc(ctx->switch_count, sizeof(uint32_t)); |
| switch_node_bitmap = xcalloc(ctx->switch_count, sizeof(bitstr_t *)); |
| start_switch_node_bitmap = |
| xcalloc(ctx->switch_count, sizeof(bitstr_t *)); |
| switch_node_cnt = xcalloc(ctx->switch_count, sizeof(int)); |
| switch_required = xcalloc(ctx->switch_count, sizeof(int)); |
| req_switch_required = xcalloc(ctx->switch_count, sizeof(int)); |
| |
| for (i = 0, switch_ptr = ctx->switch_table; i < ctx->switch_count; |
| i++, switch_ptr++) { |
| uint32_t switch_cpus = 0; |
| switch_node_bitmap[i] = bit_copy(switch_ptr->node_bitmap); |
| bit_and(switch_node_bitmap[i], topo_eval->node_map); |
| switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]); |
| /* |
| * Count total CPUs of the intersection of node_map and |
| * switch_node_bitmap. |
| */ |
| for (j = 0; (node_ptr = next_node_bitmap(switch_node_bitmap[i], |
| &j)); |
| j++) |
| switch_cpus += avail_res_array[j]->avail_cpus; |
| switch_cpu_cnt[i] = switch_cpus; |
| if (req_nodes_bitmap && |
| bit_overlap_any(req_nodes_bitmap, switch_node_bitmap[i])) { |
| switch_required[i] = 1; |
| if ((top_switch_inx == -1) || |
| (ctx->switch_table[i].level > |
| ctx->switch_table[top_switch_inx].level)) { |
| top_switch_inx = i; |
| } |
| } |
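| 		/* |
| 		 * Skip switches that cannot hold the entire remaining |
| 		 * request; they are not considered as weight-based top-switch |
| 		 * candidates below. |
| 		 */ |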
| if (!eval_nodes_enough_nodes(switch_node_cnt[i], rem_nodes, |
| min_nodes, req_nodes) || |
| (rem_cpus > switch_cpu_cnt[i])) |
| continue; |
| if (!req_nodes_bitmap && |
| (nw = list_find_first(node_weight_list, |
| eval_nodes_topo_node_find, |
| switch_node_bitmap[i]))) { |
| if ((top_switch_inx == -1) || |
| ((ctx->switch_table[i].level >= |
| ctx->switch_table[top_switch_inx].level) && |
| (nw->weight <= top_switch_lowest_weight))) { |
| top_switch_inx = i; |
| top_switch_lowest_weight = nw->weight; |
| } |
| } |
| } |
| |
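| 	/* Without required nodes, start the selection from an empty node map */ |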
| if (!req_nodes_bitmap) { |
| bit_clear_all(topo_eval->node_map); |
| } |
| |
| 	/* |
| 	 * The top switch is the highest-level switch containing all required |
| 	 * nodes OR all nodes of the lowest scheduling weight, |
| 	 * OR -1 if no top-level switch can be identified, which may be due to |
| 	 * a disjoint topology with the available nodes living on different |
| 	 * switches. |
| 	 */ |
| if (top_switch_inx == -1) { |
| log_flag(SELECT_TYPE, "%pJ unable to identify top level switch", |
| job_ptr); |
| rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE; |
| goto fini; |
| } |
| |
| 	/* Check that all specifically required nodes are on a shared network */ |
| if (req_nodes_bitmap && |
| !bit_super_set(req_nodes_bitmap, |
| switch_node_bitmap[top_switch_inx])) { |
| rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE; |
| info("%pJ requires nodes that do not have shared network", |
| job_ptr); |
| goto fini; |
| } |
| |
| 	/* |
| 	 * Remove nodes from consideration that cannot be reached from this |
| 	 * top-level switch. |
| 	 */ |
| for (i = 0; i < ctx->switch_count; i++) { |
| if (top_switch_inx != i) { |
| bit_and(switch_node_bitmap[i], |
| switch_node_bitmap[top_switch_inx]); |
| } |
| } |
| |
| start_rem_cpus = rem_cpus; |
| start_rem_max_cpus = rem_max_cpus; |
| if (req_nodes_bitmap) { |
| bit_and(topo_eval->node_map, req_nodes_bitmap); |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) { |
| /* Required nodes completely satisfied the request */ |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE; |
| log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| } |
| |
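| 	/* |
| 	 * Save the current selection state so it can be restored if the |
| 	 * req_switch logic below needs to retry with fewer requested nodes. |
| 	 */ |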
| start_node_map = bit_copy(topo_eval->node_map); |
| memcpy(req_switch_required, switch_required, |
| ctx->switch_count * sizeof(int)); |
| for (i = 0; i < ctx->switch_count; i++) |
| start_switch_node_bitmap[i] = bit_copy(switch_node_bitmap[i]); |
| |
| try_again: |
| /* |
| * Identify the best set of nodes (i.e. nodes with the lowest weight, |
| * in addition to the required nodes) that can be used to satisfy the |
| * job request. All nodes must be on a common top-level switch. The |
| * logic here adds groups of nodes, all with the same weight, so we |
| * usually identify more nodes than required to satisfy the request. |
| * Later logic selects from those nodes to get the best topology. |
| */ |
| requested = false; |
| best_node_cnt = 0; |
| best_cpu_cnt = 0; |
| best_nodes_bitmap = bit_alloc(node_record_count); |
| iter = list_iterator_create(node_weight_list); |
| while (!requested && (nw = list_next(iter))) { |
| if (best_node_cnt > 0) { |
| /* |
| * All of the lower priority nodes should be included |
| * in the job's allocation. Nodes from the next highest |
| * weight nodes are included only as needed. |
| */ |
| if (req2_nodes_bitmap) |
| bit_or(req2_nodes_bitmap, best_nodes_bitmap); |
| else |
| req2_nodes_bitmap = bit_copy(best_nodes_bitmap); |
| } |
| |
| if (!bit_set_count(nw->node_bitmap)) |
| continue; |
| |
| for (i = 0; (node_ptr = next_node_bitmap(nw->node_bitmap, &i)); |
| i++) { |
| if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) |
| continue; /* Required node */ |
| if (!bit_test(switch_node_bitmap[top_switch_inx], i)) |
| continue; |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| if (topo_eval->avail_cpus == 0) { |
| bit_clear(nw->node_bitmap, i); |
| continue; |
| } |
| bit_set(best_nodes_bitmap, i); |
| avail_cpu_per_node[i] = topo_eval->avail_cpus; |
| best_cpu_cnt += topo_eval->avail_cpus; |
| best_node_cnt++; |
| if (topo_eval->gres_per_job) { |
| gres_sched_consec( |
| &best_gres, job_ptr->gres_list_req, |
| avail_res_array[i]->sock_gres_list); |
| } |
| } |
| |
| if (!sufficient) { |
| sufficient = (best_cpu_cnt >= rem_cpus) && |
| eval_nodes_enough_nodes( |
| best_node_cnt, rem_nodes, |
| min_nodes, req_nodes); |
| if (sufficient && topo_eval->gres_per_job) { |
| sufficient = gres_sched_sufficient( |
| job_ptr->gres_list_req, |
| best_gres); |
| } |
| } |
| requested = ((best_node_cnt >= rem_nodes) && |
| (best_cpu_cnt >= rem_cpus) && |
| (!topo_eval->gres_per_job || |
| gres_sched_sufficient(job_ptr->gres_list_req, |
| best_gres))); |
| } |
| list_iterator_destroy(iter); |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) { |
| char *gres_str = NULL, *gres_print = ""; |
| char *node_names; |
| if (req_nodes_bitmap) { |
| node_names = bitmap2node_name(req_nodes_bitmap); |
| info("Required nodes:%s", node_names); |
| xfree(node_names); |
| } |
| node_names = bitmap2node_name(best_nodes_bitmap); |
| if (topo_eval->gres_per_job) { |
| gres_str = gres_sched_str(best_gres); |
| if (gres_str) |
| gres_print = gres_str; |
| } |
| info("Best nodes:%s node_cnt:%d cpu_cnt:%d %s", |
| node_names, best_node_cnt, best_cpu_cnt, gres_print); |
| xfree(node_names); |
| xfree(gres_str); |
| } |
| if (!sufficient) { |
| log_flag(SELECT_TYPE, "insufficient resources currently available for %pJ", |
| job_ptr); |
| rc = SLURM_ERROR; |
| goto fini; |
| } |
| |
| 	/* |
| 	 * Add the lowest-weight nodes and treat them like required nodes for |
| 	 * the job. The job may still need to add some higher-weight nodes |
| 	 * later. |
| 	 */ |
| if (req2_nodes_bitmap) { |
| for (i = 0; |
| next_node_bitmap(req2_nodes_bitmap, &i) && (topo_eval->max_nodes > 0); |
| i++) { |
| topo_eval->avail_cpus = avail_cpu_per_node[i]; |
| if (!eval_nodes_cpus_to_use(topo_eval, i, rem_max_cpus, |
| min_rem_nodes, &maxtasks, |
| true)) { |
| /* |
| * Too many restricted gpu cores removed due to |
| * gres layout. Skip node |
| */ |
| bit_clear(req2_nodes_bitmap, i); |
| continue; |
| } |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| } |
| |
| for (i = 0, switch_ptr = ctx->switch_table; |
| i < ctx->switch_count; i++, switch_ptr++) { |
| if (switch_required[i]) |
| continue; |
| if (bit_overlap_any(req2_nodes_bitmap, |
| switch_node_bitmap[i])) { |
| switch_required[i] = 1; |
| } |
| } |
| bit_or(topo_eval->node_map, req2_nodes_bitmap); |
| |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id))) { |
| /* Required nodes completely satisfied the request */ |
| error("Scheduling anomaly for %pJ", |
| job_ptr); |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| rc = SLURM_ERROR; |
| log_flag(SELECT_TYPE, "%pJ reached maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| } |
| |
| /* |
| * Construct a set of switch array entries. |
| * Use the same indexes as ctx->switch_table in slurmctld. |
| */ |
| bit_or(best_nodes_bitmap, topo_eval->node_map); |
| avail_nodes_bitmap = bit_alloc(node_record_count); |
| for (i = 0, switch_ptr = ctx->switch_table; i < ctx->switch_count; |
| i++, switch_ptr++) { |
| bit_and(switch_node_bitmap[i], best_nodes_bitmap); |
| bit_or(avail_nodes_bitmap, switch_node_bitmap[i]); |
| switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]); |
| } |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) { |
| for (i = 0; i < ctx->switch_count; i++) { |
| char *node_names = NULL; |
| if (switch_node_cnt[i]) { |
| node_names = |
| bitmap2node_name(switch_node_bitmap[i]); |
| } |
| info("switch=%s level=%d nodes=%u:%s required:%u speed:%u", |
| ctx->switch_table[i].name, |
| ctx->switch_table[i].level, |
| switch_node_cnt[i], node_names, |
| switch_required[i], |
| ctx->switch_table[i].link_speed); |
| xfree(node_names); |
| } |
| } |
| |
| /* Add additional resources for already required leaf switches */ |
| if (req_nodes_bitmap || req2_nodes_bitmap) { |
| int num_nodes_taken = 0; |
| for (i = 0; i < ctx->switch_count; i++) { |
| if (!switch_required[i] || !switch_node_bitmap[i] || |
| (ctx->switch_table[i].level != 0)) |
| continue; |
| for (j = 0; next_node_bitmap(switch_node_bitmap[i], &j); |
| j++) { |
| if (bit_test(topo_eval->node_map, j) || |
| !avail_cpu_per_node[j]) |
| continue; |
| topo_eval->avail_cpus = avail_cpu_per_node[j]; |
| if (!eval_nodes_cpus_to_use(topo_eval, j, |
| rem_max_cpus, |
| min_rem_nodes, |
| &maxtasks, true)) { |
| avail_cpu_per_node[j] = 0; |
| continue; |
| } |
| num_nodes_taken++; |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| bit_set(topo_eval->node_map, j); |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id))) { |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| |
| if (topo_eval->max_nodes <= 0) { |
| rc = SLURM_ERROR; |
| log_flag(SELECT_TYPE, |
| "%pJ reached maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| } |
| |
| _decrement_node_cnt(num_nodes_taken, i, switch_node_cnt, |
| ctx); |
| } |
| } |
| |
| switches_dist = xcalloc(ctx->switch_count, sizeof(uint32_t)); |
| |
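| 	/* |
| 	 * Seed the distance table from every already-required switch so that |
| 	 * the next leaf switch chosen is the one closest to the switches |
| 	 * already in use. |
| 	 */ |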
| for (i = 0; i < ctx->switch_count; i++) { |
| if (switch_required[i]) |
| _topo_add_dist(switches_dist, i, ctx); |
| } |
| /* Add additional resources as required from additional leaf switches */ |
| prev_rem_nodes = rem_nodes + 1; |
| while (1) { |
| int best_switch_inx = -1; |
| |
| if (prev_rem_nodes == rem_nodes) |
| break; /* Stalled */ |
| prev_rem_nodes = rem_nodes; |
| |
| for (i = 0; i < ctx->switch_count; i++) { |
| if (switch_required[i] || !switch_node_bitmap[i] || |
| (ctx->switch_table[i].level != 0)) |
| continue; |
| _topo_choose_best_switch(switches_dist, switch_node_cnt, |
| rem_nodes, switch_cpu_cnt, |
| rem_cpus, i, &best_switch_inx, |
| ctx); |
| } |
| if (best_switch_inx == -1) |
| break; |
| |
| _topo_add_dist(switches_dist, best_switch_inx, ctx); |
| /* |
| * NOTE: Ideally we would add nodes in order of resource |
| * availability rather than in order of bitmap position, but |
| * that would add even more complexity and overhead. |
| */ |
| for (i = 0; |
| next_node_bitmap( |
| switch_node_bitmap[best_switch_inx], &i) && |
| (topo_eval->max_nodes > 0); |
| i++) { |
| if (bit_test(topo_eval->node_map, i) || |
| !avail_cpu_per_node[i]) |
| continue; |
| topo_eval->avail_cpus = avail_cpu_per_node[i]; |
| if (!eval_nodes_cpus_to_use(topo_eval, i, |
| rem_max_cpus, min_rem_nodes, |
| &maxtasks, true)) { |
| avail_cpu_per_node[i] = 0; |
| continue; |
| } |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| bit_set(topo_eval->node_map, i); |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id))) { |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| |
| if (topo_eval->max_nodes <= 0) { |
| rc = SLURM_ERROR; |
| log_flag(SELECT_TYPE, |
| "%pJ reached maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| } |
| _decrement_node_cnt(switch_node_cnt[best_switch_inx], |
| best_switch_inx, switch_node_cnt, ctx); |
| switch_node_cnt[best_switch_inx] = 0; /* Used all */ |
| } |
| if ((min_rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id))) { |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| rc = SLURM_ERROR; |
| |
| fini: |
| if (rc == SLURM_SUCCESS) |
| eval_nodes_clip_socket_cores(topo_eval); |
| |
| if (job_ptr->req_switch > 0 && rc == SLURM_SUCCESS) { |
| int leaf_switch_count = 0; |
| |
| /* Count up leaf switches. */ |
| for (i = 0, switch_ptr = ctx->switch_table; |
| i < ctx->switch_count; i++, switch_ptr++) { |
| if (ctx->switch_table[i].level != 0) |
| continue; |
| if (bit_overlap_any(switch_node_bitmap[i], topo_eval->node_map)) |
| leaf_switch_count++; |
| } |
| if (time_waiting >= job_ptr->wait4switch) { |
| job_ptr->best_switch = true; |
| debug3("%pJ waited %ld sec for switches use=%d", |
| job_ptr, time_waiting, leaf_switch_count); |
| } else if (leaf_switch_count > job_ptr->req_switch) { |
| /* |
| * Allocation is for more than requested number of |
| * switches. |
| */ |
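| 			/* |
| 			 * Reset the selection state and retry with one fewer |
| 			 * requested node, which may fit on fewer leaf |
| 			 * switches. |
| 			 */ |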
| if ((req_nodes > min_nodes) && best_nodes_bitmap) { |
| /* TRUE only for !topo_eval->gres_per_job */ |
| req_nodes--; |
| rem_nodes = req_nodes; |
| rem_nodes -= req_node_cnt; |
| min_rem_nodes = min_nodes; |
| min_rem_nodes -= req_node_cnt; |
| topo_eval->max_nodes = org_max_nodes; |
| topo_eval->max_nodes -= req_node_cnt; |
| rem_cpus = start_rem_cpus; |
| rem_max_cpus = start_rem_max_cpus; |
| xfree(switches_dist); |
| bit_copybits(topo_eval->node_map, start_node_map); |
| memcpy(switch_required, req_switch_required, |
| ctx->switch_count * sizeof(int)); |
| memset(avail_cpu_per_node, 0, |
| node_record_count * sizeof(uint16_t)); |
| for (i = 0; i < ctx->switch_count; i++) |
| bit_copybits( |
| switch_node_bitmap[i], |
| start_switch_node_bitmap[i]); |
| FREE_NULL_BITMAP(avail_nodes_bitmap); |
| FREE_NULL_BITMAP(req2_nodes_bitmap); |
| FREE_NULL_BITMAP(best_nodes_bitmap); |
| FREE_NULL_LIST(best_gres); |
| log_flag(SELECT_TYPE, "%pJ goto try_again req_nodes %d", |
| job_ptr, req_nodes); |
| goto try_again; |
| } |
| job_ptr->best_switch = false; |
| debug3("%pJ waited %ld sec for switches=%u found=%d wait %u", |
| job_ptr, time_waiting, job_ptr->req_switch, |
| leaf_switch_count, job_ptr->wait4switch); |
| } else { |
| job_ptr->best_switch = true; |
| } |
| } |
| |
| FREE_NULL_LIST(best_gres); |
| FREE_NULL_LIST(node_weight_list); |
| FREE_NULL_BITMAP(avail_nodes_bitmap); |
| FREE_NULL_BITMAP(req2_nodes_bitmap); |
| FREE_NULL_BITMAP(best_nodes_bitmap); |
| FREE_NULL_BITMAP(start_node_map); |
| xfree(avail_cpu_per_node); |
| xfree(switch_cpu_cnt); |
| if (switch_node_bitmap) { |
| for (i = 0; i < ctx->switch_count; i++) |
| FREE_NULL_BITMAP(switch_node_bitmap[i]); |
| xfree(switch_node_bitmap); |
| } |
| if (start_switch_node_bitmap) { |
| for (i = 0; i < ctx->switch_count; i++) |
| FREE_NULL_BITMAP(start_switch_node_bitmap[i]); |
| xfree(start_switch_node_bitmap); |
| } |
| xfree(switch_node_cnt); |
| xfree(switch_required); |
| xfree(req_switch_required); |
| xfree(switches_dist); |
| return rc; |
| } |
| |
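| /* |
|  * Entry point for tree topology node selection: use the dragonfly or the |
|  * generic tree evaluation depending on TopologyParam, or return |
|  * ESLURM_NOT_SUPPORTED when the job requires contiguous nodes or when |
|  * TopoOptional is set and the job did not request a switch count. |
|  */ |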
| extern int eval_nodes_tree(topology_eval_t *topo_eval) |
| { |
| job_record_t *job_ptr = topo_eval->job_ptr; |
| job_details_t *details_ptr = job_ptr->details; |
| |
| static bool have_dragonfly = false; |
| static bool topo_optional = false; |
| |
| static bool set = false; |
| |
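| 	/* Parse the TopologyParam flags once and cache the result */ |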
| if (!set) { |
| if (xstrcasestr(slurm_conf.topology_param, "dragonfly")) |
| have_dragonfly = true; |
| if (xstrcasestr(slurm_conf.topology_param, "TopoOptional")) |
| topo_optional = true; |
| set = true; |
| } |
| |
| xassert(((tree_context_t *) topo_eval->tctx->plugin_ctx)->switch_count); |
| xassert(((tree_context_t *) topo_eval->tctx->plugin_ctx)->switch_table); |
| |
| if (!details_ptr->contiguous && |
| ((topo_optional == false) || topo_eval->job_ptr->req_switch)) { |
| /* Perform optimized resource selection based upon topology */ |
| if (have_dragonfly) { |
| return _eval_nodes_dfly(topo_eval); |
| } else { |
| return _eval_nodes_topo(topo_eval); |
| } |
| } |
| |
| return ESLURM_NOT_SUPPORTED; |
| } |