blob: d210c27245b58a0951bbe9735aa5a131a170dc8f [file] [log] [blame]
/*****************************************************************************\
* eval_nodes_tree.c - Determine order of nodes for job using tree algo.
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "eval_nodes_tree.h"
#include "../common/eval_nodes.h"
#include "../common/gres_sched.h"
#include "src/common/xstring.h"
/*
 * Accumulate the distance vector of switch "inx" into "dist".
 * A distance of INFINITE on either side saturates the sum to INFINITE
 * (the switches are unreachable from each other).
 */
static void _topo_add_dist(uint32_t *dist, int inx, tree_context_t *ctx)
{
	for (int k = 0; k < ctx->switch_count; k++) {
		uint32_t d = ctx->switch_table[inx].switches_dist[k];

		dist[k] = ((d == INFINITE) || (dist[k] == INFINITE)) ?
			INFINITE : (dist[k] + d);
	}
}
/*
 * Compare switches "i" and "j" as candidates for the job allocation.
 *
 * If neither switch can hold the whole remaining request, climb one level
 * toward each switch's parent and retry, stopping once both are
 * self-parented (roots) or share the same parent.
 *
 * *i_fit_out is set to whether switch "i" (possibly replaced by an ancestor
 * during the climb) satisfies both the remaining node and CPU counts.
 *
 * returns 1 if switch "i" is better fit
 * returns -1 if switch "j" is better fit
 * returns 0 if there is no better fit
 */
static int _topo_compare_switches(int i, int j, int rem_nodes,
				  int *switch_node_cnt, int rem_cpus,
				  uint32_t *switch_cpu_cnt, bool *i_fit_out,
				  tree_context_t *ctx)
{
	while (1) {
		/* Can each switch hold the entire remaining request? */
		bool i_fit = ((switch_node_cnt[i] >= rem_nodes) &&
			      ((int) switch_cpu_cnt[i] >= rem_cpus));
		bool j_fit = ((switch_node_cnt[j] >= rem_nodes) &&
			      ((int) switch_cpu_cnt[j] >= rem_cpus));

		*i_fit_out = i_fit;

		if (i_fit && j_fit) {
			/* Both fit: prefer the tighter fit (fewer nodes) */
			if (switch_node_cnt[i] < switch_node_cnt[j])
				return 1;
			if (switch_node_cnt[i] > switch_node_cnt[j])
				return -1;
			break;
		} else if (i_fit) {
			return 1;
		} else if (j_fit) {
			return -1;
		}

		/*
		 * Neither fits: move both indexes up one level toward a
		 * common ancestor and compare again.
		 */
		if (((ctx->switch_table[i].parent != i) ||
		     (ctx->switch_table[j].parent != j)) &&
		    (ctx->switch_table[i].parent !=
		     ctx->switch_table[j].parent)) {
			i = ctx->switch_table[i].parent;
			j = ctx->switch_table[j].parent;
			continue;
		}
		break;
	}

	/*
	 * Tie-break when neither (or both, equally sized) fit: prefer the
	 * larger subtree, then the lower level in the tree.
	 */
	if (switch_node_cnt[i] > switch_node_cnt[j])
		return 1;
	if (switch_node_cnt[i] < switch_node_cnt[j])
		return -1;
	if (ctx->switch_table[i].level < ctx->switch_table[j].level)
		return 1;
	if (ctx->switch_table[i].level > ctx->switch_table[j].level)
		return -1;
	return 0;
}
/*
 * Consider switch "i" as a replacement for *best_switch.
 * A candidate is usable only if it has available nodes and a finite
 * distance; it wins when it is closer and fits the remaining request,
 * or equally distant but a tighter fit.
 */
static void _topo_choose_best_switch(uint32_t *dist, int *switch_node_cnt,
				     int rem_nodes, uint32_t *switch_cpu_cnt,
				     int rem_cpus, int i, int *best_switch,
				     tree_context_t *ctx)
{
	bool candidate_fits = false;
	bool usable = switch_node_cnt[i] && (dist[i] < INFINITE);
	int cmp;

	if ((*best_switch == -1) || !usable) {
		/*
		 * No incumbent yet, or candidate unusable: adopt the
		 * candidate only when it is usable.
		 */
		if (usable)
			*best_switch = i;
		return;
	}

	cmp = _topo_compare_switches(i, *best_switch, rem_nodes,
				     switch_node_cnt, rem_cpus,
				     switch_cpu_cnt, &candidate_fits, ctx);

	/*
	 * Closer and fits the request, OR same distance and tighter fit
	 * (less resource waste).
	 */
	if ((candidate_fits && (dist[i] < dist[*best_switch])) ||
	    ((cmp > 0) && (dist[i] == dist[*best_switch])))
		*best_switch = i;
}
/*
 * Allocate resources to the job on one leaf switch if possible,
 * otherwise distribute the job allocation over many leaf switches.
 *
 * Overall flow:
 *   1. Validate any specifically required nodes and charge them to the job.
 *   2. Build a list of candidate node bitmaps grouped by scheduling weight.
 *   3. Identify the top-level switch spanning the required (or lowest
 *      weight) nodes and prune nodes unreachable from it.
 *   4. Accumulate "best" nodes by increasing weight until the request can
 *      be satisfied.
 *   5. Try to complete the allocation on a single leaf switch; if that
 *      fails, add nodes from additional leaf switches round-robin.
 *
 * Returns SLURM_SUCCESS when topo_eval->node_map holds a satisfying
 * allocation, otherwise an error code.
 */
static int _eval_nodes_dfly(topology_eval_t *topo_eval)
{
	list_t **switch_gres = NULL;		/* available GRES on switch */
	bitstr_t **switch_node_bitmap = NULL;	/* nodes on this switch */
	int *switch_node_cnt = NULL;		/* total nodes on switch */
	int *switch_required = NULL;		/* set if has required node */
	bitstr_t *avail_nodes_bitmap = NULL;	/* nodes on any switch */
	bitstr_t *req_nodes_bitmap = NULL;	/* required node bitmap */
	bitstr_t *req2_nodes_bitmap = NULL;	/* required+lowest prio nodes */
	bitstr_t *best_nodes_bitmap = NULL;	/* required+low prio nodes */
	int i, j, rc = SLURM_SUCCESS;
	int best_cpu_cnt = 0, best_node_cnt = 0, req_node_cnt = 0;
	list_t *best_gres = NULL;
	switch_record_t *switch_ptr;
	list_t *node_weight_list = NULL;
	topo_weight_info_t *nw = NULL;
	list_itr_t *iter;
	node_record_t *node_ptr;
	int64_t rem_max_cpus;
	int rem_cpus, rem_nodes; /* remaining resources desired */
	int min_rem_nodes;	/* remaining resources desired */
	job_record_t *job_ptr = topo_eval->job_ptr;
	job_details_t *details_ptr = job_ptr->details;
	bool sufficient = false;
	uint16_t *avail_cpu_per_node = NULL;
	time_t time_waiting = 0;
	int leaf_switch_count = 0;
	int top_switch_inx = -1;
	int prev_rem_nodes;
	avail_res_t **avail_res_array = topo_eval->avail_res_array;
	uint32_t min_nodes = topo_eval->min_nodes;
	uint32_t req_nodes = topo_eval->req_nodes;
	uint64_t maxtasks;
	tree_context_t *ctx = topo_eval->tctx->plugin_ctx;

	topo_eval->avail_cpus = 0;

	if (job_ptr->req_switch > 1) {
		/* Maximum leaf switch count >1 probably makes no sense */
		info("Resetting %pJ leaf switch count from %u to 0",
		     job_ptr, job_ptr->req_switch);
		job_ptr->req_switch = 0;
	}

	/* Track how long the job has been waiting for its switch request */
	if (job_ptr->req_switch) {
		time_t time_now;
		time_now = time(NULL);
		if (job_ptr->wait4switch_start == 0)
			job_ptr->wait4switch_start = time_now;
		time_waiting = time_now - job_ptr->wait4switch_start;
	}

	rem_cpus = details_ptr->min_cpus;
	min_rem_nodes = min_nodes;
	if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req)))
		rem_nodes = MIN(min_nodes, req_nodes);
	else
		rem_nodes = MAX(min_nodes, req_nodes);

	rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
	maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
					    topo_eval->max_nodes);

	/* Validate availability of required nodes */
	if (job_ptr->details->req_node_bitmap) {
		if (!bit_super_set(job_ptr->details->req_node_bitmap,
				   topo_eval->node_map)) {
			info("%pJ requires nodes which are not currently available",
			     job_ptr);
			rc = SLURM_ERROR;
			goto fini;
		}
		req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap);
		if (req_node_cnt == 0) {
			info("%pJ required node list has no nodes",
			     job_ptr);
			rc = SLURM_ERROR;
			goto fini;
		}
		if (req_node_cnt > topo_eval->max_nodes) {
			info("%pJ requires more nodes than currently available (%u>%u)",
			     job_ptr, req_node_cnt,
			     topo_eval->max_nodes);
			rc = SLURM_ERROR;
			goto fini;
		}
		req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap);
	}

	/*
	 * Add required nodes to job allocation and
	 * build list of node bitmaps, sorted by weight
	 */
	if (!bit_set_count(topo_eval->node_map)) {
		debug("%pJ node_map is empty",
		      job_ptr);
		rc = SLURM_ERROR;
		goto fini;
	}
	avail_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t));
	node_weight_list = list_create(eval_nodes_topo_weight_free);
	for (i = 0;
	     (node_ptr = next_node_bitmap(topo_eval->node_map, &i));
	     i++) {
		topo_weight_info_t nw_static;
		if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) {
			/* Charge the required node's resources to the job */
			eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
			(void) eval_nodes_cpus_to_use(
				topo_eval, i, rem_max_cpus, min_rem_nodes,
				&maxtasks, true);
			if (topo_eval->avail_cpus == 0) {
				log_flag(SELECT_TYPE, "%pJ insufficient resources on required node",
					 job_ptr);
				rc = SLURM_ERROR;
				goto fini;
			}
			avail_cpu_per_node[i] = topo_eval->avail_cpus;
			rem_nodes--;
			min_rem_nodes--;
			topo_eval->max_nodes--;
			rem_cpus -= topo_eval->avail_cpus;
			rem_max_cpus -= topo_eval->avail_cpus;
		}

		/* Group the node into its scheduling-weight bucket */
		nw_static.weight = node_ptr->sched_weight;
		nw = list_find_first(node_weight_list,
				     eval_nodes_topo_weight_find,
				     &nw_static);
		if (!nw) {	/* New node weight to add */
			nw = xmalloc(sizeof(topo_weight_info_t));
			nw->node_bitmap = bit_alloc(node_record_count);
			nw->weight = node_ptr->sched_weight;
			list_append(node_weight_list, nw);
		}
		bit_set(nw->node_bitmap, i);
		nw->node_cnt++;
	}

	if (req_nodes_bitmap) {
		bit_and(topo_eval->node_map, req_nodes_bitmap);
		/*
		 * NOTE(review): unlike the later completion checks, this one
		 * does not guard gres_sched_test() with
		 * !topo_eval->gres_per_job — presumably gres_sched_test()
		 * handles an empty request; confirm against its definition.
		 */
		if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
		    gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
			/* Required nodes completely satisfied the request */
			rc = SLURM_SUCCESS;
			goto fini;
		}
		if (topo_eval->max_nodes <= 0) {
			rc = SLURM_ERROR;
			log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit",
				 job_ptr);
			goto fini;
		}
	} else {
		bit_clear_all(topo_eval->node_map);
	}

	list_sort(node_weight_list, eval_nodes_topo_weight_sort);
	if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE)
		(void) list_for_each(node_weight_list,
				     eval_nodes_topo_weight_log, NULL);

	/*
	 * Identify the highest level switch to be used.
	 * Note that nodes can be on multiple non-overlapping switches.
	 */
	/*
	 * NOTE(review): switch_gres is allocated and freed in this function
	 * but never otherwise referenced here.
	 */
	switch_gres = xcalloc(ctx->switch_count, sizeof(list_t *));
	switch_node_bitmap = xcalloc(ctx->switch_count, sizeof(bitstr_t *));
	switch_node_cnt = xcalloc(ctx->switch_count, sizeof(int));
	switch_required = xcalloc(ctx->switch_count, sizeof(int));

	if (!req_nodes_bitmap)
		nw = list_peek(node_weight_list);
	for (i = 0, switch_ptr = ctx->switch_table; i < ctx->switch_count;
	     i++, switch_ptr++) {
		switch_node_bitmap[i] = bit_copy(switch_ptr->node_bitmap);
		if (req_nodes_bitmap &&
		    bit_overlap_any(req_nodes_bitmap, switch_node_bitmap[i])) {
			switch_required[i] = 1;
			if (ctx->switch_table[i].level == 0) {
				leaf_switch_count++;
			}
			if ((top_switch_inx == -1) ||
			    (ctx->switch_table[i].level >
			     ctx->switch_table[top_switch_inx].level)) {
				top_switch_inx = i;
			}
		}
		if (!req_nodes_bitmap &&
		    (list_find_first(node_weight_list,
				     eval_nodes_topo_node_find,
				     switch_node_bitmap[i]))) {
			if ((top_switch_inx == -1) ||
			    (ctx->switch_table[i].level >
			     ctx->switch_table[top_switch_inx].level)) {
				top_switch_inx = i;
			}
		}
	}

	/*
	 * Top switch is highest level switch containing all required nodes
	 * OR all nodes of the lowest scheduling weight
	 * OR -1 if can not identify top-level switch
	 */
	if (top_switch_inx == -1) {
		error("%pJ unable to identify top level switch",
		      job_ptr);
		rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
		goto fini;
	}

	/* Check that all specifically required nodes are on shared network */
	if (req_nodes_bitmap &&
	    !bit_super_set(req_nodes_bitmap,
			   switch_node_bitmap[top_switch_inx])) {
		rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
		info("%pJ requires nodes that do not have shared network",
		     job_ptr);
		goto fini;
	}

	/*
	 * Remove nodes from consideration that can not be reached from this
	 * top level switch
	 */
	for (i = 0; i < ctx->switch_count; i++) {
		if (top_switch_inx != i) {
			bit_and(switch_node_bitmap[i],
				switch_node_bitmap[top_switch_inx]);
		}
	}

	/*
	 * Identify the best set of nodes (i.e. nodes with the lowest weight,
	 * in addition to the required nodes) that can be used to satisfy the
	 * job request. All nodes must be on a common top-level switch. The
	 * logic here adds groups of nodes, all with the same weight, so we
	 * usually identify more nodes than required to satisfy the request.
	 * Later logic selects from those nodes to get the best topology.
	 */
	best_nodes_bitmap = bit_alloc(node_record_count);
	iter = list_iterator_create(node_weight_list);
	while (!sufficient && (nw = list_next(iter))) {
		if (best_node_cnt > 0) {
			/*
			 * All of the lower priority nodes should be included
			 * in the job's allocation. Nodes from the next highest
			 * weight nodes are included only as needed.
			 */
			if (req2_nodes_bitmap)
				bit_or(req2_nodes_bitmap, best_nodes_bitmap);
			else
				req2_nodes_bitmap = bit_copy(best_nodes_bitmap);
		}
		for (i = 0; next_node_bitmap(nw->node_bitmap, &i); i++) {
			if (avail_cpu_per_node[i])
				continue;	/* Required node */
			if (!bit_test(switch_node_bitmap[top_switch_inx], i))
				continue;
			eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
			if (topo_eval->avail_cpus == 0) {
				bit_clear(nw->node_bitmap, i);
				continue;
			}
			bit_set(best_nodes_bitmap, i);
			avail_cpu_per_node[i] = topo_eval->avail_cpus;
			best_cpu_cnt += topo_eval->avail_cpus;
			best_node_cnt++;
			if (topo_eval->gres_per_job) {
				gres_sched_consec(
					&best_gres, job_ptr->gres_list_req,
					avail_res_array[i]->sock_gres_list);
			}
		}
		sufficient = (best_cpu_cnt >= rem_cpus) &&
			     eval_nodes_enough_nodes(best_node_cnt, rem_nodes,
						     min_nodes, req_nodes);
		if (sufficient && topo_eval->gres_per_job) {
			sufficient = gres_sched_sufficient(
				job_ptr->gres_list_req, best_gres);
		}
	}
	list_iterator_destroy(iter);

	/* Log the candidate node set when requested */
	if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		char *gres_str = NULL, *gres_print = "";
		char *node_names;
		if (req_nodes_bitmap) {
			node_names = bitmap2node_name(req_nodes_bitmap);
			info("Required nodes:%s", node_names);
			xfree(node_names);
		}
		node_names = bitmap2node_name(best_nodes_bitmap);
		if (topo_eval->gres_per_job) {
			gres_str = gres_sched_str(best_gres);
			if (gres_str)
				gres_print = gres_str;
		}
		info("Best nodes:%s node_cnt:%d cpu_cnt:%d %s",
		     node_names, best_node_cnt, best_cpu_cnt, gres_print);
		xfree(node_names);
		xfree(gres_str);
	}
	if (!sufficient) {
		log_flag(SELECT_TYPE, "insufficient resources currently available for %pJ",
			 job_ptr);
		rc = SLURM_ERROR;
		goto fini;
	}

	/*
	 * Add lowest weight nodes. Treat similar to required nodes for the job.
	 * Job will still need to add some higher weight nodes later.
	 */
	if (req2_nodes_bitmap) {
		for (i = 0;
		     next_node_bitmap(req2_nodes_bitmap, &i) && (topo_eval->max_nodes > 0);
		     i++) {
			topo_eval->avail_cpus = avail_cpu_per_node[i];
			if (!eval_nodes_cpus_to_use(
				    topo_eval, i, rem_max_cpus, min_rem_nodes,
				    &maxtasks, true)) {
				/*
				 * Too many restricted cores removed due to
				 * gres layout. Skip node
				 */
				bit_clear(req2_nodes_bitmap, i);
				continue;
			}
			rem_nodes--;
			min_rem_nodes--;
			topo_eval->max_nodes--;
			rem_cpus -= topo_eval->avail_cpus;
			rem_max_cpus -= topo_eval->avail_cpus;
		}

		/* Mark every switch touched by the newly added nodes */
		for (i = 0, switch_ptr = ctx->switch_table;
		     i < ctx->switch_count; i++, switch_ptr++) {
			if (switch_required[i])
				continue;
			if (bit_overlap_any(req2_nodes_bitmap,
					    switch_node_bitmap[i])) {
				switch_required[i] = 1;
				if (ctx->switch_table[i].level == 0) {
					leaf_switch_count++;
				}
			}
		}
		bit_or(topo_eval->node_map, req2_nodes_bitmap);

		if (topo_eval->max_nodes <= 0) {
			rc = SLURM_ERROR;
			log_flag(SELECT_TYPE, "%pJ reached maximum node limit",
				 job_ptr);
			goto fini;
		}
		if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
		    (!topo_eval->gres_per_job ||
		     gres_sched_test(job_ptr->gres_list_req,
				     job_ptr->job_id))) {
			/* Required nodes completely satisfied the request */
			error("Scheduling anomaly for %pJ",
			      job_ptr);
			rc = SLURM_SUCCESS;
			goto fini;
		}
	}

	/*
	 * Construct a set of switch array entries.
	 * Use the same indexes as ctx->switch_table in slurmctld.
	 */
	bit_or(best_nodes_bitmap, topo_eval->node_map);
	avail_nodes_bitmap = bit_alloc(node_record_count);
	for (i = 0, switch_ptr = ctx->switch_table; i < ctx->switch_count;
	     i++, switch_ptr++) {
		bit_and(switch_node_bitmap[i], best_nodes_bitmap);
		bit_or(avail_nodes_bitmap, switch_node_bitmap[i]);
		switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]);
	}

	if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		for (i = 0; i < ctx->switch_count; i++) {
			char *node_names = NULL;
			if (switch_node_cnt[i]) {
				node_names =
					bitmap2node_name(switch_node_bitmap[i]);
			}
			info("switch=%s level=%d nodes=%u:%s required:%u speed:%u",
			     ctx->switch_table[i].name,
			     ctx->switch_table[i].level,
			     switch_node_cnt[i], node_names,
			     switch_required[i],
			     ctx->switch_table[i].link_speed);
			xfree(node_names);
		}
	}

	if (req_nodes_bitmap &&
	    (!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) {
		info("%pJ requires nodes not available on any switch",
		     job_ptr);
		rc = SLURM_ERROR;
		goto fini;
	}

	/*
	 * If no resources have yet been selected,
	 * then pick one leaf switch with the most available nodes.
	 */
	if (leaf_switch_count == 0) {
		int best_switch_inx = -1;
		for (i = 0; i < ctx->switch_count; i++) {
			if (ctx->switch_table[i].level != 0)
				continue;
			if ((best_switch_inx == -1) ||
			    (switch_node_cnt[i] >
			     switch_node_cnt[best_switch_inx]))
				best_switch_inx = i;
		}
		if (best_switch_inx != -1) {
			leaf_switch_count = 1;
			switch_required[best_switch_inx] = 1;
		}
	}

	/*
	 * All required resources currently on one leaf switch. Determine if
	 * the entire job request can be satisfied using just that one switch.
	 */
	if (leaf_switch_count == 1) {
		best_cpu_cnt = 0;
		best_node_cnt = 0;
		FREE_NULL_LIST(best_gres);
		/* First pass: total up what this one leaf switch offers */
		for (i = 0; i < ctx->switch_count; i++) {
			if (!switch_required[i] || !switch_node_bitmap[i] ||
			    (ctx->switch_table[i].level != 0))
				continue;
			for (j = 0; next_node_bitmap(switch_node_bitmap[i], &j);
			     j++) {
				if (bit_test(topo_eval->node_map, j) ||
				    !avail_cpu_per_node[j])
					continue;
				topo_eval->avail_cpus = avail_cpu_per_node[j];
				best_cpu_cnt += topo_eval->avail_cpus;
				best_node_cnt++;
				if (topo_eval->gres_per_job) {
					gres_sched_consec(
						&best_gres,
						job_ptr->gres_list_req,
						avail_res_array[j]->
						sock_gres_list);
				}
			}
			break;	/* "i" is left at the selected leaf switch */
		}
		sufficient = (best_cpu_cnt >= rem_cpus) &&
			     eval_nodes_enough_nodes(best_node_cnt, rem_nodes,
						     min_nodes, req_nodes);
		if (sufficient && topo_eval->gres_per_job) {
			sufficient = gres_sched_sufficient(
				job_ptr->gres_list_req, best_gres);
		}
		if (sufficient && (i < ctx->switch_count)) {
			/* Complete request using this one leaf switch */
			for (j = 0; next_node_bitmap(switch_node_bitmap[i], &j);
			     j++) {
				if (bit_test(topo_eval->node_map, j) ||
				    !avail_cpu_per_node[j])
					continue;
				topo_eval->avail_cpus = avail_cpu_per_node[j];
				if (!eval_nodes_cpus_to_use(topo_eval, j,
							    rem_max_cpus,
							    min_rem_nodes,
							    &maxtasks, true)) {
					avail_cpu_per_node[j] = 0;
					continue;
				}
				rem_nodes--;
				min_rem_nodes--;
				topo_eval->max_nodes--;
				rem_cpus -= topo_eval->avail_cpus;
				rem_max_cpus -= topo_eval->avail_cpus;
				bit_set(topo_eval->node_map, j);
				if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
				    (!topo_eval->gres_per_job ||
				     gres_sched_test(job_ptr->gres_list_req,
						     job_ptr->job_id))) {
					rc = SLURM_SUCCESS;
					goto fini;
				}
				if (topo_eval->max_nodes <= 0) {
					rc = SLURM_ERROR;
					log_flag(SELECT_TYPE, "%pJ reached maximum node limit",
						 job_ptr);
					goto fini;
				}
			}
		}
	}

	/*
	 * Add additional resources as required from additional leaf switches
	 * on a round-robin basis
	 */
	prev_rem_nodes = rem_nodes + 1;
	while (1) {
		if (prev_rem_nodes == rem_nodes)
			break;	/* Stalled */
		prev_rem_nodes = rem_nodes;
		for (i = 0; i < ctx->switch_count; i++) {
			if (!switch_node_bitmap[i] ||
			    (ctx->switch_table[i].level != 0))
				continue;
			/* Take at most one node per leaf switch per sweep */
			for (j = 0; next_node_bitmap(switch_node_bitmap[i], &j);
			     j++) {
				if (bit_test(topo_eval->node_map, j) ||
				    !avail_cpu_per_node[j])
					continue;
				topo_eval->avail_cpus = avail_cpu_per_node[j];
				if (!eval_nodes_cpus_to_use(topo_eval, j,
							    rem_max_cpus,
							    min_rem_nodes,
							    &maxtasks, true)) {
					avail_cpu_per_node[j] = 0;
					continue;
				}
				rem_nodes--;
				min_rem_nodes--;
				topo_eval->max_nodes--;
				rem_cpus -= topo_eval->avail_cpus;
				rem_max_cpus -= topo_eval->avail_cpus;
				bit_set(topo_eval->node_map, j);
				if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
				    (!topo_eval->gres_per_job ||
				     gres_sched_test(job_ptr->gres_list_req,
						     job_ptr->job_id))) {
					rc = SLURM_SUCCESS;
					goto fini;
				}
				if (topo_eval->max_nodes <= 0) {
					rc = SLURM_ERROR;
					log_flag(SELECT_TYPE, "%pJ reached maximum node limit",
						 job_ptr);
					goto fini;
				}
				break;	/* Move to next switch */
			}
		}
	}
	if ((min_rem_nodes <= 0) && (rem_cpus <= 0) &&
	    (!topo_eval->gres_per_job ||
	     gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id))) {
		rc = SLURM_SUCCESS;
		goto fini;
	}
	rc = SLURM_ERROR;

fini:
	if (rc == SLURM_SUCCESS)
		eval_nodes_clip_socket_cores(topo_eval);

	/* Evaluate whether the allocation honors the requested switch count */
	if ((job_ptr->req_switch > 0) && (rc == SLURM_SUCCESS) &&
	    switch_node_bitmap) {
		/* req_switch == 1 here; enforced at the top of the function. */
		leaf_switch_count = 0;

		/* count up leaf switches */
		for (i = 0, switch_ptr = ctx->switch_table;
		     i < ctx->switch_count; i++, switch_ptr++) {
			if (ctx->switch_table[i].level != 0)
				continue;
			if (bit_overlap_any(switch_node_bitmap[i], topo_eval->node_map))
				leaf_switch_count++;
		}
		if (time_waiting >= job_ptr->wait4switch) {
			job_ptr->best_switch = true;
			debug3("%pJ waited %ld sec for switches use=%d",
			       job_ptr, time_waiting, leaf_switch_count);
		} else if (leaf_switch_count > job_ptr->req_switch) {
			/*
			 * Allocation is for more than requested number of
			 * switches.
			 */
			job_ptr->best_switch = false;
			debug3("%pJ waited %ld sec for switches=%u found=%d wait %u",
			       job_ptr, time_waiting, job_ptr->req_switch,
			       leaf_switch_count, job_ptr->wait4switch);
		} else {
			job_ptr->best_switch = true;
		}
	}

	/* Release all working storage (goto-cleanup pattern) */
	FREE_NULL_LIST(best_gres);
	FREE_NULL_LIST(node_weight_list);
	FREE_NULL_BITMAP(avail_nodes_bitmap);
	FREE_NULL_BITMAP(req_nodes_bitmap);
	FREE_NULL_BITMAP(req2_nodes_bitmap);
	FREE_NULL_BITMAP(best_nodes_bitmap);
	xfree(avail_cpu_per_node);
	xfree(switch_gres);
	if (switch_node_bitmap) {
		for (i = 0; i < ctx->switch_count; i++)
			FREE_NULL_BITMAP(switch_node_bitmap[i]);
		xfree(switch_node_bitmap);
	}
	xfree(switch_node_cnt);
	xfree(switch_required);
	return rc;
}
/*
 * Subtract "num_nodes_taken" from the available-node count of switch
 * "switch_index" and every ancestor up to (and including) the root,
 * clamping each count at zero.
 */
static void _decrement_node_cnt(int num_nodes_taken, int switch_index,
				int *switch_node_cnt, tree_context_t *ctx)
{
	int inx = switch_index;

	while (inx >= 0) {
		if (switch_node_cnt[inx] > num_nodes_taken)
			switch_node_cnt[inx] -= num_nodes_taken;
		else
			switch_node_cnt[inx] = 0;	/* clamp at zero */
		/* end once we've reached root switch */
		if (ctx->switch_table[inx].parent == SWITCH_NO_PARENT)
			break;
		inx = ctx->switch_table[inx].parent;
	}
}
/* Allocate resources to job using a minimal leaf switch count */
static int _eval_nodes_topo(topology_eval_t *topo_eval)
{
uint32_t *switch_cpu_cnt = NULL; /* total CPUs on switch */
bitstr_t **switch_node_bitmap = NULL; /* nodes on this switch */
bitstr_t **start_switch_node_bitmap = NULL;
int *switch_node_cnt = NULL; /* total nodes on switch */
int *switch_required = NULL; /* set if has required node */
int *req_switch_required = NULL;
bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */
bitstr_t *req_nodes_bitmap = NULL; /* required node bitmap */
bitstr_t *req2_nodes_bitmap = NULL; /* required+lowest prio nodes */
bitstr_t *best_nodes_bitmap = NULL; /* required+low prio nodes */
bitstr_t *start_node_map = NULL;
int i, j, rc = SLURM_SUCCESS;
int best_cpu_cnt, best_node_cnt, req_node_cnt = 0;
list_t *best_gres = NULL;
switch_record_t *switch_ptr;
list_t *node_weight_list = NULL;
topo_weight_info_t *nw = NULL;
list_itr_t *iter;
node_record_t *node_ptr;
int64_t rem_max_cpus, start_rem_max_cpus = 0;
int rem_cpus, start_rem_cpus = 0, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
job_record_t *job_ptr = topo_eval->job_ptr;
job_details_t *details_ptr = job_ptr->details;
bool requested, sufficient = false;
uint16_t *avail_cpu_per_node = NULL;
uint32_t *switches_dist= NULL;
time_t time_waiting = 0;
int top_switch_inx = -1;
uint64_t top_switch_lowest_weight = 0;
int prev_rem_nodes;
avail_res_t **avail_res_array = topo_eval->avail_res_array;
uint32_t min_nodes = topo_eval->min_nodes;
uint32_t req_nodes = topo_eval->req_nodes;
uint32_t org_max_nodes = topo_eval->max_nodes;
uint64_t maxtasks;
tree_context_t *ctx = topo_eval->tctx->plugin_ctx;
topo_eval->avail_cpus = 0;
if (job_ptr->req_switch) {
time_t time_now;
time_now = time(NULL);
if (job_ptr->wait4switch_start == 0)
job_ptr->wait4switch_start = time_now;
time_waiting = time_now - job_ptr->wait4switch_start;
}
rem_cpus = details_ptr->min_cpus;
min_rem_nodes = min_nodes;
if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
topo_eval->max_nodes);
/* Validate availability of required nodes */
if (job_ptr->details->req_node_bitmap) {
if (!bit_super_set(job_ptr->details->req_node_bitmap,
topo_eval->node_map)) {
info("%pJ requires nodes which are not currently available",
job_ptr);
rc = SLURM_ERROR;
goto fini;
}
req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap);
if (req_node_cnt == 0) {
info("%pJ required node list has no nodes",
job_ptr);
rc = SLURM_ERROR;
goto fini;
}
if (req_node_cnt > topo_eval->max_nodes) {
info("%pJ requires more nodes than currently available (%u>%u)",
job_ptr, req_node_cnt,
topo_eval->max_nodes);
rc = SLURM_ERROR;
goto fini;
}
req_nodes_bitmap = job_ptr->details->req_node_bitmap;
}
/*
* Add required nodes to job allocation and
* build list of node bitmaps, sorted by weight
*/
if (!bit_set_count(topo_eval->node_map)) {
debug("%pJ node_map is empty",
job_ptr);
rc = SLURM_ERROR;
goto fini;
}
avail_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t));
node_weight_list = list_create(eval_nodes_topo_weight_free);
for (i = 0;
(node_ptr = next_node_bitmap(topo_eval->node_map, &i));
i++) {
topo_weight_info_t nw_static;
if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) {
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true);
if (topo_eval->avail_cpus == 0) {
debug2("%pJ insufficient resources on required node",
job_ptr);
rc = SLURM_ERROR;
goto fini;
}
avail_cpu_per_node[i] = topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
}
nw_static.weight = node_ptr->sched_weight;
nw = list_find_first(node_weight_list,
eval_nodes_topo_weight_find,
&nw_static);
if (!nw) { /* New node weight to add */
nw = xmalloc(sizeof(topo_weight_info_t));
nw->node_bitmap = bit_alloc(node_record_count);
nw->weight = node_ptr->sched_weight;
list_append(node_weight_list, nw);
}
bit_set(nw->node_bitmap, i);
nw->node_cnt++;
}
list_sort(node_weight_list, eval_nodes_topo_weight_sort);
if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE)
(void) list_for_each(node_weight_list,
eval_nodes_topo_weight_log, NULL);
/*
* Identify the highest level switch to be used.
* Note that nodes can be on multiple non-overlapping switches.
*/
switch_cpu_cnt = xcalloc(ctx->switch_count, sizeof(uint32_t));
switch_node_bitmap = xcalloc(ctx->switch_count, sizeof(bitstr_t *));
start_switch_node_bitmap =
xcalloc(ctx->switch_count, sizeof(bitstr_t *));
switch_node_cnt = xcalloc(ctx->switch_count, sizeof(int));
switch_required = xcalloc(ctx->switch_count, sizeof(int));
req_switch_required = xcalloc(ctx->switch_count, sizeof(int));
for (i = 0, switch_ptr = ctx->switch_table; i < ctx->switch_count;
i++, switch_ptr++) {
uint32_t switch_cpus = 0;
switch_node_bitmap[i] = bit_copy(switch_ptr->node_bitmap);
bit_and(switch_node_bitmap[i], topo_eval->node_map);
switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]);
/*
* Count total CPUs of the intersection of node_map and
* switch_node_bitmap.
*/
for (j = 0; (node_ptr = next_node_bitmap(switch_node_bitmap[i],
&j));
j++)
switch_cpus += avail_res_array[j]->avail_cpus;
switch_cpu_cnt[i] = switch_cpus;
if (req_nodes_bitmap &&
bit_overlap_any(req_nodes_bitmap, switch_node_bitmap[i])) {
switch_required[i] = 1;
if ((top_switch_inx == -1) ||
(ctx->switch_table[i].level >
ctx->switch_table[top_switch_inx].level)) {
top_switch_inx = i;
}
}
if (!eval_nodes_enough_nodes(switch_node_cnt[i], rem_nodes,
min_nodes, req_nodes) ||
(rem_cpus > switch_cpu_cnt[i]))
continue;
if (!req_nodes_bitmap &&
(nw = list_find_first(node_weight_list,
eval_nodes_topo_node_find,
switch_node_bitmap[i]))) {
if ((top_switch_inx == -1) ||
((ctx->switch_table[i].level >=
ctx->switch_table[top_switch_inx].level) &&
(nw->weight <= top_switch_lowest_weight))) {
top_switch_inx = i;
top_switch_lowest_weight = nw->weight;
}
}
}
if (!req_nodes_bitmap) {
bit_clear_all(topo_eval->node_map);
}
/*
* Top switch is highest level switch containing all required nodes
* OR all nodes of the lowest scheduling weight
* OR -1 if can not identify top-level switch, which may be due to a
* disjoint topology and available nodes living on different switches.
*/
if (top_switch_inx == -1) {
log_flag(SELECT_TYPE, "%pJ unable to identify top level switch",
job_ptr);
rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
goto fini;
}
/* Check that all specifically required nodes are on shared network */
if (req_nodes_bitmap &&
!bit_super_set(req_nodes_bitmap,
switch_node_bitmap[top_switch_inx])) {
rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
info("%pJ requires nodes that do not have shared network",
job_ptr);
goto fini;
}
/*
* Remove nodes from consideration that can not be reached from this
* top level switch.
*/
for (i = 0; i < ctx->switch_count; i++) {
if (top_switch_inx != i) {
bit_and(switch_node_bitmap[i],
switch_node_bitmap[top_switch_inx]);
}
}
start_rem_cpus = rem_cpus;
start_rem_max_cpus = rem_max_cpus;
if (req_nodes_bitmap) {
bit_and(topo_eval->node_map, req_nodes_bitmap);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
/* Required nodes completely satisfied the request */
rc = SLURM_SUCCESS;
goto fini;
}
if (topo_eval->max_nodes <= 0) {
rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit",
job_ptr);
goto fini;
}
}
start_node_map = bit_copy(topo_eval->node_map);
memcpy(req_switch_required, switch_required,
ctx->switch_count * sizeof(int));
for (i = 0; i < ctx->switch_count; i++)
start_switch_node_bitmap[i] = bit_copy(switch_node_bitmap[i]);
try_again:
/*
* Identify the best set of nodes (i.e. nodes with the lowest weight,
* in addition to the required nodes) that can be used to satisfy the
* job request. All nodes must be on a common top-level switch. The
* logic here adds groups of nodes, all with the same weight, so we
* usually identify more nodes than required to satisfy the request.
* Later logic selects from those nodes to get the best topology.
*/
requested = false;
best_node_cnt = 0;
best_cpu_cnt = 0;
best_nodes_bitmap = bit_alloc(node_record_count);
iter = list_iterator_create(node_weight_list);
while (!requested && (nw = list_next(iter))) {
if (best_node_cnt > 0) {
/*
* All of the lower priority nodes should be included
* in the job's allocation. Nodes from the next highest
* weight nodes are included only as needed.
*/
if (req2_nodes_bitmap)
bit_or(req2_nodes_bitmap, best_nodes_bitmap);
else
req2_nodes_bitmap = bit_copy(best_nodes_bitmap);
}
if (!bit_set_count(nw->node_bitmap))
continue;
for (i = 0; (node_ptr = next_node_bitmap(nw->node_bitmap, &i));
i++) {
if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i))
continue; /* Required node */
if (!bit_test(switch_node_bitmap[top_switch_inx], i))
continue;
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
if (topo_eval->avail_cpus == 0) {
bit_clear(nw->node_bitmap, i);
continue;
}
bit_set(best_nodes_bitmap, i);
avail_cpu_per_node[i] = topo_eval->avail_cpus;
best_cpu_cnt += topo_eval->avail_cpus;
best_node_cnt++;
if (topo_eval->gres_per_job) {
gres_sched_consec(
&best_gres, job_ptr->gres_list_req,
avail_res_array[i]->sock_gres_list);
}
}
if (!sufficient) {
sufficient = (best_cpu_cnt >= rem_cpus) &&
eval_nodes_enough_nodes(
best_node_cnt, rem_nodes,
min_nodes, req_nodes);
if (sufficient && topo_eval->gres_per_job) {
sufficient = gres_sched_sufficient(
job_ptr->gres_list_req,
best_gres);
}
}
requested = ((best_node_cnt >= rem_nodes) &&
(best_cpu_cnt >= rem_cpus) &&
(!topo_eval->gres_per_job ||
gres_sched_sufficient(job_ptr->gres_list_req,
best_gres)));
}
list_iterator_destroy(iter);
if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
char *gres_str = NULL, *gres_print = "";
char *node_names;
if (req_nodes_bitmap) {
node_names = bitmap2node_name(req_nodes_bitmap);
info("Required nodes:%s", node_names);
xfree(node_names);
}
node_names = bitmap2node_name(best_nodes_bitmap);
if (topo_eval->gres_per_job) {
gres_str = gres_sched_str(best_gres);
if (gres_str)
gres_print = gres_str;
}
info("Best nodes:%s node_cnt:%d cpu_cnt:%d %s",
node_names, best_node_cnt, best_cpu_cnt, gres_print);
xfree(node_names);
xfree(gres_str);
}
if (!sufficient) {
log_flag(SELECT_TYPE, "insufficient resources currently available for %pJ",
job_ptr);
rc = SLURM_ERROR;
goto fini;
}
/*
* Add lowest weight nodes. Treat similar to required nodes for the job.
* Job will still need to add some higher weight nodes later.
*/
if (req2_nodes_bitmap) {
for (i = 0;
next_node_bitmap(req2_nodes_bitmap, &i) && (topo_eval->max_nodes > 0);
i++) {
topo_eval->avail_cpus = avail_cpu_per_node[i];
if (!eval_nodes_cpus_to_use(topo_eval, i, rem_max_cpus,
min_rem_nodes, &maxtasks,
true)) {
/*
* Too many restricted gpu cores removed due to
* gres layout. Skip node
*/
bit_clear(req2_nodes_bitmap, i);
continue;
}
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
}
for (i = 0, switch_ptr = ctx->switch_table;
i < ctx->switch_count; i++, switch_ptr++) {
if (switch_required[i])
continue;
if (bit_overlap_any(req2_nodes_bitmap,
switch_node_bitmap[i])) {
switch_required[i] = 1;
}
}
bit_or(topo_eval->node_map, req2_nodes_bitmap);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))) {
/* Required nodes completely satisfied the request */
error("Scheduling anomaly for %pJ",
job_ptr);
rc = SLURM_SUCCESS;
goto fini;
}
if (topo_eval->max_nodes <= 0) {
rc = SLURM_ERROR;
log_flag(SELECT_TYPE, "%pJ reached maximum node limit",
job_ptr);
goto fini;
}
}
/*
* Construct a set of switch array entries.
* Use the same indexes as ctx->switch_table in slurmctld.
*/
bit_or(best_nodes_bitmap, topo_eval->node_map);
avail_nodes_bitmap = bit_alloc(node_record_count);
for (i = 0, switch_ptr = ctx->switch_table; i < ctx->switch_count;
i++, switch_ptr++) {
bit_and(switch_node_bitmap[i], best_nodes_bitmap);
bit_or(avail_nodes_bitmap, switch_node_bitmap[i]);
switch_node_cnt[i] = bit_set_count(switch_node_bitmap[i]);
}
if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
for (i = 0; i < ctx->switch_count; i++) {
char *node_names = NULL;
if (switch_node_cnt[i]) {
node_names =
bitmap2node_name(switch_node_bitmap[i]);
}
info("switch=%s level=%d nodes=%u:%s required:%u speed:%u",
ctx->switch_table[i].name,
ctx->switch_table[i].level,
switch_node_cnt[i], node_names,
switch_required[i],
ctx->switch_table[i].link_speed);
xfree(node_names);
}
}
/* Add additional resources for already required leaf switches */
if (req_nodes_bitmap || req2_nodes_bitmap) {
int num_nodes_taken = 0;
for (i = 0; i < ctx->switch_count; i++) {
if (!switch_required[i] || !switch_node_bitmap[i] ||
(ctx->switch_table[i].level != 0))
continue;
for (j = 0; next_node_bitmap(switch_node_bitmap[i], &j);
j++) {
if (bit_test(topo_eval->node_map, j) ||
!avail_cpu_per_node[j])
continue;
topo_eval->avail_cpus = avail_cpu_per_node[j];
if (!eval_nodes_cpus_to_use(topo_eval, j,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true)) {
avail_cpu_per_node[j] = 0;
continue;
}
num_nodes_taken++;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
bit_set(topo_eval->node_map, j);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
if (topo_eval->max_nodes <= 0) {
rc = SLURM_ERROR;
log_flag(SELECT_TYPE,
"%pJ reached maximum node limit",
job_ptr);
goto fini;
}
}
_decrement_node_cnt(num_nodes_taken, i, switch_node_cnt,
ctx);
}
}
switches_dist = xcalloc(ctx->switch_count, sizeof(uint32_t));
for (i = 0; i < ctx->switch_count; i++) {
if (switch_required[i])
_topo_add_dist(switches_dist, i, ctx);
}
/* Add additional resources as required from additional leaf switches */
prev_rem_nodes = rem_nodes + 1;
while (1) {
int best_switch_inx = -1;
if (prev_rem_nodes == rem_nodes)
break; /* Stalled */
prev_rem_nodes = rem_nodes;
for (i = 0; i < ctx->switch_count; i++) {
if (switch_required[i] || !switch_node_bitmap[i] ||
(ctx->switch_table[i].level != 0))
continue;
_topo_choose_best_switch(switches_dist, switch_node_cnt,
rem_nodes, switch_cpu_cnt,
rem_cpus, i, &best_switch_inx,
ctx);
}
if (best_switch_inx == -1)
break;
_topo_add_dist(switches_dist, best_switch_inx, ctx);
/*
* NOTE: Ideally we would add nodes in order of resource
* availability rather than in order of bitmap position, but
* that would add even more complexity and overhead.
*/
for (i = 0;
next_node_bitmap(
switch_node_bitmap[best_switch_inx], &i) &&
(topo_eval->max_nodes > 0);
i++) {
if (bit_test(topo_eval->node_map, i) ||
!avail_cpu_per_node[i])
continue;
topo_eval->avail_cpus = avail_cpu_per_node[i];
if (!eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus, min_rem_nodes,
&maxtasks, true)) {
avail_cpu_per_node[i] = 0;
continue;
}
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
bit_set(topo_eval->node_map, i);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
if (topo_eval->max_nodes <= 0) {
rc = SLURM_ERROR;
log_flag(SELECT_TYPE,
"%pJ reached maximum node limit",
job_ptr);
goto fini;
}
}
_decrement_node_cnt(switch_node_cnt[best_switch_inx],
best_switch_inx, switch_node_cnt, ctx);
switch_node_cnt[best_switch_inx] = 0; /* Used all */
}
if ((min_rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
rc = SLURM_ERROR;
fini:
if (rc == SLURM_SUCCESS)
eval_nodes_clip_socket_cores(topo_eval);
if (job_ptr->req_switch > 0 && rc == SLURM_SUCCESS) {
int leaf_switch_count = 0;
/* Count up leaf switches. */
for (i = 0, switch_ptr = ctx->switch_table;
i < ctx->switch_count; i++, switch_ptr++) {
if (ctx->switch_table[i].level != 0)
continue;
if (bit_overlap_any(switch_node_bitmap[i], topo_eval->node_map))
leaf_switch_count++;
}
if (time_waiting >= job_ptr->wait4switch) {
job_ptr->best_switch = true;
debug3("%pJ waited %ld sec for switches use=%d",
job_ptr, time_waiting, leaf_switch_count);
} else if (leaf_switch_count > job_ptr->req_switch) {
/*
* Allocation is for more than requested number of
* switches.
*/
if ((req_nodes > min_nodes) && best_nodes_bitmap) {
/* TRUE only for !topo_eval->gres_per_job */
req_nodes--;
rem_nodes = req_nodes;
rem_nodes -= req_node_cnt;
min_rem_nodes = min_nodes;
min_rem_nodes -= req_node_cnt;
topo_eval->max_nodes = org_max_nodes;
topo_eval->max_nodes -= req_node_cnt;
rem_cpus = start_rem_cpus;
rem_max_cpus = start_rem_max_cpus;
xfree(switches_dist);
bit_copybits(topo_eval->node_map, start_node_map);
memcpy(switch_required, req_switch_required,
ctx->switch_count * sizeof(int));
memset(avail_cpu_per_node, 0,
node_record_count * sizeof(uint16_t));
for (i = 0; i < ctx->switch_count; i++)
bit_copybits(
switch_node_bitmap[i],
start_switch_node_bitmap[i]);
FREE_NULL_BITMAP(avail_nodes_bitmap);
FREE_NULL_BITMAP(req2_nodes_bitmap);
FREE_NULL_BITMAP(best_nodes_bitmap);
FREE_NULL_LIST(best_gres);
log_flag(SELECT_TYPE, "%pJ goto try_again req_nodes %d",
job_ptr, req_nodes);
goto try_again;
}
job_ptr->best_switch = false;
debug3("%pJ waited %ld sec for switches=%u found=%d wait %u",
job_ptr, time_waiting, job_ptr->req_switch,
leaf_switch_count, job_ptr->wait4switch);
} else {
job_ptr->best_switch = true;
}
}
FREE_NULL_LIST(best_gres);
FREE_NULL_LIST(node_weight_list);
FREE_NULL_BITMAP(avail_nodes_bitmap);
FREE_NULL_BITMAP(req2_nodes_bitmap);
FREE_NULL_BITMAP(best_nodes_bitmap);
FREE_NULL_BITMAP(start_node_map);
xfree(avail_cpu_per_node);
xfree(switch_cpu_cnt);
if (switch_node_bitmap) {
for (i = 0; i < ctx->switch_count; i++)
FREE_NULL_BITMAP(switch_node_bitmap[i]);
xfree(switch_node_bitmap);
}
if (start_switch_node_bitmap) {
for (i = 0; i < ctx->switch_count; i++)
FREE_NULL_BITMAP(start_switch_node_bitmap[i]);
xfree(start_switch_node_bitmap);
}
xfree(switch_node_cnt);
xfree(switch_required);
xfree(req_switch_required);
xfree(switches_dist);
return rc;
}
/*
 * Entry point for tree-topology node selection.
 *
 * Dispatches to the dragonfly or generic tree evaluator based on
 * TopologyParam settings, which are parsed from slurm_conf once and
 * cached in function-local statics.
 *
 * Returns the evaluator's result, or ESLURM_NOT_SUPPORTED when
 * topology-aware selection does not apply (contiguous allocation
 * requested, or TopoOptional is set and the job asked for no switches).
 */
extern int eval_nodes_tree(topology_eval_t *topo_eval)
{
	job_record_t *job_ptr = topo_eval->job_ptr;
	job_details_t *details_ptr = job_ptr->details;
	tree_context_t *ctx = topo_eval->tctx->plugin_ctx;
	static bool have_dragonfly = false;
	static bool topo_optional = false;
	static bool set = false;

	if (!set) {
		/* Parse TopologyParam only on the first call */
		have_dragonfly = (xstrcasestr(slurm_conf.topology_param,
					      "dragonfly") != NULL);
		topo_optional = (xstrcasestr(slurm_conf.topology_param,
					     "TopoOptional") != NULL);
		set = true;
	}

	xassert(ctx->switch_count);
	xassert(ctx->switch_table);

	/*
	 * Skip topology-optimized selection for contiguous allocations,
	 * or when TopoOptional is configured and the job did not request
	 * a switch count.
	 */
	if (details_ptr->contiguous ||
	    (topo_optional && !job_ptr->req_switch))
		return ESLURM_NOT_SUPPORTED;

	return have_dragonfly ? _eval_nodes_dfly(topo_eval) :
				_eval_nodes_topo(topo_eval);
}