/*****************************************************************************\
 * eval_nodes_block.c - Determine order of nodes for job using block algo.
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include <math.h>
#include "eval_nodes_block.h"
#include "../common/eval_nodes.h"
#include "../common/gres_sched.h"
#include "src/common/xstring.h"
static int _cmp_bblock(const void *a, const void *b)
{
	uint32_t ca = *((const uint32_t *) a);
	uint32_t cb = *((const uint32_t *) b);
if (ca < cb)
return 1;
else if (ca > cb)
return -1;
return 0;
}
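/*
 * Test whether two base blocks fall within the same aggregated block at the
 * given level. Base block indexes are grouped in runs of 2^block_level, so
 * e.g. with block_level=2 the bblock indexes 0..3 share one block and 4..7
 * share the next.
 */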
static bool _bblocks_in_same_block(int block_inx1, int block_inx2,
int block_level)
{
	return ((block_inx1 >> block_level) == (block_inx2 >> block_level));
}
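/*
 * Evaluate base block i against the current best candidate and update
 * *best_bblock_inx if i is preferable. When llblock counts are tracked,
 * first try to stay within llblocks that already hold required bblocks,
 * then prefer llblocks that alone satisfy rem_nodes (the smallest such
 * one, to limit fragmentation) over those that do not (the largest such
 * one, to limit the llblock count). Otherwise minimize the number of
 * bblocks: take the smallest bblock that fits rem_nodes, else the largest
 * available one.
 */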
static void _choose_best_bblock(bitstr_t *bblock_required, int llblock_level,
int rem_nodes, uint32_t *nodes_on_bblock,
uint32_t *nodes_on_llblock, int i,
bool *best_same_block, bool *best_fit,
int *best_bblock_inx, block_context_t *ctx)
{
bool fit = (nodes_on_bblock[i] >= rem_nodes);
bool same_block = false;
	/*
	 * Minimize the number of llblocks
	 */
if (nodes_on_llblock &&
!_bblocks_in_same_block(*best_bblock_inx, i, llblock_level)) {
bool llblock_fit, best_llblock_fit;
uint32_t best_llblock_node_cnt, llblock_node_cnt;
for (int j = (i & (~0 << llblock_level));
((j < ctx->block_count) &&
(j <= (i | ~(~0 << llblock_level))));
j++) {
if (!bit_test(bblock_required, j))
continue;
if ((same_block =
_bblocks_in_same_block(j, i, llblock_level)))
break;
}
if ((*best_bblock_inx == -1) ||
(same_block && !(*best_same_block))) {
*best_bblock_inx = i;
*best_fit = fit;
*best_same_block = same_block;
return;
}
if (!same_block && (*best_same_block))
return;
		/*
		 * A new llblock is needed, or both bblocks are in an already
		 * used llblock
		 */
best_llblock_node_cnt = nodes_on_llblock[(*best_bblock_inx >>
llblock_level)];
llblock_node_cnt = nodes_on_llblock[(i >> llblock_level)];
llblock_fit = (llblock_node_cnt >= rem_nodes);
best_llblock_fit = (best_llblock_node_cnt >= rem_nodes);
		/*
		 * Try to use an llblock big enough to meet the job
		 * requirement
		 */
if (llblock_fit && !best_llblock_fit) {
*best_bblock_inx = i;
*best_fit = fit;
*best_same_block = same_block;
return;
}
if (!llblock_fit && best_llblock_fit)
return;
if (llblock_fit && best_llblock_fit) {
			/*
			 * If both bblocks are on llblocks that meet the
			 * requirement, choose the llblock with fewer nodes
			 * to avoid fragmentation
			 */
if (llblock_node_cnt < best_llblock_node_cnt) {
*best_bblock_inx = i;
*best_fit = fit;
*best_same_block = same_block;
return;
}
if (llblock_node_cnt > best_llblock_node_cnt)
return;
} else {
			/*
			 * If neither bblock is on an llblock that meets the
			 * requirement, choose the llblock with more nodes to
			 * minimize the number of llblocks used
			 */
if (llblock_node_cnt > best_llblock_node_cnt) {
*best_bblock_inx = i;
*best_fit = fit;
*best_same_block = same_block;
return;
}
if (llblock_node_cnt < best_llblock_node_cnt)
return;
}
}
	/*
	 * Minimize the number of bblocks
	 */
if (*best_bblock_inx == -1 || (fit && !(*best_fit)) ||
(!fit && !(*best_fit) && (nodes_on_bblock[i] >=
nodes_on_bblock[*best_bblock_inx])) ||
(fit && (nodes_on_bblock[i] <=
nodes_on_bblock[*best_bblock_inx]))) {
*best_bblock_inx = i;
*best_fit = fit;
return;
}
}
static void _jobinfo_init(
job_record_t *job_ptr)
{
topology_jobinfo_t *topo_jobinfo;
xassert(job_ptr->topo_jobinfo);
/*
* Currently segment info is only used when using the
* --network=unique-channel-per-segment option. If that option is not
* specified, no segment data needs to be collected here.
*/
if (!xstrstr(job_ptr->network, "unique-channel-per-segment")) {
log_flag(SELECT_TYPE, "Not recording segment information for %pJ",
job_ptr);
return;
}
log_flag(SELECT_TYPE, "Recording segment information for %pJ",
job_ptr);
/* Must be free'd by topology_g_jobinfo_free() */
topo_jobinfo = xmalloc(sizeof(*topo_jobinfo));
topo_jobinfo->segment_list = list_create(xfree_ptr);
job_ptr->topo_jobinfo->data = topo_jobinfo;
return;
}
static void _jobinfo_add_segment(
job_record_t *job_ptr,
bitstr_t *node_bitmap)
{
topology_jobinfo_t *topo_jobinfo;
char *node_list;
xassert(job_ptr->topo_jobinfo);
/* not tracking segment info */
if (!(topo_jobinfo = job_ptr->topo_jobinfo->data))
return;
node_list = bitmap2node_name(node_bitmap);
list_append(topo_jobinfo->segment_list, node_list);
log_flag(SELECT_TYPE, "Added segment with nodelist %s to %pJ",
node_list, job_ptr);
return;
}
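/*
 * Return the smallest configured block level whose blocks are large enough
 * to hold rem_nodes, or -1 if none applies (e.g. rem_nodes was zero). If
 * llblock_level is non-NULL, also report the highest configured level below
 * the returned one. For example, with bblock_node_cnt=32 and rem_nodes=100,
 * four base blocks are needed, so the minimum level is ceil(log2(4)) = 2,
 * which is then rounded up to the nearest configured level.
 */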
static int _get_block_level(int rem_nodes, int *llblock_level,
			    block_context_t *ctx)
{
int bblock_per_block = ROUNDUP(rem_nodes, ctx->bblock_node_cnt);
int block_level = ceil(log2(bblock_per_block));
if (block_level > 0 && llblock_level)
*llblock_level =
bit_fls_from_bit(ctx->block_levels, block_level - 1);
else if (llblock_level)
*llblock_level = 0;
/* rem_nodes may have been zero */
if (block_level < 0)
return -1;
block_level = bit_ffs_from_bit(ctx->block_levels, block_level);
return block_level;
}
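/*
 * Determine the order of nodes for the job using the block topology: pick
 * the smallest configured block that can hold the job (or each segment of
 * it), then greedily add base blocks within that block while keeping the
 * number of lower-level blocks (llblocks) used at or below max_llblock.
 * On success topo_eval->node_map holds the selected nodes.
 */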
extern int eval_nodes_block(topology_eval_t *topo_eval)
{
bitstr_t **block_node_bitmap = NULL; /* nodes on this block */
bitstr_t **bblock_node_bitmap = NULL; /* nodes on this base block */
uint32_t block_node_cnt = 0; /* total nodes on block */
uint32_t *nodes_on_bblock = NULL; /* total nodes on bblock */
uint32_t *nodes_on_block = NULL; /* total nodes on block */
bitstr_t *req_nodes_bitmap = NULL; /* required node bitmap */
bitstr_t *req2_nodes_bitmap = NULL; /* required+lowest prio nodes */
bitstr_t *best_nodes_bitmap = NULL; /* required+low prio nodes */
bitstr_t *bblock_bitmap = NULL;
int *bblock_block_inx = NULL;
bitstr_t *bblock_required = NULL;
int i, j, rc = SLURM_SUCCESS;
int best_cpu_cnt, best_node_cnt;
list_t *best_gres = NULL;
block_record_t *block_ptr;
list_t *node_weight_list = NULL;
topo_weight_info_t *nw = NULL;
list_itr_t *iter;
node_record_t *node_ptr;
int64_t rem_max_cpus;
	int rem_cpus, rem_nodes; /* remaining resources desired */
	int min_rem_nodes; /* minimum nodes remaining to allocate */
job_record_t *job_ptr = topo_eval->job_ptr;
job_details_t *details_ptr = job_ptr->details;
bool requested, sufficient = false;
uint16_t *avail_cpu_per_node = NULL;
int block_inx = -1;
uint64_t block_lowest_weight = 0;
int block_cnt = -1, bblock_per_block;
int prev_rem_nodes;
avail_res_t **avail_res_array = topo_eval->avail_res_array;
uint32_t min_nodes = topo_eval->min_nodes;
uint32_t req_nodes = topo_eval->req_nodes;
int max_llblock;
int llblock_level;
int llblock_size;
int llblock_cnt = 0;
uint32_t *nodes_on_llblock = NULL;
int block_level;
int bblock_per_llblock;
uint64_t maxtasks;
block_context_t *ctx = topo_eval->tctx->plugin_ctx;
int segment_cnt = 1, rem_segment_cnt = 0;
bitstr_t *orig_node_map = bit_copy(topo_eval->node_map);
bitstr_t *alloc_node_map = NULL;
uint32_t orig_max_nodes = topo_eval->max_nodes;
int as_rem_nodes = -1, asblock_cnt = -1;
int asblock_inx = -1;
uint32_t *nodes_on_asblock = NULL; /* total nodes on asblock */
int block_per_asblock = 0;
hres_select_t *hres_select = topo_eval->job_ptr->hres_select;
bool hres_match_topo = false;
int *bblock_hres_inx = NULL;
topo_eval->avail_cpus = 0;
rem_cpus = details_ptr->min_cpus;
min_rem_nodes = min_nodes;
	topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req);
	/* Always use min_nodes */
	rem_nodes = MIN(min_nodes, req_nodes);
_jobinfo_init(job_ptr);
if (details_ptr->arbitrary_tpn) {
info("topology block does not support arbitrary tasks distribution");
rc = ESLURM_NOT_SUPPORTED;
goto fini;
}
if (details_ptr->segment_size &&
(rem_nodes % details_ptr->segment_size)) {
info("%s: segment_size (%u) does not fit the job size (%d)",
__func__, details_ptr->segment_size, rem_nodes);
rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
goto fini;
}
if (details_ptr->segment_size) {
as_rem_nodes = rem_nodes;
segment_cnt = rem_nodes / details_ptr->segment_size;
rem_segment_cnt = segment_cnt;
rem_nodes = details_ptr->segment_size;
if (segment_cnt > 1)
req_nodes = req_nodes / segment_cnt;
}
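	/*
	 * Work out the block geometry for this request: the level of the
	 * hosting block, the number of base blocks per block, and the
	 * llblock size and budget (max_llblock).
	 */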
block_level = _get_block_level(rem_nodes, &llblock_level, ctx);
if (block_level < 0) {
/* Number of base blocks in block */
bblock_per_block = ctx->block_count;
block_cnt = 1;
} else {
/* Number of base blocks in block */
bblock_per_block = (1 << block_level);
block_cnt = ROUNDUP(ctx->block_count, bblock_per_block);
}
xassert(llblock_level >= 0);
bblock_per_llblock = (1 << llblock_level);
llblock_size = bblock_per_llblock * ctx->bblock_node_cnt;
max_llblock = ROUNDUP(rem_nodes, llblock_size);
if (details_ptr->segment_size &&
job_ptr->bit_flags & CONSOLIDATE_SEGMENTS) {
int asblock_level;
if (job_ptr->bit_flags & SPREAD_SEGMENTS) {
int tmp = ROUNDUP(details_ptr->segment_size,
ctx->bblock_node_cnt);
tmp *= ctx->bblock_node_cnt;
tmp *= segment_cnt;
asblock_level = _get_block_level(tmp, NULL, ctx);
} else {
asblock_level =
_get_block_level(as_rem_nodes, NULL, ctx);
}
if (asblock_level < 0) {
/*
* Use the whole topology
*/
block_per_asblock = ctx->block_count;
asblock_cnt = 1;
} else {
block_per_asblock =
(1 << (asblock_level - block_level));
asblock_cnt = ROUNDUP(block_cnt, block_per_asblock);
}
}
/* Validate availability of required nodes */
if (job_ptr->details->req_node_bitmap) {
int req_node_cnt;
if (!bit_super_set(job_ptr->details->req_node_bitmap,
topo_eval->node_map)) {
info("%pJ requires nodes which are not currently available",
job_ptr);
rc = ESLURM_BREAK_EVAL;
goto fini;
}
if (!bit_super_set(job_ptr->details->req_node_bitmap,
ctx->blocks_nodes_bitmap)) {
info("%pJ requires nodes which are not in blocks",
job_ptr);
rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
goto fini;
}
req_node_cnt = bit_set_count(job_ptr->details->req_node_bitmap);
if (req_node_cnt == 0) {
info("%pJ required node list has no nodes",
job_ptr);
rc = ESLURM_BREAK_EVAL;
goto fini;
}
if (req_node_cnt > topo_eval->max_nodes) {
info("%pJ requires more nodes than currently available (%u>%u)",
job_ptr, req_node_cnt,
topo_eval->max_nodes);
rc = ESLURM_BREAK_EVAL;
goto fini;
}
if (segment_cnt > 1) {
if (as_rem_nodes > req_node_cnt) {
info("%pJ requires less nodes than job size with segment are not supported",
job_ptr);
rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
goto fini;
}
bit_and(orig_node_map,
job_ptr->details->req_node_bitmap);
bit_and(topo_eval->node_map,
job_ptr->details->req_node_bitmap);
} else {
req_nodes_bitmap = job_ptr->details->req_node_bitmap;
}
}
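	/*
	 * For each base block, cache the hres leaf index of its first node
	 * with available resources; used below to cap the usable node count
	 * of a bblock by its hres capacity.
	 */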
if (hres_select &&
(hres_select->topology_idx == topo_eval->tctx->idx)) {
hres_match_topo = true;
bblock_hres_inx =
xcalloc(ctx->block_count, sizeof(*bblock_hres_inx));
for (i = 0; i < ctx->block_count; i++) {
bblock_hres_inx[i] = -1;
for (int n = 0;
(n = bit_ffs_from_bit(ctx->block_record_table[i]
.node_bitmap,
n)) >= 0;
n++) {
if (avail_res_array[n]) {
bblock_hres_inx[i] =
avail_res_array[n]
->hres_leaf_idx;
break;
}
}
}
}
next_segment:
/*
* Add required nodes to job allocation and
* build list of node bitmaps, sorted by weight
*/
if (rem_segment_cnt) {
rem_nodes = details_ptr->segment_size;
min_rem_nodes = min_nodes / segment_cnt;
topo_eval->max_nodes = orig_max_nodes / segment_cnt;
rem_cpus = details_ptr->min_cpus / segment_cnt;
if (details_ptr->max_cpus != NO_VAL)
rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr,
rem_nodes);
else
rem_max_cpus = details_ptr->max_cpus / segment_cnt;
max_llblock = ROUNDUP(rem_nodes, llblock_size);
} else
rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr,
rem_nodes);
maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
topo_eval->max_nodes);
if (!bit_set_count(topo_eval->node_map)) {
debug("%pJ node_map is empty",
job_ptr);
if (alloc_node_map) {
bit_or(topo_eval->node_map, alloc_node_map);
rc = ESLURM_RETRY_EVAL_HINT;
} else
rc = ESLURM_BREAK_EVAL;
goto fini;
}
if (!avail_cpu_per_node)
avail_cpu_per_node = xcalloc(node_record_count,
sizeof(uint16_t));
node_weight_list = list_create(eval_nodes_topo_weight_free);
for (i = 0;
(node_ptr = next_node_bitmap(topo_eval->node_map, &i));
i++) {
topo_weight_info_t nw_static;
if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i)) {
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true);
if (topo_eval->avail_cpus == 0) {
debug2("%pJ insufficient resources on required node",
job_ptr);
rc = ESLURM_BREAK_EVAL;
goto fini;
}
avail_cpu_per_node[i] = topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
}
nw_static.weight = node_ptr->sched_weight;
nw = list_find_first(node_weight_list,
eval_nodes_topo_weight_find,
&nw_static);
if (!nw) { /* New node weight to add */
nw = xmalloc(sizeof(topo_weight_info_t));
nw->node_bitmap = bit_alloc(node_record_count);
nw->weight = node_ptr->sched_weight;
list_append(node_weight_list, nw);
}
bit_set(nw->node_bitmap, i);
nw->node_cnt++;
}
list_sort(node_weight_list, eval_nodes_topo_weight_sort);
if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE)
(void) list_for_each(node_weight_list,
eval_nodes_topo_weight_log, NULL);
if ((bblock_per_block != (bblock_per_llblock * max_llblock)) &&
!nodes_on_llblock) {
llblock_cnt = ROUNDUP(ctx->block_count, bblock_per_llblock);
nodes_on_llblock = xcalloc(llblock_cnt, sizeof(uint32_t));
}
log_flag(SELECT_TYPE, "%s: bblock_per_block:%u rem_nodes:%u llblock_cnt:%u max_llblock:%d llblock_level:%d",
__func__, bblock_per_block, rem_nodes, llblock_cnt,
max_llblock, llblock_level);
if (!bblock_required)
bblock_required = bit_alloc(ctx->block_count);
else
bit_clear_all(bblock_required);
if (!alloc_node_map) {
block_node_bitmap = xcalloc(block_cnt, sizeof(bitstr_t *));
bblock_block_inx = xcalloc(ctx->block_count, sizeof(int));
}
if (!nodes_on_block)
nodes_on_block = xcalloc(block_cnt, sizeof(*nodes_on_block));
else
memset(nodes_on_block, 0, block_cnt * sizeof(*nodes_on_block));
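	/*
	 * Tally the available nodes of every base block and roll the counts
	 * up into its enclosing block (and llblock, when tracked). The block
	 * node bitmaps are only built on the first segment pass.
	 */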
for (i = 0, block_ptr = ctx->block_record_table; i < ctx->block_count;
i++, block_ptr++) {
int block_inx_tmp = i / bblock_per_block;
uint32_t nodes_on_bblock_tmp;
		if (alloc_node_map)
			; /* block bitmaps already built on a prior segment */
else if (block_node_bitmap[block_inx_tmp])
bit_or(block_node_bitmap[block_inx_tmp],
block_ptr->node_bitmap);
else
block_node_bitmap[block_inx_tmp] =
bit_copy(block_ptr->node_bitmap);
bblock_block_inx[i] = block_inx_tmp;
nodes_on_bblock_tmp = bit_overlap(block_ptr->node_bitmap,
topo_eval->node_map);
if (hres_match_topo) {
uint32_t tmp_cap =
hres_get_capacity(hres_select,
bblock_hres_inx[i]);
tmp_cap /= hres_select->hres_per_node;
nodes_on_bblock_tmp = MIN(tmp_cap, nodes_on_bblock_tmp);
}
nodes_on_block[block_inx_tmp] += nodes_on_bblock_tmp;
if (nodes_on_llblock) {
int llblock_inx = i / bblock_per_llblock;
nodes_on_llblock[llblock_inx] += nodes_on_bblock_tmp;
}
}
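	/*
	 * With segment consolidation, also roll the block counts up into
	 * asblocks so that all segments of the job can land in one asblock.
	 */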
if (block_per_asblock && !alloc_node_map) {
nodes_on_asblock = xcalloc(asblock_cnt, sizeof(uint32_t));
for (i = 0; i < block_cnt; i++) {
int asb_inx = i / block_per_asblock;
nodes_on_asblock[asb_inx] += nodes_on_block[i];
}
}
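	/*
	 * Select the block to host the job (or this segment): a block
	 * holding required nodes wins outright; otherwise pick by node
	 * weight and size below.
	 */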
block_inx = -1;
for (i = 0; i < block_cnt; i++) {
uint32_t block_cpus = 0;
uint32_t avail_bnc = 0;
uint32_t bnc;
if (block_per_asblock && !alloc_node_map) {
if (nodes_on_asblock[i / block_per_asblock] <
as_rem_nodes)
continue;
}
bit_and(block_node_bitmap[i], topo_eval->node_map);
bnc = nodes_on_block[i];
if (!nodes_on_llblock) {
avail_bnc = bnc;
} else {
int llblock_per_block = (bblock_per_block /
bblock_per_llblock);
int offset = i * llblock_per_block;
int tmp_max_llblock;
llblock_per_block = MIN(llblock_per_block,
llblock_cnt - offset);
qsort(&nodes_on_llblock[offset], llblock_per_block,
sizeof(int), _cmp_bblock);
tmp_max_llblock = MIN(max_llblock, llblock_per_block);
for (j = 0; j < tmp_max_llblock; j++)
avail_bnc += nodes_on_llblock[offset + j];
}
/*
* Count total CPUs of the intersection of topo_eval->node_map
* and block_node_bitmap.
*/
for (j = 0; (node_ptr = next_node_bitmap(block_node_bitmap[i],
&j));
j++)
block_cpus += avail_res_array[j]->avail_cpus;
if (req_nodes_bitmap &&
bit_overlap_any(req_nodes_bitmap, block_node_bitmap[i])) {
if (block_inx == -1) {
block_inx = i;
break;
}
}
if (!eval_nodes_enough_nodes(avail_bnc, rem_nodes, min_nodes,
req_nodes) ||
(rem_cpus > block_cpus))
continue;
/*
* Select the block:
* 1) with lowest weight nodes
* 2) with lowest sufficient count of nodes -
* to minimize fragmentation
*/
if (!req_nodes_bitmap &&
(nw = list_find_first(node_weight_list,
eval_nodes_topo_node_find,
block_node_bitmap[i]))) {
if ((block_inx == -1) ||
(nw->weight < block_lowest_weight) ||
((nw->weight == block_lowest_weight) &&
(bnc <= block_node_cnt))) {
block_inx = i;
block_lowest_weight = nw->weight;
block_node_cnt = bnc;
}
}
}
if (!req_nodes_bitmap) {
bit_clear_all(topo_eval->node_map);
}
if (block_inx == -1) {
log_flag(SELECT_TYPE, "%pJ unable to find block",
job_ptr);
if (alloc_node_map && !block_per_asblock) {
bit_or(topo_eval->node_map, alloc_node_map);
rc = ESLURM_RETRY_EVAL_HINT;
} else
rc = ESLURM_BREAK_EVAL;
goto fini;
}
/* Check that all specifically required nodes are in one block */
if (req_nodes_bitmap &&
!bit_super_set(req_nodes_bitmap, block_node_bitmap[block_inx])) {
rc = ESLURM_BREAK_EVAL;
info("%pJ requires nodes that do not have shared block",
job_ptr);
goto fini;
}
if (block_per_asblock && !alloc_node_map) {
asblock_inx = block_inx / block_per_asblock;
bitstr_t *tmp_bitmap = bit_alloc(node_record_count);
for (i = 0; i < block_cnt; i++) {
if ((i / block_per_asblock) == asblock_inx)
bit_or(tmp_bitmap, block_node_bitmap[i]);
}
bit_and(orig_node_map, tmp_bitmap);
FREE_NULL_BITMAP(tmp_bitmap);
}
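	/*
	 * Mark every base block holding required nodes and charge each
	 * newly touched llblock against the max_llblock budget.
	 */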
if (req_nodes_bitmap) {
int last_llblock = -1;
bit_and(topo_eval->node_map, req_nodes_bitmap);
for (i = 0; (i < ctx->block_count) && nodes_on_llblock; i++) {
if (block_inx != bblock_block_inx[i])
continue;
if (bit_overlap_any(req_nodes_bitmap,
ctx->block_record_table[i]
.node_bitmap)) {
bit_set(bblock_required, i);
if (!_bblocks_in_same_block(last_llblock, i,
llblock_level)) {
max_llblock--;
last_llblock = i;
}
}
}
if (max_llblock < 0) {
rc = ESLURM_REQUESTED_TOPO_CONFIG_UNAVAILABLE;
info("%pJ requires nodes exceed maximum llblock limit due to required nodes",
job_ptr);
goto fini;
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
/* Required nodes completely satisfied the request */
rc = SLURM_SUCCESS;
goto fini;
}
if (topo_eval->max_nodes <= 0) {
rc = ESLURM_BREAK_EVAL;
info("%pJ requires nodes exceed maximum node limit",
job_ptr);
goto fini;
}
}
requested = false;
sufficient = false;
best_node_cnt = 0;
best_cpu_cnt = 0;
if (!best_nodes_bitmap)
best_nodes_bitmap = bit_alloc(node_record_count);
else
bit_clear_all(best_nodes_bitmap);
if (req2_nodes_bitmap)
bit_clear_all(req2_nodes_bitmap);
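	/*
	 * Accumulate candidate nodes from the lowest weight upward until
	 * the request can be satisfied.
	 */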
iter = list_iterator_create(node_weight_list);
while (!requested && (nw = list_next(iter))) {
if (best_node_cnt > 0) {
/*
* All of the lower priority nodes should be included
* in the job's allocation. Nodes from the next highest
* weight nodes are included only as needed.
*/
if (req2_nodes_bitmap)
bit_or(req2_nodes_bitmap, best_nodes_bitmap);
else
req2_nodes_bitmap = bit_copy(best_nodes_bitmap);
}
if (!bit_set_count(nw->node_bitmap))
continue;
for (i = 0; (node_ptr = next_node_bitmap(nw->node_bitmap, &i));
i++) {
if (req_nodes_bitmap && bit_test(req_nodes_bitmap, i))
continue; /* Required node */
if (!bit_test(block_node_bitmap[block_inx], i))
continue;
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
if (topo_eval->avail_cpus == 0) {
bit_clear(nw->node_bitmap, i);
continue;
}
bit_set(best_nodes_bitmap, i);
avail_cpu_per_node[i] = topo_eval->avail_cpus;
best_cpu_cnt += topo_eval->avail_cpus;
best_node_cnt++;
if (topo_eval->gres_per_job) {
gres_sched_consec(
&best_gres, job_ptr->gres_list_req,
avail_res_array[i]->sock_gres_list);
}
}
if (!sufficient) {
sufficient = (best_cpu_cnt >= rem_cpus) &&
eval_nodes_enough_nodes(
best_node_cnt, rem_nodes,
min_nodes, req_nodes);
if (sufficient && topo_eval->gres_per_job) {
sufficient = gres_sched_sufficient(
job_ptr->gres_list_req,
best_gres);
}
}
requested = ((best_node_cnt >= rem_nodes) &&
(best_cpu_cnt >= rem_cpus) &&
(!topo_eval->gres_per_job ||
gres_sched_sufficient(job_ptr->gres_list_req,
best_gres)));
}
list_iterator_destroy(iter);
if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
char *gres_str = NULL, *gres_print = "";
char *node_names;
if (req_nodes_bitmap) {
node_names = bitmap2node_name(req_nodes_bitmap);
info("Required nodes:%s", node_names);
xfree(node_names);
}
node_names = bitmap2node_name(best_nodes_bitmap);
if (topo_eval->gres_per_job) {
gres_str = gres_sched_str(best_gres);
if (gres_str)
gres_print = gres_str;
}
info("Best nodes:%s node_cnt:%d cpu_cnt:%d %s",
node_names, best_node_cnt, best_cpu_cnt, gres_print);
xfree(node_names);
xfree(gres_str);
}
if (!sufficient) {
log_flag(SELECT_TYPE, "insufficient resources currently available for %pJ",
job_ptr);
rc = SLURM_ERROR;
goto fini;
}
/*
* Add lowest weight nodes. Treat similar to required nodes for the job.
* Job will still need to add some higher weight nodes later.
*/
if (req2_nodes_bitmap) {
int last_llblock = -1;
for (i = 0;
(next_node_bitmap(req2_nodes_bitmap, &i) &&
(topo_eval->max_nodes > 0));
i++) {
topo_eval->avail_cpus = avail_cpu_per_node[i];
if (!eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus, min_rem_nodes,
&maxtasks, true)) {
				/*
				 * Too many restricted GPU cores were removed
				 * due to the gres or hres layout.
				 */
bit_clear(req2_nodes_bitmap, i);
continue;
}
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
}
bit_or(topo_eval->node_map, req2_nodes_bitmap);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))) {
/* Required nodes completely satisfied the request */
error("Scheduling anomaly for %pJ",
job_ptr);
rc = SLURM_SUCCESS;
goto fini;
}
if (topo_eval->max_nodes <= 0) {
rc = ESLURM_RETRY_EVAL_HINT;
debug("%pJ reached maximum node limit",
job_ptr);
goto fini;
}
for (i = 0; i < ctx->block_count; i++) {
if (block_inx != bblock_block_inx[i])
continue;
if (bit_test(bblock_required, i)) {
last_llblock = i;
continue;
}
if (bit_overlap_any(req2_nodes_bitmap,
ctx->block_record_table[i]
.node_bitmap)) {
bit_set(bblock_required, i);
if (!_bblocks_in_same_block(last_llblock, i,
llblock_level)) {
max_llblock--;
last_llblock = i;
}
}
}
}
if (max_llblock < 0) {
rc = SLURM_ERROR;
info("%pJ requires nodes exceed maximum llblock limit due to node weights",
job_ptr);
goto fini;
}
/* Add additional resources for already required base block */
if (req_nodes_bitmap || req2_nodes_bitmap) {
for (i = 0; i < ctx->block_count; i++) {
if (!bit_test(bblock_required, i))
continue;
if (!bblock_bitmap)
bblock_bitmap =
bit_copy(ctx->block_record_table[i]
.node_bitmap);
else
bit_copybits(bblock_bitmap,
ctx->block_record_table[i]
.node_bitmap);
bit_and(bblock_bitmap, block_node_bitmap[block_inx]);
bit_and(bblock_bitmap, best_nodes_bitmap);
bit_and_not(bblock_bitmap, topo_eval->node_map);
for (j = 0; next_node_bitmap(bblock_bitmap, &j); j++) {
if (!avail_cpu_per_node[j])
continue;
topo_eval->avail_cpus = avail_cpu_per_node[j];
if (!eval_nodes_cpus_to_use(topo_eval, j,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true))
continue;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
bit_set(topo_eval->node_map, j);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
}
}
}
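	/*
	 * Recompute the per-bblock bitmaps and node counts over the
	 * remaining best nodes so the greedy loop below can rank the
	 * not-yet-required base blocks.
	 */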
if (!nodes_on_bblock)
nodes_on_bblock =
xcalloc(ctx->block_count, sizeof(*nodes_on_bblock));
if (!bblock_node_bitmap)
bblock_node_bitmap =
xcalloc(ctx->block_count, sizeof(bitstr_t *));
if (nodes_on_llblock)
memset(nodes_on_llblock, 0, llblock_cnt * sizeof(uint32_t));
for (i = 0; i < ctx->block_count; i++) {
if (block_inx != bblock_block_inx[i])
continue;
if (bit_test(bblock_required, i))
continue;
if (!bblock_node_bitmap[i])
bblock_node_bitmap[i] =
bit_copy(ctx->block_record_table[i]
.node_bitmap);
else
bit_copybits(bblock_node_bitmap[i],
ctx->block_record_table[i].node_bitmap);
bit_and(bblock_node_bitmap[i], block_node_bitmap[block_inx]);
bit_and(bblock_node_bitmap[i], best_nodes_bitmap);
nodes_on_bblock[i] = bit_set_count(bblock_node_bitmap[i]);
if (hres_match_topo) {
uint32_t tmp_cap =
hres_get_capacity(hres_select,
bblock_hres_inx[i]);
tmp_cap /= hres_select->hres_per_node;
nodes_on_bblock[i] = MIN(tmp_cap, nodes_on_bblock[i]);
}
if (nodes_on_llblock) {
int llblock_inx = i / bblock_per_llblock;
nodes_on_llblock[llblock_inx] += nodes_on_bblock[i];
}
}
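	/*
	 * Greedily add the best remaining base block on each pass until the
	 * request is satisfied, a pass makes no progress (stalled), or the
	 * llblock budget is exhausted.
	 */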
prev_rem_nodes = rem_nodes + 1;
while (1) {
int best_bblock_inx = -1;
bool best_fit = false, best_same_block = true;
bitstr_t *best_bblock_bitmap = NULL;
if (prev_rem_nodes == rem_nodes)
break; /* Stalled */
prev_rem_nodes = rem_nodes;
for (i = 0; i < ctx->block_count; i++) {
if (block_inx != bblock_block_inx[i])
continue;
if (bit_test(bblock_required, i))
continue;
_choose_best_bblock(bblock_required, llblock_level,
rem_nodes, nodes_on_bblock,
nodes_on_llblock, i,
&best_same_block, &best_fit,
&best_bblock_inx, ctx);
}
log_flag(SELECT_TYPE, "%s: rem_nodes:%d best_bblock_inx:%d",
__func__, rem_nodes, best_bblock_inx);
if (best_bblock_inx == -1)
break;
if ((max_llblock <= 0) && !best_same_block) {
log_flag(SELECT_TYPE, "%s: min_rem_nodes:%d can't add more bblocks due to llblock limit",
__func__, min_rem_nodes);
break;
}
best_bblock_bitmap = bblock_node_bitmap[best_bblock_inx];
bit_and_not(best_bblock_bitmap, topo_eval->node_map);
bit_set(bblock_required, best_bblock_inx);
/*
* NOTE: Ideally we would add nodes in order of resource
* availability rather than in order of bitmap position, but
* that would add even more complexity and overhead.
*/
for (i = 0; next_node_bitmap(best_bblock_bitmap, &i) &&
(topo_eval->max_nodes > 0); i++) {
if (!avail_cpu_per_node[i])
continue;
topo_eval->avail_cpus = avail_cpu_per_node[i];
if (!eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true))
continue;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
bit_set(topo_eval->node_map, i);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
}
if (!best_same_block)
max_llblock--;
}
if ((min_rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id))) {
rc = SLURM_SUCCESS;
goto fini;
}
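	/* Out of usable nodes within the chosen block; hint caller to retry */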
rc = ESLURM_RETRY_EVAL_HINT;
if (alloc_node_map)
bit_or(topo_eval->node_map, alloc_node_map);
fini:
	if (rem_segment_cnt && !rc) {
int segment_index = segment_cnt - rem_segment_cnt;
if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
char *node_names;
node_names = bitmap2node_name(topo_eval->node_map);
info("Segment:%d nodes:%s", segment_index, node_names);
xfree(node_names);
}
_jobinfo_add_segment(job_ptr, topo_eval->node_map);
if (--rem_segment_cnt > 0) {
if (alloc_node_map)
bit_or(alloc_node_map, topo_eval->node_map);
else
alloc_node_map = bit_copy(topo_eval->node_map);
FREE_NULL_LIST(best_gres);
FREE_NULL_LIST(node_weight_list);
if (nodes_on_llblock)
memset(nodes_on_llblock, 0,
llblock_cnt * sizeof(uint32_t));
if (job_ptr->bit_flags & SPREAD_SEGMENTS) {
for (i = 0; i < ctx->block_count; i++) {
if (!bit_test(bblock_required, i))
continue;
bit_and_not(orig_node_map,
ctx->block_record_table[i].
node_bitmap);
}
}
bit_copybits(topo_eval->node_map, orig_node_map);
bit_and_not(topo_eval->node_map, alloc_node_map);
log_flag(SELECT_TYPE, "%s: rem_segment_cnt:%d",
__func__, rem_segment_cnt);
goto next_segment;
} else if (alloc_node_map) {
bit_or(topo_eval->node_map, alloc_node_map);
}
}
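	/*
	 * A failed pass inside a chosen asblock: expose the whole asblock in
	 * node_map and request a full re-evaluation.
	 */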
if (rc && block_per_asblock && (asblock_inx != -1)) {
bit_clear_all(topo_eval->node_map);
for (i = 0; i < ctx->block_count; i++) {
if ((i / (block_per_asblock * bblock_per_block)) ==
asblock_inx)
bit_or(topo_eval->node_map,
ctx->block_record_table[i].node_bitmap);
}
rc = ESLURM_RETRY_EVAL;
}
if (rc == SLURM_SUCCESS)
eval_nodes_clip_socket_cores(topo_eval);
FREE_NULL_LIST(best_gres);
FREE_NULL_LIST(node_weight_list);
FREE_NULL_BITMAP(req2_nodes_bitmap);
FREE_NULL_BITMAP(best_nodes_bitmap);
FREE_NULL_BITMAP(bblock_bitmap);
FREE_NULL_BITMAP(orig_node_map);
FREE_NULL_BITMAP(alloc_node_map);
xfree(avail_cpu_per_node);
xfree(bblock_block_inx);
if (block_node_bitmap) {
for (i = 0; i < block_cnt; i++)
FREE_NULL_BITMAP(block_node_bitmap[i]);
xfree(block_node_bitmap);
}
if (bblock_node_bitmap) {
for (i = 0; i < ctx->block_count; i++)
FREE_NULL_BITMAP(bblock_node_bitmap[i]);
xfree(bblock_node_bitmap);
}
xfree(nodes_on_block);
xfree(nodes_on_bblock);
xfree(nodes_on_llblock);
xfree(nodes_on_asblock);
xfree(bblock_hres_inx);
FREE_NULL_BITMAP(bblock_required);
return rc;
}