blob: 81508fb7d24cd9e248482e9f522949a57a59efb9 [file] [log] [blame]
/*****************************************************************************\
* eval_nodes.c - Determine order of nodes for job.
*****************************************************************************
* Copyright (C) SchedMD LLC.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "eval_nodes.h"
#include "gres_filter.h"
#include "gres_sched.h"
#include "src/common/xstring.h"
/*
 * One record per distinct scheduling weight: the set of nodes sharing that
 * weight. Records are created and sorted by _build_node_weight_list() and
 * released by _node_weight_free().
 */
typedef struct node_weight_struct {
bitstr_t *node_bitmap; /* bitmap of nodes with this weight */
uint64_t weight; /* priority of node for scheduling work on */
} node_weight_type;
/*
 * Bundle of node-selection state.
 * NOTE(review): the user of this type is not visible in this part of the
 * file; judging by the name it is presumably the argument handed to a
 * list-iteration callback of the least-loaded-node (LLN) selection - confirm.
 * The pointer members mirror the local counters kept by the _eval_nodes_*()
 * functions (rem_cpus, rem_nodes, min_rem_nodes, rem_max_cpus, max_nodes),
 * so presumably they point at the caller's variables to be updated in
 * place - confirm against the callback.
 */
typedef struct {
uint16_t *avail_cpu_per_node;
avail_res_t **avail_res_array;
uint32_t cpus_per_task;
int i_end;
int i_start;
uint32_t *max_nodes;
int *min_rem_nodes;
bitstr_t *node_map;
int *rem_cpus;
int *rem_nodes;
int64_t *rem_max_cpus;
uint16_t *used_cpu_per_node;
} foreach_add_nodes_lln_t;
/*
 * List match function for list_find_first().
 * IN x - list element (node_weight_type *)
 * IN key - node to match (node_record_t *)
 * RET 1 if the element's weight equals the node's sched_weight, else 0
 */
static int _node_weight_find(void *x, void *key)
{
	node_weight_type *entry = x;
	node_record_t *node = key;

	return (entry->weight == node->sched_weight) ? 1 : 0;
}
/*
 * List destructor for node_weight_type elements: releases the element's
 * node bitmap and then the element itself.
 */
static void _node_weight_free(void *x)
{
	node_weight_type *entry = x;

	FREE_NULL_BITMAP(entry->node_bitmap);
	xfree(entry);
}
/* Sort list of node_weight_type records in order of increasing node weight */
static int _node_weight_sort(void *x, void *y)
{
node_weight_type *nwt1 = *(node_weight_type **) x;
node_weight_type *nwt2 = *(node_weight_type **) y;
if (nwt1->weight < nwt2->weight)
return -1;
if (nwt1->weight > nwt2->weight)
return 1;
return 0;
}
/*
 * Group the nodes set in node_bitmap by scheduling weight.
 * IN node_bitmap - nodes to classify (must not be NULL)
 * RET list of node_weight_type records, one per distinct sched_weight,
 *	sorted by increasing weight. Elements are released with
 *	_node_weight_free() when the list is destroyed.
 */
static list_t *_build_node_weight_list(bitstr_t *node_bitmap)
{
	list_t *weight_list;
	node_record_t *node;
	int node_inx = 0;

	xassert(node_bitmap);

	weight_list = list_create(_node_weight_free);

	/* Walk every set bit, creating a record the first time a weight is seen */
	while ((node = next_node_bitmap(node_bitmap, &node_inx))) {
		node_weight_type *entry =
			list_find_first(weight_list, _node_weight_find, node);

		if (!entry) {
			entry = xmalloc(sizeof(node_weight_type));
			entry->node_bitmap = bit_alloc(node_record_count);
			entry->weight = node->sched_weight;
			list_append(weight_list, entry);
		}
		bit_set(entry->node_bitmap, node_inx);
		node_inx++;
	}

	/* Lowest weight first */
	list_sort(weight_list, _node_weight_sort);

	return weight_list;
}
/*
 * Trim the GRES count and the restricted GPU cores usable on one node so
 * they match the number of tasks that can be laid out there.
 *
 * IN/OUT topo_eval - avail_core[node_i], avail_res_array[node_i]
 *	(avail_cores_per_sock and the total_cnt of its sock_gres_list
 *	entries) and avail_cpus may be reduced
 * IN/OUT maxtasks - task limit for the node; may be lowered
 * IN res_cores_per_gpu - restricted cores per GPU on this node
 * IN sockets - socket count of the node
 * IN cores_per_socket - cores in each socket
 * IN cpus_per_core - multiplier used to convert a core count into CPUs
 * IN node_i - node index into the topo_eval arrays
 *
 * Returns without changes when cr_type includes SELECT_SOCKET.
 */
static void _reduce_res_cores(topology_eval_t *topo_eval,
uint64_t *maxtasks,
uint16_t res_cores_per_gpu,
int sockets,
uint16_t cores_per_socket,
uint16_t cpus_per_core,
int node_i)
{
gres_job_state_t *gres_js;
gres_state_t *gres_job_state;
sock_gres_t *sock_gres;
list_t *sock_list = topo_eval->avail_res_array[node_i]->sock_gres_list;
bitstr_t *avail_core = topo_eval->avail_core[node_i];
uint16_t *avail_cores_per_sock =
topo_eval->avail_res_array[node_i]->avail_cores_per_sock;
uint16_t *actual_cores_p_s;
uint32_t tot_cores = 0;
if (topo_eval->cr_type & SELECT_SOCKET)
return;
/*
 * actual_cores_p_s[] holds the cores really set in avail_core per socket,
 * tot_cores the sum of the per-socket counts recorded in
 * avail_cores_per_sock[].
 */
actual_cores_p_s = xcalloc(sockets, sizeof(uint16_t));
for (int s = 0; s < sockets; s++) {
int start_core = s * cores_per_socket;
int end_core = start_core + cores_per_socket;
actual_cores_p_s[s] = bit_set_count_range(avail_core,
start_core,
end_core);
tot_cores += avail_cores_per_sock[s];
}
list_itr_t *sock_list_iter;
sock_list_iter = list_iterator_create(sock_list);
while ((sock_gres = list_next(sock_list_iter))) {
bitstr_t *res_cores;
uint16_t tot_res_core;
uint32_t max_res_cores = 0;
uint64_t max_gres = 0;
uint32_t max_gres_by_cpu = 0;
/* Search position for restricted cores; starts at the node's last core */
int i = (sockets * cores_per_socket) - 1;
bool done = false;
if (!sock_gres->gres_state_job)
continue;
gres_job_state = sock_gres->gres_state_job;
gres_js = gres_job_state->gres_data;
/* Only GRES tied to the task count are affected by a task reduction */
if (!gres_js->gres_per_task &&
(!gres_js->ntasks_per_gres ||
(gres_js->ntasks_per_gres == NO_VAL16)))
continue;
/* Gres per node takes priority in selection */
if (gres_js->gres_per_node)
continue;
/* Most GRES usable for *maxtasks tasks; maxtasks is rounded down to a
 * multiple of ntasks_per_gres in the second form */
if (gres_js->gres_per_task)
max_gres = *maxtasks * gres_js->gres_per_task;
else if (gres_js->ntasks_per_gres) {
max_gres = *maxtasks / gres_js->ntasks_per_gres;
*maxtasks = max_gres * gres_js->ntasks_per_gres;
}
sock_gres->total_cnt = MIN(sock_gres->total_cnt, max_gres);
/* Core trimming only applies to GPUs with restricted cores on this node */
if ((gres_job_state->plugin_id != gres_get_gpu_plugin_id()) ||
!gres_js->res_gpu_cores ||
!gres_js->res_gpu_cores[node_i])
continue;
max_res_cores = max_gres * res_cores_per_gpu;
/* Restricted cores that are still available on this node */
res_cores = bit_copy(gres_js->res_gpu_cores[node_i]);
bit_and(res_cores, avail_core);
tot_res_core = bit_set_count(res_cores);
/*
 * Alternate between dropping surplus restricted cores and lowering the
 * task/GRES count to what the remaining CPUs support, until the
 * restricted core count fits the GRES count.
 */
while (!done) {
while (tot_res_core > max_res_cores) {
int s;
/*
* Must remove restricted cores from the end of
* the bitmap first since cores are picked from
* front to back. This helps the needed
* restricted cores get picked.
*/
i = bit_fls_from_bit(res_cores, i);
if (i < 0)
break; /* This should never happen */
bit_clear(avail_core, i);
tot_res_core--;
s = i / cores_per_socket;
actual_cores_p_s[s]--;
/* Lower the recorded per-socket count (and avail_cpus) only once the
 * real count has dropped below it */
if (actual_cores_p_s[s] <
avail_cores_per_sock[s]) {
int cnt;
avail_cores_per_sock[s]--;
tot_cores--;
cnt = tot_cores * cpus_per_core;
if (cnt < topo_eval->avail_cpus)
topo_eval->avail_cpus = cnt;
}
i--;
}
if (gres_js->cpus_per_gres) {
/* Fewer CPUs may support fewer GRES; shrink the task count to match */
max_gres_by_cpu = topo_eval->avail_cpus /
gres_js->cpus_per_gres;
while (max_gres_by_cpu < max_gres) {
(*maxtasks)--;
if (gres_js->gres_per_task) {
max_gres = *maxtasks *
gres_js->gres_per_task;
} else if (gres_js->ntasks_per_gres) {
max_gres = *maxtasks /
gres_js->
ntasks_per_gres;
*maxtasks = max_gres * gres_js->
ntasks_per_gres;
}
}
sock_gres->total_cnt =
MIN(sock_gres->total_cnt, max_gres);
max_res_cores = max_gres * res_cores_per_gpu;
if (tot_res_core <= max_res_cores)
done = true;
} else
done = true;
}
FREE_NULL_BITMAP(res_cores);
}
list_iterator_destroy(sock_list_iter);
xfree(actual_cores_p_s);
}
/*
 * Cap the task count for one node by the job's task distribution and, when
 * the result is below the node's GRES task limit, trim the node's GRES and
 * restricted cores accordingly.
 *
 * IN/OUT topo_eval - evaluation state, passed on to _reduce_res_cores()
 * IN maxtasks - task limit computed so far for this node
 * IN/OUT gres_max_tasks - GRES task limit of the node; replaced by the new
 *	task limit when that is smaller
 * IN node_ptr - node being evaluated
 * IN node_i - node index into the topo_eval arrays
 * IN select_inx - index of this node in details->arbitrary_tpn
 * RET number of tasks to lay out on the node
 */
static uint32_t _reduce_res_core_by_task_cnt(topology_eval_t* topo_eval,
					     uint64_t maxtasks,
					     uint32_t *gres_max_tasks,
					     node_record_t *node_ptr,
					     int node_i,
					     int select_inx)
{
	job_record_t *job = topo_eval->job_ptr;
	job_details_t *details = job->details;
	bool dist_is_plane = ((details->task_dist & SLURM_DIST_STATE_BASE) ==
			      SLURM_DIST_PLANE);
	bool dist_is_arbitrary =
		((details->task_dist & SLURM_DIST_STATE_BASE) ==
		 SLURM_DIST_ARBITRARY);
	/* Overcommit without per-task TRES means a single task per node */
	bool single_task = (details->overcommit && !job->tres_per_task);
	uint16_t plane_limit = NO_VAL16;
	uint16_t arbitrary_limit = NO_VAL16;

	if (dist_is_plane && !single_task) {
		if (details->mc_ptr)
			plane_limit = details->mc_ptr->plane_size;
		else
			plane_limit = 1;
	} else if (dist_is_arbitrary) {
		arbitrary_limit = details->arbitrary_tpn[select_inx];
	}

	/* Both limits are applied; an unset one still caps at NO_VAL16 */
	maxtasks = MIN(maxtasks, plane_limit);
	maxtasks = MIN(maxtasks, arbitrary_limit);

	if (maxtasks < *gres_max_tasks) {
		/* May lower maxtasks further */
		_reduce_res_cores(topo_eval, &maxtasks,
				  node_ptr->res_cores_per_gpu,
				  node_ptr->tot_sockets, node_ptr->cores,
				  node_ptr->tpc, node_i);
		*gres_max_tasks = maxtasks;
	}

	return MIN(maxtasks, *gres_max_tasks);
}
/*
 * Reduce the gres_max_tasks and total GRES available to a node based on
 * what will be laid out on the node.
 * If the GRES available gets reduced and RestrictedCoresPerGPU
 * is used, any unusable restricted cores will be removed.
 * If too many cores are removed such that the node is no longer usable
 * in the allocation it returns false, else true.
 *
 * IN/OUT topo_eval - avail_cpus is set to 0 when the node is rejected and is
 *	always copied into avail_res_array[node_i]->avail_cpus
 * IN/OUT max_tasks - tasks still to be placed; reduced by the tasks assigned
 *	to this node when the node is used
 * IN job_ptr - job being scheduled
 * IN node_ptr - node being evaluated
 * IN rem_nodes - nodes still to be selected, including this one
 * IN node_i - node index into the topo_eval arrays
 * IN select_inx - index of this node in details->arbitrary_tpn
 * RET true if the node can be used, false otherwise
 */
extern bool eval_nodes_gres(topology_eval_t *topo_eval,
uint64_t *max_tasks,
job_record_t *job_ptr,
node_record_t *node_ptr,
int rem_nodes,
int node_i,
int select_inx)
{
bool use_node = true;
uint64_t used_tasks;
/* Hold back one task for each node still to be selected after this one */
uint32_t save_tasks = MAX((rem_nodes - 1), 0);
uint16_t min_cpus = job_ptr->details->cpus_per_task;
/* NOTE(review): unsigned subtraction; wraps if *max_tasks < save_tasks */
*max_tasks -= save_tasks;
if (!job_ptr->details->overcommit) {
/* Tasks that do not fit in avail_cpus are also held back */
used_tasks = MIN(*max_tasks, (topo_eval->avail_cpus /
job_ptr->details->cpus_per_task));
if (used_tasks < *max_tasks) {
save_tasks += *max_tasks - used_tasks;
*max_tasks = used_tasks;
}
}
*max_tasks = MAX(*max_tasks, 1);
used_tasks = _reduce_res_core_by_task_cnt(
topo_eval, *max_tasks,
&topo_eval->avail_res_array[node_i]->gres_max_tasks, node_ptr,
node_i, select_inx);
/* NOTE(review): the product is narrowed to uint16_t on assignment */
if (!job_ptr->details->overcommit)
min_cpus = job_ptr->details->cpus_per_task * used_tasks;
else if (use_node)
min_cpus = job_ptr->details->cpus_per_task;
if (min_cpus < job_ptr->details->pn_min_cpus)
min_cpus = job_ptr->details->pn_min_cpus;
if (!used_tasks)
use_node = false;
else
use_node = topo_eval->avail_cpus >= min_cpus;
if (topo_eval->gres_per_job && use_node) {
use_node = gres_sched_add(
&topo_eval->avail_cpus,
topo_eval->avail_core[node_i],
topo_eval->avail_res_array[node_i]->
avail_cores_per_sock,
topo_eval->avail_res_array[node_i]->sock_gres_list,
job_ptr->gres_list_req,
node_ptr->res_cores_per_gpu,
node_ptr->tot_sockets,
node_ptr->cores, node_ptr->tpc, topo_eval->cr_type,
min_cpus, node_i);
}
if (use_node)
*max_tasks -= used_tasks;
else
topo_eval->avail_cpus = 0;
topo_eval->avail_res_array[node_i]->avail_cpus = topo_eval->avail_cpus;
/* Give back the tasks that were held back above */
*max_tasks += save_tasks;
return use_node;
}
/*
 * Compute the upper bound on the number of tasks for a job.
 *
 * IN job_ptr - job being scheduled
 * IN max_cpus - maximum number of CPUs the job may use
 * IN max_nodes - maximum number of nodes the job may use
 * RET - max_cpus when the job overcommits or cpus_per_task is 0 or 1
 *     - max_cpus / cpus_per_task when ntasks_per_node is not set
 *     - ntasks_per_node * max_nodes otherwise
 *
 * All arithmetic is done in 64 bits so that neither max_cpus nor the
 * ntasks_per_node * max_nodes product is truncated to 32 bits.
 */
extern uint64_t eval_nodes_set_max_tasks(job_record_t *job_ptr,
					 uint64_t max_cpus,
					 uint32_t max_nodes)
{
	uint64_t max_tasks = max_cpus;

	if (!job_ptr->details->overcommit &&
	    (job_ptr->details->cpus_per_task > 1)) {
		if (job_ptr->details->ntasks_per_node == 0) {
			max_tasks = max_tasks / job_ptr->details->cpus_per_task;
		} else {
			/* Widen before multiplying to avoid 32-bit wrap */
			max_tasks = (uint64_t) job_ptr->details->ntasks_per_node *
				    max_nodes;
		}
	}

	return max_tasks;
}
/*
 * For every node set in topo_eval->node_map, clear surplus bits in the
 * node's avail_core bitmap so that no socket has more cores set than its
 * avail_cores_per_sock[] entry. Cores are cleared from the end of each
 * socket towards its start.
 *
 * Does nothing when the job has no requested GRES list.
 *
 * IN/OUT topo_eval - avail_core[] bitmaps of the selected nodes are trimmed
 */
extern void eval_nodes_clip_socket_cores(topology_eval_t *topo_eval)
{
	node_record_t *node_ptr;

	if (!topo_eval->job_ptr->gres_list_req)
		return;

	for (int node_inx = 0;
	     (node_ptr = next_node_bitmap(topo_eval->node_map, &node_inx));
	     node_inx++) {
		bitstr_t *core_map = topo_eval->avail_core[node_inx];
		uint16_t *sock_limit =
			topo_eval->avail_res_array[node_inx]->
			avail_cores_per_sock;

		for (int sock = 0; sock < node_ptr->tot_sockets; sock++) {
			int first_core = sock * node_ptr->cores;
			int core = first_core + node_ptr->cores;
			uint16_t set_cnt = bit_set_count_range(core_map,
							       first_core,
							       core);

			/* Walk the socket backwards until it is within limit */
			while ((--core >= first_core) &&
			       (set_cnt > sock_limit[sock])) {
				if (bit_test(core_map, core)) {
					bit_clear(core_map, core);
					set_cnt--;
				}
			}
		}
	}
}
/*
 * A variation of _eval_nodes() to select resources using busy nodes first.
 *
 * Required nodes (details->req_node_bitmap) are evaluated first and every
 * one of them must provide CPUs. The remaining candidates are then visited
 * in order of increasing node weight; within each weight, nodes not set in
 * idle_node_bitmap are tried before the idle ones.
 *
 * IN/OUT topo_eval - node_map holds the candidate nodes on entry and the
 *	selected nodes on success; avail_cpus, max_nodes and gres_per_job
 *	are updated during the evaluation
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
static int _eval_nodes_busy(topology_eval_t *topo_eval)
{
int i, i_start, i_end, error_code = SLURM_ERROR;
int idle_test;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
int64_t rem_max_cpus;
job_record_t *job_ptr = topo_eval->job_ptr;
job_details_t *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
/* Copy of the candidate set; freed at "fini" */
bitstr_t *orig_node_map = bit_copy(topo_eval->node_map);
avail_res_t **avail_res_array = topo_eval->avail_res_array;
uint32_t min_nodes = topo_eval->min_nodes;
uint32_t req_nodes = topo_eval->req_nodes;
bool all_done = false;
node_record_t *node_ptr;
list_t *node_weight_list = NULL;
node_weight_type *nwt;
list_itr_t *iter;
uint64_t maxtasks;
topo_eval->avail_cpus = 0;
rem_cpus = details_ptr->min_cpus;
min_rem_nodes = min_nodes;
/* Never use more nodes than there are tasks */
if ((details_ptr->num_tasks != NO_VAL) &&
(details_ptr->num_tasks != 0))
topo_eval->max_nodes = MIN(topo_eval->max_nodes,
details_ptr->num_tasks);
if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
topo_eval->max_nodes);
/* Scan range; an empty node_map yields i_end < i_start (no iterations) */
i_start = bit_ffs(topo_eval->node_map);
if (i_start >= 0)
i_end = bit_fls(topo_eval->node_map);
else
i_end = i_start - 1;
if (req_map) {
/* Reduce node_map to the required nodes and account for their resources */
for (i = i_start; i <= i_end; i++) {
if (!bit_test(req_map, i)) {
bit_clear(topo_eval->node_map, i);
continue;
}
node_ptr = node_record_table_ptr[i];
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
if (topo_eval->max_nodes <= 0) {
log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit",
job_ptr);
goto fini;
}
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true);
if (topo_eval->avail_cpus <= 0) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
/* leaving bitmap set, decr max limit */
topo_eval->max_nodes--;
}
/* The required nodes alone may already satisfy the request */
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(topo_eval->node_map, req_map);
goto fini;
}
if (topo_eval->max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
/* Required nodes are already selected; drop them from the candidates */
bit_and_not(orig_node_map, topo_eval->node_map);
} else {
bit_clear_all(topo_eval->node_map);
}
/* Compute CPUs already allocated to required nodes */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%pJ can't use required nodes due to max CPU limit",
job_ptr);
goto fini;
}
/*
* Start by using nodes that already have a job running.
* Then try to use idle nodes.
*/
if (topo_eval->max_nodes == 0)
all_done = true;
node_weight_list = _build_node_weight_list(orig_node_map);
iter = list_iterator_create(node_weight_list);
while (!all_done && (nwt = list_next(iter))) {
/* idle_test 0: nodes not in idle_node_bitmap; idle_test 1: idle nodes */
for (idle_test = 0; idle_test < 2; idle_test++) {
for (i = i_start; i <= i_end; i++) {
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus)
continue;
/* Node not available or already selected */
if (!bit_test(nwt->node_bitmap, i) ||
bit_test(topo_eval->node_map, i))
continue;
if (((idle_test == 0) &&
bit_test(idle_node_bitmap, i)) ||
((idle_test == 1) &&
!bit_test(idle_node_bitmap, i)))
continue;
eval_nodes_select_cores(topo_eval, i,
min_rem_nodes);
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true);
if (topo_eval->avail_cpus == 0)
continue;
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
bit_set(topo_eval->node_map, i);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
all_done = true;
break;
}
if (topo_eval->max_nodes == 0) {
all_done = true;
break;
}
}
}
}
list_iterator_destroy(iter);
/*
 * Not flagged as done above: still succeed when the minimum CPU, node
 * and GRES requirements are met, otherwise discard the selection.
 */
if (error_code == SLURM_SUCCESS) {
/* Already succeeded */
} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
!gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
bit_clear_all(topo_eval->node_map);
error_code = SLURM_ERROR;
} else {
error_code = SLURM_SUCCESS;
}
fini:
if (error_code == SLURM_SUCCESS)
eval_nodes_clip_socket_cores(topo_eval);
FREE_NULL_LIST(node_weight_list);
FREE_NULL_BITMAP(orig_node_map);
return error_code;
}
/*
 * Select nodes for a job favoring sets of consecutive node indexes.
 *
 * Resources of required nodes (details->req_node_bitmap) are counted first.
 * The candidate nodes are then grouped into sets of consecutive indexes; a
 * set also ends where sched_weight changes, unless the job requested
 * contiguous nodes. Sets are consumed one at a time - preferring a set
 * holding required nodes, then the lowest weight, then the tightest
 * sufficient fit - until the node, CPU and GRES needs are met or max_nodes
 * is used up.
 *
 * IN/OUT topo_eval - node_map holds the candidate nodes on entry and the
 *	selected nodes on success; avail_cpus, max_nodes and gres_per_job
 *	are updated during the evaluation
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
static int _eval_nodes_consec(topology_eval_t *topo_eval)
{
int i, j, error_code = SLURM_ERROR;
int *consec_cpus; /* how many CPUs we can add from this
* consecutive set of nodes */
/* NOTE(review): only allocated (and only used/freed) when gres_per_job is
 * set; it is otherwise left uninitialized */
list_t **consec_gres; /* how many GRES we can add from this
* consecutive set of nodes */
int *consec_nodes; /* how many nodes we can add from this
* consecutive set of nodes */
int *consec_start; /* where this consecutive set starts (index) */
int *consec_end; /* where this consecutive set ends (index) */
int *consec_req; /* are nodes from this set required
* (in req_bitmap) */
uint64_t *consec_weight; /* node scheduling weight */
node_record_t *node_ptr = NULL;
int consec_index, consec_size, sufficient;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int best_fit_nodes, best_fit_cpus, best_fit_req;
int best_fit_sufficient, best_fit_index = 0;
bool new_best;
uint64_t best_weight = 0;
int64_t rem_max_cpus;
int total_cpus = 0; /* #CPUs allocated to job */
bool required_node;
avail_res_t **avail_res_array = topo_eval->avail_res_array;
job_record_t *job_ptr = topo_eval->job_ptr;
job_details_t *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
uint32_t min_nodes = topo_eval->min_nodes;
uint32_t req_nodes = topo_eval->req_nodes;
uint16_t *avail_cpu_per_node = NULL;
uint64_t maxtasks;
topo_eval->avail_cpus = 0;
/* make allocation for 50 sets of consecutive nodes, expand as needed */
consec_size = 50;
consec_cpus = xcalloc(consec_size, sizeof(int));
consec_nodes = xcalloc(consec_size, sizeof(int));
consec_start = xcalloc(consec_size, sizeof(int));
consec_end = xcalloc(consec_size, sizeof(int));
consec_req = xcalloc(consec_size, sizeof(int));
consec_weight = xcalloc(consec_size, sizeof(uint64_t));
/* Build table with information about sets of consecutive nodes */
consec_index = 0;
consec_req[consec_index] = -1; /* no required nodes here by default */
consec_weight[consec_index] = NO_VAL64;
/* CPUs usable on each node, indexed by node index */
avail_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t));
rem_cpus = details_ptr->min_cpus;
min_rem_nodes = min_nodes;
if ((topo_eval->gres_per_job =
gres_sched_init(job_ptr->gres_list_req))) {
rem_nodes = MIN(min_nodes, req_nodes);
consec_gres = xcalloc(consec_size, sizeof(list_t *));
} else
rem_nodes = MAX(min_nodes, req_nodes);
rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
topo_eval->max_nodes);
/*
* If there are required nodes, first determine the resources they
* provide, then select additional resources as needed in next loop
*/
if (req_map) {
/* Position of the current required node in arbitrary_tpn[] */
int count = 0;
uint16_t *arbitrary_tpn = job_ptr->details->arbitrary_tpn;
for (i = 0;
((node_ptr = next_node_bitmap(req_map, &i)) &&
(topo_eval->max_nodes > 0));
i++) {
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
if (arbitrary_tpn) {
/* CPUs needed here: the node's task count times cpus_per_task, but at
 * least pn_min_cpus and min_gres_cpu */
int req_cpus = arbitrary_tpn[count++];
if ((details_ptr->cpus_per_task != NO_VAL16) &&
(details_ptr->cpus_per_task != 0))
req_cpus *= details_ptr->cpus_per_task;
req_cpus = MAX(req_cpus,
(int) details_ptr->pn_min_cpus);
req_cpus = MAX(req_cpus,
details_ptr->min_gres_cpu);
if (topo_eval->avail_cpus < req_cpus) {
debug("%pJ required node %s needed %d cpus but only has %d",
job_ptr, node_ptr->name, req_cpus,
topo_eval->avail_cpus);
goto fini;
}
topo_eval->avail_cpus = req_cpus;
avail_res_array[i]->avail_cpus =
topo_eval->avail_cpus;
if (topo_eval->gres_per_job) {
/* Return value unused: on rejection eval_nodes_gres() zeroes
 * avail_cpus, which the check below catches */
eval_nodes_gres(topo_eval, &maxtasks,
job_ptr, node_ptr,
min_rem_nodes, i,
(count - 1));
}
} else
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true);
if (topo_eval->avail_cpus == 0) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
avail_cpu_per_node[i] = topo_eval->avail_cpus;
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
}
/* The required nodes alone may already satisfy the request */
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(topo_eval->node_map, req_map);
goto fini;
}
if (topo_eval->max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
}
for (i = 0; next_node(&i); i++) { /* For each node */
/* Grow all per-set tables together before another set may be started */
if ((consec_index + 1) >= consec_size) {
consec_size *= 2;
xrecalloc(consec_cpus, consec_size, sizeof(int));
xrecalloc(consec_nodes, consec_size, sizeof(int));
xrecalloc(consec_start, consec_size, sizeof(int));
xrecalloc(consec_end, consec_size, sizeof(int));
xrecalloc(consec_req, consec_size, sizeof(int));
xrecalloc(consec_weight, consec_size, sizeof(uint64_t));
if (topo_eval->gres_per_job) {
xrecalloc(consec_gres,
consec_size, sizeof(list_t *));
}
}
if (req_map)
required_node = bit_test(req_map, i);
else
required_node = false;
if (!bit_test(topo_eval->node_map, i)) {
node_ptr = NULL; /* Use as flag, avoid second test */
} else if (required_node) {
node_ptr = node_record_table_ptr[i];
} else {
node_ptr = node_record_table_ptr[i];
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
if (topo_eval->avail_cpus == 0) {
bit_clear(topo_eval->node_map, i);
node_ptr = NULL;
}
avail_cpu_per_node[i] = topo_eval->avail_cpus;
}
/*
* If job requested contiguous nodes,
* do not worry about matching node weights
*/
if (node_ptr &&
!details_ptr->contiguous &&
(consec_weight[consec_index] != NO_VAL64) && /* Init value*/
(node_ptr->sched_weight != consec_weight[consec_index])) {
/* End last consecutive set, setup start of next set */
if (consec_nodes[consec_index] == 0) {
/* Only required nodes, reuse consec record */
consec_req[consec_index] = -1;
} else {
/* End last set, setup for start of next set */
consec_end[consec_index] = i - 1;
consec_req[++consec_index] = -1;
}
}
if (node_ptr) {
if (consec_nodes[consec_index] == 0)
consec_start[consec_index] = i;
if (required_node) {
/*
* Required node, resources counters updated
* in above loop, leave bitmap set
*/
if (consec_req[consec_index] == -1) {
/* first required node in set */
consec_req[consec_index] = i;
}
continue;
}
/* node not selected (yet) */
bit_clear(topo_eval->node_map, i);
consec_cpus[consec_index] += topo_eval->avail_cpus;
consec_nodes[consec_index]++;
if (topo_eval->gres_per_job) {
gres_sched_consec(
&consec_gres[consec_index],
job_ptr->gres_list_req,
avail_res_array[i]->sock_gres_list);
}
consec_weight[consec_index] = node_ptr->sched_weight;
} else if (consec_nodes[consec_index] == 0) {
/* Only required nodes, reuse consec record */
consec_req[consec_index] = -1;
consec_weight[consec_index] = NO_VAL64;
} else {
/* End last set, setup for start of next set */
consec_end[consec_index] = i - 1;
consec_req[++consec_index] = -1;
consec_weight[consec_index] = NO_VAL64;
}
}
/* Close the final set; from here consec_index is the number of sets */
if (consec_nodes[consec_index] != 0)
consec_end[consec_index++] = i - 1;
if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) {
if (consec_index == 0) {
info("consec_index is zero");
}
for (i = 0; i < consec_index; i++) {
char *gres_str = NULL, *gres_print = "";
bitstr_t *host_bitmap;
char *host_list;
if (topo_eval->gres_per_job) {
gres_str = gres_sched_str(consec_gres[i]);
if (gres_str) {
xstrcat(gres_str, " ");
gres_print = gres_str;
}
}
host_bitmap = bit_alloc(node_record_count);
bit_nset(host_bitmap, consec_start[i], consec_end[i]);
host_list = bitmap2node_name(host_bitmap);
info("set:%d consec CPUs:%d nodes:%d:%s %sbegin:%d end:%d required:%d weight:%"PRIu64,
i, consec_cpus[i], consec_nodes[i],
host_list, gres_print, consec_start[i],
consec_end[i], consec_req[i], consec_weight[i]);
FREE_NULL_BITMAP(host_bitmap);
xfree(gres_str);
xfree(host_list);
}
}
/* Compute CPUs already allocated to required nodes */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%pJ can't use required nodes due to max CPU limit",
job_ptr);
goto fini;
}
/*
* accumulate nodes from these sets of consecutive nodes until
* sufficient resources have been accumulated
*/
while (consec_index && (topo_eval->max_nodes > 0)) {
best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0;
best_fit_req = -1; /* first required node, -1 if none */
for (i = 0; i < consec_index; i++) {
if (consec_nodes[i] == 0)
continue; /* no usable nodes here */
if (details_ptr->contiguous &&
details_ptr->req_node_bitmap &&
(consec_req[i] == -1))
continue; /* not required nodes */
sufficient = (consec_cpus[i] >= rem_cpus) &&
eval_nodes_enough_nodes(
consec_nodes[i], rem_nodes,
min_nodes, req_nodes);
if (sufficient && topo_eval->gres_per_job) {
sufficient = gres_sched_sufficient(
job_ptr->gres_list_req, consec_gres[i]);
}
/*
* if first possibility OR
* contains required nodes OR
* lowest node weight
*/
if ((best_fit_nodes == 0) ||
((best_fit_req == -1) && (consec_req[i] != -1)) ||
(consec_weight[i] < best_weight))
new_best = true;
else
new_best = false;
/*
* If equal node weight
* first set large enough for request OR
* tightest fit (less resource/CPU waste) OR
* nothing yet large enough, but this is biggest
*/
if (!new_best && (consec_weight[i] == best_weight) &&
((sufficient && (best_fit_sufficient == 0)) ||
(sufficient && (consec_cpus[i] < best_fit_cpus)) ||
(!sufficient &&
(consec_cpus[i] > best_fit_cpus))))
new_best = true;
/*
* if first continuous node set large enough
*/
if (!new_best && !best_fit_sufficient &&
details_ptr->contiguous && sufficient)
new_best = true;
if (new_best) {
best_fit_cpus = consec_cpus[i];
best_fit_nodes = consec_nodes[i];
best_fit_index = i;
best_fit_req = consec_req[i];
best_fit_sufficient = sufficient;
best_weight = consec_weight[i];
}
if (details_ptr->contiguous &&
details_ptr->req_node_bitmap) {
/*
* Must wait for all required nodes to be
* in a single consecutive block
*/
/* NOTE(review): this j shadows the function-scope j */
int j, other_blocks = 0;
for (j = (i+1); j < consec_index; j++) {
if (consec_req[j] != -1) {
other_blocks = 1;
break;
}
}
if (other_blocks) {
best_fit_nodes = 0;
break;
}
}
}
if (best_fit_nodes == 0)
break;
if (details_ptr->contiguous && !best_fit_sufficient)
break; /* no hole large enough */
if (best_fit_req != -1) {
/*
* This collection of nodes includes required ones
* select nodes from this set, first working up
* then down from the required nodes
*/
for (i = best_fit_req;
i <= consec_end[best_fit_index]; i++) {
if ((topo_eval->max_nodes == 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))))
break;
if (bit_test(topo_eval->node_map, i)) {
/* required node already in set */
continue;
}
if (avail_cpu_per_node[i] == 0)
continue;
topo_eval->avail_cpus = avail_cpu_per_node[i];
/*
* This could result in 0, but if the user
* requested nodes here we will still give
* them and then the step layout will sort
* things out. But if the gres's cpu requirement
* can not be satisfied due to gres layout try
* next node.
*/
if (!eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true))
continue;
total_cpus += topo_eval->avail_cpus;
bit_set(topo_eval->node_map, i);
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
}
for (i = (best_fit_req - 1);
i >= consec_start[best_fit_index]; i--) {
if ((topo_eval->max_nodes == 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))))
break;
if (bit_test(topo_eval->node_map, i))
continue;
if (avail_cpu_per_node[i] == 0)
continue;
topo_eval->avail_cpus = avail_cpu_per_node[i];
/*
* This could result in 0, but if the user
* requested nodes here we will still give
* them and then the step layout will sort
* things out. But if the gres's cpu requirement
* can not be satisfied due to gres layout try
* next node.
*/
if (!eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true))
continue;
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
bit_set(topo_eval->node_map, i);
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
}
} else {
/* No required nodes, try best fit single node */
int best_fit = -1, best_size = 0;
int first = consec_start[best_fit_index];
int last = consec_end[best_fit_index];
if (rem_nodes <= 1) {
/* NOTE(review): j is incremented in this loop and the one below but its
 * value is never read */
for (i = first, j = 0; i <= last; i++, j++) {
if (bit_test(topo_eval->node_map, i) ||
!avail_res_array[i])
continue;
if (avail_cpu_per_node[i] < rem_cpus)
continue;
if (topo_eval->gres_per_job &&
!gres_sched_sufficient(
job_ptr->gres_list_req,
avail_res_array[i]->
sock_gres_list)) {
continue;
}
if ((best_fit == -1) ||
(avail_cpu_per_node[i] <best_size)){
best_fit = i;
best_size =
avail_cpu_per_node[i];
if (best_size == rem_cpus)
break;
}
}
/*
* If we found a single node to use,
* clear CPU counts for all other nodes
*/
if (best_fit != -1) {
for (i = first; i <= last; i++) {
if (i == best_fit)
continue;
avail_cpu_per_node[i] = 0;
}
}
}
for (i = first, j = 0; i <= last; i++, j++) {
if ((topo_eval->max_nodes == 0) ||
((rem_nodes <= 0) && (rem_cpus <= 0) &&
(!topo_eval->gres_per_job ||
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id))))
break;
if (bit_test(topo_eval->node_map, i) ||
!avail_res_array[i])
continue;
topo_eval->avail_cpus = avail_cpu_per_node[i];
if (topo_eval->avail_cpus <= 0)
continue;
if ((topo_eval->max_nodes == 1) &&
(topo_eval->avail_cpus < rem_cpus)) {
/*
* Job can only take one more node and
* this one has insufficient CPU
*/
continue;
}
/*
* This could result in 0, but if the user
* requested nodes here we will still give
* them and then the step layout will sort
* things out. But if the gres's cpu requirement
* can not be satisfied due to gres layout try
* next node.
*/
if (!eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true))
continue;
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
bit_set(topo_eval->node_map, i);
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
}
}
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
break;
}
/* Mark this set as consumed so the next pass picks another one */
consec_cpus[best_fit_index] = 0;
consec_nodes[best_fit_index] = 0;
}
/* The CPU and GRES needs may be met even though rem_nodes is not yet zero */
if (error_code && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id) &&
eval_nodes_enough_nodes(0, rem_nodes, min_nodes, req_nodes))
error_code = SLURM_SUCCESS;
fini:
if (error_code == SLURM_SUCCESS)
eval_nodes_clip_socket_cores(topo_eval);
xfree(avail_cpu_per_node);
xfree(consec_cpus);
xfree(consec_nodes);
xfree(consec_start);
xfree(consec_end);
xfree(consec_req);
xfree(consec_weight);
if (topo_eval->gres_per_job) {
for (i = 0; i < consec_size; i++)
FREE_NULL_LIST(consec_gres[i]);
xfree(consec_gres);
}
return error_code;
}
/*
 * Select nodes for a job using the Least Loaded Node (LLN) policy.
 *
 * Required nodes (details->req_node_bitmap) are taken first; a required node
 * without usable CPUs aborts the selection. Additional nodes are then picked
 * one node-weight group at a time, in the order of the list returned by
 * _build_node_weight_list(). Within a group the candidate with the largest
 * ratio of avail_res max_cpus to the node's total cpus is chosen each round.
 *
 * IN/OUT topo_eval - node_map holds the candidate nodes on entry and the
 *	selected nodes on return (cleared when the final resource check
 *	fails); max_nodes, avail_cpus and gres_per_job are modified.
 * RET SLURM_SUCCESS if the job's node, CPU and GRES needs are met,
 *	otherwise SLURM_ERROR
 */
static int _eval_nodes_lln(topology_eval_t *topo_eval)
{
int i, i_start, i_end, error_code = SLURM_ERROR;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
int64_t rem_max_cpus;
job_record_t *job_ptr = topo_eval->job_ptr;
job_details_t *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
/* Copy of the candidate set; freed at fini */
bitstr_t *orig_node_map = bit_copy(topo_eval->node_map);
bool all_done = false;
node_record_t *node_ptr;
list_t *node_weight_list = NULL;
node_weight_type *nwt;
list_itr_t *iter;
avail_res_t **avail_res_array = topo_eval->avail_res_array;
uint32_t min_nodes = topo_eval->min_nodes;
uint32_t req_nodes = topo_eval->req_nodes;
uint64_t maxtasks;
topo_eval->avail_cpus = 0;
rem_cpus = details_ptr->min_cpus;
min_rem_nodes = min_nodes;
/* Never use more nodes than the job has tasks */
if ((details_ptr->num_tasks != NO_VAL) &&
(details_ptr->num_tasks != 0))
topo_eval->max_nodes = MIN(topo_eval->max_nodes,
details_ptr->num_tasks);
/*
 * Record whether gres_sched_init() reported per-job GRES. If so, target
 * the smaller of min_nodes/req_nodes, otherwise the larger.
 */
if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
topo_eval->max_nodes);
/*
 * Bound the scans below to the span of candidate nodes. With an empty
 * node_map, i_end ends up below i_start so the loops do not execute.
 */
i_start = bit_ffs(topo_eval->node_map);
if (i_start >= 0)
i_end = bit_fls(topo_eval->node_map);
else
i_end = i_start - 1;
if (req_map) {
/* Pass 1: take every required node, drop all others from node_map */
for (i = i_start; i <= i_end; i++) {
if (!bit_test(req_map, i)) {
bit_clear(topo_eval->node_map, i);
continue;
}
node_ptr = node_record_table_ptr[i];
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
if (topo_eval->max_nodes <= 0) {
log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit",
job_ptr);
goto fini;
}
/* Sets topo_eval->avail_cpus for node i, then trims it */
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes,
&maxtasks, true);
if (topo_eval->avail_cpus <= 0) {
debug("%pJ required node %s not available",
job_ptr, node_ptr->name);
goto fini;
}
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
/* leaving bitmap set, decr max limit */
topo_eval->max_nodes--;
}
/* Required nodes alone satisfy the request */
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(topo_eval->node_map, req_map);
goto fini;
}
/* More resources are needed but no further node may be added */
if (topo_eval->max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
/* Remaining candidates = original set minus nodes already selected */
bit_and_not(orig_node_map, topo_eval->node_map);
} else {
/* No required nodes: start from an empty selection */
bit_clear_all(topo_eval->node_map);
}
/* Reject if CPUs already taken on required nodes exceed the max CPU limit */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%pJ can't use required nodes due to max CPU limit",
job_ptr);
goto fini;
}
/*
* Accumulate nodes from those with highest available CPU count.
* Logic is optimized for small node/CPU count allocations.
* For larger allocation, use list_sort().
*/
if (topo_eval->max_nodes == 0)
all_done = true;
node_weight_list = _build_node_weight_list(orig_node_map);
iter = list_iterator_create(node_weight_list);
/* Pass 2: one node-weight group at a time, one node per inner round */
while (!all_done && (nwt = list_next(iter))) {
/* max_cpus of the node picked in the previous round of this group */
int last_max_cpu_cnt = -1;
while (!all_done) {
int max_cpu_idx = -1;
for (i = i_start; i <= i_end; i++) {
/* Node not available or already selected */
if (!bit_test(nwt->node_bitmap, i) ||
bit_test(topo_eval->node_map, i))
continue;
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus)
continue;
eval_nodes_select_cores(topo_eval, i,
min_rem_nodes);
if (topo_eval->avail_cpus == 0)
continue;
/*
* Find the "least-loaded" node at the current
* node-weight level. This is defined as the
* node with the greatest ratio of available to
* total cpus. (But shift the divisors around
* to avoid any floating-point math.)
*/
if ((max_cpu_idx == -1) ||
((avail_res_array[max_cpu_idx]->max_cpus *
node_record_table_ptr[i]->cpus) <
(avail_res_array[i]->max_cpus *
node_record_table_ptr[max_cpu_idx]->
cpus))) {
max_cpu_idx = i;
/*
 * Stop scanning as soon as the new best matches
 * the max_cpus of the previously picked node.
 */
if (avail_res_array[max_cpu_idx]->
max_cpus == last_max_cpu_cnt)
break;
}
}
if (max_cpu_idx == -1) {
/* No more usable nodes left, get next weight */
break;
}
i = max_cpu_idx;
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus,
min_rem_nodes, &maxtasks,
true);
/*
 * NOTE(review): this rescans without marking node i as rejected;
 * termination appears to rely on the helpers above making the node
 * ineligible on the next scan - confirm.
 */
if (topo_eval->avail_cpus == 0)
continue;
last_max_cpu_cnt = avail_res_array[i]->max_cpus;
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
bit_set(topo_eval->node_map, i);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
all_done = true;
break;
}
/* Node limit reached; the final check below decides the outcome */
if (topo_eval->max_nodes == 0) {
all_done = true;
break;
}
}
}
list_iterator_destroy(iter);
/*
 * Final verdict: without an early success, the selection is still accepted
 * when CPU, minimum node count and GRES needs are all met; otherwise the
 * selection is discarded.
 */
if (error_code == SLURM_SUCCESS) {
/* Already succeeded */
} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
!gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
bit_clear_all(topo_eval->node_map);
error_code = SLURM_ERROR;
} else {
error_code = SLURM_SUCCESS;
}
fini:
if (error_code == SLURM_SUCCESS)
eval_nodes_clip_socket_cores(topo_eval);
FREE_NULL_LIST(node_weight_list);
FREE_NULL_BITMAP(orig_node_map);
return error_code;
}
/*
 * A variation of _eval_nodes() to select resources at the end of the node
 * list to reduce fragmentation
 *
 * Required nodes (details->req_node_bitmap) are taken first. Additional
 * nodes are then taken one node-weight group at a time, scanning each group
 * from the highest candidate node index down to the lowest.
 *
 * IN/OUT topo_eval - node_map holds the candidate nodes on entry and the
 *	selected nodes on return (cleared when the final resource check
 *	fails); max_nodes, avail_cpus and gres_per_job are modified.
 * RET SLURM_SUCCESS if the job's node, CPU and GRES needs are met,
 *	otherwise SLURM_ERROR
 */
static int _eval_nodes_serial(topology_eval_t *topo_eval)
{
int i, i_start, i_end, error_code = SLURM_ERROR;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes; /* remaining resources desired */
int total_cpus = 0; /* #CPUs allocated to job */
int64_t rem_max_cpus;
job_record_t *job_ptr = topo_eval->job_ptr;
job_details_t *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
/* Copy of the candidate set; freed at fini */
bitstr_t *orig_node_map = bit_copy(topo_eval->node_map);
avail_res_t **avail_res_array = topo_eval->avail_res_array;
uint32_t min_nodes = topo_eval->min_nodes;
uint32_t req_nodes = topo_eval->req_nodes;
bool all_done = false;
node_record_t *node_ptr;
list_t *node_weight_list = NULL;
node_weight_type *nwt;
list_itr_t *iter;
uint64_t maxtasks;
topo_eval->avail_cpus = 0;
rem_cpus = details_ptr->min_cpus;
min_rem_nodes = min_nodes;
/* Never use more nodes than the job has tasks */
if ((details_ptr->num_tasks != NO_VAL) &&
(details_ptr->num_tasks != 0))
topo_eval->max_nodes = MIN(topo_eval->max_nodes,
details_ptr->num_tasks);
/*
 * Record whether gres_sched_init() reported per-job GRES. If so, target
 * the smaller of min_nodes/req_nodes, otherwise the larger.
 */
if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
topo_eval->max_nodes);
/*
 * Bound the scans below to the span of candidate nodes. With an empty
 * node_map, i_end ends up below i_start so the loops do not execute.
 */
i_start = bit_ffs(topo_eval->node_map);
if (i_start >= 0)
i_end = bit_fls(topo_eval->node_map);
else
i_end = i_start - 1;
if (req_map) {
/* Pass 1: take every required node, drop all others from node_map */
for (i = i_start; i <= i_end; i++) {
if (!bit_test(req_map, i)) {
bit_clear(topo_eval->node_map, i);
continue;
}
node_ptr = node_record_table_ptr[i];
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
if (topo_eval->max_nodes <= 0) {
log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit",
job_ptr);
goto fini;
}
/* Sets topo_eval->avail_cpus for node i, then trims it */
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus, min_rem_nodes,
&maxtasks, true);
if (topo_eval->avail_cpus <= 0) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
/* leaving bitmap set, decr max limit */
topo_eval->max_nodes--;
}
/* Required nodes alone satisfy the request */
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
bit_and(topo_eval->node_map, req_map);
goto fini;
}
/* More resources are needed but no further node may be added */
if (topo_eval->max_nodes <= 0) {
error_code = SLURM_ERROR;
goto fini;
}
/* Remaining candidates = original set minus nodes already selected */
bit_and_not(orig_node_map, topo_eval->node_map);
} else {
/* No required nodes: start from an empty selection */
bit_clear_all(topo_eval->node_map);
}
/* Reject if CPUs already taken on required nodes exceed the max CPU limit */
if ((details_ptr->max_cpus != NO_VAL) &&
(total_cpus > details_ptr->max_cpus)) {
info("%pJ can't use required nodes due to max CPU limit",
job_ptr);
goto fini;
}
if (topo_eval->max_nodes == 0)
all_done = true;
node_weight_list = _build_node_weight_list(orig_node_map);
iter = list_iterator_create(node_weight_list);
/* Pass 2: per node-weight group, walk candidates from the end backwards */
while (!all_done && (nwt = list_next(iter))) {
for (i = i_end;
((i >= i_start) && (topo_eval->max_nodes > 0));
i--) {
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus)
continue;
/* Node not available or already selected */
if (!bit_test(nwt->node_bitmap, i) ||
bit_test(topo_eval->node_map, i))
continue;
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
(void) eval_nodes_cpus_to_use(topo_eval, i,
rem_max_cpus, min_rem_nodes,
&maxtasks, true);
/* Nothing usable on this node; try the next lower index */
if (topo_eval->avail_cpus == 0)
continue;
total_cpus += topo_eval->avail_cpus;
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
bit_set(topo_eval->node_map, i);
if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
gres_sched_test(job_ptr->gres_list_req,
job_ptr->job_id)) {
error_code = SLURM_SUCCESS;
all_done = true;
break;
}
/* Node limit reached; the final check below decides the outcome */
if (topo_eval->max_nodes == 0) {
all_done = true;
break;
}
}
}
list_iterator_destroy(iter);
/*
 * Final verdict: without an early success, the selection is still accepted
 * when CPU, minimum node count and GRES needs are all met; otherwise the
 * selection is discarded.
 */
if (error_code == SLURM_SUCCESS) {
/* Already succeeded */
} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
!gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
bit_clear_all(topo_eval->node_map);
error_code = SLURM_ERROR;
} else {
error_code = SLURM_SUCCESS;
}
fini:
if (error_code == SLURM_SUCCESS)
eval_nodes_clip_socket_cores(topo_eval);
FREE_NULL_LIST(node_weight_list);
FREE_NULL_BITMAP(orig_node_map);
return error_code;
}
/*
 * List callback for _eval_nodes_spread(): add every eligible node of one
 * node-weight group to the selection, charging each added node
 * cpus_per_task CPUs.
 *
 * IN x - node_weight_type record for the group being visited
 * IN/OUT arg - foreach_add_nodes_lln_t with the selection state; node_map,
 *	used_cpu_per_node and the counters it points to are updated
 * RET 1 once the node limit or the wanted node count is reached, else 0
 *
 * NOTE(review): whether a positive return value halts the caller's list
 * iteration is not visible here - confirm against the list API.
 */
static int _add_nodes_by_weight_spread(void *x, void *arg)
{
	node_weight_type *weight_grp = x;
	foreach_add_nodes_lln_t *ctx = arg;
	int inx;

	for (inx = ctx->i_start; inx <= ctx->i_end; inx++) {
		/*
		 * Skip nodes without resources, outside this weight group,
		 * already selected, or with no CPUs usable by the job.
		 */
		if (!ctx->avail_res_array[inx] ||
		    !ctx->avail_res_array[inx]->avail_cpus ||
		    !bit_test(weight_grp->node_bitmap, inx) ||
		    bit_test(ctx->node_map, inx) ||
		    !ctx->avail_cpu_per_node[inx])
			continue;

		/* Select the node and start it with one task's worth of CPUs */
		bit_set(ctx->node_map, inx);
		ctx->used_cpu_per_node[inx] = ctx->cpus_per_task;

		--(*ctx->rem_nodes);
		--(*ctx->min_rem_nodes);
		--(*ctx->max_nodes);
		*ctx->rem_max_cpus -= ctx->cpus_per_task;
		*ctx->rem_cpus -= ctx->cpus_per_task;

		if ((*ctx->max_nodes <= 0) || (*ctx->rem_nodes <= 0))
			return 1;
	}

	return 0;
}
/*
 * A variation of _eval_nodes() to select resources using as many nodes as
 * possible.
 *
 * Outline:
 *  1. Compute the usable CPU count of every candidate node.
 *  2. Take all required nodes, then add nodes per node-weight group through
 *     _add_nodes_by_weight_spread(); each selected node is initially charged
 *     cpus_per_task CPUs in used_cpu_per_node[].
 *  3. Hand out further CPUs round-robin, cpus_per_task at a time, until the
 *     CPU request is covered or every selected node is full.
 *  4. If still short, go back to add more nodes (label more_nodes).
 *  5. Reset the counters and validate each selected node with
 *     eval_nodes_cpus_to_use(); nodes that fail are dropped.
 *
 * IN/OUT topo_eval - node_map holds the candidate nodes on entry and the
 *	selected nodes on return (cleared when the final resource check
 *	fails); max_nodes, avail_cpus and gres_per_job are modified.
 * RET SLURM_SUCCESS if the job's node, CPU and GRES needs are met,
 *	otherwise SLURM_ERROR
 */
static int _eval_nodes_spread(topology_eval_t *topo_eval)
{
int i, i_start, i_end, error_code = SLURM_ERROR;
int rem_cpus, rem_nodes; /* remaining resources desired */
int min_rem_nodes, orig_min_rem_nodes; /* remaining resources desired */
int64_t rem_max_cpus, orig_rem_max_cpus;
avail_res_t **avail_res_array = topo_eval->avail_res_array;
job_record_t *job_ptr = topo_eval->job_ptr;
job_details_t *details_ptr = job_ptr->details;
bitstr_t *req_map = details_ptr->req_node_bitmap;
/* Copy of the candidate set; freed at fini */
bitstr_t *orig_node_map = bit_copy(topo_eval->node_map);
uint32_t min_nodes = topo_eval->min_nodes;
uint32_t req_nodes = topo_eval->req_nodes;
uint32_t cpus_per_task = job_ptr->details->cpus_per_task;
bool all_done = false;
node_record_t *node_ptr;
list_t *node_weight_list = NULL;
uint64_t maxtasks;
/* Per-node arrays indexed by node index, sized node_record_count */
uint16_t *avail_cpu_per_node = NULL;
uint16_t *used_cpu_per_node = NULL;
/* max_nodes value seen at the previous "add more nodes" decision */
uint32_t prev_max_nodes = topo_eval->max_nodes;
foreach_add_nodes_lln_t args = { 0 };
topo_eval->avail_cpus = 0;
rem_cpus = details_ptr->min_cpus;
min_rem_nodes = min_nodes;
orig_min_rem_nodes = min_rem_nodes;
/* Never use more nodes than the job has tasks */
if ((details_ptr->num_tasks != NO_VAL) &&
(details_ptr->num_tasks != 0))
topo_eval->max_nodes =
MIN(topo_eval->max_nodes, details_ptr->num_tasks);
/*
 * Record whether gres_sched_init() reported per-job GRES. If so, target
 * the smaller of min_nodes/req_nodes, otherwise the larger.
 */
if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req)))
rem_nodes = MIN(min_nodes, req_nodes);
else
rem_nodes = MAX(min_nodes, req_nodes);
rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
/* Saved so the validation pass at the end can start from scratch */
orig_rem_max_cpus = rem_max_cpus;
maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
topo_eval->max_nodes);
avail_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t));
used_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t));
/* Step 1: usable CPUs per candidate; nodes with none leave node_map */
for (i = 0; (node_ptr = next_node_bitmap(orig_node_map, &i)); i++) {
node_ptr = node_record_table_ptr[i];
eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
if (topo_eval->avail_cpus == 0)
bit_clear(topo_eval->node_map, i);
avail_cpu_per_node[i] = topo_eval->avail_cpus;
}
/*
 * Bound the scans below to the span of remaining candidates. With an empty
 * node_map, i_end ends up below i_start so the loops do not execute.
 */
i_start = bit_ffs(topo_eval->node_map);
if (i_start >= 0)
i_end = bit_fls(topo_eval->node_map);
else
i_end = i_start - 1;
if (req_map) {
/* Step 2a: take every required node, drop all others from node_map */
for (i = i_start; i <= i_end; i++) {
if (!bit_test(req_map, i)) {
bit_clear(topo_eval->node_map, i);
continue;
}
node_ptr = node_record_table_ptr[i];
if (!avail_res_array[i] ||
!avail_res_array[i]->avail_cpus) {
debug("%pJ required node %s lacks available resources",
job_ptr, node_ptr->name);
goto fini;
}
if (topo_eval->max_nodes <= 0) {
log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit",
job_ptr);
goto fini;
}
/* Charge one task's worth of CPUs to the required node */
used_cpu_per_node[i] = cpus_per_task;
rem_nodes--;
min_rem_nodes--;
topo_eval->max_nodes--;
rem_max_cpus -= cpus_per_task;
rem_cpus -= cpus_per_task;
}
/* Remaining candidates = original set minus nodes already selected */
bit_and_not(orig_node_map, topo_eval->node_map);
} else {
/* No required nodes: start from an empty selection */
bit_clear_all(topo_eval->node_map);
}
if (topo_eval->max_nodes > 0) {
/* Step 2b: add nodes group by group; args shares the live counters */
node_weight_list = _build_node_weight_list(orig_node_map);
args.avail_res_array = avail_res_array;
args.node_map = topo_eval->node_map;
args.avail_cpu_per_node = avail_cpu_per_node;
args.used_cpu_per_node = used_cpu_per_node;
args.rem_nodes = &rem_nodes;
args.min_rem_nodes = &min_rem_nodes;
args.max_nodes = &(topo_eval->max_nodes);
args.rem_max_cpus = &rem_max_cpus;
args.rem_cpus = &rem_cpus;
args.i_start = i_start;
args.i_end = i_end;
args.cpus_per_task = cpus_per_task;
/* Re-entered from step 4 below, which requires max_nodes > 0 */
more_nodes:
list_for_each(node_weight_list, _add_nodes_by_weight_spread,
&args);
}
if (rem_cpus <= 0)
all_done = true;
/*
 * Step 3: round-robin over the selected nodes, giving each one more
 * cpus_per_task per sweep. Stops when the CPU request is covered or when
 * a whole sweep finds no node with spare capacity.
 */
while (!all_done) {
all_done = true;
for (i = 0;
(node_ptr = next_node_bitmap(topo_eval->node_map, &i));
i++) {
if (used_cpu_per_node[i] >= avail_cpu_per_node[i])
continue;
used_cpu_per_node[i] += cpus_per_task;
rem_max_cpus -= cpus_per_task;
rem_cpus -= cpus_per_task;
if (rem_cpus <= 0) {
all_done = true;
break;
} else {
all_done = false;
}
}
}
/*
 * Step 4: still short on CPUs or GRES, more nodes are allowed, and
 * max_nodes differs from the value recorded in prev_max_nodes: allow at
 * least one more node and retry.
 */
if ((rem_cpus > 0 ||
!gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) &&
(topo_eval->max_nodes > 0) &&
(prev_max_nodes != topo_eval->max_nodes)) {
if (!rem_nodes)
rem_nodes++;
prev_max_nodes = topo_eval->max_nodes;
all_done = false;
goto more_nodes;
}
/* Step 5: restart the accounting and validate each selected node */
rem_max_cpus = orig_rem_max_cpus;
rem_cpus = details_ptr->min_cpus;
min_rem_nodes = orig_min_rem_nodes;
for (i = 0; (node_ptr = next_node_bitmap(topo_eval->node_map, &i));
i++) {
/* Offer the CPUs charged above, but no less than pn_min_cpus */
topo_eval->avail_cpus =
MAX(used_cpu_per_node[i], details_ptr->pn_min_cpus);
if (!eval_nodes_cpus_to_use(topo_eval, i, rem_max_cpus,
min_rem_nodes, &maxtasks, true)) {
bit_clear(topo_eval->node_map, i);
continue;
}
rem_cpus -= topo_eval->avail_cpus;
rem_max_cpus -= topo_eval->avail_cpus;
min_rem_nodes--;
}
/* Final verdict; a failed check discards the whole selection */
if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
!gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
bit_clear_all(topo_eval->node_map);
error_code = SLURM_ERROR;
} else {
error_code = SLURM_SUCCESS;
}
fini:
if (error_code == SLURM_SUCCESS)
eval_nodes_clip_socket_cores(topo_eval);
FREE_NULL_LIST(node_weight_list);
FREE_NULL_BITMAP(orig_node_map);
xfree(avail_cpu_per_node);
xfree(used_cpu_per_node);
return error_code;
}
/*
 * Entry point for node selection: validate the candidate set, then dispatch
 * to the selection algorithm that matches the job and configuration.
 *
 * Dispatch order: topology callback when trump_others is set, spread,
 * busy-nodes-first, least-loaded-node, serial packing, topology callback,
 * and finally the consecutive-nodes algorithm. A topology callback result of
 * ESLURM_NOT_SUPPORTED falls through to the next choice.
 *
 * IN/OUT topo_eval - selection state; node_map must be set
 * RET SLURM_ERROR if node_map has fewer nodes than min_nodes or does not
 *	cover the job's required nodes, otherwise the chosen algorithm's
 *	return code
 */
extern int eval_nodes(topology_eval_t *topo_eval)
{
	static bool serial_at_end = false;
	static bool params_parsed = false;
	job_details_t *details_ptr = topo_eval->job_ptr->details;
	int rc;

	/* Look up the pack_serial_at_end scheduler parameter only once */
	if (!params_parsed) {
		serial_at_end = xstrcasestr(slurm_conf.sched_params,
					    "pack_serial_at_end") ?
			true : false;
		params_parsed = true;
	}

	xassert(topo_eval->node_map);

	/* Not enough candidates to ever satisfy the job */
	if (bit_set_count(topo_eval->node_map) < topo_eval->min_nodes)
		return SLURM_ERROR;

	/* Every required node must be among the candidates */
	if (details_ptr->req_node_bitmap &&
	    !bit_super_set(details_ptr->req_node_bitmap, topo_eval->node_map))
		return SLURM_ERROR;

	/* Topology callback takes precedence over everything else */
	if (topo_eval->trump_others && topo_eval->eval_nodes) {
		rc = topo_eval->eval_nodes(topo_eval);
		if (rc != ESLURM_NOT_SUPPORTED)
			return rc;
	}

	/* Spread the job out over many nodes */
	if (topo_eval->job_ptr->bit_flags & SPREAD_JOB)
		return _eval_nodes_spread(topo_eval);

	/*
	 * Select resource on busy nodes first in order to leave
	 * idle resources free for as long as possible so that longer
	 * running jobs can get more easily started by the backfill
	 * scheduler plugin
	 */
	if (topo_eval->prefer_alloc_nodes && !details_ptr->contiguous)
		return _eval_nodes_busy(topo_eval);

	/* Select resource on the Least Loaded Node */
	if ((topo_eval->cr_type & SELECT_LLN) ||
	    (topo_eval->job_ptr->part_ptr &&
	     (topo_eval->job_ptr->part_ptr->flags & PART_FLAG_LLN)))
		return _eval_nodes_lln(topo_eval);

	/*
	 * Put serial jobs at the end of the available node list
	 * rather than using a best-fit algorithm, which fragments
	 * resources.
	 */
	if (serial_at_end &&
	    (details_ptr->min_cpus == 1) && (topo_eval->req_nodes == 1))
		return _eval_nodes_serial(topo_eval);

	/* Topology callback as a regular choice */
	if (topo_eval->eval_nodes) {
		rc = topo_eval->eval_nodes(topo_eval);
		if (rc != ESLURM_NOT_SUPPORTED)
			return rc;
	}

	return _eval_nodes_consec(topo_eval);
}
/*
 * Trim topo_eval->avail_cpus for one node so that enough of the job's CPU
 * budget remains for the other nodes it still needs, then optionally run the
 * per-job GRES check.
 *
 * IN/OUT topo_eval - avail_cpus is the CPU count offered by the node on
 *	entry and the count to use on return; when trimmed, the value is
 *	also stored in avail_res_array[node_inx]->avail_cpus
 * IN node_inx - index of the node being evaluated
 * IN rem_max_cpus - CPUs the job may still be given
 * IN rem_nodes - nodes the job still needs, this one included
 * IN/OUT max_tasks - passed through to eval_nodes_gres()
 * IN check_gres - run eval_nodes_gres() when the job has per-job GRES
 * RET result of eval_nodes_gres() when it is run, otherwise true
 */
extern bool eval_nodes_cpus_to_use(topology_eval_t *topo_eval, int node_inx,
				   int64_t rem_max_cpus, int rem_nodes,
				   uint64_t *max_tasks, bool check_gres)
{
	job_record_t *job_ptr = topo_eval->job_ptr;
	job_details_t *details_ptr = job_ptr->details;
	avail_res_t *avail_res = topo_eval->avail_res_array[node_inx];

	/* Whole-node jobs use all resources on the node: no trimming */
	if (!(details_ptr->whole_node & WHOLE_NODE_REQUIRED)) {
		/* CPUs to be allocated on other nodes */
		int resv_cpus = MAX((rem_nodes - 1), 0);

		resv_cpus *= job_mgr_determine_cpus_per_core(details_ptr,
							     node_inx);
		if (topo_eval->cr_type & SELECT_SOCKET)
			resv_cpus *= node_record_table_ptr[node_inx]->cores;
		rem_max_cpus -= resv_cpus;

		if (topo_eval->avail_cpus > rem_max_cpus) {
			/* Cap at the budget, but honor the per-node minimum */
			topo_eval->avail_cpus = MAX(rem_max_cpus,
						    (int)details_ptr->pn_min_cpus);
			/* ... and the CPUs the node's GRES require */
			if (avail_res->gres_min_cpus)
				topo_eval->avail_cpus =
					MAX(topo_eval->avail_cpus,
					    avail_res->gres_min_cpus);
			else
				topo_eval->avail_cpus =
					MAX(topo_eval->avail_cpus,
					    details_ptr->min_gres_cpu);
			/* Keep the node's resource record in sync */
			avail_res->avail_cpus = topo_eval->avail_cpus;
		}
	}

	if (!check_gres || !topo_eval->gres_per_job || !topo_eval->avail_cpus)
		return true;

	return eval_nodes_gres(topo_eval, max_tasks, job_ptr,
			       node_record_table_ptr[node_inx],
			       rem_nodes, node_inx, 0);
}
/*
 * Determine how many CPUs of one node the job can use and record the
 * resulting per-node limits.
 *
 * IN/OUT topo_eval - avail_cpus is set to the usable CPU count of the node
 *	(0 when max_tasks_this_node ends up 0); in
 *	avail_res_array[node_inx] the fields gres_min_cpus, min_cpus and
 *	gres_max_tasks are written
 * IN node_inx - index of the node in node_record_table_ptr
 * IN rem_nodes - nodes the job still needs; values below 1 are treated as 1
 */
extern void eval_nodes_select_cores(topology_eval_t *topo_eval,
int node_inx, int rem_nodes)
{
bitstr_t **avail_core = topo_eval->avail_core;
uint16_t *avail_cpus = &topo_eval->avail_cpus;
avail_res_t **avail_res_array = topo_eval->avail_res_array;
uint16_t cr_type = topo_eval->cr_type;
bool enforce_binding = topo_eval->enforce_binding;
bool first_pass = topo_eval->first_pass;
job_record_t *job_ptr = topo_eval->job_ptr;
gres_mc_data_t *mc_ptr = topo_eval->mc_ptr;
uint32_t min_tasks_this_node = 0, max_tasks_this_node = 0;
/* Only changed through the pointer handed to gres_filter_sock_core() */
uint32_t min_cores_this_node = 0;
job_details_t *details_ptr = job_ptr->details;
node_record_t *node_ptr = node_record_table_ptr[node_inx];
/* cpus_per_task is used as a divisor below */
xassert(mc_ptr->cpus_per_task);
rem_nodes = MAX(rem_nodes, 1); /* If range of node counts */
/*
 * Derive the task-count bounds for this node from the first task layout
 * option that is set: per node, per board, per socket, per core, per TRES,
 * then single-node and one-task-per-node jobs. NO_VAL as the maximum means
 * "no explicit upper bound".
 */
if (mc_ptr->ntasks_per_node) {
min_tasks_this_node = mc_ptr->ntasks_per_node;
max_tasks_this_node = mc_ptr->ntasks_per_node;
} else if (mc_ptr->ntasks_per_board) {
min_tasks_this_node = mc_ptr->ntasks_per_board;
max_tasks_this_node = mc_ptr->ntasks_per_board *
node_ptr->boards;
} else if (mc_ptr->ntasks_per_socket) {
min_tasks_this_node = mc_ptr->ntasks_per_socket;
max_tasks_this_node = mc_ptr->ntasks_per_socket *
node_ptr->tot_sockets;
} else if (mc_ptr->ntasks_per_core) {
min_tasks_this_node = mc_ptr->ntasks_per_core;
/* Specialized cores are excluded from the core count */
max_tasks_this_node = mc_ptr->ntasks_per_core *
(node_ptr->tot_cores -
node_ptr->core_spec_cnt);
} else if (details_ptr && details_ptr->ntasks_per_tres &&
(details_ptr->ntasks_per_tres != NO_VAL16)) {
/* Node ranges not allowed with --ntasks-per-gpu */
if ((details_ptr->min_nodes != NO_VAL) &&
(details_ptr->min_nodes != 0) &&
(details_ptr->min_nodes == details_ptr->max_nodes)) {
/* Fixed node count: split the tasks evenly across nodes */
min_tasks_this_node = details_ptr->num_tasks /
details_ptr->min_nodes;
max_tasks_this_node = min_tasks_this_node;
} else {
min_tasks_this_node = details_ptr->ntasks_per_tres;
max_tasks_this_node = details_ptr->num_tasks;
}
} else if (details_ptr && (details_ptr->max_nodes == 1)) {
/* Single-node job: all tasks, if a count was given, go here */
if ((details_ptr->num_tasks == NO_VAL) ||
(details_ptr->num_tasks == 0)) {
min_tasks_this_node = 1;
max_tasks_this_node = NO_VAL;
} else {
min_tasks_this_node = details_ptr->num_tasks;
max_tasks_this_node = details_ptr->num_tasks;
}
} else if (details_ptr &&
((details_ptr->num_tasks == 1) ||
((details_ptr->num_tasks == details_ptr->min_nodes) &&
(details_ptr->num_tasks == details_ptr->max_nodes)))) {
/* One task in total, or exactly one task per node */
min_tasks_this_node = 1;
max_tasks_this_node = 1;
} else {
min_tasks_this_node = 1;
max_tasks_this_node = NO_VAL;
}
/* Determine how many tasks can be started on this node */
if ((!details_ptr || !details_ptr->overcommit)) {
/*
 * Without overcommit the CPUs limit the task count; a node unable to
 * hold the minimum task count gets no tasks at all.
 */
int alloc_tasks = avail_res_array[node_inx]->avail_cpus /
mc_ptr->cpus_per_task;
if (alloc_tasks < min_tasks_this_node)
max_tasks_this_node = 0;
else if ((max_tasks_this_node == NO_VAL) ||
(alloc_tasks < max_tasks_this_node))
max_tasks_this_node = alloc_tasks;
}
*avail_cpus = avail_res_array[node_inx]->avail_cpus;
/*
* _allocate_sc() filters available cpus and cores if the job does
* not request gres. If the job requests gres, _allocate_sc() defers
* filtering cpus and cores so that gres_select_filter_sock_core() can
* do it.
*/
if (job_ptr->gres_list_req) {
/*
 * avail_cpus and the three *_this_node values are passed by pointer, so
 * the filter can adjust them in place.
 */
foreach_gres_filter_sock_core_args_t args = {
.job_ptr = job_ptr,
.mc_ptr = mc_ptr,
.sockets = avail_res_array[node_inx]->sock_cnt,
.cores_per_socket = node_ptr->cores,
.cpus_per_core = node_ptr->tpc,
.avail_cpus = avail_cpus,
.min_tasks_this_node = &min_tasks_this_node,
.max_tasks_this_node = &max_tasks_this_node,
.min_cores_this_node = &min_cores_this_node,
.rem_nodes = rem_nodes,
.enforce_binding = enforce_binding,
.first_pass = first_pass,
.avail_core = avail_core[node_inx],
.node_name = node_record_table_ptr[node_inx]->name,
.cr_type = cr_type,
.res_cores_per_gpu = node_ptr->res_cores_per_gpu,
.node_i = node_inx,
};
gres_filter_sock_core(
avail_res_array[node_inx]->sock_gres_list,
&avail_res_array[node_inx]->avail_cores_per_sock,
&args);
}
if (max_tasks_this_node == 0) {
/* No task fits on this node, so it offers no CPUs */
*avail_cpus = 0;
} else if ((slurm_conf.select_type_param & SELECT_ONE_TASK_PER_CORE) &&
((mc_ptr->ntasks_per_core == INFINITE16) ||
(mc_ptr->ntasks_per_core == 0)) &&
details_ptr && (details_ptr->min_gres_cpu == 0)) {
/* Count one CPU per core set in this node's available-core bitmap */
*avail_cpus = bit_set_count(avail_core[node_inx]);
}
/* Publish the per-node limits for later selection steps */
avail_res_array[node_inx]->gres_min_cpus =
job_mgr_determine_cpus_per_core(job_ptr->details, node_inx) *
min_cores_this_node;
avail_res_array[node_inx]->min_cpus =
avail_res_array[node_inx]->gres_min_cpus;
avail_res_array[node_inx]->gres_max_tasks = max_tasks_this_node;
}
/*
 * Compute the maximum number of CPUs a job may be allocated.
 *
 * The base value is max_cpus when it is set (not NO_VAL), else min_cpus.
 * It is raised, if needed, to cover min_gres_cpu on each of rem_nodes nodes
 * and to cover min_job_gres_cpu.
 *
 * IN details_ptr - job details holding the CPU limits
 * IN rem_nodes - number of nodes the job still needs
 * RET CPU upper bound for the allocation
 */
extern int64_t eval_nodes_get_rem_max_cpus(
	job_details_t *details_ptr, int rem_nodes)
{
	int64_t cpu_limit;

	if (details_ptr->max_cpus == NO_VAL)
		cpu_limit = details_ptr->min_cpus;
	else
		cpu_limit = details_ptr->max_cpus;

	/* GRES CPU requirements may push the limit up */
	if (details_ptr->min_gres_cpu) {
		cpu_limit = MAX(cpu_limit,
				details_ptr->min_gres_cpu * rem_nodes);
	}
	if (details_ptr->min_job_gres_cpu) {
		cpu_limit = MAX(cpu_limit, details_ptr->min_job_gres_cpu);
	}

	return cpu_limit;
}
/*
 * List find callback: match a topo_weight_info_t record by weight.
 *
 * IN x - list entry (topo_weight_info_t *)
 * IN key - topo_weight_info_t * carrying the weight to look for
 * RET 1 if the weights are equal, otherwise 0
 */
extern int eval_nodes_topo_weight_find(void *x, void *key)
{
	topo_weight_info_t *entry = x;
	topo_weight_info_t *wanted = key;

	return (entry->weight == wanted->weight) ? 1 : 0;
}
/*
 * List find callback: match a topo_weight_info_t record whose node bitmap
 * shares at least one node with the key bitmap.
 *
 * IN x - list entry (topo_weight_info_t *)
 * IN key - bitstr_t * of nodes to look for
 * RET 1 if the bitmaps overlap, otherwise 0
 */
extern int eval_nodes_topo_node_find(void *x, void *key)
{
	topo_weight_info_t *entry = x;
	bitstr_t *wanted_nodes = key;

	return bit_overlap_any(entry->node_bitmap, wanted_nodes) ? 1 : 0;
}
/*
 * List destructor for topo_weight_info_t records: release the node bitmap,
 * then the record itself.
 *
 * IN x - topo_weight_info_t * to free
 */
extern void eval_nodes_topo_weight_free(void *x)
{
	topo_weight_info_t *entry = x;

	FREE_NULL_BITMAP(entry->node_bitmap);
	xfree(entry);
}
/*
 * List callback: log the node names and weight of one topo_weight_info_t
 * record at info level.
 *
 * IN x - topo_weight_info_t * to log
 * IN arg - unused
 * RET always 0
 */
extern int eval_nodes_topo_weight_log(void *x, void *arg)
{
	topo_weight_info_t *entry = x;
	char *names = bitmap2node_name(entry->node_bitmap);

	info("Topo:%s weight:%"PRIu64, names, entry->weight);

	/* The name string returned above is released here */
	xfree(names);
	return 0;
}
extern int eval_nodes_topo_weight_sort(void *x, void *y)
{
topo_weight_info_t *nwt1 = *(topo_weight_info_t **) x;
topo_weight_info_t *nwt2 = *(topo_weight_info_t **) y;
if (nwt1->weight < nwt2->weight)
return -1;
if (nwt1->weight > nwt2->weight)
return 1;
return 0;
}
/*
 * Test whether the available nodes cover what the job still needs.
 *
 * When more nodes were requested than the minimum (req_nodes > min_nodes),
 * only the minimum has to be reached, so the needed count is reduced by the
 * difference between the two.
 *
 * IN avail_nodes - nodes that could still be added
 * IN rem_nodes - nodes still wanted relative to the requested count
 * IN min_nodes - minimum node count of the job
 * IN req_nodes - requested node count of the job
 * RET true if avail_nodes is at least the needed node count
 */
extern bool eval_nodes_enough_nodes(int avail_nodes, int rem_nodes,
				    uint32_t min_nodes, uint32_t req_nodes)
{
	int needed_nodes = rem_nodes;

	if (req_nodes > min_nodes)
		needed_nodes = rem_nodes + min_nodes - req_nodes;

	return !(avail_nodes < needed_nodes);
}