blob: 66e0a601f29fde699d88b8b2087215f493e44240 [file] [log] [blame]
/*****************************************************************************\
* gres_sched.c - Scheduling functions used by cons_tres
*****************************************************************************
* Copyright (C) SchedMD LLC.
* Derived in large part from code previously in common/gres.c
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "src/common/slurm_xlator.h"
#include "gres_sched.h"
#include "src/common/xstring.h"
/* Arguments threaded through _foreach_gres_add() by gres_sched_add() */
typedef struct {
	uint16_t *actual_cores_per_sock; /* # of set bits in avail_core per
					  * socket; lazily allocated on first
					  * GPU GRES with restricted cores */
	bitstr_t *avail_core; /* Core bitmap of available cores on this node */
	uint16_t *avail_cores_per_sock; /* # of cores per socket available */
	uint16_t *avail_cpus; /* CPUs currently available on this node */
	uint16_t cores_per_socket; /* # of cores on each socket on the node */
	uint16_t cpus_per_core; /* Number of threads per core on the node */
	uint16_t cr_type; /* Allocation type (sockets, cores, etc.) */
	uint16_t gres_cpus; /* Max CPUs needed by any GRES with cpus_per_gres;
			     * used to trim *avail_cpus afterwards */
	uint16_t min_cpus; /* Minimum cpus required on this node */
	int node_i; /* Index of the current node */
	uint16_t res_cores_per_gpu; /* Number of restricted cores per gpu */
	int sockets; /* Number of sockets on the node */
	list_t *sock_gres_list; /* list of sock_gres_t */
	uint64_t tot_cores; /* Sum of actual_cores_per_sock[] entries */
} foreach_gres_add_args_t;
/* Arguments threaded through _foreach_gres_consec() by gres_sched_consec() */
typedef struct {
	list_t **consec_gres; /* IN/OUT accumulated sock_gres_t totals for a
			       * set of consecutive nodes; created on demand */
	list_t *sock_gres_list; /* Per-socket GRES availability on this node */
} foreach_gres_consec_args_t;
/*
 * list_for_each() callback: append one "name[:type]:count" element to the
 * string being built in *arg. The first element is prefixed with "GRES:",
 * later ones with ",". Always returns 0 so the whole list is visited.
 */
static int _foreach_gres_str(void *x, void *arg)
{
	sock_gres_t *sg = x;
	char **str_p = arg;
	gres_job_state_t *gres_js;
	const char *prefix = *str_p ? "," : "GRES:";

	if (!sg->gres_state_job) { /* Should never happen */
		error("%s: sock_data has no gres_state_job. This should never happen.",
		      __func__);
		return 0;
	}

	gres_js = sg->gres_state_job->gres_data;
	if (gres_js->type_name)
		xstrfmtcat(*str_p, "%s%s:%s:%"PRIu64, prefix,
			   sg->gres_state_job->gres_name,
			   gres_js->type_name, sg->total_cnt);
	else
		xstrfmtcat(*str_p, "%s%s:%"PRIu64, prefix,
			   sg->gres_state_job->gres_name, sg->total_cnt);
	return 0;
}
/*
 * Given a list of sock_gres_t entries, return a string identifying the
 * count of each GRES available on this set of nodes
 * IN sock_gres_list - count of GRES available in this group of nodes
 * RET xfree the returned string (NULL if list is NULL or empty)
 */
extern char *gres_sched_str(list_t *sock_gres_list)
{
	char *str = NULL;

	if (sock_gres_list)
		(void) list_for_each(sock_gres_list, _foreach_gres_str, &str);

	return str;
}
static int _foreach_gres_init(void *x, void *arg)
{
gres_state_t *gres_state_job = x;
bool *rc = arg;
gres_job_state_t *gres_js = gres_state_job->gres_data;
if (!gres_js->gres_per_job)
return 0;
gres_js->total_gres = 0;
*rc = true;
return 0;
}
/*
 * Clear GRES allocation info for all job GRES at start of scheduling cycle
 * Return TRUE if any gres_per_job constraints to satisfy
 */
extern bool gres_sched_init(list_t *job_gres_list)
{
	bool have_per_job = false;

	if (job_gres_list)
		(void) list_for_each(job_gres_list, _foreach_gres_init,
				     &have_per_job);

	return have_per_job;
}
/* Note - key is not used */
static int _is_gres_per_job_met(void *x, void *key)
{
gres_state_t *gres_state_job = x;
gres_job_state_t *gres_js = gres_state_job->gres_data;
if (gres_js->gres_per_job &&
(gres_js->gres_per_job > gres_js->total_gres))
return -1; /* break out of list_find_first */
return 0;
}
/*
 * Return TRUE if all gres_per_job specifications are satisfied
 * NOTE: job_id is not referenced here; presumably retained for interface
 * compatibility with callers/logging - confirm before removing.
 */
extern bool gres_sched_test(list_t *job_gres_list, uint32_t job_id)
{
	return (!job_gres_list ||
		!list_find_first(job_gres_list, _is_gres_per_job_met, NULL));
}
/*
 * Remove excess restricted (GPU-reserved) cores from the node's available
 * core bitmap so that at most (*gres_limit * res_cores_per_gpu) restricted
 * cores remain usable, keeping the per-socket counts, total core count and
 * *avail_cpus consistent with the bitmap.
 *
 * Shrinking *avail_cpus may in turn lower *gres_limit when cpus_per_gres is
 * set, which lowers the allowed number of restricted cores again - so the
 * outer loop iterates until the two values reach a fixed point.
 *
 * IN/OUT avail_core - bitmap of usable cores on this node; excess restricted
 *                     cores are cleared here
 * IN/OUT avail_cores_per_sock - usable core count per socket (clamped down)
 * IN/OUT actual_cores_per_sock - set-bit count of avail_core per socket
 * IN/OUT tot_cores - sum of actual_cores_per_sock[]
 * IN/OUT avail_cpus - CPUs available on this node
 * IN/OUT gres_limit - max GRES allocatable on this node
 * IN gres_js - job GRES state (res_gpu_cores bitmaps, cpus_per_gres)
 * IN res_cores_per_gpu/sockets/cores_per_socket/cpus_per_core - node layout
 * IN cr_type - allocation type; whole-socket allocation is exempt
 * IN node_i - index of the current node
 */
static void _gres_per_job_reduce_res_cores(bitstr_t *avail_core,
					   uint16_t *avail_cores_per_sock,
					   uint16_t *actual_cores_per_sock,
					   uint64_t *tot_cores,
					   uint16_t *avail_cpus,
					   uint64_t *gres_limit,
					   gres_job_state_t *gres_js,
					   uint16_t res_cores_per_gpu,
					   int sockets,
					   uint16_t cores_per_socket,
					   uint16_t cpus_per_core,
					   uint16_t cr_type,
					   int node_i)
{
	bitstr_t *res_cores;
	uint16_t tot_res_core;
	uint64_t max_res_cores = 0;
	int i = (sockets * cores_per_socket) - 1; /* last core index */
	bool done = false;
	int cnt;

	/* Whole sockets are allocated; trimming individual cores is moot */
	if (cr_type & SELECT_SOCKET)
		return;

	/* No restricted-core bitmap for this job on this node */
	if (!gres_js->res_gpu_cores ||
	    !gres_js->res_gpu_cores[node_i])
		return;

	max_res_cores = *gres_limit * res_cores_per_gpu;

	/* Restricted cores that are also currently available */
	res_cores = bit_copy(gres_js->res_gpu_cores[node_i]);
	bit_and(res_cores, avail_core);
	tot_res_core = bit_set_count(res_cores);

	if (tot_res_core <= max_res_cores) {
		/* Already within the limit; nothing to trim */
		FREE_NULL_BITMAP(res_cores);
		return;
	}

	while (!done) {
		while (tot_res_core > max_res_cores) {
			int s;
			/*
			 * Must remove restricted cores from the end of the
			 * bitmap first since cores are picked from front to
			 * back. This helps the needed restricted cores get
			 * picked.
			 */
			i = bit_fls_from_bit(res_cores, i);
			if (i < 0)
				break; /* This should never happen */
			bit_clear(avail_core, i);
			tot_res_core--;
			s = i / cores_per_socket; /* owning socket */
			actual_cores_per_sock[s]--;
			(*tot_cores)--;
			/* Keep the advertised per-socket count consistent */
			if (actual_cores_per_sock[s] <
			    avail_cores_per_sock[s])
				avail_cores_per_sock[s]--;
			i--; /* resume scan below the cleared bit */
		}
		/* Re-derive CPU availability from the surviving cores */
		cnt = *tot_cores * cpus_per_core;
		if (cnt < *avail_cpus)
			*avail_cpus = cnt;
		if (gres_js->cpus_per_gres) {
			uint64_t new_gres_limit =
				*avail_cpus / gres_js->cpus_per_gres;
			if (new_gres_limit < *gres_limit) {
				/* Fewer CPUs -> fewer GRES -> fewer allowed
				 * restricted cores; iterate again */
				*gres_limit = new_gres_limit;
				max_res_cores = *gres_limit * res_cores_per_gpu;
			} else
				done = true;
		} else
			done = true;
	}
	FREE_NULL_BITMAP(res_cores);
}
/*
 * list_find_first() callback used by gres_sched_add(): compute how many of
 * this GRES the node can contribute toward gres_per_job and accumulate it
 * into gres_js->total_gres.
 *
 * Returns 0 to continue with the next job GRES; returns -1 ("found", which
 * stops the scan) when the node cannot satisfy this GRES's minimum after
 * restricted-core trimming - gres_sched_add() treats that as failure.
 */
static int _foreach_gres_add(void *x, void *arg)
{
	foreach_gres_add_args_t *args = arg;
	gres_state_t *gres_state_job = x;
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	sock_gres_t *sock_data;
	uint64_t gres_limit;
	uint64_t min_gres;

	if (!gres_js->gres_per_job)	/* Don't care about totals */
		return 0;

	sock_data =
		list_find_first(args->sock_gres_list,
				gres_find_sock_by_job_state, gres_state_job);
	if (!sock_data)		/* None of this GRES available */
		return 0;

	if (gres_js->cpus_per_gres) {
		/* Available CPUs bound the usable GRES count */
		gres_limit = *(args->avail_cpus) / gres_js->cpus_per_gres;
		gres_limit = MIN(gres_limit, sock_data->total_cnt);
		/*
		 * Track the largest CPU demand over all GRES; gres_sched_add()
		 * may trim *avail_cpus down to this afterwards.
		 * NOTE(review): gres_cpus is uint16_t while the product is
		 * 64-bit - presumably bounded by *avail_cpus; confirm no
		 * truncation is possible here.
		 */
		args->gres_cpus = MAX(args->gres_cpus,
				      gres_limit * gres_js->cpus_per_gres);
	} else
		gres_limit = sock_data->total_cnt;

	/* Minimum GRES this node must supply */
	min_gres = MAX(gres_js->gres_per_node, 1);
	if (gres_js->gres_per_task ||
	    (gres_js->ntasks_per_gres &&
	     (gres_js->ntasks_per_gres != NO_VAL16))) {
		/*
		 * Already assumed a number of gres tasks
		 * on this node.
		 */
		min_gres = gres_limit;
	}

	/* Don't take more than the job still needs */
	if (gres_js->gres_per_job > gres_js->total_gres) {
		gres_limit = MIN((gres_js->gres_per_job - gres_js->total_gres),
				 gres_limit);
	}
	gres_limit = MAX(gres_limit, min_gres);

	if ((gres_state_job->plugin_id == gres_get_gpu_plugin_id()) &&
	    args->res_cores_per_gpu) {
		/* Lazily build the per-socket available-core counts */
		if (!args->actual_cores_per_sock) {
			args->actual_cores_per_sock =
				xcalloc(args->sockets, sizeof(uint16_t));
			for (int s = 0; s < args->sockets; s++) {
				int start_core = s * args->cores_per_socket;
				int end_core =
					start_core + args->cores_per_socket;
				args->actual_cores_per_sock[s] =
					bit_set_count_range(args->avail_core,
							    start_core,
							    end_core);
				args->tot_cores +=
					args->actual_cores_per_sock[s];
			}
		}
		/* May shrink avail_core, avail_cpus and gres_limit */
		_gres_per_job_reduce_res_cores(
			args->avail_core, args->avail_cores_per_sock,
			args->actual_cores_per_sock, &args->tot_cores,
			args->avail_cpus, &gres_limit, gres_js,
			args->res_cores_per_gpu, args->sockets,
			args->cores_per_socket, args->cpus_per_core,
			args->cr_type, args->node_i);
		/* Node no longer viable for this GRES: abort the scan */
		if ((gres_limit < min_gres) ||
		    (args->min_cpus > *(args->avail_cpus)))
			return -1;
	}

	/* Commit this node's contribution */
	sock_data->total_cnt = gres_limit;
	gres_js->total_gres += gres_limit;
	return 0;
}
/*
 * Update a job's total_gres counter as we add a node to potential allocation
 * IN/OUT avail_cpus - CPUs currently available on this node
 * IN/OUT avail_core - Core bitmap of currently available cores on this node
 * IN/OUT avail_cores_per_sock - Number of cores per socket available
 * IN/OUT sock_gres_list - Per socket GRES availability on this node
 *			   (sock_gres_t). Updates total_cnt
 * IN job_gres_list - list of job's GRES requirements (gres_state_job_t)
 * IN res_cores_per_gpu - Number of restricted cores per gpu
 * IN sockets - Number of sockets on the node
 * IN cores_per_socket - Number of cores on each socket on the node
 * IN cpus_per_core - Number of threads per core on the node
 * IN cr_type - Allocation type (sockets, cores, etc.)
 * IN min_cpus - Minimum cpus required on this node
 * IN node_i - Index of the current node
 * RET true if the node can contribute to the job's GRES needs, else false
 */
extern bool gres_sched_add(uint16_t *avail_cpus,
			   bitstr_t *avail_core,
			   uint16_t *avail_cores_per_sock,
			   list_t *sock_gres_list,
			   list_t *job_gres_list,
			   uint16_t res_cores_per_gpu,
			   int sockets,
			   uint16_t cores_per_socket,
			   uint16_t cpus_per_core,
			   uint16_t cr_type,
			   uint16_t min_cpus,
			   int node_i)
{
	bool usable;
	foreach_gres_add_args_t args = {
		.avail_core = avail_core,
		.avail_cores_per_sock = avail_cores_per_sock,
		.avail_cpus = avail_cpus,
		.cores_per_socket = cores_per_socket,
		.cpus_per_core = cpus_per_core,
		.cr_type = cr_type,
		.min_cpus = min_cpus,
		.node_i = node_i,
		.res_cores_per_gpu = res_cores_per_gpu,
		.sockets = sockets,
		.sock_gres_list = sock_gres_list,
	};

	if (!job_gres_list || !(*avail_cpus))
		return true;	/* No GRES constraints / no CPUs to evaluate */

	/* _foreach_gres_add() returns -1 ("found") on failure */
	usable = !list_find_first(job_gres_list, _foreach_gres_add, &args);

	/* Trim CPUs down to what the GRES actually demand, if that still
	 * honors the node's minimum */
	if (usable && args.gres_cpus && (args.gres_cpus < *avail_cpus) &&
	    (args.gres_cpus > args.min_cpus))
		*avail_cpus = args.gres_cpus;

	xfree(args.actual_cores_per_sock);
	return usable;
}
/*
 * list_for_each() callback: fold this node's availability of one job GRES
 * into the running per-GRES totals kept in *args->consec_gres, creating the
 * list and the per-GRES entry on first use. Always returns 0.
 */
static int _foreach_gres_consec(void *x, void *arg)
{
	foreach_gres_consec_args_t *args = arg;
	gres_state_t *gres_state_job = x;
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	sock_gres_t *node_sg, *sum_sg;

	if (!gres_js->gres_per_job)	/* Don't care about totals */
		return 0;

	node_sg = list_find_first(args->sock_gres_list,
				  gres_find_sock_by_job_state, gres_state_job);
	if (!node_sg)	/* None of this GRES available */
		return 0;

	if (!*(args->consec_gres))
		*(args->consec_gres) = list_create(gres_sock_delete);

	sum_sg = list_find_first(*(args->consec_gres),
				 gres_find_sock_by_job_state, gres_state_job);
	if (!sum_sg) {
		/* First node in this set offering this GRES */
		sum_sg = xmalloc(sizeof(*sum_sg));
		sum_sg->gres_state_job = gres_state_job;
		list_append(*(args->consec_gres), sum_sg);
	}
	sum_sg->total_cnt += node_sg->total_cnt;
	return 0;
}
/*
 * Create/update list GRES that can be made available on the specified node
 * IN/OUT consec_gres - list of sock_gres_t that can be made available on
 *			a set of nodes
 * IN job_gres_list - list of job's GRES requirements (gres_job_state_t)
 * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
 */
extern void gres_sched_consec(list_t **consec_gres, list_t *job_gres_list,
			      list_t *sock_gres_list)
{
	foreach_gres_consec_args_t args = {
		.consec_gres = consec_gres,
		.sock_gres_list = sock_gres_list,
	};

	if (!job_gres_list)	/* Nothing requested, nothing to accumulate */
		return;

	(void) list_for_each(job_gres_list, _foreach_gres_consec, &args);
}
/*
 * list_find_first() callback: return -1 ("match", stopping the scan) for the
 * first job GRES whose gres_per_job requirement would still be unmet even
 * after adding the availability recorded in args (a sock_gres_t list).
 */
static int _find_insufficient_gres(void *x, void *args)
{
	gres_state_t *gres_state_job = x;
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	list_t *sock_gres_list = args;
	sock_gres_t *sock_data;

	if (!gres_js->gres_per_job)	/* Don't care about totals */
		return 0;
	if (gres_js->total_gres >= gres_js->gres_per_job)
		return 0;	/* Requirement already met */

	sock_data = list_find_first(sock_gres_list, gres_find_sock_by_job_state,
				    gres_state_job);
	if (!sock_data)		/* None of this GRES available */
		return -1;
	if (gres_js->gres_per_job >
	    (gres_js->total_gres + sock_data->total_cnt))
		return -1;	/* Still short even with these nodes */

	return 0;
}
/*
 * Determine if the additional sock_gres_list resources will result in
 * satisfying the job's gres_per_job constraints
 * IN job_gres_list - job's GRES requirements
 * IN sock_gres_list - available GRES in a set of nodes, data structure built
 *		       by gres_sched_consec()
 */
extern bool gres_sched_sufficient(list_t *job_gres_list, list_t *sock_gres_list)
{
	if (!job_gres_list)	/* No GRES requirements at all */
		return true;
	if (!sock_gres_list)	/* Requirements exist but nothing available */
		return false;

	return (list_find_first(job_gres_list, _find_insufficient_gres,
				sock_gres_list) == NULL);
}