| /*****************************************************************************\ |
| * gres_sched.c - Scheduling functions used by cons_tres |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * Derived in large part from code previously in common/gres.c |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "src/common/slurm_xlator.h" |
| |
| #include "gres_sched.h" |
| |
| #include "src/common/xstring.h" |
| |
| typedef struct { |
	uint16_t *actual_cores_per_sock; /* # of cores per socket set in avail_core */
| bitstr_t *avail_core; /* Core bitmap of available cores on this node */ |
| uint16_t *avail_cores_per_sock; /* # of cores per socket available */ |
| uint16_t *avail_cpus; /* CPUs currently available on this node */ |
| uint16_t cores_per_socket; /* # of cores on each socket on the node */ |
| uint16_t cpus_per_core; /* Number of threads per core on the node */ |
| uint16_t cr_type; /* Allocation type (sockets, cores, etc.) */ |
	uint16_t gres_cpus; /* CPU count implied by cpus_per_gres requests */
| uint16_t min_cpus; /* Minimum cpus required on this node */ |
| int node_i; /* Index of the current node */ |
| uint16_t res_cores_per_gpu; /* Number of restricted cores per gpu */ |
| int sockets; /* Number of sockets on the node */ |
| list_t *sock_gres_list; /* list of sock_gres_t */ |
	uint64_t tot_cores; /* Total # of cores set in avail_core */
| } foreach_gres_add_args_t; |
| |
| typedef struct { |
	list_t **consec_gres; /* List of sock_gres_t accumulated over nodes */
	list_t *sock_gres_list; /* Per socket GRES availability on this node */
| } foreach_gres_consec_args_t; |
| |
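/*
 * list_for_each() callback: append one "name[:type]:count" token for this
 * sock_gres_t entry to the string pointed to by arg.
 */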
| static int _foreach_gres_str(void *x, void *arg) |
| { |
| sock_gres_t *sock_data = x; |
| char **out_str = arg; |
| gres_job_state_t *gres_js; |
| char *sep; |
| |
| if (!sock_data->gres_state_job) { /* Should never happen */ |
| error("%s: sock_data has no gres_state_job. This should never happen.", |
| __func__); |
| return 0; |
| } |
| gres_js = sock_data->gres_state_job->gres_data; |
| if (*out_str) |
| sep = ","; |
| else |
| sep = "GRES:"; |
| if (gres_js->type_name) { |
| xstrfmtcat(*out_str, "%s%s:%s:%"PRIu64, sep, |
| sock_data->gres_state_job->gres_name, |
| gres_js->type_name, sock_data->total_cnt); |
| } else { |
| xstrfmtcat(*out_str, "%s%s:%"PRIu64, sep, |
| sock_data->gres_state_job->gres_name, |
| sock_data->total_cnt); |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Given a list of sock_gres_t entries, return a string identifying the |
| * count of each GRES available on this set of nodes |
 * IN sock_gres_list - list of sock_gres_t describing the GRES available on
 *		       this group of nodes
 * RET string of GRES names and counts; the caller must xfree() it
| */ |
| extern char *gres_sched_str(list_t *sock_gres_list) |
| { |
| char *out_str = NULL; |
| |
| if (!sock_gres_list) |
| return NULL; |
| |
| (void) list_for_each(sock_gres_list, _foreach_gres_str, &out_str); |
| |
| return out_str; |
| } |
| |
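/*
 * list_for_each() callback: clear the running total_gres counter for each
 * job GRES with a gres_per_job constraint and note that one exists.
 */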
| static int _foreach_gres_init(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| bool *rc = arg; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| |
| if (!gres_js->gres_per_job) |
| return 0; |
| gres_js->total_gres = 0; |
| *rc = true; |
| |
| return 0; |
| } |
| |
| /* |
| * Clear GRES allocation info for all job GRES at start of scheduling cycle |
 * Return TRUE if there are any gres_per_job constraints to satisfy
| */ |
| extern bool gres_sched_init(list_t *job_gres_list) |
| { |
| bool rc = false; |
| |
| if (!job_gres_list) |
| return rc; |
| |
| (void) list_for_each(job_gres_list, _foreach_gres_init, &rc); |
| |
| return rc; |
| } |
| |
/*
 * list_find_first() callback: return -1 (match) if this GRES has an
 * unsatisfied gres_per_job constraint. Note - key is not used.
 */
| static int _is_gres_per_job_met(void *x, void *key) |
| { |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| |
| if (gres_js->gres_per_job && |
| (gres_js->gres_per_job > gres_js->total_gres)) |
| return -1; /* break out of list_find_first */ |
| |
| return 0; |
| } |
| |
| /* |
 * Return TRUE if all gres_per_job specifications are satisfied
 * IN job_gres_list - list of the job's GRES requirements (gres_state_t)
 * IN job_id - job's ID (currently unused)
 */
| extern bool gres_sched_test(list_t *job_gres_list, uint32_t job_id) |
| { |
| if (!job_gres_list) |
| return true; |
| |
| return !list_find_first(job_gres_list, _is_gres_per_job_met, NULL); |
| } |
| |
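/*
 * Clear excess restricted GPU cores from avail_core when more are available
 * than the anticipated GPU allocation can use (at most
 * gres_limit * res_cores_per_gpu). Cores are cleared from the back of the
 * bitmap because cores are picked from front to back. Clearing cores can
 * lower avail_cpus and, when cpus_per_gres is set, that can lower gres_limit
 * in turn, so the loop repeats until the two values are consistent.
 *
 * Worked example (illustrative numbers only): with *gres_limit = 2 and
 * res_cores_per_gpu = 4, at most 8 restricted cores may remain available;
 * if 10 are set in avail_core, the 2 highest-numbered restricted cores are
 * cleared.
 */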
| static void _gres_per_job_reduce_res_cores(bitstr_t *avail_core, |
| uint16_t *avail_cores_per_sock, |
| uint16_t *actual_cores_per_sock, |
| uint64_t *tot_cores, |
| uint16_t *avail_cpus, |
| uint64_t *gres_limit, |
| gres_job_state_t *gres_js, |
| uint16_t res_cores_per_gpu, |
| int sockets, |
| uint16_t cores_per_socket, |
| uint16_t cpus_per_core, |
| uint16_t cr_type, |
| int node_i) |
| { |
| bitstr_t *res_cores; |
| uint16_t tot_res_core; |
| uint64_t max_res_cores = 0; |
| int i = (sockets * cores_per_socket) - 1; |
| bool done = false; |
| int cnt; |
| |
| if (cr_type & SELECT_SOCKET) |
| return; |
| if (!gres_js->res_gpu_cores || |
| !gres_js->res_gpu_cores[node_i]) |
| return; |
| |
| max_res_cores = *gres_limit * res_cores_per_gpu; |
| res_cores = bit_copy(gres_js->res_gpu_cores[node_i]); |
| bit_and(res_cores, avail_core); |
| tot_res_core = bit_set_count(res_cores); |
| |
| if (tot_res_core <= max_res_cores) { |
| FREE_NULL_BITMAP(res_cores); |
| return; |
| } |
| |
| while (!done) { |
| while (tot_res_core > max_res_cores) { |
| int s; |
| /* |
| * Must remove restricted cores from the end of the |
| * bitmap first since cores are picked from front to |
| * back. This helps the needed restricted cores get |
| * picked. |
| */ |
| i = bit_fls_from_bit(res_cores, i); |
| if (i < 0) |
| break; /* This should never happen */ |
| bit_clear(avail_core, i); |
| tot_res_core--; |
| |
| s = i / cores_per_socket; |
| actual_cores_per_sock[s]--; |
| (*tot_cores)--; |
| if (actual_cores_per_sock[s] < |
| avail_cores_per_sock[s]) |
| avail_cores_per_sock[s]--; |
| i--; |
| } |
| cnt = *tot_cores * cpus_per_core; |
| if (cnt < *avail_cpus) |
| *avail_cpus = cnt; |
| if (gres_js->cpus_per_gres) { |
| uint64_t new_gres_limit = |
| *avail_cpus / gres_js->cpus_per_gres; |
| if (new_gres_limit < *gres_limit) { |
| *gres_limit = new_gres_limit; |
| max_res_cores = *gres_limit * res_cores_per_gpu; |
| } else |
| done = true; |
| } else |
| done = true; |
| } |
| FREE_NULL_BITMAP(res_cores); |
| } |
| |
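/*
 * list_find_first() callback: determine how much of this GRES the node can
 * contribute (gres_limit), prune restricted GPU cores as needed, then add
 * the result to the job's running total_gres. Returns -1 (match) to stop
 * the scan when the node cannot meet the minimum GRES or CPU counts.
 */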
| static int _foreach_gres_add(void *x, void *arg) |
| { |
| foreach_gres_add_args_t *args = arg; |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| sock_gres_t *sock_data; |
| uint64_t gres_limit; |
| uint64_t min_gres; |
| |
| if (!gres_js->gres_per_job) /* Don't care about totals */ |
| return 0; |
| sock_data = |
| list_find_first(args->sock_gres_list, |
| gres_find_sock_by_job_state, gres_state_job); |
| if (!sock_data) /* None of this GRES available */ |
| return 0; |
| if (gres_js->cpus_per_gres) { |
| gres_limit = *(args->avail_cpus) / gres_js->cpus_per_gres; |
| gres_limit = MIN(gres_limit, sock_data->total_cnt); |
| args->gres_cpus = MAX(args->gres_cpus, |
| gres_limit * gres_js->cpus_per_gres); |
| } else |
| gres_limit = sock_data->total_cnt; |
| |
| min_gres = MAX(gres_js->gres_per_node, 1); |
| if (gres_js->gres_per_task || |
| (gres_js->ntasks_per_gres && |
| (gres_js->ntasks_per_gres != NO_VAL16))) { |
| /* |
| * Already assumed a number of gres tasks |
| * on this node. |
| */ |
| min_gres = gres_limit; |
| } |
| if (gres_js->gres_per_job > gres_js->total_gres) { |
| gres_limit = MIN((gres_js->gres_per_job - gres_js->total_gres), |
| gres_limit); |
| } |
| gres_limit = MAX(gres_limit, min_gres); |
| |
| if ((gres_state_job->plugin_id == gres_get_gpu_plugin_id()) && |
| args->res_cores_per_gpu) { |
| if (!args->actual_cores_per_sock) { |
| args->actual_cores_per_sock = |
| xcalloc(args->sockets, sizeof(uint16_t)); |
| for (int s = 0; s < args->sockets; s++) { |
| int start_core = s * args->cores_per_socket; |
| int end_core = |
| start_core + args->cores_per_socket; |
| args->actual_cores_per_sock[s] = |
| bit_set_count_range(args->avail_core, |
| start_core, |
| end_core); |
| args->tot_cores += |
| args->actual_cores_per_sock[s]; |
| } |
| } |
| |
| _gres_per_job_reduce_res_cores( |
| args->avail_core, args->avail_cores_per_sock, |
| args->actual_cores_per_sock, &args->tot_cores, |
| args->avail_cpus, &gres_limit, gres_js, |
| args->res_cores_per_gpu, args->sockets, |
| args->cores_per_socket, args->cpus_per_core, |
| args->cr_type, args->node_i); |
| if ((gres_limit < min_gres) || |
| (args->min_cpus > *(args->avail_cpus))) |
| return -1; |
| } |
| |
| sock_data->total_cnt = gres_limit; |
| gres_js->total_gres += gres_limit; |
| |
| return 0; |
| } |
| |
| /* |
| * Update a job's total_gres counter as we add a node to potential allocation |
| * IN/OUT avail_cpus - CPUs currently available on this node |
| * IN/OUT avail_core - Core bitmap of currently available cores on this node |
| * IN/OUT avail_cores_per_sock - Number of cores per socket available |
| * IN/OUT sock_gres_list - Per socket GRES availability on this node |
| * (sock_gres_t). Updates total_cnt |
 * IN job_gres_list - list of job's GRES requirements (gres_state_t)
| * IN res_cores_per_gpu - Number of restricted cores per gpu |
| * IN sockets - Number of sockets on the node |
| * IN cores_per_socket - Number of cores on each socket on the node |
| * IN cpus_per_core - Number of threads per core on the node |
| * IN cr_type - Allocation type (sockets, cores, etc.) |
| * IN min_cpus - Minimum cpus required on this node |
 * IN node_i - Index of the current node
 * RET TRUE if the node can still contribute to the job's GRES requirements
 *     and minimum CPU count, FALSE otherwise
 */
| extern bool gres_sched_add(uint16_t *avail_cpus, |
| bitstr_t *avail_core, |
| uint16_t *avail_cores_per_sock, |
| list_t *sock_gres_list, |
| list_t *job_gres_list, |
| uint16_t res_cores_per_gpu, |
| int sockets, |
| uint16_t cores_per_socket, |
| uint16_t cpus_per_core, |
| uint16_t cr_type, |
| uint16_t min_cpus, |
| int node_i) |
| { |
| bool rc = true; |
| foreach_gres_add_args_t args = { |
| .avail_core = avail_core, |
| .avail_cores_per_sock = avail_cores_per_sock, |
| .avail_cpus = avail_cpus, |
| .cores_per_socket = cores_per_socket, |
| .cpus_per_core = cpus_per_core, |
| .cr_type = cr_type, |
| .min_cpus = min_cpus, |
| .node_i = node_i, |
| .res_cores_per_gpu = res_cores_per_gpu, |
| .sockets = sockets, |
| .sock_gres_list = sock_gres_list, |
| }; |
| |
| if (!job_gres_list || !(*avail_cpus)) |
| return true; |
| |
| rc = !list_find_first(job_gres_list, _foreach_gres_add, &args); |
| if (rc && args.gres_cpus && (args.gres_cpus < *avail_cpus) && |
| (args.gres_cpus > args.min_cpus)) |
| *avail_cpus = args.gres_cpus; |
| |
| xfree(args.actual_cores_per_sock); |
| return rc; |
| } |
| |
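/*
 * list_for_each() callback: fold this node's availability of one GRES into
 * the consec_gres accumulator, creating the list and the per-GRES entry on
 * first use.
 */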
| static int _foreach_gres_consec(void *x, void *arg) |
| { |
| foreach_gres_consec_args_t *args = arg; |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| sock_gres_t *sock_data, *consec_data; |
| |
| if (!gres_js->gres_per_job) /* Don't care about totals */ |
| return 0; |
| sock_data = |
| list_find_first(args->sock_gres_list, |
| gres_find_sock_by_job_state, gres_state_job); |
| if (!sock_data) /* None of this GRES available */ |
| return 0; |
| if (*(args->consec_gres) == NULL) |
| *(args->consec_gres) = list_create(gres_sock_delete); |
| consec_data = |
| list_find_first(*(args->consec_gres), |
| gres_find_sock_by_job_state, gres_state_job); |
| if (!consec_data) { |
| consec_data = xmalloc(sizeof(sock_gres_t)); |
| consec_data->gres_state_job = gres_state_job; |
| list_append(*(args->consec_gres), consec_data); |
| } |
| consec_data->total_cnt += sock_data->total_cnt; |
| |
| return 0; |
| } |
| |
| /* |
 * Create/update a list of GRES that can be made available on the
 * specified node
| * IN/OUT consec_gres - list of sock_gres_t that can be made available on |
| * a set of nodes |
 * IN job_gres_list - list of job's GRES requirements (gres_state_t)
| * IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t) |
| */ |
| extern void gres_sched_consec(list_t **consec_gres, list_t *job_gres_list, |
| list_t *sock_gres_list) |
| { |
| foreach_gres_consec_args_t args = { |
| .consec_gres = consec_gres, |
| .sock_gres_list = sock_gres_list, |
| }; |
| |
| if (!job_gres_list) |
| return; |
| |
| (void) list_for_each(job_gres_list, _foreach_gres_consec, &args); |
| } |
| |
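/*
 * list_find_first() callback: return -1 (match) if the GRES accumulated in
 * sock_gres_list would still leave this gres_per_job constraint unsatisfied.
 */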
| static int _find_insufficient_gres(void *x, void *args) |
| { |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| list_t *sock_gres_list = args; |
| sock_gres_t *sock_data; |
| |
| if (!gres_js->gres_per_job) /* Don't care about totals */ |
| return 0; |
| if (gres_js->total_gres >= gres_js->gres_per_job) |
| return 0; |
| sock_data = list_find_first(sock_gres_list, gres_find_sock_by_job_state, |
| gres_state_job); |
| if (!sock_data) /* None of this GRES available */ |
| return -1; |
| if ((gres_js->total_gres + sock_data->total_cnt) < |
| gres_js->gres_per_job) |
| return -1; |
| |
| return 0; |
| } |
| |
| /* |
| * Determine if the additional sock_gres_list resources will result in |
| * satisfying the job's gres_per_job constraints |
| * IN job_gres_list - job's GRES requirements |
 * IN sock_gres_list - available GRES in a set of nodes, data structure built
 *		       by gres_sched_consec()
 * RET TRUE if the job's gres_per_job constraints can be satisfied by these
 *     resources
| */ |
| extern bool gres_sched_sufficient(list_t *job_gres_list, list_t *sock_gres_list) |
| { |
| if (!job_gres_list) |
| return true; |
| if (!sock_gres_list) |
| return false; |
| |
| return !list_find_first(job_gres_list, _find_insufficient_gres, |
| sock_gres_list); |
| } |