| /*****************************************************************************\ |
| * eval_nodes.c - Determine order of nodes for job. |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "eval_nodes.h" |
| #include "gres_filter.h" |
| #include "gres_sched.h" |
| |
| #include "src/common/xstring.h" |
| |
/*
 * One record per distinct node scheduling weight. All nodes sharing a
 * weight are grouped into a single bitmap so they can be evaluated
 * together, cheapest weight first (see _build_node_weight_list()).
 */
typedef struct node_weight_struct {
	bitstr_t *node_bitmap;	/* bitmap of nodes with this weight */
	uint64_t weight;	/* priority of node for scheduling work on */
} node_weight_type;
| |
/*
 * Working state threaded through a least-loaded-node (lln) node-add
 * iteration. Pointer members are IN/OUT accumulators owned by the caller.
 */
typedef struct {
	uint16_t *avail_cpu_per_node;	/* per-node usable CPU counts */
	avail_res_t **avail_res_array;	/* per-node available resources */
	uint32_t cpus_per_task;		/* job's CPUs required per task */
	int i_end;			/* last node index to consider */
	int i_start;			/* first node index to consider */
	uint32_t *max_nodes;		/* IN/OUT remaining node limit */
	int *min_rem_nodes;		/* IN/OUT min nodes still needed */
	bitstr_t *node_map;		/* IN/OUT selected node bitmap */
	int *rem_cpus;			/* IN/OUT CPUs still needed */
	int *rem_nodes;			/* IN/OUT nodes still needed */
	int64_t *rem_max_cpus;		/* IN/OUT max-CPU budget remaining */
	uint16_t *used_cpu_per_node;	/* per-node CPUs already used */
} foreach_add_nodes_lln_t;
| |
| /* Find node_weight_type element from list with same weight as node config */ |
| static int _node_weight_find(void *x, void *key) |
| { |
| node_weight_type *nwt = x; |
| node_record_t *node_ptr = key; |
| if (nwt->weight == node_ptr->sched_weight) |
| return 1; |
| return 0; |
| } |
| |
| /* Free node_weight_type element from list */ |
| static void _node_weight_free(void *x) |
| { |
| node_weight_type *nwt = x; |
| FREE_NULL_BITMAP(nwt->node_bitmap); |
| xfree(nwt); |
| } |
| |
| /* Sort list of node_weight_type records in order of increasing node weight */ |
| static int _node_weight_sort(void *x, void *y) |
| { |
| node_weight_type *nwt1 = *(node_weight_type **) x; |
| node_weight_type *nwt2 = *(node_weight_type **) y; |
| if (nwt1->weight < nwt2->weight) |
| return -1; |
| if (nwt1->weight > nwt2->weight) |
| return 1; |
| return 0; |
| } |
| |
| /* |
| * Given a bitmap of available nodes, return a list of node_weight_type |
| * records in order of increasing "weight" (priority) |
| */ |
| static list_t *_build_node_weight_list(bitstr_t *node_bitmap) |
| { |
| list_t *node_list; |
| node_record_t *node_ptr; |
| node_weight_type *nwt; |
| |
| xassert(node_bitmap); |
| /* Build list of node_weight_type records, one per node weight */ |
| node_list = list_create(_node_weight_free); |
| for (int i = 0; (node_ptr = next_node_bitmap(node_bitmap, &i)); i++) { |
| nwt = list_find_first(node_list, _node_weight_find, node_ptr); |
| if (!nwt) { |
| nwt = xmalloc(sizeof(node_weight_type)); |
| nwt->node_bitmap = bit_alloc(node_record_count); |
| nwt->weight = node_ptr->sched_weight; |
| list_append(node_list, nwt); |
| } |
| bit_set(nwt->node_bitmap, i); |
| } |
| |
| /* Sort the list in order of increasing node weight */ |
| list_sort(node_list, _node_weight_sort); |
| |
| return node_list; |
| } |
| |
/*
 * Trim restricted GPU cores on node_i that the job cannot use, keeping
 * each sock_gres total_cnt and *maxtasks consistent with the GRES that
 * can actually be laid out on the node. May reduce the node's avail_core
 * bitmap, avail_cores_per_sock counts, and topo_eval->avail_cpus.
 *
 * IN/OUT topo_eval - per-node core/CPU availability, possibly reduced
 * IN/OUT maxtasks - maximum tasks on this node, possibly reduced
 * IN res_cores_per_gpu - RestrictedCoresPerGPU for this node
 * IN sockets - socket count on this node
 * IN cores_per_socket - cores per socket on this node
 * IN cpus_per_core - threads per core on this node
 * IN node_i - node index
 */
static void _reduce_res_cores(topology_eval_t *topo_eval,
			      uint64_t *maxtasks,
			      uint16_t res_cores_per_gpu,
			      int sockets,
			      uint16_t cores_per_socket,
			      uint16_t cpus_per_core,
			      int node_i)
{
	gres_job_state_t *gres_js;
	gres_state_t *gres_job_state;
	sock_gres_t *sock_gres;
	list_t *sock_list = topo_eval->avail_res_array[node_i]->sock_gres_list;
	bitstr_t *avail_core = topo_eval->avail_core[node_i];
	uint16_t *avail_cores_per_sock =
		topo_eval->avail_res_array[node_i]->avail_cores_per_sock;
	uint16_t *actual_cores_p_s;
	uint32_t tot_cores = 0;

	/* Whole-socket allocations cannot drop individual cores */
	if (topo_eval->cr_type & SELECT_SOCKET)
		return;

	/*
	 * Count currently-set cores per socket from the bitmap, and total
	 * the (possibly already reduced) per-socket availability counts.
	 */
	actual_cores_p_s = xcalloc(sockets, sizeof(uint16_t));
	for (int s = 0; s < sockets; s++) {
		int start_core = s * cores_per_socket;
		int end_core = start_core + cores_per_socket;
		actual_cores_p_s[s] = bit_set_count_range(avail_core,
							  start_core,
							  end_core);
		tot_cores += avail_cores_per_sock[s];
	}

	list_itr_t *sock_list_iter;
	sock_list_iter = list_iterator_create(sock_list);
	while ((sock_gres = list_next(sock_list_iter))) {
		bitstr_t *res_cores;
		uint16_t tot_res_core;
		uint32_t max_res_cores = 0;
		uint64_t max_gres = 0;
		uint32_t max_gres_by_cpu = 0;
		/* Scan restricted cores from the highest index downward */
		int i = (sockets * cores_per_socket) - 1;
		bool done = false;

		if (!sock_gres->gres_state_job)
			continue;

		gres_job_state = sock_gres->gres_state_job;
		gres_js = gres_job_state->gres_data;
		/* Only task-derived GRES counts constrain us here */
		if (!gres_js->gres_per_task &&
		    (!gres_js->ntasks_per_gres ||
		     (gres_js->ntasks_per_gres == NO_VAL16)))
			continue;
		/* Gres per node takes priority in selection */
		if (gres_js->gres_per_node)
			continue;
		if (gres_js->gres_per_task)
			max_gres = *maxtasks * gres_js->gres_per_task;
		else if (gres_js->ntasks_per_gres) {
			/* Round tasks down to a whole multiple of GRES */
			max_gres = *maxtasks / gres_js->ntasks_per_gres;
			*maxtasks = max_gres * gres_js->ntasks_per_gres;
		}

		sock_gres->total_cnt = MIN(sock_gres->total_cnt, max_gres);

		/* Restricted cores only apply to GPU GRES on this node */
		if ((gres_job_state->plugin_id != gres_get_gpu_plugin_id()) ||
		    !gres_js->res_gpu_cores ||
		    !gres_js->res_gpu_cores[node_i])
			continue;

		max_res_cores = max_gres * res_cores_per_gpu;
		res_cores = bit_copy(gres_js->res_gpu_cores[node_i]);
		bit_and(res_cores, avail_core);
		tot_res_core = bit_set_count(res_cores);

		/*
		 * Alternate between clearing surplus restricted cores and
		 * re-deriving max_gres from the (possibly reduced) CPU
		 * count until both constraints are satisfied.
		 */
		while (!done) {
			while (tot_res_core > max_res_cores) {
				int s;
				/*
				 * Must remove restricted cores from the end of
				 * the bitmap first since cores are picked from
				 * front to back. This helps the needed
				 * restricted cores get picked.
				 */
				i = bit_fls_from_bit(res_cores, i);
				if (i < 0)
					break; /* This should never happen */
				bit_clear(avail_core, i);
				tot_res_core--;

				s = i / cores_per_socket;
				actual_cores_p_s[s]--;
				/*
				 * Only charge the socket when its real core
				 * count drops below its advertised count.
				 */
				if (actual_cores_p_s[s] <
				    avail_cores_per_sock[s]) {
					int cnt;
					avail_cores_per_sock[s]--;
					tot_cores--;
					cnt = tot_cores * cpus_per_core;
					if (cnt < topo_eval->avail_cpus)
						topo_eval->avail_cpus = cnt;
				}
				i--;
			}

			if (gres_js->cpus_per_gres) {
				/* Fewer CPUs may now support fewer GRES */
				max_gres_by_cpu = topo_eval->avail_cpus /
					gres_js->cpus_per_gres;
				while (max_gres_by_cpu < max_gres) {
					(*maxtasks)--;
					if (gres_js->gres_per_task) {
						max_gres = *maxtasks *
							gres_js->gres_per_task;
					} else if (gres_js->ntasks_per_gres) {
						max_gres = *maxtasks /
							gres_js->
							ntasks_per_gres;
						*maxtasks = max_gres * gres_js->
							ntasks_per_gres;
					}
				}
				sock_gres->total_cnt =
					MIN(sock_gres->total_cnt, max_gres);
				max_res_cores = max_gres * res_cores_per_gpu;
				if (tot_res_core <= max_res_cores)
					done = true;
			} else
				done = true;
		}
		FREE_NULL_BITMAP(res_cores);
	}
	list_iterator_destroy(sock_list_iter);
	xfree(actual_cores_p_s);
}
| |
/*
 * Cap maxtasks by per-node task-distribution limits (plane size or
 * arbitrary tasks-per-node) and, if that cap is tighter than the node's
 * current gres_max_tasks, trim restricted GPU cores to match.
 *
 * IN/OUT topo_eval - node availability, possibly reduced
 * IN maxtasks - candidate maximum tasks for this node
 * IN/OUT gres_max_tasks - node's GRES-derived task limit, may be lowered
 * IN node_ptr - node record
 * IN node_i - node index
 * IN select_inx - index into arbitrary_tpn for SLURM_DIST_ARBITRARY
 * RET resulting maximum tasks usable on the node
 */
static uint32_t _reduce_res_core_by_task_cnt(topology_eval_t* topo_eval,
					     uint64_t maxtasks,
					     uint32_t *gres_max_tasks,
					     node_record_t *node_ptr,
					     int node_i,
					     int select_inx)
{
	job_record_t *job_ptr = topo_eval->job_ptr;
	/* NO_VAL16 sentinels make the MIN() calls below no-ops if unset */
	uint16_t plane_size = NO_VAL16;
	uint16_t arbitrary_tasks = NO_VAL16;
	bool one_task_per_node = false;
	uint16_t res_cores_per_gpu = node_ptr->res_cores_per_gpu;

	if (job_ptr->details->overcommit && !job_ptr->tres_per_task)
		one_task_per_node = true;
	if (((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) ==
	     SLURM_DIST_PLANE) && !one_task_per_node) {
		/* Plane distribution caps tasks per node at plane_size */
		plane_size = 1;
		if (job_ptr->details->mc_ptr)
			plane_size = job_ptr->details->mc_ptr->plane_size;
	} else if ((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) ==
		   SLURM_DIST_ARBITRARY) {
		/* Arbitrary distribution dictates tasks for this node */
		arbitrary_tasks = job_ptr->details->arbitrary_tpn[select_inx];
	}

	maxtasks = MIN(maxtasks, plane_size);
	maxtasks = MIN(maxtasks, arbitrary_tasks);
	if (maxtasks < *gres_max_tasks) {
		/* Tighter cap: drop now-unusable restricted GPU cores */
		_reduce_res_cores(topo_eval, &maxtasks, res_cores_per_gpu,
				  node_ptr->tot_sockets, node_ptr->cores,
				  node_ptr->tpc, node_i);
		*gres_max_tasks = maxtasks;
	}
	maxtasks = MIN(maxtasks, *gres_max_tasks);
	return maxtasks;
}
| |
| /* |
| * Reduce the gres_max_tasks and total GRES available to a node based on |
| * what will be laid out on the node. |
| * If the GRES available gets reduced and RestrictedCoresPerGPU |
| * is used, any unusable restricted cores will be removed. |
 * If too many cores are removed such that the node is no longer usable
| * in the allocation it returns false, else true. |
| */ |
extern bool eval_nodes_gres(topology_eval_t *topo_eval,
			    uint64_t *max_tasks,
			    job_record_t *job_ptr,
			    node_record_t *node_ptr,
			    int rem_nodes,
			    int node_i,
			    int select_inx)
{
	bool use_node = true;
	uint64_t used_tasks;
	/* Reserve at least one task for each other remaining node */
	uint32_t save_tasks = MAX((rem_nodes - 1), 0);
	uint16_t min_cpus = job_ptr->details->cpus_per_task;

	*max_tasks -= save_tasks;
	if (!job_ptr->details->overcommit) {
		/* Without overcommit, tasks here are bounded by CPUs here */
		used_tasks = MIN(*max_tasks, (topo_eval->avail_cpus /
					      job_ptr->details->cpus_per_task));
		if (used_tasks < *max_tasks) {
			/* Return the CPU-bound surplus to the reserve */
			save_tasks += *max_tasks - used_tasks;
			*max_tasks = used_tasks;
		}
	}
	*max_tasks = MAX(*max_tasks, 1);
	/* Apply distribution limits; may trim restricted GPU cores */
	used_tasks = _reduce_res_core_by_task_cnt(
		topo_eval, *max_tasks,
		&topo_eval->avail_res_array[node_i]->gres_max_tasks, node_ptr,
		node_i, select_inx);

	/* Minimum CPUs this node must supply for the tasks placed on it */
	if (!job_ptr->details->overcommit)
		min_cpus = job_ptr->details->cpus_per_task * used_tasks;
	else if (use_node)
		min_cpus = job_ptr->details->cpus_per_task;
	if (min_cpus < job_ptr->details->pn_min_cpus)
		min_cpus = job_ptr->details->pn_min_cpus;

	if (!used_tasks)
		use_node = false;
	else
		use_node = topo_eval->avail_cpus >= min_cpus;

	if (topo_eval->gres_per_job && use_node) {
		/* Node only usable if it can contribute to job-level GRES */
		use_node = gres_sched_add(
			&topo_eval->avail_cpus,
			topo_eval->avail_core[node_i],
			topo_eval->avail_res_array[node_i]->
			avail_cores_per_sock,
			topo_eval->avail_res_array[node_i]->sock_gres_list,
			job_ptr->gres_list_req,
			node_ptr->res_cores_per_gpu,
			node_ptr->tot_sockets,
			node_ptr->cores, node_ptr->tpc, topo_eval->cr_type,
			min_cpus, node_i);
	}

	if (use_node)
		*max_tasks -= used_tasks;
	else
		topo_eval->avail_cpus = 0;	/* node contributes nothing */

	topo_eval->avail_res_array[node_i]->avail_cpus = topo_eval->avail_cpus;
	/* Restore the tasks reserved for the other remaining nodes */
	*max_tasks += save_tasks;
	return use_node;
}
| |
| extern uint64_t eval_nodes_set_max_tasks(job_record_t *job_ptr, |
| uint64_t max_cpus, |
| uint32_t max_nodes) { |
| uint32_t max_tasks = max_cpus; |
| if (!job_ptr->details->overcommit && |
| (job_ptr->details->cpus_per_task > 1)) { |
| if (job_ptr->details->ntasks_per_node == 0) { |
| max_tasks = max_tasks / job_ptr->details->cpus_per_task; |
| } else { |
| max_tasks = job_ptr->details->ntasks_per_node * |
| max_nodes; |
| } |
| } |
| return max_tasks; |
| } |
| |
| extern void eval_nodes_clip_socket_cores(topology_eval_t *topo_eval) |
| { |
| bitstr_t *avail_core; |
| uint16_t *avail_cores_per_sock; |
| uint16_t actual_core_cnt; |
| node_record_t *node_ptr; |
| int start_core; |
| int end_core; |
| |
| if (!topo_eval->job_ptr->gres_list_req) |
| return; |
| |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(topo_eval->node_map, &i)); |
| i++) { |
| avail_core = topo_eval->avail_core[i]; |
| avail_cores_per_sock = |
| topo_eval->avail_res_array[i]->avail_cores_per_sock; |
| for (int s = 0; s < node_ptr->tot_sockets; s++) { |
| start_core = s * node_ptr->cores; |
| end_core = start_core + node_ptr->cores; |
| actual_core_cnt = bit_set_count_range(avail_core, |
| start_core, |
| end_core); |
| for (int c = node_ptr->cores - 1; c >= 0; c--) { |
| int i = (s * node_ptr->cores) + c; |
| if (actual_core_cnt <= avail_cores_per_sock[s]) |
| break; |
| if (!bit_test(avail_core, i)) |
| continue; |
| bit_clear(avail_core, i); |
| actual_core_cnt--; |
| } |
| } |
| } |
| } |
| |
| /* |
| * A variation of _eval_nodes() to select resources using busy nodes first. |
| */ |
/*
 * A variation of _eval_nodes() to select resources using busy nodes first.
 *
 * Phase 1: required nodes (req_node_bitmap) are accepted unconditionally.
 * Phase 2: remaining nodes are scanned in order of increasing scheduling
 * weight, first the non-idle (busy) nodes, then the idle ones.
 * RET SLURM_SUCCESS with topo_eval->node_map set, else SLURM_ERROR.
 */
static int _eval_nodes_busy(topology_eval_t *topo_eval)
{
	int i, i_start, i_end, error_code = SLURM_ERROR;
	int idle_test;
	int rem_cpus, rem_nodes; /* remaining resources desired */
	int min_rem_nodes;	/* remaining resources desired */
	int total_cpus = 0;	/* #CPUs allocated to job */
	int64_t rem_max_cpus;
	job_record_t *job_ptr = topo_eval->job_ptr;
	job_details_t *details_ptr = job_ptr->details;
	bitstr_t *req_map = details_ptr->req_node_bitmap;
	bitstr_t *orig_node_map = bit_copy(topo_eval->node_map);
	avail_res_t **avail_res_array = topo_eval->avail_res_array;
	uint32_t min_nodes = topo_eval->min_nodes;
	uint32_t req_nodes = topo_eval->req_nodes;
	bool all_done = false;
	node_record_t *node_ptr;
	list_t *node_weight_list = NULL;
	node_weight_type *nwt;
	list_itr_t *iter;
	uint64_t maxtasks;

	topo_eval->avail_cpus = 0;

	rem_cpus = details_ptr->min_cpus;
	min_rem_nodes = min_nodes;
	/* Never use more nodes than the job has tasks */
	if ((details_ptr->num_tasks != NO_VAL) &&
	    (details_ptr->num_tasks != 0))
		topo_eval->max_nodes = MIN(topo_eval->max_nodes,
					   details_ptr->num_tasks);
	if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req)))
		rem_nodes = MIN(min_nodes, req_nodes);
	else
		rem_nodes = MAX(min_nodes, req_nodes);
	rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
	maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
					    topo_eval->max_nodes);

	/* i_end < i_start makes the scans below no-ops on an empty map */
	i_start = bit_ffs(topo_eval->node_map);
	if (i_start >= 0)
		i_end = bit_fls(topo_eval->node_map);
	else
		i_end = i_start - 1;
	if (req_map) {
		/* Phase 1: take every required node or fail */
		for (i = i_start; i <= i_end; i++) {
			if (!bit_test(req_map, i)) {
				bit_clear(topo_eval->node_map, i);
				continue;
			}
			node_ptr = node_record_table_ptr[i];
			if (!avail_res_array[i] ||
			    !avail_res_array[i]->avail_cpus) {
				debug("%pJ required node %s lacks available resources",
				      job_ptr, node_ptr->name);
				goto fini;
			}
			if (topo_eval->max_nodes <= 0) {
				log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit",
					 job_ptr);
				goto fini;
			}
			eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
			(void) eval_nodes_cpus_to_use(topo_eval, i,
						      rem_max_cpus,
						      min_rem_nodes,
						      &maxtasks, true);
			if (topo_eval->avail_cpus <= 0) {
				debug("%pJ required node %s lacks available resources",
				      job_ptr, node_ptr->name);
				goto fini;
			}
			total_cpus += topo_eval->avail_cpus;
			rem_cpus -= topo_eval->avail_cpus;
			rem_max_cpus -= topo_eval->avail_cpus;
			rem_nodes--;
			min_rem_nodes--;
			/* leaving bitmap set, decr max limit */
			topo_eval->max_nodes--;
		}
		/* Required nodes alone may already satisfy the job */
		if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
		    gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
			error_code = SLURM_SUCCESS;
			bit_and(topo_eval->node_map, req_map);
			goto fini;
		}
		if (topo_eval->max_nodes <= 0) {
			error_code = SLURM_ERROR;
			goto fini;
		}
		/* Only consider non-required nodes in phase 2 */
		bit_and_not(orig_node_map, topo_eval->node_map);
	} else {
		bit_clear_all(topo_eval->node_map);
	}

	/* Compute CPUs already allocated to required nodes */
	if ((details_ptr->max_cpus != NO_VAL) &&
	    (total_cpus > details_ptr->max_cpus)) {
		info("%pJ can't use required nodes due to max CPU limit",
		     job_ptr);
		goto fini;
	}

	/*
	 * Start by using nodes that already have a job running.
	 * Then try to use idle nodes.
	 */
	if (topo_eval->max_nodes == 0)
		all_done = true;
	node_weight_list = _build_node_weight_list(orig_node_map);
	iter = list_iterator_create(node_weight_list);
	while (!all_done && (nwt = list_next(iter))) {
		/* idle_test 0: busy nodes only; idle_test 1: idle only */
		for (idle_test = 0; idle_test < 2; idle_test++) {
			for (i = i_start; i <= i_end; i++) {
				if (!avail_res_array[i] ||
				    !avail_res_array[i]->avail_cpus)
					continue;
				/* Node not available or already selected */
				if (!bit_test(nwt->node_bitmap, i) ||
				    bit_test(topo_eval->node_map, i))
					continue;
				if (((idle_test == 0) &&
				     bit_test(idle_node_bitmap, i)) ||
				    ((idle_test == 1) &&
				     !bit_test(idle_node_bitmap, i)))
					continue;
				eval_nodes_select_cores(topo_eval, i,
							min_rem_nodes);
				(void) eval_nodes_cpus_to_use(topo_eval, i,
							      rem_max_cpus,
							      min_rem_nodes,
							      &maxtasks, true);
				if (topo_eval->avail_cpus == 0)
					continue;
				total_cpus += topo_eval->avail_cpus;
				rem_cpus -= topo_eval->avail_cpus;
				rem_max_cpus -= topo_eval->avail_cpus;
				rem_nodes--;
				min_rem_nodes--;
				topo_eval->max_nodes--;
				bit_set(topo_eval->node_map, i);
				if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
				    gres_sched_test(job_ptr->gres_list_req,
						    job_ptr->job_id)) {
					error_code = SLURM_SUCCESS;
					all_done = true;
					break;
				}
				if (topo_eval->max_nodes == 0) {
					all_done = true;
					break;
				}
			}
		}
	}
	list_iterator_destroy(iter);

	if (error_code == SLURM_SUCCESS) {
		/* Already succeeded */
	} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
		   !gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
		/* Requirements unmet; report no usable nodes */
		bit_clear_all(topo_eval->node_map);
		error_code = SLURM_ERROR;
	} else {
		error_code = SLURM_SUCCESS;
	}

fini:
	if (error_code == SLURM_SUCCESS)
		eval_nodes_clip_socket_cores(topo_eval);
	FREE_NULL_LIST(node_weight_list);
	FREE_NULL_BITMAP(orig_node_map);
	return error_code;
}
| |
| static int _eval_nodes_consec(topology_eval_t *topo_eval) |
| { |
| int i, j, error_code = SLURM_ERROR; |
| int *consec_cpus; /* how many CPUs we can add from this |
| * consecutive set of nodes */ |
| list_t **consec_gres; /* how many GRES we can add from this |
| * consecutive set of nodes */ |
| int *consec_nodes; /* how many nodes we can add from this |
| * consecutive set of nodes */ |
| int *consec_start; /* where this consecutive set starts (index) */ |
| int *consec_end; /* where this consecutive set ends (index) */ |
| int *consec_req; /* are nodes from this set required |
| * (in req_bitmap) */ |
| uint64_t *consec_weight; /* node scheduling weight */ |
| node_record_t *node_ptr = NULL; |
| int consec_index, consec_size, sufficient; |
| int rem_cpus, rem_nodes; /* remaining resources desired */ |
| int min_rem_nodes; /* remaining resources desired */ |
| int best_fit_nodes, best_fit_cpus, best_fit_req; |
| int best_fit_sufficient, best_fit_index = 0; |
| bool new_best; |
| uint64_t best_weight = 0; |
| int64_t rem_max_cpus; |
| int total_cpus = 0; /* #CPUs allocated to job */ |
| bool required_node; |
| avail_res_t **avail_res_array = topo_eval->avail_res_array; |
| job_record_t *job_ptr = topo_eval->job_ptr; |
| job_details_t *details_ptr = job_ptr->details; |
| bitstr_t *req_map = details_ptr->req_node_bitmap; |
| uint32_t min_nodes = topo_eval->min_nodes; |
| uint32_t req_nodes = topo_eval->req_nodes; |
| uint16_t *avail_cpu_per_node = NULL; |
| uint64_t maxtasks; |
| |
| topo_eval->avail_cpus = 0; |
| |
| /* make allocation for 50 sets of consecutive nodes, expand as needed */ |
| consec_size = 50; |
| consec_cpus = xcalloc(consec_size, sizeof(int)); |
| consec_nodes = xcalloc(consec_size, sizeof(int)); |
| consec_start = xcalloc(consec_size, sizeof(int)); |
| consec_end = xcalloc(consec_size, sizeof(int)); |
| consec_req = xcalloc(consec_size, sizeof(int)); |
| consec_weight = xcalloc(consec_size, sizeof(uint64_t)); |
| |
| /* Build table with information about sets of consecutive nodes */ |
| consec_index = 0; |
| consec_req[consec_index] = -1; /* no required nodes here by default */ |
| consec_weight[consec_index] = NO_VAL64; |
| |
| avail_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t)); |
| rem_cpus = details_ptr->min_cpus; |
| min_rem_nodes = min_nodes; |
| if ((topo_eval->gres_per_job = |
| gres_sched_init(job_ptr->gres_list_req))) { |
| rem_nodes = MIN(min_nodes, req_nodes); |
| consec_gres = xcalloc(consec_size, sizeof(list_t *)); |
| } else |
| rem_nodes = MAX(min_nodes, req_nodes); |
| rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes); |
| maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus, |
| topo_eval->max_nodes); |
| |
| /* |
| * If there are required nodes, first determine the resources they |
| * provide, then select additional resources as needed in next loop |
| */ |
| if (req_map) { |
| int count = 0; |
| uint16_t *arbitrary_tpn = job_ptr->details->arbitrary_tpn; |
| for (i = 0; |
| ((node_ptr = next_node_bitmap(req_map, &i)) && |
| (topo_eval->max_nodes > 0)); |
| i++) { |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| if (arbitrary_tpn) { |
| int req_cpus = arbitrary_tpn[count++]; |
| if ((details_ptr->cpus_per_task != NO_VAL16) && |
| (details_ptr->cpus_per_task != 0)) |
| req_cpus *= details_ptr->cpus_per_task; |
| |
| req_cpus = MAX(req_cpus, |
| (int) details_ptr->pn_min_cpus); |
| req_cpus = MAX(req_cpus, |
| details_ptr->min_gres_cpu); |
| |
| if (topo_eval->avail_cpus < req_cpus) { |
| debug("%pJ required node %s needed %d cpus but only has %d", |
| job_ptr, node_ptr->name, req_cpus, |
| topo_eval->avail_cpus); |
| goto fini; |
| } |
| topo_eval->avail_cpus = req_cpus; |
| |
| avail_res_array[i]->avail_cpus = |
| topo_eval->avail_cpus; |
| |
| if (topo_eval->gres_per_job) { |
| eval_nodes_gres(topo_eval, &maxtasks, |
| job_ptr, node_ptr, |
| min_rem_nodes, i, |
| (count - 1)); |
| } |
| } else |
| (void) eval_nodes_cpus_to_use(topo_eval, i, |
| rem_max_cpus, |
| min_rem_nodes, |
| &maxtasks, true); |
| |
| if (topo_eval->avail_cpus == 0) { |
| debug("%pJ required node %s lacks available resources", |
| job_ptr, node_ptr->name); |
| goto fini; |
| } |
| avail_cpu_per_node[i] = topo_eval->avail_cpus; |
| total_cpus += topo_eval->avail_cpus; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| } |
| |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) { |
| error_code = SLURM_SUCCESS; |
| bit_and(topo_eval->node_map, req_map); |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| error_code = SLURM_ERROR; |
| goto fini; |
| } |
| } |
| |
| for (i = 0; next_node(&i); i++) { /* For each node */ |
| if ((consec_index + 1) >= consec_size) { |
| consec_size *= 2; |
| xrecalloc(consec_cpus, consec_size, sizeof(int)); |
| xrecalloc(consec_nodes, consec_size, sizeof(int)); |
| xrecalloc(consec_start, consec_size, sizeof(int)); |
| xrecalloc(consec_end, consec_size, sizeof(int)); |
| xrecalloc(consec_req, consec_size, sizeof(int)); |
| xrecalloc(consec_weight, consec_size, sizeof(uint64_t)); |
| if (topo_eval->gres_per_job) { |
| xrecalloc(consec_gres, |
| consec_size, sizeof(list_t *)); |
| } |
| } |
| if (req_map) |
| required_node = bit_test(req_map, i); |
| else |
| required_node = false; |
| if (!bit_test(topo_eval->node_map, i)) { |
| node_ptr = NULL; /* Use as flag, avoid second test */ |
| } else if (required_node) { |
| node_ptr = node_record_table_ptr[i]; |
| } else { |
| node_ptr = node_record_table_ptr[i]; |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| if (topo_eval->avail_cpus == 0) { |
| bit_clear(topo_eval->node_map, i); |
| node_ptr = NULL; |
| } |
| avail_cpu_per_node[i] = topo_eval->avail_cpus; |
| } |
| /* |
| * If job requested contiguous nodes, |
| * do not worry about matching node weights |
| */ |
| if (node_ptr && |
| !details_ptr->contiguous && |
| (consec_weight[consec_index] != NO_VAL64) && /* Init value*/ |
| (node_ptr->sched_weight != consec_weight[consec_index])) { |
| /* End last consecutive set, setup start of next set */ |
| if (consec_nodes[consec_index] == 0) { |
| /* Only required nodes, reuse consec record */ |
| consec_req[consec_index] = -1; |
| } else { |
| /* End last set, setup for start of next set */ |
| consec_end[consec_index] = i - 1; |
| consec_req[++consec_index] = -1; |
| } |
| } |
| if (node_ptr) { |
| if (consec_nodes[consec_index] == 0) |
| consec_start[consec_index] = i; |
| if (required_node) { |
| /* |
| * Required node, resources counters updated |
| * in above loop, leave bitmap set |
| */ |
| if (consec_req[consec_index] == -1) { |
| /* first required node in set */ |
| consec_req[consec_index] = i; |
| } |
| continue; |
| } |
| |
| /* node not selected (yet) */ |
| bit_clear(topo_eval->node_map, i); |
| consec_cpus[consec_index] += topo_eval->avail_cpus; |
| consec_nodes[consec_index]++; |
| if (topo_eval->gres_per_job) { |
| gres_sched_consec( |
| &consec_gres[consec_index], |
| job_ptr->gres_list_req, |
| avail_res_array[i]->sock_gres_list); |
| } |
| consec_weight[consec_index] = node_ptr->sched_weight; |
| } else if (consec_nodes[consec_index] == 0) { |
| /* Only required nodes, reuse consec record */ |
| consec_req[consec_index] = -1; |
| consec_weight[consec_index] = NO_VAL64; |
| } else { |
| /* End last set, setup for start of next set */ |
| consec_end[consec_index] = i - 1; |
| consec_req[++consec_index] = -1; |
| consec_weight[consec_index] = NO_VAL64; |
| } |
| } |
| if (consec_nodes[consec_index] != 0) |
| consec_end[consec_index++] = i - 1; |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE) { |
| if (consec_index == 0) { |
| info("consec_index is zero"); |
| } |
| for (i = 0; i < consec_index; i++) { |
| char *gres_str = NULL, *gres_print = ""; |
| bitstr_t *host_bitmap; |
| char *host_list; |
| if (topo_eval->gres_per_job) { |
| gres_str = gres_sched_str(consec_gres[i]); |
| if (gres_str) { |
| xstrcat(gres_str, " "); |
| gres_print = gres_str; |
| } |
| } |
| |
| host_bitmap = bit_alloc(node_record_count); |
| bit_nset(host_bitmap, consec_start[i], consec_end[i]); |
| host_list = bitmap2node_name(host_bitmap); |
| info("set:%d consec CPUs:%d nodes:%d:%s %sbegin:%d end:%d required:%d weight:%"PRIu64, |
| i, consec_cpus[i], consec_nodes[i], |
| host_list, gres_print, consec_start[i], |
| consec_end[i], consec_req[i], consec_weight[i]); |
| FREE_NULL_BITMAP(host_bitmap); |
| xfree(gres_str); |
| xfree(host_list); |
| } |
| } |
| |
| /* Compute CPUs already allocated to required nodes */ |
| if ((details_ptr->max_cpus != NO_VAL) && |
| (total_cpus > details_ptr->max_cpus)) { |
| info("%pJ can't use required nodes due to max CPU limit", |
| job_ptr); |
| goto fini; |
| } |
| |
| /* |
| * accumulate nodes from these sets of consecutive nodes until |
| * sufficient resources have been accumulated |
| */ |
| while (consec_index && (topo_eval->max_nodes > 0)) { |
| best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0; |
| best_fit_req = -1; /* first required node, -1 if none */ |
| for (i = 0; i < consec_index; i++) { |
| if (consec_nodes[i] == 0) |
| continue; /* no usable nodes here */ |
| |
| if (details_ptr->contiguous && |
| details_ptr->req_node_bitmap && |
| (consec_req[i] == -1)) |
| continue; /* not required nodes */ |
| sufficient = (consec_cpus[i] >= rem_cpus) && |
| eval_nodes_enough_nodes( |
| consec_nodes[i], rem_nodes, |
| min_nodes, req_nodes); |
| if (sufficient && topo_eval->gres_per_job) { |
| sufficient = gres_sched_sufficient( |
| job_ptr->gres_list_req, consec_gres[i]); |
| } |
| |
| /* |
| * if first possibility OR |
| * contains required nodes OR |
| * lowest node weight |
| */ |
| if ((best_fit_nodes == 0) || |
| ((best_fit_req == -1) && (consec_req[i] != -1)) || |
| (consec_weight[i] < best_weight)) |
| new_best = true; |
| else |
| new_best = false; |
| /* |
| * If equal node weight |
| * first set large enough for request OR |
| * tightest fit (less resource/CPU waste) OR |
| * nothing yet large enough, but this is biggest |
| */ |
| if (!new_best && (consec_weight[i] == best_weight) && |
| ((sufficient && (best_fit_sufficient == 0)) || |
| (sufficient && (consec_cpus[i] < best_fit_cpus)) || |
| (!sufficient && |
| (consec_cpus[i] > best_fit_cpus)))) |
| new_best = true; |
| /* |
| * if first continuous node set large enough |
| */ |
| if (!new_best && !best_fit_sufficient && |
| details_ptr->contiguous && sufficient) |
| new_best = true; |
| if (new_best) { |
| best_fit_cpus = consec_cpus[i]; |
| best_fit_nodes = consec_nodes[i]; |
| best_fit_index = i; |
| best_fit_req = consec_req[i]; |
| best_fit_sufficient = sufficient; |
| best_weight = consec_weight[i]; |
| } |
| |
| if (details_ptr->contiguous && |
| details_ptr->req_node_bitmap) { |
| /* |
| * Must wait for all required nodes to be |
| * in a single consecutive block |
| */ |
| int j, other_blocks = 0; |
| for (j = (i+1); j < consec_index; j++) { |
| if (consec_req[j] != -1) { |
| other_blocks = 1; |
| break; |
| } |
| } |
| if (other_blocks) { |
| best_fit_nodes = 0; |
| break; |
| } |
| } |
| } |
| if (best_fit_nodes == 0) |
| break; |
| |
| if (details_ptr->contiguous && !best_fit_sufficient) |
| break; /* no hole large enough */ |
| if (best_fit_req != -1) { |
| /* |
| * This collection of nodes includes required ones |
| * select nodes from this set, first working up |
| * then down from the required nodes |
| */ |
| for (i = best_fit_req; |
| i <= consec_end[best_fit_index]; i++) { |
| if ((topo_eval->max_nodes == 0) || |
| ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id)))) |
| break; |
| if (bit_test(topo_eval->node_map, i)) { |
| /* required node already in set */ |
| continue; |
| } |
| if (avail_cpu_per_node[i] == 0) |
| continue; |
| topo_eval->avail_cpus = avail_cpu_per_node[i]; |
| |
| /* |
| * This could result in 0, but if the user |
| * requested nodes here we will still give |
| * them and then the step layout will sort |
| * things out. But if the gres's cpu requirement |
| * can not be satisfied due to gres layout try |
| * next node. |
| */ |
| if (!eval_nodes_cpus_to_use(topo_eval, i, |
| rem_max_cpus, |
| min_rem_nodes, |
| &maxtasks, true)) |
| continue; |
| |
| total_cpus += topo_eval->avail_cpus; |
| bit_set(topo_eval->node_map, i); |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| } |
| for (i = (best_fit_req - 1); |
| i >= consec_start[best_fit_index]; i--) { |
| if ((topo_eval->max_nodes == 0) || |
| ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id)))) |
| break; |
| if (bit_test(topo_eval->node_map, i)) |
| continue; |
| if (avail_cpu_per_node[i] == 0) |
| continue; |
| topo_eval->avail_cpus = avail_cpu_per_node[i]; |
| |
| /* |
| * This could result in 0, but if the user |
| * requested nodes here we will still give |
| * them and then the step layout will sort |
| * things out. But if the gres's cpu requirement |
| * can not be satisfied due to gres layout try |
| * next node. |
| */ |
| if (!eval_nodes_cpus_to_use(topo_eval, i, |
| rem_max_cpus, |
| min_rem_nodes, |
| &maxtasks, true)) |
| continue; |
| |
| total_cpus += topo_eval->avail_cpus; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| bit_set(topo_eval->node_map, i); |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| } |
| } else { |
| /* No required nodes, try best fit single node */ |
| int best_fit = -1, best_size = 0; |
| int first = consec_start[best_fit_index]; |
| int last = consec_end[best_fit_index]; |
| if (rem_nodes <= 1) { |
| for (i = first, j = 0; i <= last; i++, j++) { |
| if (bit_test(topo_eval->node_map, i) || |
| !avail_res_array[i]) |
| continue; |
| if (avail_cpu_per_node[i] < rem_cpus) |
| continue; |
| if (topo_eval->gres_per_job && |
| !gres_sched_sufficient( |
| job_ptr->gres_list_req, |
| avail_res_array[i]-> |
| sock_gres_list)) { |
| continue; |
| } |
| if ((best_fit == -1) || |
| (avail_cpu_per_node[i] <best_size)){ |
| best_fit = i; |
| best_size = |
| avail_cpu_per_node[i]; |
| if (best_size == rem_cpus) |
| break; |
| } |
| } |
| /* |
| * If we found a single node to use, |
| * clear CPU counts for all other nodes |
| */ |
| if (best_fit != -1) { |
| for (i = first; i <= last; i++) { |
| if (i == best_fit) |
| continue; |
| avail_cpu_per_node[i] = 0; |
| } |
| } |
| } |
| |
| for (i = first, j = 0; i <= last; i++, j++) { |
| if ((topo_eval->max_nodes == 0) || |
| ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| (!topo_eval->gres_per_job || |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id)))) |
| break; |
| if (bit_test(topo_eval->node_map, i) || |
| !avail_res_array[i]) |
| continue; |
| |
| topo_eval->avail_cpus = avail_cpu_per_node[i]; |
| if (topo_eval->avail_cpus <= 0) |
| continue; |
| |
| if ((topo_eval->max_nodes == 1) && |
| (topo_eval->avail_cpus < rem_cpus)) { |
| /* |
| * Job can only take one more node and |
| * this one has insufficient CPU |
| */ |
| continue; |
| } |
| |
| /* |
| * This could result in 0, but if the user |
| * requested nodes here we will still give |
| * them and then the step layout will sort |
| * things out. But if the gres's cpu requirement |
| * can not be satisfied due to gres layout try |
| * next node. |
| */ |
| if (!eval_nodes_cpus_to_use(topo_eval, i, |
| rem_max_cpus, |
| min_rem_nodes, |
| &maxtasks, true)) |
| continue; |
| |
| total_cpus += topo_eval->avail_cpus; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| bit_set(topo_eval->node_map, i); |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| } |
| } |
| |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) { |
| error_code = SLURM_SUCCESS; |
| break; |
| } |
| consec_cpus[best_fit_index] = 0; |
| consec_nodes[best_fit_index] = 0; |
| } |
| |
| if (error_code && (rem_cpus <= 0) && |
| gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id) && |
| eval_nodes_enough_nodes(0, rem_nodes, min_nodes, req_nodes)) |
| error_code = SLURM_SUCCESS; |
| |
| fini: |
| if (error_code == SLURM_SUCCESS) |
| eval_nodes_clip_socket_cores(topo_eval); |
| xfree(avail_cpu_per_node); |
| xfree(consec_cpus); |
| xfree(consec_nodes); |
| xfree(consec_start); |
| xfree(consec_end); |
| xfree(consec_req); |
| xfree(consec_weight); |
| if (topo_eval->gres_per_job) { |
| for (i = 0; i < consec_size; i++) |
| FREE_NULL_LIST(consec_gres[i]); |
| xfree(consec_gres); |
| } |
| |
| return error_code; |
| } |
| |
/*
 * A variation of _eval_nodes() to select resources on the "least-loaded"
 * nodes (LLN): at each node-weight level, repeatedly pick the node with
 * the greatest ratio of available to total CPUs until the job's node,
 * CPU and GRES requirements are satisfied.
 *
 * Returns SLURM_SUCCESS and leaves the selected nodes set in
 * topo_eval->node_map, or SLURM_ERROR (node_map cleared) on failure.
 */
static int _eval_nodes_lln(topology_eval_t *topo_eval)
{
	int i, i_start, i_end, error_code = SLURM_ERROR;
	int rem_cpus, rem_nodes; /* remaining resources desired */
	int min_rem_nodes;	/* remaining resources desired */
	int total_cpus = 0;	/* #CPUs allocated to job */
	int64_t rem_max_cpus;
	job_record_t *job_ptr = topo_eval->job_ptr;
	job_details_t *details_ptr = job_ptr->details;
	bitstr_t *req_map = details_ptr->req_node_bitmap;
	bitstr_t *orig_node_map = bit_copy(topo_eval->node_map);
	bool all_done = false;
	node_record_t *node_ptr;
	list_t *node_weight_list = NULL;
	node_weight_type *nwt;
	list_itr_t *iter;
	avail_res_t **avail_res_array = topo_eval->avail_res_array;
	uint32_t min_nodes = topo_eval->min_nodes;
	uint32_t req_nodes = topo_eval->req_nodes;
	uint64_t maxtasks;

	topo_eval->avail_cpus = 0;

	rem_cpus = details_ptr->min_cpus;
	min_rem_nodes = min_nodes;
	/* Never allocate more nodes than the job has tasks */
	if ((details_ptr->num_tasks != NO_VAL) &&
	    (details_ptr->num_tasks != 0))
		topo_eval->max_nodes = MIN(topo_eval->max_nodes,
					   details_ptr->num_tasks);
	if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req)))
		rem_nodes = MIN(min_nodes, req_nodes);
	else
		rem_nodes = MAX(min_nodes, req_nodes);
	rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes);
	maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus,
					    topo_eval->max_nodes);

	i_start = bit_ffs(topo_eval->node_map);
	if (i_start >= 0)
		i_end = bit_fls(topo_eval->node_map);
	else
		i_end = i_start - 1; /* empty map: loops below are no-ops */
	if (req_map) {
		/*
		 * Accumulate all required nodes first. Any required node
		 * without usable resources, or exceeding the node limit,
		 * makes the whole evaluation fail.
		 */
		for (i = i_start; i <= i_end; i++) {
			if (!bit_test(req_map, i)) {
				bit_clear(topo_eval->node_map, i);
				continue;
			}
			node_ptr = node_record_table_ptr[i];
			if (!avail_res_array[i] ||
			    !avail_res_array[i]->avail_cpus) {
				debug("%pJ required node %s lacks available resources",
				      job_ptr, node_ptr->name);
				goto fini;
			}
			if (topo_eval->max_nodes <= 0) {
				log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit",
					 job_ptr);
				goto fini;
			}
			eval_nodes_select_cores(topo_eval, i, min_rem_nodes);
			(void) eval_nodes_cpus_to_use(topo_eval, i,
						      rem_max_cpus,
						      min_rem_nodes,
						      &maxtasks, true);
			if (topo_eval->avail_cpus <= 0) {
				debug("%pJ required node %s not available",
				      job_ptr, node_ptr->name);
				goto fini;
			}
			total_cpus += topo_eval->avail_cpus;
			rem_cpus -= topo_eval->avail_cpus;
			rem_max_cpus -= topo_eval->avail_cpus;
			rem_nodes--;
			min_rem_nodes--;
			/* leaving bitmap set, decr max limit */
			topo_eval->max_nodes--;
		}
		/* Required nodes alone may fully satisfy the request */
		if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
		    gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
			error_code = SLURM_SUCCESS;
			bit_and(topo_eval->node_map, req_map);
			goto fini;
		}
		if (topo_eval->max_nodes <= 0) {
			error_code = SLURM_ERROR;
			goto fini;
		}
		/* Only consider the not-yet-selected nodes below */
		bit_and_not(orig_node_map, topo_eval->node_map);
	} else {
		bit_clear_all(topo_eval->node_map);
	}

	/* Compute CPUs already allocated to required nodes */
	if ((details_ptr->max_cpus != NO_VAL) &&
	    (total_cpus > details_ptr->max_cpus)) {
		info("%pJ can't use required nodes due to max CPU limit",
		     job_ptr);
		goto fini;
	}

	/*
	 * Accumulate nodes from those with highest available CPU count.
	 * Logic is optimized for small node/CPU count allocations.
	 * For larger allocation, use list_sort().
	 */
	if (topo_eval->max_nodes == 0)
		all_done = true;
	node_weight_list = _build_node_weight_list(orig_node_map);
	iter = list_iterator_create(node_weight_list);
	while (!all_done && (nwt = list_next(iter))) {
		int last_max_cpu_cnt = -1;
		while (!all_done) {
			int max_cpu_idx = -1;
			for (i = i_start; i <= i_end; i++) {
				/* Node not available or already selected */
				if (!bit_test(nwt->node_bitmap, i) ||
				    bit_test(topo_eval->node_map, i))
					continue;
				if (!avail_res_array[i] ||
				    !avail_res_array[i]->avail_cpus)
					continue;
				eval_nodes_select_cores(topo_eval, i,
							min_rem_nodes);
				if (topo_eval->avail_cpus == 0)
					continue;
				/*
				 * Find the "least-loaded" node at the current
				 * node-weight level. This is defined as the
				 * node with the greatest ratio of available to
				 * total cpus. (But shift the divisors around
				 * to avoid any floating-point math.)
				 */
				if ((max_cpu_idx == -1) ||
				    ((avail_res_array[max_cpu_idx]->max_cpus *
				      node_record_table_ptr[i]->cpus) <
				     (avail_res_array[i]->max_cpus *
				      node_record_table_ptr[max_cpu_idx]->
				      cpus))) {
					max_cpu_idx = i;
					/*
					 * Same count as the last pick: no
					 * better candidate exists, stop scan
					 */
					if (avail_res_array[max_cpu_idx]->
					    max_cpus == last_max_cpu_cnt)
						break;
				}
			}
			if (max_cpu_idx == -1) {
				/* No more usable nodes left, get next weight */
				break;
			}
			i = max_cpu_idx;

			(void) eval_nodes_cpus_to_use(topo_eval, i,
						      rem_max_cpus,
						      min_rem_nodes, &maxtasks,
						      true);
			if (topo_eval->avail_cpus == 0)
				continue;

			last_max_cpu_cnt = avail_res_array[i]->max_cpus;
			total_cpus += topo_eval->avail_cpus;
			rem_cpus -= topo_eval->avail_cpus;
			rem_max_cpus -= topo_eval->avail_cpus;
			rem_nodes--;
			min_rem_nodes--;
			topo_eval->max_nodes--;
			bit_set(topo_eval->node_map, i);
			if ((rem_nodes <= 0) && (rem_cpus <= 0) &&
			    gres_sched_test(job_ptr->gres_list_req,
					    job_ptr->job_id)) {
				error_code = SLURM_SUCCESS;
				all_done = true;
				break;
			}
			if (topo_eval->max_nodes == 0) {
				all_done = true;
				break;
			}
		}
	}
	list_iterator_destroy(iter);

	if (error_code == SLURM_SUCCESS) {
		/* Already succeeded */
	} else if ((rem_cpus > 0) || (min_rem_nodes > 0) ||
		   !gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) {
		bit_clear_all(topo_eval->node_map);
		error_code = SLURM_ERROR;
	} else {
		error_code = SLURM_SUCCESS;
	}

fini:	if (error_code == SLURM_SUCCESS)
		eval_nodes_clip_socket_cores(topo_eval);
	FREE_NULL_LIST(node_weight_list);
	FREE_NULL_BITMAP(orig_node_map);
	return error_code;
}
| |
| /* |
| * A variation of _eval_nodes() to select resources at the end of the node |
| * list to reduce fragmentation |
| */ |
| static int _eval_nodes_serial(topology_eval_t *topo_eval) |
| { |
| int i, i_start, i_end, error_code = SLURM_ERROR; |
| int rem_cpus, rem_nodes; /* remaining resources desired */ |
| int min_rem_nodes; /* remaining resources desired */ |
| int total_cpus = 0; /* #CPUs allocated to job */ |
| int64_t rem_max_cpus; |
| job_record_t *job_ptr = topo_eval->job_ptr; |
| job_details_t *details_ptr = job_ptr->details; |
| bitstr_t *req_map = details_ptr->req_node_bitmap; |
| bitstr_t *orig_node_map = bit_copy(topo_eval->node_map); |
| avail_res_t **avail_res_array = topo_eval->avail_res_array; |
| uint32_t min_nodes = topo_eval->min_nodes; |
| uint32_t req_nodes = topo_eval->req_nodes; |
| bool all_done = false; |
| node_record_t *node_ptr; |
| list_t *node_weight_list = NULL; |
| node_weight_type *nwt; |
| list_itr_t *iter; |
| uint64_t maxtasks; |
| |
| topo_eval->avail_cpus = 0; |
| |
| rem_cpus = details_ptr->min_cpus; |
| min_rem_nodes = min_nodes; |
| if ((details_ptr->num_tasks != NO_VAL) && |
| (details_ptr->num_tasks != 0)) |
| topo_eval->max_nodes = MIN(topo_eval->max_nodes, |
| details_ptr->num_tasks); |
| if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req))) |
| rem_nodes = MIN(min_nodes, req_nodes); |
| else |
| rem_nodes = MAX(min_nodes, req_nodes); |
| rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes); |
| maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus, |
| topo_eval->max_nodes); |
| |
| i_start = bit_ffs(topo_eval->node_map); |
| if (i_start >= 0) |
| i_end = bit_fls(topo_eval->node_map); |
| else |
| i_end = i_start - 1; |
| if (req_map) { |
| for (i = i_start; i <= i_end; i++) { |
| if (!bit_test(req_map, i)) { |
| bit_clear(topo_eval->node_map, i); |
| continue; |
| } |
| node_ptr = node_record_table_ptr[i]; |
| if (!avail_res_array[i] || |
| !avail_res_array[i]->avail_cpus) { |
| debug("%pJ required node %s lacks available resources", |
| job_ptr, node_ptr->name); |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| (void) eval_nodes_cpus_to_use(topo_eval, i, |
| rem_max_cpus, min_rem_nodes, |
| &maxtasks, true); |
| if (topo_eval->avail_cpus <= 0) { |
| debug("%pJ required node %s lacks available resources", |
| job_ptr, node_ptr->name); |
| goto fini; |
| } |
| total_cpus += topo_eval->avail_cpus; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| rem_nodes--; |
| min_rem_nodes--; |
| /* leaving bitmap set, decr max limit */ |
| topo_eval->max_nodes--; |
| } |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) { |
| error_code = SLURM_SUCCESS; |
| bit_and(topo_eval->node_map, req_map); |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| error_code = SLURM_ERROR; |
| goto fini; |
| } |
| bit_and_not(orig_node_map, topo_eval->node_map); |
| } else { |
| bit_clear_all(topo_eval->node_map); |
| } |
| |
| /* Compute CPUs already allocated to required nodes */ |
| if ((details_ptr->max_cpus != NO_VAL) && |
| (total_cpus > details_ptr->max_cpus)) { |
| info("%pJ can't use required nodes due to max CPU limit", |
| job_ptr); |
| goto fini; |
| } |
| |
| if (topo_eval->max_nodes == 0) |
| all_done = true; |
| node_weight_list = _build_node_weight_list(orig_node_map); |
| iter = list_iterator_create(node_weight_list); |
| while (!all_done && (nwt = list_next(iter))) { |
| for (i = i_end; |
| ((i >= i_start) && (topo_eval->max_nodes > 0)); |
| i--) { |
| if (!avail_res_array[i] || |
| !avail_res_array[i]->avail_cpus) |
| continue; |
| /* Node not available or already selected */ |
| if (!bit_test(nwt->node_bitmap, i) || |
| bit_test(topo_eval->node_map, i)) |
| continue; |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| (void) eval_nodes_cpus_to_use(topo_eval, i, |
| rem_max_cpus, min_rem_nodes, |
| &maxtasks, true); |
| if (topo_eval->avail_cpus == 0) |
| continue; |
| total_cpus += topo_eval->avail_cpus; |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| bit_set(topo_eval->node_map, i); |
| if ((rem_nodes <= 0) && (rem_cpus <= 0) && |
| gres_sched_test(job_ptr->gres_list_req, |
| job_ptr->job_id)) { |
| error_code = SLURM_SUCCESS; |
| all_done = true; |
| break; |
| } |
| if (topo_eval->max_nodes == 0) { |
| all_done = true; |
| break; |
| } |
| } |
| } |
| list_iterator_destroy(iter); |
| |
| if (error_code == SLURM_SUCCESS) { |
| /* Already succeeded */ |
| } else if ((rem_cpus > 0) || (min_rem_nodes > 0) || |
| !gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) { |
| bit_clear_all(topo_eval->node_map); |
| error_code = SLURM_ERROR; |
| } else { |
| error_code = SLURM_SUCCESS; |
| } |
| |
| fini: |
| if (error_code == SLURM_SUCCESS) |
| eval_nodes_clip_socket_cores(topo_eval); |
| FREE_NULL_LIST(node_weight_list); |
| FREE_NULL_BITMAP(orig_node_map); |
| return error_code; |
| } |
| |
| static int _add_nodes_by_weight_spread(void *x, void *arg) |
| { |
| foreach_add_nodes_lln_t *args = arg; |
| node_weight_type *nwt = x; |
| |
| for (int i = args->i_start; i <= args->i_end; i++) { |
| if (!args->avail_res_array[i] || |
| !args->avail_res_array[i]->avail_cpus) |
| continue; |
| /* Node not available or already selected */ |
| if (!bit_test(nwt->node_bitmap, i) || |
| bit_test(args->node_map, i)) |
| continue; |
| if (!args->avail_cpu_per_node[i]) |
| continue; |
| |
| bit_set(args->node_map, i); |
| args->used_cpu_per_node[i] = args->cpus_per_task; |
| |
| (*args->rem_nodes)--; |
| (*args->min_rem_nodes)--; |
| (*args->max_nodes)--; |
| *args->rem_max_cpus -= args->cpus_per_task; |
| *args->rem_cpus -= args->cpus_per_task; |
| if ((*args->max_nodes <= 0) || (*args->rem_nodes <= 0)) |
| return 1; |
| } |
| return 0; |
| } |
| /* |
| * A variation of _eval_nodes() to select resources using as many nodes as |
| * possible. |
| */ |
| static int _eval_nodes_spread(topology_eval_t *topo_eval) |
| { |
| int i, i_start, i_end, error_code = SLURM_ERROR; |
| int rem_cpus, rem_nodes; /* remaining resources desired */ |
| int min_rem_nodes, orig_min_rem_nodes; /* remaining resources desired */ |
| int64_t rem_max_cpus, orig_rem_max_cpus; |
| avail_res_t **avail_res_array = topo_eval->avail_res_array; |
| job_record_t *job_ptr = topo_eval->job_ptr; |
| job_details_t *details_ptr = job_ptr->details; |
| bitstr_t *req_map = details_ptr->req_node_bitmap; |
| bitstr_t *orig_node_map = bit_copy(topo_eval->node_map); |
| uint32_t min_nodes = topo_eval->min_nodes; |
| uint32_t req_nodes = topo_eval->req_nodes; |
| uint32_t cpus_per_task = job_ptr->details->cpus_per_task; |
| bool all_done = false; |
| node_record_t *node_ptr; |
| list_t *node_weight_list = NULL; |
| uint64_t maxtasks; |
| uint16_t *avail_cpu_per_node = NULL; |
| uint16_t *used_cpu_per_node = NULL; |
| uint32_t prev_max_nodes = topo_eval->max_nodes; |
| foreach_add_nodes_lln_t args = { 0 }; |
| |
| topo_eval->avail_cpus = 0; |
| |
| rem_cpus = details_ptr->min_cpus; |
| min_rem_nodes = min_nodes; |
| orig_min_rem_nodes = min_rem_nodes; |
| if ((details_ptr->num_tasks != NO_VAL) && |
| (details_ptr->num_tasks != 0)) |
| topo_eval->max_nodes = |
| MIN(topo_eval->max_nodes, details_ptr->num_tasks); |
| if ((topo_eval->gres_per_job = gres_sched_init(job_ptr->gres_list_req))) |
| rem_nodes = MIN(min_nodes, req_nodes); |
| else |
| rem_nodes = MAX(min_nodes, req_nodes); |
| rem_max_cpus = eval_nodes_get_rem_max_cpus(details_ptr, rem_nodes); |
| orig_rem_max_cpus = rem_max_cpus; |
| maxtasks = eval_nodes_set_max_tasks(job_ptr, rem_max_cpus, |
| topo_eval->max_nodes); |
| |
| avail_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t)); |
| used_cpu_per_node = xcalloc(node_record_count, sizeof(uint16_t)); |
| for (i = 0; (node_ptr = next_node_bitmap(orig_node_map, &i)); i++) { |
| node_ptr = node_record_table_ptr[i]; |
| eval_nodes_select_cores(topo_eval, i, min_rem_nodes); |
| if (topo_eval->avail_cpus == 0) |
| bit_clear(topo_eval->node_map, i); |
| avail_cpu_per_node[i] = topo_eval->avail_cpus; |
| } |
| |
| i_start = bit_ffs(topo_eval->node_map); |
| if (i_start >= 0) |
| i_end = bit_fls(topo_eval->node_map); |
| else |
| i_end = i_start - 1; |
| if (req_map) { |
| for (i = i_start; i <= i_end; i++) { |
| if (!bit_test(req_map, i)) { |
| bit_clear(topo_eval->node_map, i); |
| continue; |
| } |
| node_ptr = node_record_table_ptr[i]; |
| if (!avail_res_array[i] || |
| !avail_res_array[i]->avail_cpus) { |
| debug("%pJ required node %s lacks available resources", |
| job_ptr, node_ptr->name); |
| goto fini; |
| } |
| if (topo_eval->max_nodes <= 0) { |
| log_flag(SELECT_TYPE, "%pJ requires nodes exceed maximum node limit", |
| job_ptr); |
| goto fini; |
| } |
| used_cpu_per_node[i] = cpus_per_task; |
| |
| rem_nodes--; |
| min_rem_nodes--; |
| topo_eval->max_nodes--; |
| rem_max_cpus -= cpus_per_task; |
| rem_cpus -= cpus_per_task; |
| } |
| bit_and_not(orig_node_map, topo_eval->node_map); |
| } else { |
| bit_clear_all(topo_eval->node_map); |
| } |
| |
| if (topo_eval->max_nodes > 0) { |
| node_weight_list = _build_node_weight_list(orig_node_map); |
| |
| args.avail_res_array = avail_res_array; |
| args.node_map = topo_eval->node_map; |
| args.avail_cpu_per_node = avail_cpu_per_node; |
| args.used_cpu_per_node = used_cpu_per_node; |
| args.rem_nodes = &rem_nodes; |
| args.min_rem_nodes = &min_rem_nodes; |
| args.max_nodes = &(topo_eval->max_nodes); |
| args.rem_max_cpus = &rem_max_cpus; |
| args.rem_cpus = &rem_cpus; |
| args.i_start = i_start; |
| args.i_end = i_end; |
| args.cpus_per_task = cpus_per_task; |
| more_nodes: |
| list_for_each(node_weight_list, _add_nodes_by_weight_spread, |
| &args); |
| } |
| |
| if (rem_cpus <= 0) |
| all_done = true; |
| |
| while (!all_done) { |
| all_done = true; |
| for (i = 0; |
| (node_ptr = next_node_bitmap(topo_eval->node_map, &i)); |
| i++) { |
| if (used_cpu_per_node[i] >= avail_cpu_per_node[i]) |
| continue; |
| |
| used_cpu_per_node[i] += cpus_per_task; |
| rem_max_cpus -= cpus_per_task; |
| rem_cpus -= cpus_per_task; |
| |
| if (rem_cpus <= 0) { |
| all_done = true; |
| break; |
| } else { |
| all_done = false; |
| } |
| } |
| } |
| if ((rem_cpus > 0 || |
| !gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) && |
| (topo_eval->max_nodes > 0) && |
| (prev_max_nodes != topo_eval->max_nodes)) { |
| if (!rem_nodes) |
| rem_nodes++; |
| prev_max_nodes = topo_eval->max_nodes; |
| all_done = false; |
| goto more_nodes; |
| } |
| rem_max_cpus = orig_rem_max_cpus; |
| rem_cpus = details_ptr->min_cpus; |
| min_rem_nodes = orig_min_rem_nodes; |
| for (i = 0; (node_ptr = next_node_bitmap(topo_eval->node_map, &i)); |
| i++) { |
| topo_eval->avail_cpus = |
| MAX(used_cpu_per_node[i], details_ptr->pn_min_cpus); |
| if (!eval_nodes_cpus_to_use(topo_eval, i, rem_max_cpus, |
| min_rem_nodes, &maxtasks, true)) { |
| bit_clear(topo_eval->node_map, i); |
| continue; |
| } |
| rem_cpus -= topo_eval->avail_cpus; |
| rem_max_cpus -= topo_eval->avail_cpus; |
| min_rem_nodes--; |
| } |
| |
| if ((rem_cpus > 0) || (min_rem_nodes > 0) || |
| !gres_sched_test(job_ptr->gres_list_req, job_ptr->job_id)) { |
| bit_clear_all(topo_eval->node_map); |
| error_code = SLURM_ERROR; |
| } else { |
| error_code = SLURM_SUCCESS; |
| } |
| |
| fini: |
| if (error_code == SLURM_SUCCESS) |
| eval_nodes_clip_socket_cores(topo_eval); |
| FREE_NULL_LIST(node_weight_list); |
| FREE_NULL_BITMAP(orig_node_map); |
| xfree(avail_cpu_per_node); |
| xfree(used_cpu_per_node); |
| return error_code; |
| } |
| |
| extern int eval_nodes(topology_eval_t *topo_eval) |
| { |
| job_details_t *details_ptr = topo_eval->job_ptr->details; |
| static bool pack_serial_at_end = false; |
| |
| static bool set = false; |
| |
| if (!set) { |
| if (xstrcasestr(slurm_conf.sched_params, "pack_serial_at_end")) |
| pack_serial_at_end = true; |
| else |
| pack_serial_at_end = false; |
| set = true; |
| } |
| |
| xassert(topo_eval->node_map); |
| if (bit_set_count(topo_eval->node_map) < topo_eval->min_nodes) |
| return SLURM_ERROR; |
| |
| if ((details_ptr->req_node_bitmap) && |
| (!bit_super_set(details_ptr->req_node_bitmap, topo_eval->node_map))) |
| return SLURM_ERROR; |
| |
| if (topo_eval->trump_others && topo_eval->eval_nodes) { |
| int rc = topo_eval->eval_nodes(topo_eval); |
| if (rc != ESLURM_NOT_SUPPORTED) |
| return rc; |
| } |
| |
| if (topo_eval->job_ptr->bit_flags & SPREAD_JOB) { |
| /* Spread the job out over many nodes */ |
| return _eval_nodes_spread(topo_eval); |
| } |
| |
| if (topo_eval->prefer_alloc_nodes && !details_ptr->contiguous) { |
| /* |
| * Select resource on busy nodes first in order to leave |
| * idle resources free for as long as possible so that longer |
| * running jobs can get more easily started by the backfill |
| * scheduler plugin |
| */ |
| return _eval_nodes_busy(topo_eval); |
| } |
| |
| if ((topo_eval->cr_type & SELECT_LLN) || |
| (topo_eval->job_ptr->part_ptr && |
| (topo_eval->job_ptr->part_ptr->flags & PART_FLAG_LLN))) { |
| /* Select resource on the Least Loaded Node */ |
| return _eval_nodes_lln(topo_eval); |
| } |
| |
| if (pack_serial_at_end && |
| (details_ptr->min_cpus == 1) && (topo_eval->req_nodes == 1)) { |
| /* |
| * Put serial jobs at the end of the available node list |
| * rather than using a best-fit algorithm, which fragments |
| * resources. |
| */ |
| return _eval_nodes_serial(topo_eval); |
| } |
| |
| if (topo_eval->eval_nodes) { |
| int rc = topo_eval->eval_nodes(topo_eval); |
| if (rc != ESLURM_NOT_SUPPORTED) |
| return rc; |
| } |
| |
| return _eval_nodes_consec(topo_eval); |
| } |
| |
/*
 * Limit topo_eval->avail_cpus on node node_inx so that enough of the
 * job's CPU budget (rem_max_cpus) is left to cover the remaining
 * rem_nodes nodes, then optionally validate per-job GRES on the node.
 *
 * Returns true if the node remains usable; false only when check_gres
 * is set and eval_nodes_gres() rejects the node.
 */
extern bool eval_nodes_cpus_to_use(topology_eval_t *topo_eval, int node_inx,
				   int64_t rem_max_cpus, int rem_nodes,
				   uint64_t *max_tasks, bool check_gres)
{
	job_record_t *job_ptr = topo_eval->job_ptr;
	job_details_t *details_ptr = job_ptr->details;
	avail_res_t *avail_res = topo_eval->avail_res_array[node_inx];
	int resv_cpus;	/* CPUs to be allocated on other nodes */

	/* Use all resources on node */
	if (details_ptr->whole_node & WHOLE_NODE_REQUIRED)
		goto check_gres_per_job;

	/* Reserve at least one allocation unit per remaining node */
	resv_cpus = MAX((rem_nodes - 1), 0);
	resv_cpus *= job_mgr_determine_cpus_per_core(details_ptr, node_inx);
	if (topo_eval->cr_type & SELECT_SOCKET)
		resv_cpus *= node_record_table_ptr[node_inx]->cores;
	rem_max_cpus -= resv_cpus;
	if (topo_eval->avail_cpus > rem_max_cpus) {
		/*
		 * Clamp to the remaining budget, but never below the
		 * per-node minimum nor the GRES-driven CPU minimum.
		 */
		topo_eval->avail_cpus = MAX(rem_max_cpus,
					    (int)details_ptr->pn_min_cpus);
		if (avail_res->gres_min_cpus)
			topo_eval->avail_cpus =
				MAX(topo_eval->avail_cpus,
				    avail_res->gres_min_cpus);
		else
			topo_eval->avail_cpus =
				MAX(topo_eval->avail_cpus,
				    details_ptr->min_gres_cpu);
		/* Round up CPU count to CPU in allocation unit (e.g. core) */
		avail_res->avail_cpus = topo_eval->avail_cpus;
	}
check_gres_per_job:
	if (check_gres && topo_eval->gres_per_job && topo_eval->avail_cpus) {
		node_record_t *node_ptr = node_record_table_ptr[node_inx];
		return eval_nodes_gres(topo_eval, max_tasks, job_ptr, node_ptr,
				       rem_nodes, node_inx, 0);
	}

	return true;
}
| |
/*
 * Determine how many CPUs on node node_inx can be used by this job and
 * store the result in topo_eval->avail_cpus (0 means the node is not
 * usable). Derives the min/max tasks the node can host from the job's
 * ntasks-per-* constraints, then lets the GRES layer filter the usable
 * sockets/cores when the job requests GRES.
 */
extern void eval_nodes_select_cores(topology_eval_t *topo_eval,
				    int node_inx, int rem_nodes)
{
	bitstr_t **avail_core = topo_eval->avail_core;
	uint16_t *avail_cpus = &topo_eval->avail_cpus;
	avail_res_t **avail_res_array = topo_eval->avail_res_array;
	uint16_t cr_type = topo_eval->cr_type;
	bool enforce_binding = topo_eval->enforce_binding;
	bool first_pass = topo_eval->first_pass;
	job_record_t *job_ptr = topo_eval->job_ptr;
	gres_mc_data_t *mc_ptr = topo_eval->mc_ptr;

	uint32_t min_tasks_this_node = 0, max_tasks_this_node = 0;
	uint32_t min_cores_this_node = 0;
	job_details_t *details_ptr = job_ptr->details;
	node_record_t *node_ptr = node_record_table_ptr[node_inx];

	xassert(mc_ptr->cpus_per_task);

	rem_nodes = MAX(rem_nodes, 1);	/* If range of node counts */
	/*
	 * Derive the task count bounds for this node from the most
	 * specific ntasks-per-* constraint present.
	 */
	if (mc_ptr->ntasks_per_node) {
		min_tasks_this_node = mc_ptr->ntasks_per_node;
		max_tasks_this_node = mc_ptr->ntasks_per_node;
	} else if (mc_ptr->ntasks_per_board) {
		min_tasks_this_node = mc_ptr->ntasks_per_board;
		max_tasks_this_node = mc_ptr->ntasks_per_board *
			node_ptr->boards;
	} else if (mc_ptr->ntasks_per_socket) {
		min_tasks_this_node = mc_ptr->ntasks_per_socket;
		max_tasks_this_node = mc_ptr->ntasks_per_socket *
			node_ptr->tot_sockets;
	} else if (mc_ptr->ntasks_per_core) {
		min_tasks_this_node = mc_ptr->ntasks_per_core;
		max_tasks_this_node = mc_ptr->ntasks_per_core *
			(node_ptr->tot_cores -
			 node_ptr->core_spec_cnt);
	} else if (details_ptr && details_ptr->ntasks_per_tres &&
		   (details_ptr->ntasks_per_tres != NO_VAL16)) {
		/* Node ranges not allowed with --ntasks-per-gpu */
		if ((details_ptr->min_nodes != NO_VAL) &&
		    (details_ptr->min_nodes != 0) &&
		    (details_ptr->min_nodes == details_ptr->max_nodes)) {
			min_tasks_this_node = details_ptr->num_tasks /
				details_ptr->min_nodes;
			max_tasks_this_node = min_tasks_this_node;
		} else {
			min_tasks_this_node = details_ptr->ntasks_per_tres;
			max_tasks_this_node = details_ptr->num_tasks;
		}
	} else if (details_ptr && (details_ptr->max_nodes == 1)) {
		/* Single-node job: all tasks must fit on this node */
		if ((details_ptr->num_tasks == NO_VAL) ||
		    (details_ptr->num_tasks == 0)) {
			min_tasks_this_node = 1;
			max_tasks_this_node = NO_VAL;
		} else {
			min_tasks_this_node = details_ptr->num_tasks;
			max_tasks_this_node = details_ptr->num_tasks;
		}
	} else if (details_ptr &&
		   ((details_ptr->num_tasks == 1) ||
		    ((details_ptr->num_tasks == details_ptr->min_nodes) &&
		     (details_ptr->num_tasks == details_ptr->max_nodes)))) {
		/* Exactly one task per node */
		min_tasks_this_node = 1;
		max_tasks_this_node = 1;
	} else {
		min_tasks_this_node = 1;
		max_tasks_this_node = NO_VAL;
	}
	/* Determine how many tasks can be started on this node */
	if ((!details_ptr || !details_ptr->overcommit)) {
		int alloc_tasks = avail_res_array[node_inx]->avail_cpus /
			mc_ptr->cpus_per_task;
		if (alloc_tasks < min_tasks_this_node)
			max_tasks_this_node = 0;
		else if ((max_tasks_this_node == NO_VAL) ||
			 (alloc_tasks < max_tasks_this_node))
			max_tasks_this_node = alloc_tasks;
	}

	*avail_cpus = avail_res_array[node_inx]->avail_cpus;
	/*
	 * _allocate_sc() filters available cpus and cores if the job does
	 * not request gres. If the job requests gres, _allocate_sc() defers
	 * filtering cpus and cores so that gres_select_filter_sock_core() can
	 * do it.
	 */
	if (job_ptr->gres_list_req) {
		foreach_gres_filter_sock_core_args_t args = {
			.job_ptr = job_ptr,
			.mc_ptr = mc_ptr,
			.sockets = avail_res_array[node_inx]->sock_cnt,
			.cores_per_socket = node_ptr->cores,
			.cpus_per_core = node_ptr->tpc,
			.avail_cpus = avail_cpus,
			.min_tasks_this_node = &min_tasks_this_node,
			.max_tasks_this_node = &max_tasks_this_node,
			.min_cores_this_node = &min_cores_this_node,
			.rem_nodes = rem_nodes,
			.enforce_binding = enforce_binding,
			.first_pass = first_pass,
			.avail_core = avail_core[node_inx],
			.node_name = node_record_table_ptr[node_inx]->name,
			.cr_type = cr_type,
			.res_cores_per_gpu = node_ptr->res_cores_per_gpu,
			.node_i = node_inx,
		};

		gres_filter_sock_core(
			avail_res_array[node_inx]->sock_gres_list,
			&avail_res_array[node_inx]->avail_cores_per_sock,
			&args);
	}
	if (max_tasks_this_node == 0) {
		/* Node can not host the minimum task count */
		*avail_cpus = 0;
	} else if ((slurm_conf.select_type_param & SELECT_ONE_TASK_PER_CORE) &&
		   ((mc_ptr->ntasks_per_core == INFINITE16) ||
		    (mc_ptr->ntasks_per_core == 0)) &&
		   details_ptr && (details_ptr->min_gres_cpu == 0)) {
		/* One task per core: count cores, not hardware threads */
		*avail_cpus = bit_set_count(avail_core[node_inx]);
	}
	/* Publish GRES-derived CPU/task bounds for later clamping */
	avail_res_array[node_inx]->gres_min_cpus =
		job_mgr_determine_cpus_per_core(job_ptr->details, node_inx) *
		min_cores_this_node;
	avail_res_array[node_inx]->min_cpus =
		avail_res_array[node_inx]->gres_min_cpus;
	avail_res_array[node_inx]->gres_max_tasks = max_tasks_this_node;
}
| |
| extern int64_t eval_nodes_get_rem_max_cpus( |
| job_details_t *details_ptr, int rem_nodes) |
| { |
| int64_t rem_max_cpus = details_ptr->min_cpus; |
| |
| if (details_ptr->max_cpus != NO_VAL) |
| rem_max_cpus = details_ptr->max_cpus; |
| if (details_ptr->min_gres_cpu) |
| rem_max_cpus = MAX(rem_max_cpus, |
| details_ptr->min_gres_cpu * rem_nodes); |
| if (details_ptr->min_job_gres_cpu) |
| rem_max_cpus = MAX(rem_max_cpus, details_ptr->min_job_gres_cpu); |
| |
| return rem_max_cpus; |
| |
| } |
| |
| extern int eval_nodes_topo_weight_find(void *x, void *key) |
| { |
| topo_weight_info_t *nw = x; |
| topo_weight_info_t *nw_key = key; |
| if (nw->weight == nw_key->weight) |
| return 1; |
| return 0; |
| } |
| |
| extern int eval_nodes_topo_node_find(void *x, void *key) |
| { |
| topo_weight_info_t *nw = x; |
| bitstr_t *nw_key = key; |
| if (bit_overlap_any(nw->node_bitmap, nw_key)) |
| return 1; |
| return 0; |
| } |
| |
| extern void eval_nodes_topo_weight_free(void *x) |
| { |
| topo_weight_info_t *nw = x; |
| FREE_NULL_BITMAP(nw->node_bitmap); |
| xfree(nw); |
| } |
| |
| extern int eval_nodes_topo_weight_log(void *x, void *arg) |
| { |
| topo_weight_info_t *nw = x; |
| char *node_names = bitmap2node_name(nw->node_bitmap); |
| info("Topo:%s weight:%"PRIu64, node_names, nw->weight); |
| xfree(node_names); |
| return 0; |
| } |
| |
| extern int eval_nodes_topo_weight_sort(void *x, void *y) |
| { |
| topo_weight_info_t *nwt1 = *(topo_weight_info_t **) x; |
| topo_weight_info_t *nwt2 = *(topo_weight_info_t **) y; |
| if (nwt1->weight < nwt2->weight) |
| return -1; |
| if (nwt1->weight > nwt2->weight) |
| return 1; |
| return 0; |
| } |
| |
| extern bool eval_nodes_enough_nodes(int avail_nodes, int rem_nodes, |
| uint32_t min_nodes, uint32_t req_nodes) |
| { |
| int needed_nodes; |
| |
| if (req_nodes > min_nodes) |
| needed_nodes = rem_nodes + min_nodes - req_nodes; |
| else |
| needed_nodes = rem_nodes; |
| |
| return (avail_nodes >= needed_nodes); |
| } |