|  | /*****************************************************************************\ | 
|  | *  common_topo.c - common functions for accounting storage | 
|  | ***************************************************************************** | 
|  | *  Copyright (C) SchedMD LLC. | 
|  | * | 
|  | *  This file is part of Slurm, a resource management program. | 
|  | *  For details, see <https://slurm.schedmd.com/>. | 
|  | *  Please also read the included file: DISCLAIMER. | 
|  | * | 
|  | *  Slurm is free software; you can redistribute it and/or modify it under | 
|  | *  the terms of the GNU General Public License as published by the Free | 
|  | *  Software Foundation; either version 2 of the License, or (at your option) | 
|  | *  any later version. | 
|  | * | 
|  | *  In addition, as a special exception, the copyright holders give permission | 
|  | *  to link the code of portions of this program with the OpenSSL library under | 
|  | *  certain conditions as described in each individual source file, and | 
|  | *  distribute linked combinations including the two. You must obey the GNU | 
|  | *  General Public License in all respects for all of the code used other than | 
|  | *  OpenSSL. If you modify file(s) with this exception, you may extend this | 
|  | *  exception to your version of the file(s), but you are not obligated to do | 
|  | *  so. If you do not wish to do so, delete this exception statement from your | 
|  | *  version.  If you delete this exception statement from all source files in | 
|  | *  the program, then also delete it here. | 
|  | * | 
|  | *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | *  details. | 
|  | * | 
|  | *  You should have received a copy of the GNU General Public License along | 
|  | *  with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | \*****************************************************************************/ | 
|  |  | 
|  | #include "src/common/slurm_xlator.h" | 
|  |  | 
|  | #include "common_topo.h" | 
|  | #include "eval_nodes.h" | 
|  |  | 
|  | #include "src/common/bitstring.h" | 
|  | #include "src/common/core_array.h" | 
|  | #include "src/common/forward.h" | 
|  | #include "src/common/hostlist.h" | 
|  | #include "src/common/node_conf.h" | 
|  | #include "src/common/read_config.h" | 
|  | #include "src/common/slurm_protocol_api.h" | 
|  | #include "src/common/xmalloc.h" | 
|  | #include "src/common/xstring.h" | 
|  |  | 
|  | #include "src/slurmctld/slurmctld.h" | 
|  | #include "src/slurmctld/locks.h" | 
|  |  | 
|  | /* | 
|  | * These are defined here so when we link with something other than | 
|  | * the slurmctld we will have these symbols defined. They will get | 
|  | * overwritten when linking with the slurmctld. | 
|  | */ | 
|  |  | 
|  | #if defined (__APPLE__) | 
|  | extern list_t *part_list __attribute__((weak_import)); | 
|  | extern bitstr_t *idle_node_bitmap __attribute__((weak_import)); | 
|  | #else | 
|  | list_t *part_list = NULL; | 
|  | bitstr_t *idle_node_bitmap; | 
|  | #endif | 
|  |  | 
|  | typedef struct { | 
|  | int *count; | 
|  | int depth; | 
|  | bitstr_t *fwd_bitmap; | 
|  | int msg_count; | 
|  | bitstr_t *nodes_bitmap; | 
|  | hostlist_t ***sp_hl; | 
|  | uint16_t tree_width; | 
|  | } _foreach_part_split_hostlist_t; | 
|  |  | 
|  | typedef struct { | 
|  | avail_res_t *avail_res; | 
|  | int node_inx; | 
|  | } _sort_choose_nodes_t; | 
|  |  | 
|  | static int _cmp_res(const void *x, const void *y) | 
|  | { | 
|  | const _sort_choose_nodes_t *r1 = x, *r2 = y; | 
|  |  | 
|  | if (r1->avail_res->avail_res_cnt > r2->avail_res->avail_res_cnt) | 
|  | return 1; | 
|  | else if (r1->avail_res->avail_res_cnt < r2->avail_res->avail_res_cnt) | 
|  | return -1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _part_split_hostlist(void *x, void *y) | 
|  | { | 
|  | part_record_t *part_ptr = x; | 
|  | _foreach_part_split_hostlist_t *arg = y; | 
|  | int fwd_count, hl_count, hl_depth; | 
|  | hostlist_t *hl, **p_hl; | 
|  | size_t new_size; | 
|  |  | 
|  | if (!bit_overlap_any(part_ptr->node_bitmap, arg->nodes_bitmap)) | 
|  | return 0; | 
|  |  | 
|  | COPY_BITMAP(arg->fwd_bitmap, part_ptr->node_bitmap); | 
|  |  | 
|  | /* Extract partition's hostlist and node count */ | 
|  | bit_and(arg->fwd_bitmap, arg->nodes_bitmap); | 
|  | bit_and_not(arg->nodes_bitmap, arg->fwd_bitmap); | 
|  | fwd_count = bit_set_count(arg->fwd_bitmap); | 
|  | hl = bitmap2hostlist(arg->fwd_bitmap); | 
|  |  | 
|  | /* Generate FW tree hostlist array from partition's hostlist */ | 
|  | hl_depth = hostlist_split_treewidth(hl, &p_hl, &hl_count, | 
|  | arg->tree_width); | 
|  | hostlist_destroy(hl); | 
|  |  | 
|  | /* Make size for FW tree hostlist array in the main hostlist array */ | 
|  | new_size = xsize(*arg->sp_hl) + hl_count * sizeof(hostlist_t *); | 
|  | xrealloc(*arg->sp_hl, new_size); | 
|  |  | 
|  | /* Append the FW tree hostlist array to the main hostlist array */ | 
|  | for (int i = 0; i < hl_count; i++) | 
|  | (*arg->sp_hl)[*arg->count + i] = p_hl[i]; | 
|  | xfree(p_hl); | 
|  | *arg->count += hl_count; | 
|  | arg->depth = MAX(arg->depth, hl_depth); | 
|  | arg->msg_count -= fwd_count; | 
|  |  | 
|  | if (arg->msg_count == 0) | 
|  | return -1; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int _route_part_split_hostlist(hostlist_t *hl, hostlist_t ***sp_hl, | 
|  | int *count, uint16_t tree_width) | 
|  | { | 
|  | slurmctld_lock_t node_read_lock = { | 
|  | .node = READ_LOCK, | 
|  | .part = READ_LOCK, | 
|  | }; | 
|  | bitstr_t *nodes_bitmap = NULL; | 
|  | _foreach_part_split_hostlist_t part_split; | 
|  |  | 
|  | xassert(running_in_slurmctld()); | 
|  |  | 
|  | lock_slurmctld(node_read_lock); | 
|  | /* create bitmap of nodes to send message too */ | 
|  | if (hostlist2bitmap(hl, false, &nodes_bitmap) != SLURM_SUCCESS) { | 
|  | char *buf = hostlist_ranged_string_xmalloc(hl); | 
|  | fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf); | 
|  | } | 
|  |  | 
|  | *sp_hl = xcalloc(list_count(part_list), sizeof(hostlist_t *)); | 
|  | *count = 0; | 
|  |  | 
|  | part_split = (_foreach_part_split_hostlist_t) { | 
|  | .count = count, | 
|  | .depth = 0, | 
|  | .fwd_bitmap = NULL, | 
|  | .msg_count = hostlist_count(hl), | 
|  | .nodes_bitmap = nodes_bitmap, | 
|  | .sp_hl = sp_hl, | 
|  | .tree_width = tree_width, | 
|  | }; | 
|  |  | 
|  | list_for_each_ro(part_list, _part_split_hostlist, &part_split); | 
|  |  | 
|  | FREE_NULL_BITMAP(part_split.fwd_bitmap); | 
|  |  | 
|  | xassert(part_split.msg_count == bit_set_count(nodes_bitmap)); | 
|  | if (part_split.msg_count) { | 
|  | size_t new_size = *count * sizeof(hostlist_t *); | 
|  | node_record_t *node_ptr; | 
|  |  | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_ROUTE) { | 
|  | char *buf = bitmap2node_name(nodes_bitmap); | 
|  | log_flag(ROUTE, "didn't find partition containing nodes=%s", | 
|  | buf); | 
|  | xfree(buf); | 
|  | } | 
|  | new_size += part_split.msg_count * sizeof(hostlist_t *); | 
|  | xrealloc(*sp_hl, new_size); | 
|  |  | 
|  | for (int i = 0; | 
|  | (node_ptr = next_node_bitmap(nodes_bitmap, &i)); | 
|  | i++) { | 
|  | (*sp_hl)[*count] = hostlist_create(NULL); | 
|  | hostlist_push_host((*sp_hl)[*count], node_ptr->name); | 
|  | (*count)++; | 
|  | } | 
|  | part_split.depth = MAX(part_split.depth, 1); | 
|  | } | 
|  |  | 
|  | if (slurm_conf.debug_flags & DEBUG_FLAG_ROUTE) { | 
|  | char *hl_str = hostlist_ranged_string_xmalloc(hl); | 
|  | log_flag(ROUTE, "hl: %s", hl_str); | 
|  | xfree(hl_str); | 
|  | for (int i = 0; i < *count; i++) { | 
|  | char *nodes = | 
|  | hostlist_ranged_string_xmalloc((*sp_hl)[i]); | 
|  | log_flag(ROUTE, "sp_hl[%d]: %s", i, nodes); | 
|  | xfree(nodes); | 
|  | } | 
|  | } | 
|  |  | 
|  | unlock_slurmctld(node_read_lock); | 
|  |  | 
|  | FREE_NULL_BITMAP(nodes_bitmap); | 
|  | FREE_NULL_BITMAP(part_split.fwd_bitmap); | 
|  |  | 
|  | return part_split.depth; | 
|  | } | 
|  |  | 
|  | extern int common_topo_split_hostlist_treewidth(hostlist_t *hl, | 
|  | hostlist_t ***sp_hl, | 
|  | int *count, uint16_t tree_width) | 
|  | { | 
|  | if (running_in_slurmctld() && common_topo_route_part()) | 
|  | return _route_part_split_hostlist(hl, sp_hl, count, tree_width); | 
|  |  | 
|  | return hostlist_split_treewidth(hl, sp_hl, count, tree_width); | 
|  | } | 
|  |  | 
|  | extern int common_topo_get_node_addr(char *node_name, char **addr, | 
|  | char **pattern) | 
|  | { | 
|  | if (find_node_record(node_name) == NULL) | 
|  | return SLURM_ERROR; | 
|  |  | 
|  | *addr = xstrdup(node_name); | 
|  | *pattern = xstrdup("node"); | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | extern bool common_topo_route_tree(void) | 
|  | { | 
|  | static int route_tree = -1; | 
|  | if (route_tree == -1) { | 
|  | if (xstrcasestr(slurm_conf.topology_param, "routetree")) | 
|  | route_tree = true; | 
|  | else | 
|  | route_tree = false; | 
|  | } | 
|  |  | 
|  | return route_tree; | 
|  | } | 
|  |  | 
|  | extern bool common_topo_route_part(void) | 
|  | { | 
|  | static int route_part = -1; | 
|  | if (route_part == -1) { | 
|  | if (xstrcasestr(slurm_conf.topology_param, "routepart")) | 
|  | route_part = true; | 
|  | else | 
|  | route_part = false; | 
|  | } | 
|  |  | 
|  | return route_part; | 
|  | } | 
|  |  | 
|  | extern int common_topo_choose_nodes(topology_eval_t *topo_eval) | 
|  | { | 
|  | avail_res_t **avail_res_array = topo_eval->avail_res_array; | 
|  | job_record_t *job_ptr = topo_eval->job_ptr; | 
|  |  | 
|  | int ec; | 
|  | bitstr_t *orig_node_map, *req_node_map = NULL; | 
|  | bitstr_t **orig_core_array; | 
|  | int rem_nodes; | 
|  | uint32_t orig_max_nodes = topo_eval->max_nodes; | 
|  | _sort_choose_nodes_t *sorted_res = NULL; | 
|  | int res_cnt = 0, idx = 0; | 
|  |  | 
|  | if (job_ptr->details->req_node_bitmap) | 
|  | req_node_map = job_ptr->details->req_node_bitmap; | 
|  |  | 
|  | /* clear nodes from the bitmap that don't have available resources */ | 
|  | for (int i = 0; next_node_bitmap(topo_eval->node_map, &i); i++) { | 
|  | /* | 
|  | * Make sure we don't say we can use a node exclusively | 
|  | * that is bigger than our whole-job maximum CPU count. | 
|  | */ | 
|  | if (((job_ptr->details->whole_node & WHOLE_NODE_REQUIRED) && | 
|  | (job_ptr->details->max_cpus != NO_VAL) && | 
|  | (job_ptr->details->max_cpus < | 
|  |  | 
|  | avail_res_array[i]->avail_cpus)) || | 
|  | /* OR node has no CPUs */ | 
|  | (avail_res_array[i]->avail_cpus < 1)) { | 
|  | if (req_node_map && bit_test(req_node_map, i)) { | 
|  | /* can't clear a required node! */ | 
|  | return SLURM_ERROR; | 
|  | } | 
|  | bit_clear(topo_eval->node_map, i); | 
|  | } | 
|  | } | 
|  |  | 
|  | if (job_ptr->details->num_tasks && | 
|  | !(job_ptr->details->ntasks_per_node) && | 
|  | (topo_eval->max_nodes > job_ptr->details->num_tasks)) | 
|  | topo_eval->max_nodes = | 
|  | MAX(job_ptr->details->num_tasks, topo_eval->min_nodes); | 
|  |  | 
|  | /* | 
|  | * common_topo_eval_nodes() might need to be called more than once and | 
|  | * is destructive of node_map and avail_core. Copy those bitmaps. | 
|  | */ | 
|  | orig_node_map = bit_copy(topo_eval->node_map); | 
|  | orig_core_array = copy_core_array(topo_eval->avail_core); | 
|  |  | 
|  | topo_eval->first_pass = true; | 
|  |  | 
|  | ec = eval_nodes(topo_eval); | 
|  | if (ec == SLURM_SUCCESS) | 
|  | goto fini; | 
|  |  | 
|  | /* | 
|  | * This nodeset didn't work. To avoid a possible knapsack problem, | 
|  | * incrementally remove nodes with low resource counts (sum of CPU and | 
|  | * GPU count if using GPUs, otherwise the CPU count) and retry | 
|  | */ | 
|  | topo_eval->first_pass = false; | 
|  | rem_nodes = bit_set_count(orig_node_map); | 
|  |  | 
|  | /* | 
|  | * Perform first eval_nodes() with first_pass = false and then start | 
|  | * removing nodes. | 
|  | */ | 
|  | do { | 
|  | topo_eval->max_nodes = orig_max_nodes; | 
|  | bit_copybits(topo_eval->node_map, orig_node_map); | 
|  | core_array_or(topo_eval->avail_core, orig_core_array); | 
|  |  | 
|  | ec = eval_nodes(topo_eval); | 
|  |  | 
|  | if (ec == SLURM_SUCCESS) | 
|  | break; | 
|  | if (rem_nodes <= topo_eval->min_nodes) | 
|  | break; | 
|  |  | 
|  | if (!sorted_res) { | 
|  | sorted_res = xcalloc(rem_nodes, sizeof(*sorted_res)); | 
|  | for (int i = 0; next_node_bitmap(orig_node_map, &i); | 
|  | i++) { | 
|  | if (avail_res_array[i] && | 
|  | !(req_node_map && | 
|  | bit_test(req_node_map, i))) { | 
|  | sorted_res[res_cnt].node_inx = i; | 
|  | sorted_res[res_cnt].avail_res = | 
|  | avail_res_array[i]; | 
|  | res_cnt++; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (!res_cnt) | 
|  | break; | 
|  |  | 
|  | qsort(sorted_res, res_cnt, sizeof(*sorted_res), | 
|  | _cmp_res); | 
|  | } | 
|  |  | 
|  | bit_clear(orig_node_map, sorted_res[idx].node_inx); | 
|  | --rem_nodes; | 
|  | idx++; | 
|  | } while (idx < res_cnt); | 
|  |  | 
|  | fini:	if ((ec == SLURM_SUCCESS) && job_ptr->gres_list_req && | 
|  | orig_core_array) { | 
|  | /* | 
|  | * Update available CPU count for any removed cores. | 
|  | * Cores are only removed for jobs with GRES to enforce binding. | 
|  | */ | 
|  | for (int i = 0; next_node_bitmap(topo_eval->node_map, &i); | 
|  | i++) { | 
|  | int count; | 
|  | if (!orig_core_array[i] || !topo_eval->avail_core[i]) | 
|  | continue; | 
|  | count = bit_set_count(topo_eval->avail_core[i]); | 
|  | count *= node_record_table_ptr[i]->tpc; | 
|  | avail_res_array[i]->avail_cpus = | 
|  | MIN(count, avail_res_array[i]->avail_cpus); | 
|  | if (avail_res_array[i]->avail_cpus == 0) { | 
|  | error("avail_cpus underflow for %pJ", | 
|  | job_ptr); | 
|  | if (req_node_map && bit_test(req_node_map, i)) { | 
|  | /* can't clear a required node! */ | 
|  | ec = SLURM_ERROR; | 
|  | } | 
|  | bit_clear(topo_eval->node_map, i); | 
|  | } | 
|  | } | 
|  | } | 
|  | FREE_NULL_BITMAP(orig_node_map); | 
|  | free_core_array(&orig_core_array); | 
|  | xfree(sorted_res); | 
|  | return ec; | 
|  | } |