| /*****************************************************************************\ |
| * dist_tasks.c - Assign task count for each resource. |
| ***************************************************************************** |
| * Copyright (C) SchedMD LLC. |
| * Derived in large part from select/cons_res plugin |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "gres_select_util.h" |
| #include "select_cons_tres.h" |
| #include "dist_tasks.h" |
| |
/* Max boards supported for best-fit across boards */
/* Larger board configurations may require new algorithm */
/* for acceptable performance */
#define MAX_BOARDS 8

/* Combination counts
 * comb_counts[n-1][k-1] = number of combinations of
 * k items from a set of n items
 *
 * Formula is n!/k!(n-k)!
 */
static uint32_t comb_counts[MAX_BOARDS][MAX_BOARDS] =
	{{1,0,0,0,0,0,0,0},
	 {2,1,0,0,0,0,0,0},
	 {3,3,1,0,0,0,0,0},
	 {4,6,4,1,0,0,0,0},
	 {5,10,10,5,1,0,0,0},
	 {6,15,20,15,6,1,0,0},
	 {7,21,35,35,21,7,1,0},
	 {8,28,56,70,56,28,8,1}};

/* Per-socket available core counts for the node currently being processed
 * in _block_sync_core_bitmap(); module-global because the _cmp_sock() qsort
 * comparator needs to read it */
static int *sockets_core_cnt = NULL;
| |
| /* |
| * Generate all combinations of k integers from the |
| * set of integers 0 to n-1. |
| * Return combinations in comb_list. |
| * |
| * Example: For k = 2 and n = 4, there are six |
| * combinations: |
| * {0,1},{0,2},{0,3},{1,2},{1,3},{2,3} |
| * |
| */ |
| static void _gen_combs(int *comb_list, int n, int k) |
| { |
| int i, b; |
| int *comb = xmalloc(k * sizeof(int)); |
| |
| /* Setup comb for the initial combination */ |
| for (i = 0; i < k; i++) |
| comb[i] = i; |
| b = 0; |
| |
| /* Generate all the other combinations */ |
| while (1) { |
| for (i = 0; i < k; i++) { |
| comb_list[b + i] = comb[i]; |
| } |
| b += k; |
| i = k - 1; |
| ++comb[i]; |
| while ((i >= 0) && (comb[i] >= n - k + 1 + i)) { |
| --i; |
| ++comb[i]; |
| } |
| |
| if (comb[0] > n - k) |
| break; /* No more combinations */ |
| |
| for (i = i + 1; i < k; ++i) |
| comb[i] = comb[i - 1] + 1; |
| } |
| xfree(comb); |
| } |
| |
| /* qsort compare function for board combination socket list |
| * NOTE: sockets_core_cnt is a global symbol in this module */ |
| static int _cmp_sock(const void *a, const void *b) |
| { |
| return slurm_sort_int_list_desc(&sockets_core_cnt[*((int *) a)], |
| &sockets_core_cnt[*((int *) b)]); |
| } |
| |
| /* Enable detailed logging of cr_dist() node and core bitmaps */ |
| static inline void _log_select_maps(char *loc, job_record_t *job_ptr) |
| { |
| job_resources_t *job_res = job_ptr->job_resrcs; |
| char tmp[100]; |
| int i; |
| |
| if (!(slurm_conf.debug_flags & DEBUG_FLAG_SELECT_TYPE)) |
| return; |
| |
| info("%s %pJ", loc, job_ptr); |
| if (job_res->node_bitmap) { |
| bit_fmt(tmp, sizeof(tmp), job_res->node_bitmap); |
| info(" node_bitmap:%s", tmp); |
| } |
| if (job_res->core_bitmap) { |
| bit_fmt(tmp, sizeof(tmp), job_res->core_bitmap); |
| info(" core_bitmap:%s", tmp); |
| } |
| if (job_res->cpus) { |
| for (i = 0; i < job_res->nhosts; i++) { |
| info(" avail_cpus[%d]:%u", i, |
| job_res->cpus[i]); |
| } |
| } |
| if (job_res->tasks_per_node) { |
| for (i = 0; i < job_res->nhosts; i++) { |
| info(" tasks_per_node[%d]:%u", i, |
| job_res->tasks_per_node[i]); |
| } |
| } |
| } |
| |
| /* Remove any specialized cores from those allocated to the job */ |
| static void _clear_spec_cores(job_record_t *job_ptr, |
| bitstr_t **core_array) |
| { |
| int first_core, last_core; |
| int alloc_node = -1, alloc_core = -1, c; |
| job_resources_t *job_res = job_ptr->job_resrcs; |
| multi_core_data_t *mc_ptr = NULL; |
| bitstr_t *use_core_array = NULL; |
| node_record_t *node_ptr; |
| |
| if (job_ptr->details && job_ptr->details->mc_ptr) |
| mc_ptr = job_ptr->details->mc_ptr; |
| |
| bit_set_all(job_res->core_bitmap); |
| |
| for (int i = 0; |
| (node_ptr = next_node_bitmap(job_res->node_bitmap, &i)); i++) { |
| job_res->cpus[++alloc_node] = 0; |
| |
| first_core = 0; |
| last_core = node_ptr->tot_cores; |
| use_core_array = core_array[i]; |
| |
| for (c = first_core; c < last_core; c++) { |
| alloc_core++; |
| if (bit_test(use_core_array, c)) { |
| uint16_t tpc = node_ptr->tpc; |
| if (mc_ptr && |
| (mc_ptr->threads_per_core != NO_VAL16) && |
| (mc_ptr->threads_per_core < tpc)) |
| tpc = mc_ptr->threads_per_core; |
| |
| job_res->cpus[alloc_node] += tpc; |
| } else { |
| bit_clear(job_res->core_bitmap, alloc_core); |
| } |
| } |
| } |
| } |
| |
| static int _get_task_count(job_record_t *job_ptr) |
| { |
| uint32_t maxtasks; |
| |
| /* |
| * Here we need to check to know if the num_tasks here were from the |
| * user or from us. As if the user requested a range of nodes we |
| * originally calculate off min_nodes if ntasks_per_node is given we |
| * will not have the right num_tasks, so recalculate. |
| */ |
| if (job_ptr->details->ntasks_per_node) { |
| maxtasks = job_ptr->details->ntasks_per_node * |
| job_ptr->job_resrcs->nhosts; |
| } else if (job_ptr->details->num_tasks && |
| (job_ptr->bit_flags & JOB_NTASKS_SET)) { |
| maxtasks = job_ptr->details->num_tasks; |
| } else { |
| maxtasks = job_ptr->job_resrcs->ncpus; |
| if (job_ptr->details->cpus_per_task > 1) |
| maxtasks /= job_ptr->details->cpus_per_task; |
| } |
| |
| return maxtasks; |
| } |
| |
/* CPUs already selected for jobs, just distribute the tasks */
static int _set_task_dist_internal(job_record_t *job_ptr)
{
	uint32_t n, i, tid = 0, maxtasks;
	uint16_t *avail_cpus;
	job_resources_t *job_res = job_ptr->job_resrcs;
	char *err_msg = NULL;
	int rc = SLURM_SUCCESS, plane_size = 1;

	/* Validate the job's resource allocation before using it */
	if (!job_res)
		err_msg = "job_res is NULL";
	else if (!job_res->cpus)
		err_msg = "job_res->cpus is NULL";
	else if (!job_res->nhosts)
		err_msg = "job_res->nhosts is zero";
	if (err_msg) {
		error("Invalid allocation for %pJ: %s",
		      job_ptr, err_msg);
		return SLURM_ERROR;
	}

	/* For plane distribution, pick up the requested plane size */
	if ((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) ==
	    SLURM_DIST_PLANE) {
		if (job_ptr->details->mc_ptr)
			plane_size = job_ptr->details->mc_ptr->plane_size;
		if (plane_size <= 0) {
			error("invalid plane_size");
			return SLURM_ERROR;
		}
	}

	/* Work on a scratch copy of the per-node CPU counts */
	i = job_res->nhosts * sizeof(uint16_t);
	avail_cpus = xmalloc(i);
	memcpy(avail_cpus, job_res->cpus, i);
	job_res->tasks_per_node = xmalloc(i);
	maxtasks = _get_task_count(job_ptr);

	/*
	 * Safeguard for the case where the user requested fewer CPUs
	 * than cpus_per_task, or did not specify a task count.
	 */
	if (!maxtasks) {
		error("changing task count from 0 to 1 for %pJ",
		      job_ptr);
		maxtasks = 1;
	}
	if (job_ptr->details->cpus_per_task == 0)
		job_ptr->details->cpus_per_task = 1;

	/* First put one task on each node */
	for (n = 0; n < job_res->nhosts; n++) {
		tid++;
		job_res->tasks_per_node[n] = 1;
		if (job_ptr->details->cpus_per_task > avail_cpus[n]) {
			/* With overcommit this shortfall is expected */
			if (!job_ptr->details->overcommit) {
				error("avail_cpus underflow on node %d for %pJ",
				      n, job_ptr);
			}
			avail_cpus[n] = 0;
		} else {
			avail_cpus[n] -= job_ptr->details->cpus_per_task;
		}
	}

	/* Distribute remaining tasks per plane size */
	while (maxtasks > tid) {
		uint32_t last_tid = tid;
		for (n = 0; n < job_res->nhosts; n++) {
			if (job_ptr->details->cpus_per_task > avail_cpus[n])
				continue;
			/*
			 * Tasks to add this pass: (tasks_per_node %
			 * plane_size), at least 1, capped by the node's
			 * remaining CPUs and by the unassigned task count.
			 */
			i = MAX(job_res->tasks_per_node[n] % plane_size, 1);
			i = MIN(i,
				avail_cpus[n] / job_ptr->details->cpus_per_task);
			i = MIN(i, maxtasks - tid);
			job_res->tasks_per_node[n] += i;
			tid += i;
			avail_cpus[n] -= (i * job_ptr->details->cpus_per_task);
		}
		/* No progress in a full pass: no node can take more tasks */
		if (last_tid == tid)
			break;
	}

	/* Could not place every task within the allocated CPUs */
	if (maxtasks > tid)
		rc = ESLURM_BAD_TASK_COUNT;
	xfree(avail_cpus);

	return rc;
}
| |
| static int _set_task_dist(job_record_t *job_ptr, const uint16_t cr_type) |
| { |
| int error_code = _set_task_dist_internal(job_ptr); |
| |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| |
| /* |
| * If we are asking for less threads per core than there are on the node |
| * we need to adjust for that for accounting. |
| * This will be reversed for getting the correct memory in |
| * cons_helpers.c _job_test() look for 'save_mem & MEM_PER_CPU'. |
| */ |
| if (job_ptr->job_resrcs && |
| (job_ptr->details->mc_ptr->threads_per_core != NO_VAL16) && |
| ((cr_type & SELECT_CORE) || (cr_type & SELECT_SOCKET))) { |
| job_resources_t *job_res = job_ptr->job_resrcs; |
| node_record_t *node_ptr; |
| int i = 0; |
| |
| if (!bit_set_count(job_res->node_bitmap)) |
| return SLURM_ERROR; |
| |
| for (int n = 0; |
| (node_ptr = next_node_bitmap(job_res->node_bitmap, &n)); |
| n++) { |
| if (job_ptr->details->mc_ptr->threads_per_core == |
| node_ptr->tpc) |
| continue; |
| job_res->cpus[i++] *= node_ptr->tpc; |
| } |
| } |
| return SLURM_SUCCESS; |
| } |
| |
/* distribute blocks (planes) of tasks cyclically */
static int _compute_plane_dist(job_record_t *job_ptr, uint32_t *gres_task_limit,
			       uint32_t *gres_min_cpus)
{
	bool do_gres_min_cpus = false;
	uint32_t n, i, p, tid, maxtasks, l;
	uint16_t *avail_cpus, plane_size = 1;
	job_resources_t *job_res = job_ptr->job_resrcs;
	bool test_tres_tasks = true;
	int rc = SLURM_SUCCESS;

	if (!job_res || !job_res->cpus || !job_res->nhosts) {
		error("invalid allocation for %pJ",
		      job_ptr);
		return SLURM_ERROR;
	}

	maxtasks = _get_task_count(job_ptr);
	/*
	 * Keep the previously selected per-node CPU counts as the upper
	 * bound (avail_cpus); job_res->cpus is rebuilt from zero below as
	 * tasks are placed, and the old array is freed at the end.
	 */
	avail_cpus = job_res->cpus;

	if (job_ptr->details->mc_ptr)
		plane_size = job_ptr->details->mc_ptr->plane_size;
	if (plane_size <= 0) {	/* plane_size is unsigned; only 0 possible */
		error("invalid plane_size");
		return SLURM_ERROR;
	}

	job_res->cpus = xcalloc(job_res->nhosts, sizeof(uint16_t));
	job_res->tasks_per_node = xcalloc(job_res->nhosts, sizeof(uint16_t));
	/* Repeated cyclic passes over the nodes until all tasks placed */
	for (tid = 0, i = 0; (tid < maxtasks); i++) { /* cycle counter */
		bool space_remaining = false;
		for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
			bool more_tres_tasks = false;
			/* Place up to plane_size tasks on node n this pass */
			for (p = 0; p < plane_size && (tid < maxtasks); p++) {
				/* Honor per-node GRES task limit while any
				 * node can still take GRES-bound tasks */
				if (test_tres_tasks &&
				    !dist_tasks_tres_tasks_avail(
					    gres_task_limit, job_res, n))
					continue;
				more_tres_tasks = true;
				if ((job_res->cpus[n] < avail_cpus[n])) {
					if (gres_min_cpus[n])
						do_gres_min_cpus = true;
					tid++;
					job_res->tasks_per_node[n]++;
					/* Charge cpus_per_task CPUs, capped
					 * at the node's available count */
					for (l = 0;
					     l < job_ptr->details->cpus_per_task;
					     l++) {
						if (job_res->cpus[n] <
						    avail_cpus[n])
							job_res->cpus[n]++;
					}
				}
			}
			if (!more_tres_tasks)
				test_tres_tasks = false;
			if (job_res->cpus[n] < avail_cpus[n])
				space_remaining = true;
		}
		if (!space_remaining && (tid < maxtasks)) {
			/*
			 * If gres_task_limit is not associated with
			 * gres_per_task, it is a soft limit.
			 */
			if (gres_task_limit &&
			    !gres_select_util_job_tres_per_task(
				    job_ptr->gres_list_req)) {
				/* Try again without limit */
				gres_task_limit = NULL;
			} else {
				rc = ESLURM_BAD_TASK_COUNT;
				break;
			}
		}
	}
	if (do_gres_min_cpus)
		dist_tasks_gres_min_cpus(job_ptr, avail_cpus, gres_min_cpus);
	/* avail_cpus holds the superseded cpus array; release it */
	xfree(avail_cpus);
	return rc;
}
| |
| /* |
| * sync up core bitmap arrays with job_resources_t struct using a best-fit |
| * approach on the available resources on each node |
| * |
| * "Best-fit" means: |
| * 1st priority: Use smallest number of boards with sufficient |
| * available resources |
| * 2nd priority: Use smallest number of sockets with sufficient |
| * available resources |
| * 3rd priority: Use board combination with the smallest number |
| * of available resources |
| * 4th priority: Use higher-numbered boards/sockets/cores first |
| * |
| * The job_resources_t struct can include threads based upon configuration |
| */ |
| static void _block_sync_core_bitmap(job_record_t *job_ptr, |
| const uint16_t cr_type) |
| { |
| uint32_t c, s, i, j, b, z, csize, core_cnt; |
| int n, n_first, n_last; |
| uint16_t cpus, num_bits, vpus = 1; |
| uint16_t cpus_per_task = job_ptr->details->cpus_per_task; |
| job_resources_t *job_res = job_ptr->job_resrcs; |
| bool alloc_cores = false, alloc_sockets = false; |
| uint16_t ntasks_per_core = INFINITE16; |
| int tmp_cpt = 0; |
| int count, core_min, b_min, elig, s_min, comb_idx, sock_idx; |
| int elig_idx, comb_brd_idx, sock_list_idx, comb_min, board_num; |
| int sock_per_comb; |
| int *boards_core_cnt; |
| int *sort_brds_core_cnt; |
| int *board_combs; |
| int *socket_list; |
| int *elig_brd_combs; |
| int *elig_core_cnt; |
| bool *sockets_used; |
| uint16_t boards_nb; |
| uint16_t nboards_nb; |
| uint16_t sockets_nb; |
| uint16_t ncores_nb; |
| uint16_t nsockets_nb; |
| uint16_t sock_per_brd; |
| uint16_t req_cores,best_fit_cores = 0; |
| uint32_t best_fit_location = 0; |
| uint64_t ncomb_brd; |
| bool sufficient, best_fit_sufficient; |
| |
| if (!job_res) |
| return; |
| if (!job_res->core_bitmap) { |
| error("core_bitmap for %pJ is NULL", |
| job_ptr); |
| return; |
| } |
| if (bit_ffs(job_res->core_bitmap) == -1) { |
| error("core_bitmap for %pJ has no bits set", |
| job_ptr); |
| return; |
| } |
| |
| n_first = bit_ffs(job_res->node_bitmap); |
| if (n_first != -1) { |
| n_last = bit_fls(job_res->node_bitmap); |
| sockets_nb = node_record_table_ptr[n_first]->tot_sockets; |
| sockets_core_cnt = xcalloc(sockets_nb, sizeof(int)); |
| sockets_used = xcalloc(sockets_nb, sizeof(bool)); |
| boards_nb = node_record_table_ptr[n_first]->boards; |
| boards_core_cnt = xcalloc(boards_nb, sizeof(int)); |
| sort_brds_core_cnt = xcalloc(boards_nb, sizeof(int)); |
| } else |
| return; |
| |
| if (cr_type & SELECT_SOCKET) |
| alloc_sockets = true; |
| else if (cr_type & SELECT_CORE) |
| alloc_cores = true; |
| |
| if (job_ptr->details->mc_ptr) { |
| multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr; |
| if ((mc_ptr->ntasks_per_core != INFINITE16) && |
| (mc_ptr->ntasks_per_core)) { |
| ntasks_per_core = mc_ptr->ntasks_per_core; |
| } |
| } |
| |
| csize = bit_size(job_res->core_bitmap); |
| |
| |
| for (c = 0, i = 0, n = n_first; n <= n_last; n++) { |
| if (!bit_test(job_res->node_bitmap, n)) |
| continue; |
| |
| core_cnt = 0; |
| ncores_nb = node_record_table_ptr[n]->cores; |
| nsockets_nb = node_record_table_ptr[n]->tot_sockets; |
| nboards_nb = node_record_table_ptr[n]->boards; |
| num_bits = nsockets_nb * ncores_nb; |
| |
| if ((c + num_bits) > csize) { |
| error("index error"); |
| break; |
| } |
| |
| cpus = job_res->cpus[i]; |
| vpus = job_mgr_determine_cpus_per_core(job_ptr->details, n); |
| |
| /* compute still required cores on the node */ |
| req_cores = cpus / vpus; |
| if (cpus % vpus) |
| req_cores++; |
| |
| /* |
| * figure out core cnt if task requires more than one core and |
| * tasks_per_core is 1 |
| */ |
| if ((ntasks_per_core == 1) && |
| (cpus_per_task > vpus)) { |
| /* how many cores a task will consume */ |
| int cores_per_task = ROUNDUP(cpus_per_task, vpus); |
| int tasks = cpus / cpus_per_task; |
| req_cores = tasks * cores_per_task; |
| } |
| |
| if (nboards_nb > MAX_BOARDS) { |
| info("node[%u]: exceeds max boards(%d); doing best-fit across sockets only", |
| n, MAX_BOARDS); |
| nboards_nb = 1; |
| } |
| |
| if (nsockets_nb > sockets_nb) { |
| sockets_nb = nsockets_nb; |
| xrecalloc(sockets_core_cnt, sockets_nb, sizeof(int)); |
| xrecalloc(sockets_used, sockets_nb, sizeof(bool)); |
| } |
| |
| if (nboards_nb > boards_nb) { |
| boards_nb = nboards_nb; |
| xrecalloc(boards_core_cnt, boards_nb, sizeof(int)); |
| xrecalloc(sort_brds_core_cnt, boards_nb, sizeof(int)); |
| } |
| |
| /* Count available cores on each socket and board */ |
| sock_per_brd = nsockets_nb / nboards_nb; |
| |
| for (b = 0; b < nboards_nb; b++) { |
| boards_core_cnt[b] = 0; |
| sort_brds_core_cnt[b] = 0; |
| } |
| for (s = 0; s < nsockets_nb; s++) { |
| sockets_core_cnt[s] = 0; |
| sockets_used[s] = false; |
| b = s / sock_per_brd; |
| for (j = c + (s * ncores_nb); |
| j < c + ((s+1) * ncores_nb); j++) { |
| if (bit_test(job_res->core_bitmap, j)) { |
| sockets_core_cnt[s]++; |
| boards_core_cnt[b]++; |
| sort_brds_core_cnt[b]++; |
| } |
| } |
| } |
| |
| /* Sort boards in descending order of available core count */ |
| qsort(sort_brds_core_cnt, nboards_nb, sizeof(int), |
| slurm_sort_int_list_desc); |
| /* |
| * Determine minimum number of boards required for the |
| * allocation (b_min) |
| */ |
| count = 0; |
| for (b = 0; b < nboards_nb; b++) { |
| count += sort_brds_core_cnt[b]; |
| if (count >= req_cores) |
| break; |
| } |
| b_min = b + 1; |
| if (b_min > nboards_nb) { |
| char core_str[64]; |
| bit_fmt(core_str, 64, job_res->core_bitmap); |
| error("b_min > nboards_nb (%d > %u) node:%s core_bitmap:%s", |
| b_min, nboards_nb, |
| node_record_table_ptr[n]->name, core_str); |
| break; |
| } |
| sock_per_comb = b_min * sock_per_brd; |
| |
| /* Allocate space for list of board combinations */ |
| ncomb_brd = comb_counts[nboards_nb-1][b_min-1]; |
| board_combs = xcalloc(ncomb_brd * b_min, sizeof(int)); |
| /* Generate all combinations of b_min boards on the node */ |
| _gen_combs(board_combs, nboards_nb, b_min); |
| |
| /* |
| * Determine which combinations have enough available cores |
| * for the allocation (eligible board combinations) |
| */ |
| elig_brd_combs = xcalloc(ncomb_brd, sizeof(int)); |
| elig_core_cnt = xcalloc(ncomb_brd, sizeof(int)); |
| elig = 0; |
| for (comb_idx = 0; comb_idx < ncomb_brd; comb_idx++) { |
| count = 0; |
| for (comb_brd_idx = 0; comb_brd_idx < b_min; |
| comb_brd_idx++) { |
| board_num = board_combs[(comb_idx * b_min) |
| + comb_brd_idx]; |
| count += boards_core_cnt[board_num]; |
| } |
| if (count >= req_cores) { |
| elig_brd_combs[elig] = comb_idx; |
| elig_core_cnt[elig] = count; |
| elig++; |
| } |
| } |
| |
| /* |
| * Allocate space for list of sockets for each eligible board |
| * combination |
| */ |
| socket_list = xcalloc(elig * sock_per_comb, sizeof(int)); |
| |
| /* |
| * Generate sorted list of sockets for each eligible board |
| * combination, and find combination with minimum number |
| * of sockets and minimum number of CPUs required for the |
| * allocation |
| */ |
| s_min = sock_per_comb; |
| comb_min = 0; |
| core_min = sock_per_comb * ncores_nb; |
| for (elig_idx = 0; elig_idx < elig; elig_idx++) { |
| comb_idx = elig_brd_combs[elig_idx]; |
| for (comb_brd_idx = 0; comb_brd_idx < b_min; |
| comb_brd_idx++) { |
| board_num = board_combs[(comb_idx * b_min) |
| + comb_brd_idx]; |
| sock_list_idx = (elig_idx * sock_per_comb) + |
| (comb_brd_idx * sock_per_brd); |
| for (sock_idx = 0; sock_idx < sock_per_brd; |
| sock_idx++) { |
| socket_list[sock_list_idx + sock_idx] |
| = (board_num * sock_per_brd) |
| + sock_idx; |
| } |
| } |
| /* |
| * Sort this socket list in descending order of |
| * available core count |
| */ |
| qsort(&socket_list[elig_idx*sock_per_comb], |
| sock_per_comb, sizeof (int), _cmp_sock); |
| /* |
| * Determine minimum number of sockets required for |
| * the allocation from this socket list |
| */ |
| count = 0; |
| for (b = 0; b < sock_per_comb; b++) { |
| sock_idx = |
| socket_list[(int)((elig_idx * |
| sock_per_comb) + b)]; |
| count += sockets_core_cnt[sock_idx]; |
| if (count >= req_cores) |
| break; |
| } |
| b++; |
| /* |
| * Use board combination with minimum number |
| * of required sockets and minimum number of CPUs |
| */ |
| if ((b < s_min) || |
| ((b == s_min) && |
| (elig_core_cnt[elig_idx] <= core_min))) { |
| s_min = b; |
| comb_min = elig_idx; |
| core_min = elig_core_cnt[elig_idx]; |
| } |
| } |
| log_flag(SELECT_TYPE, "node[%u]: required CPUs:%u min req boards:%u,", |
| n, cpus, b_min); |
| log_flag(SELECT_TYPE, "node[%u]: min req sockets:%u min avail cores:%u", |
| n, s_min, core_min); |
| /* |
| * Re-sort socket list for best-fit board combination in |
| * ascending order of socket number |
| */ |
| qsort(&socket_list[comb_min * sock_per_comb], sock_per_comb, |
| sizeof (int), slurm_sort_int_list_asc); |
| |
| xfree(board_combs); |
| xfree(elig_brd_combs); |
| xfree(elig_core_cnt); |
| |
| /* |
| * select cores from the sockets of the best-fit board |
| * combination using a best-fit approach |
| */ |
| tmp_cpt = cpus_per_task; |
| while (cpus > 0) { |
| best_fit_cores = 0; |
| best_fit_sufficient = false; |
| |
| /* search for the socket with best fit */ |
| for (z = 0; z < sock_per_comb; z++) { |
| s = socket_list[(comb_min*sock_per_comb)+z]; |
| sufficient = sockets_core_cnt[s] >= req_cores; |
| if ((best_fit_cores == 0) || |
| (sufficient && !best_fit_sufficient ) || |
| (sufficient && |
| (sockets_core_cnt[s] < best_fit_cores)) || |
| (!sufficient && |
| (sockets_core_cnt[s] > best_fit_cores))) { |
| best_fit_cores = sockets_core_cnt[s]; |
| best_fit_location = s; |
| best_fit_sufficient = sufficient; |
| } |
| } |
| |
| /* check that we have found a usable socket */ |
| if (best_fit_cores == 0) |
| break; |
| |
| j = best_fit_location; |
| if (sock_per_brd) |
| j /= sock_per_brd; |
| log_flag(SELECT_TYPE, "using node[%u]: board[%u]: socket[%u]: %u cores available", |
| n, j, |
| best_fit_location, |
| sockets_core_cnt[best_fit_location]); |
| |
| sockets_used[best_fit_location] = true; |
| for (j = (c + (best_fit_location * ncores_nb)); |
| j < (c + ((best_fit_location + 1) * ncores_nb)); |
| j++ ) { |
| /* |
| * if no more CPUs to select |
| * release remaining cores unless |
| * we are allocating whole sockets |
| */ |
| if (cpus == 0) { |
| if (alloc_sockets) { |
| bit_set(job_res->core_bitmap, |
| j); |
| core_cnt++; |
| } else { |
| bit_clear(job_res->core_bitmap, |
| j); |
| } |
| continue; |
| } |
| |
| /* |
| * remove cores from socket count and |
| * cpus count using hyperthreading requirement |
| */ |
| if (bit_test(job_res->core_bitmap, j)) { |
| sockets_core_cnt[best_fit_location]--; |
| core_cnt++; |
| if (cpus < vpus) |
| cpus = 0; |
| else if ((ntasks_per_core == 1) && |
| (cpus_per_task > vpus)) { |
| int used = MIN(tmp_cpt, vpus); |
| cpus -= used; |
| |
| if (tmp_cpt <= used) |
| tmp_cpt = cpus_per_task; |
| else |
| tmp_cpt -= used; |
| } else { |
| cpus -= vpus; |
| } |
| } else if (alloc_sockets) { |
| /* |
| * If the core is not used, add it |
| * anyway if allocating whole sockets |
| */ |
| bit_set(job_res->core_bitmap, j); |
| core_cnt++; |
| } |
| } |
| |
| /* loop again if more CPUs required */ |
| if (cpus > 0) |
| continue; |
| |
| /* release remaining cores of the unused sockets */ |
| for (s = 0; s < nsockets_nb; s++) { |
| if (sockets_used[s]) |
| continue; |
| bit_nclear(job_res->core_bitmap, |
| c + (s * ncores_nb), |
| c + ((s + 1) * ncores_nb) - 1); |
| } |
| |
| } |
| |
| xfree(socket_list); |
| if (cpus > 0) { |
| /* |
| * CPUs count should NEVER be greater than the number |
| * of set bits in the core bitmap for a given node |
| */ |
| error("CPUs computation error"); |
| break; |
| } |
| |
| /* adjust cpus count of the current node */ |
| if ((alloc_cores || alloc_sockets) && |
| (node_record_table_ptr[n]->tpc >= 1)) { |
| job_res->cpus[i] = core_cnt * |
| node_record_table_ptr[n]->tpc; |
| } |
| i++; |
| |
| /* move c to the next node in core_bitmap */ |
| c += num_bits; |
| } |
| |
| xfree(boards_core_cnt); |
| xfree(sort_brds_core_cnt); |
| xfree(sockets_core_cnt); |
| xfree(sockets_used); |
| } |
| |
| /* |
| * Sync up the core_bitmap with the CPU array using cyclic distribution |
| * |
| * The CPU array contains the distribution of CPUs, which can include |
| * virtual CPUs (hyperthreads) |
| */ |
| static int _cyclic_sync_core_bitmap(job_record_t *job_ptr, |
| const uint16_t cr_type, bool preempt_mode) |
| { |
| uint32_t c, i, j, k, s; |
| int n, n_first; |
| uint32_t *sock_start, *sock_end, csize, core_cnt; |
| uint16_t cps = 0, cpus, vpus, sockets, sock_size, orig_cpu_cnt; |
| job_resources_t *job_res = job_ptr->job_resrcs; |
| bitstr_t *core_map; |
| bool *sock_used, *sock_avoid; |
| bool alloc_cores = false, alloc_sockets = false; |
| uint16_t ntasks_per_socket = INFINITE16; |
| uint16_t ntasks_per_core = INFINITE16; |
| int error_code = SLURM_SUCCESS; |
| int tmp_cpt = 0; /* cpus_per_task */ |
| node_record_t *node_ptr; |
| |
| if ((job_res == NULL) || (job_res->core_bitmap == NULL) || |
| (job_ptr->details == NULL)) |
| return error_code; |
| |
| n_first = bit_ffs(job_res->node_bitmap); |
| if (n_first == -1) |
| return error_code; |
| |
| sock_size = node_record_table_ptr[n_first]->tot_sockets; |
| sock_avoid = xcalloc(sock_size, sizeof(bool)); |
| sock_start = xcalloc(sock_size, sizeof(uint32_t)); |
| sock_end = xcalloc(sock_size, sizeof(uint32_t)); |
| sock_used = xcalloc(sock_size, sizeof(bool)); |
| |
| if (cr_type & SELECT_SOCKET) |
| alloc_sockets = true; |
| else if (cr_type & SELECT_CORE) |
| alloc_cores = true; |
| |
| core_map = job_res->core_bitmap; |
| if (job_ptr->details->mc_ptr) { |
| multi_core_data_t *mc_ptr = job_ptr->details->mc_ptr; |
| if ((mc_ptr->ntasks_per_core != INFINITE16) && |
| (mc_ptr->ntasks_per_core)) { |
| ntasks_per_core = mc_ptr->ntasks_per_core; |
| } |
| |
| if (mc_ptr->ntasks_per_socket) |
| ntasks_per_socket = mc_ptr->ntasks_per_socket; |
| } |
| |
| csize = bit_size(core_map); |
| for (c = 0, i = 0, n = 0; |
| (node_ptr = next_node_bitmap(job_res->node_bitmap, &n)); n++) { |
| sockets = node_ptr->tot_sockets; |
| cps = node_ptr->cores; |
| vpus = job_mgr_determine_cpus_per_core(job_ptr->details, n); |
| |
| log_flag(SELECT_TYPE, "%pJ node %s vpus %u cpus %u", |
| job_ptr, node_ptr->name, vpus, job_res->cpus[i]); |
| |
| if ((c + (sockets * cps)) > csize) { |
| error("index error"); |
| break; |
| } |
| |
| if (sockets > sock_size) { |
| sock_size = sockets; |
| xrecalloc(sock_avoid, sock_size, sizeof(bool)); |
| xrecalloc(sock_start, sock_size, sizeof(uint32_t)); |
| xrecalloc(sock_end, sock_size, sizeof(uint32_t)); |
| xrecalloc(sock_used, sock_size, sizeof(bool)); |
| } |
| |
| for (s = 0; s < sockets; s++) { |
| sock_start[s] = c + (s * cps); |
| sock_end[s] = sock_start[s] + cps; |
| sock_avoid[s] = false; |
| sock_used[s] = false; |
| } |
| core_cnt = 0; |
| cpus = job_res->cpus[i]; |
| |
| if (ntasks_per_socket != INFINITE16) { |
| int x_cpus, cpus_per_socket; |
| uint32_t total_cpus = 0; |
| uint32_t *cpus_cnt; |
| |
| cpus_per_socket = ntasks_per_socket * |
| job_ptr->details->cpus_per_task; |
| cpus_cnt = xmalloc(sizeof(uint32_t) * sockets); |
| for (s = 0; s < sockets; s++) { |
| for (j = sock_start[s]; j < sock_end[s]; j++) { |
| if (bit_test(core_map, j)) |
| cpus_cnt[s] += vpus; |
| } |
| total_cpus += cpus_cnt[s]; |
| } |
| for (s = 0; s < sockets && total_cpus > cpus; s++) { |
| if (cpus_cnt[s] > cpus_per_socket) { |
| x_cpus = cpus_cnt[s] - cpus_per_socket; |
| cpus_cnt[s] = cpus_per_socket; |
| total_cpus -= x_cpus; |
| } |
| } |
| for (s = 0; s < sockets && total_cpus > cpus; s++) { |
| if ((cpus_cnt[s] <= cpus_per_socket) && |
| (total_cpus - cpus_cnt[s] >= cpus)) { |
| sock_avoid[s] = true; |
| total_cpus -= cpus_cnt[s]; |
| } |
| } |
| xfree(cpus_cnt); |
| } else if (job_ptr->details->cpus_per_task > 1) { |
| /* Try to pack all CPUs of each tasks on one socket. */ |
| uint32_t *cpus_cnt, cpus_per_task; |
| |
| cpus_per_task = job_ptr->details->cpus_per_task; |
| cpus_cnt = xmalloc(sizeof(uint32_t) * sockets); |
| for (s = 0; s < sockets; s++) { |
| for (j = sock_start[s]; j < sock_end[s]; j++) { |
| if (bit_test(core_map, j)) |
| cpus_cnt[s] += vpus; |
| } |
| cpus_cnt[s] -= (cpus_cnt[s] % cpus_per_task); |
| } |
| tmp_cpt = cpus_per_task; |
| for (s = 0; ((s < sockets) && (cpus > 0)); s++) { |
| while ((sock_start[s] < sock_end[s]) && |
| (cpus_cnt[s] > 0) && (cpus > 0)) { |
| if (bit_test(core_map, sock_start[s])) { |
| int used; |
| sock_used[s] = true; |
| core_cnt++; |
| |
| if ((ntasks_per_core == 1) && |
| (cpus_per_task > vpus)) { |
| used = MIN(tmp_cpt, |
| vpus); |
| if (tmp_cpt <= used) |
| tmp_cpt = cpus_per_task; |
| else |
| tmp_cpt -= used; |
| } else |
| used = vpus; |
| |
| if (cpus_cnt[s] < vpus) |
| cpus_cnt[s] = 0; |
| else |
| cpus_cnt[s] -= used; |
| if (cpus < vpus) |
| cpus = 0; |
| else |
| cpus -= used; |
| } |
| sock_start[s]++; |
| } |
| } |
| xfree(cpus_cnt); |
| } |
| |
| orig_cpu_cnt = cpus; |
| while (cpus > 0) { |
| uint16_t prev_cpus = cpus; |
| for (s = 0; s < sockets && cpus > 0; s++) { |
| if (sock_avoid[s]) |
| continue; |
| while (sock_start[s] < sock_end[s]) { |
| if (bit_test(core_map, sock_start[s])) { |
| sock_used[s] = true; |
| core_cnt++; |
| break; |
| } else |
| sock_start[s]++; |
| } |
| if (sock_start[s] == sock_end[s]) |
| /* this socket is unusable */ |
| continue; |
| if (cpus < vpus) |
| cpus = 0; |
| else |
| cpus -= vpus; |
| sock_start[s]++; |
| } |
| if (prev_cpus != cpus) |
| continue; |
| |
| if (job_ptr->details->overcommit) { |
| /* We've got all the CPUs that we need */ |
| break; |
| } |
| if (!preempt_mode) { |
| /* we're stuck! */ |
| char *core_str = NULL, *sock_str = NULL, *sep; |
| for (j = 0, k = c; j < (cps * sockets); |
| j++, k++) { |
| if (!bit_test(core_map, k)) |
| continue; |
| if (core_str) |
| sep = ","; |
| else |
| sep = ""; |
| xstrfmtcat(core_str, "%s%d", sep, j); |
| } |
| if (!core_str) |
| core_str = xstrdup("NONE"); |
| for (s = 0; s < sockets; s++) { |
| if (!sock_avoid[s]) |
| continue; |
| if (sock_str) |
| sep = ","; |
| else |
| sep = ""; |
| xstrfmtcat(sock_str, "%s%d", sep, s); |
| } |
| if (!sock_str) |
| sock_str = xstrdup("NONE"); |
| job_ptr->priority = 0; |
| job_ptr->state_reason = WAIT_HELD; |
| error("sync loop not progressing, holding %pJ, " |
| "tried to use %u CPUs on node %s core_map:%s avoided_sockets:%s vpus:%u", |
| job_ptr, orig_cpu_cnt, node_ptr->name, |
| core_str, sock_str, vpus); |
| xfree(core_str); |
| xfree(sock_str); |
| } |
| error_code = SLURM_ERROR; |
| goto fini; |
| } |
| |
| /* |
| * clear the rest of the cores in each socket |
| * FIXME: do we need min_core/min_socket checks here? |
| */ |
| for (s = 0; s < sockets; s++) { |
| if (sock_start[s] == sock_end[s]) |
| continue; |
| if (!alloc_sockets || !sock_used[s]) { |
| bit_nclear(core_map, sock_start[s], |
| sock_end[s]-1); |
| } |
| if ((node_ptr->tpc >= 1) && |
| (alloc_sockets || alloc_cores) && sock_used[s]) { |
| for (j = sock_start[s]; j < sock_end[s]; j++) { |
| /* Mark all cores as used */ |
| if (alloc_sockets) |
| bit_set(core_map, j); |
| if (bit_test(core_map, j)) |
| core_cnt++; |
| } |
| } |
| } |
| if ((alloc_cores || alloc_sockets) && (node_ptr->tpc >= 1)) { |
| job_res->cpus[i] = core_cnt * node_ptr->tpc; |
| } |
| i++; |
| /* advance 'c' to the beginning of the next node */ |
| c += sockets * cps; |
| } |
| fini: xfree(sock_avoid); |
| xfree(sock_start); |
| xfree(sock_end); |
| xfree(sock_used); |
| return error_code; |
| } |
| |
| /* |
| * Check if we're at job tasks_per_node limit for a given node when allocating |
| * tasks to a node. |
| * |
| * RETURNS rc |
| * rc > 0 if tpn limit or arbitrary tpn exceeded |
| * rc == 0 if exactly at tpn limit |
| * rc < 0 if not at limit yet |
| */ |
| static int _at_tpn_limit(const uint32_t n, const job_record_t *job_ptr, |
| const char *tag, bool log_error) |
| { |
| const job_resources_t *job_res = job_ptr->job_resrcs; |
| const log_level_t log_lvl = log_error ? LOG_LEVEL_ERROR : |
| LOG_LEVEL_INFO; |
| int limit_rc = -1; |
| int arbitrary_rc = -1; |
| |
| if (job_ptr->details->arbitrary_tpn) { |
| arbitrary_rc = job_res->tasks_per_node[n] - |
| job_ptr->details->arbitrary_tpn[n]; |
| } |
| |
| /* Special case where no limit is imposed - no overcommit */ |
| if (job_ptr->details->ntasks_per_node == 0) |
| return MAX(limit_rc, arbitrary_rc); |
| |
| limit_rc = job_res->tasks_per_node[n] - |
| job_ptr->details->ntasks_per_node; |
| |
| /* Limit exceeded */ |
| if ((limit_rc > 0) && (log_error || (slurm_conf.debug_flags & |
| DEBUG_FLAG_SELECT_TYPE))) |
| log_var(log_lvl, |
| "%s over tasks_per_node for %pJ node:%u task_per_node:%d max:%u", |
| tag, job_ptr, n, job_res->tasks_per_node[n], |
| job_ptr->details->ntasks_per_node); |
| |
| return MAX(limit_rc, arbitrary_rc); |
| } |
| |
| /* |
| * dist_tasks_compute_c_b - compute the number of tasks on each |
 * of the nodes for the cyclic and block distribution. We need to do
| * this in the case of consumable resources so that we have an exact |
| * count for the needed hardware resources which will be used later to |
| * update the different used resources per node structures. |
| * |
| * The most common case is when we have more resources than needed. In |
| * that case we just "take" what we need and "release" the remaining |
| * resources for other jobs. In the case where we oversubscribe the |
| * processing units (PUs) we keep the initial set of resources. |
| * |
| * IN/OUT job_ptr - pointer to job being scheduled. The per-node |
| * job_res->cpus array is recomputed here. |
| * IN gres_task_limit - array of task limits based upon job's GRES specification |
| * offset based upon bits set in |
| * job_ptr->job_resrcs->node_bitmap |
| */ |
static int _dist_tasks_compute_c_b(job_record_t *job_ptr,
				   uint32_t *gres_task_limit,
				   uint32_t *gres_min_cpus)
{
	bool do_gres_min_cpus = false;
	uint32_t n, tid, t, maxtasks, l;
	uint16_t *avail_cpus;
	job_resources_t *job_res = job_ptr->job_resrcs;
	char *err_msg = NULL;
	uint16_t *vpus;	/* per-node threads-per-core (node_ptr->tpc) */
	int rc = SLURM_SUCCESS, rem_cpus, rem_tasks;
	uint16_t cpus_per_task;
	node_record_t *node_ptr;

	/* Validate the job's resource allocation before using it */
	if (!job_res)
		err_msg = "job_res is NULL";
	else if (!job_res->cpus)
		err_msg = "job_res->cpus is NULL";
	else if (!job_res->nhosts)
		err_msg = "job_res->nhosts is zero";
	if (err_msg) {
		error("Invalid allocation for %pJ: %s",
		      job_ptr, err_msg);
		return SLURM_ERROR;
	}

	vpus = xmalloc(job_res->nhosts * sizeof(uint16_t));

	/* Treat an unset cpus_per_task as 1 */
	if (job_ptr->details->cpus_per_task == 0)
		job_ptr->details->cpus_per_task = 1;
	cpus_per_task = job_ptr->details->cpus_per_task;

	/* Record threads-per-core for every node in the allocation */
	for (int i = 0, n = 0;
	     (node_ptr = next_node_bitmap(job_res->node_bitmap, &i)); i++) {
		vpus[n++] = node_ptr->tpc;
	}

	maxtasks = _get_task_count(job_ptr);
	/*
	 * Keep the original per-node CPU counts in avail_cpus and rebuild
	 * job_res->cpus from zero as tasks are placed below.
	 */
	avail_cpus = job_res->cpus;
	job_res->cpus = xmalloc(job_res->nhosts * sizeof(uint16_t));
	job_res->tasks_per_node = xmalloc(job_res->nhosts * sizeof(uint16_t));

	/*
	 * Safeguard in case the user specified fewer CPUs than
	 * cpus_per_task or did not specify a task count at all.
	 */
	if (!maxtasks) {
		error("changing task count from 0 to 1 for %pJ",
		      job_ptr);
		maxtasks = 1;
	}
	/* Start by allocating one task per node */
	tid = 0;
	for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
		if (avail_cpus[n]) {
			if (gres_min_cpus[n])
				do_gres_min_cpus = true;
			/* Ignore gres_task_limit for first task per node */
			tid++;
			job_res->tasks_per_node[n]++;
			/* Claim up to cpus_per_task CPUs for this task */
			for (l = 0; l < cpus_per_task; l++) {
				if (job_res->cpus[n] < avail_cpus[n])
					job_res->cpus[n]++;
			}
		}
	}

	/* Next fill out the CPUs on the cores already allocated to this job */
	for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
		/*
		 * rem_cpus: CPUs claimed beyond whole-core multiples on this
		 * node; rem_tasks: additional tasks attempted here before
		 * moving on to the next node.
		 */
		rem_cpus = job_res->cpus[n] % vpus[n];
		rem_tasks = rem_cpus / cpus_per_task;
		if (rem_tasks == 0)
			continue;
		for (t = 0; ((t < rem_tasks) && (tid < maxtasks)); t++) {
			/* Stop if the node cannot fit another whole task */
			if (((avail_cpus[n] - job_res->cpus[n]) <
			     cpus_per_task))
				break;
			/* Stop at the per-node GRES task limit */
			if (!dist_tasks_tres_tasks_avail(
				    gres_task_limit, job_res, n))
				break;
			/* Stop at the tasks-per-node limit */
			if (_at_tpn_limit(n, job_ptr, "fill allocated",
					  false) >= 0)
				break;
			tid++;
			job_res->tasks_per_node[n]++;
			for (l = 0; l < cpus_per_task; l++) {
				if (job_res->cpus[n] < avail_cpus[n])
					job_res->cpus[n]++;
			}
		}
	}

	/*
	 * Next distribute additional tasks, packing the cores or sockets as
	 * appropriate to avoid allocating more CPUs than needed. For example,
	 * with core allocations and 2 processors per core, we don't want to
	 * partially populate some cores on some nodes and allocate extra
	 * cores on other nodes. So "srun -n20 hostname" should not launch 7
	 * tasks on node 0, 7 tasks on node 1, and 6 tasks on node 2. It should
	 * launch 8 tasks on node, 8 tasks on node 1, and 4 tasks on node 2.
	 */
	if (job_ptr->details->overcommit && !job_ptr->tres_per_task)
		maxtasks = 0; /* with overcommit keep just one task per node */
	while (tid < maxtasks) {
		bool space_remaining = false;
		for (n = 0; ((n < job_res->nhosts) && (tid < maxtasks)); n++) {
			rem_tasks = vpus[n] / cpus_per_task;
			rem_tasks = MAX(rem_tasks, 1);
			for (t = 0; ((t < rem_tasks) && (tid < maxtasks)); t++){
				/* Same three stop conditions as above */
				if ((avail_cpus[n] - job_res->cpus[n]) <
				    cpus_per_task)
					break;
				if (!dist_tasks_tres_tasks_avail(
					    gres_task_limit,
					    job_res, n))
					break;
				if (_at_tpn_limit(n, job_ptr, "fill allocated",
						  false) >= 0)
					break;

				tid++;
				job_res->tasks_per_node[n]++;
				for (l = 0; l < cpus_per_task;
				     l++) {
					if (job_res->cpus[n] < avail_cpus[n])
						job_res->cpus[n]++;
				}
				if ((avail_cpus[n] - job_res->cpus[n]) >=
				    cpus_per_task)
					space_remaining = true;
			}
		}
		if (!space_remaining && (tid < maxtasks)) {
			/*
			 * If gres_task_limit is not associated with
			 * gres_per_task, it is a soft limit.
			 */
			if (gres_task_limit &&
			    !gres_select_util_job_tres_per_task(
				    job_ptr->gres_list_req)) {
				/* Try again without limit */
				gres_task_limit = NULL;
			} else {
				rc = ESLURM_BAD_TASK_COUNT;
				break;
			}
		}
	}
	if (do_gres_min_cpus)
		dist_tasks_gres_min_cpus(job_ptr, avail_cpus, gres_min_cpus);
	xfree(avail_cpus);
	xfree(vpus);

	return rc;
}
| |
| /* |
| * To effectively deal with heterogeneous nodes, we fake a cyclic |
| * distribution to figure out how many cores are needed on each node. |
| * |
| * This routine is a slightly modified "version" of the routine |
| * _task_layout_block in src/common/dist_tasks.c. We do not need to |
| * assign tasks to job->hostid[] and job->tids[][] at this point so |
| * the core allocation is the same for cyclic and block. |
| * |
| * For the consumable resources support we need to determine what |
| * "node/Core/thread"-tuplets will be allocated for a given job. |
| * In the past we assumed that we only allocated one task per PU |
| * (processing unit, the lowest allocatable logical processor, |
| * core or thread depending upon configuration) and didn't allow |
| * the use of overcommit. We have changed this philosophy and are now |
| * allowing people to overcommit their resources and expect the system |
| * administrator to enable the task/affinity plug-in which will then |
| * bind all of a job's tasks to its allocated resources thereby |
| * avoiding interference between co-allocated running jobs. |
| * |
| * In the consumable resources environment we need to determine the |
| * layout schema within slurmctld. |
| * |
| * We have a core_bitmap of all available cores. All we're doing here |
| * is removing cores that are not needed based on the task count, and |
| * the choice of cores to remove is based on the distribution: |
| * - "cyclic" removes cores "evenly", starting from the last socket, |
| * - "block" removes cores from the "last" socket(s) |
| * - "plane" removes cores "in chunks" |
| * |
| * IN job_ptr - job to be allocated resources |
| * IN cr_type - allocation type (sockets, cores, etc.) |
| * IN preempt_mode - true if testing with simulated preempted jobs |
| * IN core_array - system-wide bitmap of cores originally available to |
| * the job, only used to identify specialized cores |
| * IN gres_task_limit - array of task limits based upon job GRES specification, |
| * offset based upon bits set in job_ptr->job_resrcs->node_bitmap |
| * IN gres_min_cpus - array of minimum required CPUs based upon job's GRES |
| * specification, offset based upon bits set in |
| * job_ptr->job_resrcs->node_bitmap |
| */ |
| extern int dist_tasks(job_record_t *job_ptr, const uint16_t cr_type, |
| bool preempt_mode, bitstr_t **core_array, |
| uint32_t *gres_task_limit, uint32_t *gres_min_cpus) |
| { |
| int error_code; |
| bool one_task_per_node = false; |
| |
| /* |
| * Zero size jobs are supported for the creation and deletion of |
| * persistent burst buffers. |
| */ |
| if (job_ptr->details->min_nodes == 0) |
| return SLURM_SUCCESS; |
| |
| if (job_ptr->details->core_spec != NO_VAL16) { |
| /* |
| * The job has been allocated all non-specialized cores. |
| * Just set the task distribution for tres_per_task support. |
| */ |
| error_code = _set_task_dist(job_ptr, cr_type); |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| return SLURM_SUCCESS; |
| } |
| |
| if ((job_ptr->job_resrcs->node_req == NODE_CR_RESERVED) || |
| (job_ptr->details->whole_node & WHOLE_NODE_REQUIRED)) { |
| /* |
| * The job has been allocated an EXCLUSIVE set of nodes, |
| * so it gets all of the bits in the core_array except for |
| * specialized cores. Set the task distribution for |
| * tres_per_task support. |
| */ |
| _clear_spec_cores(job_ptr, core_array); |
| error_code = _set_task_dist(job_ptr, cr_type); |
| |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| return SLURM_SUCCESS; |
| } |
| |
| if (job_ptr->details->overcommit && !job_ptr->tres_per_task) |
| one_task_per_node = true; |
| _log_select_maps("cr_dist/start", job_ptr); |
| if (((job_ptr->details->task_dist & SLURM_DIST_STATE_BASE) == |
| SLURM_DIST_PLANE) && !one_task_per_node) { |
| /* Perform plane distribution on the job_resources_t struct */ |
| error_code = _compute_plane_dist(job_ptr, gres_task_limit, |
| gres_min_cpus); |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| } else { |
| /* Perform cyclic distribution on the job_resources_t struct */ |
| error_code = _dist_tasks_compute_c_b(job_ptr, gres_task_limit, |
| gres_min_cpus); |
| if (error_code != SLURM_SUCCESS) |
| return error_code; |
| } |
| _log_select_maps("cr_dist/middle", job_ptr); |
| |
| /* |
| * now sync up the core_bitmap with the job_resources_t struct |
| * based on the given distribution AND resource setting |
| */ |
| if (!(cr_type & SELECT_CORE) && !(cr_type & SELECT_SOCKET)) { |
| _block_sync_core_bitmap(job_ptr, cr_type); |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * If SelectTypeParameters mentions to use a block distribution for |
| * cores by default, use that kind of distribution if no particular |
| * cores distribution specified. |
| * Note : cyclic cores distribution, which is the default, is treated |
| * by the next code block |
| */ |
| if (slurm_conf.select_type_param & SELECT_CORE_DEFAULT_DIST_BLOCK) { |
| switch (job_ptr->details->task_dist & SLURM_DIST_NODESOCKMASK) { |
| case SLURM_DIST_ARBITRARY: |
| case SLURM_DIST_BLOCK: |
| case SLURM_DIST_CYCLIC: |
| case SLURM_DIST_UNKNOWN: |
| _block_sync_core_bitmap(job_ptr, cr_type); |
| return SLURM_SUCCESS; |
| } |
| } |
| |
| /* |
| * Determine the number of logical processors per node needed |
| * for this job. Make sure below matches the layouts in |
| * lllp_distribution in plugins/task/affinity/dist_task.c (FIXME) |
| */ |
| switch (job_ptr->details->task_dist & SLURM_DIST_NODESOCKMASK) { |
| case SLURM_DIST_BLOCK_BLOCK: |
| case SLURM_DIST_CYCLIC_BLOCK: |
| case SLURM_DIST_PLANE: |
| _block_sync_core_bitmap(job_ptr, cr_type); |
| break; |
| case SLURM_DIST_ARBITRARY: |
| case SLURM_DIST_BLOCK: |
| case SLURM_DIST_CYCLIC: |
| case SLURM_DIST_BLOCK_CYCLIC: |
| case SLURM_DIST_CYCLIC_CYCLIC: |
| case SLURM_DIST_BLOCK_CFULL: |
| case SLURM_DIST_CYCLIC_CFULL: |
| case SLURM_DIST_UNKNOWN: |
| error_code = _cyclic_sync_core_bitmap(job_ptr, cr_type, |
| preempt_mode); |
| break; |
| default: |
| error("invalid task_dist entry"); |
| return SLURM_ERROR; |
| } |
| |
| _log_select_maps("cr_dist/fini", job_ptr); |
| return error_code; |
| } |
| |
| /* Return true if more tasks can be allocated for this job on this node */ |
| extern bool dist_tasks_tres_tasks_avail(uint32_t *gres_task_limit, |
| job_resources_t *job_res, |
| uint32_t node_offset) |
| { |
| if (!gres_task_limit || !job_res) |
| return true; |
| if (gres_task_limit[node_offset] > job_res->tasks_per_node[node_offset]) |
| return true; |
| return false; |
| } |
| |
| extern void dist_tasks_gres_min_cpus(job_record_t *job_ptr, |
| uint16_t *avail_cpus, |
| uint32_t *gres_min_cpus) |
| { |
| job_resources_t *job_res = job_ptr->job_resrcs; |
| |
| for (int n = 0; n < job_res->nhosts; n++) { |
| /* |
| * Make sure that enough cpus are available to meet the minimum |
| * number of required cores to satisfy a gres request. This |
| * can increase the number of cpus per task on a given node. |
| */ |
| if (job_res->cpus[n] < gres_min_cpus[n]) { |
| /* |
| * If avail_cpus is less then gres_min_cpus, |
| * something went wrong. Get as many cpus |
| * as we can. |
| */ |
| if (avail_cpus[n] < gres_min_cpus[n]) { |
| log_flag( |
| SELECT_TYPE, |
| "%pJ: gres_min_cpus=%u is greater than avail_cpus=%u for node %u", |
| job_ptr, gres_min_cpus[n], |
| avail_cpus[n], n); |
| job_res->cpus[n] = avail_cpus[n]; |
| } else { |
| log_flag( |
| SELECT_TYPE, |
| "%pJ: Changing job_res->cpus from %u to gres_min_cpus %u for node %u", |
| job_ptr, job_res->cpus[n], |
| gres_min_cpus[n], n); |
| job_res->cpus[n] = gres_min_cpus[n]; |
| } |
| } |
| } |
| } |