| /*****************************************************************************\ |
| * Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P. |
| * Copyright (C) 2008-2009 Lawrence Livermore National Security. |
| * Written by Susanne M. Balle, <susanne.balle@hp.com> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #define _GNU_SOURCE |
| |
| #include "affinity.h" |
| #include "dist_tasks.h" |
| #include "src/common/bitstring.h" |
| #include "src/common/log.h" |
| #include "src/interfaces/cred.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/slurm_resource_info.h" |
| #include "src/common/strlcpy.h" |
| #include "src/common/xmalloc.h" |
| #include "src/slurmd/slurmd/slurmd.h" |
| |
| #ifdef HAVE_NUMA |
| #include <numa.h> |
| #endif |
| |
| static char *_alloc_mask(launch_tasks_request_msg_t *req, |
| int *whole_node_cnt, int *whole_socket_cnt, |
| int *whole_core_cnt, int *whole_thread_cnt, |
| int *part_socket_cnt, int *part_core_cnt); |
| static bitstr_t *_get_avail_map(slurm_cred_t *cred, uint16_t *hw_sockets, |
| uint16_t *hw_cores, uint16_t *hw_threads); |
| static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id, |
| uint16_t *sockets, uint16_t *cores); |
| |
| static int _task_layout_lllp_block(launch_tasks_request_msg_t *req, |
| uint32_t node_id, bitstr_t ***masks_p); |
| static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req, |
| uint32_t node_id, bitstr_t ***masks_p); |
| |
| static void _lllp_map_abstract_masks(const uint32_t maxtasks, |
| bitstr_t **masks); |
| static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req, |
| const uint32_t maxtasks, |
| bitstr_t **masks); |
| |
| /* BLOCK_MAP abstract block LLLP index to physical machine LLLP index |
| * BLOCK_MAP_INV physical machine LLLP index to abstract block LLLP index |
| */ |
| #define BLOCK_MAP(index) _block_map(index, conf->block_map) |
| #define BLOCK_MAP_INV(index) _block_map(index, conf->block_map_inv) |
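| |
| /* For illustration (hypothetical node, not a real configuration): on a |
| * 1-socket, 2-core, 2-thread machine whose OS numbers the first thread of |
| * each core before the second (machine CPUs 0,1 = cores 0,1 thread 0; |
| * machine CPUs 2,3 = cores 0,1 thread 1), the abstract block order is |
| * c0t0, c0t1, c1t0, c1t1, so block_map = {0, 2, 1, 3} and block_map_inv = |
| * {0, 2, 1, 3}; BLOCK_MAP(1) then returns machine CPU 2. |
| */ |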
| |
| |
| /* _block_map |
| * |
| * safely returns a mapped index using a provided block map |
| * |
| * IN - index to map |
| * IN - map to use |
| */ |
| static uint16_t _block_map(uint16_t index, uint16_t *map) |
| { |
| if (map == NULL) { |
| return index; |
| } |
| /* make sure bit falls in map */ |
| if (index >= conf->block_map_size) { |
| debug3("wrapping index %u into block_map_size of %u", |
| index, conf->block_map_size); |
| index = index % conf->block_map_size; |
| } |
| index = map[index]; |
| return index; |
| } |
| |
| static void _task_layout_display_masks(launch_tasks_request_msg_t *req, |
| const uint32_t *gtid, |
| const uint32_t maxtasks, |
| bitstr_t **masks) |
| { |
| int i; |
| char *str = NULL; |
| for (i = 0; i < maxtasks; i++) { |
| str = (char *)bit_fmt_hexmask(masks[i]); |
| debug3("_task_layout_display_masks jobid [%u:%d] %s", |
| req->step_id.job_id, gtid[i], str); |
| xfree(str); |
| } |
| } |
| |
| static void _lllp_free_masks(const uint32_t maxtasks, bitstr_t **masks) |
| { |
| int i; |
| bitstr_t *bitmask; |
| |
| for (i = 0; i < maxtasks; i++) { |
| bitmask = masks[i]; |
| FREE_NULL_BITMAP(bitmask); |
| } |
| xfree(masks); |
| } |
| |
| #ifdef HAVE_NUMA |
| /* _match_masks_to_ldom |
| * |
| * expand each mask to encompass the whole locality domain |
| * within which it currently exists |
| * NOTE: this assumes that the masks are already in logical |
| * (and not abstract) CPU order. |
| */ |
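| /* For illustration (hypothetical layout): if CPUs 0-3 belong to NUMA node 0 |
| * and CPUs 4-7 to NUMA node 1, a task mask of 0x02 (CPU 1) expands to 0x0f |
| * (all of NUMA node 0). |
| */ |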
| static void _match_masks_to_ldom(const uint32_t maxtasks, bitstr_t **masks) |
| { |
| uint32_t i, b, size; |
| |
| if (!masks || !masks[0]) |
| return; |
| size = bit_size(masks[0]); |
| for (i = 0; i < maxtasks; i++) { |
| for (b = 0; b < size; b++) { |
| if (bit_test(masks[i], b)) { |
| /* get the NUMA node for this CPU, and then |
| * set all CPUs in the mask that are in |
| * the same NUMA node */ |
| int c; |
| uint16_t nnid = slurm_get_numa_node(b); |
| for (c = 0; c < size; c++) { |
| if (slurm_get_numa_node(c) == nnid) |
| bit_set(masks[i], c); |
| } |
| } |
| } |
| } |
| } |
| #endif |
| |
| /* |
| * batch_bind - Set the batch request message so as to bind the shell to the |
| * proper resources |
| */ |
| void batch_bind(batch_job_launch_msg_t *req) |
| { |
| bitstr_t *hw_map; |
| int task_cnt = 0; |
| uint16_t sockets = 0, cores = 0, threads = 0; |
| |
| hw_map = _get_avail_map(req->cred, &sockets, &cores, &threads); |
| if (hw_map) |
| task_cnt = bit_set_count(hw_map); |
| |
| if (task_cnt) { |
| req->cpu_bind_type = CPU_BIND_MASK; |
| if (slurm_conf.task_plugin_param & CPU_BIND_VERBOSE) |
| req->cpu_bind_type |= CPU_BIND_VERBOSE; |
| xfree(req->cpu_bind); |
| req->cpu_bind = (char *)bit_fmt_hexmask(hw_map); |
| info("job %u CPU input mask for node: %s", |
| req->job_id, req->cpu_bind); |
| /* translate abstract masks to actual hardware layout */ |
| _lllp_map_abstract_masks(1, &hw_map); |
| #ifdef HAVE_NUMA |
| if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { |
| _match_masks_to_ldom(1, &hw_map); |
| } |
| #endif |
| xfree(req->cpu_bind); |
| req->cpu_bind = (char *)bit_fmt_hexmask(hw_map); |
| info("job %u CPU final HW mask for node: %s", |
| req->job_id, req->cpu_bind); |
| } else { |
| error("job %u allocated no CPUs", |
| req->job_id); |
| } |
| FREE_NULL_BITMAP(hw_map); |
| } |
| |
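| /* |
| * _validate_map - check a user-supplied --cpu-bind=map_cpu:<list> |
| * (e.g. "0,2,4") against the CPUs allocated to the step; if any listed |
| * CPU ID falls outside the allocation, the whole map is rejected |
| * IN req - task launch request; req->cpu_bind holds the CPU ID list |
| * IN avail_mask - string form of the CPUs allocated on this node |
| * OUT err_msg - optional error message returned to the user |
| * RET SLURM_SUCCESS or ESLURMD_CPU_BIND_ERROR |
| */ |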
| static int _validate_map(launch_tasks_request_msg_t *req, char *avail_mask, |
| char **err_msg) |
| { |
| char *tmp_map, *save_ptr = NULL, *tok; |
| cpu_set_t avail_cpus; |
| bool superset = true; |
| int rc = SLURM_SUCCESS; |
| |
| if (!req->cpu_bind) { |
| char *err = "No list of CPU IDs provided to --cpu-bind=map_cpu:<list>"; |
| error("%s", err); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "%s", err); |
| return ESLURMD_CPU_BIND_ERROR; |
| } |
| |
| CPU_ZERO(&avail_cpus); |
| if (task_str_to_cpuset(&avail_cpus, avail_mask)) { |
| char *err = "Failed to convert avail_mask into hex for CPU bind map"; |
| error("%s", err); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "%s", err); |
| return ESLURMD_CPU_BIND_ERROR; |
| } |
| |
| tmp_map = xstrdup(req->cpu_bind); |
| tok = strtok_r(tmp_map, ",", &save_ptr); |
| while (tok) { |
| int i = atoi(tok); |
| if (!CPU_ISSET(i, &avail_cpus)) { |
| /* The task's CPU map is completely invalid. |
| * Disable CPU map. */ |
| superset = false; |
| break; |
| } |
| tok = strtok_r(NULL, ",", &save_ptr); |
| } |
| xfree(tmp_map); |
| |
| if (!superset) { |
| error("CPU binding outside of job step allocation, allocated CPUs are: %s.", |
| avail_mask); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "CPU binding outside of job step allocation, allocated CPUs are: %s.", |
| avail_mask); |
| rc = ESLURMD_CPU_BIND_ERROR; |
| } |
| return rc; |
| } |
| |
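| /* |
| * _validate_mask - check a user-supplied --cpu-bind=mask_cpu:<list> |
| * (e.g. "0x3,0xc") against the CPUs allocated to the step; bits outside |
| * the allocation are cleared (and reported as an error), and a mask left |
| * with no valid bits is replaced by all allocated CPUs; req->cpu_bind is |
| * rewritten with the sanitized masks |
| * IN req - task launch request; req->cpu_bind holds the mask list |
| * IN avail_mask - string form of the CPUs allocated on this node |
| * OUT err_msg - optional error message returned to the user |
| * RET SLURM_SUCCESS or ESLURMD_CPU_BIND_ERROR |
| */ |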
| static int _validate_mask(launch_tasks_request_msg_t *req, char *avail_mask, |
| char **err_msg) |
| { |
| char *new_mask = NULL, *save_ptr = NULL, *tok; |
| cpu_set_t avail_cpus, task_cpus; |
| bool superset = true; |
| int rc = SLURM_SUCCESS; |
| |
| if (!req->cpu_bind) { |
| char *err = "No list of CPU masks provided to --cpu-bind=mask_cpu:<list>"; |
| error("%s", err); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "%s", err); |
| return ESLURMD_CPU_BIND_ERROR; |
| } |
| |
| CPU_ZERO(&avail_cpus); |
| if (task_str_to_cpuset(&avail_cpus, avail_mask)) { |
| char *err = "Failed to convert avail_mask into hex for CPU bind mask"; |
| error("%s", err); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "%s", err); |
| return ESLURMD_CPU_BIND_ERROR; |
| } |
| |
| tok = strtok_r(req->cpu_bind, ",", &save_ptr); |
| while (tok) { |
| int i, overlaps = 0; |
| char mask_str[CPU_SET_HEX_STR_SIZE]; |
| CPU_ZERO(&task_cpus); |
| if (task_str_to_cpuset(&task_cpus, tok)) { |
| char *err = "Failed to convert cpu bind string into hex for CPU bind mask"; |
| error("%s", err); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "%s", err); |
| xfree(new_mask); |
| return ESLURMD_CPU_BIND_ERROR; |
| } |
| for (i = 0; i < CPU_SETSIZE; i++) { |
| if (!CPU_ISSET(i, &task_cpus)) |
| continue; |
| if (CPU_ISSET(i, &avail_cpus)) { |
| overlaps++; |
| } else { |
| CPU_CLR(i, &task_cpus); |
| superset = false; |
| } |
| } |
| if (overlaps == 0) { |
| /* The task's CPU mask is completely invalid. |
| * Give it all allowed CPUs. */ |
| for (i = 0; i < CPU_SETSIZE; i++) { |
| if (CPU_ISSET(i, &avail_cpus)) |
| CPU_SET(i, &task_cpus); |
| } |
| } |
| task_cpuset_to_str(&task_cpus, mask_str); |
| if (new_mask) |
| xstrcat(new_mask, ","); |
| xstrcat(new_mask, mask_str); |
| tok = strtok_r(NULL, ",", &save_ptr); |
| } |
| |
| if (!superset) { |
| error("CPU binding outside of job step allocation, allocated CPUs are: %s.", |
| avail_mask); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "CPU binding outside of job step allocation, allocated CPUs are: %s.", |
| avail_mask); |
| rc = ESLURMD_CPU_BIND_ERROR; |
| } |
| |
| xfree(req->cpu_bind); |
| req->cpu_bind = new_mask; |
| return rc; |
| } |
| |
| /* |
| * lllp_distribution |
| * |
| * Note: lllp stands for Lowest Level of Logical Processors. |
| * |
| * When automatic binding is enabled: |
| * - no binding flags set >= CPU_BIND_NONE, and |
| * - an auto binding level selected: CPU_BIND_TO_{SOCKETS,CORES,THREADS} |
| * Otherwise limit job step to the allocated CPUs |
| * |
| * generate the appropriate cpu_bind type and string which results in |
| * the specified lllp distribution. |
| * |
| * IN/OUT req - job launch request (cpu_bind_type and cpu_bind updated) |
| * IN node_id - index of this node within the job step allocation |
| * OUT err_msg - optional string to pass out error message. |
| */ |
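| /* |
| * For illustration (hypothetical allocation, one CPU per task): on a node |
| * contributing 2 whole sockets, 16 whole cores and 32 whole threads, |
| * launching 2 tasks auto-binds to sockets, 16 tasks to cores and 32 tasks |
| * to threads; for any other count a TaskPluginParam default applies if |
| * configured, else the step gets a plain CPU_BIND_MASK of all allocated |
| * CPUs. |
| */ |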
| extern int lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id, |
| char **err_msg) |
| { |
| int rc = SLURM_SUCCESS; |
| bitstr_t **masks = NULL; |
| char buf_type[100]; |
| int maxtasks = req->tasks_to_launch[node_id]; |
| int whole_nodes, whole_sockets, whole_cores, whole_threads; |
| int part_sockets, part_cores; |
| const uint32_t *gtid = req->global_task_ids[node_id]; |
| static uint16_t bind_entity = |
| CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES | |
| CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS; |
| static uint16_t bind_mode = |
| CPU_BIND_NONE | CPU_BIND_MASK | |
| CPU_BIND_MAP | |
| CPU_BIND_LDMASK | CPU_BIND_LDRANK | |
| CPU_BIND_LDMAP; |
| static int only_one_thread_per_core = -1; |
| |
| if (only_one_thread_per_core == -1) { |
| if (conf->cpus == (conf->sockets * conf->cores)) |
| only_one_thread_per_core = 1; |
| else |
| only_one_thread_per_core = 0; |
| } |
| |
| /* |
| * If this node is configured to use only one thread per core |
| * (CPUs == Sockets * CoresPerSocket), setting |
| * CPU_BIND_ONE_THREAD_PER_CORE is the easiest way to convey that to the |
| * affinity plugin. |
| */ |
| if (only_one_thread_per_core) |
| req->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE; |
| |
| if (req->cpu_bind_type & bind_mode) { |
| /* Explicit step binding specified by user */ |
| char *avail_mask = _alloc_mask(req, |
| &whole_nodes, &whole_sockets, |
| &whole_cores, &whole_threads, |
| &part_sockets, &part_cores); |
| if (!avail_mask) { |
| error("Could not determine allocated CPUs"); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "Could not determine allocated CPUs"); |
| rc = ESLURMD_CPU_BIND_ERROR; |
| } else if ((whole_nodes == 0) && |
| (req->job_core_spec == NO_VAL16) && |
| (!(req->cpu_bind_type & CPU_BIND_MAP)) && |
| (!(req->cpu_bind_type & CPU_BIND_MASK))) { |
| |
| if (!(req->cpu_bind_type & CPU_BIND_NONE)) { |
| rc = ESLURMD_CPU_BIND_ERROR; |
| slurm_sprint_cpu_bind_type(buf_type, |
| req->cpu_bind_type); |
| error("Entire node must be allocated for %s", |
| buf_type); |
| if (err_msg) |
| xstrfmtcat(*err_msg, "Entire node must be allocated for %s", |
| buf_type); |
| } |
| xfree(req->cpu_bind); |
| req->cpu_bind = avail_mask; |
| req->cpu_bind_type &= (~bind_mode); |
| req->cpu_bind_type |= CPU_BIND_MASK; |
| } else { |
| if (req->job_core_spec == NO_VAL16) { |
| if (req->cpu_bind_type & CPU_BIND_MASK) |
| rc = _validate_mask(req, avail_mask, |
| err_msg); |
| else if (req->cpu_bind_type & CPU_BIND_MAP) |
| rc = _validate_map(req, avail_mask, |
| err_msg); |
| } |
| xfree(avail_mask); |
| } |
| slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); |
| info("JobId=%u manual binding: %s", |
| req->step_id.job_id, buf_type); |
| return rc; |
| } |
| |
| if (!(req->cpu_bind_type & bind_entity)) { |
| /* |
| * No bind unit (sockets, cores) specified by user, |
| * pick something reasonable |
| */ |
| bool auto_def_set = false; |
| int spec_thread_cnt = 0; |
| int max_tasks = req->tasks_to_launch[node_id] * |
| req->cpus_per_task; |
| char *avail_mask = _alloc_mask(req, |
| &whole_nodes, &whole_sockets, |
| &whole_cores, &whole_threads, |
| &part_sockets, &part_cores); |
| debug("binding tasks:%d to nodes:%d sockets:%d:%d cores:%d:%d threads:%d", |
| max_tasks, whole_nodes, whole_sockets, |
| part_sockets, whole_cores, part_cores, whole_threads); |
| if ((req->job_core_spec != NO_VAL16) && |
| (req->job_core_spec & CORE_SPEC_THREAD) && |
| (req->job_core_spec != CORE_SPEC_THREAD)) { |
| spec_thread_cnt = req->job_core_spec & |
| (~CORE_SPEC_THREAD); |
| } |
| if (((max_tasks == whole_sockets) && (part_sockets == 0)) || |
| (spec_thread_cnt && |
| (max_tasks == (whole_sockets + part_sockets)))) { |
| req->cpu_bind_type |= CPU_BIND_TO_SOCKETS; |
| goto make_auto; |
| } |
| if (((max_tasks == whole_cores) && (part_cores == 0)) || |
| (spec_thread_cnt && |
| (max_tasks == (whole_cores + part_cores)))) { |
| req->cpu_bind_type |= CPU_BIND_TO_CORES; |
| goto make_auto; |
| } |
| if (max_tasks == whole_threads) { |
| req->cpu_bind_type |= CPU_BIND_TO_THREADS; |
| goto make_auto; |
| } |
| |
| if (slurm_conf.task_plugin_param & CPU_AUTO_BIND_TO_THREADS) { |
| auto_def_set = true; |
| req->cpu_bind_type |= CPU_BIND_TO_THREADS; |
| goto make_auto; |
| } else if (slurm_conf.task_plugin_param & |
| CPU_AUTO_BIND_TO_CORES) { |
| auto_def_set = true; |
| req->cpu_bind_type |= CPU_BIND_TO_CORES; |
| goto make_auto; |
| } else if (slurm_conf.task_plugin_param & |
| CPU_AUTO_BIND_TO_SOCKETS) { |
| auto_def_set = true; |
| req->cpu_bind_type |= CPU_BIND_TO_SOCKETS; |
| goto make_auto; |
| } |
| |
| if (avail_mask) { |
| xfree(req->cpu_bind); |
| req->cpu_bind = avail_mask; |
| req->cpu_bind_type |= CPU_BIND_MASK; |
| } |
| |
| slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); |
| info("JobId=%u auto binding off: %s", |
| req->step_id.job_id, buf_type); |
| return rc; |
| |
| make_auto: xfree(avail_mask); |
| slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); |
| info("JobId=%u %s auto binding: %s, dist %d", |
| req->step_id.job_id, |
| (auto_def_set) ? "default" : "implicit", |
| buf_type, req->task_dist); |
| } else { |
| /* Explicit bind unit (sockets, cores) specified by user */ |
| slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); |
| info("JobId=%u binding: %s, dist %d", |
| req->step_id.job_id, buf_type, req->task_dist); |
| } |
| |
| switch (req->task_dist & SLURM_DIST_NODESOCKMASK) { |
| case SLURM_DIST_BLOCK_BLOCK: |
| case SLURM_DIST_CYCLIC_BLOCK: |
| case SLURM_DIST_PLANE: |
| debug2("JobId=%u will use lllp_block", |
| req->step_id.job_id); |
| /* tasks are distributed in blocks within a plane */ |
| rc = _task_layout_lllp_block(req, node_id, &masks); |
| break; |
| case SLURM_DIST_ARBITRARY: |
| case SLURM_DIST_BLOCK: |
| case SLURM_DIST_CYCLIC: |
| case SLURM_DIST_UNKNOWN: |
| if (slurm_conf.select_type_param & |
| SELECT_CORE_DEFAULT_DIST_BLOCK) { |
| debug2("JobId=%u will use lllp_block because of SelectTypeParameters", |
| req->step_id.job_id); |
| rc = _task_layout_lllp_block(req, node_id, &masks); |
| break; |
| } |
| /* |
| * We want to fall through here if we aren't doing a |
| * default dist block. |
| */ |
| default: |
| debug2("JobId=%u will use lllp_cyclic because of SelectTypeParameters", |
| req->step_id.job_id); |
| rc = _task_layout_lllp_cyclic(req, node_id, &masks); |
| break; |
| } |
| |
| /* |
| * FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS & |
| * max_cores - does select/cons_tres plugin allocate whole |
| * socket??? Maybe not. Check srun man page. |
| */ |
| |
| if (rc == SLURM_SUCCESS) { |
| _task_layout_display_masks(req, gtid, maxtasks, masks); |
| /* translate abstract masks to actual hardware layout */ |
| _lllp_map_abstract_masks(maxtasks, masks); |
| _task_layout_display_masks(req, gtid, maxtasks, masks); |
| #ifdef HAVE_NUMA |
| if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { |
| _match_masks_to_ldom(maxtasks, masks); |
| _task_layout_display_masks(req, gtid, maxtasks, masks); |
| } |
| #endif |
| /* convert masks into cpu_bind mask string */ |
| _lllp_generate_cpu_bind(req, maxtasks, masks); |
| } else { |
| char *avail_mask = _alloc_mask(req, |
| &whole_nodes, &whole_sockets, |
| &whole_cores, &whole_threads, |
| &part_sockets, &part_cores); |
| if (avail_mask) { |
| xfree(req->cpu_bind); |
| req->cpu_bind = avail_mask; |
| req->cpu_bind_type &= (~bind_mode); |
| req->cpu_bind_type |= CPU_BIND_MASK; |
| } |
| |
| if (req->flags & LAUNCH_OVERCOMMIT) { |
| /* |
| * Allow the step to run despite not being able to |
| * distribute tasks. |
| * e.g. Overcommit will fail to distribute tasks because |
| * the step wants more CPUs than allocated. |
| */ |
| rc = SLURM_SUCCESS; |
| } else if (err_msg) { |
| slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); |
| xstrfmtcat(*err_msg, "JobId=%u failed to distribute tasks (bind_type:%s) - this should never happen", |
| req->step_id.job_id, buf_type); |
| error("%s", *err_msg); |
| } |
| } |
| if (masks) |
| _lllp_free_masks(maxtasks, masks); |
| return rc; |
| } |
| |
| |
| /* |
| * _get_local_node_info - get job allocation details for this node |
| * IN: arg - credential arguments describing the job allocation |
| * IN: job_node_id - index of the local node in the job allocation |
| * OUT: sockets - socket count for this node |
| * OUT: cores - cores_per_socket count for this node |
| * OUT: returns the core_bitmap index of the first core for this node |
| */ |
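| /* |
| * For illustration (hypothetical credential): with sockets_per_node = |
| * {2, 1}, cores_per_socket = {4, 8} and sock_core_rep_count = {3, 1}, |
| * the first three job nodes each have 2 sockets x 4 cores (8 core_bitmap |
| * bits apiece) and the fourth node has 1 socket x 8 cores, so |
| * job_node_id = 3 yields sockets = 1, cores = 8 and bit_start = 24. |
| */ |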
| static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id, |
| uint16_t *sockets, uint16_t *cores) |
| { |
| int bit_start = 0, bit_finish = 0; |
| int i, index = -1, cur_node_id = -1; |
| |
| do { |
| index++; |
| for (i = 0; i < arg->sock_core_rep_count[index] && |
| cur_node_id < job_node_id; i++) { |
| bit_start = bit_finish; |
| bit_finish += arg->sockets_per_node[index] * |
| arg->cores_per_socket[index]; |
| cur_node_id++; |
| } |
| |
| } while (cur_node_id < job_node_id); |
| |
| *sockets = arg->sockets_per_node[index]; |
| *cores = arg->cores_per_socket[index]; |
| return bit_start; |
| } |
| |
| /* |
| * Determine which CPUs a job step can use. |
| * OUT whole_<entity>_count - returns count of whole <entities> in this |
| * allocation for this node |
| * OUT part_<entity>_count - returns count of partial <entities> in this |
| * allocation for this node |
| * RET - a string representation of the available mask or NULL on error |
| * NOTE: Caller must xfree() the return value. |
| */ |
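| /* |
| * For illustration (hypothetical node): on a 2-socket, 2-core, 2-thread |
| * node whose allocation covers all of socket 0 plus one whole core of |
| * socket 1, this yields whole_node_cnt = 0, whole_socket_cnt = 1, |
| * part_socket_cnt = 1, whole_core_cnt = 3, part_core_cnt = 0 and |
| * whole_thread_cnt = 6. |
| */ |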
| static char *_alloc_mask(launch_tasks_request_msg_t *req, |
| int *whole_node_cnt, int *whole_socket_cnt, |
| int *whole_core_cnt, int *whole_thread_cnt, |
| int *part_socket_cnt, int *part_core_cnt) |
| { |
| uint16_t sockets, cores, threads; |
| int c, s, t, i; |
| int c_miss, s_miss, t_miss, c_hit, t_hit; |
| bitstr_t *alloc_bitmap; |
| char *str_mask; |
| bitstr_t *alloc_mask; |
| |
| *whole_node_cnt = 0; |
| *whole_socket_cnt = 0; |
| *whole_core_cnt = 0; |
| *whole_thread_cnt = 0; |
| *part_socket_cnt = 0; |
| *part_core_cnt = 0; |
| |
| alloc_bitmap = _get_avail_map(req->cred, &sockets, &cores, &threads); |
| if (!alloc_bitmap) |
| return NULL; |
| |
| alloc_mask = bit_alloc(bit_size(alloc_bitmap)); |
| |
| i = 0; |
| for (s = 0, s_miss = false; s < sockets; s++) { |
| for (c = 0, c_hit = c_miss = false; c < cores; c++) { |
| for (t = 0, t_hit = t_miss = false; t < threads; t++) { |
| /* |
| * If the configuration pretends this is a larger |
| * system than we really have, wrap the index so we |
| * don't run past the end of the allocation bitmap. |
| */ |
| if (i >= bit_size(alloc_bitmap)) |
| i = 0; |
| if (bit_test(alloc_bitmap, i)) { |
| bit_set(alloc_mask, i); |
| (*whole_thread_cnt)++; |
| t_hit = true; |
| c_hit = true; |
| } else |
| t_miss = true; |
| i++; |
| } |
| if (!t_miss) |
| (*whole_core_cnt)++; |
| else { |
| if (t_hit) |
| (*part_core_cnt)++; |
| c_miss = true; |
| } |
| } |
| if (!c_miss) |
| (*whole_socket_cnt)++; |
| else { |
| if (c_hit) |
| (*part_socket_cnt)++; |
| s_miss = true; |
| } |
| } |
| if (!s_miss) |
| (*whole_node_cnt)++; |
| FREE_NULL_BITMAP(alloc_bitmap); |
| |
| if ((req->job_core_spec != NO_VAL16) && |
| (req->job_core_spec & CORE_SPEC_THREAD) && |
| (req->job_core_spec != CORE_SPEC_THREAD)) { |
| int spec_thread_cnt; |
| spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD); |
| for (t = threads - 1; |
| ((t > 0) && (spec_thread_cnt > 0)); t--) { |
| for (c = cores - 1; |
| ((c > 0) && (spec_thread_cnt > 0)); c--) { |
| for (s = sockets - 1; |
| ((s >= 0) && (spec_thread_cnt > 0)); s--) { |
| i = s * cores + c; |
| i = (i * threads) + t; |
| /* |
| * If config_overrides is used bitmap |
| * may be too small for the counter |
| */ |
| i %= conf->block_map_size; |
| bit_clear(alloc_mask, i); |
| spec_thread_cnt--; |
| } |
| } |
| } |
| } |
| |
| /* translate abstract masks to actual hardware layout */ |
| _lllp_map_abstract_masks(1, &alloc_mask); |
| |
| #ifdef HAVE_NUMA |
| if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { |
| _match_masks_to_ldom(1, &alloc_mask); |
| } |
| #endif |
| |
| str_mask = bit_fmt_hexmask(alloc_mask); |
| FREE_NULL_BITMAP(alloc_mask); |
| return str_mask; |
| } |
| |
| /* |
| * Given a job step request, return an equivalent local bitmap for this node |
| * IN cred - The job step launch request credential |
| * OUT hw_sockets - number of actual sockets on this node |
| * OUT hw_cores - number of actual cores per socket on this node |
| * OUT hw_threads - number of actual threads per core on this node |
| * RET: bitmap of processors available to this job step on this node |
| * OR NULL on error |
| */ |
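| /* |
| * For illustration (hypothetical credential): if step_core_bitmap grants |
| * cores 0 and 2 of this node and the hardware has 2 threads per core, the |
| * returned bitmap has bits {0, 1} and {4, 5} set: each granted core is |
| * expanded to all of its threads. |
| */ |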
| static bitstr_t *_get_avail_map(slurm_cred_t *cred, uint16_t *hw_sockets, |
| uint16_t *hw_cores, uint16_t *hw_threads) |
| { |
| bitstr_t *req_map, *hw_map; |
| uint16_t p, t, new_p, num_cores, sockets, cores; |
| int job_node_id; |
| int start; |
| char *str; |
| int spec_thread_cnt = 0; |
| slurm_cred_arg_t *arg = slurm_cred_get_args(cred); |
| |
| *hw_sockets = conf->actual_sockets; |
| *hw_cores = conf->actual_cores; |
| *hw_threads = conf->actual_threads; |
| |
| /* we need this node's ID in relation to the whole |
| * job allocation, not just this job step */ |
| job_node_id = nodelist_find(arg->job_hostlist, conf->node_name); |
| if ((job_node_id < 0) || (job_node_id >= arg->job_nhosts)) { |
| error("%s: missing node %s in job credential (%s)", |
| __func__, conf->node_name, arg->job_hostlist); |
| slurm_cred_unlock_args(cred); |
| return NULL; |
| } |
| start = _get_local_node_info(arg, job_node_id, &sockets, &cores); |
| debug3("slurmctld s %u c %u; hw s %u c %u t %u", |
| sockets, cores, *hw_sockets, *hw_cores, *hw_threads); |
| |
| num_cores = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores))); |
| req_map = bit_alloc(num_cores); |
| hw_map = bit_alloc(conf->block_map_size); |
| |
| /* Transfer core_bitmap data to local req_map. |
| * The mod operation handles the case where fewer processors |
| * physically exist than are configured (slurmd is out of |
| * sync with the slurmctld daemon). */ |
| for (p = 0; p < (sockets * cores); p++) { |
| if (bit_test(arg->step_core_bitmap, start + p)) |
| bit_set(req_map, (p % num_cores)); |
| } |
| |
| str = (char *)bit_fmt_hexmask(req_map); |
| debug3("%ps core mask from slurmctld: %s", |
| &arg->step_id, str); |
| xfree(str); |
| |
| for (p = 0; p < num_cores; p++) { |
| if (bit_test(req_map, p) == 0) |
| continue; |
| /* If the configuration pretends this is a larger system |
| * than we really have, wrap the index so we don't run |
| * past the end of the block map. |
| */ |
| new_p = p % conf->block_map_size; |
| /* |
| * core_bitmap does not include threads, so we add them here. |
| * Add all configured threads. The step will be limited to |
| * requested threads later. |
| */ |
| for (t = 0; t < (conf->threads); t++) { |
| uint16_t bit = new_p * (*hw_threads) + t; |
| bit %= conf->block_map_size; |
| bit_set(hw_map, bit); |
| } |
| } |
| |
| if ((arg->job_core_spec != NO_VAL16) && |
| (arg->job_core_spec & CORE_SPEC_THREAD) && |
| (arg->job_core_spec != CORE_SPEC_THREAD)) { |
| spec_thread_cnt = arg->job_core_spec & (~CORE_SPEC_THREAD); |
| } |
| if (spec_thread_cnt) { |
| /* Skip specialized threads as needed */ |
| int i, t, c, s; |
| for (t = conf->threads - 1; |
| ((t >= 0) && (spec_thread_cnt > 0)); t--) { |
| for (c = conf->cores - 1; |
| ((c >= 0) && (spec_thread_cnt > 0)); c--) { |
| for (s = conf->sockets - 1; |
| ((s >= 0) && (spec_thread_cnt > 0)); s--) { |
| i = s * conf->cores + c; |
| i = (i * conf->threads) + t; |
| /* |
| * If config_overrides is used bitmap |
| * may be too small for the counter |
| */ |
| i %= conf->block_map_size; |
| bit_clear(hw_map, i); |
| spec_thread_cnt--; |
| } |
| } |
| } |
| } |
| |
| str = (char *)bit_fmt_hexmask(hw_map); |
| debug3("%ps CPU final mask for local node: %s", |
| &arg->step_id, str); |
| xfree(str); |
| |
| FREE_NULL_BITMAP(req_map); |
| slurm_cred_unlock_args(cred); |
| return hw_map; |
| } |
| |
| /* helper function for _expand_masks() */ |
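| /* For illustration: with blot = hw_threads = 2, a mask of 0x4 (CPU 2) is |
| * widened to 0xc (CPUs 2-3, both threads of the same core), keeping only |
| * bits that are also set in avail_map */ |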
| static void _blot_mask(bitstr_t *mask, bitstr_t *avail_map, uint16_t blot) |
| { |
| uint16_t i, j, size = 0; |
| int prev = -1; |
| |
| if (!mask) |
| return; |
| size = bit_size(mask); |
| for (i = 0; i < size; i++) { |
| if (bit_test(mask, i)) { |
| /* fill in this blot */ |
| uint16_t start = (i / blot) * blot; |
| if (start != prev) { |
| for (j = start; j < start + blot; j++) { |
| if (bit_test(avail_map, j)) |
| bit_set(mask, j); |
| } |
| prev = start; |
| } |
| } |
| } |
| } |
| |
| /* helper function for _expand_masks() |
| * for each task, consider which other bits are set in avail_map |
| * on the same socket */ |
| static void _blot_mask_sockets(const uint32_t maxtasks, const uint32_t task, |
| bitstr_t **masks, uint16_t hw_sockets, |
| uint16_t hw_cores, uint16_t hw_threads, |
| bitstr_t *avail_map) |
| { |
| uint16_t i, j, size = 0; |
| int blot; |
| |
| if (!masks[task]) |
| return; |
| |
| blot = bit_size(avail_map) / hw_sockets; |
| if (blot <= 0) |
| blot = 1; |
| size = bit_size(masks[task]); |
| for (i = 0; i < size; i++) { |
| if (bit_test(masks[task], i)) { |
| /* check if other bits are set in avail_map on this |
| * socket and set each corresponding bit in masks */ |
| uint16_t start = (i / blot) * blot; |
| for (j = start; j < start+blot; j++) { |
| if (bit_test(avail_map, j)) |
| bit_set(masks[task], j); |
| } |
| } |
| } |
| } |
| |
| /* for each mask, expand the mask around the set bits to include the |
| * complete resource to which the set bits are to be bound */ |
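| /* For illustration: when binding to cores on a 2-thread machine, each task |
| * mask grows to cover both threads of every selected core; when binding to |
| * sockets it grows to every available CPU on the selected socket(s); when |
| * binding to threads the masks are left unchanged */ |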
| static void _expand_masks(uint16_t cpu_bind_type, const uint32_t maxtasks, |
| bitstr_t **masks, uint16_t hw_sockets, |
| uint16_t hw_cores, uint16_t hw_threads, |
| bitstr_t *avail_map) |
| { |
| uint32_t i; |
| |
| if (cpu_bind_type & CPU_BIND_TO_THREADS) |
| return; |
| if (cpu_bind_type & CPU_BIND_TO_CORES) { |
| if (hw_threads < 2) |
| return; |
| for (i = 0; i < maxtasks; i++) { |
| _blot_mask(masks[i], avail_map, hw_threads); |
| } |
| return; |
| } |
| if (cpu_bind_type & CPU_BIND_TO_SOCKETS) { |
| if (hw_threads*hw_cores < 2) |
| return; |
| for (i = 0; i < maxtasks; i++) { |
| _blot_mask_sockets(maxtasks, i, masks, hw_sockets, |
| hw_cores, hw_threads, avail_map); |
| } |
| return; |
| } |
| } |
| |
| /* |
| * _task_layout_lllp_cyclic |
| * |
| * task_layout_lllp_cyclic creates a cyclic distribution at the |
| * lowest level of logical processor which is either socket, core or |
| * thread depending on the system architecture. The Cyclic algorithm |
| * is the same as the Cyclic distribution performed in srun. |
| * |
| * Distribution at the lllp: |
| * -m hostfile|block|cyclic:block|cyclic |
| * |
| * The first distribution "hostfile|block|cyclic" is computed |
| * in srun. The second distribution "block|cyclic" is computed |
| * locally by each slurmd. |
| * |
| * The input to the lllp distribution algorithms is the gids (task |
| * ids) generated for the local node. |
| * |
| * The output is a mapping of the gids onto logical processors |
| * (thread/core/socket) which is expressed as cpu_bind masks. |
| * |
| * If a task asks for more than one CPU per task, put the tasks as |
| * close as possible (fill the core rather than going to the next |
| * socket for the extra task) |
| * |
| */ |
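| /* |
| * For illustration (hypothetical node): on a 2-socket, 2-core, 1-thread |
| * node with all four abstract CPUs available and cpus_per_task = 1, four |
| * tasks are placed cyclically across sockets: task 0 -> CPU 0 (s0c0), |
| * task 1 -> CPU 2 (s1c0), task 2 -> CPU 1 (s0c1), task 3 -> CPU 3 (s1c1). |
| */ |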
| static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req, |
| uint32_t node_id, bitstr_t ***masks_p) |
| { |
| int last_taskcount = -1, taskcount = 0; |
| uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0; |
| uint16_t offset = 0, p = 0; |
| int size, max_tasks = req->tasks_to_launch[node_id]; |
| int max_cpus = max_tasks * req->cpus_per_task; |
| bitstr_t *avail_map; |
| bitstr_t **masks = NULL; |
| int *socket_last_pu = NULL; |
| int core_inx, pu_per_core, *core_tasks = NULL, *core_threads = NULL; |
| int req_threads_per_core = 0; |
| |
| info ("_task_layout_lllp_cyclic "); |
| |
| avail_map = _get_avail_map(req->cred, &hw_sockets, &hw_cores, |
| &hw_threads); |
| if (!avail_map) |
| return ESLURMD_CPU_LAYOUT_ERROR; |
| |
| if (req->threads_per_core && (req->threads_per_core != NO_VAL16)) |
| req_threads_per_core = req->threads_per_core; |
| else if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) |
| req_threads_per_core = 1; |
| |
| size = bit_set_count(avail_map); |
| /* |
| * If configured threads > hw threads, then we are oversubscribing |
| * threads, so don't check the number of bits set. |
| */ |
| if (req_threads_per_core && (conf->threads <= hw_threads)) { |
| if (size < (req->cpus_per_task * (conf->threads / |
| req_threads_per_core))) { |
| error("only %d bits in avail_map, threads_per_core requires %d!", |
| size, |
| (req->cpus_per_task * (conf->threads / |
| req_threads_per_core))); |
| FREE_NULL_BITMAP(avail_map); |
| return ESLURMD_CPU_LAYOUT_ERROR; |
| } |
| } |
| if (size < max_tasks) { |
| if (!(req->flags & LAUNCH_OVERCOMMIT)) |
| error("only %d bits in avail_map for %d tasks!", |
| size, max_tasks); |
| FREE_NULL_BITMAP(avail_map); |
| return ESLURMD_CPU_LAYOUT_ERROR; |
| } |
| if (size < max_cpus) { |
| /* Possible result of overcommit */ |
| i = size / max_tasks; |
| info("reset cpus_per_task from %d to %d", |
| req->cpus_per_task, i); |
| req->cpus_per_task = i; |
| } |
| |
| pu_per_core = hw_threads; |
| core_tasks = xcalloc(hw_sockets * hw_cores, sizeof(int)); |
| core_threads = xcalloc(hw_sockets * hw_cores, sizeof(int)); |
| socket_last_pu = xcalloc(hw_sockets, sizeof(int)); |
| |
| *masks_p = xcalloc(max_tasks, sizeof(bitstr_t *)); |
| masks = *masks_p; |
| |
| size = bit_size(avail_map); |
| |
| offset = hw_cores * hw_threads; |
| s = 0; |
| while (taskcount < max_tasks) { |
| if (taskcount == last_taskcount) { |
| error("_task_layout_lllp_cyclic failure"); |
| FREE_NULL_BITMAP(avail_map); |
| xfree(core_tasks); |
| xfree(core_threads); |
| xfree(socket_last_pu); |
| return ESLURMD_CPU_LAYOUT_ERROR; |
| } |
| last_taskcount = taskcount; |
| for (i = 0; i < size; i++) { |
| bool already_switched = false; |
| uint16_t bit; |
| uint16_t orig_s = s; |
| |
| while (socket_last_pu[s] >= offset) { |
| /* Switch to the next socket; we have run |
| * out of processing units on this one. */ |
| |
| /* This only happens if the slurmctld |
| * gave us an allocation that made a |
| * task split sockets, or if the |
| * entire allocation is on one socket. |
| */ |
| s = (s + 1) % hw_sockets; |
| if (orig_s == s) { |
| /* This should rarely happen, |
| * but is here for sanity's sake. |
| */ |
| debug("allocation is full, oversubscribing"); |
| memset(core_tasks, 0, |
| (sizeof(int) * |
| hw_sockets * hw_cores)); |
| memset(core_threads, 0, |
| (sizeof(int) * |
| hw_sockets * hw_cores)); |
| memset(socket_last_pu, 0, |
| (sizeof(int) * hw_sockets)); |
| } |
| } |
| |
| bit = socket_last_pu[s] + (s * offset); |
| |
| /* In case hardware and config differ */ |
| bit %= size; |
| |
| /* set up for the next one */ |
| socket_last_pu[s]++; |
| |
| if (!bit_test(avail_map, bit)) |
| continue; |
| |
| core_inx = bit / pu_per_core; |
| if ((req->ntasks_per_core != 0) && |
| (core_tasks[core_inx] >= req->ntasks_per_core)) |
| continue; |
| if (req_threads_per_core && |
| (core_threads[core_inx] >= req_threads_per_core)) |
| continue; |
| |
| if (!masks[taskcount]) |
| masks[taskcount] = |
| bit_alloc(conf->block_map_size); |
| |
| //info("setting %d %d", taskcount, bit); |
| bit_set(masks[taskcount], bit); |
| |
| if (!already_switched && |
| (((req->task_dist & SLURM_DIST_NODESOCKMASK) == |
| SLURM_DIST_CYCLIC_CFULL) || |
| ((req->task_dist & SLURM_DIST_NODESOCKMASK) == |
| SLURM_DIST_BLOCK_CFULL))) { |
| /* This means we are laying out cpus |
| * within a task cyclically as well. */ |
| s = (s + 1) % hw_sockets; |
| already_switched = true; |
| } |
| |
| core_threads[core_inx]++; |
| |
| if (++p < req->cpus_per_task) |
| continue; |
| |
| core_tasks[core_inx]++; |
| |
| /* When binding to cores, skip this core's remaining threads */ |
| if ((req->cpu_bind_type & CPU_BIND_TO_CORES) || |
| (req->ntasks_per_core == 1)) { |
| int threads_not_used; |
| if (req->cpus_per_task < hw_threads) |
| threads_not_used = |
| hw_threads - req->cpus_per_task; |
| else |
| threads_not_used = |
| req->cpus_per_task % hw_threads; |
| socket_last_pu[s] += threads_not_used; |
| } |
| p = 0; |
| |
| if (!already_switched) { |
| /* Now that we have finished a task, switch to |
| * the next socket. */ |
| s = (s + 1) % hw_sockets; |
| } |
| |
| if (++taskcount >= max_tasks) |
| break; |
| } |
| } |
| |
| /* last step: expand the masks to bind each task |
| * to the requested resource */ |
| _expand_masks(req->cpu_bind_type, max_tasks, masks, |
| hw_sockets, hw_cores, hw_threads, avail_map); |
| FREE_NULL_BITMAP(avail_map); |
| xfree(core_tasks); |
| xfree(core_threads); |
| xfree(socket_last_pu); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * _task_layout_lllp_block |
| * |
| * task_layout_lllp_block will create a block distribution at the |
| * lowest level of logical processor which is either socket, core or |
| * thread depending on the system architecture. The Block algorithm |
| * is the same as the Block distribution performed in srun. |
| * |
| * Distribution at the lllp: |
| * -m hostfile|plane|block|cyclic:block|cyclic |
| * |
| * The first distribution "hostfile|plane|block|cyclic" is computed |
| * in srun. The second distribution "plane|block|cyclic" is computed |
| * locally by each slurmd. |
| * |
| * The input to the lllp distribution algorithms is the gids (task |
| * ids) generated for the local node. |
| * |
| * The output is a mapping of the gids onto logical processors |
| * (thread/core/socket) which is expressed as cpu_bind masks. |
| * |
| */ |
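| /* |
| * For illustration (the same hypothetical 2-socket, 2-core, 1-thread node |
| * as above, cpus_per_task = 1): four tasks are placed in block order: |
| * task 0 -> CPU 0 (s0c0), task 1 -> CPU 1 (s0c1), task 2 -> CPU 2 (s1c0), |
| * task 3 -> CPU 3 (s1c1). |
| */ |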
| static int _task_layout_lllp_block(launch_tasks_request_msg_t *req, |
| uint32_t node_id, bitstr_t ***masks_p) |
| { |
| int c, i, size, last_taskcount = -1, taskcount = 0; |
| uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0; |
| int max_tasks = req->tasks_to_launch[node_id]; |
| int max_cpus = max_tasks * req->cpus_per_task; |
| bitstr_t *avail_map; |
| bitstr_t **masks = NULL; |
| int core_inx, pu_per_core, *core_tasks = NULL, *core_threads = NULL; |
| int sock_inx, pu_per_socket, *socket_tasks = NULL; |
| int req_threads_per_core = 0; |
| |
| info("_task_layout_lllp_block "); |
| |
| avail_map = _get_avail_map(req->cred, &hw_sockets, &hw_cores, |
| &hw_threads); |
| if (!avail_map) { |
| return ESLURMD_CPU_LAYOUT_ERROR; |
| } |
| |
| if (req->threads_per_core && (req->threads_per_core != NO_VAL16)) |
| req_threads_per_core = req->threads_per_core; |
| else if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) |
| req_threads_per_core = 1; |
| |
| size = bit_set_count(avail_map); |
| /* |
| * If configured threads > hw threads, then we are oversubscribing |
| * threads, so don't check the number of bits set. |
| */ |
| if (req_threads_per_core && (conf->threads <= hw_threads)) { |
| if (size < (req->cpus_per_task * (conf->threads / |
| req_threads_per_core))) { |
| error("only %d bits in avail_map, threads_per_core requires %d!", |
| size, |
| (req->cpus_per_task * (conf->threads / |
| req_threads_per_core))); |
| FREE_NULL_BITMAP(avail_map); |
| return ESLURMD_CPU_LAYOUT_ERROR; |
| } |
| } |
| if (size < max_tasks) { |
| if (!(req->flags & LAUNCH_OVERCOMMIT)) |
| error("only %d bits in avail_map for %d tasks!", |
| size, max_tasks); |
| FREE_NULL_BITMAP(avail_map); |
| return ESLURMD_CPU_LAYOUT_ERROR; |
| } |
| if (size < max_cpus) { |
| /* Possible result of overcommit */ |
| i = size / max_tasks; |
| info("reset cpus_per_task from %d to %d", |
| req->cpus_per_task, i); |
| req->cpus_per_task = i; |
| } |
| size = bit_size(avail_map); |
| |
| *masks_p = xcalloc(max_tasks, sizeof(bitstr_t *)); |
| masks = *masks_p; |
| |
| pu_per_core = hw_threads; |
| core_tasks = xcalloc(hw_sockets * hw_cores, sizeof(int)); |
| core_threads = xcalloc(hw_sockets * hw_cores, sizeof(int)); |
| pu_per_socket = hw_cores * hw_threads; |
| socket_tasks = xcalloc(hw_sockets, sizeof(int)); |
| |
| /* block distribution with oversubscription */ |
| c = 0; |
| while (taskcount < max_tasks) { |
| if (taskcount == last_taskcount) { |
| error("_task_layout_lllp_block infinite loop"); |
| FREE_NULL_BITMAP(avail_map); |
| xfree(core_tasks); |
| xfree(core_threads); |
| xfree(socket_tasks); |
| return ESLURMD_CPU_LAYOUT_ERROR; |
| } |
| if (taskcount > 0) { |
| /* Clear counters to over-subscribe, if necessary */ |
| memset(core_tasks, 0, |
| (sizeof(int) * hw_sockets * hw_cores)); |
| memset(core_threads, 0, |
| (sizeof(int) * hw_sockets * hw_cores)); |
| memset(socket_tasks, 0, |
| (sizeof(int) * hw_sockets)); |
| } |
| last_taskcount = taskcount; |
| /* the abstract map is already laid out in block order, |
| * so just iterate over it |
| */ |
| for (i = 0; i < size; i++) { |
| /* skip unavailable resources */ |
| if (bit_test(avail_map, i) == 0) |
| continue; |
| |
| core_inx = i / pu_per_core; |
| if ((req->ntasks_per_core != 0) && |
| (core_tasks[core_inx] >= req->ntasks_per_core)) |
| continue; |
| sock_inx = i / pu_per_socket; |
| if ((req->ntasks_per_socket != 0) && |
| (socket_tasks[sock_inx] >= req->ntasks_per_socket)) |
| continue; |
| if (req_threads_per_core && |
| (core_threads[core_inx] >= req_threads_per_core)) |
| continue; |
| |
| if (!masks[taskcount]) |
| masks[taskcount] = bit_alloc( |
| conf->block_map_size); |
| //info("setting %d %d", taskcount, i); |
| bit_set(masks[taskcount], i); |
| |
| core_threads[core_inx]++; |
| |
| if (++c < req->cpus_per_task) |
| continue; |
| |
| /* We found one! Increment the count on each unit */ |
| core_tasks[core_inx]++; |
| socket_tasks[sock_inx]++; |
| |
| /* When binding to cores, skip this core's remaining threads */ |
| if ((req->cpu_bind_type & CPU_BIND_TO_CORES) || |
| (req->ntasks_per_core == 1)) { |
| int threads_not_used; |
| if (req->cpus_per_task < hw_threads) |
| threads_not_used = |
| hw_threads - req->cpus_per_task; |
| else |
| threads_not_used = |
| req->cpus_per_task % hw_threads; |
| i += threads_not_used; |
| } |
| c = 0; |
| if (++taskcount >= max_tasks) |
| break; |
| } |
| } |
| xfree(core_tasks); |
| xfree(core_threads); |
| xfree(socket_tasks); |
| |
| /* last step: expand the masks to bind each task |
| * to the requested resource */ |
| _expand_masks(req->cpu_bind_type, max_tasks, masks, |
| hw_sockets, hw_cores, hw_threads, avail_map); |
| FREE_NULL_BITMAP(avail_map); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * _lllp_map_abstract_mask |
| * |
| * Map one abstract block mask to a physical machine mask |
| * |
| * IN - mask to map |
| * RET - mapped mask (storage allocated in this routine) |
| */ |
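| /* For illustration: with the hypothetical block_map = {0, 2, 1, 3} |
| * described near the top of this file, an abstract mask of 0x2 (abstract |
| * CPU 1) maps to the physical mask 0x4 (machine CPU 2). */ |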
| static bitstr_t *_lllp_map_abstract_mask(bitstr_t *bitmask) |
| { |
| int i, bit; |
| int num_bits = bit_size(bitmask); |
| bitstr_t *newmask = bit_alloc(num_bits); |
| |
| /* remap to physical machine */ |
| for (i = 0; i < num_bits; i++) { |
| if (bit_test(bitmask,i)) { |
| bit = BLOCK_MAP(i); |
| if (bit < bit_size(newmask)) |
| bit_set(newmask, bit); |
| else |
| error("can't go from %d -> %d since we " |
| "only have %"BITSTR_FMT" bits", |
| i, bit, bit_size(newmask)); |
| } |
| } |
| return newmask; |
| } |
| |
| /* |
| * _lllp_map_abstract_masks |
| * |
| * Map an array of abstract block masks to physical machine masks |
| * |
| * IN- maximum number of tasks |
| * IN/OUT- array of masks |
| */ |
| static void _lllp_map_abstract_masks(const uint32_t maxtasks, bitstr_t **masks) |
| { |
| int i; |
| debug3("_lllp_map_abstract_masks"); |
| |
| for (i = 0; i < maxtasks; i++) { |
| bitstr_t *bitmask = masks[i]; |
| if (bitmask) { |
| bitstr_t *newmask = _lllp_map_abstract_mask(bitmask); |
| FREE_NULL_BITMAP(bitmask); |
| masks[i] = newmask; |
| } |
| } |
| } |
| |
| /* |
| * _lllp_generate_cpu_bind |
| * |
| * Generate the cpu_bind type and string given an array of bitstr_t masks |
| * |
| * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated) |
| * IN- maximum number of tasks |
| * IN- array of masks |
| */ |
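| /* |
| * For illustration (hypothetical 8-CPU node, 2 threads per core): four |
| * per-task masks covering one whole core each would be rendered as |
| * req->cpu_bind = "0x03,0x0C,0x30,0xC0" with CPU_BIND_MASK set. |
| */ |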
| static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req, |
| const uint32_t maxtasks, bitstr_t **masks) |
| { |
| int i, num_bits = 0, masks_len; |
| bitstr_t *bitmask; |
| bitoff_t charsize; |
| char *masks_str = NULL; |
| char buf_type[100]; |
| |
| for (i = 0; i < maxtasks; i++) { |
| bitmask = masks[i]; |
| if (bitmask) { |
| num_bits = bit_size(bitmask); |
| break; |
| } |
| } |
| charsize = (num_bits + 3) / 4; /* ASCII hex digits */ |
| charsize += 3; /* "0x" and trailing "," */ |
| masks_len = maxtasks * charsize + 1; /* number of masks + null */ |
| |
| debug3("%d %"BITSTR_FMT" %d", maxtasks, charsize, |
| masks_len); |
| |
| masks_str = xmalloc(masks_len); |
| masks_len = 0; |
| for (i = 0; i < maxtasks; i++) { |
| char *str; |
| int curlen; |
| bitmask = masks[i]; |
| if (bitmask == NULL) { |
| continue; |
| } |
| str = (char *)bit_fmt_hexmask(bitmask); |
| curlen = strlen(str) + 1; |
| |
| if (masks_len > 0) |
| masks_str[masks_len - 1] = ','; |
| strlcpy(&masks_str[masks_len], str, curlen); |
| masks_len += curlen; |
| xfree(str); |
| } |
| |
| if (req->cpu_bind) { |
| xfree(req->cpu_bind); |
| } |
| if (masks_str[0] != '\0') { |
| req->cpu_bind = masks_str; |
| masks_str = NULL; |
| req->cpu_bind_type |= CPU_BIND_MASK; |
| } else { |
| req->cpu_bind = NULL; |
| req->cpu_bind_type &= ~CPU_BIND_VERBOSE; |
| } |
| xfree(masks_str); |
| |
| /* clear mask generation bits */ |
| req->cpu_bind_type &= ~CPU_BIND_TO_THREADS; |
| req->cpu_bind_type &= ~CPU_BIND_TO_CORES; |
| req->cpu_bind_type &= ~CPU_BIND_TO_SOCKETS; |
| req->cpu_bind_type &= ~CPU_BIND_TO_LDOMS; |
| |
| slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); |
| info("_lllp_generate_cpu_bind jobid [%u]: %s, %s", |
| req->step_id.job_id, buf_type, req->cpu_bind); |
| } |