|  | /*****************************************************************************\ | 
|  | *  Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P. | 
|  | *  Copyright (C) 2008-2009 Lawrence Livermore National Security. | 
|  | *  Written by Susanne M. Balle, <susanne.balle@hp.com> | 
|  | *  CODE-OCEC-09-009. All rights reserved. | 
|  | * | 
|  | *  This file is part of Slurm, a resource management program. | 
|  | *  For details, see <https://slurm.schedmd.com/>. | 
|  | *  Please also read the included file: DISCLAIMER. | 
|  | * | 
|  | *  Slurm is free software; you can redistribute it and/or modify it under | 
|  | *  the terms of the GNU General Public License as published by the Free | 
|  | *  Software Foundation; either version 2 of the License, or (at your option) | 
|  | *  any later version. | 
|  | * | 
|  | *  In addition, as a special exception, the copyright holders give permission | 
|  | *  to link the code of portions of this program with the OpenSSL library under | 
|  | *  certain conditions as described in each individual source file, and | 
|  | *  distribute linked combinations including the two. You must obey the GNU | 
|  | *  General Public License in all respects for all of the code used other than | 
|  | *  OpenSSL. If you modify file(s) with this exception, you may extend this | 
|  | *  exception to your version of the file(s), but you are not obligated to do | 
|  | *  so. If you do not wish to do so, delete this exception statement from your | 
|  | *  version.  If you delete this exception statement from all source files in | 
|  | *  the program, then also delete it here. | 
|  | * | 
|  | *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | *  details. | 
|  | * | 
|  | *  You should have received a copy of the GNU General Public License along | 
|  | *  with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | \*****************************************************************************/ | 
|  |  | 
|  | #define _GNU_SOURCE | 
|  |  | 
|  | #include "affinity.h" | 
|  | #include "dist_tasks.h" | 
|  | #include "src/common/bitstring.h" | 
|  | #include "src/common/log.h" | 
|  | #include "src/interfaces/cred.h" | 
|  | #include "src/common/slurm_protocol_api.h" | 
|  | #include "src/common/slurm_resource_info.h" | 
|  | #include "src/common/strlcpy.h" | 
|  | #include "src/common/xmalloc.h" | 
|  | #include "src/slurmd/slurmd/slurmd.h" | 
|  |  | 
|  | #ifdef HAVE_NUMA | 
|  | #include <numa.h> | 
|  | #endif | 
|  |  | 
|  | static char *_alloc_mask(launch_tasks_request_msg_t *req, | 
|  | int *whole_node_cnt, int *whole_socket_cnt, | 
|  | int *whole_core_cnt, int *whole_thread_cnt, | 
|  | int *part_socket_cnt, int *part_core_cnt); | 
|  | static bitstr_t *_get_avail_map(slurm_cred_t *cred, uint16_t *hw_sockets, | 
|  | uint16_t *hw_cores, uint16_t *hw_threads); | 
|  | static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id, | 
|  | uint16_t *sockets, uint16_t *cores); | 
|  |  | 
|  | static int _task_layout_lllp_block(launch_tasks_request_msg_t *req, | 
|  | uint32_t node_id, bitstr_t ***masks_p); | 
|  | static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req, | 
|  | uint32_t node_id, bitstr_t ***masks_p); | 
|  |  | 
|  | static void _lllp_map_abstract_masks(const uint32_t maxtasks, | 
|  | bitstr_t **masks); | 
|  | static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req, | 
|  | const uint32_t maxtasks, | 
|  | bitstr_t **masks); | 
|  |  | 
|  | /*     BLOCK_MAP     abstract block LLLP index to physical machine LLLP index | 
|  | *     BLOCK_MAP_INV physical machine LLLP index to abstract block LLLP index | 
|  | */ | 
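|  | /* e.g. if abstract CPU index 1 corresponds to physical CPU 4 in the block | 
|  | * layout, BLOCK_MAP(1) yields 4 and BLOCK_MAP_INV(4) yields 1 | 
|  | * (illustrative values only) */ | 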
|  | #define BLOCK_MAP(index)	_block_map(index, conf->block_map) | 
|  | #define BLOCK_MAP_INV(index)	_block_map(index, conf->block_map_inv) | 
|  |  | 
|  |  | 
|  | /* _block_map | 
|  | * | 
|  | * Safely return a mapped index using the provided block map, wrapping | 
|  | * out-of-range indices into the map size. | 
|  | * | 
|  | * IN index - index to map | 
|  | * IN map   - map to use (if NULL, index is returned unchanged) | 
|  | * RET mapped index | 
|  | */ | 
|  | static uint16_t _block_map(uint16_t index, uint16_t *map) | 
|  | { | 
|  | if (map == NULL) { | 
|  | return index; | 
|  | } | 
|  | /* make sure bit falls in map */ | 
|  | if (index >= conf->block_map_size) { | 
|  | debug3("wrapping index %u into block_map_size of %u", | 
|  | index, conf->block_map_size); | 
|  | index = index % conf->block_map_size; | 
|  | } | 
|  | index = map[index]; | 
|  | return(index); | 
|  | } | 
|  |  | 
|  | static void _task_layout_display_masks(launch_tasks_request_msg_t *req, | 
|  | const uint32_t *gtid, | 
|  | const uint32_t maxtasks, | 
|  | bitstr_t **masks) | 
|  | { | 
|  | int i; | 
|  | char *str = NULL; | 
|  | for(i = 0; i < maxtasks; i++) { | 
|  | str = (char *)bit_fmt_hexmask(masks[i]); | 
|  | debug3("_task_layout_display_masks jobid [%u:%d] %s", | 
|  | req->step_id.job_id, gtid[i], str); | 
|  | xfree(str); | 
|  | } | 
|  | } | 
|  |  | 
|  | static void _lllp_free_masks(const uint32_t maxtasks, bitstr_t **masks) | 
|  | { | 
|  | int i; | 
|  | bitstr_t *bitmask; | 
|  |  | 
|  | for (i = 0; i < maxtasks; i++) { | 
|  | bitmask = masks[i]; | 
|  | FREE_NULL_BITMAP(bitmask); | 
|  | } | 
|  | xfree(masks); | 
|  | } | 
|  |  | 
|  | #ifdef HAVE_NUMA | 
|  | /* _match_masks_to_ldom | 
|  | * | 
|  | * expand each mask to encompass the whole locality domain (NUMA node) | 
|  | * within which it currently exists | 
|  | * NOTE: this assumes that the masks are already in logical | 
|  | * (and not abstract) CPU order. | 
|  | */ | 
|  | static void _match_masks_to_ldom(const uint32_t maxtasks, bitstr_t **masks) | 
|  | { | 
|  | uint32_t i, b, size; | 
|  |  | 
|  | if (!masks || !masks[0]) | 
|  | return; | 
|  | size = bit_size(masks[0]); | 
|  | for(i = 0; i < maxtasks; i++) { | 
|  | for (b = 0; b < size; b++) { | 
|  | if (bit_test(masks[i], b)) { | 
|  | /* get the NUMA node for this CPU, and then | 
|  | * set all CPUs in the mask that exist on | 
|  | * the same NUMA node */ | 
|  | int c; | 
|  | uint16_t nnid = slurm_get_numa_node(b); | 
|  | for (c = 0; c < size; c++) { | 
|  | if (slurm_get_numa_node(c) == nnid) | 
|  | bit_set(masks[i], c); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | #endif | 
|  |  | 
|  | /* | 
|  | * batch_bind - Set the batch request message so as to bind the shell to the | 
|  | *	proper resources | 
|  | */ | 
|  | void batch_bind(batch_job_launch_msg_t *req) | 
|  | { | 
|  | bitstr_t *hw_map; | 
|  | int task_cnt = 0; | 
|  | uint16_t sockets = 0, cores = 0, threads = 0; | 
|  |  | 
|  | hw_map = _get_avail_map(req->cred, &sockets, &cores, &threads); | 
|  | if (hw_map) | 
|  | task_cnt = bit_set_count(hw_map); | 
|  |  | 
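|  | /* A non-zero count means the batch step has CPUs on this node to bind to */ | 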
|  | if (task_cnt) { | 
|  | req->cpu_bind_type = CPU_BIND_MASK; | 
|  | if (slurm_conf.task_plugin_param & CPU_BIND_VERBOSE) | 
|  | req->cpu_bind_type |= CPU_BIND_VERBOSE; | 
|  | xfree(req->cpu_bind); | 
|  | req->cpu_bind = (char *)bit_fmt_hexmask(hw_map); | 
|  | info("job %u CPU input mask for node: %s", | 
|  | req->job_id, req->cpu_bind); | 
|  | /* translate abstract masks to actual hardware layout */ | 
|  | _lllp_map_abstract_masks(1, &hw_map); | 
|  | #ifdef HAVE_NUMA | 
|  | if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { | 
|  | _match_masks_to_ldom(1, &hw_map); | 
|  | } | 
|  | #endif | 
|  | xfree(req->cpu_bind); | 
|  | req->cpu_bind = (char *)bit_fmt_hexmask(hw_map); | 
|  | info("job %u CPU final HW mask for node: %s", | 
|  | req->job_id, req->cpu_bind); | 
|  | } else { | 
|  | error("job %u allocated no CPUs", | 
|  | req->job_id); | 
|  | } | 
|  | FREE_NULL_BITMAP(hw_map); | 
|  | } | 
|  |  | 
|  | static int _validate_map(launch_tasks_request_msg_t *req, char *avail_mask, | 
|  | char **err_msg) | 
|  | { | 
|  | char *tmp_map, *save_ptr = NULL, *tok; | 
|  | cpu_set_t avail_cpus; | 
|  | bool superset = true; | 
|  | int rc = SLURM_SUCCESS; | 
|  |  | 
|  | if (!req->cpu_bind) { | 
|  | char *err = "No list of CPU IDs provided to --cpu-bind=map_cpu:<list>"; | 
|  | error("%s", err); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "%s", err); | 
|  | return ESLURMD_CPU_BIND_ERROR; | 
|  | } | 
|  |  | 
|  | CPU_ZERO(&avail_cpus); | 
|  | if (task_str_to_cpuset(&avail_cpus, avail_mask)) { | 
|  | char *err = "Failed to convert avail_mask into hex for CPU bind map"; | 
|  | error("%s", err); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "%s", err); | 
|  | return ESLURMD_CPU_BIND_ERROR; | 
|  | } | 
|  |  | 
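|  | /* Verify every CPU ID in the user-supplied map_cpu list is within the | 
|  | * allocation */ | 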
|  | tmp_map = xstrdup(req->cpu_bind); | 
|  | tok = strtok_r(tmp_map, ",", &save_ptr); | 
|  | while (tok) { | 
|  | int i = atoi(tok); | 
|  | if (!CPU_ISSET(i, &avail_cpus)) { | 
|  | /* A requested CPU ID lies outside the allocation; | 
|  | * reject the CPU map. */ | 
|  | superset = false; | 
|  | break; | 
|  | } | 
|  | tok = strtok_r(NULL, ",", &save_ptr); | 
|  | } | 
|  | xfree(tmp_map); | 
|  |  | 
|  | if (!superset) { | 
|  | error("CPU binding outside of job step allocation, allocated CPUs are: %s.", | 
|  | avail_mask); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "CPU binding outside of job step allocation, allocated CPUs are: %s.", | 
|  | avail_mask); | 
|  | rc = ESLURMD_CPU_BIND_ERROR; | 
|  | } | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | static int _validate_mask(launch_tasks_request_msg_t *req, char *avail_mask, | 
|  | char **err_msg) | 
|  | { | 
|  | char *new_mask = NULL, *save_ptr = NULL, *tok; | 
|  | cpu_set_t avail_cpus, task_cpus; | 
|  | bool superset = true; | 
|  | int rc = SLURM_SUCCESS; | 
|  |  | 
|  | if (!req->cpu_bind) { | 
|  | char *err = "No list of CPU masks provided to --cpu-bind=mask_cpu:<list>"; | 
|  | error("%s", err); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "%s", err); | 
|  | return ESLURMD_CPU_BIND_ERROR; | 
|  | } | 
|  |  | 
|  | CPU_ZERO(&avail_cpus); | 
|  | if (task_str_to_cpuset(&avail_cpus, avail_mask)) { | 
|  | char *err = "Failed to convert avail_mask into hex for CPU bind mask"; | 
|  | error("%s", err); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "%s", err); | 
|  | return ESLURMD_CPU_BIND_ERROR; | 
|  | } | 
|  |  | 
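|  | /* Rebuild each task mask, clearing CPUs outside the allocation; a mask | 
|  | * left with no valid CPUs is replaced by all allowed CPUs */ | 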
|  | tok = strtok_r(req->cpu_bind, ",", &save_ptr); | 
|  | while (tok) { | 
|  | int i, overlaps = 0; | 
|  | char mask_str[CPU_SET_HEX_STR_SIZE]; | 
|  | CPU_ZERO(&task_cpus); | 
|  | if (task_str_to_cpuset(&task_cpus, tok)) { | 
|  | char *err = "Failed to convert cpu bind string into hex for CPU bind mask"; | 
|  | error("%s", err); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "%s", err); | 
|  | xfree(new_mask); | 
|  | return ESLURMD_CPU_BIND_ERROR; | 
|  | } | 
|  | for (i = 0; i < CPU_SETSIZE; i++) { | 
|  | if (!CPU_ISSET(i, &task_cpus)) | 
|  | continue; | 
|  | if (CPU_ISSET(i, &avail_cpus)) { | 
|  | overlaps++; | 
|  | } else { | 
|  | CPU_CLR(i, &task_cpus); | 
|  | superset = false; | 
|  | } | 
|  | } | 
|  | if (overlaps == 0) { | 
|  | /* The task's CPU mask is completely invalid. | 
|  | * Give it all allowed CPUs. */ | 
|  | for (i = 0; i < CPU_SETSIZE; i++) { | 
|  | if (CPU_ISSET(i, &avail_cpus)) | 
|  | CPU_SET(i, &task_cpus); | 
|  | } | 
|  | } | 
|  | task_cpuset_to_str(&task_cpus, mask_str); | 
|  | if (new_mask) | 
|  | xstrcat(new_mask, ","); | 
|  | xstrcat(new_mask, mask_str); | 
|  | tok = strtok_r(NULL, ",", &save_ptr); | 
|  | } | 
|  |  | 
|  | if (!superset) { | 
|  | error("CPU binding outside of job step allocation, allocated CPUs are: %s.", | 
|  | avail_mask); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "CPU binding outside of job step allocation, allocated CPUs are: %s.", | 
|  | avail_mask); | 
|  | rc = ESLURMD_CPU_BIND_ERROR; | 
|  | } | 
|  |  | 
|  | xfree(req->cpu_bind); | 
|  | req->cpu_bind = new_mask; | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * lllp_distribution | 
|  | * | 
|  | * Note: lllp stands for Lowest Level of Logical Processors. | 
|  | * | 
|  | * Automatic binding is used when: | 
|  | *      - no binding mode flags (CPU_BIND_NONE or higher) are set, and | 
|  | *      - an auto binding level is selected: CPU_BIND_TO_{SOCKETS,CORES,THREADS} | 
|  | * Otherwise the job step is limited to the allocated CPUs. | 
|  | * | 
|  | * generate the appropriate cpu_bind type and string which results in | 
|  | * the specified lllp distribution. | 
|  | * | 
|  | * IN/OUT req - job launch request (cpu_bind_type and cpu_bind updated) | 
|  | * IN node_id - index of this node within the job step | 
|  | * OUT err_msg - optional string to pass out error message. | 
|  | */ | 
|  | extern int lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id, | 
|  | char **err_msg) | 
|  | { | 
|  | int rc = SLURM_SUCCESS; | 
|  | bitstr_t **masks = NULL; | 
|  | char buf_type[100]; | 
|  | int maxtasks = req->tasks_to_launch[node_id]; | 
|  | int whole_nodes, whole_sockets, whole_cores, whole_threads; | 
|  | int part_sockets, part_cores; | 
|  | const uint32_t *gtid = req->global_task_ids[node_id]; | 
|  | static uint16_t bind_entity = | 
|  | CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES | | 
|  | CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS; | 
|  | static uint16_t bind_mode = | 
|  | CPU_BIND_NONE | CPU_BIND_MASK | | 
|  | CPU_BIND_MAP | | 
|  | CPU_BIND_LDMASK | CPU_BIND_LDRANK | | 
|  | CPU_BIND_LDMAP; | 
|  | static int only_one_thread_per_core = -1; | 
|  |  | 
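|  | /* Determine once whether the configured CPU count implies a single | 
|  | * usable thread per core */ | 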
|  | if (only_one_thread_per_core == -1) { | 
|  | if (conf->cpus == (conf->sockets * conf->cores)) | 
|  | only_one_thread_per_core = 1; | 
|  | else | 
|  | only_one_thread_per_core = 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * If the node's configured CPU count only accounts for one thread | 
|  | * per core (CPUs == Sockets * Cores), this flag is the easiest way | 
|  | * to convey that to the affinity plugin. | 
|  | */ | 
|  | if (only_one_thread_per_core) | 
|  | req->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE; | 
|  |  | 
|  | if (req->cpu_bind_type & bind_mode) { | 
|  | /* Explicit step binding specified by user */ | 
|  | char *avail_mask = _alloc_mask(req, | 
|  | &whole_nodes,  &whole_sockets, | 
|  | &whole_cores,  &whole_threads, | 
|  | &part_sockets, &part_cores); | 
|  | if (!avail_mask) { | 
|  | error("Could not determine allocated CPUs"); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "Could not determine allocated CPUs"); | 
|  | rc = ESLURMD_CPU_BIND_ERROR; | 
|  | } else if ((whole_nodes == 0) && | 
|  | (req->job_core_spec == NO_VAL16) && | 
|  | (!(req->cpu_bind_type & CPU_BIND_MAP)) && | 
|  | (!(req->cpu_bind_type & CPU_BIND_MASK))) { | 
|  |  | 
|  | if (!(req->cpu_bind_type & CPU_BIND_NONE)) { | 
|  | rc = ESLURMD_CPU_BIND_ERROR; | 
|  | slurm_sprint_cpu_bind_type(buf_type, | 
|  | req->cpu_bind_type); | 
|  | error("Entire node must be allocated for %s", | 
|  | buf_type); | 
|  | if (err_msg) | 
|  | xstrfmtcat(*err_msg, "Entire node must be allocated for %s", | 
|  | buf_type); | 
|  | } | 
|  | xfree(req->cpu_bind); | 
|  | req->cpu_bind = avail_mask; | 
|  | req->cpu_bind_type &= (~bind_mode); | 
|  | req->cpu_bind_type |= CPU_BIND_MASK; | 
|  | } else { | 
|  | if (req->job_core_spec == NO_VAL16) { | 
|  | if (req->cpu_bind_type & CPU_BIND_MASK) | 
|  | rc = _validate_mask(req, avail_mask, | 
|  | err_msg); | 
|  | else if (req->cpu_bind_type & CPU_BIND_MAP) | 
|  | rc = _validate_map(req, avail_mask, | 
|  | err_msg); | 
|  | } | 
|  | xfree(avail_mask); | 
|  | } | 
|  | slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); | 
|  | info("JobId=%u manual binding: %s", | 
|  | req->step_id.job_id, buf_type); | 
|  | return rc; | 
|  | } | 
|  |  | 
|  | if (!(req->cpu_bind_type & bind_entity)) { | 
|  | /* | 
|  | * No bind unit (sockets, cores) specified by user, | 
|  | * pick something reasonable | 
|  | */ | 
|  | bool auto_def_set = false; | 
|  | int spec_thread_cnt = 0; | 
|  | int max_tasks = req->tasks_to_launch[node_id] * | 
|  | req->cpus_per_task; | 
|  | char *avail_mask = _alloc_mask(req, | 
|  | &whole_nodes,  &whole_sockets, | 
|  | &whole_cores,  &whole_threads, | 
|  | &part_sockets, &part_cores); | 
|  | debug("binding tasks:%d to nodes:%d sockets:%d:%d cores:%d:%d threads:%d", | 
|  | max_tasks, whole_nodes, whole_sockets, | 
|  | part_sockets, whole_cores, part_cores, whole_threads); | 
|  | if ((req->job_core_spec != NO_VAL16) && | 
|  | (req->job_core_spec &  CORE_SPEC_THREAD)  && | 
|  | (req->job_core_spec != CORE_SPEC_THREAD)) { | 
|  | spec_thread_cnt = req->job_core_spec & | 
|  | (~CORE_SPEC_THREAD); | 
|  | } | 
|  | if (((max_tasks == whole_sockets) && (part_sockets == 0)) || | 
|  | (spec_thread_cnt && | 
|  | (max_tasks == (whole_sockets + part_sockets)))) { | 
|  | req->cpu_bind_type |= CPU_BIND_TO_SOCKETS; | 
|  | goto make_auto; | 
|  | } | 
|  | if (((max_tasks == whole_cores) && (part_cores == 0)) || | 
|  | (spec_thread_cnt && | 
|  | (max_tasks == (whole_cores + part_cores)))) { | 
|  | req->cpu_bind_type |= CPU_BIND_TO_CORES; | 
|  | goto make_auto; | 
|  | } | 
|  | if (max_tasks == whole_threads) { | 
|  | req->cpu_bind_type |= CPU_BIND_TO_THREADS; | 
|  | goto make_auto; | 
|  | } | 
|  |  | 
|  | if (slurm_conf.task_plugin_param & CPU_AUTO_BIND_TO_THREADS) { | 
|  | auto_def_set = true; | 
|  | req->cpu_bind_type |= CPU_BIND_TO_THREADS; | 
|  | goto make_auto; | 
|  | } else if (slurm_conf.task_plugin_param & | 
|  | CPU_AUTO_BIND_TO_CORES) { | 
|  | auto_def_set = true; | 
|  | req->cpu_bind_type |= CPU_BIND_TO_CORES; | 
|  | goto make_auto; | 
|  | } else if (slurm_conf.task_plugin_param & | 
|  | CPU_AUTO_BIND_TO_SOCKETS) { | 
|  | auto_def_set = true; | 
|  | req->cpu_bind_type |= CPU_BIND_TO_SOCKETS; | 
|  | goto make_auto; | 
|  | } | 
|  |  | 
|  | if (avail_mask) { | 
|  | xfree(req->cpu_bind); | 
|  | req->cpu_bind = avail_mask; | 
|  | req->cpu_bind_type |= CPU_BIND_MASK; | 
|  | } | 
|  |  | 
|  | slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); | 
|  | info("JobId=%u auto binding off: %s", | 
|  | req->step_id.job_id, buf_type); | 
|  | return rc; | 
|  |  | 
|  | make_auto: xfree(avail_mask); | 
|  | slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); | 
|  | info("JobId=%u %s auto binding: %s, dist %d", | 
|  | req->step_id.job_id, | 
|  | (auto_def_set) ? "default" : "implicit", | 
|  | buf_type, req->task_dist); | 
|  | } else { | 
|  | /* Explicit bind unit (sockets, cores) specified by user */ | 
|  | slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); | 
|  | info("JobId=%u binding: %s, dist %d", | 
|  | req->step_id.job_id, buf_type, req->task_dist); | 
|  | } | 
|  |  | 
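|  | /* Select the lllp layout algorithm based on the requested task | 
|  | * distribution */ | 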
|  | switch (req->task_dist & SLURM_DIST_NODESOCKMASK) { | 
|  | case SLURM_DIST_BLOCK_BLOCK: | 
|  | case SLURM_DIST_CYCLIC_BLOCK: | 
|  | case SLURM_DIST_PLANE: | 
|  | debug2("JobId=%u will use lllp_block", | 
|  | req->step_id.job_id); | 
|  | /* tasks are distributed in blocks within a plane */ | 
|  | rc = _task_layout_lllp_block(req, node_id, &masks); | 
|  | break; | 
|  | case SLURM_DIST_ARBITRARY: | 
|  | case SLURM_DIST_BLOCK: | 
|  | case SLURM_DIST_CYCLIC: | 
|  | case SLURM_DIST_UNKNOWN: | 
|  | if (slurm_conf.select_type_param & | 
|  | SELECT_CORE_DEFAULT_DIST_BLOCK) { | 
|  | debug2("JobId=%u will use lllp_block because of SelectTypeParameters", | 
|  | req->step_id.job_id); | 
|  | rc = _task_layout_lllp_block(req, node_id, &masks); | 
|  | break; | 
|  | } | 
|  | /* | 
|  | * We want to fall through here if we aren't doing a | 
|  | * default dist block. | 
|  | */ | 
|  | default: | 
|  | debug2("JobId=%u will use lllp_cyclic because of SelectTypeParameters", | 
|  | req->step_id.job_id); | 
|  | rc = _task_layout_lllp_cyclic(req, node_id, &masks); | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS & | 
|  | * max_cores - does select/cons_tres plugin allocate whole | 
|  | * socket??? Maybe not. Check srun man page. | 
|  | */ | 
|  |  | 
|  | if (rc == SLURM_SUCCESS) { | 
|  | _task_layout_display_masks(req, gtid, maxtasks, masks); | 
|  | /* translate abstract masks to actual hardware layout */ | 
|  | _lllp_map_abstract_masks(maxtasks, masks); | 
|  | _task_layout_display_masks(req, gtid, maxtasks, masks); | 
|  | #ifdef HAVE_NUMA | 
|  | if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { | 
|  | _match_masks_to_ldom(maxtasks, masks); | 
|  | _task_layout_display_masks(req, gtid, maxtasks, masks); | 
|  | } | 
|  | #endif | 
|  | /* convert masks into cpu_bind mask string */ | 
|  | _lllp_generate_cpu_bind(req, maxtasks, masks); | 
|  | } else { | 
|  | char *avail_mask = _alloc_mask(req, | 
|  | &whole_nodes,  &whole_sockets, | 
|  | &whole_cores,  &whole_threads, | 
|  | &part_sockets, &part_cores); | 
|  | if (avail_mask) { | 
|  | xfree(req->cpu_bind); | 
|  | req->cpu_bind = avail_mask; | 
|  | req->cpu_bind_type &= (~bind_mode); | 
|  | req->cpu_bind_type |= CPU_BIND_MASK; | 
|  | } | 
|  |  | 
|  | if (req->flags & LAUNCH_OVERCOMMIT) { | 
|  | /* | 
|  | * Allow the step to run despite not being able to | 
|  | * distribute tasks. | 
|  | * e.g. Overcommit will fail to distribute tasks because | 
|  | * the step wants more CPUs than were allocated. | 
|  | */ | 
|  | rc = SLURM_SUCCESS; | 
|  | } else if (err_msg) { | 
|  | slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); | 
|  | xstrfmtcat(*err_msg, "JobId=%u failed to distribute tasks (bind_type:%s) - this should never happen", | 
|  | req->step_id.job_id, buf_type); | 
|  | error("%s", *err_msg); | 
|  | } | 
|  | } | 
|  | if (masks) | 
|  | _lllp_free_masks(maxtasks, masks); | 
|  | return rc; | 
|  | } | 
|  |  | 
|  |  | 
|  | /* | 
|  | * _get_local_node_info - get job allocation details for this node | 
|  | * IN: arg         - credential arguments for the job | 
|  | * IN: job_node_id - index of the local node in the job allocation | 
|  | * OUT: sockets    - socket count for this node | 
|  | * OUT: cores      - cores_per_socket count for this node | 
|  | * RET: the core_bitmap index of the first core for this node | 
|  | */ | 
|  | static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id, | 
|  | uint16_t *sockets, uint16_t *cores) | 
|  | { | 
|  | int bit_start = 0, bit_finish = 0; | 
|  | int i, index = -1, cur_node_id = -1; | 
|  |  | 
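|  | /* Walk the socket/core replication groups, accumulating core_bitmap | 
|  | * offsets, until this node's entry is reached */ | 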
|  | do { | 
|  | index++; | 
|  | for (i = 0; i < arg->sock_core_rep_count[index] && | 
|  | cur_node_id < job_node_id; i++) { | 
|  | bit_start = bit_finish; | 
|  | bit_finish += arg->sockets_per_node[index] * | 
|  | arg->cores_per_socket[index]; | 
|  | cur_node_id++; | 
|  | } | 
|  |  | 
|  | } while (cur_node_id < job_node_id); | 
|  |  | 
|  | *sockets = arg->sockets_per_node[index]; | 
|  | *cores   = arg->cores_per_socket[index]; | 
|  | return bit_start; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Determine which CPUs a job step can use. | 
|  | * OUT whole_<entity>_count - returns count of whole <entities> in this | 
|  | *                            allocation for this node | 
|  | * OUT part_<entity>_count  - returns count of partial <entities> in this | 
|  | *                            allocation for this node | 
|  | * RET - a string representation of the available mask or NULL on error | 
|  | * NOTE: Caller must xfree() the return value. | 
|  | */ | 
|  | static char *_alloc_mask(launch_tasks_request_msg_t *req, | 
|  | int *whole_node_cnt,  int *whole_socket_cnt, | 
|  | int *whole_core_cnt,  int *whole_thread_cnt, | 
|  | int *part_socket_cnt, int *part_core_cnt) | 
|  | { | 
|  | uint16_t sockets, cores, threads; | 
|  | int c, s, t, i; | 
|  | int c_miss, s_miss, t_miss, c_hit, t_hit; | 
|  | bitstr_t *alloc_bitmap; | 
|  | char *str_mask; | 
|  | bitstr_t *alloc_mask; | 
|  |  | 
|  | *whole_node_cnt   = 0; | 
|  | *whole_socket_cnt = 0; | 
|  | *whole_core_cnt   = 0; | 
|  | *whole_thread_cnt = 0; | 
|  | *part_socket_cnt  = 0; | 
|  | *part_core_cnt    = 0; | 
|  |  | 
|  | alloc_bitmap = _get_avail_map(req->cred, &sockets, &cores, &threads); | 
|  | if (!alloc_bitmap) | 
|  | return NULL; | 
|  |  | 
|  | alloc_mask = bit_alloc(bit_size(alloc_bitmap)); | 
|  |  | 
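|  | /* Scan the allocation thread by thread, tallying whole and partial | 
|  | * sockets, cores and threads */ | 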
|  | i = 0; | 
|  | for (s = 0, s_miss = false; s < sockets; s++) { | 
|  | for (c = 0, c_hit = c_miss = false; c < cores; c++) { | 
|  | for (t = 0, t_hit = t_miss = false; t < threads; t++) { | 
|  | /* | 
|  | * If the configuration claims a larger system than the | 
|  | * hardware actually provides, reset the index to the start | 
|  | * of the bitmap so we do not index past its end. | 
|  | */ | 
|  | if (i >= bit_size(alloc_bitmap)) | 
|  | i = 0; | 
|  | if (bit_test(alloc_bitmap, i)) { | 
|  | bit_set(alloc_mask, i); | 
|  | (*whole_thread_cnt)++; | 
|  | t_hit = true; | 
|  | c_hit = true; | 
|  | } else | 
|  | t_miss = true; | 
|  | i++; | 
|  | } | 
|  | if (!t_miss) | 
|  | (*whole_core_cnt)++; | 
|  | else { | 
|  | if (t_hit) | 
|  | (*part_core_cnt)++; | 
|  | c_miss = true; | 
|  | } | 
|  | } | 
|  | if (!c_miss) | 
|  | (*whole_socket_cnt)++; | 
|  | else { | 
|  | if (c_hit) | 
|  | (*part_socket_cnt)++; | 
|  | s_miss = true; | 
|  | } | 
|  | } | 
|  | if (!s_miss) | 
|  | (*whole_node_cnt)++; | 
|  | FREE_NULL_BITMAP(alloc_bitmap); | 
|  |  | 
|  | if ((req->job_core_spec != NO_VAL16) && | 
|  | (req->job_core_spec &  CORE_SPEC_THREAD)  && | 
|  | (req->job_core_spec != CORE_SPEC_THREAD)) { | 
|  | int spec_thread_cnt; | 
|  | spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD); | 
|  | for (t = threads - 1; | 
|  | ((t > 0) && (spec_thread_cnt > 0)); t--) { | 
|  | for (c = cores - 1; | 
|  | ((c > 0) && (spec_thread_cnt > 0)); c--) { | 
|  | for (s = sockets - 1; | 
|  | ((s >= 0) && (spec_thread_cnt > 0)); s--) { | 
|  | i = s * cores + c; | 
|  | i = (i * threads) + t; | 
|  | /* | 
|  | * If config_overrides is used bitmap | 
|  | * may be too small for the counter | 
|  | */ | 
|  | i %= conf->block_map_size; | 
|  | bit_clear(alloc_mask, i); | 
|  | spec_thread_cnt--; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* translate abstract masks to actual hardware layout */ | 
|  | _lllp_map_abstract_masks(1, &alloc_mask); | 
|  |  | 
|  | #ifdef HAVE_NUMA | 
|  | if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) { | 
|  | _match_masks_to_ldom(1, &alloc_mask); | 
|  | } | 
|  | #endif | 
|  |  | 
|  | str_mask = bit_fmt_hexmask(alloc_mask); | 
|  | FREE_NULL_BITMAP(alloc_mask); | 
|  | return str_mask; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Given a job step request, return an equivalent local bitmap for this node | 
|  | * IN cred         - The job step launch request credential | 
|  | * OUT hw_sockets  - number of actual sockets on this node | 
|  | * OUT hw_cores    - number of actual cores per socket on this node | 
|  | * OUT hw_threads  - number of actual threads per core on this node | 
|  | * RET: bitmap of processors available to this job step on this node | 
|  | *      OR NULL on error | 
|  | */ | 
|  | static bitstr_t *_get_avail_map(slurm_cred_t *cred, uint16_t *hw_sockets, | 
|  | uint16_t *hw_cores, uint16_t *hw_threads) | 
|  | { | 
|  | bitstr_t *req_map, *hw_map; | 
|  | uint16_t p, t, new_p, num_cores, sockets, cores; | 
|  | int job_node_id; | 
|  | int start; | 
|  | char *str; | 
|  | int spec_thread_cnt = 0; | 
|  | slurm_cred_arg_t *arg = slurm_cred_get_args(cred); | 
|  |  | 
|  | *hw_sockets = conf->actual_sockets; | 
|  | *hw_cores   = conf->actual_cores; | 
|  | *hw_threads = conf->actual_threads; | 
|  |  | 
|  | /* we need this node's ID in relation to the whole | 
|  | * job allocation, not just this jobstep */ | 
|  | job_node_id = nodelist_find(arg->job_hostlist, conf->node_name); | 
|  | if ((job_node_id < 0) || (job_node_id > arg->job_nhosts)) { | 
|  | error("%s: missing node %s in job credential (%s)", | 
|  | __func__, conf->node_name, arg->job_hostlist); | 
|  | slurm_cred_unlock_args(cred); | 
|  | return NULL; | 
|  | } | 
|  | start = _get_local_node_info(arg, job_node_id, &sockets, &cores); | 
|  | debug3("slurmctld s %u c %u; hw s %u c %u t %u", | 
|  | sockets, cores, *hw_sockets, *hw_cores, *hw_threads); | 
|  |  | 
|  | num_cores = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores))); | 
|  | req_map = bit_alloc(num_cores); | 
|  | hw_map = bit_alloc(conf->block_map_size); | 
|  |  | 
|  | /* Transfer core_bitmap data to local req_map. | 
|  | * The MOD function handles the case where fewer processors | 
|  | * physically exist than are configured (slurmd is out of | 
|  | * sync with the slurmctld daemon). */ | 
|  | for (p = 0; p < (sockets * cores); p++) { | 
|  | if (bit_test(arg->step_core_bitmap, start + p)) | 
|  | bit_set(req_map, (p % num_cores)); | 
|  | } | 
|  |  | 
|  | str = (char *)bit_fmt_hexmask(req_map); | 
|  | debug3("%ps core mask from slurmctld: %s", | 
|  | &arg->step_id, str); | 
|  | xfree(str); | 
|  |  | 
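|  | /* Expand each allocated core into all of its configured threads in the | 
|  | * local block map */ | 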
|  | for (p = 0; p < num_cores; p++) { | 
|  | if (bit_test(req_map, p) == 0) | 
|  | continue; | 
|  | /* If the configuration claims a larger system than the | 
|  | hardware actually provides, wrap the index so we stay | 
|  | within the local block map. | 
|  | */ | 
|  | new_p = p % conf->block_map_size; | 
|  | /* | 
|  | * core_bitmap does not include threads, so we add them here. | 
|  | * Add all configured threads. The step will be limited to | 
|  | * requested threads later. | 
|  | */ | 
|  | for (t = 0; t < (conf->threads); t++) { | 
|  | uint16_t bit = new_p * (*hw_threads) + t; | 
|  | bit %= conf->block_map_size; | 
|  | bit_set(hw_map, bit); | 
|  | } | 
|  | } | 
|  |  | 
|  | if ((arg->job_core_spec != NO_VAL16) && | 
|  | (arg->job_core_spec &  CORE_SPEC_THREAD)  && | 
|  | (arg->job_core_spec != CORE_SPEC_THREAD)) { | 
|  | spec_thread_cnt = arg->job_core_spec & (~CORE_SPEC_THREAD); | 
|  | } | 
|  | if (spec_thread_cnt) { | 
|  | /* Skip specialized threads as needed */ | 
|  | int i, t, c, s; | 
|  | for (t = conf->threads - 1; | 
|  | ((t >= 0) && (spec_thread_cnt > 0)); t--) { | 
|  | for (c = conf->cores - 1; | 
|  | ((c >= 0) && (spec_thread_cnt > 0)); c--) { | 
|  | for (s = conf->sockets - 1; | 
|  | ((s >= 0) && (spec_thread_cnt > 0)); s--) { | 
|  | i = s * conf->cores + c; | 
|  | i = (i * conf->threads) + t; | 
|  | /* | 
|  | * If config_overrides is used bitmap | 
|  | * may be too small for the counter | 
|  | */ | 
|  | i %= conf->block_map_size; | 
|  | bit_clear(hw_map, i); | 
|  | spec_thread_cnt--; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | str = (char *)bit_fmt_hexmask(hw_map); | 
|  | debug3("%ps CPU final mask for local node: %s", | 
|  | &arg->step_id, str); | 
|  | xfree(str); | 
|  |  | 
|  | FREE_NULL_BITMAP(req_map); | 
|  | slurm_cred_unlock_args(cred); | 
|  | return hw_map; | 
|  | } | 
|  |  | 
|  | /* Helper for _expand_masks(): expand each set bit of "mask" to cover its | 
|  | * enclosing block of "blot" consecutive CPUs present in avail_map */ | 
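|  | /* Example (illustrative): with blot = 2 hardware threads per core, a mask | 
|  | * with only CPU 0 set grows to also cover CPU 1, provided that CPU is | 
|  | * present in avail_map */ | 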
|  | static void _blot_mask(bitstr_t *mask, bitstr_t *avail_map, uint16_t blot) | 
|  | { | 
|  | uint16_t i, j, size = 0; | 
|  | int prev = -1; | 
|  |  | 
|  | if (!mask) | 
|  | return; | 
|  | size = bit_size(mask); | 
|  | for (i = 0; i < size; i++) { | 
|  | if (bit_test(mask, i)) { | 
|  | /* fill in this blot */ | 
|  | uint16_t start = (i / blot) * blot; | 
|  | if (start != prev) { | 
|  | for (j = start; j < start + blot; j++) { | 
|  | if (bit_test(avail_map, j)) | 
|  | bit_set(mask, j); | 
|  | } | 
|  | prev = start; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* helper function for _expand_masks() | 
|  | * for each task, consider which other bits are set in avail_map | 
|  | * on the same socket */ | 
|  | static void _blot_mask_sockets(const uint32_t maxtasks, const uint32_t task, | 
|  | bitstr_t **masks, uint16_t hw_sockets, | 
|  | uint16_t hw_cores, uint16_t hw_threads, | 
|  | bitstr_t *avail_map) | 
|  | { | 
|  | uint16_t i, j, size = 0; | 
|  | int blot; | 
|  |  | 
|  | if (!masks[task]) | 
|  | return; | 
|  |  | 
|  | blot = bit_size(avail_map) / hw_sockets; | 
|  | if (blot <= 0) | 
|  | blot = 1; | 
|  | size = bit_size(masks[task]); | 
|  | for (i = 0; i < size; i++) { | 
|  | if (bit_test(masks[task], i)) { | 
|  | /* check if other bits are set in avail_map on this | 
|  | * socket and set each corresponding bit in masks */ | 
|  | uint16_t start = (i / blot) * blot; | 
|  | for (j = start; j < start+blot; j++) { | 
|  | if (bit_test(avail_map, j)) | 
|  | bit_set(masks[task], j); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* for each mask, expand the mask around the set bits to include the | 
|  | * complete resource to which the set bits are to be bound */ | 
|  | static void _expand_masks(uint16_t cpu_bind_type, const uint32_t maxtasks, | 
|  | bitstr_t **masks, uint16_t hw_sockets, | 
|  | uint16_t hw_cores, uint16_t hw_threads, | 
|  | bitstr_t *avail_map) | 
|  | { | 
|  | uint32_t i; | 
|  |  | 
|  | if (cpu_bind_type & CPU_BIND_TO_THREADS) | 
|  | return; | 
|  | if (cpu_bind_type & CPU_BIND_TO_CORES) { | 
|  | if (hw_threads < 2) | 
|  | return; | 
|  | for (i = 0; i < maxtasks; i++) { | 
|  | _blot_mask(masks[i], avail_map, hw_threads); | 
|  | } | 
|  | return; | 
|  | } | 
|  | if (cpu_bind_type & CPU_BIND_TO_SOCKETS) { | 
|  | if (hw_threads*hw_cores < 2) | 
|  | return; | 
|  | for (i = 0; i < maxtasks; i++) { | 
|  | _blot_mask_sockets(maxtasks, i, masks, hw_sockets, | 
|  | hw_cores, hw_threads, avail_map); | 
|  | } | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * _task_layout_lllp_cyclic | 
|  | * | 
|  | * task_layout_lllp_cyclic creates a cyclic distribution at the | 
|  | * lowest level of logical processor which is either socket, core or | 
|  | * thread depending on the system architecture. The Cyclic algorithm | 
|  | * is the same as the Cyclic distribution performed in srun. | 
|  | * | 
|  | *  Distribution at the lllp: | 
|  | *  -m hostfile|block|cyclic:block|cyclic | 
|  | * | 
|  | * The first distribution "hostfile|block|cyclic" is computed | 
|  | * in srun. The second distribution "block|cyclic" is computed | 
|  | * locally by each slurmd. | 
|  | * | 
|  | * The input to the lllp distribution algorithms is the gids (task | 
|  | * ids) generated for the local node. | 
|  | * | 
|  | * The output is a mapping of the gids onto logical processors | 
|  | * (thread/core/socket) which is expressed as cpu_bind masks. | 
|  | * | 
|  | * If a task asks for more than one CPU, place its CPUs as close | 
|  | * together as possible (fill the core before moving to the next | 
|  | * socket for the extra CPUs). | 
|  | * | 
|  | */ | 
|  | static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req, | 
|  | uint32_t node_id, bitstr_t ***masks_p) | 
|  | { | 
|  | int last_taskcount = -1, taskcount = 0; | 
|  | uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0; | 
|  | uint16_t offset = 0, p = 0; | 
|  | int size, max_tasks = req->tasks_to_launch[node_id]; | 
|  | int max_cpus = max_tasks * req->cpus_per_task; | 
|  | bitstr_t *avail_map; | 
|  | bitstr_t **masks = NULL; | 
|  | int *socket_last_pu = NULL; | 
|  | int core_inx, pu_per_core, *core_tasks = NULL, *core_threads = NULL; | 
|  | int req_threads_per_core = 0; | 
|  |  | 
|  | info ("_task_layout_lllp_cyclic "); | 
|  |  | 
|  | avail_map = _get_avail_map(req->cred, &hw_sockets, &hw_cores, | 
|  | &hw_threads); | 
|  | if (!avail_map) | 
|  | return ESLURMD_CPU_LAYOUT_ERROR; | 
|  |  | 
|  | if (req->threads_per_core && (req->threads_per_core != NO_VAL16)) | 
|  | req_threads_per_core = req->threads_per_core; | 
|  | else if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) | 
|  | req_threads_per_core = 1; | 
|  |  | 
|  | size = bit_set_count(avail_map); | 
|  | /* | 
|  | * If configured threads > hw threads, then we are oversubscribing | 
|  | * threads, so don't check the number of bits set. | 
|  | */ | 
|  | if (req_threads_per_core && (conf->threads <= hw_threads)) { | 
|  | if (size < (req->cpus_per_task * (conf->threads / | 
|  | req_threads_per_core))) { | 
|  | error("only %d bits in avail_map, threads_per_core requires %d!", | 
|  | size, | 
|  | (req->cpus_per_task * (conf->threads / | 
|  | req_threads_per_core))); | 
|  | FREE_NULL_BITMAP(avail_map); | 
|  | return ESLURMD_CPU_LAYOUT_ERROR; | 
|  | } | 
|  | } | 
|  | if (size < max_tasks) { | 
|  | if (!(req->flags & LAUNCH_OVERCOMMIT)) | 
|  | error("only %d bits in avail_map for %d tasks!", | 
|  | size, max_tasks); | 
|  | FREE_NULL_BITMAP(avail_map); | 
|  | return ESLURMD_CPU_LAYOUT_ERROR; | 
|  | } | 
|  | if (size < max_cpus) { | 
|  | /* Possible result of overcommit */ | 
|  | i = size / max_tasks; | 
|  | info("reset cpus_per_task from %d to %d", | 
|  | req->cpus_per_task, i); | 
|  | req->cpus_per_task = i; | 
|  | } | 
|  |  | 
|  | pu_per_core = hw_threads; | 
|  | core_tasks = xcalloc(hw_sockets * hw_cores, sizeof(int)); | 
|  | core_threads = xcalloc(hw_sockets * hw_cores, sizeof(int)); | 
|  | socket_last_pu = xcalloc(hw_sockets, sizeof(int)); | 
|  |  | 
|  | *masks_p = xcalloc(max_tasks, sizeof(bitstr_t *)); | 
|  | masks = *masks_p; | 
|  |  | 
|  | size = bit_size(avail_map); | 
|  |  | 
|  | offset = hw_cores * hw_threads; | 
|  | s = 0; | 
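|  | /* offset is the number of processing units per socket; s tracks the | 
|  | * socket currently being filled */ | 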
|  | while (taskcount < max_tasks) { | 
|  | if (taskcount == last_taskcount) { | 
|  | error("_task_layout_lllp_cyclic failure"); | 
|  | FREE_NULL_BITMAP(avail_map); | 
|  | xfree(core_tasks); | 
|  | xfree(core_threads); | 
|  | xfree(socket_last_pu); | 
|  | return ESLURMD_CPU_LAYOUT_ERROR; | 
|  | } | 
|  | last_taskcount = taskcount; | 
|  | for (i = 0; i < size; i++) { | 
|  | bool already_switched = false; | 
|  | uint16_t bit; | 
|  | uint16_t orig_s = s; | 
|  |  | 
|  | while (socket_last_pu[s] >= offset) { | 
|  | /* We have run out of processing units on | 
|  | * this socket; switch to the next one. */ | 
|  |  | 
|  | /* This only happens if the slurmctld | 
|  | * gave us an allocation that made a | 
|  | * task split sockets.  Or if the | 
|  | * entire allocation is on one socket. | 
|  | */ | 
|  | s = (s + 1) % hw_sockets; | 
|  | if (orig_s == s) { | 
|  | /* This should rarely happen, | 
|  | * but is here for sanity's sake. | 
|  | */ | 
|  | debug("allocation is full, " | 
|  | "oversubscribing"); | 
|  | memset(core_tasks, 0, | 
|  | (sizeof(int) * | 
|  | hw_sockets * hw_cores)); | 
|  | memset(core_threads, 0, | 
|  | (sizeof(int) * | 
|  | hw_sockets * hw_cores)); | 
|  | memset(socket_last_pu, 0, | 
|  | (sizeof(int) * hw_sockets)); | 
|  | } | 
|  | } | 
|  |  | 
|  | bit = socket_last_pu[s] + (s * offset); | 
|  |  | 
|  | /* In case hardware and config differ */ | 
|  | bit %= size; | 
|  |  | 
|  | /* set up for the next one */ | 
|  | socket_last_pu[s]++; | 
|  |  | 
|  | if (!bit_test(avail_map, bit)) | 
|  | continue; | 
|  |  | 
|  | core_inx = bit / pu_per_core; | 
|  | if ((req->ntasks_per_core != 0) && | 
|  | (core_tasks[core_inx] >= req->ntasks_per_core)) | 
|  | continue; | 
|  | if (req_threads_per_core && | 
|  | (core_threads[core_inx] >= req_threads_per_core)) | 
|  | continue; | 
|  |  | 
|  | if (!masks[taskcount]) | 
|  | masks[taskcount] = | 
|  | bit_alloc(conf->block_map_size); | 
|  |  | 
|  | //info("setting %d %d", taskcount, bit); | 
|  | bit_set(masks[taskcount], bit); | 
|  |  | 
|  | if (!already_switched && | 
|  | (((req->task_dist & SLURM_DIST_NODESOCKMASK) == | 
|  | SLURM_DIST_CYCLIC_CFULL) || | 
|  | ((req->task_dist & SLURM_DIST_NODESOCKMASK) == | 
|  | SLURM_DIST_BLOCK_CFULL))) { | 
|  | /* This means we are laying out cpus | 
|  | * within a task cyclically as well. */ | 
|  | s = (s + 1) % hw_sockets; | 
|  | already_switched = true; | 
|  | } | 
|  |  | 
|  | core_threads[core_inx]++; | 
|  |  | 
|  | if (++p < req->cpus_per_task) | 
|  | continue; | 
|  |  | 
|  | core_tasks[core_inx]++; | 
|  |  | 
|  | /* Binding to cores; skip the remaining threads of this core */ | 
|  | if ((req->cpu_bind_type & CPU_BIND_TO_CORES) || | 
|  | (req->ntasks_per_core == 1)) { | 
|  | int threads_not_used; | 
|  | if (req->cpus_per_task < hw_threads) | 
|  | threads_not_used = | 
|  | hw_threads - req->cpus_per_task; | 
|  | else | 
|  | threads_not_used = | 
|  | req->cpus_per_task % hw_threads; | 
|  | socket_last_pu[s] += threads_not_used; | 
|  | } | 
|  | p = 0; | 
|  |  | 
|  | if (!already_switched) { | 
|  | /* Now that we have finished a task, switch to | 
|  | * the next socket. */ | 
|  | s = (s + 1) % hw_sockets; | 
|  | } | 
|  |  | 
|  | if (++taskcount >= max_tasks) | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* last step: expand the masks to bind each task | 
|  | * to the requested resource */ | 
|  | _expand_masks(req->cpu_bind_type, max_tasks, masks, | 
|  | hw_sockets, hw_cores, hw_threads, avail_map); | 
|  | FREE_NULL_BITMAP(avail_map); | 
|  | xfree(core_tasks); | 
|  | xfree(core_threads); | 
|  | xfree(socket_last_pu); | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * _task_layout_lllp_block | 
|  | * | 
|  | * task_layout_lllp_block will create a block distribution at the | 
|  | * lowest level of logical processor which is either socket, core or | 
|  | * thread depending on the system architecture. The Block algorithm | 
|  | * is the same as the Block distribution performed in srun. | 
|  | * | 
|  | *  Distribution at the lllp: | 
|  | *  -m hostfile|plane|block|cyclic:block|cyclic | 
|  | * | 
|  | * The first distribution "hostfile|plane|block|cyclic" is computed | 
|  | * in srun. The second distribution "block|cyclic" is computed | 
|  | * locally by each slurmd. | 
|  | * | 
|  | * The input to the lllp distribution algorithms is the gids (task | 
|  | * ids) generated for the local node. | 
|  | * | 
|  | * The output is a mapping of the gids onto logical processors | 
|  | * (thread/core/socket) which is expressed as cpu_bind masks. | 
|  | * | 
|  | */ | 
|  | static int _task_layout_lllp_block(launch_tasks_request_msg_t *req, | 
|  | uint32_t node_id, bitstr_t ***masks_p) | 
|  | { | 
|  | int c, i, size, last_taskcount = -1, taskcount = 0; | 
|  | uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0; | 
|  | int max_tasks = req->tasks_to_launch[node_id]; | 
|  | int max_cpus = max_tasks * req->cpus_per_task; | 
|  | bitstr_t *avail_map; | 
|  | bitstr_t **masks = NULL; | 
|  | int core_inx, pu_per_core, *core_tasks = NULL, *core_threads = NULL; | 
|  | int sock_inx, pu_per_socket, *socket_tasks = NULL; | 
|  | int req_threads_per_core = 0; | 
|  |  | 
|  | info("_task_layout_lllp_block "); | 
|  |  | 
|  | avail_map = _get_avail_map(req->cred, &hw_sockets, &hw_cores, | 
|  | &hw_threads); | 
|  | if (!avail_map) { | 
|  | return ESLURMD_CPU_LAYOUT_ERROR; | 
|  | } | 
|  |  | 
|  | if (req->threads_per_core && (req->threads_per_core != NO_VAL16)) | 
|  | req_threads_per_core = req->threads_per_core; | 
|  | else if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) | 
|  | req_threads_per_core = 1; | 
|  |  | 
|  | size = bit_set_count(avail_map); | 
|  | /* | 
|  | * If configured threads > hw threads, then we are oversubscribing | 
|  | * threads, so don't check the number of bits set. | 
|  | */ | 
|  | if (req_threads_per_core && (conf->threads <= hw_threads)) { | 
|  | if (size < (req->cpus_per_task * (conf->threads / | 
|  | req_threads_per_core))) { | 
|  | error("only %d bits in avail_map, threads_per_core requires %d!", | 
|  | size, | 
|  | (req->cpus_per_task * (conf->threads / | 
|  | req_threads_per_core))); | 
|  | FREE_NULL_BITMAP(avail_map); | 
|  | return ESLURMD_CPU_LAYOUT_ERROR; | 
|  | } | 
|  | } | 
|  | if (size < max_tasks) { | 
|  | if (!(req->flags & LAUNCH_OVERCOMMIT)) | 
|  | error("only %d bits in avail_map for %d tasks!", | 
|  | size, max_tasks); | 
|  | FREE_NULL_BITMAP(avail_map); | 
|  | return ESLURMD_CPU_LAYOUT_ERROR; | 
|  | } | 
|  | if (size < max_cpus) { | 
|  | /* Possible result of overcommit */ | 
|  | i = size / max_tasks; | 
|  | info("reset cpus_per_task from %d to %d", | 
|  | req->cpus_per_task, i); | 
|  | req->cpus_per_task = i; | 
|  | } | 
|  | size = bit_size(avail_map); | 
|  |  | 
|  | *masks_p = xcalloc(max_tasks, sizeof(bitstr_t *)); | 
|  | masks = *masks_p; | 
|  |  | 
|  | pu_per_core = hw_threads; | 
|  | core_tasks = xcalloc(hw_sockets * hw_cores, sizeof(int)); | 
|  | core_threads = xcalloc(hw_sockets * hw_cores, sizeof(int)); | 
|  | pu_per_socket = hw_cores * hw_threads; | 
|  | socket_tasks = xcalloc(hw_sockets, sizeof(int)); | 
|  |  | 
|  | /* block distribution with oversubscription */ | 
|  | c = 0; | 
|  | while (taskcount < max_tasks) { | 
|  | if (taskcount == last_taskcount) { | 
|  | error("_task_layout_lllp_block infinite loop"); | 
|  | FREE_NULL_BITMAP(avail_map); | 
|  | xfree(core_tasks); | 
|  | xfree(core_threads); | 
|  | xfree(socket_tasks); | 
|  | return ESLURMD_CPU_LAYOUT_ERROR; | 
|  | } | 
|  | if (taskcount > 0) { | 
|  | /* Clear counters to over-subscribe, if necessary */ | 
|  | memset(core_tasks, 0, | 
|  | (sizeof(int) * hw_sockets * hw_cores)); | 
|  | memset(core_threads, 0, | 
|  | (sizeof(int) * hw_sockets * hw_cores)); | 
|  | memset(socket_tasks, 0, | 
|  | (sizeof(int) * hw_sockets)); | 
|  | } | 
|  | last_taskcount = taskcount; | 
|  | /* the abstract map is already laid out in block order, | 
|  | * so just iterate over it | 
|  | */ | 
|  | for (i = 0; i < size; i++) { | 
|  | /* skip unavailable resources */ | 
|  | if (bit_test(avail_map, i) == 0) | 
|  | continue; | 
|  |  | 
|  | core_inx = i / pu_per_core; | 
|  | if ((req->ntasks_per_core != 0) && | 
|  | (core_tasks[core_inx] >= req->ntasks_per_core)) | 
|  | continue; | 
|  | sock_inx = i / pu_per_socket; | 
|  | if ((req->ntasks_per_socket != 0) && | 
|  | (socket_tasks[sock_inx] >= req->ntasks_per_socket)) | 
|  | continue; | 
|  | if (req_threads_per_core && | 
|  | (core_threads[core_inx] >= req_threads_per_core)) | 
|  | continue; | 
|  |  | 
|  | if (!masks[taskcount]) | 
|  | masks[taskcount] = bit_alloc( | 
|  | conf->block_map_size); | 
|  | //info("setting %d %d", taskcount, i); | 
|  | bit_set(masks[taskcount], i); | 
|  |  | 
|  | core_threads[core_inx]++; | 
|  |  | 
|  | if (++c < req->cpus_per_task) | 
|  | continue; | 
|  |  | 
|  | /* We found one! Increment the count on each unit */ | 
|  | core_tasks[core_inx]++; | 
|  | socket_tasks[sock_inx]++; | 
|  |  | 
|  | /* Binding to cores; skip the remaining threads of this core */ | 
|  | if ((req->cpu_bind_type & CPU_BIND_TO_CORES) || | 
|  | (req->ntasks_per_core == 1)) { | 
|  | int threads_not_used; | 
|  | if (req->cpus_per_task < hw_threads) | 
|  | threads_not_used = | 
|  | hw_threads - req->cpus_per_task; | 
|  | else | 
|  | threads_not_used = | 
|  | req->cpus_per_task % hw_threads; | 
|  | i += threads_not_used; | 
|  | } | 
|  | c = 0; | 
|  | if (++taskcount >= max_tasks) | 
|  | break; | 
|  | } | 
|  | } | 
|  | xfree(core_tasks); | 
|  | xfree(core_threads); | 
|  | xfree(socket_tasks); | 
|  |  | 
|  | /* last step: expand the masks to bind each task | 
|  | * to the requested resource */ | 
|  | _expand_masks(req->cpu_bind_type, max_tasks, masks, | 
|  | hw_sockets, hw_cores, hw_threads, avail_map); | 
|  | FREE_NULL_BITMAP(avail_map); | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * _lllp_map_abstract_mask | 
|  | * | 
|  | * Map one abstract block mask to a physical machine mask | 
|  | * | 
|  | * IN  - mask to map | 
|  | * RET - mapped mask (storage allocated in this routine) | 
|  | */ | 
|  | static bitstr_t *_lllp_map_abstract_mask(bitstr_t *bitmask) | 
|  | { | 
|  | int i, bit; | 
|  | int num_bits = bit_size(bitmask); | 
|  | bitstr_t *newmask = bit_alloc(num_bits); | 
|  |  | 
|  | /* remap to physical machine */ | 
|  | for (i = 0; i < num_bits; i++) { | 
|  | if (bit_test(bitmask,i)) { | 
|  | bit = BLOCK_MAP(i); | 
|  | if (bit < bit_size(newmask)) | 
|  | bit_set(newmask, bit); | 
|  | else | 
|  | error("can't go from %d -> %d since we " | 
|  | "only have %"BITSTR_FMT" bits", | 
|  | i, bit, bit_size(newmask)); | 
|  | } | 
|  | } | 
|  | return newmask; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * _lllp_map_abstract_masks | 
|  | * | 
|  | * Map an array of abstract block masks to physical machine masks | 
|  | * | 
|  | * IN- maximum number of tasks | 
|  | * IN/OUT- array of masks | 
|  | */ | 
|  | static void _lllp_map_abstract_masks(const uint32_t maxtasks, bitstr_t **masks) | 
|  | { | 
|  | int i; | 
|  | debug3("_lllp_map_abstract_masks"); | 
|  |  | 
|  | for (i = 0; i < maxtasks; i++) { | 
|  | bitstr_t *bitmask = masks[i]; | 
|  | if (bitmask) { | 
|  | bitstr_t *newmask = _lllp_map_abstract_mask(bitmask); | 
|  | FREE_NULL_BITMAP(bitmask); | 
|  | masks[i] = newmask; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * _lllp_generate_cpu_bind | 
|  | * | 
|  | * Generate the cpu_bind type and string given an array of bitstr_t masks | 
|  | * | 
|  | * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated) | 
|  | * IN- maximum number of tasks | 
|  | * IN- array of masks | 
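|  | * NOTE: the resulting cpu_bind string is a comma-separated list of | 
|  | * hexadecimal masks, one per task | 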
|  | */ | 
|  | static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req, | 
|  | const uint32_t maxtasks, bitstr_t **masks) | 
|  | { | 
|  | int i, num_bits = 0, masks_len; | 
|  | bitstr_t *bitmask; | 
|  | bitoff_t charsize; | 
|  | char *masks_str = NULL; | 
|  | char buf_type[100]; | 
|  |  | 
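|  | /* Find the first non-NULL mask to determine the bitmap width */ | 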
|  | for (i = 0; i < maxtasks; i++) { | 
|  | bitmask = masks[i]; | 
|  | if (bitmask) { | 
|  | num_bits = bit_size(bitmask); | 
|  | break; | 
|  | } | 
|  | } | 
|  | charsize = (num_bits + 3) / 4;		/* ASCII hex digits */ | 
|  | charsize += 3;				/* "0x" and trailing "," */ | 
|  | masks_len = maxtasks * charsize + 1;	/* number of masks + null */ | 
|  |  | 
|  | debug3("%d %"BITSTR_FMT" %d", maxtasks, charsize, | 
|  | masks_len); | 
|  |  | 
|  | masks_str = xmalloc(masks_len); | 
|  | masks_len = 0; | 
|  | for (i = 0; i < maxtasks; i++) { | 
|  | char *str; | 
|  | int curlen; | 
|  | bitmask = masks[i]; | 
|  | if (bitmask == NULL) { | 
|  | continue; | 
|  | } | 
|  | str = (char *)bit_fmt_hexmask(bitmask); | 
|  | curlen = strlen(str) + 1; | 
|  |  | 
|  | if (masks_len > 0) | 
|  | masks_str[masks_len - 1] = ','; | 
|  | strlcpy(&masks_str[masks_len], str, curlen); | 
|  | masks_len += curlen; | 
|  | xfree(str); | 
|  | } | 
|  |  | 
|  | if (req->cpu_bind) { | 
|  | xfree(req->cpu_bind); | 
|  | } | 
|  | if (masks_str[0] != '\0') { | 
|  | req->cpu_bind = masks_str; | 
|  | masks_str = NULL; | 
|  | req->cpu_bind_type |= CPU_BIND_MASK; | 
|  | } else { | 
|  | req->cpu_bind = NULL; | 
|  | req->cpu_bind_type &= ~CPU_BIND_VERBOSE; | 
|  | } | 
|  | xfree(masks_str); | 
|  |  | 
|  | /* clear mask generation bits */ | 
|  | req->cpu_bind_type &= ~CPU_BIND_TO_THREADS; | 
|  | req->cpu_bind_type &= ~CPU_BIND_TO_CORES; | 
|  | req->cpu_bind_type &= ~CPU_BIND_TO_SOCKETS; | 
|  | req->cpu_bind_type &= ~CPU_BIND_TO_LDOMS; | 
|  |  | 
|  | slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type); | 
|  | info("_lllp_generate_cpu_bind jobid [%u]: %s, %s", | 
|  | req->step_id.job_id, buf_type, req->cpu_bind); | 
|  | } |