|  | /*****************************************************************************\ | 
|  | *  slurm_step_layout.c - functions to distribute tasks over nodes. | 
|  | ***************************************************************************** | 
|  | *  Copyright (C) 2005 Hewlett-Packard Development Company, L.P. | 
|  | *  Written by Chris Holmes, <cholmes@hp.com>, who borrowed heavily | 
|  | *  from other parts of SLURM. | 
|  | *  CODE-OCEC-09-009. All rights reserved. | 
|  | * | 
|  | *  This file is part of Slurm, a resource management program. | 
|  | *  For details, see <https://slurm.schedmd.com/>. | 
|  | *  Please also read the included file: DISCLAIMER. | 
|  | * | 
|  | *  Slurm is free software; you can redistribute it and/or modify it under | 
|  | *  the terms of the GNU General Public License as published by the Free | 
|  | *  Software Foundation; either version 2 of the License, or (at your option) | 
|  | *  any later version. | 
|  | * | 
|  | *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY | 
|  | *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
|  | *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more | 
|  | *  details. | 
|  | * | 
|  | *  You should have received a copy of the GNU General Public License along | 
|  | *  with Slurm; if not, write to the Free Software Foundation, Inc., | 
|  | *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA. | 
|  | * | 
|  | *  This file is patterned after hostlist.c, written by Mark Grondona and | 
|  | *  Copyright (C) 2002 The Regents of the University of California. | 
|  | \*****************************************************************************/ | 
|  |  | 
|  | #include <stdlib.h> | 
|  | #include <string.h> | 
|  |  | 
|  | #include "slurm/slurm.h" | 
|  | #include "slurm/slurm_errno.h" | 
|  |  | 
|  | #include "src/common/log.h" | 
|  | #include "src/common/read_config.h" | 
|  | #include "src/interfaces/select.h" | 
|  | #include "src/common/slurm_protocol_api.h" | 
|  | #include "src/common/slurm_step_layout.h" | 
|  | #include "src/common/slurmdb_defs.h" | 
|  | #include "src/common/xmalloc.h" | 
|  | #include "src/common/xstring.h" | 
|  |  | 
|  | /* | 
|  | ** Define slurm-specific aliases for use by plugins, see slurm_xlator.h | 
|  | ** for details. | 
|  | */ | 
|  | strong_alias(pack_slurm_step_layout, slurm_pack_slurm_step_layout); | 
|  | strong_alias(unpack_slurm_step_layout, slurm_unpack_slurm_step_layout); | 
|  |  | 
|  | /* build maps for task layout on nodes */ | 
|  | static int _init_task_layout(slurm_step_layout_req_t *step_layout_req, | 
|  | slurm_step_layout_t *step_layout, | 
|  | const char *arbitrary_nodes); | 
|  |  | 
|  | static int _task_layout_block(slurm_step_layout_t *step_layout, | 
|  | uint16_t *cpus); | 
|  | static int _task_layout_cyclic(slurm_step_layout_t *step_layout, | 
|  | uint16_t *cpus); | 
|  | static int _task_layout_plane(slurm_step_layout_t *step_layout, | 
|  | uint16_t *cpus); | 
|  | static int _task_layout_hostfile(slurm_step_layout_t *step_layout, | 
|  | const char *arbitrary_nodes); | 
|  |  | 
|  | /* | 
|  | * slurm_step_layout_create - determine how many tasks of a job will be | 
|  | *                    run on each node. Distribution is influenced | 
|  | *                    by number of cpus on each host. | 
|  | * IN step_layout_req - information needed for task distibutionhostlist corresponding to task layout | 
|  | * RET a pointer to an slurm_step_layout_t structure | 
|  | * NOTE: allocates memory that should be xfreed by caller | 
|  | */ | 
|  | slurm_step_layout_t *slurm_step_layout_create( | 
|  | slurm_step_layout_req_t *step_layout_req) | 
|  | { | 
|  | char *arbitrary_nodes = NULL; | 
|  | slurm_step_layout_t *step_layout = | 
|  | xmalloc(sizeof(slurm_step_layout_t)); | 
|  |  | 
|  | step_layout->task_dist = step_layout_req->task_dist; | 
|  | if ((step_layout->task_dist & SLURM_DIST_STATE_BASE) | 
|  | == SLURM_DIST_ARBITRARY) { | 
|  | hostlist_t *hl = NULL; | 
|  | char *buf = NULL; | 
|  | /* set the node list for the task layout later if user | 
|  | * supplied could be different that the job allocation */ | 
|  | arbitrary_nodes = xstrdup(step_layout_req->node_list); | 
|  | hl = hostlist_create(step_layout_req->node_list); | 
|  | hostlist_uniq(hl); | 
|  | buf = hostlist_ranged_string_xmalloc(hl); | 
|  | step_layout_req->num_hosts = hostlist_count(hl); | 
|  | hostlist_destroy(hl); | 
|  | step_layout->node_list = buf; | 
|  | } else { | 
|  | step_layout->node_list = xstrdup(step_layout_req->node_list); | 
|  | } | 
|  |  | 
|  | step_layout->task_cnt  = step_layout_req->num_tasks; | 
|  | step_layout->node_cnt = step_layout_req->num_hosts; | 
|  |  | 
|  | if (_init_task_layout(step_layout_req, step_layout, arbitrary_nodes) | 
|  | != SLURM_SUCCESS) { | 
|  | slurm_step_layout_destroy(step_layout); | 
|  | step_layout = NULL; | 
|  | } | 
|  | xfree(arbitrary_nodes); | 
|  | return step_layout; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * fake_slurm_step_layout_create - used when you don't allocate a job from the | 
|  | *                    controller does not set up anything | 
|  | *                    that should really be used with a switch. | 
|  | *                    Or to really lay out tasks any any certain fashion. | 
|  | * IN tlist - hostlist corresponding to task layout | 
|  | * IN cpus_per_node - cpus per node NULL if no allocation | 
|  | * IN cpu_count_reps - how many nodes have same cpu count NULL if no allocation | 
|  | * IN node_cnt - number of nodes we have | 
|  | * IN task_cnt - number of tasks to distribute across these cpus 0 | 
|  | *               if using cpus_per_node | 
|  | * RET a pointer to an slurm_step_layout_t structure | 
|  | * NOTE: allocates memory that should be xfreed by caller | 
|  | */ | 
|  | extern slurm_step_layout_t *fake_slurm_step_layout_create( | 
|  | const char *tlist, | 
|  | uint16_t *cpus_per_node, | 
|  | uint32_t *cpu_count_reps, | 
|  | uint32_t node_cnt, | 
|  | uint32_t task_cnt, | 
|  | uint16_t protocol_version) | 
|  | { | 
|  | uint32_t cpn = 1; | 
|  | int cpu_cnt = 0, cpu_inx = 0, i, j; | 
|  | slurm_step_layout_t *step_layout = NULL; | 
|  |  | 
|  | if (!node_cnt || !tlist || | 
|  | (!cpus_per_node && (!task_cnt || (task_cnt == NO_VAL)))) { | 
|  | error("there is a problem with your fake_step_layout request\n" | 
|  | "node_cnt = %u, task_cnt = %u, tlist = %s", | 
|  | node_cnt, task_cnt, tlist); | 
|  | return NULL; | 
|  | } | 
|  |  | 
|  | step_layout = xmalloc(sizeof(slurm_step_layout_t)); | 
|  | step_layout->node_list = xstrdup(tlist); | 
|  | step_layout->node_cnt = node_cnt; | 
|  | step_layout->start_protocol_ver = protocol_version; | 
|  | step_layout->tasks = xcalloc(node_cnt, sizeof(uint16_t)); | 
|  | step_layout->tids = xcalloc(node_cnt, sizeof(uint32_t *)); | 
|  |  | 
|  | step_layout->task_cnt = 0; | 
|  | for (i = 0; i < step_layout->node_cnt; i++) { | 
|  | if (cpus_per_node && cpu_count_reps) { | 
|  | step_layout->tasks[i] = cpus_per_node[cpu_inx]; | 
|  | step_layout->tids[i] = xcalloc(step_layout->tasks[i], | 
|  | sizeof(uint32_t)); | 
|  |  | 
|  | for (j = 0; j < step_layout->tasks[i]; j++) | 
|  | step_layout->tids[i][j] = | 
|  | step_layout->task_cnt++; | 
|  |  | 
|  | if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) { | 
|  | /* move to next record */ | 
|  | cpu_inx++; | 
|  | cpu_cnt = 0; | 
|  | } | 
|  | } else { | 
|  | cpn = ROUNDUP((task_cnt - step_layout->task_cnt), | 
|  | (node_cnt - i)); | 
|  | if (step_layout->task_cnt >= task_cnt) { | 
|  | step_layout->tasks[i] = 0; | 
|  | step_layout->tids[i] = NULL; | 
|  | } else { | 
|  | step_layout->tasks[i] = cpn; | 
|  | step_layout->tids[i] = | 
|  | xcalloc(cpn, sizeof(uint32_t)); | 
|  |  | 
|  | for (j = 0; j < cpn; j++) { | 
|  | step_layout->tids[i][j] = | 
|  | step_layout->task_cnt++; | 
|  | if (step_layout->task_cnt >= task_cnt) { | 
|  | step_layout->tasks[i] = j + 1; | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | return step_layout; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | /* copies structure for step layout */ | 
|  | extern slurm_step_layout_t *slurm_step_layout_copy( | 
|  | slurm_step_layout_t *step_layout) | 
|  | { | 
|  | slurm_step_layout_t *layout; | 
|  | int i = 0; | 
|  | if (!step_layout) | 
|  | return NULL; | 
|  |  | 
|  | layout = xmalloc(sizeof(slurm_step_layout_t)); | 
|  | if (step_layout->alias_addrs) { | 
|  | layout->alias_addrs = xmalloc(sizeof(slurm_node_alias_addrs_t)); | 
|  | slurm_copy_node_alias_addrs_members(layout->alias_addrs, | 
|  | step_layout->alias_addrs); | 
|  | } | 
|  | layout->node_list = xstrdup(step_layout->node_list); | 
|  | layout->node_cnt = step_layout->node_cnt; | 
|  | layout->start_protocol_ver = step_layout->start_protocol_ver; | 
|  | layout->task_cnt = step_layout->task_cnt; | 
|  | layout->task_dist = step_layout->task_dist; | 
|  |  | 
|  | layout->tasks = xcalloc(layout->node_cnt, sizeof(uint16_t)); | 
|  | memcpy(layout->tasks, step_layout->tasks, | 
|  | (sizeof(uint16_t) * layout->node_cnt)); | 
|  | if (step_layout->cpt_compact_cnt) { | 
|  | uint32_t cnt = step_layout->cpt_compact_cnt; | 
|  |  | 
|  | layout->cpt_compact_cnt = cnt; | 
|  | layout->cpt_compact_array = | 
|  | xcalloc(cnt, sizeof(*layout->cpt_compact_array)); | 
|  | memcpy(layout->cpt_compact_array, | 
|  | step_layout->cpt_compact_array, | 
|  | (sizeof(*layout->cpt_compact_array) * cnt)); | 
|  |  | 
|  | layout->cpt_compact_reps = | 
|  | xcalloc(cnt, sizeof(*layout->cpt_compact_reps)); | 
|  | memcpy(layout->cpt_compact_reps, | 
|  | step_layout->cpt_compact_reps, | 
|  | (sizeof(*layout->cpt_compact_reps) * cnt)); | 
|  |  | 
|  | } | 
|  |  | 
|  | layout->tids = xcalloc(layout->node_cnt, sizeof(uint32_t *)); | 
|  | for (i = 0; i < layout->node_cnt; i++) { | 
|  | layout->tids[i] = xcalloc(layout->tasks[i], sizeof(uint32_t)); | 
|  | memcpy(layout->tids[i], step_layout->tids[i], | 
|  | (sizeof(uint32_t) * layout->tasks[i])); | 
|  | } | 
|  |  | 
|  | return layout; | 
|  | } | 
|  |  | 
|  | extern void slurm_step_layout_merge(slurm_step_layout_t *step_layout1, | 
|  | slurm_step_layout_t *step_layout2) | 
|  | { | 
|  | hostlist_t *hl, *hl2; | 
|  | hostlist_iterator_t *host_itr; | 
|  | int new_pos = 0, node_task_cnt; | 
|  | char *host; | 
|  |  | 
|  | xassert(step_layout1); | 
|  | xassert(step_layout2); | 
|  |  | 
|  | /* | 
|  | * cpt_compact* fields are currently not used by the clients who issue | 
|  | * the RPC that calls this function. So, we currently do not merge | 
|  | * the cpt_compact* fields. | 
|  | */ | 
|  |  | 
|  | hl = hostlist_create(step_layout1->node_list); | 
|  | hl2 = hostlist_create(step_layout2->node_list); | 
|  |  | 
|  | host_itr = hostlist_iterator_create(hl2); | 
|  | while ((host = hostlist_next(host_itr))) { | 
|  | int pos = hostlist_find(hl, host); | 
|  |  | 
|  | if (pos == -1) { | 
|  | /* If the host doesn't exist push it on the end */ | 
|  | hostlist_push_host(hl, host); | 
|  | pos = step_layout1->node_cnt++; | 
|  | xrecalloc(step_layout1->tasks, | 
|  | step_layout1->node_cnt, | 
|  | sizeof(uint16_t)); | 
|  | xrecalloc(step_layout1->tids, | 
|  | step_layout1->node_cnt, | 
|  | sizeof(uint32_t *)); | 
|  | } | 
|  | free(host); | 
|  |  | 
|  | /* set the end position of the array */ | 
|  | node_task_cnt = step_layout1->tasks[pos]; | 
|  | step_layout1->tasks[pos] += | 
|  | step_layout2->tasks[new_pos]; | 
|  | xrecalloc(step_layout1->tids[pos], | 
|  | step_layout1->tasks[pos], | 
|  | sizeof(uint32_t)); | 
|  | for (int i = 0; i < step_layout2->tasks[new_pos]; i++) { | 
|  | step_layout1->tids[pos][node_task_cnt++] = | 
|  | step_layout2->tids[new_pos][i]; | 
|  | } | 
|  | new_pos++; | 
|  | } | 
|  | hostlist_iterator_destroy(host_itr); | 
|  |  | 
|  | /* Don't need to merge alias_addrs it is per-job */ | 
|  | step_layout1->task_cnt += step_layout2->task_cnt; | 
|  | xfree(step_layout1->node_list); | 
|  | step_layout1->node_list = hostlist_ranged_string_xmalloc(hl); | 
|  | hostlist_destroy(hl); | 
|  | hostlist_destroy(hl2); | 
|  | } | 
|  |  | 
|  | extern void pack_slurm_step_layout(slurm_step_layout_t *step_layout, | 
|  | buf_t *buffer, uint16_t protocol_version) | 
|  | { | 
|  | uint32_t i = 0; | 
|  |  | 
|  | if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { | 
|  | if (step_layout) | 
|  | i = 1; | 
|  |  | 
|  | pack16(i, buffer); | 
|  | if (!i) | 
|  | return; | 
|  | packnull(buffer); | 
|  | packstr(step_layout->node_list, buffer); | 
|  | pack32(step_layout->node_cnt, buffer); | 
|  | pack16(step_layout->start_protocol_ver, buffer); | 
|  | pack32(step_layout->task_cnt, buffer); | 
|  | pack32(step_layout->task_dist, buffer); | 
|  |  | 
|  | for (i = 0; i < step_layout->node_cnt; i++) { | 
|  | pack32_array(step_layout->tids[i], | 
|  | step_layout->tasks[i], | 
|  | buffer); | 
|  | } | 
|  |  | 
|  | pack16_array(step_layout->cpt_compact_array, | 
|  | step_layout->cpt_compact_cnt, buffer); | 
|  | pack32_array(step_layout->cpt_compact_reps, | 
|  | step_layout->cpt_compact_cnt, buffer); | 
|  |  | 
|  | if (step_layout->alias_addrs) { | 
|  | char *tmp_str = | 
|  | create_net_cred(step_layout->alias_addrs, | 
|  | protocol_version); | 
|  | packstr(tmp_str, buffer); | 
|  | xfree(tmp_str); | 
|  | } else { | 
|  | packnull(buffer); | 
|  | } | 
|  | } else { | 
|  | error("%s: protocol_version %hu not supported", | 
|  | __func__, protocol_version); | 
|  | } | 
|  | } | 
|  |  | 
|  | extern int unpack_slurm_step_layout(slurm_step_layout_t **layout, buf_t *buffer, | 
|  | uint16_t protocol_version) | 
|  | { | 
|  | uint16_t uint16_tmp; | 
|  | uint32_t num_tids, uint32_tmp; | 
|  | slurm_step_layout_t *step_layout = NULL; | 
|  | int i; | 
|  | char *tmp_str = NULL; | 
|  |  | 
|  | if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { | 
|  | safe_unpack16(&uint16_tmp, buffer); | 
|  | if (!uint16_tmp) | 
|  | return SLURM_SUCCESS; | 
|  |  | 
|  | step_layout = xmalloc(sizeof(slurm_step_layout_t)); | 
|  | *layout = step_layout; | 
|  |  | 
|  | safe_skipstr(buffer); | 
|  | safe_unpackstr(&step_layout->node_list, buffer); | 
|  | safe_unpack32(&step_layout->node_cnt, buffer); | 
|  | safe_unpack16(&step_layout->start_protocol_ver, buffer); | 
|  | safe_unpack32(&step_layout->task_cnt, buffer); | 
|  | safe_unpack32(&step_layout->task_dist, buffer); | 
|  |  | 
|  | safe_xcalloc(step_layout->tasks, step_layout->node_cnt, | 
|  | sizeof(uint32_t)); | 
|  | safe_xcalloc(step_layout->tids, step_layout->node_cnt, | 
|  | sizeof(uint32_t *)); | 
|  | for (i = 0; i < step_layout->node_cnt; i++) { | 
|  | safe_unpack32_array(&(step_layout->tids[i]), | 
|  | &num_tids, | 
|  | buffer); | 
|  | step_layout->tasks[i] = num_tids; | 
|  | } | 
|  | safe_unpack16_array(&step_layout->cpt_compact_array, | 
|  | &step_layout->cpt_compact_cnt, buffer); | 
|  | safe_unpack32_array(&step_layout->cpt_compact_reps, | 
|  | &uint32_tmp, buffer); | 
|  | xassert(uint32_tmp == step_layout->cpt_compact_cnt); | 
|  |  | 
|  | safe_unpackstr(&tmp_str, buffer); | 
|  | if (tmp_str) { | 
|  | step_layout->alias_addrs = | 
|  | extract_net_cred(tmp_str, protocol_version); | 
|  | if (!step_layout->alias_addrs) { | 
|  | xfree(tmp_str); | 
|  | goto unpack_error; | 
|  | } | 
|  | step_layout->alias_addrs->net_cred = tmp_str; | 
|  | } | 
|  | } else { | 
|  | error("unpack_slurm_step_layout: protocol_version " | 
|  | "%hu not supported", protocol_version); | 
|  | goto unpack_error; | 
|  | } | 
|  | return SLURM_SUCCESS; | 
|  |  | 
|  | unpack_error: | 
|  | slurm_step_layout_destroy(step_layout); | 
|  | *layout = NULL; | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | /* destroys structure for step layout */ | 
|  | extern int slurm_step_layout_destroy(slurm_step_layout_t *step_layout) | 
|  | { | 
|  | int i=0; | 
|  | if (step_layout) { | 
|  | slurm_free_node_alias_addrs(step_layout->alias_addrs); | 
|  | xfree(step_layout->node_list); | 
|  | xfree(step_layout->tasks); | 
|  | xfree(step_layout->cpt_compact_array); | 
|  | xfree(step_layout->cpt_compact_reps); | 
|  | for (i = 0; i < step_layout->node_cnt; i++) { | 
|  | xfree(step_layout->tids[i]); | 
|  | } | 
|  | xfree(step_layout->tids); | 
|  |  | 
|  | xfree(step_layout); | 
|  | } | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | int slurm_step_layout_host_id (slurm_step_layout_t *s, int taskid) | 
|  | { | 
|  | int i, j; | 
|  | if (!s->tasks || !s->tids || (taskid > s->task_cnt - 1)) | 
|  | return SLURM_ERROR; | 
|  | for (i = 0; i < s->node_cnt; i++) | 
|  | for (j = 0; j < s->tasks[i]; j++) | 
|  | if (s->tids[i][j] == taskid) | 
|  | return i; | 
|  |  | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | char *slurm_step_layout_host_name (slurm_step_layout_t *s, int taskid) | 
|  | { | 
|  | int hostid = slurm_step_layout_host_id (s, taskid); | 
|  |  | 
|  | if (hostid < 0) | 
|  | return NULL; | 
|  |  | 
|  | return nodelist_nth_host(s->node_list, hostid); | 
|  | } | 
|  |  | 
|  | /* build maps for task layout on nodes */ | 
|  | static int _init_task_layout(slurm_step_layout_req_t *step_layout_req, | 
|  | slurm_step_layout_t *step_layout, | 
|  | const char *arbitrary_nodes) | 
|  | { | 
|  | int cpu_cnt = 0, cpu_inx = 0, cpu_task_cnt = 0, cpu_task_inx = 0, i; | 
|  | hostlist_t *hl; | 
|  |  | 
|  | uint16_t cpus[step_layout->node_cnt]; | 
|  | uint16_t cpus_per_task[1]; | 
|  | uint32_t cpus_task_reps[1]; | 
|  |  | 
|  | if (step_layout->node_cnt == 0) | 
|  | return SLURM_ERROR; | 
|  | if (step_layout->tasks)	/* layout already completed */ | 
|  | return SLURM_SUCCESS; | 
|  |  | 
|  | if (!step_layout_req->cpus_per_task) { | 
|  | cpus_per_task[0] = 1; | 
|  | cpus_task_reps[0] = step_layout_req->num_hosts; | 
|  | step_layout_req->cpus_per_task = cpus_per_task; | 
|  | step_layout_req->cpus_task_reps = cpus_task_reps; | 
|  | } | 
|  |  | 
|  | if (((int)step_layout_req->cpus_per_task[0] < 1) || | 
|  | (step_layout_req->cpus_per_task[0] == NO_VAL16)) { | 
|  | step_layout_req->cpus_per_task[0] = 1; | 
|  | step_layout_req->cpus_task_reps[0] = step_layout_req->num_hosts; | 
|  | } | 
|  |  | 
|  | step_layout->plane_size = step_layout_req->plane_size; | 
|  |  | 
|  | step_layout->tasks = xcalloc(step_layout->node_cnt, sizeof(uint16_t)); | 
|  | step_layout->tids = xcalloc(step_layout->node_cnt, sizeof(uint32_t *)); | 
|  | hl = hostlist_create(step_layout->node_list); | 
|  | /* make sure the number of nodes we think we have | 
|  | * is the correct number */ | 
|  | i = hostlist_count(hl); | 
|  | if (step_layout->node_cnt > i) | 
|  | step_layout->node_cnt = i; | 
|  | hostlist_destroy(hl); | 
|  |  | 
|  | debug("laying out the %u tasks on %u hosts %s dist %u", | 
|  | step_layout->task_cnt, step_layout->node_cnt, | 
|  | step_layout->node_list, step_layout->task_dist); | 
|  | if (step_layout->node_cnt < 1) { | 
|  | error("no hostlist given can't layout tasks"); | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | /* hostlist_t *hl = hostlist_create(step_layout->node_list); */ | 
|  | for (i=0; i<step_layout->node_cnt; i++) { | 
|  | /* char *name = hostlist_shift(hl); */ | 
|  | /* if (!name) { */ | 
|  | /* 	error("hostlist incomplete for this job request"); */ | 
|  | /* 	hostlist_destroy(hl); */ | 
|  | /* 	return SLURM_ERROR; */ | 
|  | /* } */ | 
|  | /* debug2("host %d = %s", i, name); */ | 
|  | /* free(name); */ | 
|  | cpus[i] = (step_layout_req->cpus_per_node[cpu_inx] / | 
|  | step_layout_req->cpus_per_task[cpu_task_inx]); | 
|  | if (cpus[i] == 0) { | 
|  | /* this can be a result of a heterogeneous allocation | 
|  | * (e.g. 4 cpus on one node and 2 on the second with | 
|  | *  step_layout_req->cpus_per_task=3)  */ | 
|  | cpus[i] = 1; | 
|  | } | 
|  |  | 
|  | if (step_layout->plane_size && | 
|  | (step_layout->plane_size != NO_VAL16) && | 
|  | ((step_layout->task_dist & SLURM_DIST_STATE_BASE) | 
|  | != SLURM_DIST_PLANE)) { | 
|  | /* plane_size when dist != plane is used to | 
|  | convey ntasks_per_node. Adjust the number | 
|  | of cpus to reflect that. | 
|  | */ | 
|  | uint16_t cpus_per_node = | 
|  | step_layout->plane_size * | 
|  | step_layout_req->cpus_per_task[cpu_task_inx]; | 
|  | if (cpus[i] > cpus_per_node) | 
|  | cpus[i] = cpus_per_node; | 
|  | } | 
|  |  | 
|  | /* info("got %d cpus", cpus[i]); */ | 
|  | if ((++cpu_cnt) >= | 
|  | step_layout_req->cpu_count_reps[cpu_inx]) { | 
|  | /* move to next record */ | 
|  | cpu_inx++; | 
|  | cpu_cnt = 0; | 
|  | } | 
|  |  | 
|  | if ((++cpu_task_cnt) >= | 
|  | step_layout_req->cpus_task_reps[cpu_task_inx]) { | 
|  | /* move to next record */ | 
|  | cpu_task_inx++; | 
|  | cpu_task_cnt = 0; | 
|  | } | 
|  | } | 
|  |  | 
|  | if ((step_layout->task_dist & SLURM_DIST_NODEMASK) | 
|  | == SLURM_DIST_NODECYCLIC) | 
|  | return _task_layout_cyclic(step_layout, cpus); | 
|  | else if ((step_layout->task_dist & SLURM_DIST_STATE_BASE) | 
|  | == SLURM_DIST_ARBITRARY) | 
|  | return _task_layout_hostfile(step_layout, arbitrary_nodes); | 
|  | else if ((step_layout->task_dist & SLURM_DIST_STATE_BASE) | 
|  | == SLURM_DIST_PLANE) | 
|  | return _task_layout_plane(step_layout, cpus); | 
|  | else | 
|  | return _task_layout_block(step_layout, cpus); | 
|  | } | 
|  |  | 
|  | /* use specific set run tasks on each host listed in hostfile | 
|  | * XXX: Need to handle over-subscribe. | 
|  | */ | 
|  | static int _task_layout_hostfile(slurm_step_layout_t *step_layout, | 
|  | const char *arbitrary_nodes) | 
|  | { | 
|  | int i=0, j, taskid = 0, task_cnt=0; | 
|  | hostlist_iterator_t *itr = NULL, *itr_task = NULL; | 
|  | char *host = NULL; | 
|  |  | 
|  | hostlist_t *job_alloc_hosts = NULL; | 
|  | hostlist_t *step_alloc_hosts = NULL; | 
|  |  | 
|  | int step_inx = 0, step_hosts_cnt = 0; | 
|  | node_record_t **step_hosts_ptrs = NULL; | 
|  | node_record_t *host_ptr = NULL; | 
|  |  | 
|  | debug2("job list is %s", step_layout->node_list); | 
|  | if (!arbitrary_nodes) { | 
|  | error("no hostlist given for arbitrary dist"); | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | debug2("list is %s", arbitrary_nodes); | 
|  | step_alloc_hosts = hostlist_create(arbitrary_nodes); | 
|  | if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) { | 
|  | error("Asked for %u tasks have %d in the nodelist.  " | 
|  | "Check your nodelist, or set the -n option to be %d", | 
|  | step_layout->task_cnt, | 
|  | hostlist_count(step_alloc_hosts), | 
|  | hostlist_count(step_alloc_hosts)); | 
|  | hostlist_destroy(step_alloc_hosts); | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | job_alloc_hosts = hostlist_create(step_layout->node_list); | 
|  | itr             = hostlist_iterator_create(job_alloc_hosts); | 
|  | itr_task        = hostlist_iterator_create(step_alloc_hosts); | 
|  |  | 
|  | /* | 
|  | * Build array of pointers so that we can do pointer comparisons rather | 
|  | * than strcmp's on nodes. | 
|  | */ | 
|  | step_hosts_cnt  = hostlist_count(step_alloc_hosts); | 
|  | step_hosts_ptrs = xcalloc(step_hosts_cnt, | 
|  | sizeof(node_record_t *)); | 
|  |  | 
|  | if (!running_in_daemon()) { | 
|  | /* running in salloc - init node records */ | 
|  | init_node_conf(); | 
|  | build_all_nodeline_info(false, 0); | 
|  | rehash_node(); | 
|  | } | 
|  |  | 
|  | step_inx = 0; | 
|  | while((host = hostlist_next(itr_task))) { | 
|  | step_hosts_ptrs[step_inx++] = find_node_record_no_alias(host); | 
|  | free(host); | 
|  | } | 
|  |  | 
|  | while((host = hostlist_next(itr))) { | 
|  | host_ptr = find_node_record(host); | 
|  | step_layout->tasks[i] = 0; | 
|  |  | 
|  | for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) { | 
|  | if (host_ptr == step_hosts_ptrs[step_inx]) { | 
|  | step_layout->tasks[i]++; | 
|  | task_cnt++; | 
|  | } | 
|  | if (task_cnt >= step_layout->task_cnt) | 
|  | break; | 
|  | } | 
|  | debug3("%s got %u tasks", host, step_layout->tasks[i]); | 
|  | if (step_layout->tasks[i] == 0) | 
|  | goto reset_hosts; | 
|  | step_layout->tids[i] = xcalloc(step_layout->tasks[i], | 
|  | sizeof(uint32_t)); | 
|  | taskid = 0; | 
|  | j = 0; | 
|  |  | 
|  | for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) { | 
|  | if (host_ptr == step_hosts_ptrs[step_inx]) { | 
|  | step_layout->tids[i][j] = taskid; | 
|  | j++; | 
|  | } | 
|  | taskid++; | 
|  | if (j >= step_layout->tasks[i]) | 
|  | break; | 
|  | } | 
|  | i++; | 
|  | reset_hosts: | 
|  | free(host); | 
|  | if (i > step_layout->task_cnt) | 
|  | break; | 
|  | } | 
|  | hostlist_iterator_destroy(itr); | 
|  | hostlist_iterator_destroy(itr_task); | 
|  | hostlist_destroy(job_alloc_hosts); | 
|  | hostlist_destroy(step_alloc_hosts); | 
|  | xfree(step_hosts_ptrs); | 
|  |  | 
|  | if (task_cnt != step_layout->task_cnt) { | 
|  | error("Asked for %u tasks but placed %d. Check your nodelist", | 
|  | step_layout->task_cnt, task_cnt); | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | static int _task_layout_block(slurm_step_layout_t *step_layout, uint16_t *cpus) | 
|  | { | 
|  | static uint16_t select_params = NO_VAL16; | 
|  | int i, j, task_id = 0; | 
|  | bool pack_nodes; | 
|  |  | 
|  | if (select_params == NO_VAL16) | 
|  | select_params = slurm_conf.select_type_param; | 
|  | if (step_layout->task_dist & SLURM_DIST_PACK_NODES) | 
|  | pack_nodes = true; | 
|  | else if (step_layout->task_dist & SLURM_DIST_NO_PACK_NODES) | 
|  | pack_nodes = false; | 
|  | else if (select_params & SELECT_PACK_NODES) | 
|  | pack_nodes = true; | 
|  | else | 
|  | pack_nodes = false; | 
|  |  | 
|  | if (pack_nodes) { | 
|  | /* Pass 1: Put one task on each node */ | 
|  | for (i = 0; ((i < step_layout->node_cnt) && | 
|  | (task_id < step_layout->task_cnt)); i++) { | 
|  | /* cpus has already been altered for cpus_per_task */ | 
|  | if (step_layout->tasks[i] < cpus[i]) { | 
|  | step_layout->tasks[i]++; | 
|  | task_id++; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Pass 2: Fill remaining CPUs on a node-by-node basis */ | 
|  | for (i = 0; ((i < step_layout->node_cnt) && | 
|  | (task_id < step_layout->task_cnt)); i++) { | 
|  | /* cpus has already been altered for cpus_per_task */ | 
|  | while ((step_layout->tasks[i] < cpus[i]) && | 
|  | (task_id < step_layout->task_cnt)) { | 
|  | step_layout->tasks[i]++; | 
|  | task_id++; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Pass 3: Spread remaining tasks across all nodes */ | 
|  | while (task_id < step_layout->task_cnt) { | 
|  | for (i = 0; ((i < step_layout->node_cnt) && | 
|  | (task_id < step_layout->task_cnt)); i++) { | 
|  | step_layout->tasks[i]++; | 
|  | task_id++; | 
|  | } | 
|  | } | 
|  | } else { | 
|  | /* To effectively deal with heterogeneous nodes, we fake a | 
|  | * cyclic distribution to determine how many tasks go on each | 
|  | * node and then make those assignments in a block fashion. */ | 
|  | bool over_subscribe = false; | 
|  | for (j = 0; task_id < step_layout->task_cnt; j++) { | 
|  | bool space_remaining = false; | 
|  | for (i = 0; ((i < step_layout->node_cnt) && | 
|  | (task_id < step_layout->task_cnt)); i++) { | 
|  | if ((j < cpus[i]) || over_subscribe) { | 
|  | step_layout->tasks[i]++; | 
|  | task_id++; | 
|  | if ((j + 1) < cpus[i]) | 
|  | space_remaining = true; | 
|  | } | 
|  | } | 
|  | if (!space_remaining) | 
|  | over_subscribe = true; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Now distribute the tasks */ | 
|  | task_id = 0; | 
|  | for (i = 0; i < step_layout->node_cnt; i++) { | 
|  | step_layout->tids[i] = xcalloc(step_layout->tasks[i], | 
|  | sizeof(uint32_t)); | 
|  | for (j = 0; j < step_layout->tasks[i]; j++) { | 
|  | step_layout->tids[i][j] = task_id; | 
|  | task_id++; | 
|  | } | 
|  | } | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  |  | 
|  | /* distribute tasks across available nodes: allocate tasks to nodes | 
|  | * in a cyclic fashion using available processors. once all available | 
|  | * processors are allocated, continue to allocate task over-subscribing | 
|  | * nodes as needed. for example | 
|  | * cpus per node        4  2  4  2 | 
|  | *                     -- -- -- -- | 
|  | * task distribution:   0  1  2  3 | 
|  | *                      4  5  6  7 | 
|  | *                      8     9 | 
|  | *                     10    11     all processors allocated now | 
|  | *                     12 13 14 15  etc. | 
|  | */ | 
|  | static int _task_layout_cyclic(slurm_step_layout_t *step_layout, | 
|  | uint16_t *cpus) | 
|  | { | 
|  | int i, j, max_over_subscribe = 0, taskid = 0, total_cpus = 0; | 
|  | bool over_subscribe = false; | 
|  |  | 
|  | for (i = 0; i < step_layout->node_cnt; i++) | 
|  | total_cpus += cpus[i]; | 
|  | if (total_cpus < step_layout->task_cnt) { | 
|  | over_subscribe = true; | 
|  | i = step_layout->task_cnt - total_cpus; | 
|  | max_over_subscribe = ROUNDUP(i, step_layout->node_cnt); | 
|  | } | 
|  |  | 
|  | for (j=0; taskid<step_layout->task_cnt; j++) {   /* cycle counter */ | 
|  | bool space_remaining = false; | 
|  | for (i=0; ((i<step_layout->node_cnt) | 
|  | && (taskid<step_layout->task_cnt)); i++) { | 
|  | if ((j < cpus[i]) || | 
|  | (over_subscribe && | 
|  | (j < (cpus[i] + max_over_subscribe)))) { | 
|  | xrealloc(step_layout->tids[i], sizeof(uint32_t) | 
|  | * (step_layout->tasks[i] + 1)); | 
|  |  | 
|  | step_layout->tids[i][step_layout->tasks[i]] = | 
|  | taskid; | 
|  | taskid++; | 
|  | step_layout->tasks[i]++; | 
|  | if ((j+1) < cpus[i]) | 
|  | space_remaining = true; | 
|  | } | 
|  | } | 
|  | if (!space_remaining) | 
|  | over_subscribe = true; | 
|  | } | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  |  | 
|  | /* | 
|  | * The plane distribution results in a block cyclic of block size | 
|  | * "plane_size". | 
|  | * To effectively deal with heterogeneous nodes, we fake a cyclic | 
|  | * distribution to figure out how many tasks go on each node and | 
|  | * then make the assignments of task numbers to nodes using the | 
|  | * user-specified plane size. | 
|  | * For example: | 
|  | *	plane_size = 2, #tasks = 6, #nodes = 3 | 
|  | * | 
|  | * Node#:              Node0 Node1 Node2 | 
|  | *                     ----- ----- ----- | 
|  | * #of allocated CPUs:   4     1     1 | 
|  | * | 
|  | * task distribution:   0  1   2     3 | 
|  | *                      4  5 | 
|  | */ | 
|  | static int _task_layout_plane(slurm_step_layout_t *step_layout, | 
|  | uint16_t *cpus) | 
|  | { | 
|  | int i, j, k, taskid = 0; | 
|  | bool over_subscribe = false; | 
|  | uint32_t cur_task[step_layout->node_cnt]; | 
|  | int plane_start = 0; | 
|  |  | 
|  | debug3("_task_layout_plane plane_size %u node_cnt %u task_cnt %u", | 
|  | step_layout->plane_size, | 
|  | step_layout->node_cnt, step_layout->task_cnt); | 
|  |  | 
|  | if (step_layout->plane_size <= 0) | 
|  | return SLURM_ERROR; | 
|  |  | 
|  | if (step_layout->tasks == NULL) | 
|  | return SLURM_ERROR; | 
|  |  | 
|  | /* figure out how many tasks go to each node */ | 
|  | for (j=0; taskid<step_layout->task_cnt; j++) {   /* cycle counter */ | 
|  | bool space_remaining = false; | 
|  | /* place one task on each node first */ | 
|  | if (j == 0) { | 
|  | for (i = 0; ((i < step_layout->node_cnt) && | 
|  | (taskid < step_layout->task_cnt)); i++) { | 
|  | taskid++; | 
|  | step_layout->tasks[i]++; | 
|  | } | 
|  | } | 
|  | for (i = 0; ((i < step_layout->node_cnt) && | 
|  | (taskid < step_layout->task_cnt)); i++) { | 
|  | /* handle placing first task on each node */ | 
|  | if (j == 0) | 
|  | plane_start = 1; | 
|  | else | 
|  | plane_start = 0; | 
|  | for (k = plane_start; (k < step_layout->plane_size) && | 
|  | (taskid < step_layout->task_cnt); k++) { | 
|  | if ((cpus[i] - step_layout->tasks[i]) || | 
|  | over_subscribe) { | 
|  | taskid++; | 
|  | step_layout->tasks[i]++; | 
|  | if (cpus[i] - (step_layout->tasks[i] | 
|  | + 1) >= 0) | 
|  | space_remaining = true; | 
|  | } | 
|  | } | 
|  | } | 
|  | if (!space_remaining) | 
|  | over_subscribe = true; | 
|  | } | 
|  |  | 
|  | /* now distribute the tasks */ | 
|  | taskid = 0; | 
|  | for (i=0; i < step_layout->node_cnt; i++) { | 
|  | step_layout->tids[i] = xcalloc(step_layout->tasks[i], | 
|  | sizeof(uint32_t)); | 
|  | cur_task[i] = 0; | 
|  | } | 
|  | for (j=0; taskid<step_layout->task_cnt; j++) {   /* cycle counter */ | 
|  | for (i=0; ((i<step_layout->node_cnt) | 
|  | && (taskid<step_layout->task_cnt)); i++) { | 
|  | /* assign a block of 'plane_size' tasks to this node */ | 
|  | for (k=0; ((k<step_layout->plane_size) | 
|  | && (cur_task[i] < step_layout->tasks[i]) | 
|  | && (taskid < step_layout->task_cnt)); k++) { | 
|  | step_layout->tids[i][cur_task[i]] = taskid; | 
|  | taskid++; | 
|  | cur_task[i]++; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if (taskid != step_layout->task_cnt) { | 
|  | error("_task_layout_plane: Mismatch in task count (%d != %d) ", | 
|  | taskid, step_layout->task_cnt); | 
|  | return SLURM_ERROR; | 
|  | } | 
|  |  | 
|  | #if (0) | 
|  | /* debugging only */ | 
|  | for (i=0; i < step_layout->node_cnt; i++) { | 
|  | info("tasks[%d]: %u", i, step_layout->tasks[i]); | 
|  | } | 
|  |  | 
|  | for (i=0; i < step_layout->node_cnt; i++) { | 
|  | info ("Host %d _plane_ # of tasks %u", i, step_layout->tasks[i]); | 
|  | for (j=0; j<step_layout->tasks[i]; j++) { | 
|  | info ("Host %d _plane_ localid %d taskid %u", | 
|  | i, j, step_layout->tids[i][j]); | 
|  | } | 
|  | } | 
|  | #endif | 
|  |  | 
|  | return SLURM_SUCCESS; | 
|  | } | 
|  |  | 
|  | typedef struct { | 
|  | task_dist_states_t task_dist; | 
|  | const char *string; | 
|  | } layout_type_name_t; | 
|  |  | 
|  | static const layout_type_name_t layout_type_names[] = { | 
|  | { SLURM_DIST_CYCLIC, "Cyclic" }, | 
|  | /* distribute tasks filling node by node */ | 
|  | { SLURM_DIST_BLOCK, "Block" }, | 
|  | /* arbitrary task distribution  */ | 
|  | { SLURM_DIST_ARBITRARY, "arbitrary task distribution" }, | 
|  | /* | 
|  | * distribute tasks by filling up planes of lllp first and then by | 
|  | * going across the nodes See documentation for more information | 
|  | */ | 
|  | { SLURM_DIST_PLANE, "Plane" }, | 
|  | /* | 
|  | * distribute tasks 1 per node: round robin: same for lowest | 
|  | * level of logical processor (lllp) | 
|  | */ | 
|  | { SLURM_DIST_CYCLIC_CYCLIC, "CCyclic" }, | 
|  | /* cyclic for node and block for lllp  */ | 
|  | { SLURM_DIST_CYCLIC_BLOCK, "CBlock" }, | 
|  | /* block for node and cyclic for lllp  */ | 
|  | { SLURM_DIST_BLOCK_CYCLIC, "BCyclic" }, | 
|  | /* block for node and block for lllp  */ | 
|  | { SLURM_DIST_BLOCK_BLOCK, "BBlock" }, | 
|  | /* cyclic for node and full cyclic for lllp  */ | 
|  | { SLURM_DIST_CYCLIC_CFULL, "CFCyclic" }, | 
|  | /* block for node and full cyclic for lllp  */ | 
|  | { SLURM_DIST_BLOCK_CFULL, "BFCyclic" }, | 
|  | { SLURM_DIST_CYCLIC_CYCLIC_CYCLIC, "CCyclicCyclic" }, | 
|  | { SLURM_DIST_CYCLIC_CYCLIC_BLOCK, "CCyclicBlock" }, | 
|  | { SLURM_DIST_CYCLIC_CYCLIC_CFULL, "CCyclicFCyclic" }, | 
|  | { SLURM_DIST_CYCLIC_BLOCK_CYCLIC, "CBlockCyclic" }, | 
|  | { SLURM_DIST_CYCLIC_BLOCK_BLOCK, "CBlockBlock" }, | 
|  | { SLURM_DIST_CYCLIC_BLOCK_CFULL, "CCyclicFCyclic" }, | 
|  | { SLURM_DIST_CYCLIC_CFULL_CYCLIC, "CFCyclicCyclic" }, | 
|  | { SLURM_DIST_CYCLIC_CFULL_BLOCK, "CFCyclicBlock" }, | 
|  | { SLURM_DIST_CYCLIC_CFULL_CFULL, "CFCyclicFCyclic" }, | 
|  | { SLURM_DIST_BLOCK_CYCLIC_CYCLIC, "BCyclicCyclic" }, | 
|  | { SLURM_DIST_BLOCK_CYCLIC_BLOCK, "BCyclicBlock" }, | 
|  | { SLURM_DIST_BLOCK_CYCLIC_CFULL, "BCyclicFCyclic" }, | 
|  | { SLURM_DIST_BLOCK_BLOCK_CYCLIC, "BBlockCyclic" }, | 
|  | { SLURM_DIST_BLOCK_BLOCK_BLOCK, "BBlockBlock" }, | 
|  | { SLURM_DIST_BLOCK_BLOCK_CFULL, "BBlockFCyclic" }, | 
|  | { SLURM_DIST_BLOCK_CFULL_CYCLIC, "BFCyclicCyclic" }, | 
|  | { SLURM_DIST_BLOCK_CFULL_BLOCK, "BFCyclicBlock" }, | 
|  | { SLURM_DIST_BLOCK_CFULL_CFULL, "BFCyclicFCyclic" }, | 
|  | { 0 } | 
|  | }; | 
|  |  | 
|  | extern char *slurm_step_layout_type_name(task_dist_states_t task_dist) | 
|  | { | 
|  | char *name = NULL, *pos = NULL; | 
|  |  | 
|  | for (int i = 0; layout_type_names[i].task_dist; i++) { | 
|  | if (layout_type_names[i].task_dist == | 
|  | (task_dist & SLURM_DIST_STATE_BASE)) { | 
|  | xstrfmtcatat(name, &pos, "%s", | 
|  | layout_type_names[i].string); | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (!name) { | 
|  | /* SLURM_DIST_UNKNOWN - No distribution specified */ | 
|  | xstrfmtcatat(name, &pos, "%s", "Unknown"); | 
|  | } | 
|  |  | 
|  | if (task_dist & SLURM_DIST_PACK_NODES) | 
|  | xstrfmtcatat(name, &pos, ",%s", "Pack"); | 
|  |  | 
|  | if (task_dist & SLURM_DIST_NO_PACK_NODES) | 
|  | xstrfmtcatat(name, &pos, ",%s", "NoPack"); | 
|  |  | 
|  | xassert(pos); | 
|  | return name; | 
|  | } |