/*****************************************************************************\
 * slurm_step_layout.c - functions to distribute tasks over nodes.
 *****************************************************************************
 * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
 * Written by Chris Holmes, <cholmes@hp.com>, who borrowed heavily
 * from other parts of SLURM.
 * CODE-OCEC-09-009. All rights reserved.
 *
 * This file is part of Slurm, a resource management program.
 * For details, see <https://slurm.schedmd.com/>.
 * Please also read the included file: DISCLAIMER.
 *
 * Slurm is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Slurm; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * This file is patterned after hostlist.c, written by Mark Grondona and
 * Copyright (C) 2002 The Regents of the University of California.
\*****************************************************************************/

#include <stdlib.h>
#include <string.h>

#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"

#include "src/common/log.h"
#include "src/common/read_config.h"
#include "src/interfaces/select.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_step_layout.h"
#include "src/common/slurmdb_defs.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"

/*
** Define slurm-specific aliases for use by plugins, see slurm_xlator.h
** for details.
*/
strong_alias(pack_slurm_step_layout, slurm_pack_slurm_step_layout);
strong_alias(unpack_slurm_step_layout, slurm_unpack_slurm_step_layout);

/* build maps for task layout on nodes */
static int _init_task_layout(slurm_step_layout_req_t *step_layout_req,
			     slurm_step_layout_t *step_layout,
			     const char *arbitrary_nodes);

static int _task_layout_block(slurm_step_layout_t *step_layout,
			      uint16_t *cpus);
static int _task_layout_cyclic(slurm_step_layout_t *step_layout,
			       uint16_t *cpus);
static int _task_layout_plane(slurm_step_layout_t *step_layout,
			      uint16_t *cpus);
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes);

/*
 * slurm_step_layout_create - determine how many tasks of a job will be
 *                            run on each node. Distribution is influenced
 *                            by the number of cpus on each host.
 * IN step_layout_req - information needed for task distribution
 * RET a pointer to a slurm_step_layout_t structure
 * NOTE: allocates memory that should be xfreed by caller
 */
slurm_step_layout_t *slurm_step_layout_create(
	slurm_step_layout_req_t *step_layout_req)
{
	char *arbitrary_nodes = NULL;
	slurm_step_layout_t *step_layout =
		xmalloc(sizeof(slurm_step_layout_t));

	step_layout->task_dist = step_layout_req->task_dist;
	if ((step_layout->task_dist & SLURM_DIST_STATE_BASE)
	    == SLURM_DIST_ARBITRARY) {
		hostlist_t *hl = NULL;
		char *buf = NULL;
		/* Save the user-supplied node list to lay out tasks with
		 * later; it may differ from the job allocation. */
		arbitrary_nodes = xstrdup(step_layout_req->node_list);
		hl = hostlist_create(step_layout_req->node_list);
		hostlist_uniq(hl);
		buf = hostlist_ranged_string_xmalloc(hl);
		step_layout_req->num_hosts = hostlist_count(hl);
		hostlist_destroy(hl);
		step_layout->node_list = buf;
	} else {
		step_layout->node_list = xstrdup(step_layout_req->node_list);
	}

	step_layout->task_cnt = step_layout_req->num_tasks;
	step_layout->node_cnt = step_layout_req->num_hosts;

	if (_init_task_layout(step_layout_req, step_layout, arbitrary_nodes)
	    != SLURM_SUCCESS) {
		slurm_step_layout_destroy(step_layout);
		step_layout = NULL;
	}
	xfree(arbitrary_nodes);
	return step_layout;
}
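
/*
 * Usage sketch, kept out of the build like the debugging block later in
 * this file: lay 8 tasks across a hypothetical two-node allocation with
 * 4 CPUs per node using a block distribution. The node names and counts
 * here are illustrative only.
 */
#if (0)
static void _example_step_layout_create(void)
{
	uint16_t cpus_per_node[] = { 4 };
	uint32_t cpu_count_reps[] = { 2 };	/* both nodes have 4 CPUs */
	slurm_step_layout_req_t step_layout_req = {
		.node_list = "node[1-2]",
		.cpus_per_node = cpus_per_node,
		.cpu_count_reps = cpu_count_reps,
		.num_hosts = 2,
		.num_tasks = 8,
		.task_dist = SLURM_DIST_BLOCK,
		.plane_size = NO_VAL16,
	};
	slurm_step_layout_t *step_layout =
		slurm_step_layout_create(&step_layout_req);

	if (step_layout) {
		/* step_layout->tasks[i] now holds the task count per node */
		slurm_step_layout_destroy(step_layout);
	}
}
#endif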

/*
 * fake_slurm_step_layout_create - create a layout when a job was not
 *	allocated by the controller. It does not set up anything that
 *	should be used with a switch plugin, nor does it lay out tasks
 *	in any particular fashion.
 * IN tlist - hostlist corresponding to task layout
 * IN cpus_per_node - cpus per node, NULL if no allocation
 * IN cpu_count_reps - how many nodes have the same cpu count, NULL if no
 *	allocation
 * IN node_cnt - number of nodes we have
 * IN task_cnt - number of tasks to distribute across these cpus, 0 if
 *	using cpus_per_node
 * RET a pointer to a slurm_step_layout_t structure
 * NOTE: allocates memory that should be xfreed by caller
 */
extern slurm_step_layout_t *fake_slurm_step_layout_create(
	const char *tlist,
	uint16_t *cpus_per_node,
	uint32_t *cpu_count_reps,
	uint32_t node_cnt,
	uint32_t task_cnt,
	uint16_t protocol_version)
{
	uint32_t cpn = 1;
	int cpu_cnt = 0, cpu_inx = 0, i, j;
	slurm_step_layout_t *step_layout = NULL;

	if (!node_cnt || !tlist ||
	    (!cpus_per_node && (!task_cnt || (task_cnt == NO_VAL)))) {
		error("there is a problem with your fake_step_layout request\n"
		      "node_cnt = %u, task_cnt = %u, tlist = %s",
		      node_cnt, task_cnt, tlist);
		return NULL;
	}

	step_layout = xmalloc(sizeof(slurm_step_layout_t));
	step_layout->node_list = xstrdup(tlist);
	step_layout->node_cnt = node_cnt;
	step_layout->start_protocol_ver = protocol_version;
	step_layout->tasks = xcalloc(node_cnt, sizeof(uint16_t));
	step_layout->tids = xcalloc(node_cnt, sizeof(uint32_t *));

	step_layout->task_cnt = 0;
	for (i = 0; i < step_layout->node_cnt; i++) {
		if (cpus_per_node && cpu_count_reps) {
			step_layout->tasks[i] = cpus_per_node[cpu_inx];
			step_layout->tids[i] = xcalloc(step_layout->tasks[i],
						       sizeof(uint32_t));

			for (j = 0; j < step_layout->tasks[i]; j++)
				step_layout->tids[i][j] =
					step_layout->task_cnt++;

			if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) {
				/* move to next record */
				cpu_inx++;
				cpu_cnt = 0;
			}
		} else {
			cpn = ROUNDUP((task_cnt - step_layout->task_cnt),
				      (node_cnt - i));
			if (step_layout->task_cnt >= task_cnt) {
				step_layout->tasks[i] = 0;
				step_layout->tids[i] = NULL;
			} else {
				step_layout->tasks[i] = cpn;
				step_layout->tids[i] =
					xcalloc(cpn, sizeof(uint32_t));

				for (j = 0; j < cpn; j++) {
					step_layout->tids[i][j] =
						step_layout->task_cnt++;
					if (step_layout->task_cnt >=
					    task_cnt) {
						step_layout->tasks[i] = j + 1;
						break;
					}
				}
			}
		}
	}

	return step_layout;
}
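
/*
 * Arithmetic sketch, kept out of the build: when no CPU counts are
 * supplied above, ROUNDUP() of the remaining tasks over the remaining
 * nodes spreads tasks as evenly as possible, e.g. 3, 3, 2, 2 for 10
 * tasks on 4 hypothetical nodes.
 */
#if (0)
static void _example_even_spread(void)
{
	uint32_t task_cnt = 10, node_cnt = 4, placed = 0;

	for (uint32_t i = 0; i < node_cnt; i++) {
		uint32_t cpn = ROUNDUP((task_cnt - placed), (node_cnt - i));
		info("node %u gets %u tasks", i, cpn);
		placed += cpn;
	}
}
#endif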

/* copies structure for step layout */
extern slurm_step_layout_t *slurm_step_layout_copy(
	slurm_step_layout_t *step_layout)
{
	slurm_step_layout_t *layout;
	int i = 0;

	if (!step_layout)
		return NULL;

	layout = xmalloc(sizeof(slurm_step_layout_t));
	if (step_layout->alias_addrs) {
		layout->alias_addrs = xmalloc(sizeof(slurm_node_alias_addrs_t));
		slurm_copy_node_alias_addrs_members(layout->alias_addrs,
						    step_layout->alias_addrs);
	}
	layout->node_list = xstrdup(step_layout->node_list);
	layout->node_cnt = step_layout->node_cnt;
	layout->start_protocol_ver = step_layout->start_protocol_ver;
	layout->task_cnt = step_layout->task_cnt;
	layout->task_dist = step_layout->task_dist;

	layout->tasks = xcalloc(layout->node_cnt, sizeof(uint16_t));
	memcpy(layout->tasks, step_layout->tasks,
	       (sizeof(uint16_t) * layout->node_cnt));
	if (step_layout->cpt_compact_cnt) {
		uint32_t cnt = step_layout->cpt_compact_cnt;

		layout->cpt_compact_cnt = cnt;
		layout->cpt_compact_array =
			xcalloc(cnt, sizeof(*layout->cpt_compact_array));
		memcpy(layout->cpt_compact_array,
		       step_layout->cpt_compact_array,
		       (sizeof(*layout->cpt_compact_array) * cnt));

		layout->cpt_compact_reps =
			xcalloc(cnt, sizeof(*layout->cpt_compact_reps));
		memcpy(layout->cpt_compact_reps,
		       step_layout->cpt_compact_reps,
		       (sizeof(*layout->cpt_compact_reps) * cnt));
	}

	layout->tids = xcalloc(layout->node_cnt, sizeof(uint32_t *));
	for (i = 0; i < layout->node_cnt; i++) {
		layout->tids[i] = xcalloc(layout->tasks[i], sizeof(uint32_t));
		memcpy(layout->tids[i], step_layout->tids[i],
		       (sizeof(uint32_t) * layout->tasks[i]));
	}

	return layout;
}

extern void slurm_step_layout_merge(slurm_step_layout_t *step_layout1,
				    slurm_step_layout_t *step_layout2)
{
	hostlist_t *hl, *hl2;
	hostlist_iterator_t *host_itr;
	int new_pos = 0, node_task_cnt;
	char *host;

	xassert(step_layout1);
	xassert(step_layout2);

	/*
	 * The cpt_compact* fields are currently not used by the clients that
	 * issue the RPC which calls this function, so we do not merge them.
	 */

	hl = hostlist_create(step_layout1->node_list);
	hl2 = hostlist_create(step_layout2->node_list);

	host_itr = hostlist_iterator_create(hl2);
	while ((host = hostlist_next(host_itr))) {
		int pos = hostlist_find(hl, host);

		if (pos == -1) {
			/* If the host doesn't exist, push it onto the end */
			hostlist_push_host(hl, host);
			pos = step_layout1->node_cnt++;
			xrecalloc(step_layout1->tasks,
				  step_layout1->node_cnt,
				  sizeof(uint16_t));
			xrecalloc(step_layout1->tids,
				  step_layout1->node_cnt,
				  sizeof(uint32_t *));
		}
		free(host);

		/* remember where this node's task array currently ends */
		node_task_cnt = step_layout1->tasks[pos];
		step_layout1->tasks[pos] +=
			step_layout2->tasks[new_pos];
		xrecalloc(step_layout1->tids[pos],
			  step_layout1->tasks[pos],
			  sizeof(uint32_t));
		for (int i = 0; i < step_layout2->tasks[new_pos]; i++) {
			step_layout1->tids[pos][node_task_cnt++] =
				step_layout2->tids[new_pos][i];
		}
		new_pos++;
	}
	hostlist_iterator_destroy(host_itr);

	/* No need to merge alias_addrs; it is per-job */
	step_layout1->task_cnt += step_layout2->task_cnt;
	xfree(step_layout1->node_list);
	step_layout1->node_list = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
	hostlist_destroy(hl2);
}
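
/*
 * Usage sketch, kept out of the build: merge a second component's layout
 * into the first. The node names are hypothetical; a host appearing in
 * both layouts has the second layout's task ids appended to its own.
 */
#if (0)
static void _example_merge(void)
{
	slurm_step_layout_t *layout1 = fake_slurm_step_layout_create(
		"node[1-2]", NULL, NULL, 2, 4, SLURM_PROTOCOL_VERSION);
	slurm_step_layout_t *layout2 = fake_slurm_step_layout_create(
		"node[2-3]", NULL, NULL, 2, 4, SLURM_PROTOCOL_VERSION);

	if (layout1 && layout2) {
		slurm_step_layout_merge(layout1, layout2);
		/* layout1 now covers node[1-3] with 8 tasks total */
	}
	slurm_step_layout_destroy(layout1);
	slurm_step_layout_destroy(layout2);
}
#endif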

extern void pack_slurm_step_layout(slurm_step_layout_t *step_layout,
				   buf_t *buffer, uint16_t protocol_version)
{
	uint32_t i = 0;

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		if (step_layout)
			i = 1;

		pack16(i, buffer);
		if (!i)
			return;
		/* NULL placeholder where a since-removed field was packed */
		packnull(buffer);
		packstr(step_layout->node_list, buffer);
		pack32(step_layout->node_cnt, buffer);
		pack16(step_layout->start_protocol_ver, buffer);
		pack32(step_layout->task_cnt, buffer);
		pack32(step_layout->task_dist, buffer);

		for (i = 0; i < step_layout->node_cnt; i++) {
			pack32_array(step_layout->tids[i],
				     step_layout->tasks[i],
				     buffer);
		}

		pack16_array(step_layout->cpt_compact_array,
			     step_layout->cpt_compact_cnt, buffer);
		pack32_array(step_layout->cpt_compact_reps,
			     step_layout->cpt_compact_cnt, buffer);

		if (step_layout->alias_addrs) {
			char *tmp_str =
				create_net_cred(step_layout->alias_addrs,
						protocol_version);
			packstr(tmp_str, buffer);
			xfree(tmp_str);
		} else {
			packnull(buffer);
		}
	} else {
		error("%s: protocol_version %hu not supported",
		      __func__, protocol_version);
	}
}

extern int unpack_slurm_step_layout(slurm_step_layout_t **layout, buf_t *buffer,
				    uint16_t protocol_version)
{
	uint16_t uint16_tmp;
	uint32_t num_tids, uint32_tmp;
	slurm_step_layout_t *step_layout = NULL;
	int i;
	char *tmp_str = NULL;

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		safe_unpack16(&uint16_tmp, buffer);
		if (!uint16_tmp)
			return SLURM_SUCCESS;

		step_layout = xmalloc(sizeof(slurm_step_layout_t));
		*layout = step_layout;

		/* skip the NULL placeholder packed above */
		safe_skipstr(buffer);
		safe_unpackstr(&step_layout->node_list, buffer);
		safe_unpack32(&step_layout->node_cnt, buffer);
		safe_unpack16(&step_layout->start_protocol_ver, buffer);
		safe_unpack32(&step_layout->task_cnt, buffer);
		safe_unpack32(&step_layout->task_dist, buffer);

		safe_xcalloc(step_layout->tasks, step_layout->node_cnt,
			     sizeof(uint16_t));
		safe_xcalloc(step_layout->tids, step_layout->node_cnt,
			     sizeof(uint32_t *));
		for (i = 0; i < step_layout->node_cnt; i++) {
			safe_unpack32_array(&(step_layout->tids[i]),
					    &num_tids,
					    buffer);
			step_layout->tasks[i] = num_tids;
		}
		safe_unpack16_array(&step_layout->cpt_compact_array,
				    &step_layout->cpt_compact_cnt, buffer);
		safe_unpack32_array(&step_layout->cpt_compact_reps,
				    &uint32_tmp, buffer);
		xassert(uint32_tmp == step_layout->cpt_compact_cnt);

		safe_unpackstr(&tmp_str, buffer);
		if (tmp_str) {
			step_layout->alias_addrs =
				extract_net_cred(tmp_str, protocol_version);
			if (!step_layout->alias_addrs) {
				xfree(tmp_str);
				goto unpack_error;
			}
			step_layout->alias_addrs->net_cred = tmp_str;
		}
	} else {
		error("%s: protocol_version %hu not supported",
		      __func__, protocol_version);
		goto unpack_error;
	}
	return SLURM_SUCCESS;

unpack_error:
	slurm_step_layout_destroy(step_layout);
	*layout = NULL;
	return SLURM_ERROR;
}
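
/*
 * Round-trip sketch, kept out of the build: pack a layout into a buffer
 * and unpack it again, as done on either side of the RPC. Assumes the
 * buffer helpers from src/common/pack.h (init_buf, set_buf_offset,
 * FREE_NULL_BUFFER).
 */
#if (0)
static void _example_pack_roundtrip(slurm_step_layout_t *step_layout)
{
	buf_t *buffer = init_buf(BUF_SIZE);
	slurm_step_layout_t *copy = NULL;

	pack_slurm_step_layout(step_layout, buffer, SLURM_PROTOCOL_VERSION);
	set_buf_offset(buffer, 0);	/* rewind for reading */
	if (unpack_slurm_step_layout(&copy, buffer, SLURM_PROTOCOL_VERSION)
	    != SLURM_SUCCESS)
		error("unpack failed");
	slurm_step_layout_destroy(copy);
	FREE_NULL_BUFFER(buffer);
}
#endif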

/* destroys structure for step layout */
extern int slurm_step_layout_destroy(slurm_step_layout_t *step_layout)
{
	int i = 0;

	if (step_layout) {
		slurm_free_node_alias_addrs(step_layout->alias_addrs);
		xfree(step_layout->node_list);
		xfree(step_layout->tasks);
		xfree(step_layout->cpt_compact_array);
		xfree(step_layout->cpt_compact_reps);
		for (i = 0; i < step_layout->node_cnt; i++) {
			xfree(step_layout->tids[i]);
		}
		xfree(step_layout->tids);

		xfree(step_layout);
	}

	return SLURM_SUCCESS;
}

int slurm_step_layout_host_id (slurm_step_layout_t *s, int taskid)
{
	int i, j;

	if (!s->tasks || !s->tids || (taskid > s->task_cnt - 1))
		return SLURM_ERROR;
	for (i = 0; i < s->node_cnt; i++)
		for (j = 0; j < s->tasks[i]; j++)
			if (s->tids[i][j] == taskid)
				return i;

	return SLURM_ERROR;
}

char *slurm_step_layout_host_name (slurm_step_layout_t *s, int taskid)
{
	int hostid = slurm_step_layout_host_id (s, taskid);

	if (hostid < 0)
		return NULL;

	return nodelist_nth_host(s->node_list, hostid);
}
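
/*
 * Lookup sketch, kept out of the build: map a global task id back to its
 * node index and hostname. The name should be released with free(), on
 * the assumption that it comes from the hostlist layer via
 * nodelist_nth_host().
 */
#if (0)
static void _example_host_lookup(slurm_step_layout_t *step_layout)
{
	int hostid = slurm_step_layout_host_id(step_layout, 5);
	char *host = slurm_step_layout_host_name(step_layout, 5);

	if (host) {
		info("task 5 runs as node index %d on %s", hostid, host);
		free(host);
	}
}
#endif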

/* build maps for task layout on nodes */
static int _init_task_layout(slurm_step_layout_req_t *step_layout_req,
			     slurm_step_layout_t *step_layout,
			     const char *arbitrary_nodes)
{
	int cpu_cnt = 0, cpu_inx = 0, cpu_task_cnt = 0, cpu_task_inx = 0, i;
	hostlist_t *hl;

	uint16_t cpus[step_layout->node_cnt];
	/* if the request lacks cpus_per_task, its fields are pointed at
	 * these stack arrays for the duration of this call */
	uint16_t cpus_per_task[1];
	uint32_t cpus_task_reps[1];

	if (step_layout->node_cnt == 0)
		return SLURM_ERROR;
	if (step_layout->tasks)	/* layout already completed */
		return SLURM_SUCCESS;

	if (!step_layout_req->cpus_per_task) {
		cpus_per_task[0] = 1;
		cpus_task_reps[0] = step_layout_req->num_hosts;
		step_layout_req->cpus_per_task = cpus_per_task;
		step_layout_req->cpus_task_reps = cpus_task_reps;
	}

	if (((int)step_layout_req->cpus_per_task[0] < 1) ||
	    (step_layout_req->cpus_per_task[0] == NO_VAL16)) {
		step_layout_req->cpus_per_task[0] = 1;
		step_layout_req->cpus_task_reps[0] =
			step_layout_req->num_hosts;
	}

	step_layout->plane_size = step_layout_req->plane_size;

	step_layout->tasks = xcalloc(step_layout->node_cnt, sizeof(uint16_t));
	step_layout->tids = xcalloc(step_layout->node_cnt, sizeof(uint32_t *));
	hl = hostlist_create(step_layout->node_list);
	/* make sure the node count we were given matches the hostlist */
	i = hostlist_count(hl);
	if (step_layout->node_cnt > i)
		step_layout->node_cnt = i;
	hostlist_destroy(hl);

	debug("laying out the %u tasks on %u hosts %s dist %u",
	      step_layout->task_cnt, step_layout->node_cnt,
	      step_layout->node_list, step_layout->task_dist);
	if (step_layout->node_cnt < 1) {
		error("no hostlist given, cannot lay out tasks");
		return SLURM_ERROR;
	}

	for (i = 0; i < step_layout->node_cnt; i++) {
		cpus[i] = (step_layout_req->cpus_per_node[cpu_inx] /
			   step_layout_req->cpus_per_task[cpu_task_inx]);
		if (cpus[i] == 0) {
			/* this can be a result of a heterogeneous allocation
			 * (e.g. 4 cpus on one node and 2 on the second with
			 * step_layout_req->cpus_per_task=3) */
			cpus[i] = 1;
		}

		if (step_layout->plane_size &&
		    (step_layout->plane_size != NO_VAL16) &&
		    ((step_layout->task_dist & SLURM_DIST_STATE_BASE)
		     != SLURM_DIST_PLANE)) {
			/* plane_size when dist != plane is used to
			 * convey ntasks_per_node. Adjust the number
			 * of cpus to reflect that. */
			uint16_t cpus_per_node =
				step_layout->plane_size *
				step_layout_req->cpus_per_task[cpu_task_inx];
			if (cpus[i] > cpus_per_node)
				cpus[i] = cpus_per_node;
		}

		if ((++cpu_cnt) >=
		    step_layout_req->cpu_count_reps[cpu_inx]) {
			/* move to next record */
			cpu_inx++;
			cpu_cnt = 0;
		}

		if ((++cpu_task_cnt) >=
		    step_layout_req->cpus_task_reps[cpu_task_inx]) {
			/* move to next record */
			cpu_task_inx++;
			cpu_task_cnt = 0;
		}
	}

	if ((step_layout->task_dist & SLURM_DIST_NODEMASK)
	    == SLURM_DIST_NODECYCLIC)
		return _task_layout_cyclic(step_layout, cpus);
	else if ((step_layout->task_dist & SLURM_DIST_STATE_BASE)
		 == SLURM_DIST_ARBITRARY)
		return _task_layout_hostfile(step_layout, arbitrary_nodes);
	else if ((step_layout->task_dist & SLURM_DIST_STATE_BASE)
		 == SLURM_DIST_PLANE)
		return _task_layout_plane(step_layout, cpus);
	else
		return _task_layout_block(step_layout, cpus);
}

/* use the specific task placement given in the hostfile to run tasks on
 * each host listed there
 * XXX: Need to handle over-subscribe.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	int i = 0, j, taskid = 0, task_cnt = 0;
	hostlist_iterator_t *itr = NULL, *itr_task = NULL;
	char *host = NULL;

	hostlist_t *job_alloc_hosts = NULL;
	hostlist_t *step_alloc_hosts = NULL;

	int step_inx = 0, step_hosts_cnt = 0;
	node_record_t **step_hosts_ptrs = NULL;
	node_record_t *host_ptr = NULL;

	debug2("job list is %s", step_layout->node_list);
	if (!arbitrary_nodes) {
		error("no hostlist given for arbitrary dist");
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	step_alloc_hosts = hostlist_create(arbitrary_nodes);
	if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) {
		error("Asked for %u tasks but have %d in the nodelist. "
		      "Check your nodelist, or set the -n option to %d",
		      step_layout->task_cnt,
		      hostlist_count(step_alloc_hosts),
		      hostlist_count(step_alloc_hosts));
		hostlist_destroy(step_alloc_hosts);
		return SLURM_ERROR;
	}

	job_alloc_hosts = hostlist_create(step_layout->node_list);
	itr = hostlist_iterator_create(job_alloc_hosts);
	itr_task = hostlist_iterator_create(step_alloc_hosts);

	/*
	 * Build an array of pointers so that we can do pointer comparisons
	 * rather than strcmp's on nodes.
	 */
	step_hosts_cnt = hostlist_count(step_alloc_hosts);
	step_hosts_ptrs = xcalloc(step_hosts_cnt,
				  sizeof(node_record_t *));

	if (!running_in_daemon()) {
		/* running in salloc - init node records */
		init_node_conf();
		build_all_nodeline_info(false, 0);
		rehash_node();
	}

	step_inx = 0;
	while ((host = hostlist_next(itr_task))) {
		step_hosts_ptrs[step_inx++] = find_node_record_no_alias(host);
		free(host);
	}

	while ((host = hostlist_next(itr))) {
		host_ptr = find_node_record(host);
		step_layout->tasks[i] = 0;

		for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) {
			if (host_ptr == step_hosts_ptrs[step_inx]) {
				step_layout->tasks[i]++;
				task_cnt++;
			}
			if (task_cnt >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", host, step_layout->tasks[i]);
		if (step_layout->tasks[i] == 0)
			goto reset_hosts;
		step_layout->tids[i] = xcalloc(step_layout->tasks[i],
					       sizeof(uint32_t));
		taskid = 0;
		j = 0;

		for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) {
			if (host_ptr == step_hosts_ptrs[step_inx]) {
				step_layout->tids[i][j] = taskid;
				j++;
			}
			taskid++;
			if (j >= step_layout->tasks[i])
				break;
		}
		i++;
	reset_hosts:
		free(host);
		if (i > step_layout->task_cnt)
			break;
	}
	hostlist_iterator_destroy(itr);
	hostlist_iterator_destroy(itr_task);
	hostlist_destroy(job_alloc_hosts);
	hostlist_destroy(step_alloc_hosts);
	xfree(step_hosts_ptrs);

	if (task_cnt != step_layout->task_cnt) {
		error("Asked for %u tasks but placed %d. Check your nodelist",
		      step_layout->task_cnt, task_cnt);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
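
/*
 * Worked example for _task_layout_hostfile() (hypothetical hosts): with
 * an arbitrary node list of "host1,host1,host2" and 3 tasks, host1
 * receives tasks {0, 1} and host2 receives task {2}; each host's task
 * count comes straight from how often it repeats in the user-supplied
 * list.
 */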

static int _task_layout_block(slurm_step_layout_t *step_layout, uint16_t *cpus)
{
	static uint16_t select_params = NO_VAL16;
	int i, j, task_id = 0;
	bool pack_nodes;

	if (select_params == NO_VAL16)
		select_params = slurm_conf.select_type_param;
	if (step_layout->task_dist & SLURM_DIST_PACK_NODES)
		pack_nodes = true;
	else if (step_layout->task_dist & SLURM_DIST_NO_PACK_NODES)
		pack_nodes = false;
	else if (select_params & SELECT_PACK_NODES)
		pack_nodes = true;
	else
		pack_nodes = false;

	if (pack_nodes) {
		/* Pass 1: Put one task on each node */
		for (i = 0; ((i < step_layout->node_cnt) &&
			     (task_id < step_layout->task_cnt)); i++) {
			/* cpus has already been altered for cpus_per_task */
			if (step_layout->tasks[i] < cpus[i]) {
				step_layout->tasks[i]++;
				task_id++;
			}
		}

		/* Pass 2: Fill remaining CPUs on a node-by-node basis */
		for (i = 0; ((i < step_layout->node_cnt) &&
			     (task_id < step_layout->task_cnt)); i++) {
			/* cpus has already been altered for cpus_per_task */
			while ((step_layout->tasks[i] < cpus[i]) &&
			       (task_id < step_layout->task_cnt)) {
				step_layout->tasks[i]++;
				task_id++;
			}
		}

		/* Pass 3: Spread remaining tasks across all nodes */
		while (task_id < step_layout->task_cnt) {
			for (i = 0; ((i < step_layout->node_cnt) &&
				     (task_id < step_layout->task_cnt)); i++) {
				step_layout->tasks[i]++;
				task_id++;
			}
		}
	} else {
		/* To effectively deal with heterogeneous nodes, we fake a
		 * cyclic distribution to determine how many tasks go on each
		 * node and then make those assignments in a block fashion. */
		bool over_subscribe = false;
		for (j = 0; task_id < step_layout->task_cnt; j++) {
			bool space_remaining = false;
			for (i = 0; ((i < step_layout->node_cnt) &&
				     (task_id < step_layout->task_cnt)); i++) {
				if ((j < cpus[i]) || over_subscribe) {
					step_layout->tasks[i]++;
					task_id++;
					if ((j + 1) < cpus[i])
						space_remaining = true;
				}
			}
			if (!space_remaining)
				over_subscribe = true;
		}
	}

	/* Now distribute the tasks */
	task_id = 0;
	for (i = 0; i < step_layout->node_cnt; i++) {
		step_layout->tids[i] = xcalloc(step_layout->tasks[i],
					       sizeof(uint32_t));
		for (j = 0; j < step_layout->tasks[i]; j++) {
			step_layout->tids[i][j] = task_id;
			task_id++;
		}
	}
	return SLURM_SUCCESS;
}
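
/*
 * Worked example for _task_layout_block() (hypothetical counts): 5 tasks
 * on 2 nodes with cpus = {4, 2}. With pack_nodes the three passes above
 * yield tasks = {4, 1}; without packing, the faked-cyclic pass yields
 * tasks = {3, 2} before the task ids are handed out in blocks.
 */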

/* distribute tasks across available nodes: allocate tasks to nodes
 * in a cyclic fashion using available processors. once all available
 * processors are allocated, continue to allocate tasks, over-subscribing
 * nodes as needed. for example
 * cpus per node        4  2  4  2
 *                     -- -- -- --
 * task distribution:   0  1  2  3
 *                      4  5  6  7
 *                      8     9
 *                     10    11     all processors allocated now
 *                     12 13 14 15  etc.
 */
static int _task_layout_cyclic(slurm_step_layout_t *step_layout,
			       uint16_t *cpus)
{
	int i, j, max_over_subscribe = 0, taskid = 0, total_cpus = 0;
	bool over_subscribe = false;

	for (i = 0; i < step_layout->node_cnt; i++)
		total_cpus += cpus[i];
	if (total_cpus < step_layout->task_cnt) {
		over_subscribe = true;
		i = step_layout->task_cnt - total_cpus;
		max_over_subscribe = ROUNDUP(i, step_layout->node_cnt);
	}

	for (j = 0; taskid < step_layout->task_cnt; j++) { /* cycle counter */
		bool space_remaining = false;
		for (i = 0; ((i < step_layout->node_cnt) &&
			     (taskid < step_layout->task_cnt)); i++) {
			if ((j < cpus[i]) ||
			    (over_subscribe &&
			     (j < (cpus[i] + max_over_subscribe)))) {
				xrealloc(step_layout->tids[i], sizeof(uint32_t)
					 * (step_layout->tasks[i] + 1));

				step_layout->tids[i][step_layout->tasks[i]] =
					taskid;
				taskid++;
				step_layout->tasks[i]++;
				if ((j + 1) < cpus[i])
					space_remaining = true;
			}
		}
		if (!space_remaining)
			over_subscribe = true;
	}
	return SLURM_SUCCESS;
}

/*
 * The plane distribution results in a block cyclic of block size
 * "plane_size".
 * To effectively deal with heterogeneous nodes, we fake a cyclic
 * distribution to figure out how many tasks go on each node and
 * then make the assignments of task numbers to nodes using the
 * user-specified plane size.
 * For example:
 * plane_size = 2, #tasks = 6, #nodes = 3
 *
 * Node#:                Node0  Node1  Node2
 *                       -----  -----  -----
 * # of allocated CPUs:    4      1      1
 *
 * task distribution:    0  1    2      3
 *                       4  5
 */
static int _task_layout_plane(slurm_step_layout_t *step_layout,
			      uint16_t *cpus)
{
	int i, j, k, taskid = 0;
	bool over_subscribe = false;
	uint32_t cur_task[step_layout->node_cnt];
	int plane_start = 0;

	debug3("_task_layout_plane plane_size %u node_cnt %u task_cnt %u",
	       step_layout->plane_size,
	       step_layout->node_cnt, step_layout->task_cnt);

	if (step_layout->plane_size == 0)	/* plane_size is unsigned */
		return SLURM_ERROR;

	if (step_layout->tasks == NULL)
		return SLURM_ERROR;

	/* figure out how many tasks go to each node */
	for (j = 0; taskid < step_layout->task_cnt; j++) { /* cycle counter */
		bool space_remaining = false;
		/* place one task on each node first */
		if (j == 0) {
			for (i = 0; ((i < step_layout->node_cnt) &&
				     (taskid < step_layout->task_cnt)); i++) {
				taskid++;
				step_layout->tasks[i]++;
			}
		}
		for (i = 0; ((i < step_layout->node_cnt) &&
			     (taskid < step_layout->task_cnt)); i++) {
			/* account for the first task already placed */
			if (j == 0)
				plane_start = 1;
			else
				plane_start = 0;
			for (k = plane_start; (k < step_layout->plane_size) &&
			     (taskid < step_layout->task_cnt); k++) {
				if ((cpus[i] - step_layout->tasks[i]) ||
				    over_subscribe) {
					taskid++;
					step_layout->tasks[i]++;
					if (cpus[i] - (step_layout->tasks[i]
						       + 1) >= 0)
						space_remaining = true;
				}
			}
		}
		if (!space_remaining)
			over_subscribe = true;
	}

	/* now distribute the tasks */
	taskid = 0;
	for (i = 0; i < step_layout->node_cnt; i++) {
		step_layout->tids[i] = xcalloc(step_layout->tasks[i],
					       sizeof(uint32_t));
		cur_task[i] = 0;
	}
	for (j = 0; taskid < step_layout->task_cnt; j++) { /* cycle counter */
		for (i = 0; ((i < step_layout->node_cnt) &&
			     (taskid < step_layout->task_cnt)); i++) {
			/* assign a block of 'plane_size' tasks to this node */
			for (k = 0; ((k < step_layout->plane_size) &&
				     (cur_task[i] < step_layout->tasks[i]) &&
				     (taskid < step_layout->task_cnt)); k++) {
				step_layout->tids[i][cur_task[i]] = taskid;
				taskid++;
				cur_task[i]++;
			}
		}
	}

	if (taskid != step_layout->task_cnt) {
		error("_task_layout_plane: Mismatch in task count (%d != %d)",
		      taskid, step_layout->task_cnt);
		return SLURM_ERROR;
	}

#if (0)
	/* debugging only */
	for (i = 0; i < step_layout->node_cnt; i++) {
		info("tasks[%d]: %u", i, step_layout->tasks[i]);
	}

	for (i = 0; i < step_layout->node_cnt; i++) {
		info("Host %d _plane_ # of tasks %u", i,
		     step_layout->tasks[i]);
		for (j = 0; j < step_layout->tasks[i]; j++) {
			info("Host %d _plane_ localid %d taskid %u",
			     i, j, step_layout->tids[i][j]);
		}
	}
#endif

	return SLURM_SUCCESS;
}

typedef struct {
	task_dist_states_t task_dist;
	const char *string;
} layout_type_name_t;

static const layout_type_name_t layout_type_names[] = {
	/* distribute tasks cyclically, one per node in turn */
	{ SLURM_DIST_CYCLIC, "Cyclic" },
	/* distribute tasks filling node by node */
	{ SLURM_DIST_BLOCK, "Block" },
	/* arbitrary task distribution */
	{ SLURM_DIST_ARBITRARY, "arbitrary task distribution" },
	/*
	 * distribute tasks by filling up planes of lllp first and then by
	 * going across the nodes. See documentation for more information
	 */
	{ SLURM_DIST_PLANE, "Plane" },
	/*
	 * distribute tasks 1 per node: round robin: same for lowest
	 * level of logical processor (lllp)
	 */
	{ SLURM_DIST_CYCLIC_CYCLIC, "CCyclic" },
	/* cyclic for node and block for lllp */
	{ SLURM_DIST_CYCLIC_BLOCK, "CBlock" },
	/* block for node and cyclic for lllp */
	{ SLURM_DIST_BLOCK_CYCLIC, "BCyclic" },
	/* block for node and block for lllp */
	{ SLURM_DIST_BLOCK_BLOCK, "BBlock" },
	/* cyclic for node and full cyclic for lllp */
	{ SLURM_DIST_CYCLIC_CFULL, "CFCyclic" },
	/* block for node and full cyclic for lllp */
	{ SLURM_DIST_BLOCK_CFULL, "BFCyclic" },
	{ SLURM_DIST_CYCLIC_CYCLIC_CYCLIC, "CCyclicCyclic" },
	{ SLURM_DIST_CYCLIC_CYCLIC_BLOCK, "CCyclicBlock" },
	{ SLURM_DIST_CYCLIC_CYCLIC_CFULL, "CCyclicFCyclic" },
	{ SLURM_DIST_CYCLIC_BLOCK_CYCLIC, "CBlockCyclic" },
	{ SLURM_DIST_CYCLIC_BLOCK_BLOCK, "CBlockBlock" },
	{ SLURM_DIST_CYCLIC_BLOCK_CFULL, "CBlockFCyclic" },
	{ SLURM_DIST_CYCLIC_CFULL_CYCLIC, "CFCyclicCyclic" },
	{ SLURM_DIST_CYCLIC_CFULL_BLOCK, "CFCyclicBlock" },
	{ SLURM_DIST_CYCLIC_CFULL_CFULL, "CFCyclicFCyclic" },
	{ SLURM_DIST_BLOCK_CYCLIC_CYCLIC, "BCyclicCyclic" },
	{ SLURM_DIST_BLOCK_CYCLIC_BLOCK, "BCyclicBlock" },
	{ SLURM_DIST_BLOCK_CYCLIC_CFULL, "BCyclicFCyclic" },
	{ SLURM_DIST_BLOCK_BLOCK_CYCLIC, "BBlockCyclic" },
	{ SLURM_DIST_BLOCK_BLOCK_BLOCK, "BBlockBlock" },
	{ SLURM_DIST_BLOCK_BLOCK_CFULL, "BBlockFCyclic" },
	{ SLURM_DIST_BLOCK_CFULL_CYCLIC, "BFCyclicCyclic" },
	{ SLURM_DIST_BLOCK_CFULL_BLOCK, "BFCyclicBlock" },
	{ SLURM_DIST_BLOCK_CFULL_CFULL, "BFCyclicFCyclic" },
	{ 0 }
};

extern char *slurm_step_layout_type_name(task_dist_states_t task_dist)
{
	char *name = NULL, *pos = NULL;

	for (int i = 0; layout_type_names[i].task_dist; i++) {
		if (layout_type_names[i].task_dist ==
		    (task_dist & SLURM_DIST_STATE_BASE)) {
			xstrfmtcatat(name, &pos, "%s",
				     layout_type_names[i].string);
			break;
		}
	}

	if (!name) {
		/* SLURM_DIST_UNKNOWN - No distribution specified */
		xstrfmtcatat(name, &pos, "%s", "Unknown");
	}

	if (task_dist & SLURM_DIST_PACK_NODES)
		xstrfmtcatat(name, &pos, ",%s", "Pack");

	if (task_dist & SLURM_DIST_NO_PACK_NODES)
		xstrfmtcatat(name, &pos, ",%s", "NoPack");

	xassert(pos);
	return name;
}
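
/*
 * Usage sketch, kept out of the build: the returned name is built with
 * xstrfmtcatat() and must be xfree()d by the caller.
 */
#if (0)
static void _example_type_name(void)
{
	char *name = slurm_step_layout_type_name(SLURM_DIST_BLOCK |
						 SLURM_DIST_PACK_NODES);

	info("distribution: %s", name);	/* prints "Block,Pack" */
	xfree(name);
}
#endif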