blob: f5c0b4f0c2a77557b8f8667d87487952327050d4 [file] [log] [blame]
/*****************************************************************************\
* job_resources.c - functions to manage data structure identifying specific
* CPUs allocated to a job, step or partition
*****************************************************************************
* Copyright (C) 2008-2010 Lawrence Livermore National Security.
* Written by Morris Jette <jette1@llnl.gov>.
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#include <stdlib.h>
#include <string.h>
#include "slurm/slurm_errno.h"
#include "src/common/hostlist.h"
#include "src/common/job_resources.h"
#include "src/common/log.h"
#include "src/common/pack.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/slurmctld/slurmctld.h"
/* Create an empty job_resources data structure */
extern job_resources_t *create_job_resources(void)
{
job_resources_t *job_resrcs;
job_resrcs = xmalloc(sizeof(struct job_resources));
return job_resrcs;
}
/* Set the socket and core counts associated with a set of selected
* nodes of a job_resources data structure based upon slurmctld state.
* (sets cores_per_socket, sockets_per_node, and sock_core_rep_count based
* upon the value of node_bitmap, also creates core_bitmap based upon
* the total number of cores in the allocation). Call this ONLY from
* slurmctld. Example of use:
*
* job_resources_t *job_resrcs_ptr = create_job_resources();
* node_name2bitmap("dummy[2,5,12,16]", true, &(job_res_ptr->node_bitmap));
* rc = build_job_resources(job_resrcs_ptr);
*/
extern int build_job_resources(job_resources_t *job_resrcs)
{
int core_cnt = 0, sock_inx = -1;
node_record_t *node_ptr;
if (job_resrcs->node_bitmap == NULL) {
error("build_job_resources: node_bitmap is NULL");
return SLURM_ERROR;
}
xfree(job_resrcs->sockets_per_node);
xfree(job_resrcs->cores_per_socket);
xfree(job_resrcs->sock_core_rep_count);
job_resrcs->sockets_per_node = xcalloc(job_resrcs->nhosts,
sizeof(uint16_t));
job_resrcs->cores_per_socket = xcalloc(job_resrcs->nhosts,
sizeof(uint16_t));
job_resrcs->sock_core_rep_count = xcalloc(job_resrcs->nhosts,
sizeof(uint32_t));
for (int i = 0;
(node_ptr = next_node_bitmap(job_resrcs->node_bitmap, &i)); i++) {
if ((sock_inx < 0) ||
(node_ptr->tot_sockets !=
job_resrcs->sockets_per_node[sock_inx]) ||
(node_ptr->cores !=
job_resrcs->cores_per_socket[sock_inx])) {
sock_inx++;
job_resrcs->sockets_per_node[sock_inx] =
node_ptr->tot_sockets;
job_resrcs->cores_per_socket[sock_inx] =
node_ptr->cores;
}
job_resrcs->sock_core_rep_count[sock_inx]++;
core_cnt += node_ptr->tot_cores;
}
if (core_cnt) {
/*
* A zero size job (for burst buffer create/destroy only)
* will have no bitmaps.
*/
job_resrcs->core_bitmap = bit_alloc(core_cnt);
job_resrcs->core_bitmap_used = bit_alloc(core_cnt);
}
return SLURM_SUCCESS;
}
/* Rebuild cpu_array_cnt, cpu_array_value, and cpu_array_reps based upon the
* values of nhosts and cpus in an existing data structure
* Return total CPU count or -1 on error */
extern int build_job_resources_cpu_array(job_resources_t *job_resrcs_ptr)
{
int cpu_count = 0;
uint32_t last_cpu_cnt = NO_VAL;
int node_cpu_count;
if (job_resrcs_ptr->nhosts == 0)
return cpu_count; /* no work to do */
if (job_resrcs_ptr->cpus == NULL) {
error("build_job_resources_cpu_array: cpus==NULL");
return -1;
}
/* clear vestigial data and create new arrays of max size */
job_resrcs_ptr->cpu_array_cnt = 0;
xfree(job_resrcs_ptr->cpu_array_reps);
job_resrcs_ptr->cpu_array_reps = xcalloc(job_resrcs_ptr->nhosts,
sizeof(uint32_t));
xfree(job_resrcs_ptr->cpu_array_value);
job_resrcs_ptr->cpu_array_value = xcalloc(job_resrcs_ptr->nhosts,
sizeof(uint16_t));
for (int i = 0, j = 0;
next_node_bitmap(job_resrcs_ptr->node_bitmap, &i); i++) {
/*
* This needs to be the threads per core count to handle
* allocations correctly.
*/
node_cpu_count = job_resources_get_node_cpu_cnt(
job_resrcs_ptr, j, i);
if (node_cpu_count != last_cpu_cnt) {
last_cpu_cnt = node_cpu_count;
job_resrcs_ptr->cpu_array_value[
job_resrcs_ptr->cpu_array_cnt]
= last_cpu_cnt;
job_resrcs_ptr->cpu_array_reps[
job_resrcs_ptr->cpu_array_cnt] = 1;
job_resrcs_ptr->cpu_array_cnt++;
} else {
job_resrcs_ptr->cpu_array_reps[
job_resrcs_ptr->cpu_array_cnt-1]++;
}
/* This needs to be the full amount for accounting */
cpu_count += job_resrcs_ptr->cpus[j];
j++;
}
return cpu_count;
}
/* Reset the node_bitmap in a job_resources data structure
* This is needed after a restart/reconfiguration since nodes can
* be added or removed from the system resulting in changing in
* the bitmap size or bit positions */
extern int reset_node_bitmap(void *void_job_ptr)
{
job_record_t *job_ptr = (job_record_t *) void_job_ptr;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
int i;
if (!job_resrcs_ptr)
return SLURM_SUCCESS;
FREE_NULL_BITMAP(job_resrcs_ptr->node_bitmap);
if (job_resrcs_ptr->nodes &&
(node_name2bitmap(job_resrcs_ptr->nodes, false,
&job_resrcs_ptr->node_bitmap, NULL))) {
error("Invalid nodes (%s) for %pJ",
job_resrcs_ptr->nodes, job_ptr);
return SLURM_ERROR;
} else if (job_resrcs_ptr->nodes == NULL) {
job_resrcs_ptr->node_bitmap = bit_alloc(node_record_count);
}
i = bit_set_count(job_resrcs_ptr->node_bitmap);
if (job_resrcs_ptr->nhosts != i) {
error("Invalid change in resource allocation node count for %pJ, %u to %d",
job_ptr, job_resrcs_ptr->nhosts, i);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
extern int valid_job_resources(job_resources_t *job_resrcs)
{
int sock_inx = 0, sock_cnt = 0;
int total_job_cores, total_node_cores;
node_record_t *node_ptr;
if (job_resrcs->node_bitmap == NULL) {
error("valid_job_resources: node_bitmap is NULL");
return SLURM_ERROR;
}
if ((job_resrcs->sockets_per_node == NULL) ||
(job_resrcs->cores_per_socket == NULL) ||
(job_resrcs->sock_core_rep_count == NULL)) {
error("valid_job_resources: socket/core array is NULL");
return SLURM_ERROR;
}
for (int i = 0;
(node_ptr = next_node_bitmap(job_resrcs->node_bitmap, &i)); i++) {
if (sock_cnt >= job_resrcs->sock_core_rep_count[sock_inx]) {
sock_inx++;
sock_cnt = 0;
}
/* KNL nodes can should maintain a constant total core count,
* but the socket/NUMA count can change on reboot */
total_job_cores = job_resrcs->sockets_per_node[sock_inx] *
job_resrcs->cores_per_socket[sock_inx];
total_node_cores = node_ptr->tot_cores;
if (total_job_cores != total_node_cores) {
error("valid_job_resources: %s sockets:%u,%u, cores %u,%u",
node_ptr->name,
node_ptr->tot_sockets,
job_resrcs->sockets_per_node[sock_inx],
node_ptr->cores,
job_resrcs->cores_per_socket[sock_inx]);
return SLURM_ERROR;
}
sock_cnt++;
}
return SLURM_SUCCESS;
}
extern job_resources_t *copy_job_resources(job_resources_t *job_resrcs_ptr)
{
int i, sock_inx = 0;
job_resources_t *new_layout = xmalloc(sizeof(struct job_resources));
xassert(job_resrcs_ptr);
new_layout->nhosts = job_resrcs_ptr->nhosts;
new_layout->nodes = xstrdup(job_resrcs_ptr->nodes);
new_layout->ncpus = job_resrcs_ptr->ncpus;
new_layout->node_req = job_resrcs_ptr->node_req;
new_layout->whole_node = job_resrcs_ptr->whole_node;
if (job_resrcs_ptr->core_bitmap) {
new_layout->core_bitmap = bit_copy(job_resrcs_ptr->
core_bitmap);
}
if (job_resrcs_ptr->core_bitmap_used) {
new_layout->core_bitmap_used = bit_copy(job_resrcs_ptr->
core_bitmap_used);
}
if (job_resrcs_ptr->node_bitmap) {
new_layout->node_bitmap = bit_copy(job_resrcs_ptr->
node_bitmap);
}
new_layout->cpu_array_cnt = job_resrcs_ptr->cpu_array_cnt;
if (job_resrcs_ptr->cpu_array_reps &&
job_resrcs_ptr->cpu_array_cnt) {
new_layout->cpu_array_reps =
xcalloc(job_resrcs_ptr->cpu_array_cnt,
sizeof(uint32_t));
memcpy(new_layout->cpu_array_reps,
job_resrcs_ptr->cpu_array_reps,
(sizeof(uint32_t) * job_resrcs_ptr->cpu_array_cnt));
}
if (job_resrcs_ptr->cpu_array_value &&
job_resrcs_ptr->cpu_array_cnt) {
new_layout->cpu_array_value =
xcalloc(job_resrcs_ptr->cpu_array_cnt,
sizeof(uint16_t));
memcpy(new_layout->cpu_array_value,
job_resrcs_ptr->cpu_array_value,
(sizeof(uint16_t) * job_resrcs_ptr->cpu_array_cnt));
}
if (job_resrcs_ptr->cpus) {
new_layout->cpus = xcalloc(job_resrcs_ptr->nhosts,
sizeof(uint16_t));
memcpy(new_layout->cpus, job_resrcs_ptr->cpus,
(sizeof(uint16_t) * job_resrcs_ptr->nhosts));
}
if (job_resrcs_ptr->cpus_used) {
new_layout->cpus_used = xcalloc(job_resrcs_ptr->nhosts,
sizeof(uint16_t));
memcpy(new_layout->cpus_used, job_resrcs_ptr->cpus_used,
(sizeof(uint16_t) * job_resrcs_ptr->nhosts));
}
if (job_resrcs_ptr->memory_allocated) {
new_layout->memory_allocated = xcalloc(new_layout->nhosts,
sizeof(uint64_t));
memcpy(new_layout->memory_allocated,
job_resrcs_ptr->memory_allocated,
(sizeof(uint64_t) * job_resrcs_ptr->nhosts));
}
if (job_resrcs_ptr->memory_used) {
new_layout->memory_used = xcalloc(new_layout->nhosts,
sizeof(uint64_t));
memcpy(new_layout->memory_used,
job_resrcs_ptr->memory_used,
(sizeof(uint64_t) * job_resrcs_ptr->nhosts));
}
/* Copy sockets_per_node, cores_per_socket and core_sock_rep_count */
new_layout->sockets_per_node = xcalloc(new_layout->nhosts,
sizeof(uint16_t));
new_layout->cores_per_socket = xcalloc(new_layout->nhosts,
sizeof(uint16_t));
new_layout->sock_core_rep_count = xcalloc(new_layout->nhosts,
sizeof(uint32_t));
for (i=0; i<new_layout->nhosts; i++) {
if (job_resrcs_ptr->sock_core_rep_count[i] == 0) {
error("copy_job_resources: sock_core_rep_count=0");
break;
}
sock_inx += job_resrcs_ptr->sock_core_rep_count[i];
if (sock_inx >= job_resrcs_ptr->nhosts) {
i++;
break;
}
}
memcpy(new_layout->sockets_per_node,
job_resrcs_ptr->sockets_per_node, (sizeof(uint16_t) * i));
memcpy(new_layout->cores_per_socket,
job_resrcs_ptr->cores_per_socket, (sizeof(uint16_t) * i));
memcpy(new_layout->sock_core_rep_count,
job_resrcs_ptr->sock_core_rep_count,
(sizeof(uint32_t) * i));
return new_layout;
}
extern void free_job_resources(job_resources_t **job_resrcs_pptr)
{
job_resources_t *job_resrcs_ptr = *job_resrcs_pptr;
if (job_resrcs_ptr) {
FREE_NULL_BITMAP(job_resrcs_ptr->core_bitmap);
FREE_NULL_BITMAP(job_resrcs_ptr->core_bitmap_used);
xfree(job_resrcs_ptr->cores_per_socket);
xfree(job_resrcs_ptr->cpu_array_reps);
xfree(job_resrcs_ptr->cpu_array_value);
xfree(job_resrcs_ptr->cpus);
xfree(job_resrcs_ptr->cpus_used);
xfree(job_resrcs_ptr->memory_allocated);
xfree(job_resrcs_ptr->memory_used);
FREE_NULL_BITMAP(job_resrcs_ptr->node_bitmap);
xfree(job_resrcs_ptr->nodes);
xfree(job_resrcs_ptr->sock_core_rep_count);
xfree(job_resrcs_ptr->sockets_per_node);
xfree(job_resrcs_ptr->tasks_per_node);
xfree(job_resrcs_ptr);
*job_resrcs_pptr = NULL;
}
}
/*
* Log the contents of a job_resources data structure using info()
*
* Function argument is void * to avoid a circular dependency between
* job_resources.h and slurmctld.h. Cast inside the function here to
* resolve that problem for now.
*/
extern void log_job_resources(void *void_job_ptr)
{
job_record_t *job_ptr = (job_record_t *) void_job_ptr;
job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
int bit_inx = 0, bit_reps, i;
int array_size, node_inx;
int sock_inx = 0, sock_reps = 0;
if (job_resrcs_ptr == NULL) {
error("%s: job_resrcs_ptr is NULL", __func__);
return;
}
info("====================");
info("%pJ nhosts:%u ncpus:%u node_req:%u nodes=%s",
job_ptr, job_resrcs_ptr->nhosts, job_resrcs_ptr->ncpus,
job_resrcs_ptr->node_req, job_resrcs_ptr->nodes);
if (job_resrcs_ptr->cpus == NULL) {
error("%s: cpus array is NULL", __func__);
return;
}
if (job_resrcs_ptr->memory_allocated == NULL) {
error("%s: memory array is NULL", __func__);
return;
}
if ((job_resrcs_ptr->cores_per_socket == NULL) ||
(job_resrcs_ptr->sockets_per_node == NULL) ||
(job_resrcs_ptr->sock_core_rep_count == NULL)) {
error("%s: socket/core array is NULL", __func__);
return;
}
if (job_resrcs_ptr->core_bitmap == NULL) {
error("%s: core_bitmap is NULL", __func__);
return;
}
if (job_resrcs_ptr->core_bitmap_used == NULL) {
error("%s: core_bitmap_used is NULL", __func__);
return;
}
array_size = bit_size(job_resrcs_ptr->core_bitmap);
/* Can only log node_bitmap from slurmctld, so don't bother here */
for (node_inx=0; node_inx<job_resrcs_ptr->nhosts; node_inx++) {
uint32_t cpus_used = 0;
uint64_t memory_allocated = 0, memory_used = 0;
info("Node[%d]:", node_inx);
if (sock_reps >=
job_resrcs_ptr->sock_core_rep_count[sock_inx]) {
sock_inx++;
sock_reps = 0;
}
sock_reps++;
if (job_resrcs_ptr->cpus_used)
cpus_used = job_resrcs_ptr->cpus_used[node_inx];
if (job_resrcs_ptr->memory_used)
memory_used = job_resrcs_ptr->memory_used[node_inx];
if (job_resrcs_ptr->memory_allocated)
memory_allocated = job_resrcs_ptr->
memory_allocated[node_inx];
info(" Mem(MB):%"PRIu64":%"PRIu64" Sockets:%u"
" Cores:%u CPUs:%u:%u",
memory_allocated, memory_used,
job_resrcs_ptr->sockets_per_node[sock_inx],
job_resrcs_ptr->cores_per_socket[sock_inx],
job_resrcs_ptr->cpus[node_inx],
cpus_used);
bit_reps = job_resrcs_ptr->sockets_per_node[sock_inx] *
job_resrcs_ptr->cores_per_socket[sock_inx];
for (i=0; i<bit_reps; i++) {
if (bit_inx >= array_size) {
error("%s: array size wrong", __func__);
break;
}
if (bit_test(job_resrcs_ptr->core_bitmap,
bit_inx)) {
char *core_used = "";
if (bit_test(job_resrcs_ptr->
core_bitmap_used, bit_inx))
core_used = " and in use";
info(" Socket[%d] Core[%d] is allocated%s",
(i / job_resrcs_ptr->
cores_per_socket[sock_inx]),
(i % job_resrcs_ptr->
cores_per_socket[sock_inx]),
core_used);
}
bit_inx++;
}
}
for (node_inx=0; node_inx<job_resrcs_ptr->cpu_array_cnt;
node_inx++) {
if (node_inx == 0)
info("--------------------");
info("cpu_array_value[%d]:%u reps:%u", node_inx,
job_resrcs_ptr->cpu_array_value[node_inx],
job_resrcs_ptr->cpu_array_reps[node_inx]);
}
info("====================");
}
extern void pack_job_resources(job_resources_t *job_resrcs_ptr, buf_t *buffer,
uint16_t protocol_version)
{
int i;
uint32_t sock_recs = 0;
if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) {
if (job_resrcs_ptr == NULL) {
uint32_t empty = NO_VAL;
pack32(empty, buffer);
return;
}
pack32(job_resrcs_ptr->nhosts, buffer);
pack32(job_resrcs_ptr->ncpus, buffer);
pack32(job_resrcs_ptr->next_step_node_inx, buffer);
pack32(job_resrcs_ptr->node_req, buffer);
packstr(job_resrcs_ptr->nodes, buffer);
pack8(job_resrcs_ptr->whole_node, buffer);
pack16(job_resrcs_ptr->threads_per_core, buffer);
pack16(job_resrcs_ptr->cr_type, buffer);
if (job_resrcs_ptr->cpu_array_reps)
pack32_array(job_resrcs_ptr->cpu_array_reps,
job_resrcs_ptr->cpu_array_cnt, buffer);
else
pack32_array(job_resrcs_ptr->cpu_array_reps, 0, buffer);
if (job_resrcs_ptr->cpu_array_value)
pack16_array(job_resrcs_ptr->cpu_array_value,
job_resrcs_ptr->cpu_array_cnt, buffer);
else
pack16_array(job_resrcs_ptr->cpu_array_value,
0, buffer);
if (job_resrcs_ptr->cpus)
pack16_array(job_resrcs_ptr->cpus,
job_resrcs_ptr->nhosts, buffer);
else
pack16_array(job_resrcs_ptr->cpus, 0, buffer);
if (job_resrcs_ptr->cpus_used)
pack16_array(job_resrcs_ptr->cpus_used,
job_resrcs_ptr->nhosts, buffer);
else
pack16_array(job_resrcs_ptr->cpus_used, 0, buffer);
if (job_resrcs_ptr->memory_allocated)
pack64_array(job_resrcs_ptr->memory_allocated,
job_resrcs_ptr->nhosts, buffer);
else
pack64_array(job_resrcs_ptr->memory_allocated,
0, buffer);
if (job_resrcs_ptr->memory_used)
pack64_array(job_resrcs_ptr->memory_used,
job_resrcs_ptr->nhosts, buffer);
else
pack64_array(job_resrcs_ptr->memory_used, 0, buffer);
xassert(job_resrcs_ptr->cores_per_socket);
xassert(job_resrcs_ptr->sock_core_rep_count);
xassert(job_resrcs_ptr->sockets_per_node);
for (i=0; i < job_resrcs_ptr->nhosts; i++) {
sock_recs += job_resrcs_ptr->
sock_core_rep_count[i];
if (sock_recs >= job_resrcs_ptr->nhosts)
break;
}
i++;
pack16_array(job_resrcs_ptr->sockets_per_node,
(uint32_t) i, buffer);
pack16_array(job_resrcs_ptr->cores_per_socket,
(uint32_t) i, buffer);
pack32_array(job_resrcs_ptr->sock_core_rep_count,
(uint32_t) i, buffer);
xassert(job_resrcs_ptr->core_bitmap);
xassert(job_resrcs_ptr->core_bitmap_used);
pack_bit_str_hex(job_resrcs_ptr->core_bitmap, buffer);
pack_bit_str_hex(job_resrcs_ptr->core_bitmap_used,
buffer);
pack_bit_str_hex(job_resrcs_ptr->node_bitmap, buffer);
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
if (job_resrcs_ptr == NULL) {
uint32_t empty = NO_VAL;
pack32(empty, buffer);
return;
}
pack32(job_resrcs_ptr->nhosts, buffer);
pack32(job_resrcs_ptr->ncpus, buffer);
pack32(job_resrcs_ptr->node_req, buffer);
packstr(job_resrcs_ptr->nodes, buffer);
pack8(job_resrcs_ptr->whole_node, buffer);
pack16(job_resrcs_ptr->threads_per_core, buffer);
pack16(job_resrcs_ptr->cr_type, buffer);
if (job_resrcs_ptr->cpu_array_reps)
pack32_array(job_resrcs_ptr->cpu_array_reps,
job_resrcs_ptr->cpu_array_cnt, buffer);
else
pack32_array(job_resrcs_ptr->cpu_array_reps, 0, buffer);
if (job_resrcs_ptr->cpu_array_value)
pack16_array(job_resrcs_ptr->cpu_array_value,
job_resrcs_ptr->cpu_array_cnt, buffer);
else
pack16_array(job_resrcs_ptr->cpu_array_value,
0, buffer);
if (job_resrcs_ptr->cpus)
pack16_array(job_resrcs_ptr->cpus,
job_resrcs_ptr->nhosts, buffer);
else
pack16_array(job_resrcs_ptr->cpus, 0, buffer);
if (job_resrcs_ptr->cpus_used)
pack16_array(job_resrcs_ptr->cpus_used,
job_resrcs_ptr->nhosts, buffer);
else
pack16_array(job_resrcs_ptr->cpus_used, 0, buffer);
if (job_resrcs_ptr->memory_allocated)
pack64_array(job_resrcs_ptr->memory_allocated,
job_resrcs_ptr->nhosts, buffer);
else
pack64_array(job_resrcs_ptr->memory_allocated,
0, buffer);
if (job_resrcs_ptr->memory_used)
pack64_array(job_resrcs_ptr->memory_used,
job_resrcs_ptr->nhosts, buffer);
else
pack64_array(job_resrcs_ptr->memory_used, 0, buffer);
xassert(job_resrcs_ptr->cores_per_socket);
xassert(job_resrcs_ptr->sock_core_rep_count);
xassert(job_resrcs_ptr->sockets_per_node);
for (i=0; i < job_resrcs_ptr->nhosts; i++) {
sock_recs += job_resrcs_ptr->
sock_core_rep_count[i];
if (sock_recs >= job_resrcs_ptr->nhosts)
break;
}
i++;
pack16_array(job_resrcs_ptr->sockets_per_node,
(uint32_t) i, buffer);
pack16_array(job_resrcs_ptr->cores_per_socket,
(uint32_t) i, buffer);
pack32_array(job_resrcs_ptr->sock_core_rep_count,
(uint32_t) i, buffer);
xassert(job_resrcs_ptr->core_bitmap);
xassert(job_resrcs_ptr->core_bitmap_used);
pack_bit_str_hex(job_resrcs_ptr->core_bitmap, buffer);
pack_bit_str_hex(job_resrcs_ptr->core_bitmap_used,
buffer);
pack_bit_str_hex(job_resrcs_ptr->node_bitmap, buffer);
} else {
error("pack_job_resources: protocol_version %hu not supported",
protocol_version);
}
}
extern int unpack_job_resources(job_resources_t **job_resrcs_pptr,
buf_t *buffer, uint16_t protocol_version)
{
char *bit_fmt = NULL;
uint32_t empty, tmp32;
job_resources_t *job_resrcs;
xassert(job_resrcs_pptr);
if (protocol_version >= SLURM_24_11_PROTOCOL_VERSION) {
safe_unpack32(&empty, buffer);
if (empty == NO_VAL) {
*job_resrcs_pptr = NULL;
return SLURM_SUCCESS;
}
job_resrcs = xmalloc(sizeof(struct job_resources));
job_resrcs->nhosts = empty;
safe_unpack32(&job_resrcs->ncpus, buffer);
safe_unpack32(&job_resrcs->next_step_node_inx, buffer);
safe_unpack32(&job_resrcs->node_req, buffer);
safe_unpackstr_xmalloc(&job_resrcs->nodes, &tmp32, buffer);
safe_unpack8(&job_resrcs->whole_node, buffer);
safe_unpack16(&job_resrcs->threads_per_core, buffer);
safe_unpack16(&job_resrcs->cr_type, buffer);
safe_unpack32_array(&job_resrcs->cpu_array_reps,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cpu_array_reps);
job_resrcs->cpu_array_cnt = tmp32;
safe_unpack16_array(&job_resrcs->cpu_array_value,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cpu_array_value);
if (tmp32 != job_resrcs->cpu_array_cnt)
goto unpack_error;
safe_unpack16_array(&job_resrcs->cpus, &tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cpus);
if (tmp32 != job_resrcs->nhosts)
goto unpack_error;
safe_unpack16_array(&job_resrcs->cpus_used, &tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cpus_used);
safe_unpack64_array(&job_resrcs->memory_allocated,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->memory_allocated);
safe_unpack64_array(&job_resrcs->memory_used, &tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->memory_used);
safe_unpack16_array(&job_resrcs->sockets_per_node,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->sockets_per_node);
safe_unpack16_array(&job_resrcs->cores_per_socket,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cores_per_socket);
safe_unpack32_array(&job_resrcs->sock_core_rep_count,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->sock_core_rep_count);
unpack_bit_str_hex(&job_resrcs->core_bitmap, buffer);
unpack_bit_str_hex(&job_resrcs->core_bitmap_used,
buffer);
unpack_bit_str_hex(&job_resrcs->node_bitmap, buffer);
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
safe_unpack32(&empty, buffer);
if (empty == NO_VAL) {
*job_resrcs_pptr = NULL;
return SLURM_SUCCESS;
}
job_resrcs = xmalloc(sizeof(struct job_resources));
job_resrcs->nhosts = empty;
safe_unpack32(&job_resrcs->ncpus, buffer);
safe_unpack32(&job_resrcs->node_req, buffer);
safe_unpackstr(&job_resrcs->nodes, buffer);
safe_unpack8(&job_resrcs->whole_node, buffer);
safe_unpack16(&job_resrcs->threads_per_core, buffer);
safe_unpack16(&job_resrcs->cr_type, buffer);
safe_unpack32_array(&job_resrcs->cpu_array_reps,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cpu_array_reps);
job_resrcs->cpu_array_cnt = tmp32;
safe_unpack16_array(&job_resrcs->cpu_array_value,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cpu_array_value);
if (tmp32 != job_resrcs->cpu_array_cnt)
goto unpack_error;
safe_unpack16_array(&job_resrcs->cpus, &tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cpus);
if (tmp32 != job_resrcs->nhosts)
goto unpack_error;
safe_unpack16_array(&job_resrcs->cpus_used, &tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cpus_used);
safe_unpack64_array(&job_resrcs->memory_allocated,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->memory_allocated);
safe_unpack64_array(&job_resrcs->memory_used, &tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->memory_used);
safe_unpack16_array(&job_resrcs->sockets_per_node,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->sockets_per_node);
safe_unpack16_array(&job_resrcs->cores_per_socket,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->cores_per_socket);
safe_unpack32_array(&job_resrcs->sock_core_rep_count,
&tmp32, buffer);
if (tmp32 == 0)
xfree(job_resrcs->sock_core_rep_count);
unpack_bit_str_hex(&job_resrcs->core_bitmap, buffer);
unpack_bit_str_hex(&job_resrcs->core_bitmap_used,
buffer);
unpack_bit_str_hex(&job_resrcs->node_bitmap, buffer);
} else {
error("unpack_job_resources: protocol_version %hu not "
"supported", protocol_version);
goto unpack_error;
}
/*
* SELECT_LINEAR overlapped with SELECT_MULTIPLE_SHARING_GRES_PJ until
* 25.11. SELECT_MULTIPLE_SHARING_GRES_PJ was never put on
* job_resrcs->cr_type so no real overlap happened, but it isn't good in
* practice. We just need to set it to the correct value here.
* Once 25.05 is no longer supported we can remove this 'if'.
*/
if (job_resrcs->cr_type & 0x8000) {
job_resrcs->cr_type &= ~(0x8000);
job_resrcs->cr_type |= SELECT_LINEAR;
}
*job_resrcs_pptr = job_resrcs;
return SLURM_SUCCESS;
unpack_error:
error("unpack_job_resources: unpack error");
free_job_resources(&job_resrcs);
xfree(bit_fmt);
*job_resrcs_pptr = NULL;
return SLURM_ERROR;
}
extern int get_job_resources_offset(job_resources_t *job_resrcs_ptr,
uint32_t node_id, uint16_t socket_id,
uint16_t core_id)
{
int i, bit_inx = 0;
xassert(job_resrcs_ptr);
for (i=0; i<job_resrcs_ptr->nhosts; i++) {
if (job_resrcs_ptr->sock_core_rep_count[i] <= node_id) {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
job_resrcs_ptr->sock_core_rep_count[i];
node_id -= job_resrcs_ptr->sock_core_rep_count[i];
} else if (socket_id >= job_resrcs_ptr->sockets_per_node[i]) {
error("get_job_resrcs_bit: socket_id >= socket_cnt "
"(%u >= %u)", socket_id,
job_resrcs_ptr->sockets_per_node[i]);
return -1;
} else if (core_id >= job_resrcs_ptr->cores_per_socket[i]) {
error("get_job_resrcs_bit: core_id >= core_cnt "
"(%u >= %u)", core_id,
job_resrcs_ptr->cores_per_socket[i]);
return -1;
} else {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
node_id;
bit_inx += job_resrcs_ptr->cores_per_socket[i] *
socket_id;
bit_inx += core_id;
break;
}
}
i = bit_size(job_resrcs_ptr->core_bitmap);
if (bit_inx >= i) {
error("get_job_resources_bit: offset >= bitmap size "
"(%d >= %d)", bit_inx, i);
return -1;
}
return bit_inx;
}
extern int get_job_resources_bit(job_resources_t *job_resrcs_ptr,
uint32_t node_id, uint16_t socket_id,
uint16_t core_id)
{
int bit_inx = get_job_resources_offset(job_resrcs_ptr, node_id,
socket_id, core_id);
if (bit_inx < 0)
return SLURM_ERROR;
return bit_test(job_resrcs_ptr->core_bitmap, bit_inx);
}
extern int set_job_resources_bit(job_resources_t *job_resrcs_ptr,
uint32_t node_id, uint16_t socket_id,
uint16_t core_id)
{
int bit_inx = get_job_resources_offset(job_resrcs_ptr, node_id,
socket_id, core_id);
if (bit_inx < 0)
return SLURM_ERROR;
bit_set(job_resrcs_ptr->core_bitmap, bit_inx);
return SLURM_SUCCESS;
}
/* For every core bitmap and core_bitmap_used set in the "from" resources
* structure at from_node_offset, set the corresponding bit in the "new"
* resources structure at new_node_offset */
extern int job_resources_bits_copy(job_resources_t *new_job_resrcs_ptr,
uint16_t new_node_offset,
job_resources_t *from_job_resrcs_ptr,
uint16_t from_node_offset)
{
int i, rc = SLURM_SUCCESS;
int new_core_cnt = 0, from_core_cnt = 0;
xassert(new_job_resrcs_ptr);
xassert(from_job_resrcs_ptr);
if (new_node_offset >= new_job_resrcs_ptr->nhosts) {
error("job_resources_bits_move: new_node_offset invalid "
"(%u is 0 or >=%u)", new_node_offset,
new_job_resrcs_ptr->nhosts);
return SLURM_ERROR;
}
for (i = 0; i < new_job_resrcs_ptr->nhosts; i++) {
if (new_job_resrcs_ptr->sock_core_rep_count[i] <=
new_node_offset) {
new_node_offset -= new_job_resrcs_ptr->
sock_core_rep_count[i];
} else {
new_core_cnt = new_job_resrcs_ptr->sockets_per_node[i] *
new_job_resrcs_ptr->cores_per_socket[i];
break;
}
}
if (from_node_offset >= from_job_resrcs_ptr->nhosts) {
error("job_resources_bits_move: from_node_offset invalid "
"(%u is 0 or >=%u)", from_node_offset,
from_job_resrcs_ptr->nhosts);
return SLURM_ERROR;
}
for (i = 0; i < from_job_resrcs_ptr->nhosts; i++) {
if (from_job_resrcs_ptr->sock_core_rep_count[i] <=
from_node_offset) {
from_node_offset -= from_job_resrcs_ptr->
sock_core_rep_count[i];
} else {
from_core_cnt = from_job_resrcs_ptr->sockets_per_node[i] *
from_job_resrcs_ptr->cores_per_socket[i];
break;
}
}
if (new_core_cnt != from_core_cnt) {
error("job_resources_bits_move: core_cnt mismatch (%d != %d)",
new_core_cnt, from_core_cnt);
rc = SLURM_ERROR;
}
bit_or(new_job_resrcs_ptr->core_bitmap,
from_job_resrcs_ptr->core_bitmap);
bit_or(new_job_resrcs_ptr->core_bitmap_used,
from_job_resrcs_ptr->core_bitmap_used);
return rc;
}
/*
* AND two job_resources structures.
* Every node/core set in job_resrcs1_ptr and job_resrcs2_ptr is set in the
* resulting job_resrcs1_ptr data structure
* RET SLURM_SUCCESS or an error code
*/
extern int job_resources_and(job_resources_t *job_resrcs1_ptr,
job_resources_t *job_resrcs2_ptr)
{
int i, i_first, i_last, j;
int node_cnt;
int sock_core_cnt1 = 0, sock_core_cnt2 = 0;
int so_co_off1 = 0, so_co_off2 = 0;
int core_cnt, core_cnt1, core_cnt2;;
int core_off1 = 0, core_off2 = 0;
int rc = SLURM_SUCCESS;
xassert(job_resrcs1_ptr);
xassert(job_resrcs2_ptr);
xassert(job_resrcs1_ptr->core_bitmap);
xassert(job_resrcs2_ptr->core_bitmap);
xassert(job_resrcs1_ptr->node_bitmap);
xassert(job_resrcs2_ptr->node_bitmap);
/* Allocate space for merged arrays */
node_cnt = bit_size(job_resrcs1_ptr->node_bitmap);
i = bit_size(job_resrcs2_ptr->node_bitmap);
if (node_cnt != i) {
error("%s: node_bitmap sizes differ (%d != %d)", __func__,
node_cnt, i);
rc = SLURM_ERROR;
node_cnt = MIN(node_cnt, i);
}
/* Set the values in data structure used for merging */
i_first = bit_ffs(job_resrcs1_ptr->node_bitmap);
i = bit_ffs(job_resrcs2_ptr->node_bitmap);
if ((i != -1) && (i < i_first))
i_first = i;
i_last = bit_fls(job_resrcs1_ptr->node_bitmap);
i = bit_fls(job_resrcs2_ptr->node_bitmap);
if ((i != -1) && (i > i_last))
i_last = i;
if (i_last >= node_cnt)
i_last = node_cnt - 1;
if (i_last == -1) /* node_bitmap empty in both inputs */
i_last = -2;
for (i = i_first; i <= i_last; i++) {
bool match1 = false, match2 = false;
if (bit_test(job_resrcs1_ptr->node_bitmap, i))
match1 = true;
if (bit_test(job_resrcs2_ptr->node_bitmap, i))
match2 = true;
if (!match1 && !match2) /* Unused node */
continue;
if (match1 && match2) { /* Merge (AND) core_bitmaps */
if (++sock_core_cnt1 >
job_resrcs1_ptr->sock_core_rep_count[so_co_off1]) {
sock_core_cnt1 = 0;
so_co_off1++;
}
if (++sock_core_cnt2 >
job_resrcs2_ptr->sock_core_rep_count[so_co_off2]) {
sock_core_cnt2 = 0;
so_co_off2++;
}
core_cnt1 =
job_resrcs1_ptr->cores_per_socket[so_co_off1] *
job_resrcs1_ptr->sockets_per_node[so_co_off1];
core_cnt2 =
job_resrcs2_ptr->cores_per_socket[so_co_off2] *
job_resrcs2_ptr->sockets_per_node[so_co_off2];
if (core_cnt1 != core_cnt2) {
error("%s: Inconsistent socket/core count for node_inx %d (%d != %d)",
__func__, i, core_cnt1, core_cnt2);
rc = SLURM_ERROR;
}
core_cnt = MIN(core_cnt1, core_cnt2);
for (j = 0; j < core_cnt; j++) {
if (bit_test(job_resrcs1_ptr->core_bitmap,
core_off1 + j) &&
!bit_test(job_resrcs2_ptr->core_bitmap,
core_off2 + j)) {
bit_clear(job_resrcs1_ptr->core_bitmap,
core_off1 + j);
}
}
core_off1 += core_cnt1;
core_off2 += core_cnt2;
} else if (match1) {
if (++sock_core_cnt1 >
job_resrcs1_ptr->sock_core_rep_count[so_co_off1]) {
sock_core_cnt1 = 0;
so_co_off1++;
}
core_cnt1 =
job_resrcs1_ptr->cores_per_socket[so_co_off1] *
job_resrcs1_ptr->sockets_per_node[so_co_off1];
for (j = 0; j < core_cnt1; j++) {
bit_clear(job_resrcs1_ptr->core_bitmap,
core_off1 + j);
}
core_off1 += core_cnt1;
} else { /* match2 only */
if (++sock_core_cnt2 >
job_resrcs2_ptr->sock_core_rep_count[so_co_off2]) {
sock_core_cnt2 = 0;
so_co_off2++;
}
core_cnt2 =
job_resrcs2_ptr->cores_per_socket[so_co_off2] *
job_resrcs2_ptr->sockets_per_node[so_co_off2];
core_off2 += core_cnt2;
}
}
return rc;
}
/*
* OR two job_resources structures.
* Every node/core set in job_resrcs1_ptr or job_resrcs2_ptr is set in the
* resulting job_resrcs1_ptr data structure.
* NOTE: Only these job_resources_t fields in job_resrcs1_ptr are changed:
* core_bitmap, node_bitmap
* cores_per_socket, sockets_per_node, sock_core_rep_count, nhosts
* RET SLURM_SUCCESS or an error code, best effort operation happens on error
*/
extern int job_resources_or(job_resources_t *job_resrcs1_ptr,
job_resources_t *job_resrcs2_ptr)
{
job_resources_t *job_resrcs_new;
int i, i_first, i_last, j;
int node_cnt, node_inx = -1;
int sock_core_cnt1 = 0, sock_core_cnt2 = 0;
int so_co_off1 = 0, so_co_off2 = 0;
int core_cnt, core_cnt1, core_cnt2;
int core_off = 0, core_off1 = 0, core_off2 = 0;
int rc = SLURM_SUCCESS;
xassert(job_resrcs1_ptr);
xassert(job_resrcs2_ptr);
xassert(job_resrcs1_ptr->core_bitmap);
xassert(job_resrcs2_ptr->core_bitmap);
xassert(job_resrcs1_ptr->node_bitmap);
xassert(job_resrcs2_ptr->node_bitmap);
/* Allocate space for merged arrays */
job_resrcs_new = xmalloc(sizeof(job_resources_t));
node_cnt = bit_size(job_resrcs1_ptr->node_bitmap);
i = bit_size(job_resrcs2_ptr->node_bitmap);
if (node_cnt != i) {
error("%s: node_bitmap sizes differ (%d != %d)", __func__,
node_cnt, i);
rc = SLURM_ERROR;
node_cnt = MIN(node_cnt, i);
}
job_resrcs_new->node_bitmap = bit_alloc(node_cnt);
i = bit_set_count(job_resrcs1_ptr->node_bitmap) +
bit_set_count(job_resrcs2_ptr->node_bitmap);
job_resrcs_new->cores_per_socket = xcalloc(i, sizeof(uint32_t));
job_resrcs_new->sockets_per_node = xcalloc(i, sizeof(uint32_t));
job_resrcs_new->sock_core_rep_count = xcalloc(i, sizeof(uint32_t));
i = bit_size(job_resrcs1_ptr->core_bitmap) +
bit_size(job_resrcs2_ptr->core_bitmap);
job_resrcs_new->core_bitmap = bit_alloc(i); /* May be over-sized */
/* Set the values in data structure used for merging */
i_first = bit_ffs(job_resrcs1_ptr->node_bitmap);
i = bit_ffs(job_resrcs2_ptr->node_bitmap);
if ((i != -1) && (i < i_first))
i_first = i;
i_last = bit_fls(job_resrcs1_ptr->node_bitmap);
i = bit_fls(job_resrcs2_ptr->node_bitmap);
if ((i != -1) && (i > i_last))
i_last = i;
if (i_last >= node_cnt)
i_last = node_cnt - 1;
if (i_last == -1) /* node_bitmap empty in both inputs */
i_last = -2;
for (i = i_first; i <= i_last; i++) {
bool match1 = false, match2 = false;
if (bit_test(job_resrcs1_ptr->node_bitmap, i))
match1 = true;
if (bit_test(job_resrcs2_ptr->node_bitmap, i))
match2 = true;
if (!match1 && !match2) /* Unused node */
continue;
bit_set(job_resrcs_new->node_bitmap, i);
node_inx++;
if (match1 && match2) { /* Merge (OR) core_bitmaps */
if (++sock_core_cnt1 >
job_resrcs1_ptr->sock_core_rep_count[so_co_off1]) {
sock_core_cnt1 = 0;
so_co_off1++;
}
if (++sock_core_cnt2 >
job_resrcs2_ptr->sock_core_rep_count[so_co_off2]) {
sock_core_cnt2 = 0;
so_co_off2++;
}
job_resrcs_new->cores_per_socket[node_inx] =
job_resrcs1_ptr->cores_per_socket[so_co_off1];
job_resrcs_new->sockets_per_node[node_inx] =
job_resrcs1_ptr->sockets_per_node[so_co_off1];
core_cnt1 =
job_resrcs1_ptr->cores_per_socket[so_co_off1] *
job_resrcs1_ptr->sockets_per_node[so_co_off1];
core_cnt2 =
job_resrcs2_ptr->cores_per_socket[so_co_off2] *
job_resrcs2_ptr->sockets_per_node[so_co_off2];
if (core_cnt1 != core_cnt2) {
error("%s: Inconsistent socket/core count for node_inx %d (%d != %d)",
__func__, i, core_cnt1, core_cnt2);
rc = SLURM_ERROR;
}
core_cnt = MIN(core_cnt1, core_cnt2);
for (j = 0; j < core_cnt; j++) {
if (bit_test(job_resrcs1_ptr->core_bitmap,
core_off1 + j) ||
bit_test(job_resrcs2_ptr->core_bitmap,
core_off2 + j)) {
bit_set(job_resrcs_new->core_bitmap,
core_off + j);
}
}
core_off += core_cnt;
core_off1 += core_cnt1;
core_off2 += core_cnt2;
} else if (match1) { /* Copy core bitmap */
if (++sock_core_cnt1 >
job_resrcs1_ptr->sock_core_rep_count[so_co_off1]) {
sock_core_cnt1 = 0;
so_co_off1++;
}
job_resrcs_new->cores_per_socket[node_inx] =
job_resrcs1_ptr->cores_per_socket[so_co_off1];
job_resrcs_new->sockets_per_node[node_inx] =
job_resrcs1_ptr->sockets_per_node[so_co_off1];
core_cnt1 = job_resrcs_new->cores_per_socket[node_inx] *
job_resrcs_new->sockets_per_node[node_inx];
for (j = 0; j < core_cnt1; j++) {
if (bit_test(job_resrcs1_ptr->core_bitmap,
core_off1 + j)) {
bit_set(job_resrcs_new->core_bitmap,
core_off + j);
}
}
core_off += core_cnt1;
core_off1 += core_cnt1;
} else { /* match2 only */ /* Copy core bitmap */
if (++sock_core_cnt2 >
job_resrcs2_ptr->sock_core_rep_count[so_co_off2]) {
sock_core_cnt2 = 0;
so_co_off2++;
}
job_resrcs_new->cores_per_socket[node_inx] =
job_resrcs2_ptr->cores_per_socket[so_co_off2];
job_resrcs_new->sockets_per_node[node_inx] =
job_resrcs2_ptr->sockets_per_node[so_co_off2];
core_cnt2 = job_resrcs_new->cores_per_socket[node_inx] *
job_resrcs_new->sockets_per_node[node_inx];
for (j = 0; j < core_cnt2; j++) {
if (bit_test(job_resrcs2_ptr->core_bitmap,
core_off2 + j)) {
bit_set(job_resrcs_new->core_bitmap,
core_off + j);
}
}
core_off += core_cnt2;
core_off2 += core_cnt2;
}
job_resrcs_new->sock_core_rep_count[node_inx] = 1;
}
/* Update data structure fields as needed */
job_resrcs1_ptr->nhosts = node_inx + 1;
FREE_NULL_BITMAP(job_resrcs1_ptr->core_bitmap);
job_resrcs1_ptr->core_bitmap = job_resrcs_new->core_bitmap;
FREE_NULL_BITMAP(job_resrcs1_ptr->node_bitmap);
job_resrcs1_ptr->node_bitmap = job_resrcs_new->node_bitmap;
xfree(job_resrcs1_ptr->cores_per_socket);
job_resrcs1_ptr->cores_per_socket = job_resrcs_new->cores_per_socket;
xfree(job_resrcs1_ptr->sock_core_rep_count);
job_resrcs1_ptr->sock_core_rep_count =
job_resrcs_new->sock_core_rep_count;
xfree(job_resrcs1_ptr->sockets_per_node);
job_resrcs1_ptr->sockets_per_node = job_resrcs_new->sockets_per_node;
xfree(job_resrcs_new);
return rc;
}
extern int get_job_resources_node(job_resources_t *job_resrcs_ptr,
uint32_t node_id)
{
int i, bit_inx = 0, core_cnt = 0;
xassert(job_resrcs_ptr);
for (i=0; i<job_resrcs_ptr->nhosts; i++) {
if (job_resrcs_ptr->sock_core_rep_count[i] <= node_id) {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
job_resrcs_ptr->sock_core_rep_count[i];
node_id -= job_resrcs_ptr->sock_core_rep_count[i];
} else {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
node_id;
core_cnt = job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i];
break;
}
}
if (core_cnt < 1) {
error("get_job_resources_node: core_cnt=0");
return 0;
}
i = bit_size(job_resrcs_ptr->core_bitmap);
if ((bit_inx + core_cnt) > i) {
error("get_job_resources_node: offset > bitmap size "
"(%d >= %d)", (bit_inx + core_cnt), i);
return 0;
}
for (i=0; i<core_cnt; i++) {
if (bit_test(job_resrcs_ptr->core_bitmap, bit_inx++))
return 1;
}
return 0;
}
static int _change_job_resources_node(job_resources_t *job_resrcs_ptr,
uint32_t node_id, bool new_value)
{
int i, bit_inx = 0, core_cnt = 0;
xassert(job_resrcs_ptr);
for (i=0; i<job_resrcs_ptr->nhosts; i++) {
if (job_resrcs_ptr->sock_core_rep_count[i] <= node_id) {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
job_resrcs_ptr->sock_core_rep_count[i];
node_id -= job_resrcs_ptr->sock_core_rep_count[i];
} else {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
node_id;
core_cnt = job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i];
break;
}
}
if (core_cnt < 1) {
error("_change_job_resources_node: core_cnt=0");
return SLURM_ERROR;
}
i = bit_size(job_resrcs_ptr->core_bitmap);
if ((bit_inx + core_cnt) > i) {
error("_change_job_resources_node: offset > bitmap size "
"(%d >= %d)", (bit_inx + core_cnt), i);
return SLURM_ERROR;
}
for (i=0; i<core_cnt; i++) {
if (new_value)
bit_set(job_resrcs_ptr->core_bitmap, bit_inx++);
else
bit_clear(job_resrcs_ptr->core_bitmap, bit_inx++);
}
return SLURM_SUCCESS;
}
extern int set_job_resources_node(job_resources_t *job_resrcs_ptr,
uint32_t node_id)
{
return _change_job_resources_node(job_resrcs_ptr, node_id, true);
}
extern int clear_job_resources_node(job_resources_t *job_resrcs_ptr,
uint32_t node_id)
{
return _change_job_resources_node(job_resrcs_ptr, node_id, false);
}
/* Completely remove specified node from job resources structure */
extern int extract_job_resources_node(job_resources_t *job, uint32_t node_id)
{
int i, n;
int bit_inx = 0, core_cnt = 0, host_cnt, len, node_inx = node_id;
xassert(job);
/* Modify core/socket counter arrays to remove this node */
host_cnt = job->nhosts;
for (i = 0; i < job->nhosts; i++) {
host_cnt -= job->sock_core_rep_count[i];
if (job->sock_core_rep_count[i] <= node_inx) {
bit_inx += job->sockets_per_node[i] *
job->cores_per_socket[i] *
job->sock_core_rep_count[i];
node_inx -= job->sock_core_rep_count[i];
} else {
bit_inx += job->sockets_per_node[i] *
job->cores_per_socket[i] * node_inx;
core_cnt = job->sockets_per_node[i] *
job->cores_per_socket[i];
job->sock_core_rep_count[i]--;
if (job->sock_core_rep_count[i] == 0) {
for ( ; host_cnt > 0; i++) {
job->cores_per_socket[i] =
job->cores_per_socket[i+1];
job->sock_core_rep_count[i] =
job->sock_core_rep_count[i+1];
job->sockets_per_node[i] =
job->sockets_per_node[i+1];
host_cnt -= job->sock_core_rep_count[i];
}
}
break;
}
}
if (core_cnt < 1) {
error("%s: core_cnt=0", __func__);
return SLURM_ERROR;
}
/* Shift core_bitmap contents and shrink it to remove this node */
len = bit_size(job->core_bitmap);
for (i = bit_inx; (i + core_cnt) < len; i++) {
if (bit_test(job->core_bitmap, i + core_cnt))
bit_set(job->core_bitmap, i);
else
bit_clear(job->core_bitmap, i);
if (!job->core_bitmap_used)
;
else if (bit_test(job->core_bitmap_used, i + core_cnt))
bit_set(job->core_bitmap_used, i);
else
bit_clear(job->core_bitmap_used, i);
}
bit_realloc(job->core_bitmap, len - core_cnt);
if (job->core_bitmap_used)
bit_realloc(job->core_bitmap_used, len - core_cnt);
/* Shift cpus, cpus_used, memory_allocated, and memory_used arrays */
for (i = 0, n = -1; next_node_bitmap(job->node_bitmap, &i); i++) {
if (++n == node_id) {
bit_clear(job->node_bitmap, i);
break;
}
}
job->nhosts--;
for (i = n; i < job->nhosts; i++) {
job->cpus[i] = job->cpus[i+1];
job->cpus_used[i] = job->cpus_used[i+1];
job->memory_allocated[i] = job->memory_allocated[i+1];
job->memory_used[i] = job->memory_used[i+1];
}
xfree(job->nodes);
job->nodes = bitmap2node_name(job->node_bitmap);
job->ncpus = build_job_resources_cpu_array(job);
return SLURM_SUCCESS;
}
/* Return the count of core bitmaps set for the specific node */
extern int count_job_resources_node(job_resources_t *job_resrcs_ptr,
uint32_t node_id)
{
int i, bit_inx = 0, core_cnt = 0;
int set_cnt = 0;
xassert(job_resrcs_ptr);
for (i=0; i<job_resrcs_ptr->nhosts; i++) {
if (job_resrcs_ptr->sock_core_rep_count[i] <= node_id) {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
job_resrcs_ptr->sock_core_rep_count[i];
node_id -= job_resrcs_ptr->sock_core_rep_count[i];
} else {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
node_id;
core_cnt = job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i];
break;
}
}
if (core_cnt < 1) {
error("count_job_resources_node: core_cnt=0");
return set_cnt;
}
i = bit_size(job_resrcs_ptr->core_bitmap);
if ((bit_inx + core_cnt) > i) {
error("count_job_resources_node: offset > bitmap size "
"(%d >= %d)", (bit_inx + core_cnt), i);
return set_cnt;
}
for (i=0; i<core_cnt; i++) {
if (bit_test(job_resrcs_ptr->core_bitmap, bit_inx++))
set_cnt++;
}
return set_cnt;
}
/* Return a copy of core_bitmap only for the specific node */
extern bitstr_t * copy_job_resources_node(job_resources_t *job_resrcs_ptr,
uint32_t node_id)
{
int i, bit_inx = 0, core_cnt = 0;
bitstr_t *core_bitmap;
xassert(job_resrcs_ptr);
for (i = 0; i < job_resrcs_ptr->nhosts; i++) {
if (job_resrcs_ptr->sock_core_rep_count[i] <= node_id) {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
job_resrcs_ptr->sock_core_rep_count[i];
node_id -= job_resrcs_ptr->sock_core_rep_count[i];
} else {
bit_inx += job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i] *
node_id;
core_cnt = job_resrcs_ptr->sockets_per_node[i] *
job_resrcs_ptr->cores_per_socket[i];
break;
}
}
if (core_cnt < 1) {
error("copy_job_resources_node: core_cnt=0");
return NULL;
}
i = bit_size(job_resrcs_ptr->core_bitmap);
if ((bit_inx + core_cnt) > i) {
error("copy_job_resources_node: offset > bitmap size "
"(%d >= %d)", (bit_inx + core_cnt), i);
return NULL;
}
core_bitmap = bit_alloc(core_cnt);
for (i = 0; i < core_cnt; i++) {
if (bit_test(job_resrcs_ptr->core_bitmap, bit_inx++))
bit_set(core_bitmap, i);
}
return core_bitmap;
}
extern int get_job_resources_cnt(job_resources_t *job_resrcs_ptr,
uint32_t node_id, uint16_t *socket_cnt,
uint16_t *cores_per_socket_cnt)
{
int i, node_inx = -1;
xassert(socket_cnt);
xassert(cores_per_socket_cnt);
xassert(job_resrcs_ptr->cores_per_socket);
xassert(job_resrcs_ptr->sock_core_rep_count);
xassert(job_resrcs_ptr->sockets_per_node);
for (i=0; i<job_resrcs_ptr->nhosts; i++) {
node_inx += job_resrcs_ptr->sock_core_rep_count[i];
if (node_id <= node_inx) {
*cores_per_socket_cnt = job_resrcs_ptr->
cores_per_socket[i];
*socket_cnt = job_resrcs_ptr->sockets_per_node[i];
return SLURM_SUCCESS;
}
}
error("get_job_resources_cnt: invalid node_id: %u", node_id);
*cores_per_socket_cnt = 0;
*socket_cnt = 0;
return SLURM_ERROR;
}
/* Get CPU count for a specific node_id (zero origin), return -1 on error */
extern int get_job_resources_cpus(job_resources_t *job_resrcs_ptr,
uint32_t node_id)
{
xassert(job_resrcs_ptr->cpus);
if (node_id >= job_resrcs_ptr->nhosts)
return -1;
return (int) job_resrcs_ptr->cpus[node_id];
}
/*
* Test if job can fit into the given full-length core_bitmap
* IN job_resrcs_ptr - resources allocated to a job
* IN full_bitmap - bitmap of available CPUs
* RET 1 on success, 0 otherwise
*/
extern int job_fits_into_cores(job_resources_t *job_resrcs_ptr,
bitstr_t *full_bitmap)
{
node_record_t *node_ptr;
int job_bit_inx = 0;
if (!full_bitmap)
return 1;
for (int full_node_inx = 0;
(node_ptr = next_node_bitmap(
job_resrcs_ptr->node_bitmap, &full_node_inx));
full_node_inx++) {
int full_bit_inx = cr_node_cores_offset[full_node_inx];
for (int core = 0; core < node_ptr->tot_cores; core++) {
if (!bit_test(full_bitmap, full_bit_inx + core))
continue;
if ((job_resrcs_ptr->whole_node &
WHOLE_NODE_REQUIRED) ||
bit_test(job_resrcs_ptr->core_bitmap,
job_bit_inx + core)) {
return 0;
}
}
job_bit_inx += node_ptr->tot_cores;
}
return 1;
}
/*
* Add job to full-length core_bitmap
* IN job_resrcs_ptr - resources allocated to a job
* IN/OUT full_bitmap - bitmap of available CPUs, allocate as needed
* RET 1 on success, 0 otherwise
*/
extern void add_job_to_cores(job_resources_t *job_resrcs_ptr,
bitstr_t **full_core_bitmap)
{
node_record_t *node_ptr;
int job_bit_inx = 0;
if (!job_resrcs_ptr->core_bitmap)
return;
/* add the job to the row_bitmap */
node_conf_create_cluster_core_bitmap(full_core_bitmap);
for (int full_node_inx = 0;
(node_ptr = next_node_bitmap(
job_resrcs_ptr->node_bitmap, &full_node_inx));
full_node_inx++) {
int full_bit_inx = cr_node_cores_offset[full_node_inx];
for (int core = 0; core < node_ptr->tot_cores; core++) {
if (!(job_resrcs_ptr->whole_node &
WHOLE_NODE_REQUIRED) &&
!bit_test(job_resrcs_ptr->core_bitmap,
job_bit_inx + core))
continue;
bit_set(*full_core_bitmap, full_bit_inx + core);
}
job_bit_inx += node_ptr->tot_cores;
}
}
/*
* Given a job pointer and a global node index, return the index of that
* node in the job_resrcs_ptr->cpus. Return -1 if invalid
*/
extern int job_resources_node_inx_to_cpu_inx(job_resources_t *job_resrcs_ptr,
int node_inx)
{
int node_offset;
/* Test for error cases */
if (!job_resrcs_ptr || !job_resrcs_ptr->node_bitmap) {
error("%s: no job_resrcs or node_bitmap", __func__);
return -1;
}
if (!bit_test(job_resrcs_ptr->node_bitmap, node_inx)) {
/*
* This could happen if a job shrinks and epilog completes on
* node no longer in this job's allocation
*/
char node_str[128];
bit_fmt(node_str, sizeof(node_str),job_resrcs_ptr->node_bitmap);
error("%s: Invalid node_inx:%d node_bitmap:%s", __func__,
node_inx, node_str);
return -1;
}
if (job_resrcs_ptr->cpu_array_cnt == 0) {
error("%s: Invalid cpu_array_cnt", __func__);
return -1;
}
/* Only one record, no need to search */
if (job_resrcs_ptr->nhosts == 1)
return 0;
node_offset = bit_set_count_range(job_resrcs_ptr->node_bitmap, 0,
node_inx);
if (node_offset >= job_resrcs_ptr->nhosts) {
error("%s: Found %d of %d nodes", __func__,
job_resrcs_ptr->nhosts, node_offset);
return -1;
}
return node_offset;
}
extern uint16_t job_resources_get_node_cpu_cnt(job_resources_t *job_resrcs_ptr,
int job_node_inx,
int sys_node_inx)
{
uint16_t cpu_count = job_resrcs_ptr->cpus[job_node_inx];
if ((job_resrcs_ptr->cr_type &
(SELECT_CORE | SELECT_SOCKET | SELECT_LINEAR)) &&
(job_resrcs_ptr->threads_per_core <
node_record_table_ptr[sys_node_inx]->tpc)) {
cpu_count = ROUNDUP(cpu_count,
node_record_table_ptr[sys_node_inx]->tpc);
cpu_count *= job_resrcs_ptr->threads_per_core;
}
return cpu_count;
}