blob: 069a23bec7d6b0cf2978cc1e14e6cbfa25779049 [file] [log] [blame]
/*****************************************************************************\
* gres_stepmgr.c - Functions for gres used only in the slurmctld
*****************************************************************************
* Copyright (C) SchedMD LLC.
* Derived in large part from code previously in common/gres.c
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "gres_stepmgr.h"
#include "src/common/assoc_mgr.h"
#include "src/common/xstring.h"
/*
 * Context passed through list_for_each() callbacks when allocating GRES to a
 * step. The consuming callbacks appear later in this file (beyond this chunk),
 * so field semantics below are partly inferred — confirm against the step
 * allocation code.
 */
typedef struct {
	bitstr_t *core_bitmap;		/* cores available to the step on this node */
	bool decr_job_alloc;		/* if true, step GRES counts against job's alloc */
	uint64_t gres_needed;		/* GRES still to be allocated to the step */
	gres_key_t *job_search_key;	/* key to locate matching job GRES record */
	uint64_t max_gres;		/* upper bound on GRES for this step — TODO confirm */
	list_t *node_gres_list;		/* node's GRES state list */
	int node_offset;		/* zero-origin node index within the job alloc */
	int rc;				/* accumulated return code */
	list_t *step_gres_list_alloc;	/* output: step's allocated GRES records */
	gres_state_t *gres_state_step;	/* step's GRES request/state */
	uint64_t *step_node_mem_alloc;	/* output: memory allocated to step on node */
	slurm_step_id_t tmp_step_id;	/* step id (used for logging/matching) */
	int total_gres_cpu_cnt;		/* accumulated CPU count tied to GRES — TODO confirm */
} foreach_step_alloc_t;
/*
 * Context for list_for_each() callbacks that tally a GRES count.
 * Consumers appear later in the file (beyond this chunk).
 */
typedef struct {
	uint64_t gres_cnt;		/* accumulated/reported GRES count */
	bool ignore_alloc;		/* if true, do not subtract current allocations */
	gres_key_t *job_search_key;	/* key identifying the GRES (plugin/type) */
	slurm_step_id_t *step_id;	/* step of interest (for logging/matching) */
} foreach_gres_cnt_t;
/*
 * Context for _handle_explicit_alloc(): carries all of _job_alloc()'s
 * arguments through list_for_each() when allocating GRES marked
 * GRES_CONF_EXPLICIT.
 */
typedef struct {
	bitstr_t *core_bitmap;		/* cores allocated to job on this node */
	gres_state_t *gres_state_node;	/* node GRES record being allocated from */
	uint32_t job_id;		/* job's ID (for logging) */
	list_t **job_gres_list;		/* output: job's allocated GRES list */
	bool new_alloc;			/* false when restoring state (restart/reconfig) */
	int node_cnt;			/* total nodes originally allocated to the job */
	int node_index;			/* zero-origin global node index */
	int node_offset;		/* zero-origin node index within job alloc */
	char *node_name;		/* node name (for logging) */
	int rc;				/* accumulated return code */
} foreach_explicit_alloc_t;
/*
 * Determine if specific GRES index on node is available to a job's allocated
 * cores
 * IN core_bitmap - bitmap of cores allocated to the job on this node
 * IN/OUT alloc_core_bitmap - cores already allocated, NULL if don't care,
 *			      updated when the function returns true
 * IN gres_ns - GRES data for this node
 * IN gres_inx - index of GRES being considered for use
 * IN gres_js - GRES data for this job
 * RET true if available to those cores, false otherwise
 */
static bool _cores_on_gres(bitstr_t *core_bitmap, bitstr_t *alloc_core_bitmap,
			   gres_node_state_t *gres_ns, int gres_inx,
			   gres_job_state_t *gres_js)
{
	int i, avail_cores;

	/* Without core or topology data, any GRES is reachable */
	if ((core_bitmap == NULL) || (gres_ns->topo_cnt == 0))
		return true;

	for (i = 0; i < gres_ns->topo_cnt; i++) {
		if (!gres_ns->topo_gres_bitmap[i])
			continue;
		/*
		 * FIX: was "<", which let gres_inx == bit_size() fall through
		 * to bit_test() below. Valid bit indices are
		 * 0 .. bit_size() - 1, so that index is out of range.
		 */
		if (bit_size(gres_ns->topo_gres_bitmap[i]) <= gres_inx)
			continue;
		if (!bit_test(gres_ns->topo_gres_bitmap[i], gres_inx))
			continue;	/* this topo record is for other GRES */
		if (gres_js->type_name &&
		    (!gres_ns->topo_type_name[i] ||
		     (gres_js->type_id != gres_ns->topo_type_id[i])))
			continue;	/* wrong GRES type */
		if (!gres_ns->topo_core_bitmap[i])
			return true;	/* GRES not bound to specific cores */
		if (bit_size(gres_ns->topo_core_bitmap[i]) !=
		    bit_size(core_bitmap))
			break;		/* size mismatch, can't compare */
		avail_cores = bit_overlap(gres_ns->topo_core_bitmap[i],
					  core_bitmap);
		if (avail_cores && alloc_core_bitmap) {
			/* Only count cores not already claimed by other GRES */
			avail_cores -= bit_overlap(gres_ns->
						   topo_core_bitmap[i],
						   alloc_core_bitmap);
			if (avail_cores) {
				bit_or(alloc_core_bitmap,
				       gres_ns->topo_core_bitmap[i]);
			}
		}
		if (avail_cores)
			return true;
	}
	return false;
}
/*
 * Find the job allocation record in job_gres_list_alloc matching
 * gres_state_in/type_id, creating and appending a new record (with per-node
 * arrays sized for node_cnt) when none exists yet.
 * RET the (possibly new) gres_job_state_t; never NULL.
 */
static gres_job_state_t *_get_job_alloc_gres_ptr(list_t *job_gres_list_alloc,
						 gres_state_t *gres_state_in,
						 uint32_t type_id,
						 char *type_name,
						 uint32_t node_cnt)
{
	gres_key_t search_key = {
		.config_flags = gres_state_in->config_flags,
		.plugin_id = gres_state_in->plugin_id,
		.type_id = type_id,
	};
	gres_state_t *found;
	gres_job_state_t *alloc_js;

	/* Reuse an existing entry when one is already on the list */
	found = list_find_first(job_gres_list_alloc,
				gres_find_job_by_key_exact_type,
				&search_key);
	if (found)
		return found->gres_data;

	/* Not found: build a fresh job GRES record */
	alloc_js = xmalloc(sizeof(*alloc_js));
	alloc_js->type_id = type_id;
	alloc_js->type_name = xstrdup(type_name);
	alloc_js->node_cnt = node_cnt;
	alloc_js->gres_bit_alloc =
		xcalloc(node_cnt, sizeof(*alloc_js->gres_bit_alloc));
	alloc_js->gres_cnt_node_alloc =
		xcalloc(node_cnt, sizeof(*alloc_js->gres_cnt_node_alloc));
	alloc_js->gres_bit_step_alloc =
		xcalloc(node_cnt, sizeof(*alloc_js->gres_bit_step_alloc));
	alloc_js->gres_cnt_step_alloc =
		xcalloc(node_cnt, sizeof(*alloc_js->gres_cnt_step_alloc));

	found = xmalloc(sizeof(*found));
	found->config_flags = gres_state_in->config_flags;
	/* Copy plugin_id from gres_state_in as it might be NO_VAL */
	found->plugin_id = gres_state_in->plugin_id;
	found->gres_data = alloc_js;
	found->gres_name = xstrdup(gres_state_in->gres_name);
	found->state_type = GRES_STATE_TYPE_JOB;
	list_append(job_gres_list_alloc, found);

	return alloc_js;
}
/*
 * Convert a count of shared GRES into the number of sharing devices
 * (bits in left_over_bits) needed to cover it, consuming each device's
 * per-bit share from shared_cnt.
 * RET number of sharing devices; on missing per-bit data, logs an error and
 * returns the original shared_cnt unchanged (original behavior preserved).
 */
static uint64_t _get_sharing_cnt_from_shared_cnt(gres_job_state_t *gres_js,
						 bitstr_t *left_over_bits,
						 int n, int64_t shared_cnt)
{
	uint64_t sharing_cnt = 0;
	int bit = 0;

	if (!gres_js->gres_per_bit_alloc || !gres_js->gres_per_bit_alloc[n]) {
		error("Allocated shared gres with no gres_per_bit_alloc");
		return shared_cnt;
	}

	/* Walk set bits until the requested shared count is covered */
	while ((shared_cnt > 0) &&
	       ((bit = bit_ffs_from_bit(left_over_bits, bit)) >= 0)) {
		sharing_cnt++;
		shared_cnt -= gres_js->gres_per_bit_alloc[n][bit];
		bit++;
	}
	return sharing_cnt;
}
/*
 * Count the GRES allocated to the job on node n that fall within
 * topo_gres_bitmap. Uses per-bit counts when available (shared GRES),
 * otherwise a plain bit overlap.
 */
static uint64_t _cnt_topo_gres(gres_job_state_t *gres_js, int n,
			       bitstr_t *topo_gres_bitmap)
{
	uint64_t cnt = 0;

	/* Without per-bit counts, each set bit is exactly one GRES */
	if (!gres_js->gres_per_bit_alloc || !gres_js->gres_per_bit_alloc[n])
		return bit_overlap(gres_js->gres_bit_alloc[n],
				   topo_gres_bitmap);

	for (int b = 0;
	     (b = bit_ffs_from_bit(gres_js->gres_bit_alloc[n], b)) >= 0;
	     b++) {
		if (bit_test(topo_gres_bitmap, b))
			cnt += gres_js->gres_per_bit_alloc[n][b];
	}
	return cnt;
}
/*
 * Copy per-bit GRES counts from gres_js into gres_js_alloc for node n,
 * restricted to the bits set in gres_js_alloc->gres_bit_alloc[n].
 * Creates the per-bit table on the allocation record if needed.
 */
static void _copy_matching_gres_per_bit(gres_job_state_t *gres_js,
					gres_job_state_t *gres_js_alloc, int n)
{
	bitstr_t *alloc_bits = gres_js_alloc->gres_bit_alloc[n];
	int b = 0;

	if (!gres_js_alloc->gres_per_bit_alloc)
		gres_js_alloc->gres_per_bit_alloc =
			xcalloc(gres_js_alloc->node_cnt, sizeof(uint64_t *));

	gres_js_alloc->gres_per_bit_alloc[n] =
		xcalloc(bit_size(alloc_bits), sizeof(uint64_t));

	while ((b = bit_ffs_from_bit(alloc_bits, b)) >= 0) {
		gres_js_alloc->gres_per_bit_alloc[n][b] =
			gres_js->gres_per_bit_alloc[n][b];
		b++;
	}
}
/*
 * Claim unallocated GRES devices on the node for the job until *gres_cnt
 * is satisfied or gres_bits are exhausted, updating both node and job
 * bitmaps/counters.
 * IN core_bitmap - when non-NULL, only GRES reachable from these cores are
 *	taken (per _cores_on_gres())
 * IN overlap_all_cores - when true (and core_bitmap set), prefer GRES whose
 *	core bindings do not overlap each other
 */
static void _allocate_gres_bits(gres_node_state_t *gres_ns,
				gres_job_state_t *gres_js,
				int64_t gres_bits,
				int64_t *gres_cnt,
				int node_offset,
				bool shared_gres,
				bitstr_t *core_bitmap,
				bool overlap_all_cores)
{
	bitstr_t *alloc_core_bitmap = NULL;

	if (core_bitmap && overlap_all_cores)
		alloc_core_bitmap = bit_alloc(bit_size(core_bitmap));

	for (int inx = 0; (inx < gres_bits) && (*gres_cnt > 0); inx++) {
		if (bit_test(gres_ns->gres_bit_alloc, inx))
			continue;	/* device already taken */
		if (core_bitmap &&
		    !_cores_on_gres(core_bitmap, alloc_core_bitmap, gres_ns,
				    inx, gres_js))
			continue;	/* not reachable from our cores */
		bit_set(gres_ns->gres_bit_alloc, inx);
		bit_set(gres_js->gres_bit_alloc[node_offset], inx);
		if (!shared_gres) {
			gres_ns->gres_cnt_alloc++;
			(*gres_cnt)--;
		} else {
			/* Shared GRES: consume the whole sharing device */
			int avail = gres_ns->topo_gres_cnt_avail[inx];

			gres_js->gres_per_bit_alloc[node_offset][inx] = avail;
			gres_ns->gres_cnt_alloc += avail;
			*gres_cnt -= avail;
		}
	}
	FREE_NULL_BITMAP(alloc_core_bitmap);
}
/*
 * Allocate GRES from one node's record to a job, updating both the node's
 * gres_node_state_t and the job's gres_job_state_t, and recording per-type
 * results in job_gres_list_alloc.
 * IN gres_state_job - job's GRES request/state for one plugin
 * IN/OUT job_gres_list_alloc - list collecting the job's allocated GRES,
 *	one entry per GRES type (plus one untyped entry)
 * IN/OUT gres_state_node - node's GRES state for the same plugin
 * IN node_cnt - total number of nodes originally allocated to the job
 * IN node_index - zero-origin global node index
 * IN node_offset - zero-origin index in job allocation to the node
 * IN job_id - job's ID (for logging)
 * IN node_name - name of the node (for logging)
 * IN core_bitmap - cores allocated to this job on this node (NULL if not
 *	available)
 * IN new_alloc - false when restoring existing state (restart/reconfig/
 *	resume); in that case job_gres_list_alloc is not modified
 * RET SLURM_SUCCESS or error code
 */
static int _job_alloc(gres_state_t *gres_state_job, list_t *job_gres_list_alloc,
		      gres_state_t *gres_state_node,
		      int node_cnt, int node_index,
		      int node_offset, uint32_t job_id,
		      char *node_name, bitstr_t *core_bitmap,
		      bool new_alloc)
{
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	char *gres_name = gres_state_job->gres_name;
	uint32_t config_flags = gres_state_job->config_flags;
	gres_node_state_t *gres_ns = gres_state_node->gres_data;
	int j, sz1, sz2, rc = SLURM_SUCCESS;
	int64_t gres_cnt, i;
	gres_job_state_t *gres_js_alloc;
	bitstr_t *left_over_bits = NULL;
	bool log_cnt_err = true;	/* log size mismatch only once */
	char *log_type;
	bool shared_gres = false;
	bool use_busy_dev = gres_use_busy_dev(gres_state_node, 0);
	uint64_t pre_alloc_gres_cnt;	/* node counters before this alloc */
	uint64_t *pre_alloc_type_cnt = NULL;

	/*
	 * Validate data structures. Either job_gres_data->node_cnt and
	 * job_gres_data->gres_bit_alloc are both set or both zero/NULL.
	 */
	xassert(node_cnt);
	xassert(node_offset >= 0);
	xassert(gres_js);
	xassert(gres_ns);

	if (gres_id_shared(config_flags)) {
		shared_gres = true;
	}

	/* Treat an empty type name as no type at all */
	if (gres_js->type_name && !gres_js->type_name[0])
		xfree(gres_js->type_name);

	xfree(gres_ns->gres_used);	/* Clear cache */

	/*
	 * Check if no nodes, then the next 2 checks were added long before job
	 * resizing was allowed. They are not errors as we need to keep the
	 * original size around for any steps that might still be out there with
	 * the larger size. If the job was sized up the gres_job_merge()
	 * function handles the resize so we are set there.
	 */
	if (gres_js->node_cnt == 0) {
		gres_js->node_cnt = node_cnt;
		if (gres_js->gres_bit_alloc) {
			error("gres/%s: job %u node_cnt==0 and gres_bit_alloc is set",
			      gres_name, job_id);
			xfree(gres_js->gres_bit_alloc);
		}
	} else if (gres_js->node_cnt < node_cnt) {
		debug2("gres/%s: job %u node_cnt is now larger than it was when allocated from %u to %d",
		       gres_name, job_id, gres_js->node_cnt, node_cnt);
		if (node_offset >= gres_js->node_cnt)
			return SLURM_ERROR;
	} else if (gres_js->node_cnt > node_cnt) {
		debug2("gres/%s: job %u node_cnt is now smaller than it was when allocated %u to %d",
		       gres_name, job_id, gres_js->node_cnt, node_cnt);
	}

	/* Lazily create the per-node arrays */
	if (!gres_js->gres_bit_alloc) {
		gres_js->gres_bit_alloc = xcalloc(node_cnt,
						  sizeof(bitstr_t *));
	}
	if (!gres_js->gres_cnt_node_alloc) {
		gres_js->gres_cnt_node_alloc = xcalloc(node_cnt,
						       sizeof(uint64_t));
	}

	/*
	 * select/cons_tres pre-selects the resources and we just need to update
	 * the data structures to reflect the selected GRES.
	 */
	/* Resuming job */
	if (gres_js->gres_cnt_node_alloc[node_offset]) {
		gres_cnt = gres_js->gres_cnt_node_alloc[node_offset];
	} else if (gres_js->gres_bit_alloc[node_offset]) {
		gres_cnt = bit_set_count(gres_js->gres_bit_alloc[node_offset]);
		if (gres_js->gres_per_bit_alloc &&
		    gres_js->gres_per_bit_alloc[node_offset]) {
			error("gres_per_bit_alloc and not gres_cnt_node_alloc");
		}
	} else if (gres_js->total_node_cnt) {
		/* Using pre-selected GRES */
		if (gres_js->gres_cnt_node_select &&
		    gres_js->gres_cnt_node_select[node_index]) {
			gres_cnt = gres_js->gres_cnt_node_select[node_index];
			/* gres_bit_select should always match gres_cnt_node_select */
		} else {
			error("gres/%s: job %u node %s no resources selected",
			      gres_name, job_id, node_name);
			return SLURM_ERROR;
		}
	} else {
		gres_cnt = gres_js->gres_per_node;
	}

	/*
	 * Check that sufficient resources exist on this node
	 */
	gres_js->gres_cnt_node_alloc[node_offset] = gres_cnt;
	i = gres_ns->gres_cnt_alloc + gres_cnt;
	if (i > gres_ns->gres_cnt_avail) {
		error("gres/%s: job %u node %s overallocated resources by %"
		      PRIu64", (%"PRIu64" > %"PRIu64")",
		      gres_name, job_id, node_name,
		      i - gres_ns->gres_cnt_avail,
		      i, gres_ns->gres_cnt_avail);
		return SLURM_ERROR;
	}

	/*
	 * Grab these here since gres_ns->[gres|type]_cnt_alloc can change
	 * later.
	 */
	pre_alloc_gres_cnt = gres_ns->gres_cnt_alloc;
	pre_alloc_type_cnt = xcalloc(gres_ns->type_cnt,
				     sizeof(*pre_alloc_type_cnt));
	memcpy(pre_alloc_type_cnt, gres_ns->type_cnt_alloc,
	       sizeof(*pre_alloc_type_cnt) * gres_ns->type_cnt);

	/*
	 * On the first node of the allocation, resize the step counter array
	 * to the job's (possibly changed) node count, preserving old values.
	 */
	if (!node_offset && gres_js->gres_cnt_step_alloc) {
		uint64_t *tmp = xcalloc(gres_js->node_cnt,
					sizeof(uint64_t));
		memcpy(tmp, gres_js->gres_cnt_step_alloc,
		       sizeof(uint64_t) * MIN(node_cnt,
					      gres_js->node_cnt));
		xfree(gres_js->gres_cnt_step_alloc);
		gres_js->gres_cnt_step_alloc = tmp;
	}
	if (gres_js->gres_cnt_step_alloc == NULL) {
		gres_js->gres_cnt_step_alloc =
			xcalloc(gres_js->node_cnt, sizeof(uint64_t));
	}

	/*
	 * Select and/or allocate specific resources for this job.
	 */
	if (gres_js->gres_bit_alloc[node_offset]) {
		/*
		 * Restarted slurmctld with active job or resuming a suspended
		 * job. In any case, the resources already selected.
		 */
		if (gres_ns->gres_bit_alloc == NULL) {
			gres_ns->gres_bit_alloc =
				bit_copy(gres_js->gres_bit_alloc[node_offset]);
			gres_ns->gres_cnt_alloc +=
				gres_js->gres_cnt_node_alloc[node_offset];
		} else if (gres_ns->gres_bit_alloc) {
			/* Merge the job's bits into the node's bitmap */
			gres_cnt = (int64_t)MIN(
				bit_size(gres_ns->gres_bit_alloc),
				bit_size(gres_js->gres_bit_alloc[node_offset]));
			for (i = 0; i < gres_cnt; i++) {
				uint64_t gres_per_bit = 1;
				if (gres_js->gres_per_bit_alloc &&
				    gres_js->gres_per_bit_alloc[node_offset] &&
				    gres_js->gres_per_bit_alloc[node_offset][i])
					gres_per_bit =
						gres_js->gres_per_bit_alloc
						[node_offset][i];
				/*
				 * Shared GRES bits may already be set on the
				 * node (multiple jobs per device).
				 */
				if (bit_test(gres_js->
					     gres_bit_alloc[node_offset], i) &&
				    (shared_gres ||
				     !bit_test(gres_ns->gres_bit_alloc,
					       i))) {
					bit_set(gres_ns->
						gres_bit_alloc,i);
					gres_ns->gres_cnt_alloc +=
						gres_per_bit;
				}
			}
		}
	} else if (gres_js->total_node_cnt &&
		   gres_js->gres_bit_select &&
		   gres_js->gres_bit_select[node_index] &&
		   gres_js->gres_cnt_node_select) {
		/* Specific GRES already selected, update the node record */
		bool job_mod = false;
		sz1 = bit_size(gres_js->gres_bit_select[node_index]);
		sz2 = bit_size(gres_ns->gres_bit_alloc);
		if (sz1 > sz2) {
			error("gres/%s: job %u node %s gres bitmap size bad (%d > %d)",
			      gres_name, job_id, node_name, sz1, sz2);
			bit_realloc(gres_js->gres_bit_select[node_index], sz2);
			job_mod = true;
		} else if (sz1 < sz2) {
			error("gres/%s: job %u node %s gres bitmap size bad (%d < %d)",
			      gres_name, job_id, node_name, sz1, sz2);
			bit_realloc(gres_js->gres_bit_select[node_index], sz2);
		}
		/* Non-shared GRES must not double-book node devices */
		if (!shared_gres &&
		    bit_overlap_any(gres_js->gres_bit_select[node_index],
				    gres_ns->gres_bit_alloc)) {
			error("gres/%s: job %u node %s gres bitmap overlap",
			      gres_name, job_id, node_name);
			bit_and_not(gres_js->gres_bit_select[node_index],
				    gres_ns->gres_bit_alloc);
		}
		gres_js->gres_bit_alloc[node_offset] =
			bit_copy(gres_js->gres_bit_select[node_index]);
		if (gres_js->gres_per_bit_select &&
		    gres_js->gres_per_bit_select[node_index]){
			if (!gres_js->gres_per_bit_alloc) {
				gres_js->gres_per_bit_alloc = xcalloc(
					gres_js->node_cnt, sizeof(uint64_t *));
			}
			gres_js->gres_per_bit_alloc[node_offset] = xcalloc(
				bit_size(gres_js->gres_bit_alloc[node_offset]),
				sizeof(uint64_t));
			memcpy(gres_js->gres_per_bit_alloc[node_offset],
			       gres_js->gres_per_bit_select[node_index],
			       bit_size(gres_js->gres_bit_select[node_index]) *
			       sizeof(uint64_t));
		}
		gres_js->gres_cnt_node_alloc[node_offset] =
			gres_js->gres_cnt_node_select[node_index];
		if (!gres_ns->gres_bit_alloc) {
			gres_ns->gres_bit_alloc =
				bit_copy(gres_js->gres_bit_alloc[node_offset]);
		} else {
			bit_or(gres_ns->gres_bit_alloc,
			       gres_js->gres_bit_alloc[node_offset]);
		}
		if (job_mod) {
			/* Bitmap was resized; recompute node's alloc count */
			gres_ns->gres_cnt_alloc =
				bit_set_count(gres_ns->gres_bit_alloc);
			if (shared_gres &&
			    (bit_size(gres_ns->gres_bit_alloc) !=
			     gres_ns->gres_cnt_avail))
				gres_ns->gres_cnt_alloc *=
					(gres_ns->gres_cnt_avail /
					 bit_size(gres_ns->gres_bit_alloc));
		} else {
			gres_ns->gres_cnt_alloc += gres_cnt;
		}
	} else if (gres_ns->gres_bit_alloc) {
		/* Nothing pre-selected: pick devices here, core-aware */
		int64_t gres_bits = bit_size(gres_ns->gres_bit_alloc);
		if (!shared_gres && (gres_bits < gres_ns->gres_cnt_avail)) {
			error("gres/%s: node %s gres bitmap size bad (%"PRIi64" < %"PRIi64")",
			      gres_name, node_name,
			      gres_bits, gres_ns->gres_cnt_avail);
			gres_bits = gres_ns->gres_cnt_avail;
			bit_realloc(gres_ns->gres_bit_alloc, gres_bits);
		}
		gres_js->gres_bit_alloc[node_offset] = bit_alloc(gres_bits);
		if (shared_gres) {
			if (!gres_js->gres_per_bit_alloc) {
				gres_js->gres_per_bit_alloc = xcalloc(
					gres_js->node_cnt, sizeof(uint64_t *));
			}
			gres_js->gres_per_bit_alloc[node_offset] = xcalloc(
				bit_size(gres_js->gres_bit_alloc[node_offset]),
				sizeof(uint64_t));
		}
		/* Pass 1: Allocate GRES overlapping all allocated cores */
		_allocate_gres_bits(gres_ns, gres_js, gres_bits,
				    &gres_cnt, node_offset, shared_gres,
				    core_bitmap, true);
		/* Pass 2: Allocate GRES overlapping any allocated cores */
		_allocate_gres_bits(gres_ns, gres_js, gres_bits,
				    &gres_cnt, node_offset, shared_gres,
				    core_bitmap, false);
		if (gres_cnt) {
			verbose("gres/%s topology sub-optimal for job %u",
				gres_name, job_id);
		}
		/* Pass 3: Allocate any available GRES */
		_allocate_gres_bits(gres_ns, gres_js, gres_bits,
				    &gres_cnt, node_offset, shared_gres,
				    NULL, false);
	} else {
		/* No device bitmap on node: account by count only */
		gres_ns->gres_cnt_alloc += gres_cnt;
	}

	/* Update per-topology and per-type allocation counters on the node */
	if (gres_js->gres_bit_alloc[node_offset] &&
	    gres_ns->topo_gres_bitmap &&
	    gres_ns->topo_gres_cnt_alloc) {
		for (i = 0; i < gres_ns->topo_cnt; i++) {
			if (gres_js->type_name &&
			    (!gres_ns->topo_type_name[i] ||
			     (gres_js->type_id !=
			      gres_ns->topo_type_id[i])))
				continue;
			if (use_busy_dev &&
			    (gres_ns->topo_gres_cnt_alloc[i] == 0))
				continue;
			sz1 = bit_size(
				gres_js->gres_bit_alloc[node_offset]);
			sz2 = bit_size(gres_ns->topo_gres_bitmap[i]);
			if ((sz1 != sz2) && log_cnt_err) {
				if (shared_gres)
					log_type = "File";
				else
					log_type = "Count";
				/* Avoid abort on bit_overlap below */
				error("gres/%s %s mismatch for node %s (%d != %d)",
				      gres_name, log_type, node_name, sz1, sz2);
				log_cnt_err = false;
			}
			if (sz1 != sz2)
				continue;	/* See error above */
			gres_cnt = _cnt_topo_gres(gres_js, node_offset,
						  gres_ns->topo_gres_bitmap[i]);
			gres_ns->topo_gres_cnt_alloc[i] += gres_cnt;
			if ((gres_ns->type_cnt == 0) ||
			    (gres_ns->topo_type_name == NULL) ||
			    (gres_ns->topo_type_name[i] == NULL))
				continue;
			for (j = 0; j < gres_ns->type_cnt; j++) {
				if (!gres_ns->type_name[j] ||
				    (gres_ns->topo_type_id[i] !=
				     gres_ns->type_id[j]))
					continue;
				gres_ns->type_cnt_alloc[j] += gres_cnt;
				break;
			}
		}
	} else if (gres_js->gres_bit_alloc[node_offset]) {
		int len;	/* length of the gres bitmap on this node */
		len = bit_size(gres_js->gres_bit_alloc[node_offset]);
		if (!gres_ns->topo_gres_cnt_alloc) {
			/* Indexed by device here (no topo data available) */
			gres_ns->topo_gres_cnt_alloc =
				xcalloc(len, sizeof(uint64_t));
		} else {
			len = MIN(len, gres_ns->gres_cnt_config);
		}
		for (i = 0; i < len; i++) {
			gres_cnt = 0;
			if (!bit_test(gres_js->
				      gres_bit_alloc[node_offset], i))
				continue;
			uint64_t gres_per_bit = 1;
			if (gres_js->gres_per_bit_alloc &&
			    gres_js->gres_per_bit_alloc[node_offset] &&
			    gres_js->gres_per_bit_alloc[node_offset][i])
				gres_per_bit =
					gres_js->gres_per_bit_alloc
					[node_offset][i];
			/*
			 * NOTE: Immediately after slurmctld restart and before
			 * the node's registration, the GRES type and topology
			 * information will not be available and we will be
			 * unable to update topo_gres_cnt_alloc or
			 * type_cnt_alloc. This results in some incorrect
			 * internal bookkeeping, but does not cause failures
			 * in terms of allocating GRES to jobs.
			 */
			for (j = 0; j < gres_ns->topo_cnt; j++) {
				if (use_busy_dev &&
				    !gres_ns->topo_gres_cnt_alloc[j])
					continue;
				if (gres_ns->topo_gres_bitmap &&
				    gres_ns->topo_gres_bitmap[j] &&
				    bit_test(gres_ns->topo_gres_bitmap[j],
					     i)) {
					gres_ns->topo_gres_cnt_alloc[i] +=
						gres_per_bit;
					gres_cnt += gres_per_bit;
				}
			}
			if ((gres_ns->type_cnt == 0) ||
			    (gres_ns->topo_type_name == NULL) ||
			    (gres_ns->topo_type_name[i] == NULL))
				continue;
			for (j = 0; j < gres_ns->type_cnt; j++) {
				if (!gres_ns->type_name[j] ||
				    (gres_ns->topo_type_id[i] !=
				     gres_ns->type_id[j]))
					continue;
				gres_ns->type_cnt_alloc[j] += gres_cnt;
				break;
			}
		}
		if (gres_js->type_name && gres_js->type_name[0]) {
			/*
			 * We may not know how many GRES of this type will be
			 * available on this node, but need to track how many
			 * are allocated to this job from here to avoid
			 * underflows when this job is deallocated
			 */
			gres_add_type(gres_js->type_name, gres_ns,
				      0);
			for (j = 0; j < gres_ns->type_cnt; j++) {
				if (gres_js->type_id !=
				    gres_ns->type_id[j])
					continue;
				gres_ns->type_cnt_alloc[j] +=
					gres_js->gres_per_node;
				break;
			}
		}
	} else {
		/* No bitmaps at all: spread the count across matching types */
		gres_cnt = gres_js->gres_per_node;
		for (j = 0; j < gres_ns->type_cnt; j++) {
			int64_t k;
			if (gres_js->type_name &&
			    (gres_js->type_id !=
			     gres_ns->type_id[j]))
				continue;
			k = gres_ns->type_cnt_avail[j] -
			    gres_ns->type_cnt_alloc[j];
			k = MIN(gres_cnt, k);
			gres_ns->type_cnt_alloc[j] += k;
			gres_cnt -= k;
			if (gres_cnt == 0)
				break;
		}
	}

	/* If we are already allocated (state restore | reconfig) end now. */
	if (!new_alloc) {
		if (gres_ns->no_consume) {
			/* no_consume GRES never reduces node availability */
			gres_ns->gres_cnt_alloc = pre_alloc_gres_cnt;
			for (j = 0; j < gres_ns->type_cnt; j++)
				gres_ns->type_cnt_alloc[j] =
					pre_alloc_type_cnt[j];
		}
		goto cleanup;
	}

	/*
	 * Here we fill job_gres_list_alloc with
	 * one entry for each type of gres separately
	 */
	if (gres_js->gres_bit_alloc && gres_js->gres_bit_alloc[node_offset])
		left_over_bits = bit_copy(gres_js->gres_bit_alloc[node_offset]);
	for (j = 0; j < gres_ns->type_cnt; j++) {
		if (gres_js->type_id &&
		    gres_js->type_id != gres_ns->type_id[j])
			continue;
		gres_js_alloc = _get_job_alloc_gres_ptr(
			job_gres_list_alloc, gres_state_job,
			gres_ns->type_id[j], gres_ns->type_name[j], node_cnt);
		/* Delta against the counters captured before allocation */
		gres_cnt = gres_ns->type_cnt_alloc[j] -
			pre_alloc_type_cnt[j];
		if (gres_ns->no_consume) {
			gres_ns->type_cnt_alloc[j] =
				pre_alloc_type_cnt[j];
			gres_ns->gres_cnt_alloc = pre_alloc_gres_cnt;
			gres_js_alloc->gres_cnt_node_alloc[node_offset] =
				NO_CONSUME_VAL64;
			gres_js_alloc->total_gres = NO_CONSUME_VAL64;
		} else {
			gres_js_alloc->gres_cnt_node_alloc[node_offset] =
				gres_cnt;
			gres_js_alloc->total_gres += gres_cnt;
		}
		if (gres_js->gres_bit_alloc &&
		    gres_js->gres_bit_alloc[node_offset]) {
			if (shared_gres)
				gres_cnt = _get_sharing_cnt_from_shared_cnt(
					gres_js, left_over_bits, node_offset,
					gres_cnt);
			gres_js_alloc->gres_bit_alloc[node_offset] =
				bit_pick_cnt(left_over_bits, gres_cnt);
			bit_and_not(left_over_bits,
				    gres_js_alloc->gres_bit_alloc[node_offset]);
		}
		if (gres_js->gres_per_bit_alloc &&
		    gres_js->gres_per_bit_alloc[node_offset]) {
			_copy_matching_gres_per_bit(gres_js, gres_js_alloc,
						    node_offset);
		}
	}
	FREE_NULL_BITMAP(left_over_bits);
	/* Also track non typed node gres */
	if (gres_ns->type_cnt == 0) {
		gres_js_alloc = _get_job_alloc_gres_ptr(
			job_gres_list_alloc, gres_state_job,
			0, NULL, node_cnt);
		gres_cnt = gres_ns->gres_cnt_alloc - pre_alloc_gres_cnt;
		if (gres_ns->no_consume) {
			gres_ns->gres_cnt_alloc = pre_alloc_gres_cnt;
			gres_js_alloc->gres_cnt_node_alloc[node_offset] =
				NO_CONSUME_VAL64;
			gres_js_alloc->total_gres = NO_CONSUME_VAL64;
		} else {
			gres_js_alloc->gres_cnt_node_alloc[node_offset] =
				gres_cnt;
			gres_js_alloc->total_gres += gres_cnt;
		}
		if (gres_js->gres_bit_alloc &&
		    gres_js->gres_bit_alloc[node_offset])
			gres_js_alloc->gres_bit_alloc[node_offset] = bit_copy(
				gres_js->gres_bit_alloc[node_offset]);
		if (gres_js->gres_per_bit_alloc &&
		    gres_js->gres_per_bit_alloc[node_offset]) {
			_copy_matching_gres_per_bit(gres_js, gres_js_alloc,
						    node_offset);
		}
	}

cleanup:
	xfree(pre_alloc_type_cnt);
	return rc;
}
/*
 * Allocate all of one node's GRES (one plugin/type) to a job.
 * Looks up the job's matching GRES request in job_gres_list, sets its
 * gres_per_node to the node's full available count (the amount can differ
 * per node on heterogeneous clusters), then performs the allocation via
 * _job_alloc(). Creates *job_gres_list_alloc if it does not exist yet.
 * RET SLURM_SUCCESS or error code
 */
static int _job_alloc_whole_node_internal(
	gres_key_t *job_search_key, gres_state_t *gres_state_node,
	list_t *job_gres_list, list_t **job_gres_list_alloc, int node_cnt,
	int node_index, int node_offset, int type_index, uint32_t job_id,
	char *node_name, bitstr_t *core_bitmap, bool new_alloc)
{
	gres_state_t *gres_state_job;
	gres_job_state_t *gres_js;
	gres_node_state_t *gres_ns = gres_state_node->gres_data;

	if (*job_gres_list_alloc == NULL) {
		*job_gres_list_alloc = list_create(gres_job_list_delete);
	}

	if (!(gres_state_job = list_find_first(job_gres_list,
					       gres_find_job_by_key,
					       job_search_key))) {
		/* Caller should have added this entry during selection */
		error("%s: This should never happen, we couldn't find the gres %u:%u",
		      __func__,
		      job_search_key->plugin_id,
		      job_search_key->type_id);
		return SLURM_ERROR;
	}

	gres_js = (gres_job_state_t *)gres_state_job->gres_data;

	/*
	 * As the amount of gres on each node could
	 * differ. We need to set the gres_per_node
	 * correctly here to avoid heterogeneous node
	 * issues.
	 */
	if (type_index != -1)
		gres_js->gres_per_node =
			gres_ns->type_cnt_avail[type_index];
	else
		gres_js->gres_per_node = gres_ns->gres_cnt_avail;

	return _job_alloc(gres_state_job, *job_gres_list_alloc, gres_state_node,
			  node_cnt, node_index, node_offset,
			  job_id, node_name, core_bitmap, new_alloc);
}
/*
 * Record one node GRES (one plugin/type) on the job's GRES list for a
 * whole-node selection, creating the job record on first sight and
 * accumulating total_gres. Per-node counts are filled in at allocation time.
 */
static void _job_select_whole_node_internal(
	gres_key_t *job_search_key, gres_node_state_t *gres_ns,
	int type_inx, char *gres_name, list_t *job_gres_list)
{
	gres_state_t *state;
	gres_job_state_t *js;

	state = list_find_first(job_gres_list, gres_find_job_by_key,
				job_search_key);
	if (state) {
		js = state->gres_data;
	} else {
		/* First time this GRES/type is seen: create the record */
		js = xmalloc(sizeof(gres_job_state_t));
		state = gres_create_state(job_search_key,
					  GRES_STATE_SRC_KEY_PTR,
					  GRES_STATE_TYPE_JOB,
					  js);
		state->gres_name = xstrdup(gres_name);
		if (type_inx != -1)
			js->type_name =
				xstrdup(gres_ns->type_name[type_inx]);
		js->type_id = job_search_key->type_id;
		list_append(job_gres_list, state);
	}

	/*
	 * Add the total_gres here but no count, that will be done after
	 * allocation.
	 */
	if (gres_ns->no_consume)
		js->total_gres = NO_CONSUME_VAL64;
	else if (type_inx != -1)
		js->total_gres += gres_ns->type_cnt_avail[type_inx];
	else
		js->total_gres += gres_ns->gres_cnt_avail;
}
/*
 * list_for_each() callback: allocate one explicit GRES request via
 * _job_alloc(), recording any failure in the shared context's rc.
 * Requests that are not GRES_CONF_EXPLICIT or do not match the node's
 * plugin are ignored.
 */
static void _handle_explicit_alloc(void *x, void *arg)
{
	gres_state_t *gres_state_job = x;
	foreach_explicit_alloc_t *ea = arg;
	int rc;

	if (!(gres_state_job->config_flags & GRES_CONF_EXPLICIT))
		return;
	if (!gres_find_id(x, &ea->gres_state_node->plugin_id))
		return;

	if (!*ea->job_gres_list)
		*ea->job_gres_list = list_create(gres_job_list_delete);

	rc = _job_alloc(gres_state_job,
			*ea->job_gres_list,
			ea->gres_state_node,
			ea->node_cnt,
			ea->node_index,
			ea->node_offset,
			ea->job_id,
			ea->node_name,
			ea->core_bitmap,
			ea->new_alloc);
	if (rc != SLURM_SUCCESS)
		ea->rc = rc;
}
/*
 * Run _handle_explicit_alloc() over every entry of req_gres_list
 * (no-op when the list is NULL).
 */
static void _job_alloc_explicit(
	list_t *req_gres_list, foreach_explicit_alloc_t *explicit_alloc)
{
	if (req_gres_list)
		(void) list_for_each(req_gres_list,
				     (ListForF) _handle_explicit_alloc,
				     explicit_alloc);
}
static int _foreach_clear_job_gres(void *x, void *arg)
{
gres_job_clear_alloc(((gres_state_t *)x)->gres_data);
return 0;
}
/*
 * Fill in job_gres_list with the total amount of GRES on a node.
 * OUT job_gres_list - This list will be destroyed and remade with all GRES on
 *                     node.
 * IN node_gres_list - node's gres_list built by
 *		       gres_node_config_validate()
 * IN job_id - job's ID (for logging)
 * IN node_name - name of the node (for logging)
 * RET SLURM_SUCCESS or error code
 */
extern int gres_stepmgr_job_select_whole_node(
	list_t **job_gres_list, list_t *node_gres_list,
	uint32_t job_id, char *node_name)
{
	list_itr_t *iter;
	gres_state_t *node_gs;

	if (!job_gres_list)
		return SLURM_SUCCESS;
	if (!node_gres_list) {
		error("%s: job %u has gres specification while node %s has none",
		      __func__, job_id, node_name);
		return SLURM_ERROR;
	}

	if (!*job_gres_list)
		*job_gres_list = list_create(gres_job_list_delete);

	iter = list_iterator_create(node_gres_list);
	while ((node_gs = list_next(iter))) {
		gres_key_t search_key;
		gres_node_state_t *gres_ns = node_gs->gres_data;

		/*
		 * no_consume GRES are deliberately kept here; they get
		 * filtered out in gres_job_alloc_whole_node()
		 */
		if (!gres_ns->gres_cnt_config)
			continue;
		if (node_gs->config_flags & GRES_CONF_EXPLICIT)
			continue;

		/* Select shared GRES if requested */
		if (gres_id_shared(node_gs->config_flags)) {
			/*
			 * If found, delete it and re-add below as a whole
			 * node selection. This is because it was not deleted
			 * in _handle_explicit_req() in node_scheduler.c
			 */
			if (!list_delete_first(*job_gres_list, gres_find_id,
					       &node_gs->plugin_id))
				continue;
		}
		/* If we select the shared gres don't select sharing gres */
		if (gres_ns->alt_gres &&
		    gres_id_sharing(node_gs->plugin_id) &&
		    list_find_first(*job_gres_list, gres_find_id,
				    &(gres_ns->alt_gres->plugin_id)))
			continue;

		search_key.config_flags = node_gs->config_flags;
		search_key.plugin_id = node_gs->plugin_id;

		/* The non-typed entry always comes first */
		search_key.type_id = 0;
		_job_select_whole_node_internal(&search_key, gres_ns, -1,
						node_gs->gres_name,
						*job_gres_list);
		/* Followed by one entry per configured type */
		for (int j = 0; j < gres_ns->type_cnt; j++) {
			search_key.type_id = gres_build_id(
				gres_ns->type_name[j]);
			_job_select_whole_node_internal(&search_key, gres_ns,
							j, node_gs->gres_name,
							*job_gres_list);
		}
	}
	list_iterator_destroy(iter);

	return SLURM_SUCCESS;
}
/*
 * On a slurmctld restart the type counts are not set on a node; restore them
 * from the job's allocated GRES. At this point it is really just cosmetic:
 * the parent GRES count on gres_node_state_t is already correct, only the
 * type counts are wrong when generic GRES was requested by the job.
 */
static int _set_node_type_cnt(gres_state_t *gres_state_job,
			      list_t *node_gres_list)
{
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	gres_state_t *node_gs;
	gres_node_state_t *gres_ns;

	if (!gres_js->total_gres || !gres_js->type_id)
		return 0;

	node_gs = list_find_first(node_gres_list, gres_find_id,
				  &gres_state_job->plugin_id);
	if (!node_gs)
		return 0;

	gres_ns = node_gs->gres_data;
	for (int j = 0; j < gres_ns->type_cnt; j++) {
		/*
		 * Skip entries already set (typed GRES was requested),
		 * other types, and no-consume markers.
		 */
		if (gres_ns->type_cnt_alloc[j] ||
		    (gres_ns->type_id[j] != gres_js->type_id) ||
		    (gres_js->total_gres == NO_CONSUME_VAL64))
			continue;
		gres_ns->type_cnt_alloc[j] = gres_js->total_gres;
		break;
	}
	return 0;
}
/*
 * Select and allocate GRES to a job and update node and job GRES information
 * IN job_gres_list - job's gres_list built by gres_job_state_validate()
 * OUT job_gres_list_alloc - job's list of allocated gres
 * IN node_gres_list - node's gres_list built by
 *		       gres_node_config_validate()
 * IN node_cnt - total number of nodes originally allocated to the job
 * IN node_index - zero-origin global node index
 * IN node_offset - zero-origin index in job allocation to the node of interest
 * IN job_id - job's ID (for logging)
 * IN node_name - name of the node (for logging)
 * IN core_bitmap - cores allocated to this job on this node (NULL if not
 *                  available)
 * IN new_alloc - If this is a new allocation or not.
 * RET SLURM_SUCCESS or error code
 */
extern int gres_stepmgr_job_alloc(
	list_t *job_gres_list, list_t **job_gres_list_alloc,
	list_t *node_gres_list, int node_cnt,
	int node_index, int node_offset,
	uint32_t job_id, char *node_name,
	bitstr_t *core_bitmap, bool new_alloc)
{
	/*
	 * NOTE(review): rc stays SLURM_ERROR when every _job_alloc()
	 * succeeds (it is only overwritten on failure) — confirm callers
	 * expect this; preserved as-is here.
	 */
	int rc = SLURM_ERROR;
	list_itr_t *iter;
	gres_state_t *job_gs, *node_gs;

	if (!job_gres_list)
		return SLURM_SUCCESS;
	if (!node_gres_list) {
		error("%s: job %u has gres specification while node %s has none",
		      __func__, job_id, node_name);
		return SLURM_ERROR;
	}

	if (!*job_gres_list_alloc)
		*job_gres_list_alloc = list_create(gres_job_list_delete);

	iter = list_iterator_create(job_gres_list);
	while ((job_gs = (gres_state_t *) list_next(iter))) {
		int rc2;

		node_gs = list_find_first(node_gres_list, gres_find_id,
					  &job_gs->plugin_id);
		if (!node_gs) {
			error("%s: job %u allocated gres/%s on node %s lacking that gres",
			      __func__, job_id, job_gs->gres_name,
			      node_name);
			continue;
		}
		rc2 = _job_alloc(job_gs, *job_gres_list_alloc,
				 node_gs, node_cnt, node_index,
				 node_offset, job_id, node_name,
				 core_bitmap, new_alloc);
		if (rc2 != SLURM_SUCCESS)
			rc = rc2;
	}
	list_iterator_destroy(iter);

	/*
	 * On a slurmctld restart the node doesn't know anything about types so
	 * they are not setup, in this situation we can go set them here. We
	 * can't do it in the req loop above since if the request has typed GRES
	 * in there we could potentially get duplicate counts.
	 */
	if (!new_alloc)
		(void) list_for_each(*job_gres_list_alloc,
				     (ListForF) _set_node_type_cnt,
				     node_gres_list);

	return rc;
}
/*
* Select and allocate all GRES on a node to a job and update node and job GRES
* information
* IN job_gres_list - job's gres_list built by gres_job_whole_node().
* OUT job_gres_list_alloc - job's list of allocated gres
* IN node_gres_list - node's gres_list built by
* gres_node_config_validate()
* IN node_cnt - total number of nodes originally allocated to the job
* IN node_index - zero-origin global node index
* IN node_offset - zero-origin index in job allocation to the node of interest
* IN job_id - job's ID (for logging)
* IN node_name - name of the node (for logging)
* IN core_bitmap - cores allocated to this job on this node (NULL if not
* available)
* IN new_alloc - If this is a new allocation or not.
* RET SLURM_SUCCESS or error code
*/
extern int gres_stepmgr_job_alloc_whole_node(
	list_t *job_gres_list, list_t **job_gres_list_alloc, list_t *node_gres_list,
	int node_cnt, int node_index, int node_offset,
	uint32_t job_id, char *node_name,
	bitstr_t *core_bitmap, bool new_alloc)
{
	/*
	 * rc starts as SLURM_ERROR and is promoted to SLURM_SUCCESS when at
	 * least one GRES is successfully allocated below. (The previous code
	 * updated rc only on failure, so a fully successful allocation still
	 * returned SLURM_ERROR.)
	 */
	int rc = SLURM_ERROR, rc2;
	list_itr_t *node_gres_iter;
	gres_state_t *gres_state_node;
	gres_node_state_t *gres_ns;

	/* No job GRES request means there is nothing to allocate */
	if (job_gres_list == NULL)
		return SLURM_SUCCESS;
	if (node_gres_list == NULL) {
		error("%s: job %u has gres specification while node %s has none",
		      __func__, job_id, node_name);
		return SLURM_ERROR;
	}

	/* Walk the node's GRES and allocate everything usable to the job */
	node_gres_iter = list_iterator_create(node_gres_list);
	while ((gres_state_node = list_next(node_gres_iter))) {
		gres_key_t job_search_key;
		gres_ns = (gres_node_state_t *) gres_state_node->gres_data;

		/* Skip GRES not configured on this node */
		if (!gres_ns->gres_cnt_config)
			continue;

		/* Allocate shared GRES only if the job requested it */
		if (gres_id_shared(gres_state_node->config_flags)) {
			if (!list_find_first(job_gres_list, gres_find_id,
					     &gres_state_node->plugin_id))
				continue;
		}
		/* If we allocate the shared gres don't allocate sharing gres */
		if (gres_ns->alt_gres &&
		    gres_id_sharing(gres_state_node->plugin_id)) {
			if (list_find_first(job_gres_list, gres_find_id,
					    &(gres_ns->alt_gres->plugin_id)))
				continue;
		}

		/* Explicit GRES are only allocated when explicitly requested */
		if (gres_state_node->config_flags & GRES_CONF_EXPLICIT) {
			if (job_gres_list) {
				foreach_explicit_alloc_t explicit_alloc = {
					.core_bitmap = core_bitmap,
					.gres_state_node = gres_state_node,
					.job_id = job_id,
					.job_gres_list = job_gres_list_alloc,
					.new_alloc = new_alloc,
					.node_cnt = node_cnt,
					.node_index = node_index,
					.node_offset = node_offset,
					.node_name = node_name,
					/*
					 * NOTE(review): rc is copied by value
					 * here, so errors inside
					 * _job_alloc_explicit() cannot
					 * propagate back to this rc — confirm
					 * whether the struct member should be
					 * a pointer.
					 */
					.rc = rc,
				};
				_job_alloc_explicit(job_gres_list,
						    &explicit_alloc);
			}
			continue;
		}

		job_search_key.config_flags = gres_state_node->config_flags;
		job_search_key.plugin_id = gres_state_node->plugin_id;

		/*
		 * This check is needed and different from the one in
		 * gres_stepmgr_job_select_whole_node(). _job_alloc() handles
		 * all the heavy lifting later on to make this all correct.
		 */
		if (!gres_ns->type_cnt) {
			job_search_key.type_id = 0;
			rc2 = _job_alloc_whole_node_internal(
				&job_search_key, gres_state_node,
				job_gres_list, job_gres_list_alloc,
				node_cnt, node_index,
				node_offset, -1, job_id, node_name,
				core_bitmap, new_alloc);
			if (rc2 == SLURM_SUCCESS)
				rc = rc2;
		} else {
			/* Allocate each configured type separately */
			for (int j = 0; j < gres_ns->type_cnt; j++) {
				job_search_key.type_id = gres_build_id(
					gres_ns->type_name[j]);
				rc2 = _job_alloc_whole_node_internal(
					&job_search_key, gres_state_node,
					job_gres_list, job_gres_list_alloc,
					node_cnt, node_index,
					node_offset, j, job_id, node_name,
					core_bitmap, new_alloc);
				if (rc2 == SLURM_SUCCESS)
					rc = rc2;
			}
		}
	}
	list_iterator_destroy(node_gres_iter);

	return rc;
}
/*
 * Return one node's GRES allocation for a job to the node's bookkeeping.
 * When resize is set, additionally remove this node from the job's own
 * per-node GRES arrays and decrement the job's GRES node count.
 * IN gres_state_job - job GRES record; gres_data is a gres_job_state_t
 * IN gres_ns - node GRES state the resources are credited back to
 * IN node_offset - zero-origin index of this node within the job allocation
 * IN job_id - job's ID (for logging)
 * IN node_name - name of the node (for logging)
 * IN old_job - job started before the last slurmctld restart; count
 *              underflows are then expected and not logged as errors
 * IN resize - dealloc is due to a node being removed from a running job
 * RET SLURM_SUCCESS or error code
 */
static int _job_dealloc(gres_state_t *gres_state_job,
			gres_node_state_t *gres_ns,
			int node_offset, uint32_t job_id,
			char *node_name, bool old_job, bool resize)
{
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	char *gres_name = gres_state_job->gres_name;
	uint32_t config_flags = gres_state_job->config_flags;
	int i, j, len, sz1, sz2, last_node;
	uint64_t gres_cnt = 0, k;
	bool shared_gres = false;

	/*
	 * Validate data structures. Either gres_js->node_cnt and
	 * gres_js->gres_bit_alloc are both set or both zero/NULL.
	 */
	xassert(node_offset >= 0);
	xassert(gres_js);
	xassert(gres_ns);

	/* No-consume GRES never decremented any counter; nothing to undo */
	if (gres_ns->no_consume)
		return SLURM_SUCCESS;

	if (gres_js->node_cnt <= node_offset) {
		error("gres/%s: job %u dealloc of node %s bad node_offset %d "
		      "count is %u", gres_name, job_id, node_name, node_offset,
		      gres_js->node_cnt);
		return SLURM_ERROR;
	}

	if (gres_id_shared(config_flags)) {
		shared_gres = true;
		/*
		 * A shared-GRES bitmap without matching per-bit counts cannot
		 * be deallocated correctly; refuse rather than corrupt counts.
		 */
		if (!(gres_js->gres_per_bit_alloc &&
		      gres_js->gres_per_bit_alloc[node_offset]) &&
		    (gres_js->gres_bit_alloc &&
		     gres_js->gres_bit_alloc[node_offset])) {
			error("gres/%s: job %u dealloc node %s where gres shared but there is no gres_per_bit_alloc",
			      gres_name, job_id, node_name);
			return SLURM_ERROR;
		}
	}

	xfree(gres_ns->gres_used);	/* Clear cache */

	/* Clear the node's regular GRES bitmaps based on what the job has */
	if (gres_ns->gres_bit_alloc && gres_js->gres_bit_alloc &&
	    gres_js->gres_bit_alloc[node_offset] &&
	    gres_js->gres_cnt_node_alloc &&
	    gres_js->gres_cnt_node_alloc[node_offset]) {
		len = bit_size(gres_js->gres_bit_alloc[node_offset]);
		i = bit_size(gres_ns->gres_bit_alloc);
		if (i != len) {
			error("gres/%s: job %u and node %s bitmap sizes differ "
			      "(%d != %d)", gres_name, job_id, node_name, len,
			      i);
			len = MIN(len, i);
			/* proceed with request, make best effort */
		}
		/* Subtract the job's count, guarding against underflow */
		if (gres_ns->gres_cnt_alloc >=
		    gres_js->gres_cnt_node_alloc[node_offset]) {
			gres_ns->gres_cnt_alloc -=
				gres_js->gres_cnt_node_alloc[node_offset];
		} else {
			error("gres/%s: job %u dealloc node %s GRES count underflow (%"PRIu64" < %"PRIu64")",
			      gres_name, job_id, node_name,
			      gres_ns->gres_cnt_alloc,
			      gres_js->gres_cnt_node_alloc[node_offset]);
			gres_ns->gres_cnt_alloc = 0;
		}
		if (!shared_gres) { /* Clear shared later based on topo info */
			for (i = 0; i < len; i++) {
				if (!bit_test(
					    gres_js->gres_bit_alloc[node_offset],
					    i)) {
					continue;
				}
				bit_clear(gres_ns->gres_bit_alloc, i);
			}
		}
	} else if (gres_js->gres_cnt_node_alloc) {
		/* No bitmaps recorded; fall back to count-only dealloc below */
		gres_cnt = gres_js->gres_cnt_node_alloc[node_offset];
	} else {
		error("gres/%s: job %u node %s no gres allocation recorded.",
		      gres_name, job_id, node_name);
	}

	/* gres_cnt is non-zero only in the count-only (no bitmap) case */
	if (gres_cnt && (gres_ns->gres_cnt_alloc >= gres_cnt))
		gres_ns->gres_cnt_alloc -= gres_cnt;
	else if (gres_cnt) {
		error("gres/%s: job %u node %s GRES count underflow (%"PRIu64" < %"PRIu64")",
		      gres_name, job_id, node_name,
		      gres_ns->gres_cnt_alloc, gres_cnt);
		gres_ns->gres_cnt_alloc = 0;
	}

	/* Clear the node's topo GRES bitmaps based on what the job has */
	if (gres_js->gres_bit_alloc &&
	    gres_js->gres_bit_alloc[node_offset] &&
	    gres_ns->topo_gres_bitmap &&
	    gres_ns->topo_gres_cnt_alloc) {
		/* Per-topology-record accounting (with topo bitmaps) */
		for (i = 0; i < gres_ns->topo_cnt; i++) {
			sz1 = bit_size(
				gres_js->gres_bit_alloc[node_offset]);
			sz2 = bit_size(gres_ns->topo_gres_bitmap[i]);
			/* Mismatched bitmap sizes: skip this topo record */
			if (sz1 != sz2)
				continue;
			gres_cnt = _cnt_topo_gres(gres_js, node_offset,
						  gres_ns->topo_gres_bitmap[i]);
			if (gres_ns->topo_gres_cnt_alloc[i] >= gres_cnt) {
				gres_ns->topo_gres_cnt_alloc[i] -=
					gres_cnt;
			} else if (old_job) {
				/* Underflow expected across restarts */
				gres_ns->topo_gres_cnt_alloc[i] = 0;
			} else {
				error("gres/%s: job %u dealloc node %s topo gres count underflow "
				      "(%"PRIu64" %"PRIu64")",
				      gres_name, job_id, node_name,
				      gres_ns->topo_gres_cnt_alloc[i],
				      gres_cnt);
				gres_ns->topo_gres_cnt_alloc[i] = 0;
			}
			/* Shared GRES bit is freed once its count hits zero */
			if (shared_gres && !gres_ns->topo_gres_cnt_alloc[i])
				bit_clear(gres_ns->gres_bit_alloc, i);
			if ((gres_ns->type_cnt == 0) ||
			    (gres_ns->topo_type_name == NULL) ||
			    (gres_ns->topo_type_name[i] == NULL))
				continue;
			/* Also credit back the matching per-type counters */
			for (j = 0; j < gres_ns->type_cnt; j++) {
				if (!gres_ns->type_name[j] ||
				    (gres_ns->topo_type_id[i] !=
				     gres_ns->type_id[j]))
					continue;
				if (gres_ns->type_cnt_alloc[j] >=
				    gres_cnt) {
					gres_ns->type_cnt_alloc[j] -=
						gres_cnt;
				} else if (old_job) {
					gres_ns->type_cnt_alloc[j] = 0;
				} else {
					error("gres/%s: job %u dealloc node %s type %s gres count underflow "
					      "(%"PRIu64" %"PRIu64")",
					      gres_name, job_id, node_name,
					      gres_ns->type_name[j],
					      gres_ns->type_cnt_alloc[j],
					      gres_cnt);
					gres_ns->type_cnt_alloc[j] = 0;
				}
			}
		}
	} else if (gres_js->gres_bit_alloc &&
		   gres_js->gres_bit_alloc[node_offset] &&
		   gres_ns->topo_gres_cnt_alloc) {
		/* Topo counts exist but no topo bitmaps: per-bit accounting */
		/* Avoid crash if configuration inconsistent */
		len = MIN(gres_ns->gres_cnt_config,
			  bit_size(gres_js->
				   gres_bit_alloc[node_offset]));
		for (i = 0; i < len; i++) {
			uint64_t gres_per_bit;
			if (!bit_test(gres_js->
				      gres_bit_alloc[node_offset], i) ||
			    !gres_ns->topo_gres_cnt_alloc[i])
				continue;
			/* Shared GRES may hold more than one unit per bit */
			gres_per_bit = shared_gres ?
				gres_js->gres_per_bit_alloc[node_offset][i] : 1;
			if (gres_ns->topo_gres_cnt_alloc[i] >=
			    gres_per_bit) {
				gres_ns->topo_gres_cnt_alloc[i] -=
					gres_per_bit;
			} else {
				error("gres/%s: job %u dealloc node %s "
				      "topo_gres_cnt_alloc[%d] count underflow "
				      "(%"PRIu64" %"PRIu64")",
				      gres_name, job_id, node_name, i,
				      gres_ns->topo_gres_cnt_alloc[i],
				      gres_per_bit);
				gres_ns->topo_gres_cnt_alloc[i] = 0;
			}
			if (shared_gres && !gres_ns->topo_gres_cnt_alloc[i])
				bit_clear(gres_ns->gres_bit_alloc, i);
			if ((gres_ns->type_cnt == 0) ||
			    (gres_ns->topo_type_name == NULL) ||
			    (gres_ns->topo_type_name[i] == NULL))
				continue;
			/* Also credit back the matching per-type counters */
			for (j = 0; j < gres_ns->type_cnt; j++) {
				if (!gres_ns->type_name[j] ||
				    (gres_ns->topo_type_id[i] !=
				     gres_ns->type_id[j]))
					continue;
				if (gres_ns->type_cnt_alloc[j] >=
				    gres_per_bit) {
					gres_ns->type_cnt_alloc[j] -=
						gres_per_bit;
				} else {
					error("gres/%s: job %u dealloc node %s "
					      "type %s type_cnt_alloc count underflow "
					      "(%"PRIu64" %"PRIu64")",
					      gres_name, job_id, node_name,
					      gres_ns->type_name[j],
					      gres_ns->type_cnt_alloc[j],
					      gres_per_bit);
					gres_ns->type_cnt_alloc[j] = 0;
				}
			}
		}
	} else if (gres_js->type_name) {
		/*
		 * Count-only allocation of a typed GRES: drain gres_cnt from
		 * the node's matching per-type counter(s).
		 */
		for (j = 0; j < gres_ns->type_cnt; j++) {
			if (gres_js->type_id !=
			    gres_ns->type_id[j])
				continue;
			k = MIN(gres_cnt, gres_ns->type_cnt_alloc[j]);
			gres_ns->type_cnt_alloc[j] -= k;
			gres_cnt -= k;
			if (gres_cnt == 0)
				break;
		}
	}

	if (!resize)
		return SLURM_SUCCESS;

	xassert(gres_js->node_cnt >= 1);

	/*
	 * If resizing, alter the job's GRES bitmaps. Normally, a job's GRES
	 * bitmaps will get automatically freed when the job is destroyed.
	 * However, a job isn't destroyed when it is resized. So we need to
	 * remove this node's GRES from the job's GRES bitmaps.
	 */
	last_node = gres_js->node_cnt - 1;
	if (gres_js->gres_cnt_node_alloc) {
		/*
		 * This GRES is no longer part of the job, remove it from the
		 * alloc list.
		 */
		if (gres_js->gres_cnt_node_alloc[node_offset] >=
		    gres_js->total_gres)
			return ESLURM_UNSUPPORTED_GRES;
		gres_js->total_gres -=
			gres_js->gres_cnt_node_alloc[node_offset];
		/* Shift job GRES counts down, if necessary */
		for (int i = node_offset + 1; i < gres_js->node_cnt; i++) {
			gres_js->gres_cnt_node_alloc[i - 1] =
				gres_js->gres_cnt_node_alloc[i];
		}
		/* Zero this out since we are reducing the node count */
		gres_js->gres_cnt_node_alloc[last_node] = 0;
	}
	/* Downsize job GRES for this node */
	if (gres_js->gres_bit_alloc) {
		/* Free the job's GRES bitmap */
		FREE_NULL_BITMAP(gres_js->gres_bit_alloc[node_offset]);

		/* Shift job GRES bitmaps down, if necessary */
		for (int i = node_offset + 1; i < gres_js->node_cnt; i++) {
			gres_js->gres_bit_alloc[i - 1] =
				gres_js->gres_bit_alloc[i];
		}
		/* NULL the last node since we are reducing the node count. */
		gres_js->gres_bit_alloc[last_node] = NULL;
	}
	/* Downsize job step GRES for this node */
	if (gres_js->gres_bit_step_alloc) {
		/* Free the step's GRES bitmap */
		FREE_NULL_BITMAP(gres_js->gres_bit_step_alloc[node_offset]);

		/* Shift step GRES bitmaps down, if necessary */
		for (int i = node_offset + 1; i < gres_js->node_cnt; i++) {
			gres_js->gres_bit_step_alloc[i - 1] =
				gres_js->gres_bit_step_alloc[i];
		}
		/* NULL the last node since we are reducing the node count. */
		gres_js->gres_bit_step_alloc[last_node] = NULL;
	}
	if (gres_js->gres_cnt_step_alloc) {
		/* Shift step GRES counts down, if necessary */
		for (int i = node_offset + 1; i < gres_js->node_cnt; i++) {
			gres_js->gres_cnt_step_alloc[i - 1] =
				gres_js->gres_cnt_step_alloc[i];
		}
		/* Zero this out since we are reducing the node count */
		gres_js->gres_cnt_step_alloc[last_node] = 0;
	}
	/* Finally, reduce the node count, since this node is deallocated */
	gres_js->node_cnt--;

	return SLURM_SUCCESS;
}
/*
* Deallocate resource from a job and update node and job gres information
* IN job_gres_list - job's allocated gres list
* IN node_gres_list - node's gres_list built by
* gres_node_config_validate()
* IN node_offset - zero-origin index to the node of interest
* IN job_id - job's ID (for logging)
* IN node_name - name of the node (for logging)
 * IN old_job - true if job started before last slurmctld reboot.
 *              Immediately after slurmctld restart and before the node's
 *              registration, the GRES type and topology information is not
 *              yet available. This results in some incorrect internal
 *              bookkeeping, but does not cause failures in terms of
 *              allocating GRES to jobs.
* IN: resize - True if dealloc is due to a node being removed via a job
* resize; false if dealloc is due to a job test or a real job
* that is terminating.
* RET SLURM_SUCCESS or error code
*/
extern int gres_stepmgr_job_dealloc(
	list_t *job_gres_list, list_t *node_gres_list,
	int node_offset, uint32_t job_id,
	char *node_name, bool old_job, bool resize)
{
	int rc = SLURM_SUCCESS;
	list_itr_t *iter;
	gres_state_t *gres_state_job;

	/* Nothing allocated means nothing to release */
	if (!job_gres_list)
		return SLURM_SUCCESS;
	if (!node_gres_list) {
		error("%s: job %u has gres specification while node %s has none",
		      __func__, job_id, node_name);
		return SLURM_ERROR;
	}

	/* Release each of the job's GRES records back to this node */
	iter = list_iterator_create(job_gres_list);
	while ((gres_state_job = list_next(iter))) {
		int rc2;
		gres_state_t *gres_state_node =
			list_find_first(node_gres_list, gres_find_id,
					&gres_state_job->plugin_id);

		if (!gres_state_node) {
			error("%s: node %s lacks gres/%s for job %u", __func__,
			      node_name, gres_state_job->gres_name, job_id);
			continue;
		}
		rc2 = _job_dealloc(gres_state_job, gres_state_node->gres_data,
				   node_offset, job_id, node_name, old_job,
				   resize);
		if (rc2 == ESLURM_UNSUPPORTED_GRES) {
			/* Record is invalid for this job; drop it entirely */
			list_delete_item(iter);
		} else if (rc2 != SLURM_SUCCESS) {
			rc = rc2;
		}
	}
	list_iterator_destroy(iter);

	return rc;
}
/*
* Merge one job's gres allocation into another job's gres allocation.
* IN from_job_gres_list - List of gres records for the job being merged
* into another job
* IN from_job_node_bitmap - bitmap of nodes for the job being merged into
* another job
* IN/OUT to_job_gres_list - List of gres records for the job being merged
* into job
* IN to_job_node_bitmap - bitmap of nodes for the job being merged into
*/
extern void gres_stepmgr_job_merge(
	list_t *from_job_gres_list,
	bitstr_t *from_job_node_bitmap,
	list_t *to_job_gres_list,
	bitstr_t *to_job_node_bitmap)
{
	static int select_hetero = -1;
	list_itr_t *gres_iter;
	gres_state_t *gres_state_job, *gres_state_job2;
	gres_job_state_t *gres_js, *gres_js2;
	int new_node_cnt;
	int i_first, i_last, i;
	int from_inx, to_inx, new_inx;
	bitstr_t **new_gres_bit_alloc, **new_gres_bit_step_alloc;
	uint64_t *new_gres_cnt_step_alloc, *new_gres_cnt_node_alloc;
	bool free_to_job_gres_list = false;

	if (select_hetero == -1) {
		/*
		 * Determine if the select plugin supports heterogeneous
		 * GRES allocations (count differ by node): 1=yes, 0=no
		 */
		char *select_type = slurm_get_select_type();
		if (xstrstr(select_type, "cons_tres"))
			select_hetero = 1;
		else
			select_hetero = 0;
		xfree(select_type);
	}

	/* Size of the merged allocation (union of both node bitmaps) */
	new_node_cnt = bit_set_count(from_job_node_bitmap) +
		bit_set_count(to_job_node_bitmap) -
		bit_overlap(from_job_node_bitmap, to_job_node_bitmap);
	i_first = MIN(bit_ffs(from_job_node_bitmap),
		      bit_ffs(to_job_node_bitmap));
	i_first = MAX(i_first, 0);
	i_last = MAX(bit_fls(from_job_node_bitmap),
		     bit_fls(to_job_node_bitmap));
	if (i_last == -1) {
		error("%s: node_bitmaps are empty", __func__);
		return;
	}

	/* Step one - Expand the gres data structures in "to" job */
	if (!to_job_gres_list)
		goto step2;
	gres_iter = list_iterator_create(to_job_gres_list);
	while ((gres_state_job = (gres_state_t *) list_next(gres_iter))) {
		gres_js = (gres_job_state_t *) gres_state_job->gres_data;
		new_gres_bit_alloc = xcalloc(new_node_cnt, sizeof(bitstr_t *));
		new_gres_cnt_node_alloc = xcalloc(new_node_cnt,
						  sizeof(uint64_t));
		new_gres_bit_step_alloc = xcalloc(new_node_cnt,
						  sizeof(bitstr_t *));
		new_gres_cnt_step_alloc = xcalloc(new_node_cnt,
						  sizeof(uint64_t));

		/*
		 * Walk the union of both node bitmaps, copying the "to"
		 * job's per-node data into its slot in the merged arrays.
		 */
		from_inx = to_inx = new_inx = -1;
		for (i = i_first; i <= i_last; i++) {
			bool from_match = false, to_match = false;
			if (bit_test(to_job_node_bitmap, i)) {
				to_match = true;
				to_inx++;
			}
			if (bit_test(from_job_node_bitmap, i)) {
				from_match = true;
				from_inx++;
			}
			if (from_match || to_match)
				new_inx++;
			if (to_match) {
				if (gres_js->gres_bit_alloc) {
					new_gres_bit_alloc[new_inx] =
						gres_js->
						gres_bit_alloc[to_inx];
				}
				if (gres_js->gres_cnt_node_alloc) {
					new_gres_cnt_node_alloc[new_inx] =
						gres_js->
						gres_cnt_node_alloc[to_inx];
				}
				if (gres_js->gres_bit_step_alloc) {
					new_gres_bit_step_alloc[new_inx] =
						gres_js->
						gres_bit_step_alloc[to_inx];
				}
				if (gres_js->gres_cnt_step_alloc) {
					new_gres_cnt_step_alloc[new_inx] =
						gres_js->
						gres_cnt_step_alloc[to_inx];
				}
			}
		}
		gres_js->node_cnt = new_node_cnt;
		xfree(gres_js->gres_bit_alloc);
		gres_js->gres_bit_alloc = new_gres_bit_alloc;
		xfree(gres_js->gres_cnt_node_alloc);
		gres_js->gres_cnt_node_alloc = new_gres_cnt_node_alloc;
		xfree(gres_js->gres_bit_step_alloc);
		gres_js->gres_bit_step_alloc = new_gres_bit_step_alloc;
		xfree(gres_js->gres_cnt_step_alloc);
		gres_js->gres_cnt_step_alloc = new_gres_cnt_step_alloc;
	}
	list_iterator_destroy(gres_iter);

	/*
	 * Step two - Merge the gres information from the "from" job into the
	 * existing gres information for the "to" job
	 */
step2:	if (!from_job_gres_list)
		goto step3;
	if (!to_job_gres_list) {
		to_job_gres_list = list_create(gres_job_list_delete);
		free_to_job_gres_list = true;
	}
	gres_iter = list_iterator_create(from_job_gres_list);
	while ((gres_state_job = (gres_state_t *) list_next(gres_iter))) {
		gres_js = (gres_job_state_t *) gres_state_job->gres_data;
		gres_state_job2 = list_find_first(to_job_gres_list,
						  gres_find_id,
						  &gres_state_job->plugin_id);
		if (gres_state_job2) {
			gres_js2 = gres_state_job2->gres_data;
		} else {
			/*
			 * The "to" job has no record of this GRES; create one
			 * copying the per-job request limits from "from".
			 */
			gres_js2 = xmalloc(sizeof(gres_job_state_t));
			gres_js2->cpus_per_gres =
				gres_js->cpus_per_gres;
			gres_js2->gres_per_job =
				gres_js->gres_per_job;
			/*
			 * Fixed: this line previously duplicated the
			 * gres_per_job copy above and gres_per_node was
			 * never copied.
			 */
			gres_js2->gres_per_node =
				gres_js->gres_per_node;
			gres_js2->gres_per_socket =
				gres_js->gres_per_socket;
			gres_js2->gres_per_task =
				gres_js->gres_per_task;
			gres_js2->mem_per_gres =
				gres_js->mem_per_gres;
			gres_js2->ntasks_per_gres =
				gres_js->ntasks_per_gres;
			gres_js2->node_cnt = new_node_cnt;
			gres_js2->gres_bit_alloc =
				xcalloc(new_node_cnt, sizeof(bitstr_t *));
			gres_js2->gres_cnt_node_alloc =
				xcalloc(new_node_cnt, sizeof(uint64_t));
			gres_js2->gres_bit_step_alloc =
				xcalloc(new_node_cnt, sizeof(bitstr_t *));
			gres_js2->gres_cnt_step_alloc =
				xcalloc(new_node_cnt, sizeof(uint64_t));
			gres_state_job2 = gres_create_state(
				gres_state_job, GRES_STATE_SRC_STATE_PTR,
				GRES_STATE_TYPE_JOB, gres_js2);

			list_append(to_job_gres_list, gres_state_job2);
		}
		from_inx = to_inx = new_inx = -1;
		for (i = i_first; i <= i_last; i++) {
			bool from_match = false, to_match = false;
			if (bit_test(to_job_node_bitmap, i)) {
				to_match = true;
				to_inx++;
			}
			if (bit_test(from_job_node_bitmap, i)) {
				from_match = true;
				from_inx++;
			}
			if (from_match || to_match)
				new_inx++;
			if (from_match) {
				/*
				 * NOTE(review): the "from" job's arrays are
				 * tested with new_inx here but read with
				 * from_inx below — confirm this asymmetry is
				 * intended.
				 */
				if (!gres_js->gres_bit_alloc) {
					;
				} else if (select_hetero &&
					   gres_js2->
					   gres_bit_alloc[new_inx] &&
					   gres_js->gres_bit_alloc &&
					   gres_js->
					   gres_bit_alloc[new_inx]) {
					/* Merge job's GRES bitmaps */
					bit_or(gres_js2->
					       gres_bit_alloc[new_inx],
					       gres_js->
					       gres_bit_alloc[from_inx]);
				} else if (gres_js2->
					   gres_bit_alloc[new_inx]) {
					/* Keep original job's GRES bitmap */
				} else {
					/* Steal the bitmap from "from" job */
					gres_js2->gres_bit_alloc[new_inx] =
						gres_js->
						gres_bit_alloc[from_inx];
					gres_js->
						gres_bit_alloc
						[from_inx] = NULL;
				}
				if (!gres_js->gres_cnt_node_alloc) {
					;
				} else if (select_hetero &&
					   gres_js2->
					   gres_cnt_node_alloc[new_inx] &&
					   gres_js->gres_cnt_node_alloc &&
					   gres_js->
					   gres_cnt_node_alloc[new_inx]) {
					gres_js2->
						gres_cnt_node_alloc[new_inx] +=
						gres_js->
						gres_cnt_node_alloc[from_inx];
				} else if (gres_js2->
					   gres_cnt_node_alloc[new_inx]) {
					/* Keep original job's GRES bitmap */
				} else {
					gres_js2->
						gres_cnt_node_alloc[new_inx] =
						gres_js->
						gres_cnt_node_alloc[from_inx];
					gres_js->
						gres_cnt_node_alloc[from_inx]=0;
				}
				if (gres_js->gres_cnt_step_alloc &&
				    gres_js->
				    gres_cnt_step_alloc[from_inx]) {
					error("Attempt to merge gres, from "
					      "job has active steps");
				}
			}
		}
	}
	list_iterator_destroy(gres_iter);

step3:
	if (free_to_job_gres_list)
		FREE_NULL_LIST(to_job_gres_list);
}
/*
 * Clear any vestigial job gres state. This may be needed on job requeue.
 * A NULL list is silently ignored.
 */
extern void gres_stepmgr_job_clear_alloc(list_t *job_gres_list)
{
	if (!job_gres_list)
		return;

	(void) list_for_each(job_gres_list, _foreach_clear_job_gres, NULL);
}
/*
 * Build a comma-separated "allocated/available" string describing the
 * shared GRES on one node of the job, or NULL on any lookup failure.
 * Caller must xfree() the result.
 */
static char *_build_shared_gres_details(char *nodes, int node_index,
					gres_state_t *gres_state_job,
					gres_job_state_t *gres_js)
{
	char *details = NULL, *tail = NULL;
	char *host;
	hostlist_t *hl;
	node_record_t *node_ptr;
	gres_state_t *gres_state_node;
	gres_node_state_t *gres_ns;
	int bit_cnt;

	/* Use host list so that gres_js node index matches correct gres_ns */
	if (!(hl = hostlist_create(nodes))) {
		error("Could not create hostlist from nodes '%s'", nodes);
		return NULL;
	}
	host = hostlist_nth(hl, node_index);
	hostlist_destroy(hl);
	if (!host)
		return NULL;

	node_ptr = find_node_record(host);
	if (!node_ptr)
		error("Could not find record for node '%s'", host);
	free(host);
	if (!node_ptr)
		return NULL;

	/* Locate the node's GRES state matching this job GRES */
	gres_state_node = list_find_first(node_ptr->gres_list, gres_find_id,
					  &gres_state_job->plugin_id);
	if (!gres_state_node || !(gres_ns = gres_state_node->gres_data))
		return NULL;

	/*
	 * For each shared GRES bit: allocated count (from the job's
	 * gres_per_bit_alloc) over available count (from the node's
	 * topo_gres_cnt_avail), comma separated.
	 */
	bit_cnt = bit_size(gres_js->gres_bit_alloc[node_index]);
	for (int i = 0; i < bit_cnt; i++) {
		xstrfmtcatat(details, &tail,
			     "%"PRIu64"/%"PRIu64",",
			     gres_js->gres_per_bit_alloc[node_index][i],
			     gres_ns->topo_gres_cnt_avail[i]);
	}
	if (tail)
		*(--tail) = '\0';	/* strip the trailing comma */

	return details;
}
/* Given a job's GRES data structure, build strings describing the GRES
 * allocated on each node and the job-wide totals
* IN job_gres_list - job's allocated GRES data structure
* IN nodes - list of nodes allocated to job
* OUT gres_detail_cnt - Number of elements (nodes) in gres_detail_str
* OUT gres_detail_str - Description of GRES on each node
* OUT total_gres_str - String containing all gres in the job and counts.
*/
extern void gres_stepmgr_job_build_details(
	list_t *job_gres_list, char *nodes,
	uint32_t *gres_detail_cnt,
	char ***gres_detail_str,
	char **total_gres_str)
{
	int i, j;
	list_itr_t *job_gres_iter;
	gres_state_t *gres_state_job;
	gres_job_state_t *gres_js;
	char *sep1, *sep2, tmp_str[128], *type, **my_gres_details = NULL;
	uint32_t my_gres_cnt = 0;
	char *gres_name, *gres_str = NULL;
	uint64_t gres_cnt;

	/* Release any vestigial data (e.g. from job requeue) */
	for (i = 0; i < *gres_detail_cnt; i++)
		xfree(gres_detail_str[0][i]);
	xfree(*gres_detail_str);
	xfree(*total_gres_str);
	*gres_detail_cnt = 0;

	if (job_gres_list == NULL)	/* No GRES allocated */
		return;

	(void) gres_init();

	job_gres_iter = list_iterator_create(job_gres_list);
	while ((gres_state_job = list_next(job_gres_iter))) {
		gres_js = (gres_job_state_t *) gres_state_job->gres_data;
		/* Only GRES with per-node bitmaps can be described */
		if (gres_js->gres_bit_alloc == NULL)
			continue;
		/* Lazily size the output array from the first usable record */
		if (my_gres_details == NULL) {
			my_gres_cnt = gres_js->node_cnt;
			my_gres_details = xcalloc(my_gres_cnt, sizeof(char *));
		}
		/* Build "name" or "name:type" label for this GRES */
		if (gres_js->type_name) {
			sep2 = ":";
			type = gres_js->type_name;
		} else {
			sep2 = "";
			type = "";
		}
		gres_name = xstrdup_printf(
			"%s%s%s",
			gres_state_job->gres_name, sep2, type);
		gres_cnt = 0;

		for (j = 0; j < my_gres_cnt; j++) {
			uint64_t alloc_cnt;
			if (j >= gres_js->node_cnt)
				break;	/* node count mismatch */
			/* Comma-separate multiple GRES on the same node */
			if (my_gres_details[j])
				sep1 = ",";
			else
				sep1 = "";
			/*
			 * NOTE(review): gres_cnt_node_alloc is dereferenced
			 * unconditionally here but NULL-checked in the last
			 * branch below — presumably always set when
			 * gres_bit_alloc is; confirm.
			 */
			if (gres_js->gres_cnt_node_alloc[j] == NO_CONSUME_VAL64)
				alloc_cnt = 0;	/* no-consume counts as zero */
			else
				alloc_cnt = gres_js->gres_cnt_node_alloc[j];
			gres_cnt += alloc_cnt;

			if (gres_js->gres_bit_alloc[j] &&
			    (gres_js->gres_per_bit_alloc &&
			     gres_js->gres_per_bit_alloc[j])) {
				/* Shared GRES: show per-bit alloc/avail */
				char *shared_gres_details =
					_build_shared_gres_details(
						nodes, j, gres_state_job, gres_js);
				xstrfmtcat(my_gres_details[j],
					   "%s%s:%" PRIu64 "(%s)", sep1,
					   gres_name, alloc_cnt,
					   shared_gres_details);
				xfree(shared_gres_details);
			} else if (gres_js->gres_bit_alloc[j]) {
				/* Regular GRES: show allocated device indexes */
				bit_fmt(tmp_str, sizeof(tmp_str),
					gres_js->gres_bit_alloc[j]);
				xstrfmtcat(my_gres_details[j],
					   "%s%s:%"PRIu64"(IDX:%s)",
					   sep1, gres_name,
					   alloc_cnt,
					   tmp_str);
			} else if (gres_js->gres_cnt_node_alloc[j]) {
				/* Count-only allocation (no bitmap) */
				xstrfmtcat(my_gres_details[j],
					   "%s%s(CNT:%"PRIu64")",
					   sep1, gres_name,
					   alloc_cnt);
			}
		}
		/* Append "name:total" to the job-wide summary string */
		xstrfmtcat(gres_str, "%s%s:%"PRIu64,
			   gres_str ? "," : "", gres_name, gres_cnt);
		xfree(gres_name);
	}
	list_iterator_destroy(job_gres_iter);
	*gres_detail_cnt = my_gres_cnt;
	*gres_detail_str = my_gres_details;
	*total_gres_str = gres_str;
}
/*
 * Fill in job/node TRES arrays with allocated GRES.
 * IN gres_list - job or node GRES list (state_type selects the handling)
 * IN/OUT tres_cnt - TRES count array, indexed by assoc_mgr TRES position
 * IN locked - true if the caller already holds the assoc_mgr TRES read lock
 */
static void _set_type_tres_cnt(list_t *gres_list,
			       uint64_t *tres_cnt,
			       bool locked)
{
	list_itr_t *itr;
	gres_state_t *gres_state_ptr;
	static bool first_run = 1;
	static slurmdb_tres_rec_t tres_rec;
	bool typeless_found = false, typeless = false;
	char *col_name = NULL, *prev_gres_name = NULL;
	uint64_t count;
	int tres_pos;
	assoc_mgr_lock_t locks = { .tres = READ_LOCK };

	/* we only need to init this once */
	if (first_run) {
		first_run = 0;
		memset(&tres_rec, 0, sizeof(slurmdb_tres_rec_t));
		tres_rec.type = "gres";
	}

	if (!gres_list || !tres_cnt)
		return;

	/* assoc_mgr must be locked before gres_context_lock!!! */
	if (!locked)
		assoc_mgr_lock(&locks);

	gres_clear_tres_cnt(tres_cnt, true);

	itr = list_iterator_create(gres_list);
	while ((gres_state_ptr = list_next(itr))) {
		bool set_total = false;
		tres_rec.name = gres_state_ptr->gres_name;

		/* Get alloc count for main GRES. */
		switch (gres_state_ptr->state_type) {
		case GRES_STATE_TYPE_JOB:
		{
			gres_job_state_t *gres_js = (gres_job_state_t *)
				gres_state_ptr->gres_data;
			/*
			 * If total_gres is set for selected (i.e.
			 * non-allocated) GRES and we had per job request we
			 * shouldn't use total_gres since it may be higher than
			 * actually requested. The way gres_sched_add works is
			 * that it adds as many GRES devices as we can use on
			 * the node. It may be more than requested to allow
			 * further optimization for instance based on nvlink,
			 * e.g. _set_task_bits.
			 */
			if (gres_js->gres_cnt_node_alloc ||
			    !gres_js->gres_per_job)
				count = gres_js->total_gres;
			else
				count = gres_js->gres_per_job;

			/*
			 * Resetting typeless_found to false when GRES name
			 * changes with respect to previous iteration until it
			 * is found again.
			 *
			 * This is needed in situations like i.e.:
			 * "--gres=gpu:1,tmpfs:foo:2,tmpfs:bar:7" where typeless
			 * is found for GRES name "gpu" but then for "tmpfs"
			 * it isn't, and thus the logic later around
			 * typeless_found would not set the count for "tmpfs"
			 * off of the sum of tmpfs:foo and tmpfs:bar counts.
			 */
			if (xstrcmp(prev_gres_name, tres_rec.name)) {
				typeless_found = false;
				xfree(prev_gres_name);
				prev_gres_name = xstrdup(tres_rec.name);
			}

			if (!gres_js->type_name) {
				typeless_found = true;
				typeless = true;
			} else {
				typeless = false;
			}
			break;
		}
		case GRES_STATE_TYPE_NODE:
		{
			gres_node_state_t *gres_ns = (gres_node_state_t *)
				gres_state_ptr->gres_data;
			count = gres_ns->gres_cnt_alloc;
			break;
		}
		default:
			error("%s: unsupported state type %d", __func__,
			      gres_state_ptr->state_type);
			continue;
		}
		/*
		 * Set main TRES's count (i.e. if no GRES "type" is being
		 * accounted for). We need to increment counter since the job
		 * may have been allocated multiple GRES types, but Slurm is
		 * only configured to track the total count. For example, a job
		 * allocated 1 GPU of type "tesla" and 1 GPU of type "volta",
		 * but we want to record that the job was allocated a total of
		 * 2 GPUs.
		 */
		if ((tres_pos = assoc_mgr_find_tres_pos(&tres_rec,true)) != -1){
			if (count == NO_CONSUME_VAL64)
				tres_cnt[tres_pos] = NO_CONSUME_VAL64;
			else if (!typeless_found)
				tres_cnt[tres_pos] += count;
			else if (typeless)
				tres_cnt[tres_pos] = count;
			/*
			 * No need for else statement, as all cases above should
			 * always cover setting main TRES's count.
			 */
			set_total = true;
		}

		/*
		 * Set TRES count for GRES model types. This would be handy for
		 * GRES like "gpu:tesla", where you might want to track both as
		 * TRES.
		 */
		switch (gres_state_ptr->state_type) {
		case GRES_STATE_TYPE_JOB:
		{
			gres_job_state_t *gres_js = (gres_job_state_t *)
				gres_state_ptr->gres_data;

			col_name = gres_js->type_name;
			if (col_name) {
				/* Look up the typed TRES ("gres:gpu:tesla") */
				tres_rec.name = xstrdup_printf(
					"%s:%s",
					gres_state_ptr->gres_name,
					col_name);
				if ((tres_pos = assoc_mgr_find_tres_pos(
					     &tres_rec, true)) != -1)
					tres_cnt[tres_pos] = count;
				xfree(tres_rec.name);
			} else if (!set_total) {
				/*
				 * Job allocated GRES without "type"
				 * specification, but Slurm is only accounting
				 * for this GRES by specific "type", so pick
				 * some valid "type" to get some accounting.
				 * Although the reported "type" may not be
				 * accurate, it is better than nothing...
				 */
				tres_rec.name = gres_state_ptr->gres_name;
				if ((tres_pos = assoc_mgr_find_tres_pos2(
					     &tres_rec, true)) != -1)
					tres_cnt[tres_pos] = count;
			}
			break;
		}
		case GRES_STATE_TYPE_NODE:
		{
			int type;
			gres_node_state_t *gres_ns = (gres_node_state_t *)
				gres_state_ptr->gres_data;

			/* Record each typed count configured on the node */
			for (type = 0; type < gres_ns->type_cnt; type++) {
				col_name = gres_ns->type_name[type];
				if (!col_name)
					continue;

				tres_rec.name = xstrdup_printf(
					"%s:%s",
					gres_state_ptr->gres_name,
					col_name);

				count = gres_ns->type_cnt_alloc[type];

				if ((tres_pos = assoc_mgr_find_tres_pos(
					     &tres_rec, true)) != -1)
					tres_cnt[tres_pos] = count;
				xfree(tres_rec.name);
			}
			break;
		}
		default:
			error("%s: unsupported state type %d", __func__,
			      gres_state_ptr->state_type);
			continue;
		}
	}
	list_iterator_destroy(itr);
	xfree(prev_gres_name);

	if (!locked)
		assoc_mgr_unlock(&locks);
}
/*
 * Fill in a job's TRES count array from its GRES list.
 * No-op unless a valid node count is supplied.
 */
extern void gres_stepmgr_set_job_tres_cnt(
	list_t *gres_list, uint32_t node_cnt, uint64_t *tres_cnt, bool locked)
{
	if ((node_cnt == 0) || (node_cnt == NO_VAL))
		return;

	_set_type_tres_cnt(gres_list, tres_cnt, locked);
}
/* Fill in a node's TRES count array from its GRES list. */
extern void gres_stepmgr_set_node_tres_cnt(
	list_t *gres_list, uint64_t *tres_cnt, bool locked)
{
	_set_type_tres_cnt(gres_list, tres_cnt, locked);
}
/*
 * Compute how many GRES this step needs on the current node.
 * IN gres_ss - step GRES request/state; total_gres is reset on the first node
 * IN first_step_node - true when this is the step's first node
 * IN tasks_on_node - number of step tasks placed on this node
 * IN rem_nodes - nodes (including this one) still to be processed
 * OUT max_gres - upper bound when spreading gres_per_step across nodes
 * RET GRES needed on this node (INFINITE64 = all allocated to job on node)
 */
static uint64_t _step_get_gres_needed(gres_step_state_t *gres_ss,
				      bool first_step_node,
				      uint16_t tasks_on_node,
				      uint32_t rem_nodes, uint64_t *max_gres)
{
	uint64_t needed;

	*max_gres = 0;
	if (first_step_node)
		gres_ss->total_gres = 0;

	if (gres_ss->gres_per_node) {
		needed = gres_ss->gres_per_node;
	} else if (gres_ss->gres_per_task) {
		needed = gres_ss->gres_per_task * tasks_on_node;
	} else if (gres_ss->ntasks_per_gres) {
		needed = tasks_on_node / gres_ss->ntasks_per_gres;
	} else if (gres_ss->gres_per_step && (rem_nodes == 1)) {
		/* Last node: take whatever is still owed to the step. */
		needed = gres_ss->gres_per_step - gres_ss->total_gres;
	} else if (gres_ss->gres_per_step) {
		/* Note: total_gres is the number of accumulated gres. */
		uint64_t reserved = gres_ss->total_gres + (rem_nodes - 1);

		if (gres_ss->total_gres >= gres_ss->gres_per_step) {
			/* If we already have the gres required, get no more. */
			needed = 0;
			*max_gres = 0;
		} else if (gres_ss->gres_per_step > reserved) {
			/* Leave at least one GRES per remaining node. */
			needed = 1;
			*max_gres = gres_ss->gres_per_step - reserved;
		} else {
			/*
			 * We don't need enough gres to have one on every
			 * remaining node. Get all possible gres on each
			 * remaining node instead of trying to spread them out
			 * over the nodes.
			 */
			needed = 1;
			*max_gres = gres_ss->gres_per_step -
				gres_ss->total_gres;
		}
	} else {
		/*
		 * No explicit step GRES specification.
		 * Note that gres_per_socket is not supported for steps
		 */
		needed = INFINITE64;	/* All allocated to job on Node */
	}

	return needed;
}
/*
 * Lazily allocate the per-bit count arrays needed to track shared GRES for
 * node n of this step (and, when decrementing the job allocation, for the
 * job as well).
 */
static void _init_step_gres_per_bit(gres_job_state_t *gres_js,
				    gres_step_state_t *gres_ss, int n,
				    bool decr_job_alloc)
{
	if (!gres_js->gres_per_bit_alloc || !gres_js->gres_per_bit_alloc[n])
		error("Job has shared gres but there is no job gres_per_bit_alloc");

	if (decr_job_alloc) {
		if (!gres_js->gres_per_bit_step_alloc)
			gres_js->gres_per_bit_step_alloc =
				xcalloc(gres_js->node_cnt,
					sizeof(uint64_t *));
		if (!gres_js->gres_per_bit_step_alloc[n])
			gres_js->gres_per_bit_step_alloc[n] =
				xcalloc(bit_size(gres_js->gres_bit_alloc[n]),
					sizeof(uint64_t));
	}

	if (!gres_ss->gres_per_bit_alloc)
		gres_ss->gres_per_bit_alloc =
			xcalloc(gres_ss->node_cnt, sizeof(uint64_t *));
	if (!gres_ss->gres_per_bit_alloc[n])
		gres_ss->gres_per_bit_alloc[n] =
			xcalloc(bit_size(gres_js->gres_bit_alloc[n]),
				sizeof(uint64_t));
}
/*
 * Try to take shared GRES from bit i on node n for the step.
 * Takes as much as possible, bounded by the remaining request (*gres_alloc),
 * the job's per-bit allocation and (when decr_job_alloc) what the job still
 * has uncommitted to other steps.
 * RET true if any GRES was taken (with *gres_alloc reduced accordingly).
 */
static bool _shared_step_gres_avail(gres_job_state_t *gres_js,
				    gres_step_state_t *gres_ss,
				    uint64_t *gres_alloc, bool decr_job_alloc,
				    int n, int i)
{
	uint64_t per_bit = gres_js->gres_per_bit_alloc[n][i];
	uint64_t take = MIN(*gres_alloc, per_bit);

	if (decr_job_alloc) {
		/* Bounded by what the job has not yet committed to steps */
		uint64_t job_free =
			per_bit - gres_js->gres_per_bit_step_alloc[n][i];
		take = MIN(take, job_free);
	}
	if (take == 0)
		return false;

	if (decr_job_alloc)
		gres_js->gres_per_bit_step_alloc[n][i] += take;
	gres_ss->gres_per_bit_alloc[n][i] = take;
	*gres_alloc -= take;

	return true;
}
/*
 * Select the specific GRES devices (bits) on one node for a step.
 * IN/OUT gres_ss - step's allocated GRES record; gres_bit_alloc updated
 * IN gres_state_job - job's allocated GRES record to draw devices from
 * IN node_offset - zero-origin index of this node in the job allocation
 * IN step_id - step identity (currently unused here; kept for interface)
 * IN gres_alloc - count of GRES to place on this node
 * IN decr_job_alloc - true if the step's usage counts against the job's
 * IN node_gres_list - node's GRES list (for topology/core checks)
 * IN core_bitmap - cores usable by the step, NULL on pass 2
 * RET count of GRES that could not be placed on this node
 */
static int _set_step_gres_bit_alloc(gres_step_state_t *gres_ss,
				    gres_state_t *gres_state_job,
				    int node_offset,
				    slurm_step_id_t *step_id,
				    uint64_t gres_alloc,
				    bool decr_job_alloc,
				    list_t *node_gres_list,
				    bitstr_t *core_bitmap)
{
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	int len = bit_size(gres_js->gres_bit_alloc[node_offset]);
	bitstr_t *gres_bit_alloc;
	bitstr_t *gres_bit_avail;
	gres_state_t *gres_state_node;
	gres_node_state_t *gres_ns;

	/*
	 * Find the node's GRES record before allocating any working bitmaps
	 * so the error path does not leak them (previously gres_bit_alloc
	 * and gres_bit_avail were allocated first and never freed here).
	 */
	if (!(gres_state_node = list_find_first(node_gres_list,
						gres_find_id,
						&gres_state_job->plugin_id))) {
		error("No node gres when step gres is allocated. This should never happen.");
		return 0;
	}
	gres_ns = gres_state_node->gres_data;

	gres_bit_alloc = bit_alloc(len);
	gres_bit_avail = bit_copy(gres_js->gres_bit_alloc[node_offset]);

	if (gres_id_shared(gres_state_job->config_flags)) {
		_init_step_gres_per_bit(gres_js, gres_ss, node_offset,
					decr_job_alloc);
	}

	/*
	 * For exclusive (non-shared) GRES, remove devices already handed to
	 * other steps when this step counts against the job allocation.
	 */
	if (decr_job_alloc &&
	    gres_js->gres_bit_step_alloc &&
	    gres_js->gres_bit_step_alloc[node_offset] &&
	    !gres_id_shared(gres_state_job->config_flags)) {
		bit_and_not(gres_bit_avail,
			    gres_js->gres_bit_step_alloc[node_offset]);
	}

	/* Pick devices until the request is satisfied or we run out. */
	for (int i = 0; i < len && gres_alloc; i++) {
		if (!bit_test(gres_bit_avail, i) ||
		    bit_test(gres_bit_alloc, i) ||
		    !_cores_on_gres(core_bitmap, NULL, gres_ns, i, gres_js))
			continue;
		if (gres_id_shared(gres_state_job->config_flags)) {
			if (_shared_step_gres_avail(gres_js, gres_ss,
						    &gres_alloc,
						    decr_job_alloc,
						    node_offset, i))
				bit_set(gres_bit_alloc, i);
		} else {
			bit_set(gres_bit_alloc, i);
			gres_alloc--;
		}
	}
	FREE_NULL_BITMAP(gres_bit_avail);

	/* Record what this step took against the job's allocation. */
	if (decr_job_alloc) {
		if (!gres_js->gres_bit_step_alloc) {
			gres_js->gres_bit_step_alloc =
				xcalloc(gres_js->node_cnt,
					sizeof(bitstr_t *));
		}
		if (gres_js->gres_bit_step_alloc[node_offset]) {
			bit_or(gres_js->gres_bit_step_alloc[node_offset],
			       gres_bit_alloc);
		} else {
			gres_js->gres_bit_step_alloc[node_offset] =
				bit_copy(gres_bit_alloc);
		}
	}

	/* Merge the selection into the step's own per-node bitmap. */
	if (!gres_ss->gres_bit_alloc) {
		gres_ss->gres_bit_alloc =
			xcalloc(gres_js->node_cnt, sizeof(bitstr_t *));
	}
	if (gres_ss->gres_bit_alloc[node_offset]) {
		bit_or(gres_ss->gres_bit_alloc[node_offset],
		       gres_bit_alloc);
		FREE_NULL_BITMAP(gres_bit_alloc);
	} else {
		gres_ss->gres_bit_alloc[node_offset] = gres_bit_alloc;
	}

	/* Anything left over could not be placed on this node. */
	return gres_alloc;
}
/*
 * Allocate GRES from the job's allocation on one node to a step.
 *
 * IN/OUT gres_ss - step's allocated-GRES record to update
 * IN gres_state_step_req - step's requested GRES record
 * IN gres_state_job - job's allocated GRES record to draw from
 * IN node_offset - zero-origin index of this node in the job allocation
 * IN step_id - step identity, used for logging
 * IN/OUT gres_needed - GRES still needed by the step; INFINITE64 means
 *	"take everything the job has on the node"
 * IN/OUT max_gres - upper bound of additional GRES the step may take
 * IN decr_job_alloc - true if this step's GRES count against the job's
 * OUT step_node_mem_alloc - memory to reserve on this node (mem_per_gres)
 * IN node_gres_list - node's GRES list
 * IN core_bitmap - cores usable by the step (NULL on pass 2)
 * OUT total_gres_cpu_cnt - CPU count implied by cpus_per_gres
 * RET SLURM_SUCCESS or an error code
 */
static int _step_alloc(gres_step_state_t *gres_ss,
		       gres_state_t *gres_state_step_req,
		       gres_state_t *gres_state_job,
		       int node_offset,
		       slurm_step_id_t *step_id,
		       uint64_t *gres_needed, uint64_t *max_gres,
		       bool decr_job_alloc,
		       uint64_t *step_node_mem_alloc,
		       list_t *node_gres_list,
		       bitstr_t *core_bitmap,
		       int *total_gres_cpu_cnt)
{
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	gres_step_state_t *gres_ss_req = gres_state_step_req->gres_data;
	uint64_t gres_alloc, gres_left;

	xassert(gres_js);
	xassert(gres_ss);
	xassert(gres_ss_req);

	if (!gres_js->gres_cnt_node_alloc) {
		error("gres/%s: %s gres_cnt_node_alloc is not allocated",
		      gres_state_job->gres_name, __func__);
		return SLURM_ERROR;
	}

	/* A no-consume GRES is not counted; mark the step and finish. */
	if ((gres_js->gres_cnt_node_alloc[node_offset] == NO_CONSUME_VAL64) ||
	    (gres_js->total_gres == NO_CONSUME_VAL64)) {
		if (*gres_needed != INFINITE64)
			*gres_needed = 0;
		gres_ss->total_gres = NO_CONSUME_VAL64;
		return SLURM_SUCCESS;
	}

	if (node_offset >= gres_js->node_cnt) {
		error("gres/%s: %s for %ps, node offset invalid (%d >= %u)",
		      gres_state_job->gres_name, __func__, step_id, node_offset,
		      gres_js->node_cnt);
		return SLURM_ERROR;
	}

	/* Lazily size the step and job accounting arrays. */
	if (gres_ss->node_cnt == 0)
		gres_ss->node_cnt = gres_js->node_cnt;
	if (!gres_ss->gres_cnt_node_alloc) {
		gres_ss->gres_cnt_node_alloc =
			xcalloc(gres_ss->node_cnt, sizeof(uint64_t));
	}
	if (!gres_js->gres_cnt_step_alloc) {
		gres_js->gres_cnt_step_alloc = xcalloc(
			gres_js->node_cnt, sizeof(uint64_t));
	}

	/*
	 * Start from the job's count on this node, minus other steps' usage
	 * when this step's GRES count against the job, then clamp to what
	 * the step still needs / may take.
	 */
	gres_alloc = gres_js->gres_cnt_node_alloc[node_offset];
	if (decr_job_alloc)
		gres_alloc -= gres_js->gres_cnt_step_alloc[node_offset];
	if (*gres_needed != INFINITE64) {
		if (*max_gres && decr_job_alloc) {
			gres_alloc = MIN(gres_alloc, *max_gres);
		} else
			gres_alloc = MIN(gres_alloc,*gres_needed);
	}

	/* Select specific devices when the job has a per-device bitmap. */
	if (gres_js->gres_bit_alloc && gres_js->gres_bit_alloc[node_offset]) {
		gres_left = _set_step_gres_bit_alloc(gres_ss, gres_state_job,
						     node_offset, step_id,
						     gres_alloc,
						     decr_job_alloc,
						     node_gres_list,
						     core_bitmap);
		if (gres_left && !core_bitmap) /* only on Pass 2 */
			error("gres/%s: %s %ps oversubscribed resources on node %d",
			      gres_state_job->gres_name, __func__, step_id,
			      node_offset);
		else
			gres_alloc -= gres_left;
	} else
		debug3("gres/%s: %s gres_bit_alloc for %ps is NULL",
		       gres_state_job->gres_name, __func__, step_id);

	/* Record progress against the step's remaining request. */
	if (*gres_needed != INFINITE64) {
		if (*max_gres && decr_job_alloc)
			*max_gres -= gres_alloc;
		if (gres_alloc < *gres_needed)
			*gres_needed -= gres_alloc;
		else
			*gres_needed = 0;
	}

	if (gres_ss->gres_cnt_node_alloc &&
	    (node_offset < gres_ss->node_cnt)) {
		gres_ss->gres_cnt_node_alloc[node_offset] += gres_alloc;
		/*
		 * Calculate memory allocated to the step based on the
		 * mem_per_gres limit.
		 * FIXME: Currently the only option that sets mem_per_gres is
		 * --mem-per-gpu. Adding another option will require a change
		 * here - perhaps we should take the MAX of all mem_per_gres.
		 * Similar logic is in gres_select_util_job_mem_set(),
		 * which would also need to be changed if another
		 * mem_per_gres option was added.
		 */
		if (gres_ss_req->mem_per_gres &&
		    (gres_ss_req->mem_per_gres != NO_VAL64))
			*step_node_mem_alloc +=
				gres_ss_req->mem_per_gres * gres_alloc;
	}
	gres_ss_req->total_gres += gres_alloc;
	gres_ss->total_gres += gres_alloc;

	/* Mark this node as used by the step. */
	if (gres_ss->node_in_use == NULL) {
		gres_ss->node_in_use = bit_alloc(gres_js->node_cnt);
	}
	bit_set(gres_ss->node_in_use, node_offset);
	if (decr_job_alloc)
		gres_js->gres_cnt_step_alloc[node_offset] += gres_alloc;

	/* Report the CPU count implied by a cpus_per_gres request. */
	if (gres_ss_req->cpus_per_gres != NO_VAL16)
		*total_gres_cpu_cnt += gres_alloc * gres_ss_req->cpus_per_gres;

	return SLURM_SUCCESS;
}
/*
 * Return the step's allocated-GRES record matching the given job GRES
 * record, creating and appending a new one if none exists yet.
 */
static gres_step_state_t *_step_get_alloc_gres_ptr(list_t *step_gres_list_alloc,
						   gres_state_t *gres_state_job)
{
	gres_state_t *gres_state_step;
	gres_job_state_t *gres_js = gres_state_job->gres_data;
	gres_key_t step_search_key = {
		.config_flags = gres_state_job->config_flags,
		.plugin_id = gres_state_job->plugin_id,
		.type_id = gres_js->type_id,
	};

	/* Reuse the step's existing record for this GRES/type if present. */
	gres_state_step = list_find_first(step_gres_list_alloc,
					  gres_find_step_by_key,
					  &step_search_key);
	if (!gres_state_step) {
		/* None yet: build one mirroring the job record. */
		gres_step_state_t *gres_ss = xmalloc(sizeof(*gres_ss));

		gres_ss->type_id = gres_js->type_id;
		gres_ss->type_name = xstrdup(gres_js->type_name);

		gres_state_step = xmalloc(sizeof(*gres_state_step));
		gres_state_step->config_flags = step_search_key.config_flags;
		gres_state_step->plugin_id = step_search_key.plugin_id;
		gres_state_step->gres_data = gres_ss;
		gres_state_step->gres_name =
			xstrdup(gres_state_job->gres_name);
		gres_state_step->state_type = GRES_STATE_TYPE_STEP;
		list_append(step_gres_list_alloc, gres_state_step);
	}

	return gres_state_step->gres_data;
}
/*
 * Iteration callback over the job's allocated GRES list: allocate from any
 * job GRES record that matches the step's request (args->job_search_key).
 * Returns -1 to stop iterating after an allocation error, else 0.
 */
static int _step_alloc_type(gres_state_t *gres_state_job,
			    foreach_step_alloc_t *args)
{
	gres_job_state_t *gres_js = (gres_job_state_t *)
		gres_state_job->gres_data;
	gres_step_state_t *gres_ss = (gres_step_state_t *)
		args->gres_state_step->gres_data;
	gres_step_state_t *gres_ss_alloc;

	/*
	 * This isn't the gres we are looking for, or we already have allocated
	 * all of this GRES to other steps. If decr_job_alloc is false, then
	 * this step can share GRES. So, only do the last check if the step
	 * cannot share GRES (decr_job_alloc is true).
	 */
	if ((!args->gres_needed && !args->max_gres) ||
	    !gres_find_job_by_key_with_cnt(gres_state_job,
					   args->job_search_key) ||
	    (args->decr_job_alloc &&
	     (gres_js->gres_cnt_step_alloc[args->node_offset] ==
	      gres_js->gres_cnt_node_alloc[args->node_offset])))
		return 0;

	/* Find or create the step's allocation record for this GRES/type. */
	gres_ss_alloc = _step_get_alloc_gres_ptr(
		args->step_gres_list_alloc, gres_state_job);

	args->rc = _step_alloc(gres_ss_alloc, args->gres_state_step,
			       gres_state_job,
			       args->node_offset, &args->tmp_step_id,
			       &args->gres_needed, &args->max_gres,
			       args->decr_job_alloc,
			       args->step_node_mem_alloc,
			       args->node_gres_list,
			       args->core_bitmap,
			       &args->total_gres_cpu_cnt);
	if (args->rc != SLURM_SUCCESS) {
		return -1;
	}

	/* First successful allocation: size the requested record too. */
	if (gres_ss->node_cnt == 0)
		gres_ss->node_cnt = gres_js->node_cnt;

	return 0;
}
/*
 * Allocate GRES to a job step on one node from the job's allocation.
 *
 * IN step_gres_list - step's requested GRES
 * IN/OUT step_gres_list_alloc - step's allocated GRES (created on demand)
 * IN job_gres_list - job's allocated GRES
 * IN node_offset - zero-origin index of this node in the job allocation
 * IN first_step_node - true if this is the first node of the step
 * IN tasks_on_node - tasks the step launches on this node
 * IN rem_nodes - step nodes remaining to be processed (including this one)
 * IN job_id, step_id - step identity, for logging
 * IN decr_job_alloc - true if the step's GRES count against the job's
 * OUT step_node_mem_alloc - memory reserved on this node via mem_per_gres
 * IN node_gres_list - node's GRES list
 * IN core_bitmap - cores usable by the step on this node
 * OUT total_gres_cpu_cnt - CPU count implied by cpus_per_gres requests
 * RET SLURM_SUCCESS or an error code
 */
extern int gres_stepmgr_step_alloc(
	list_t *step_gres_list,
	list_t **step_gres_list_alloc,
	list_t *job_gres_list,
	int node_offset, bool first_step_node,
	uint16_t tasks_on_node, uint32_t rem_nodes,
	uint32_t job_id, uint32_t step_id,
	bool decr_job_alloc,
	uint64_t *step_node_mem_alloc,
	list_t *node_gres_list,
	bitstr_t *core_bitmap,
	int *total_gres_cpu_cnt)
{
	int rc = SLURM_SUCCESS;
	list_itr_t *step_gres_iter;
	gres_state_t *gres_state_step;
	slurm_step_id_t tmp_step_id = { 0 };

	if (step_gres_list == NULL)
		return SLURM_SUCCESS;
	if (job_gres_list == NULL) {
		error("%s: step allocates GRES, but job %u has none",
		      __func__, job_id);
		return ESLURM_INSUFFICIENT_GRES;
	}

	if (!*step_gres_list_alloc)
		*step_gres_list_alloc = list_create(gres_step_list_delete);

	xassert(step_node_mem_alloc);
	*step_node_mem_alloc = 0;

	tmp_step_id.job_id = job_id;
	tmp_step_id.step_het_comp = NO_VAL;
	tmp_step_id.step_id = step_id;

	/* Process each of the step's requested GRES in turn. */
	step_gres_iter = list_iterator_create(step_gres_list);
	while ((gres_state_step = list_next(step_gres_iter))) {
		gres_step_state_t *gres_ss =
			(gres_step_state_t *) gres_state_step->gres_data;
		gres_key_t job_search_key;
		foreach_step_alloc_t args;

		/* Key to match this request against job GRES records. */
		job_search_key.config_flags = gres_state_step->config_flags;
		job_search_key.plugin_id = gres_state_step->plugin_id;
		if (gres_ss->type_name)
			job_search_key.type_id = gres_ss->type_id;
		else
			job_search_key.type_id = NO_VAL;
		job_search_key.node_offset = node_offset;

		args.core_bitmap = core_bitmap;
		args.decr_job_alloc = decr_job_alloc;
		args.gres_needed = _step_get_gres_needed(
			gres_ss, first_step_node, tasks_on_node,
			rem_nodes, &args.max_gres);
		args.job_search_key = &job_search_key;
		args.node_gres_list = node_gres_list;
		args.node_offset = node_offset;
		args.rc = SLURM_SUCCESS;
		args.step_gres_list_alloc = *step_gres_list_alloc;
		args.gres_state_step = gres_state_step;
		args.step_node_mem_alloc = step_node_mem_alloc;
		args.tmp_step_id = tmp_step_id;
		args.total_gres_cpu_cnt = 0;

		/* Pass 1: Allocate GRES overlapping available cores */
		(void) list_for_each(job_gres_list, (ListForF) _step_alloc_type,
				     &args);
		if (args.gres_needed) {
			log_flag(STEPS, "cpus for optimal gres/%s topology unavailable for %ps allocating anyway.",
				 gres_state_step->gres_name, &tmp_step_id);
		}
		/* Pass 2: Allocate any available GRES */
		args.core_bitmap = NULL;
		(void) list_for_each(job_gres_list, (ListForF) _step_alloc_type,
				     &args);

		*total_gres_cpu_cnt += args.total_gres_cpu_cnt;

		if (args.rc != SLURM_SUCCESS)
			rc = args.rc;

		/* Anything still needed means the job cannot satisfy it. */
		if (args.gres_needed && args.gres_needed != INFINITE64 &&
		    rc == SLURM_SUCCESS) {
			error("gres/%s: %s for %ps, step's > job's for node %d (gres still needed: %"PRIu64")",
			      gres_state_step->gres_name, __func__, &tmp_step_id,
			      node_offset, args.gres_needed);
			rc = ESLURM_INSUFFICIENT_GRES;
		}
	}
	list_iterator_destroy(step_gres_iter);

	return rc;
}
/*
 * Release one of the step's GRES records on one node back to the job.
 * RET SLURM_SUCCESS, or SLURM_ERROR on inconsistent step state.
 */
static int _step_dealloc(gres_state_t *gres_state_step, list_t *job_gres_list,
			 slurm_step_id_t *step_id, int node_offset,
			 bool decr_job_alloc)
{
	gres_state_t *gres_state_job;
	gres_step_state_t *gres_ss =
		(gres_step_state_t *)gres_state_step->gres_data;
	gres_job_state_t *gres_js;
	uint32_t j;
	uint64_t gres_cnt;
	int len_j, len_s;
	gres_key_t job_search_key;

	xassert(job_gres_list);
	xassert(gres_ss);

	/* Locate the job GRES record this step allocation came from. */
	job_search_key.config_flags = gres_state_step->config_flags;
	job_search_key.plugin_id = gres_state_step->plugin_id;
	if (gres_ss->type_name)
		job_search_key.type_id = gres_ss->type_id;
	else
		job_search_key.type_id = NO_VAL;
	job_search_key.node_offset = node_offset;
	if (!(gres_state_job = list_find_first(
		      job_gres_list,
		      gres_find_job_by_key_with_cnt,
		      &job_search_key)))
		return SLURM_SUCCESS;

	gres_js = (gres_job_state_t *)gres_state_job->gres_data;
	if (gres_js->total_gres == NO_CONSUME_VAL64) {
		/* No-consume GRES is never tracked per node or per bit. */
		xassert(!gres_ss->node_in_use);
		xassert(!gres_ss->gres_bit_alloc);
		return SLURM_SUCCESS;
	} else if (gres_js->node_cnt < node_offset) {
		/*
		 * gres_find_job_by_key_with_cnt() already does this
		 * check so we should never get here, but here as a
		 * sanity check.
		 *
		 * NOTE(review): valid offsets are 0 .. node_cnt - 1, so
		 * this test looks like it should be "node_cnt <=
		 * node_offset" - confirm against the check in
		 * gres_find_job_by_key_with_cnt().
		 */
		return SLURM_SUCCESS;
	}

	if (!gres_ss->node_in_use) {
		error("gres/%s: %s %ps dealloc, node_in_use is NULL",
		      gres_state_job->gres_name, __func__, step_id);
		return SLURM_ERROR;
	}

	/* Nothing was allocated to this step on this node. */
	if (!bit_test(gres_ss->node_in_use, node_offset))
		return SLURM_SUCCESS;

	if (!decr_job_alloc) {
		/* This step was not counted against job allocation */
		if (gres_ss->gres_bit_alloc)
			FREE_NULL_BITMAP(gres_ss->gres_bit_alloc[node_offset]);
		return SLURM_SUCCESS;
	}

	if (gres_ss->gres_cnt_node_alloc)
		gres_cnt = gres_ss->gres_cnt_node_alloc[node_offset];
	else {
		error("gres/%s: %s %ps dealloc, gres_cnt_node_alloc is NULL",
		      gres_state_job->gres_name, __func__, step_id);
		return SLURM_ERROR;
	}

	/* Return the count to the job, guarding against underflow. */
	if (gres_js->gres_cnt_step_alloc) {
		if (gres_js->gres_cnt_step_alloc[node_offset] >= gres_cnt) {
			gres_js->gres_cnt_step_alloc[node_offset] -= gres_cnt;
		} else {
			error("gres/%s: %s %ps dealloc count underflow",
			      gres_state_job->gres_name, __func__,
			      step_id);
			gres_js->gres_cnt_step_alloc[node_offset] = 0;
		}
	}

	if ((gres_ss->gres_bit_alloc == NULL) ||
	    (gres_ss->gres_bit_alloc[node_offset] == NULL))
		return SLURM_SUCCESS;

	/*
	 * NOTE(review): gres_js->gres_bit_alloc itself is not NULL-checked
	 * before being indexed here - presumably the job always has the
	 * bitmap array when the step does; verify.
	 */
	if (gres_js->gres_bit_alloc[node_offset] == NULL) {
		error("gres/%s: %s job %u gres_bit_alloc[%d] is NULL",
		      gres_state_job->gres_name, __func__,
		      step_id->job_id, node_offset);
		return SLURM_SUCCESS;
	}

	len_j = bit_size(gres_js->gres_bit_alloc[node_offset]);
	len_s = bit_size(gres_ss->gres_bit_alloc[node_offset]);
	if (len_j != len_s) {
		error("gres/%s: %s %ps dealloc, bit_alloc[%d] size mismatch (%d != %d)",
		      gres_state_job->gres_name, __func__,
		      step_id, node_offset, len_j, len_s);
		len_j = MIN(len_j, len_s);
	}

	/* Clear each device bit (and per-bit count) the step was using. */
	for (j = 0; j < len_j; j++) {
		if (!bit_test(gres_ss->gres_bit_alloc[node_offset], j))
			continue;
		if (gres_js->gres_bit_step_alloc &&
		    gres_js->gres_bit_step_alloc[node_offset]) {
			bit_clear(gres_js->gres_bit_step_alloc[node_offset],
				  j);
			if (gres_id_shared(gres_state_job->config_flags) &&
			    gres_js->gres_per_bit_step_alloc[node_offset] &&
			    gres_ss->gres_per_bit_alloc[node_offset])
				gres_js->gres_per_bit_step_alloc[node_offset]
					[j] -=
					gres_ss->gres_per_bit_alloc[node_offset]
					[j];
		}
	}
	FREE_NULL_BITMAP(gres_ss->gres_bit_alloc[node_offset]);
	if (gres_ss->gres_per_bit_alloc)
		xfree(gres_ss->gres_per_bit_alloc[node_offset]);

	return SLURM_SUCCESS;
}
/*
 * Deallocate a step's GRES on one node, returning it to the job.
 * RET SLURM_SUCCESS, or the last non-success code from _step_dealloc().
 */
extern int gres_stepmgr_step_dealloc(
	list_t *step_gres_list, list_t *job_gres_list,
	uint32_t job_id, uint32_t step_id,
	int node_offset,
	bool decr_job_alloc)
{
	int rc = SLURM_SUCCESS;
	list_itr_t *iter;
	gres_state_t *gres_state_step;
	slurm_step_id_t tmp_step_id;

	/* Nothing to release if the step never requested GRES. */
	if (!step_gres_list)
		return SLURM_SUCCESS;
	if (!job_gres_list) {
		error("%s: step deallocates gres, but job %u has none",
		      __func__, job_id);
		return SLURM_ERROR;
	}

	tmp_step_id.job_id = job_id;
	tmp_step_id.step_het_comp = NO_VAL;
	tmp_step_id.step_id = step_id;

	/* Release each of the step's GRES records in turn. */
	iter = list_iterator_create(step_gres_list);
	while ((gres_state_step = list_next(iter))) {
		int rc2 = _step_dealloc(gres_state_step,
					job_gres_list,
					&tmp_step_id,
					node_offset,
					decr_job_alloc);
		if (rc2 != SLURM_SUCCESS)
			rc = rc2;
	}
	list_iterator_destroy(iter);

	return rc;
}
/*
* A job allocation size has changed. Update the job step gres information
* bitmaps and other data structures.
* IN gres_list - List of Gres records for this step to track usage
* IN orig_job_node_bitmap - bitmap of nodes in the original job allocation
* IN new_job_node_bitmap - bitmap of nodes in the new job allocation
*/
void gres_stepmgr_step_state_rebase(
	list_t *gres_list,
	bitstr_t *orig_job_node_bitmap,
	bitstr_t *new_job_node_bitmap)
{
	list_itr_t *gres_iter;
	gres_state_t *gres_state_step;
	gres_step_state_t *gres_ss;
	int new_node_cnt;
	int i_first, i_last, i;
	int old_inx, new_inx;
	bitstr_t *new_node_in_use;

	if (gres_list == NULL)
		return;

	gres_iter = list_iterator_create(gres_list);
	while ((gres_state_step = list_next(gres_iter))) {
		/*
		 * Must be reset for every GRES record: previously this was
		 * declared (once, NULL) outside the loop, so a second GRES
		 * record in the list would reuse the first record's array -
		 * both records then pointed at the same memory, corrupting
		 * state and inviting a double free.
		 */
		bitstr_t **new_gres_bit_alloc = NULL;

		gres_ss = (gres_step_state_t *) gres_state_step->gres_data;
		if (!gres_ss)
			continue;
		if (!gres_ss->node_in_use) {
			error("gres_step_state_rebase: node_in_use is NULL");
			continue;
		}
		new_node_cnt = bit_set_count(new_job_node_bitmap);
		/* Scan the union of both allocations' node ranges. */
		i_first = MIN(bit_ffs(orig_job_node_bitmap),
			      bit_ffs(new_job_node_bitmap));
		i_first = MAX(i_first, 0);
		i_last = MAX(bit_fls(orig_job_node_bitmap),
			     bit_fls(new_job_node_bitmap));
		if (i_last == -1) {
			error("gres_step_state_rebase: node_bitmaps "
			      "are empty");
			continue;
		}
		new_node_in_use = bit_alloc(new_node_cnt);

		/* Walk both bitmaps in parallel, remapping node indexes. */
		old_inx = new_inx = -1;
		for (i = i_first; i <= i_last; i++) {
			bool old_match = false, new_match = false;
			if (bit_test(orig_job_node_bitmap, i)) {
				old_match = true;
				old_inx++;
			}
			if (bit_test(new_job_node_bitmap, i)) {
				new_match = true;
				new_inx++;
			}
			if (old_match && new_match) {
				/* Node kept: move its bitmap to new index. */
				bit_set(new_node_in_use, new_inx);
				if (gres_ss->gres_bit_alloc) {
					if (!new_gres_bit_alloc) {
						new_gres_bit_alloc = xcalloc(
							new_node_cnt,
							sizeof(bitstr_t *));
					}
					new_gres_bit_alloc[new_inx] =
						gres_ss->
						gres_bit_alloc[old_inx];
				}
			} else if (old_match &&
				   gres_ss->gres_bit_alloc &&
				   gres_ss->gres_bit_alloc[old_inx]) {
				/*
				 * Node removed from job allocation,
				 * release step's resources
				 */
				FREE_NULL_BITMAP(gres_ss->
						 gres_bit_alloc[old_inx]);
			}
		}

		/* Install the rebased structures on the step record. */
		gres_ss->node_cnt = new_node_cnt;
		FREE_NULL_BITMAP(gres_ss->node_in_use);
		gres_ss->node_in_use = new_node_in_use;
		xfree(gres_ss->gres_bit_alloc);
		gres_ss->gres_bit_alloc = new_gres_bit_alloc;
	}
	list_iterator_destroy(gres_iter);
}
/*
 * Append (or merge) one GRES count into a TRES string of the form
 * "id=count[,id=count...]".
 * IN/OUT tres_str - string being built (may point to NULL initially)
 * IN tres_rec - TRES record supplying the numeric id
 * IN count - count to add for this TRES id
 */
static void _gres_add_2_tres_str(char **tres_str, slurmdb_tres_rec_t *tres_rec,
				 uint64_t count)
{
	uint64_t old_count;

	/* INFINITE64 indicates the id is not yet present in the string. */
	old_count = slurmdb_find_tres_count_in_string(*tres_str, tres_rec->id);
	if (old_count == INFINITE64) {
		/* New gres */
		xstrfmtcat(*tres_str, "%s%u=%"PRIu64, *tres_str ? "," : "",
			   tres_rec->id, count);
	} else {
		/* Add gres counts together */
		char *tmp_str = xstrdup_printf("%u=", tres_rec->id);
		char *cut = xstrstr(*tres_str, tmp_str) + strlen(tmp_str);
		xfree(tmp_str);
		/*
		 * Truncate the string at the old count, then re-append the
		 * summed count followed by the remainder of the original
		 * string (from the next comma onward).
		 *
		 * NOTE(review): when this id is the last entry, the trailing
		 * xstrstr() returns NULL; and the tail pointer references
		 * the buffer that xstrfmtcat() may reallocate. Confirm
		 * xstrfmtcat()/xstrstr() handle both cases safely.
		 */
		cut[0] = 0;
		xstrfmtcat(*tres_str, "%"PRIu64"%s", old_count + count,
			   xstrstr(cut + 1, ","));
	}
}
/*
 * Add one GRES (and, when typed, its "name:type" variant) to a TRES
 * string, provided the TRES is tracked by the assoc_mgr.
 */
static void _gres_2_tres_str_internal(char **tres_str,
				      char *gres_name, char *gres_type,
				      uint64_t count)
{
	slurmdb_tres_rec_t *tres_rec;
	/* Reusable search key; only the name field varies between calls. */
	static slurmdb_tres_rec_t tres_req = { .type = "gres" };

	xassert(verify_assoc_lock(TRES_LOCK, READ_LOCK));
	xassert(gres_name);
	xassert(tres_str);

	/* First look up the bare GRES name (e.g. "gpu"). */
	tres_req.name = gres_name;
	tres_rec = assoc_mgr_find_tres_rec(&tres_req);
	if (tres_rec)
		_gres_add_2_tres_str(tres_str, tres_rec, count);

	if (gres_type) {
		/*
		 * Also record the typed form (e.g. "gpu:tesla") when it is
		 * tracked as its own TRES, so both can be accounted.
		 */
		tres_req.name = xstrdup_printf("%s:%s", gres_name, gres_type);
		tres_rec = assoc_mgr_find_tres_rec(&tres_req);
		xfree(tres_req.name);
		if (tres_rec)
			_gres_add_2_tres_str(tres_str, tres_rec, count);
	}
}
/*
* Given a job's GRES data structure, return a simple tres string of gres
* allocated on the node_inx requested
* IN job_gres_list - job's allocated GRES data structure
* IN node_inx - position of node in gres_js->gres_cnt_node_alloc
* IN locked - if the assoc_mgr tres read locked is locked or not
*
* RET - simple string containing gres this job is allocated on the node
* requested.
*/
extern char *gres_stepmgr_gres_on_node_as_tres(
	list_t *job_gres_list, int node_inx, bool locked)
{
	list_itr_t *job_gres_iter;
	gres_state_t *gres_state_job;
	char *tres_str = NULL;
	assoc_mgr_lock_t locks = { .tres = READ_LOCK };

	if (!job_gres_list)	/* No GRES allocated */
		return NULL;

	/* must be locked first before gres_contrex_lock!!! */
	if (!locked)
		assoc_mgr_lock(&locks);

	job_gres_iter = list_iterator_create(job_gres_list);
	while ((gres_state_job = list_next(job_gres_iter))) {
		uint64_t count;
		gres_job_state_t *gres_js =
			(gres_job_state_t *)gres_state_job->gres_data;
		if (!gres_js->gres_bit_alloc)
			continue;

		/*
		 * Valid node indexes are 0 .. node_cnt - 1. The previous
		 * test (node_inx > node_cnt) let node_inx == node_cnt
		 * through, reading one past gres_cnt_node_alloc[].
		 */
		if (node_inx >= gres_js->node_cnt)
			break;

		if (!gres_state_job->gres_name) {
			debug("%s: couldn't find name", __func__);
			continue;
		}

		/* Defensive: no per-node counts recorded for this GRES. */
		if (!gres_js->gres_cnt_node_alloc)
			continue;

		/* If we are no_consume, print a 0 */
		if (gres_js->total_gres == NO_CONSUME_VAL64)
			count = 0;
		else if (gres_js->gres_cnt_node_alloc[node_inx])
			count = gres_js->gres_cnt_node_alloc[node_inx];
		else	/* If this gres isn't on the node skip it */
			continue;

		_gres_2_tres_str_internal(&tres_str,
					  gres_state_job->gres_name,
					  gres_js->type_name,
					  count);
	}
	list_iterator_destroy(job_gres_iter);

	if (!locked)
		assoc_mgr_unlock(&locks);

	return tres_str;
}
/*
 * Determine how many CPUs the step may use on one node given its GRES
 * request and the GRES available there (gres_cnt).
 * IN/OUT gres_ss - step's GRES record; gross_gres/total_gres accumulated
 * IN first_step_node - true on the first node tested for this step
 * IN cpus_per_task - CPUs required per task (caller guarantees >= 1)
 * IN max_rem_nodes - maximum nodes still usable by the step
 * IN ignore_alloc - if true, ignore other steps' existing allocations
 * IN gres_cnt - GRES available to the step on this node, NO_VAL64 if
 *	unknown
 * IN test_mem - if true, also validate mem_per_gres against node memory
 * IN node_offset - node's index into job_resrcs_ptr memory arrays
 * IN step_id - step identity, for logging
 * IN job_resrcs_ptr - job's resource allocation (memory limits/usage)
 * OUT err_code - ESLURM_INVALID_TASK_MEMORY when memory is the blocker
 * RET 0 if the node is unusable, NO_VAL64 if no CPU limit is implied,
 *	else the CPU count this GRES permits
 */
static uint64_t _step_test(gres_step_state_t *gres_ss, bool first_step_node,
			   uint16_t cpus_per_task, int max_rem_nodes,
			   bool ignore_alloc, uint64_t gres_cnt, bool test_mem,
			   int node_offset, slurm_step_id_t *step_id,
			   job_resources_t *job_resrcs_ptr, int *err_code)
{
	uint64_t cpu_cnt, min_gres = 1, task_cnt;

	xassert(gres_ss);
	if (!gres_cnt)
		return 0;

	/* Restart the running totals on the step's first node. */
	if (first_step_node) {
		gres_ss->gross_gres = 0;
		gres_ss->total_gres = 0;
	}

	/* Minimum GRES this node must supply for the step to use it. */
	if (gres_ss->gres_per_node)
		min_gres = gres_ss->gres_per_node;
	if (gres_ss->gres_per_socket)
		min_gres = MAX(min_gres, gres_ss->gres_per_socket);
	if (gres_ss->gres_per_task)
		min_gres = MAX(min_gres, gres_ss->gres_per_task);
	/* Last possible node: it must cover the rest of gres_per_step. */
	if (gres_ss->gres_per_step &&
	    (gres_ss->gres_per_step > gres_ss->total_gres) &&
	    (max_rem_nodes == 1)) {
		uint64_t gres_per_step = gres_ss->gres_per_step;
		if (ignore_alloc)
			gres_per_step -= gres_ss->gross_gres;
		else
			gres_per_step -= gres_ss->total_gres;
		min_gres = MAX(min_gres, gres_per_step);
	}

	/* Translate the available GRES into a CPU limit. */
	if (gres_cnt != NO_VAL64) {
		uint16_t cpus_per_gres = gres_ss->cpus_per_gres;
		if (min_gres > gres_cnt) {
			cpu_cnt = 0;
		} else if (cpus_per_gres && (cpus_per_gres != NO_VAL16)) {
			cpu_cnt = cpus_per_gres * gres_cnt;
		} else if (gres_ss->gres_per_task) {
			/* Round tasks up, then convert to CPUs. */
			task_cnt = (gres_cnt + gres_ss->gres_per_task - 1)
				/ gres_ss->gres_per_task;
			cpu_cnt = task_cnt * cpus_per_task;
		} else
			cpu_cnt = NO_VAL64;
	} else {
		gres_cnt = 0;
		cpu_cnt = NO_VAL64;
	}

	/* Test if there is enough memory available to run the step. */
	if (test_mem && cpu_cnt && gres_cnt && gres_ss->mem_per_gres &&
	    (gres_ss->mem_per_gres != NO_VAL64)) {
		uint64_t mem_per_gres, mem_req, mem_avail;

		mem_per_gres = gres_ss->mem_per_gres;
		mem_req = min_gres * mem_per_gres;
		mem_avail = job_resrcs_ptr->memory_allocated[node_offset];
		if (!ignore_alloc)
			mem_avail -= job_resrcs_ptr->memory_used[node_offset];

		if (mem_avail < mem_req) {
			log_flag(STEPS, "%s: JobId=%u: Usable memory on node: %"PRIu64" is less than requested %"PRIu64", skipping the node",
				 __func__, step_id->job_id, mem_avail,
				 mem_req);
			cpu_cnt = 0;
			*err_code = ESLURM_INVALID_TASK_MEMORY;
		}
	}

	/* Accumulate what this node would contribute to the step. */
	if (cpu_cnt != 0) {
		if (ignore_alloc)
			gres_ss->gross_gres += gres_cnt;
		else
			gres_ss->total_gres += gres_cnt;
	}

	return cpu_cnt;
}
/*
 * list_for_each() callback: accumulate into foreach_gres_cnt->gres_cnt the
 * count of a job's GRES available to a step on one node.
 * Sentinel results: INFINITE64 = no matching job GRES record found,
 * NO_CONSUME_VAL64 = no-consume GRES, NO_VAL64 = job lacks count data.
 * Returns -1 to stop iteration on a terminal condition, else 0.
 */
static int _step_get_gres_cnt(void *x, void *arg)
{
	gres_state_t *gres_state_job = (gres_state_t *)x;
	foreach_gres_cnt_t *foreach_gres_cnt = (foreach_gres_cnt_t *)arg;
	gres_job_state_t *gres_js;
	gres_key_t *job_search_key = foreach_gres_cnt->job_search_key;
	bool ignore_alloc = foreach_gres_cnt->ignore_alloc;
	slurm_step_id_t *step_id = foreach_gres_cnt->step_id;
	int node_offset = job_search_key->node_offset;

	/* This isn't the gres we are looking for */
	if (!gres_find_job_by_key_with_cnt(gres_state_job, job_search_key))
		return 0;

	/* This is the first time we have found a matching GRES. */
	if (foreach_gres_cnt->gres_cnt == INFINITE64)
		foreach_gres_cnt->gres_cnt = 0;

	gres_js = gres_state_job->gres_data;

	/* No-consume GRES: availability is effectively unlimited. */
	if (gres_js->total_gres == NO_CONSUME_VAL64) {
		foreach_gres_cnt->gres_cnt = NO_CONSUME_VAL64;
		return -1;
	}

	if ((node_offset >= gres_js->node_cnt)) {
		error("gres/%s: %s %ps node offset invalid (%d >= %u)",
		      gres_state_job->gres_name, __func__, step_id,
		      node_offset, gres_js->node_cnt);
		foreach_gres_cnt->gres_cnt = 0;
		return -1;
	}

	/*
	 * Prefer per-device bitmaps (exclusive GRES); shared or count-only
	 * allocations fall back to the per-node counters.
	 */
	if (!gres_id_shared(job_search_key->config_flags) &&
	    gres_js->gres_bit_alloc &&
	    gres_js->gres_bit_alloc[node_offset]) {
		foreach_gres_cnt->gres_cnt += bit_set_count(
			gres_js->gres_bit_alloc[node_offset]);

		/* Unless told to ignore them, subtract other steps' usage. */
		if (!ignore_alloc &&
		    gres_js->gres_bit_step_alloc &&
		    gres_js->gres_bit_step_alloc[node_offset]) {
			foreach_gres_cnt->gres_cnt -=
				bit_set_count(gres_js->
					      gres_bit_step_alloc[node_offset]);
		}
	} else if (gres_js->gres_cnt_node_alloc &&
		   gres_js->gres_cnt_step_alloc) {
		foreach_gres_cnt->gres_cnt +=
			gres_js->gres_cnt_node_alloc[node_offset];
		if (!ignore_alloc) {
			foreach_gres_cnt->gres_cnt -= gres_js->
				gres_cnt_step_alloc[node_offset];
		}
	} else {
		debug3("gres/%s:%s: %s %ps gres_bit_alloc and gres_cnt_node_alloc are NULL",
		       gres_state_job->gres_name, gres_js->type_name,
		       __func__, step_id);
		foreach_gres_cnt->gres_cnt = NO_VAL64;
		return -1;
	}
	return 0;
}
/*
 * Determine how many CPUs a step may use on one node, limited by its GRES
 * requests against the job's allocation there.
 * IN args - see gres_stepmgr_step_test_args_t; args->err_code is set to
 *	SLURM_SUCCESS or the blocking error (e.g. memory)
 * RET NO_VAL64 if no CPU limit applies, 0 if the node is unusable by the
 *	step, else the CPU limit from the most restrictive GRES
 */
extern uint64_t gres_stepmgr_step_test(gres_stepmgr_step_test_args_t *args)
{
	uint64_t cpu_cnt, tmp_cnt;
	uint16_t cpus_per_task = args->cpus_per_task;
	list_itr_t *step_gres_iter;
	gres_state_t *gres_state_step;
	gres_step_state_t *gres_ss = NULL;
	slurm_step_id_t tmp_step_id;
	foreach_gres_cnt_t foreach_gres_cnt;

	if (args->step_gres_list == NULL)
		return NO_VAL64;
	if (args->job_gres_list == NULL)
		return 0;

	if (cpus_per_task == 0)
		cpus_per_task = 1;
	cpu_cnt = NO_VAL64;
	(void) gres_init();
	*(args->err_code) = SLURM_SUCCESS;

	tmp_step_id.job_id = args->job_id;
	tmp_step_id.step_het_comp = NO_VAL;
	tmp_step_id.step_id = args->step_id;

	memset(&foreach_gres_cnt, 0, sizeof(foreach_gres_cnt));
	foreach_gres_cnt.ignore_alloc = args->ignore_alloc;
	foreach_gres_cnt.step_id = &tmp_step_id;

	/* Evaluate each requested GRES; keep the tightest CPU limit. */
	step_gres_iter = list_iterator_create(args->step_gres_list);
	while ((gres_state_step = (gres_state_t *) list_next(step_gres_iter))) {
		gres_key_t job_search_key;

		gres_ss = (gres_step_state_t *)gres_state_step->gres_data;
		job_search_key.config_flags = gres_state_step->config_flags;
		job_search_key.plugin_id = gres_state_step->plugin_id;
		if (gres_ss->type_name)
			job_search_key.type_id = gres_ss->type_id;
		else
			job_search_key.type_id = NO_VAL;
		job_search_key.node_offset = args->node_offset;

		/* Count the job's matching GRES available on this node. */
		foreach_gres_cnt.job_search_key = &job_search_key;
		foreach_gres_cnt.gres_cnt = INFINITE64;
		(void)list_for_each(args->job_gres_list, _step_get_gres_cnt,
				    &foreach_gres_cnt);

		/* INFINITE64 sentinel: no matching job GRES at all. */
		if (foreach_gres_cnt.gres_cnt == INFINITE64) {
			log_flag(STEPS, "%s: Job lacks GRES (%s:%s) required by the step",
				 __func__, gres_state_step->gres_name,
				 gres_ss->type_name);
			cpu_cnt = 0;
			break;
		}
		/* No-consume GRES imposes no CPU limit. */
		if (foreach_gres_cnt.gres_cnt == NO_CONSUME_VAL64) {
			cpu_cnt = NO_VAL64;
			break;
		}
		tmp_cnt = _step_test(gres_ss, args->first_step_node,
				     cpus_per_task, args->max_rem_nodes,
				     args->ignore_alloc,
				     foreach_gres_cnt.gres_cnt,
				     args->test_mem, args->node_offset,
				     &tmp_step_id,
				     args->job_resrcs_ptr, args->err_code);
		if ((tmp_cnt != NO_VAL64) && (tmp_cnt < cpu_cnt))
			cpu_cnt = tmp_cnt;
		if (cpu_cnt == 0)
			break;
	}
	list_iterator_destroy(step_gres_iter);

	return cpu_cnt;
}
/*
 * Convert a job's or step's allocated GRES list into a simple TRES string
 * ("id=count,...").
 * IN gres_list - job or step GRES records
 * IN locked - true if the caller already holds the assoc_mgr TRES read lock
 * RET xmalloc'd string (caller must xfree) or NULL
 */
extern char *gres_stepmgr_gres_2_tres_str(list_t *gres_list, bool locked)
{
	list_itr_t *itr;
	gres_state_t *gres_state_ptr;
	char *tres_str = NULL;
	assoc_mgr_lock_t locks = { .tres = READ_LOCK };

	if (!gres_list)
		return NULL;

	/* must be locked first before gres_contrex_lock!!! */
	if (!locked)
		assoc_mgr_lock(&locks);

	itr = list_iterator_create(gres_list);
	while ((gres_state_ptr = list_next(itr))) {
		char *type_name;
		uint64_t cnt;

		/* Pull the type name and total from the record's state. */
		if (gres_state_ptr->state_type == GRES_STATE_TYPE_JOB) {
			gres_job_state_t *gres_js =
				gres_state_ptr->gres_data;
			type_name = gres_js->type_name;
			cnt = gres_js->total_gres;
		} else if (gres_state_ptr->state_type ==
			   GRES_STATE_TYPE_STEP) {
			gres_step_state_t *gres_ss =
				gres_state_ptr->gres_data;
			type_name = gres_ss->type_name;
			cnt = gres_ss->total_gres;
		} else {
			error("%s: unsupported state type %d", __func__,
			      gres_state_ptr->state_type);
			continue;
		}

		/* If we are no_consume, print a 0 */
		if (cnt == NO_CONSUME_VAL64)
			cnt = 0;

		_gres_2_tres_str_internal(&tres_str,
					  gres_state_ptr->gres_name,
					  type_name, cnt);
	}
	list_iterator_destroy(itr);

	if (!locked)
		assoc_mgr_unlock(&locks);

	return tres_str;
}
/*
 * Increment indexes to next round-robin index
 * IN/OUT cur_inx - bitmap index
 * IN/OUT node_inx - job node index (pass -1 to start a new scan)
 * IN len - length of the bitmap being walked
 * IN node_cnt - number of nodes in the job allocation
 * IN nodes_bitmap - set bits identify the usable nodes
 * IN start_inx - bitmap index where the round-robin scan began
 * RET SLURM_SUCCESS with updated indexes, or SLURM_ERROR once the scan
 *	has wrapped back to start_inx (the normal termination case)
 */
static int _gres_next_node_inx(int *cur_inx, int *node_inx, int len,
			       int node_cnt, bitstr_t *nodes_bitmap,
			       int start_inx)
{
	bool wrapped = false;

	xassert(cur_inx);
	xassert(node_inx);
	xassert(nodes_bitmap);

	if (!len)
		return SLURM_ERROR;

	if (*node_inx == -1) {
		/* First call: begin the scan at start_inx. */
		if (start_inx)
			*node_inx += bit_set_count_range(nodes_bitmap, 0,
							 start_inx);
		*cur_inx = start_inx;
	} else {
		/* Advance one position, wrapping at the end of the bitmap. */
		*cur_inx = (*cur_inx + 1) % len;
		wrapped = *cur_inx <= start_inx;
		if (*cur_inx == start_inx)
			return SLURM_ERROR; /* Normal break case */
	}

	/* Skip ahead to the next usable node. */
	*cur_inx = bit_ffs_from_bit(nodes_bitmap, *cur_inx);
	/* After wrapping, stopping at/past start_inx means we are done. */
	if (wrapped && (*cur_inx >= start_inx))
		return SLURM_ERROR; /* Normal break case */
	if (*cur_inx < 0) {
		xassert(false);
		return SLURM_ERROR; /* This should never happen */
	}

	*node_inx = (*node_inx + 1) % node_cnt;
	return SLURM_SUCCESS;
}
/*
* If a step gres request used gres_per_step it must be tested more than just in
* gres_stepmgr_step_test. This function only acts when gres_per_step is used
* IN step_gres_list - step's requested GRES data structure
* IN job_ptr - Job data
* IN/OUT nodes_avail - Bitstring of nodes available for this step to use
* IN min_nodes - minimum nodes required for this step
*/
extern void gres_stepmgr_step_test_per_step(
	list_t *step_gres_list,
	job_record_t *job_ptr,
	bitstr_t *nodes_avail,
	int min_nodes)
{
	list_itr_t *step_gres_iter;
	gres_state_t *gres_state_step;
	slurm_step_id_t tmp_step_id;
	foreach_gres_cnt_t foreach_gres_cnt;
	bitstr_t *node_bitmap = job_ptr->job_resrcs->node_bitmap;
	int i_first, bit_len;

	if (!step_gres_list)
		return;
	if (!job_ptr->gres_list_alloc)
		return;

	(void) gres_init();

	/* Resume the round-robin scan where the previous step left off. */
	i_first = job_ptr->job_resrcs->next_step_node_inx;
	bit_len = bit_fls(node_bitmap) + 1;
	if (i_first >= bit_len)
		i_first = 0;

	tmp_step_id.job_id = job_ptr->job_id;
	tmp_step_id.step_het_comp = NO_VAL;
	tmp_step_id.step_id = NO_VAL;

	memset(&foreach_gres_cnt, 0, sizeof(foreach_gres_cnt));
	foreach_gres_cnt.ignore_alloc = false;
	foreach_gres_cnt.step_id = &tmp_step_id;

	step_gres_iter = list_iterator_create(step_gres_list);
	while ((gres_state_step = list_next(step_gres_iter))) {
		gres_key_t job_search_key;
		int32_t *gres_cnts;
		int gres_req, limit;
		bitstr_t *nodes_picked;
		gres_step_state_t *gres_ss = gres_state_step->gres_data;

		/* Only --gres-per-step requests need this extra pass. */
		if (!gres_ss->gres_per_step)
			continue;

		gres_req = gres_ss->gres_per_step;
		/* Ideal per-node share if spread over the minimum nodes. */
		limit = ROUNDUP(gres_req, min_nodes);

		job_search_key.config_flags = gres_state_step->config_flags;
		job_search_key.plugin_id = gres_state_step->plugin_id;
		if (gres_ss->type_name)
			job_search_key.type_id = gres_ss->type_id;
		else
			job_search_key.type_id = NO_VAL;
		foreach_gres_cnt.job_search_key = &job_search_key;

		nodes_picked = bit_alloc(bit_size(nodes_avail));
		/*
		 * Size the allocation from the element type; it was
		 * previously over-allocated with sizeof(uint64_t) for an
		 * int32_t array.
		 * NOTE(review): the counts cached below are uint64_t values
		 * truncated into int32_t - confirm they cannot exceed
		 * INT32_MAX in practice.
		 */
		gres_cnts = xcalloc(job_ptr->node_cnt, sizeof(*gres_cnts));
		for (int node_inx = 0; node_inx < job_ptr->node_cnt; node_inx++)
			gres_cnts[node_inx] = NO_VAL; /* not computed yet */

		/*
		 * Select nodes until enough gres has been allocated.
		 * Starting with nodes that have an equal share available each,
		 */
		while (limit >= 0) {
			int next_smallest = -1;
			int i, node_inx = -1;
			while (_gres_next_node_inx(&i, &node_inx, bit_len,
						   job_ptr->job_resrcs->nhosts,
						   node_bitmap, i_first) ==
			       SLURM_SUCCESS) {
				if (!bit_test(nodes_avail, i) ||
				    bit_test(nodes_picked, i))
					continue;

				/* Only calculate gres cnt once */
				if (gres_cnts[node_inx] == NO_VAL) {
					job_search_key.node_offset = node_inx;
					foreach_gres_cnt.gres_cnt = INFINITE64;
					(void) list_for_each(
						job_ptr->gres_list_alloc,
						_step_get_gres_cnt,
						&foreach_gres_cnt);
					gres_cnts[node_inx] =
						foreach_gres_cnt.gres_cnt;
				}

				/* Take nodes meeting the current threshold. */
				if (gres_cnts[node_inx] >= limit) {
					bit_set(nodes_picked, i);
					gres_req -= gres_cnts[node_inx];
				} else if (gres_cnts[node_inx] >
					   next_smallest) {
					next_smallest = gres_cnts[node_inx];
				}

				if ((gres_req <= 0) &&
				    (bit_set_count(nodes_picked) >=
				     min_nodes)) {
					/* Done: restrict to picked nodes. */
					bit_and(nodes_avail, nodes_picked);
					next_smallest = -1; /* exit loop */
					break;
				}
			}
			/* Relax the threshold to the next-largest count. */
			limit = next_smallest;
		}
		FREE_NULL_BITMAP(nodes_picked);
		xfree(gres_cnts);
	}
	list_iterator_destroy(step_gres_iter);
}