/*****************************************************************************\
* gres.c - driver for gres plugin
*****************************************************************************
* Copyright (C) 2010 Lawrence Livermore National Security.
* Portions Copyright (C) 2014-2019 SchedMD LLC
 * Produced at Lawrence Livermore National Laboratory (cf. DISCLAIMER).
* Written by Morris Jette <jette1@llnl.gov>
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "config.h"
#define _GNU_SOURCE
#ifdef __FreeBSD__
# include <sys/param.h>
# include <sys/cpuset.h>
typedef cpuset_t cpu_set_t;
#endif
#include <ctype.h>
#include <inttypes.h>
#include <limits.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef MAJOR_IN_MKDEV
# include <sys/mkdev.h>
#endif
#ifdef MAJOR_IN_SYSMACROS
# include <sys/sysmacros.h>
#endif
#include <math.h>
#ifdef __NetBSD__
#define CPU_ZERO(c) cpuset_zero(*(c))
#define CPU_ISSET(i,c) cpuset_isset((i),*(c))
#define sched_getaffinity sched_getaffinity_np
#endif
#include "slurm/slurm.h"
#include "slurm/slurm_errno.h"
#include "src/common/assoc_mgr.h"
#include "src/common/bitstring.h"
#include "src/common/gres.h"
#include "src/common/job_resources.h"
#include "src/common/list.h"
#include "src/common/log.h"
#include "src/common/macros.h"
#include "src/common/node_conf.h"
#include "src/common/node_select.h"
#include "src/common/pack.h"
#include "src/common/parse_config.h"
#include "src/common/plugin.h"
#include "src/common/plugrack.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/strlcpy.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#define MAX_GRES_BITMAP 1024
strong_alias(gres_gresid_to_gresname, slurm_gres_gresid_to_gresname);
strong_alias(gres_get_node_used, slurm_gres_get_node_used);
strong_alias(gres_get_system_cnt, slurm_gres_get_system_cnt);
strong_alias(gres_get_value_by_type, slurm_gres_get_value_by_type);
strong_alias(gres_get_job_info, slurm_gres_get_job_info);
strong_alias(gres_build_job_details, slurm_gres_build_job_details);
strong_alias(gres_get_step_info, slurm_gres_get_step_info);
strong_alias(gres_get_step_state, slurm_gres_get_step_state);
strong_alias(gres_get_job_state, slurm_gres_get_job_state);
strong_alias(gres_2_tres_str, slurm_gres_2_tres_str);
strong_alias(gres_set_job_tres_cnt, slurm_gres_set_job_tres_cnt);
strong_alias(gres_set_node_tres_cnt, slurm_gres_set_node_tres_cnt);
strong_alias(gres_device_major, slurm_gres_device_major);
strong_alias(destroy_gres_device, slurm_destroy_gres_device);
strong_alias(destroy_gres_slurmd_conf, slurm_destroy_gres_slurmd_conf);
/* Gres symbols provided by the plugin */
typedef struct slurm_gres_ops {
int (*node_config_load) ( List gres_conf_list,
node_config_load_t *node_conf);
void (*job_set_env) ( char ***job_env_ptr,
void *gres_ptr, int node_inx );
void (*step_set_env) ( char ***job_env_ptr,
void *gres_ptr );
void (*step_reset_env) ( char ***job_env_ptr,
void *gres_ptr,
bitstr_t *usable_gres );
void (*send_stepd) ( int fd );
void (*recv_stepd) ( int fd );
int (*job_info) ( gres_job_state_t *job_gres_data,
uint32_t node_inx,
enum gres_job_data_type data_type,
void *data);
int (*step_info) ( gres_step_state_t *step_gres_data,
uint32_t node_inx,
enum gres_step_data_type data_type,
void *data);
List (*get_devices) ( void );
void (*step_hardware_init) ( bitstr_t *, char * );
void (*step_hardware_fini) ( void );
gres_epilog_info_t *(*epilog_build_env)(gres_job_state_t *gres_job_ptr);
void (*epilog_set_env) ( char ***epilog_env_ptr,
gres_epilog_info_t *epilog_info,
int node_inx );
} slurm_gres_ops_t;
/*
* Gres plugin context, one for each gres type.
* Add to gres_context through _add_gres_context().
*/
typedef struct slurm_gres_context {
plugin_handle_t cur_plugin;
uint8_t config_flags; /* See GRES_CONF_* in gres.h */
char * gres_name; /* name (e.g. "gpu") */
char * gres_name_colon; /* name + colon (e.g. "gpu:") */
int gres_name_colon_len; /* size of gres_name_colon */
char * gres_type; /* plugin name (e.g. "gres/gpu") */
slurm_gres_ops_t ops; /* pointers to plugin symbols */
uint32_t plugin_id; /* key for searches */
plugrack_t *plugin_list; /* plugrack info */
uint64_t total_cnt; /* Total GRES across all nodes */
} slurm_gres_context_t;
/* Generic gres data structure for adding to a list. Depending upon the
* context, gres_data points to gres_node_state_t, gres_job_state_t or
* gres_step_state_t */
typedef struct gres_state {
uint32_t plugin_id;
void *gres_data;
} gres_state_t;
typedef struct gres_search_key {
int node_offset;
uint32_t plugin_id;
uint32_t type_id;
} gres_key_t;
/* Pointers to functions in src/slurmd/common/xcpuinfo.h that we may use */
typedef struct xcpuinfo_funcs {
int (*xcpuinfo_abs_to_mac) (char *abs, char **mac);
} xcpuinfo_funcs_t;
xcpuinfo_funcs_t xcpuinfo_ops;
/* Local variables */
static int gres_context_cnt = -1;
static uint32_t gres_cpu_cnt = 0;
static bool gres_debug = false;
static slurm_gres_context_t *gres_context = NULL;
static char *gres_node_name = NULL;
static char *gres_plugin_list = NULL;
static pthread_mutex_t gres_context_lock = PTHREAD_MUTEX_INITIALIZER;
static List gres_conf_list = NULL;
static bool init_run = false;
static bool have_gpu = false, have_mps = false;
static uint32_t gpu_plugin_id = NO_VAL, mps_plugin_id = NO_VAL;
static volatile uint32_t autodetect_types = GRES_AUTODETECT_NONE;
static uint32_t select_plugin_type = NO_VAL;
/* Local functions */
static void _add_gres_context(char *gres_name);
static gres_node_state_t *
_build_gres_node_state(void);
static void _build_node_gres_str(List *gres_list, char **gres_str,
int cores_per_sock, int sock_per_node);
static uint32_t **_build_tasks_per_node_sock(struct job_resources *job_res,
uint8_t overcommit,
gres_mc_data_t *tres_mc_ptr,
node_record_t *node_table_ptr);
static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size);
static void _epilog_list_del(void *x);
static int _find_job_by_sock_gres(void *x, void *key);
static int _find_sock_by_job_gres(void *x, void *key);
static void _free_tasks_per_node_sock(uint32_t **tasks_per_node_socket,
int node_cnt);
static void _get_gres_cnt(gres_node_state_t *gres_data, char *orig_config,
char *gres_name, char *gres_name_colon,
int gres_name_colon_len);
static uint32_t _get_task_cnt_node(uint32_t **tasks_per_node_socket,
int node_inx, int sock_cnt);
static uint64_t _get_tot_gres_cnt(uint32_t plugin_id, uint64_t *topo_cnt,
int *config_type_cnt);
static int _gres_find_id(void *x, void *key);
static int _gres_find_job_by_key(void *x, void *key);
static int _gres_find_step_by_key(void *x, void *key);
static void _gres_job_list_delete(void *list_element);
static int _job_alloc(void *job_gres_data, void *node_gres_data,
int node_cnt, int node_index, int node_offset,
char *gres_name, uint32_t job_id, char *node_name,
bitstr_t *core_bitmap, uint32_t plugin_id,
uint32_t user_id);
static void _job_core_filter(void *job_gres_data, void *node_gres_data,
bool use_total_gres, bitstr_t *core_bitmap,
int core_start_bit, int core_end_bit,
char *gres_name, char *node_name,
uint32_t plugin_id);
static int _job_dealloc(void *job_gres_data, void *node_gres_data,
int node_offset, char *gres_name, uint32_t job_id,
char *node_name, bool old_job, uint32_t plugin_id,
uint32_t user_id, bool job_fini);
static void _job_state_delete(void *gres_data);
static void * _job_state_dup(void *gres_data);
static void * _job_state_dup2(void *gres_data, int node_index);
static void _job_state_log(void *gres_data, uint32_t job_id,
uint32_t plugin_id);
static uint32_t _job_test(void *job_gres_data, void *node_gres_data,
bool use_total_gres, bitstr_t *core_bitmap,
int core_start_bit, int core_end_bit, bool *topo_set,
uint32_t job_id, char *node_name, char *gres_name,
uint32_t plugin_id, bool disable_binding);
static int _load_gres_plugin(slurm_gres_context_t *plugin_context);
static int _log_gres_slurmd_conf(void *x, void *arg);
static void _my_stat(char *file_name);
static int _node_config_init(char *node_name, char *orig_config,
slurm_gres_context_t *context_ptr,
gres_state_t *gres_ptr);
static char * _node_gres_used(void *gres_data, char *gres_name);
static int _node_reconfig(char *node_name, char *new_gres, char **gres_str,
gres_state_t *gres_ptr, bool config_overrides,
slurm_gres_context_t *context_ptr,
bool *updated_gpu_cnt);
static int _node_reconfig_test(char *node_name, char *new_gres,
gres_state_t *gres_ptr,
slurm_gres_context_t *context_ptr);
static void _node_state_dealloc(gres_state_t *gres_ptr);
static void * _node_state_dup(void *gres_data);
static void _node_state_log(void *gres_data, char *node_name,
char *gres_name);
static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
const char *key, const char *value,
const char *line, char **leftover);
static int _parse_gres_config2(void **dest, slurm_parser_enum_t type,
const char *key, const char *value,
const char *line, char **leftover);
static bool _shared_gres(uint32_t plugin_id);
static bool _sharing_gres(uint32_t plugin_id);
static void _sock_gres_del(void *x);
static int _step_alloc(void *step_gres_data, void *job_gres_data,
uint32_t plugin_id, int node_offset,
bool first_step_node,
uint32_t job_id, uint32_t step_id,
uint16_t tasks_on_node, uint32_t rem_nodes);
static int _step_dealloc(gres_state_t *step_gres_ptr, List job_gres_list,
uint32_t job_id, uint32_t step_id);
static void * _step_state_dup(void *gres_data);
static void * _step_state_dup2(void *gres_data, int node_index);
static void _step_state_log(void *gres_data, uint32_t job_id,
uint32_t step_id, char *gres_name);
static uint64_t _step_test(void *step_gres_data, void *job_gres_data,
int node_offset, bool first_step_node,
uint16_t cpus_per_task, int max_rem_nodes,
bool ignore_alloc,
uint32_t job_id, uint32_t step_id,
uint32_t plugin_id);
static void _sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr,
gres_state_t *gpu_gres_ptr);
static int _unload_gres_plugin(slurm_gres_context_t *plugin_context);
static void _validate_slurm_conf(List slurm_conf_list,
slurm_gres_context_t *context_ptr);
static void _validate_gres_conf(List gres_conf_list,
slurm_gres_context_t *context_ptr);
static int _validate_file(char *path_name, char *gres_name);
static void _validate_links(gres_slurmd_conf_t *p);
static void _validate_gres_node_cores(gres_node_state_t *node_gres_ptr,
int cpus_ctld, char *node_name);
static int _valid_gres_type(char *gres_name, gres_node_state_t *gres_data,
bool config_overrides, char **reason_down);
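/*
 * Hash a GRES name into a 32-bit plugin_id: each byte of the name is
 * shifted left by a rotating 0/8/16/24 bits and summed. For example,
 * "gpu" yields 'g' + ('p' << 8) + ('u' << 16) = 0x757067.
 */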
extern uint32_t gres_plugin_build_id(char *name)
{
int i, j;
uint32_t id = 0;
if (!name)
return id;
for (i = 0, j = 0; name[i]; i++) {
id += (name[i] << j);
j = (j + 8) % 32;
}
return id;
}
static int _gres_find_id(void *x, void *key)
{
uint32_t *plugin_id = (uint32_t *)key;
gres_state_t *state_ptr = (gres_state_t *) x;
if (state_ptr->plugin_id == *plugin_id)
return 1;
return 0;
}
/* Find job record with matching name and type */
static int _gres_find_job_by_key(void *x, void *key)
{
gres_state_t *state_ptr = (gres_state_t *) x;
gres_key_t *job_key = (gres_key_t *) key;
gres_job_state_t *gres_data_ptr;
gres_data_ptr = (gres_job_state_t *)state_ptr->gres_data;
if ((state_ptr->plugin_id == job_key->plugin_id) &&
((job_key->type_id == NO_VAL) ||
(gres_data_ptr->type_id == job_key->type_id)))
return 1;
return 0;
}
/* Find job record with matching name and type */
static int _gres_find_job_by_key_with_cnt(void *x, void *key)
{
gres_state_t *state_ptr = (gres_state_t *) x;
gres_key_t *job_key = (gres_key_t *) key;
gres_job_state_t *gres_data_ptr;
gres_data_ptr = (gres_job_state_t *)state_ptr->gres_data;
if (!_gres_find_job_by_key(x, key))
return 0;
/* ignore count on no_consume gres */
if (!gres_data_ptr->node_cnt ||
gres_data_ptr->gres_cnt_node_alloc[job_key->node_offset])
return 1;
return 0;
}
static int _gres_find_step_by_key(void *x, void *key)
{
gres_state_t *state_ptr = (gres_state_t *) x;
gres_key_t *step_key = (gres_key_t *) key;
gres_step_state_t *gres_data_ptr;
gres_data_ptr = (gres_step_state_t *)state_ptr->gres_data;
if ((state_ptr->plugin_id == step_key->plugin_id) &&
(gres_data_ptr->type_id == step_key->type_id))
return 1;
return 0;
}
static int _gres_find_name_internal(char *name, char *key, uint32_t plugin_id)
{
if (!name) {
int i;
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].plugin_id == plugin_id) {
name = gres_context[i].gres_name;
break;
}
}
if (!name) {
debug("%s: couldn't find name", __func__);
return 0;
}
}
if (!xstrcmp(name, key))
return 1;
return 0;
}
static int _gres_job_find_name(void *x, void *key)
{
gres_state_t *state_ptr = (gres_state_t *) x;
gres_job_state_t *gres_data_ptr =
(gres_job_state_t *)state_ptr->gres_data;
return _gres_find_name_internal(gres_data_ptr->type_name, (char *)key,
state_ptr->plugin_id);
}
static int _gres_step_find_name(void *x, void *key)
{
gres_state_t *state_ptr = (gres_state_t *) x;
gres_step_state_t *gres_data_ptr =
(gres_step_state_t *)state_ptr->gres_data;
return _gres_find_name_internal(gres_data_ptr->type_name, (char *)key,
state_ptr->plugin_id);
}
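/*
 * Resolve plugin symbols for a gres type: first try a direct load of
 * gres/<name>; failing that, scan the plugin directory; if still not
 * found, fall back to count-only tracking.
 */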
static int _load_gres_plugin(slurm_gres_context_t *plugin_context)
{
/*
* Must be synchronized with slurm_gres_ops_t above.
*/
static const char *syms[] = {
"node_config_load",
"job_set_env",
"step_set_env",
"step_reset_env",
"send_stepd",
"recv_stepd",
"job_info",
"step_info",
"get_devices",
"step_hardware_init",
"step_hardware_fini",
"epilog_build_env",
"epilog_set_env"
};
int n_syms = sizeof(syms) / sizeof(char *);
/* Find the correct plugin */
if (plugin_context->config_flags & GRES_CONF_COUNT_ONLY) {
debug("Plugin of type %s only tracks gres counts",
plugin_context->gres_type);
return SLURM_SUCCESS;
}
plugin_context->cur_plugin = plugin_load_and_link(
plugin_context->gres_type,
n_syms, syms,
(void **) &plugin_context->ops);
if (plugin_context->cur_plugin != PLUGIN_INVALID_HANDLE)
return SLURM_SUCCESS;
if (errno != EPLUGIN_NOTFOUND) {
error("Couldn't load specified plugin name for %s: %s",
plugin_context->gres_type, plugin_strerror(errno));
return SLURM_ERROR;
}
debug("gres: Couldn't find the specified plugin name for %s looking "
"at all files", plugin_context->gres_type);
/* Get plugin list */
if (plugin_context->plugin_list == NULL) {
char *plugin_dir;
plugin_context->plugin_list = plugrack_create("gres");
plugin_dir = slurm_get_plugin_dir();
plugrack_read_dir(plugin_context->plugin_list, plugin_dir);
xfree(plugin_dir);
}
plugin_context->cur_plugin = plugrack_use_by_type(
plugin_context->plugin_list,
plugin_context->gres_type );
if (plugin_context->cur_plugin == PLUGIN_INVALID_HANDLE) {
debug("Cannot find plugin of type %s, just track gres counts",
plugin_context->gres_type);
plugin_context->config_flags |= GRES_CONF_COUNT_ONLY;
return SLURM_ERROR;
}
/* Dereference the API. */
if (plugin_get_syms(plugin_context->cur_plugin,
n_syms, syms,
(void **) &plugin_context->ops ) < n_syms ) {
error("Incomplete %s plugin detected",
plugin_context->gres_type);
return SLURM_ERROR;
}
return SLURM_SUCCESS;
}
static int _unload_gres_plugin(slurm_gres_context_t *plugin_context)
{
int rc;
/*
* Must check return code here because plugins might still
* be loaded and active.
*/
if (plugin_context->plugin_list)
rc = plugrack_destroy(plugin_context->plugin_list);
else {
rc = SLURM_SUCCESS;
plugin_unload(plugin_context->cur_plugin);
}
xfree(plugin_context->gres_name);
xfree(plugin_context->gres_name_colon);
xfree(plugin_context->gres_type);
return rc;
}
/*
* Add new gres context to gres_context array and load the plugin.
* Must hold gres_context_lock before calling.
*/
static void _add_gres_context(char *gres_name)
{
slurm_gres_context_t *plugin_context;
if (!gres_name || !gres_name[0])
fatal("%s: invalid empty gres_name", __func__);
xrecalloc(gres_context, (gres_context_cnt + 1),
sizeof(slurm_gres_context_t));
plugin_context = &gres_context[gres_context_cnt];
plugin_context->gres_name = xstrdup(gres_name);
plugin_context->plugin_id = gres_plugin_build_id(gres_name);
plugin_context->gres_type = xstrdup_printf("gres/%s", gres_name);
plugin_context->plugin_list = NULL;
plugin_context->cur_plugin = PLUGIN_INVALID_HANDLE;
gres_context_cnt++;
}
/*
* Initialize the GRES plugins.
*
* Returns a Slurm errno.
*/
extern int gres_plugin_init(void)
{
int i, j, rc = SLURM_SUCCESS;
char *last = NULL, *names, *one_name, *full_name;
char *sorted_names = NULL, *sep = "";
bool append_mps = false;
if (init_run && (gres_context_cnt >= 0))
return rc;
slurm_mutex_lock(&gres_context_lock);
if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
gres_debug = true;
else
gres_debug = false;
if (gres_context_cnt >= 0)
goto fini;
gres_plugin_list = slurm_get_gres_plugins();
gres_context_cnt = 0;
if ((gres_plugin_list == NULL) || (gres_plugin_list[0] == '\0'))
goto fini;
/* Ensure that "gres/mps" follows "gres/gpu" */
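/* e.g. GresTypes=mps,gpu is re-sorted below to gpu,mps before contexts
 * are built */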
have_gpu = false;
have_mps = false;
names = xstrdup(gres_plugin_list);
one_name = strtok_r(names, ",", &last);
while (one_name) {
bool skip_name = false;
if (!xstrcmp(one_name, "mps")) {
have_mps = true;
if (!have_gpu) {
append_mps = true; /* "mps" must follow "gpu" */
skip_name = true;
}
mps_plugin_id = gres_plugin_build_id("mps");
} else if (!xstrcmp(one_name, "gpu")) {
have_gpu = true;
gpu_plugin_id = gres_plugin_build_id("gpu");
}
if (!skip_name) {
xstrfmtcat(sorted_names, "%s%s", sep, one_name);
sep = ",";
}
one_name = strtok_r(NULL, ",", &last);
}
if (append_mps) {
if (!have_gpu)
fatal("GresTypes: gres/mps requires that gres/gpu also be configured");
xstrfmtcat(sorted_names, "%s%s", sep, "mps");
}
xfree(names);
gres_context_cnt = 0;
one_name = strtok_r(sorted_names, ",", &last);
while (one_name) {
full_name = xstrdup("gres/");
xstrcat(full_name, one_name);
for (i = 0; i < gres_context_cnt; i++) {
if (!xstrcmp(full_name, gres_context[i].gres_type))
break;
}
xfree(full_name);
if (i < gres_context_cnt) {
error("Duplicate plugin %s ignored",
gres_context[i].gres_type);
} else {
_add_gres_context(one_name);
}
one_name = strtok_r(NULL, ",", &last);
}
xfree(sorted_names);
/* Ensure that plugin_id is valid and unique */
for (i = 0; i < gres_context_cnt; i++) {
for (j = i + 1; j < gres_context_cnt; j++) {
if (gres_context[i].plugin_id !=
gres_context[j].plugin_id)
continue;
fatal("Gres: Duplicate plugin_id %u for %s and %s, "
"change gres name for one of them",
gres_context[i].plugin_id,
gres_context[i].gres_type,
gres_context[j].gres_type);
}
xassert(gres_context[i].gres_name);
gres_context[i].gres_name_colon =
xstrdup_printf("%s:", gres_context[i].gres_name);
gres_context[i].gres_name_colon_len =
strlen(gres_context[i].gres_name_colon);
}
init_run = true;
if ((select_plugin_type == NO_VAL) &&
(select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
&select_plugin_type) != SLURM_SUCCESS)) {
select_plugin_type = NO_VAL; /* error */
}
if (have_mps && running_in_slurmctld() &&
(select_plugin_type != SELECT_TYPE_CONS_TRES)) {
fatal("Use of gres/mps requires the use of select/cons_tres");
}
fini: slurm_mutex_unlock(&gres_context_lock);
return rc;
}
extern int gres_plugin_get_gres_cnt(void)
{
static int cnt = -1;
if (cnt != -1)
return cnt;
gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
cnt = gres_context_cnt;
slurm_mutex_unlock(&gres_context_lock);
return cnt;
}
/*
* Add a GRES record. This is used by the node_features plugin after the
* slurm.conf file is read and the initial GRES records are built by
* gres_plugin_init().
*/
extern void gres_plugin_add(char *gres_name)
{
int i;
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
if (!xstrcmp(gres_context[i].gres_name, gres_name))
goto fini;
}
_add_gres_context(gres_name);
fini: slurm_mutex_unlock(&gres_context_lock);
}
/* Given a gres_name, return its context index or -1 if not found */
static int _gres_name_context(char *gres_name)
{
int i;
for (i = 0; i < gres_context_cnt; i++) {
if (!xstrcmp(gres_context[i].gres_name, gres_name))
return i;
}
return -1;
}
/*
 * Take a GRES config line (typically from slurm.conf) and remove any
 * records for GRES that are not defined in GresTypes.
 * RET string of valid GRES; release memory using xfree()
 */
extern char *gres_plugin_name_filter(char *orig_gres, char *nodes)
{
char *new_gres = NULL, *save_ptr = NULL;
char *colon, *sep = "", *tmp, *tok, *name;
slurm_mutex_lock(&gres_context_lock);
if (!orig_gres || !orig_gres[0] || !gres_context_cnt) {
slurm_mutex_unlock(&gres_context_lock);
return new_gres;
}
tmp = xstrdup(orig_gres);
tok = strtok_r(tmp, ",", &save_ptr);
while (tok) {
name = xstrdup(tok);
if ((colon = strchr(name, ':')))
colon[0] = '\0';
if (_gres_name_context(name) != -1) {
xstrfmtcat(new_gres, "%s%s", sep, tok);
sep = ",";
} else {
/* Logging may not be initialized at this point */
error("Invalid GRES configured on node %s: %s", nodes,
tok);
}
xfree(name);
tok = strtok_r(NULL, ",", &save_ptr);
}
slurm_mutex_unlock(&gres_context_lock);
xfree(tmp);
return new_gres;
}
/*
* Terminate the gres plugin. Free memory.
*
* Returns a Slurm errno.
*/
extern int gres_plugin_fini(void)
{
int i, j, rc = SLURM_SUCCESS;
slurm_mutex_lock(&gres_context_lock);
xfree(gres_node_name);
if (gres_context_cnt < 0)
goto fini;
init_run = false;
for (i = 0; i < gres_context_cnt; i++) {
j = _unload_gres_plugin(gres_context + i);
if (j != SLURM_SUCCESS)
rc = j;
}
xfree(gres_context);
xfree(gres_plugin_list);
FREE_NULL_LIST(gres_conf_list);
gres_context_cnt = -1;
fini: slurm_mutex_unlock(&gres_context_lock);
return rc;
}
/*
**************************************************************************
* P L U G I N   C A L L S *
**************************************************************************
*/
/*
* Return a plugin-specific help message for salloc, sbatch and srun
* Result must be xfree()'d.
*
* NOTE: GRES "type" (e.g. model) information is only available from slurmctld
* after slurmd registers. It is not readily available from srun (as used here).
*/
extern char *gres_plugin_help_msg(void)
{
int i;
char *msg = xstrdup("Valid gres options are:\n");
gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
xstrcat(msg, gres_context[i].gres_name);
xstrcat(msg, "[[:type]:count]\n");
}
slurm_mutex_unlock(&gres_context_lock);
return msg;
}
/*
 * Perform reconfig, re-read any configuration files
 */
extern int gres_plugin_reconfig(void)
{
int rc = SLURM_SUCCESS;
char *plugin_names = slurm_get_gres_plugins();
bool plugin_change;
slurm_mutex_lock(&gres_context_lock);
if (slurm_get_debug_flags() & DEBUG_FLAG_GRES)
gres_debug = true;
else
gres_debug = false;
if (xstrcmp(plugin_names, gres_plugin_list))
plugin_change = true;
else
plugin_change = false;
slurm_mutex_unlock(&gres_context_lock);
if (plugin_change) {
error("GresPlugins changed from %s to %s ignored",
gres_plugin_list, plugin_names);
error("Restart the slurmctld daemon to change GresPlugins");
#if 0
/* This logic would load new plugins, but we need the old
* plugins to persist in order to process old state
* information. */
rc = gres_plugin_fini();
if (rc == SLURM_SUCCESS)
rc = gres_plugin_init();
#endif
}
xfree(plugin_names);
return rc;
}
/*
* Remove file-less GPUs from the final GRES list, since File is a requirement.
*/
static void _remove_fileless_gpus(List gres_conf_list,
slurm_gres_context_t *context_ptr)
{
gres_slurmd_conf_t *gres_conf;
ListIterator iter;
if (!gres_conf_list)
return;
/* Only work in the GPU plugin */
if (context_ptr->plugin_id != gres_plugin_build_id("gpu"))
return;
iter = list_iterator_create(gres_conf_list);
while ((gres_conf = list_next(iter))) {
if (gres_conf->plugin_id != context_ptr->plugin_id)
continue;
if (!gres_conf->file) {
debug("Removing file-less GPU %s:%s from final GRES list",
gres_conf->name, gres_conf->type_name);
list_delete_item(iter);
}
}
list_iterator_destroy(iter);
}
/*
* Log the contents of a gres_slurmd_conf_t record
*/
static int _log_gres_slurmd_conf(void *x, void *arg)
{
gres_slurmd_conf_t *p;
char *links = NULL;
int index = -1, offset, mult = 1;
p = (gres_slurmd_conf_t *) x;
xassert(p);
if (!gres_debug) {
verbose("Gres Name=%s Type=%s Count=%"PRIu64,
p->name, p->type_name, p->count);
return 0;
}
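/* Derive a zero-based device index from any trailing digits in the
 * File path (e.g. /dev/nvidia3 yields Index=3) */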
if (p->file) {
index = 0;
offset = strlen(p->file);
while (offset > 0) {
offset--;
if ((p->file[offset] < '0') || (p->file[offset] > '9'))
break;
index += (p->file[offset] - '0') * mult;
mult *= 10;
}
}
if (p->links)
xstrfmtcat(links, "Links=%s", p->links);
if (p->cpus && (index != -1)) {
info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u "
"File=%s Cores=%s CoreCnt=%u %s",
p->name, p->type_name, p->count, index, p->plugin_id,
p->file, p->cpus, p->cpu_cnt, links);
} else if (index != -1) {
info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u File=%s %s",
p->name, p->type_name, p->count, index, p->plugin_id,
p->file, links);
} else if (p->file) {
info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u File=%s %s",
p->name, p->type_name, p->count, p->plugin_id, p->file,
links);
} else {
info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u %s", p->name,
p->type_name, p->count, p->plugin_id, links);
}
xfree(links);
return 0;
}
/* Make sure that the specified file name exists; wait up to 20 seconds for
 * it to appear, or generate a fatal error and exit. */
static void _my_stat(char *file_name)
{
struct stat config_stat;
bool sent_msg = false;
int i;
if (!running_in_slurmdstepd())
return;
for (i = 0; i < 20; i++) {
if (i)
sleep(1);
if (stat(file_name, &config_stat) == 0) {
if (sent_msg)
info("gres.conf file %s now exists", file_name);
return;
}
if (errno != ENOENT)
break;
if (!sent_msg) {
error("Waiting for gres.conf file %s", file_name);
sent_msg = true;
}
}
fatal("can't stat gres.conf file %s: %m", file_name);
return;
}
static int _validate_file(char *path_name, char *gres_name)
{
char *file_name, *slash, *one_name, *root_path;
hostlist_t hl;
int i, file_count = 0;
i = strlen(path_name);
if ((i < 3) || (path_name[i-1] != ']')) {
_my_stat(path_name);
return 1;
}
slash = strrchr(path_name, '/');
if (slash) {
slash[0] = '\0';
root_path = xstrdup(path_name);
xstrcat(root_path, "/");
slash[0] = '/';
file_name = slash + 1;
} else {
file_name = path_name;
root_path = NULL;
}
hl = hostlist_create(file_name);
if (hl == NULL)
fatal("can't parse File=%s", path_name);
while ((one_name = hostlist_shift(hl))) {
if (slash) {
char *formatted_path = NULL;
xstrfmtcat(formatted_path, "%s/%s",
root_path, one_name);
_my_stat(formatted_path);
xfree(formatted_path);
} else {
_my_stat(one_name);
}
file_count++;
free(one_name);
}
hostlist_destroy(hl);
xfree(root_path);
return file_count;
}
/*
 * Validate the Links specification: it must be a comma-delimited list of
 * numbers in the range [-2, GRES_MAX_LINK]; otherwise the field is dropped
 */
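/* e.g. an NVML-generated record may carry Links=-1,16,16,16, where -1
 * conventionally marks the device's own position (illustrative values) */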
static void _validate_links(gres_slurmd_conf_t *p)
{
char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL;
long int val;
if (!p->links)
return;
if (p->links[0] == '\0') {
xfree(p->links);
return;
}
tmp = xstrdup(p->links);
tok = strtok_r(tmp, ",", &save_ptr);
while (tok) {
val = strtol(tok, &end_ptr, 10);
if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) ||
(end_ptr[0] != '\0')) {
error("gres.conf: Ignoring invalid Link (%s) for Name=%s",
tok, p->name);
xfree(p->links);
break;
}
tok = strtok_r(NULL, ",", &save_ptr);
}
xfree(tmp);
}
/*
* Return true if count can be greater than 1 for a given file.
* For example, each GPU can have arbitrary count of MPS elements.
*/
static bool _multi_count_per_file(char *name)
{
if (!xstrcmp(name, "mps"))
return true;
return false;
}
/*
* Build gres_slurmd_conf_t record based upon a line from the gres.conf file
*/
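/* e.g. a typical gres.conf line (illustrative):
 *   Name=gpu Type=tesla File=/dev/nvidia[0-3] Cores=0-15
 */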
static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
const char *key, const char *value,
const char *line, char **leftover)
{
static s_p_options_t _gres_options[] = {
{"Count", S_P_STRING}, /* Number of Gres available */
{"CPUs" , S_P_STRING}, /* CPUs to bind to Gres resource
* (deprecated, use Cores) */
{"Cores", S_P_STRING}, /* Cores to bind to Gres resource */
{"File", S_P_STRING}, /* Path to Gres device */
{"Files", S_P_STRING}, /* Path to Gres device */
{"Flags", S_P_STRING}, /* GRES Flags */
{"Link", S_P_STRING}, /* Communication link IDs */
{"Links", S_P_STRING}, /* Communication link IDs */
{"Name", S_P_STRING}, /* Gres name */
{"Type", S_P_STRING}, /* Gres type (e.g. model name) */
{NULL}
};
int i;
s_p_hashtbl_t *tbl;
gres_slurmd_conf_t *p;
uint64_t tmp_uint64, mult;
char *tmp_str, *last;
bool cores_flag = false, cpus_flag = false;
char *type_str = NULL;
tbl = s_p_hashtbl_create(_gres_options);
s_p_parse_line(tbl, *leftover, leftover);
p = xmalloc(sizeof(gres_slurmd_conf_t));
if (!value) {
if (!s_p_get_string(&p->name, "Name", tbl)) {
error("Invalid GRES data, no type name (%s)", line);
xfree(p);
s_p_hashtbl_destroy(tbl);
return 0;
}
} else {
p->name = xstrdup(value);
}
p->cpu_cnt = gres_cpu_cnt;
if (s_p_get_string(&p->cpus, "Cores", tbl)) {
cores_flag = true;
type_str = "Cores";
} else if (s_p_get_string(&p->cpus, "CPUs", tbl)) {
cpus_flag = true;
type_str = "CPUs";
}
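/* Translate the configured core list from abstract (Slurm) IDs to
 * machine (physical) IDs when xcpuinfo is available, then store it as
 * a bitmap of size gres_cpu_cnt */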
if (cores_flag || cpus_flag) {
char *local_cpus = NULL;
if (xcpuinfo_ops.xcpuinfo_abs_to_mac) {
i = (xcpuinfo_ops.xcpuinfo_abs_to_mac)
(p->cpus, &local_cpus);
/*
* Only executed by slurmstepd and we don't want
* fatal here. Ignore bad Core/CPU configuration.
*/
if (i != SLURM_SUCCESS) {
error("Invalid GRES data for %s, %s=%s",
p->name, type_str, p->cpus);
}
} else {
local_cpus = xstrdup(p->cpus);
i = SLURM_SUCCESS;
}
if (i == SLURM_SUCCESS) {
p->cpus_bitmap = bit_alloc(gres_cpu_cnt);
if ((bit_size(p->cpus_bitmap) == 0) ||
bit_unfmt(p->cpus_bitmap, local_cpus) != 0) {
fatal("Invalid GRES data for %s, %s=%s (only %u CPUs are available)",
p->name, type_str, p->cpus, gres_cpu_cnt);
}
}
xfree(local_cpus);
}
if (s_p_get_string(&p->file, "File", tbl) ||
s_p_get_string(&p->file, "Files", tbl)) {
p->count = _validate_file(p->file, p->name);
p->config_flags |= GRES_CONF_HAS_FILE;
}
if (s_p_get_string(&tmp_str, "Flags", tbl)) {
if (xstrcasestr(tmp_str, "CountOnly"))
p->config_flags |= GRES_CONF_COUNT_ONLY;
xfree(tmp_str);
}
if (s_p_get_string(&p->links, "Link", tbl) ||
s_p_get_string(&p->links, "Links", tbl)) {
_validate_links(p);
}
if (s_p_get_string(&p->type_name, "Type", tbl)) {
p->config_flags |= GRES_CONF_HAS_TYPE;
}
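/* Parse Count, honoring numeric suffixes via suffix_mult()
 * (e.g. Count=2K is treated as 2048) */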
if (s_p_get_string(&tmp_str, "Count", tbl)) {
tmp_uint64 = strtoll(tmp_str, &last, 10);
if ((tmp_uint64 == LONG_MIN) || (tmp_uint64 == LONG_MAX)) {
fatal("Invalid GRES record for %s, invalid count %s",
p->name, tmp_str);
}
if ((mult = suffix_mult(last)) != NO_VAL64) {
tmp_uint64 *= mult;
} else {
fatal("Invalid GRES record for %s, invalid count %s",
p->name, tmp_str);
}
/*
* Some GRES can have count > 1 for a given file. For example,
* each GPU can have arbitrary count of MPS elements.
*/
if (p->count && (p->count != tmp_uint64) &&
!_multi_count_per_file(p->name)) {
fatal("Invalid GRES record for %s, count does not match File value",
p->name);
}
if (tmp_uint64 >= NO_VAL64) {
fatal("GRES %s has invalid count value %"PRIu64,
p->name, tmp_uint64);
}
p->count = tmp_uint64;
xfree(tmp_str);
} else if (p->count == 0)
p->count = 1;
s_p_hashtbl_destroy(tbl);
for (i = 0; i < gres_context_cnt; i++) {
if (xstrcasecmp(p->name, gres_context[i].gres_name) == 0)
break;
}
if (i >= gres_context_cnt) {
error("Ignoring gres.conf record, invalid name: %s", p->name);
destroy_gres_slurmd_conf(p);
return 0;
}
p->plugin_id = gres_context[i].plugin_id;
*dest = (void *)p;
return 1;
}
static int _parse_gres_config2(void **dest, slurm_parser_enum_t type,
const char *key, const char *value,
const char *line, char **leftover)
{
static s_p_options_t _gres_options[] = {
{"Count", S_P_STRING}, /* Number of Gres available */
{"CPUs" , S_P_STRING}, /* CPUs to bind to Gres resource */
{"Cores", S_P_STRING}, /* Cores to bind to Gres resource */
{"File", S_P_STRING}, /* Path to Gres device */
{"Files", S_P_STRING}, /* Path to Gres device */
{"Flags", S_P_STRING}, /* GRES Flags */
{"Link", S_P_STRING}, /* Communication link IDs */
{"Links", S_P_STRING}, /* Communication link IDs */
{"Name", S_P_STRING}, /* Gres name */
{"Type", S_P_STRING}, /* Gres type (e.g. model name) */
{NULL}
};
s_p_hashtbl_t *tbl;
if (gres_node_name && value) {
bool match = false;
hostlist_t hl;
hl = hostlist_create(value);
if (hl) {
match = (hostlist_find(hl, gres_node_name) >= 0);
hostlist_destroy(hl);
}
if (!match) {
debug("skipping GRES for NodeName=%s %s", value, line);
tbl = s_p_hashtbl_create(_gres_options);
s_p_parse_line(tbl, *leftover, leftover);
s_p_hashtbl_destroy(tbl);
return 0;
}
}
return _parse_gres_config(dest, type, key, NULL, line, leftover);
}
static void _validate_slurm_conf(List slurm_conf_list,
slurm_gres_context_t *context_ptr)
{
ListIterator iter;
gres_state_t *gres_ptr;
if (!slurm_conf_list)
return;
iter = list_iterator_create(slurm_conf_list);
while ((gres_ptr = list_next(iter))) {
gres_node_state_t *slurm_gres;
uint64_t tmp_count = 0;
/* Only look at the GRES under the current plugin (same name) */
if (gres_ptr->plugin_id != context_ptr->plugin_id)
continue;
slurm_gres = (gres_node_state_t *)gres_ptr->gres_data;
/*
* gres_cnt_config should equal the combined count from
* type_cnt_avail if there are no untyped GRES
*/
for (uint16_t i = 0; i < slurm_gres->type_cnt; i++)
tmp_count += slurm_gres->type_cnt_avail[i];
/* Forbid mixing typed and untyped GRES under the same name */
if (slurm_gres->type_cnt &&
slurm_gres->gres_cnt_config > tmp_count)
fatal("%s: Some %s GRES in slurm.conf have a type while others do not (slurm_gres->gres_cnt_config (%"PRIu64") > tmp_count (%"PRIu64"))",
__func__, context_ptr->gres_name,
slurm_gres->gres_cnt_config, tmp_count);
}
}
static void _validate_gres_conf(List gres_conf_list,
slurm_gres_context_t *context_ptr)
{
ListIterator iter;
gres_slurmd_conf_t *gres_slurmd_conf;
int new_has_file = -1, new_has_type = -1, rec_count = 0;
bool orig_has_file, orig_has_type;
iter = list_iterator_create(gres_conf_list);
while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
if (gres_slurmd_conf->plugin_id != context_ptr->plugin_id)
continue;
/*
 * If any record for this plugin type has GRES_CONF_COUNT_ONLY set,
 * propagate it to all the others, since the context_ptr is used
 * from here on out.
 */
if (gres_slurmd_conf->config_flags & GRES_CONF_COUNT_ONLY)
context_ptr->config_flags |= GRES_CONF_COUNT_ONLY;
/*
 * Since there can be multiple records for the same plugin, make
 * sure we only load it once.
 */
if (!(context_ptr->config_flags & GRES_CONF_LOADED)) {
/*
* Ignore return code, as we will still support the gres
* with or without the plugin.
*/
if (_load_gres_plugin(context_ptr) == SLURM_SUCCESS)
context_ptr->config_flags |= GRES_CONF_LOADED;
}
rec_count++;
orig_has_file = gres_slurmd_conf->config_flags &
GRES_CONF_HAS_FILE;
if (new_has_file == -1) {
if (gres_slurmd_conf->config_flags &
GRES_CONF_HAS_FILE) {
new_has_file = 1;
} else
new_has_file = 0;
} else if (( new_has_file && !orig_has_file) ||
(!new_has_file && orig_has_file)) {
fatal("gres.conf for %s, some records have \"File\" specification while others do not",
context_ptr->gres_name);
}
orig_has_type = gres_slurmd_conf->config_flags &
GRES_CONF_HAS_TYPE;
if (new_has_type == -1) {
if (gres_slurmd_conf->config_flags &
GRES_CONF_HAS_TYPE) {
new_has_type = 1;
} else
new_has_type = 0;
} else if (( new_has_type && !orig_has_type) ||
(!new_has_type && orig_has_type)) {
fatal("gres.conf for %s, some records have \"Type=\" specification while others do not",
context_ptr->gres_name);
}
if ((new_has_file == 0) && (new_has_type == 0) &&
(rec_count > 1)) {
fatal("gres.conf duplicate records for %s",
context_ptr->gres_name);
}
if (new_has_file)
context_ptr->config_flags |= GRES_CONF_HAS_FILE;
}
list_iterator_destroy(iter);
if (!(context_ptr->config_flags & GRES_CONF_LOADED)) {
/*
 * This means no gres.conf line was found for this gres.
 * We still need to try to load it for AutoDetect's sake.
 * If we fail loading we will treat it as a count-only
 * GRES, since the stepd will otherwise try to load it.
 */
if (_load_gres_plugin(context_ptr) != SLURM_SUCCESS)
context_ptr->config_flags |= GRES_CONF_COUNT_ONLY;
} else
/* Remove as this is only really used locally */
context_ptr->config_flags &= (~GRES_CONF_LOADED);
}
/*
* Keep track of which gres.conf lines have a count greater than expected
* according to the current slurm.conf GRES. Modify the count of throw-away
 * records in gres_conf_list_tmp to keep track of this. Any gres.conf record
 * with a count > 0 means that slurm.conf did not account for it completely.
*
* gres_conf_list_tmp - (in/out) The temporary gres.conf list.
* count - (in) The count of the current slurm.conf GRES record.
* type_name - (in) The type of the current slurm.conf GRES record.
*/
static void _compare_conf_counts(List gres_conf_list_tmp, uint64_t count,
char *type_name)
{
gres_slurmd_conf_t *gres_conf;
ListIterator iter = list_iterator_create(gres_conf_list_tmp);
while ((gres_conf = list_next(iter))) {
/* Note: plugin type filter already applied */
/* Check that type is the same */
if (xstrcasecmp(gres_conf->type_name, type_name))
continue;
/* Keep track of counts */
if (gres_conf->count > count) {
gres_conf->count -= count;
/* This slurm.conf GRES specification is now used up */
list_iterator_destroy(iter);
return;
} else {
count -= gres_conf->count;
gres_conf->count = 0;
}
}
list_iterator_destroy(iter);
}
/*
* Loop through each entry in gres.conf and see if there is a corresponding
* entry in slurm.conf. If so, see if the counts line up. If there are more
* devices specified in gres.conf than in slurm.conf, emit errors.
*
* slurm_conf_list - (in) The slurm.conf GRES list.
* gres_conf_list - (in) The gres.conf GRES list.
* context_ptr - (in) Which GRES plugin we are currently working in.
*/
static void _check_conf_mismatch(List slurm_conf_list, List gres_conf_list,
slurm_gres_context_t *context_ptr)
{
ListIterator iter;
gres_slurmd_conf_t *gres_conf;
gres_state_t *slurm_conf;
List gres_conf_list_tmp;
/* E.g. slurm_conf_list will be NULL in the case of --gpu-bind */
if (!slurm_conf_list || !gres_conf_list)
return;
/*
 * Duplicate the gres.conf list, keeping only records under the current
 * GRES plugin, so we can freely mangle the copies.
 */
gres_conf_list_tmp = list_create(destroy_gres_slurmd_conf);
iter = list_iterator_create(gres_conf_list);
while ((gres_conf = list_next(iter))) {
gres_slurmd_conf_t *gres_conf_tmp;
if (gres_conf->plugin_id != context_ptr->plugin_id)
continue;
gres_conf_tmp = xmalloc(sizeof(*gres_conf_tmp));
gres_conf_tmp->name = xstrdup(gres_conf->name);
gres_conf_tmp->type_name = xstrdup(gres_conf->type_name);
gres_conf_tmp->count = gres_conf->count;
list_append(gres_conf_list_tmp, gres_conf_tmp);
}
list_iterator_destroy(iter);
/*
* Loop through the slurm.conf list and see if there are more gres.conf
* GRES than expected.
*/
iter = list_iterator_create(slurm_conf_list);
while ((slurm_conf = list_next(iter))) {
gres_node_state_t *slurm_gres;
if (slurm_conf->plugin_id != context_ptr->plugin_id)
continue;
/* Determine if typed or untyped, and act accordingly */
slurm_gres = (gres_node_state_t *)slurm_conf->gres_data;
if (!slurm_gres->type_name) {
_compare_conf_counts(gres_conf_list_tmp,
slurm_gres->gres_cnt_config, NULL);
continue;
}
for (int i = 0; i < slurm_gres->type_cnt; ++i) {
_compare_conf_counts(gres_conf_list_tmp,
slurm_gres->type_cnt_avail[i],
slurm_gres->type_name[i]);
}
}
list_iterator_destroy(iter);
/*
* Loop through gres_conf_list_tmp to print errors for gres.conf
* records that were not completely accounted for in slurm.conf.
*/
iter = list_iterator_create(gres_conf_list_tmp);
while ((gres_conf = list_next(iter)))
if (gres_conf->count > 0)
info("WARNING: A line in gres.conf for GRES %s%s%s has %"PRIu64" more configured than expected in slurm.conf. Ignoring extra GRES.",
gres_conf->name,
(gres_conf->type_name) ? ":" : "",
(gres_conf->type_name) ? gres_conf->type_name : "",
gres_conf->count);
list_iterator_destroy(iter);
FREE_NULL_LIST(gres_conf_list_tmp);
}
/*
* Match the type of a GRES from slurm.conf to a GRES in the gres.conf list. If
* a match is found, pop it off the gres.conf list and return it.
*
* gres_conf_list - (in) The gres.conf list to search through.
* gres_context - (in) Which GRES plugin we are currently working in.
* type_name - (in) The type of the slurm.conf GRES record. If null, then
* it's an untyped GRES.
*
* Returns the first gres.conf record from gres_conf_list with the same type
* name as the slurm.conf record.
*/
static gres_slurmd_conf_t *_match_type(List gres_conf_list,
slurm_gres_context_t *gres_context,
char *type_name)
{
ListIterator gres_conf_itr;
gres_slurmd_conf_t *gres_conf = NULL;
gres_conf_itr = list_iterator_create(gres_conf_list);
while ((gres_conf = list_next(gres_conf_itr))) {
if (gres_conf->plugin_id != gres_context->plugin_id)
continue;
/*
 * If type_name is NULL we will take the first matching
 * gres_conf that we find. This means we also remove the
 * type from the gres_conf to match the 18.08 behavior.
 */
if (!type_name)
xfree(gres_conf->type_name);
else if (xstrcasecmp(gres_conf->type_name, type_name))
continue;
/* We found a match, so remove from gres_conf_list and break */
list_remove(gres_conf_itr);
break;
}
list_iterator_destroy(gres_conf_itr);
return gres_conf;
}
/*
* Add a GRES conf record with count == 0 to gres_list.
*
* gres_list - (in/out) The gres list to add to.
* gres_context - (in) The GRES plugin to add a GRES record for.
* cpu_cnt - (in) The cpu count configured for the node.
*/
static void _add_gres_config_empty(List gres_list,
slurm_gres_context_t *gres_context,
uint32_t cpu_cnt)
{
gres_slurmd_conf_t *gres_conf = xmalloc(sizeof(*gres_conf));
gres_conf->cpu_cnt = cpu_cnt;
gres_conf->name = xstrdup(gres_context->gres_name);
gres_conf->plugin_id = gres_context->plugin_id;
list_append(gres_list, gres_conf);
}
/*
 * Truncate the File hostrange string of a GRES record to be at most
 * new_count entries. The extra entries will be removed.
 *
 * gres_conf - (in/out) The GRES record to modify.
 * new_count - (in) The new number of entries in File
 */
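/* e.g. truncating File=/dev/nvidia[0-3] to new_count=2 leaves
 * /dev/nvidia[0-1] (illustrative path) */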
static void _set_file_subset(gres_slurmd_conf_t *gres_conf, uint64_t new_count)
{
/* Convert file to hostrange */
hostlist_t hl = hostlist_create(gres_conf->file);
unsigned long old_count = hostlist_count(hl);
if (new_count >= old_count) {
hostlist_destroy(hl);
/* Nothing to do */
return;
}
/* Remove all but the first new_count entries */
for (int i = old_count; i > new_count; --i) {
free(hostlist_pop(hl));
}
debug3("%s: Truncating %s:%s File from (%ld) %s", __func__,
gres_conf->name, gres_conf->type_name, old_count,
gres_conf->file);
/* Set file to the new subset */
xfree(gres_conf->file);
gres_conf->file = hostlist_ranged_string_xmalloc(hl);
debug3("%s: to (%"PRIu64") %s", __func__, new_count, gres_conf->file);
hostlist_destroy(hl);
}
/*
* A continuation of _merge_gres() depending on if the slurm.conf GRES is typed
* or not.
*
* gres_conf_list - (in) The gres.conf list.
* new_list - (out) The new merged [slurm|gres].conf list.
* count - (in) The count of the slurm.conf GRES record.
* type_name - (in) The type of the slurm.conf GRES record, if it exists.
* gres_context - (in) Which GRES plugin we are working in.
 * cpu_count - (in) A count of CPUs on the node.
*/
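/* e.g. slurm.conf Gres=gpu:tesla:2 merged with a gres.conf line of
 * File=/dev/nvidia[0-3] keeps two devices and truncates File to
 * /dev/nvidia[0-1] (illustrative) */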
static void _merge_gres2(List gres_conf_list, List new_list, uint64_t count,
char *type_name, slurm_gres_context_t *gres_context,
uint32_t cpu_count)
{
gres_slurmd_conf_t *gres_conf, *match;
/* If slurm.conf count is initially 0, don't waste time on it */
if (count == 0)
return;
/*
* There can be multiple gres.conf GRES lines contained within a
* single slurm.conf GRES line, due to different values of Cores
* and Links. Append them to the list where possible.
*/
while ((match = _match_type(gres_conf_list, gres_context, type_name))) {
list_append(new_list, match);
debug3("%s: From gres.conf, using %s:%s:%"PRIu64":%s", __func__,
match->name, match->type_name, match->count,
match->file);
/* See if we need to merge with any more gres.conf records. */
if (match->count > count) {
/*
* Truncate excess count of gres.conf to match total
* count of slurm.conf.
*/
match->count = count;
/*
* Truncate excess file of gres.conf to match total
* count of slurm.conf.
*/
if (match->file)
_set_file_subset(match, count);
/* Floor to 0 to break out of loop. */
count = 0;
} else
/*
* Subtract this gres.conf line count from the
* slurm.conf total.
*/
count -= match->count;
/*
* All devices outlined by this slurm.conf record have now been
* merged with gres.conf records and added to new_list, so exit.
*/
if (count == 0)
break;
}
if (count == 0)
return;
/*
* There are leftover GRES specified in this slurm.conf record that are
* not accounted for in gres.conf that still need to be added.
*/
gres_conf = xmalloc(sizeof(*gres_conf));
gres_conf->count = count;
gres_conf->cpu_cnt = cpu_count;
gres_conf->name = xstrdup(gres_context->gres_name);
gres_conf->plugin_id = gres_context->plugin_id;
if (type_name) {
gres_conf->config_flags = GRES_CONF_HAS_TYPE;
gres_conf->type_name = xstrdup(type_name);
}
if (gres_context->config_flags & GRES_CONF_COUNT_ONLY)
gres_conf->config_flags |= GRES_CONF_COUNT_ONLY;
list_append(new_list, gres_conf);
}
/*
* Merge a single slurm.conf GRES specification with any relevant gres.conf
* records and append the result to new_list.
*
* gres_conf_list - (in) The gres.conf list.
* new_list - (out) The new merged [slurm|gres].conf list.
* ptr - (in) A slurm.conf GRES record.
* gres_context - (in) Which GRES plugin we are working in.
* cpu_cnt - (in) A count of CPUs on the node.
*/
static void _merge_gres(List gres_conf_list, List new_list, gres_state_t *ptr,
slurm_gres_context_t *gres_context, uint32_t cpu_cnt)
{
gres_node_state_t *slurm_gres = (gres_node_state_t *)ptr->gres_data;
/* If this GRES has no types, merge in the single untyped GRES */
if (slurm_gres->type_cnt == 0) {
_merge_gres2(gres_conf_list, new_list,
slurm_gres->gres_cnt_config, NULL, gres_context,
cpu_cnt);
return;
}
/* If this GRES has types, merge in each typed GRES */
for (int i = 0; i < slurm_gres->type_cnt; i++) {
_merge_gres2(gres_conf_list, new_list,
slurm_gres->type_cnt_avail[i],
slurm_gres->type_name[i], gres_context, cpu_cnt);
}
}
/*
* Merge slurm.conf and gres.conf GRES configuration.
* gres.conf can only work within what is outlined in slurm.conf. Every
* gres.conf device that does not match up to a device in slurm.conf is
 * discarded with an error. If no gres.conf record is found for what is
 * specified in slurm.conf, create a zero-count conf record.
*
* node_conf - (in) node configuration info (cpu count).
* gres_conf_list - (in/out) GRES data from gres.conf. This becomes the new
* merged slurm.conf/gres.conf list.
* slurm_conf_list - (in) GRES data from slurm.conf.
*/
static void _merge_config(node_config_load_t *node_conf, List gres_conf_list,
List slurm_conf_list)
{
int i;
gres_state_t *gres_ptr;
ListIterator iter;
bool found;
List new_gres_list = list_create(destroy_gres_slurmd_conf);
for (i = 0; i < gres_context_cnt; i++) {
/* Copy GRES configuration from slurm.conf */
if (slurm_conf_list) {
found = false;
iter = list_iterator_create(slurm_conf_list);
while ((gres_ptr = (gres_state_t *) list_next(iter))) {
if (gres_ptr->plugin_id !=
gres_context[i].plugin_id)
continue;
found = true;
_merge_gres(gres_conf_list, new_gres_list,
gres_ptr, &gres_context[i],
node_conf->cpu_cnt);
}
list_iterator_destroy(iter);
if (found)
continue;
}
/* Add GRES record with zero count */
_add_gres_config_empty(new_gres_list, &gres_context[i],
node_conf->cpu_cnt);
}
/* Set gres_conf_list to be the new merged list */
list_flush(gres_conf_list);
list_transfer(gres_conf_list, new_gres_list);
FREE_NULL_LIST(new_gres_list);
}
/*
* Load this node's configuration (how many resources it has, topology, etc.)
* IN cpu_cnt - Number of CPUs configured on this node
* IN node_name - Name of this node
* IN gres_list - Node's GRES information as loaded from slurm.conf by slurmd
 * IN xcpuinfo_abs_to_mac - Pointer to xcpuinfo_abs_to_mac() function, if available
 * IN xcpuinfo_mac_to_abs - Pointer to xcpuinfo_mac_to_abs() function, if available
* NOTE: Called from slurmd and slurmstepd
*/
extern int gres_plugin_node_config_load(uint32_t cpu_cnt, char *node_name,
List gres_list,
void *xcpuinfo_abs_to_mac,
void *xcpuinfo_mac_to_abs)
{
static s_p_options_t _gres_options[] = {
{"AutoDetect", S_P_STRING},
{"Name", S_P_ARRAY, _parse_gres_config, NULL},
{"NodeName", S_P_ARRAY, _parse_gres_config2, NULL},
{NULL}
};
int count = 0, i, rc, rc2;
struct stat config_stat;
s_p_hashtbl_t *tbl;
gres_slurmd_conf_t **gres_array;
char *gres_conf_file;
char *autodetect_string = NULL;
node_config_load_t node_conf = {
.cpu_cnt = cpu_cnt,
.xcpuinfo_mac_to_abs = xcpuinfo_mac_to_abs
};
if (cpu_cnt == 0) {
error("%s: Invalid cpu_cnt of 0 for node %s",
__func__, node_name);
return SLURM_ERROR;
}
if (xcpuinfo_abs_to_mac)
xcpuinfo_ops.xcpuinfo_abs_to_mac = xcpuinfo_abs_to_mac;
rc = gres_plugin_init();
if (gres_context_cnt == 0)
return SLURM_SUCCESS;
slurm_mutex_lock(&gres_context_lock);
FREE_NULL_LIST(gres_conf_list);
gres_conf_list = list_create(destroy_gres_slurmd_conf);
gres_conf_file = get_extra_conf_path("gres.conf");
if (stat(gres_conf_file, &config_stat) < 0) {
info("Can not stat gres.conf file (%s), using slurm.conf data",
gres_conf_file);
} else {
if (xstrcmp(gres_node_name, node_name)) {
xfree(gres_node_name);
gres_node_name = xstrdup(node_name);
}
gres_cpu_cnt = cpu_cnt;
tbl = s_p_hashtbl_create(_gres_options);
if (s_p_parse_file(tbl, NULL, gres_conf_file, false) == SLURM_ERROR)
fatal("error opening/reading %s", gres_conf_file);
if (s_p_get_string(&autodetect_string, "Autodetect", tbl)) {
if (xstrcasestr(autodetect_string, "nvml"))
autodetect_types |= GRES_AUTODETECT_NVML;
if (xstrcasestr(autodetect_string, "rsmi"))
autodetect_types |= GRES_AUTODETECT_RSMI;
xfree(autodetect_string);
}
if (s_p_get_array((void ***) &gres_array, &count, "Name", tbl)) {
for (i = 0; i < count; i++) {
list_append(gres_conf_list, gres_array[i]);
gres_array[i] = NULL;
}
}
if (s_p_get_array((void ***) &gres_array, &count, "NodeName", tbl)) {
for (i = 0; i < count; i++) {
list_append(gres_conf_list, gres_array[i]);
gres_array[i] = NULL;
}
}
s_p_hashtbl_destroy(tbl);
}
xfree(gres_conf_file);
/* Validate gres.conf and slurm.conf somewhat before merging */
for (i = 0; i < gres_context_cnt; i++) {
_validate_slurm_conf(gres_list, &gres_context[i]);
_validate_gres_conf(gres_conf_list, &gres_context[i]);
_check_conf_mismatch(gres_list, gres_conf_list,
&gres_context[i]);
}
/* Merge slurm.conf and gres.conf together into gres_conf_list */
_merge_config(&node_conf, gres_conf_list, gres_list);
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].ops.node_config_load == NULL)
continue; /* No plugin */
rc2 = (*(gres_context[i].ops.node_config_load))(gres_conf_list,
&node_conf);
if (rc == SLURM_SUCCESS)
rc = rc2;
}
/* Postprocess gres_conf_list after all plugins' node_config_load */
for (i = 0; i < gres_context_cnt; i++) {
/* Remove every GPU with an empty File */
_remove_fileless_gpus(gres_conf_list, &gres_context[i]);
}
list_for_each(gres_conf_list, _log_gres_slurmd_conf, NULL);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
/*
* Pack this node's gres configuration into a buffer
* IN/OUT buffer - message buffer to pack
*/
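/* Wire format per record: magic, count, cpu_cnt, config_flags,
 * plugin_id, then the cpus, links, name and type_name strings */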
extern int gres_plugin_node_config_pack(Buf buffer)
{
int rc;
uint32_t magic = GRES_MAGIC;
uint16_t rec_cnt = 0, version = SLURM_PROTOCOL_VERSION;
ListIterator iter;
gres_slurmd_conf_t *gres_slurmd_conf;
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
pack16(version, buffer);
if (gres_conf_list)
rec_cnt = list_count(gres_conf_list);
pack16(rec_cnt, buffer);
if (rec_cnt) {
iter = list_iterator_create(gres_conf_list);
while ((gres_slurmd_conf =
(gres_slurmd_conf_t *) list_next(iter))) {
pack32(magic, buffer);
pack64(gres_slurmd_conf->count, buffer);
pack32(gres_slurmd_conf->cpu_cnt, buffer);
pack8(gres_slurmd_conf->config_flags, buffer);
pack32(gres_slurmd_conf->plugin_id, buffer);
packstr(gres_slurmd_conf->cpus, buffer);
packstr(gres_slurmd_conf->links, buffer);
packstr(gres_slurmd_conf->name, buffer);
packstr(gres_slurmd_conf->type_name, buffer);
}
list_iterator_destroy(iter);
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
/*
* Unpack this node's configuration from a buffer (built/packed by slurmd)
* IN/OUT buffer - message buffer to unpack
* IN node_name - name of node whose data is being unpacked
*/
extern int gres_plugin_node_config_unpack(Buf buffer, char *node_name)
{
int i, j, rc;
uint32_t cpu_cnt = 0, magic = 0, plugin_id = 0, utmp32 = 0;
uint64_t count64 = 0;
uint16_t rec_cnt = 0, protocol_version = 0;
uint8_t config_flags = 0;
char *tmp_cpus = NULL, *tmp_links = NULL, *tmp_name = NULL;
char *tmp_type = NULL;
gres_slurmd_conf_t *p;
rc = gres_plugin_init();
FREE_NULL_LIST(gres_conf_list);
gres_conf_list = list_create(destroy_gres_slurmd_conf);
safe_unpack16(&protocol_version, buffer);
safe_unpack16(&rec_cnt, buffer);
if (rec_cnt == 0)
return SLURM_SUCCESS;
if (rec_cnt > NO_VAL16)
goto unpack_error;
slurm_mutex_lock(&gres_context_lock);
if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
error("%s: protocol_version %hu not supported",
__func__, protocol_version);
goto unpack_error;
}
for (i = 0; i < rec_cnt; i++) {
if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
safe_unpack32(&magic, buffer);
if (magic != GRES_MAGIC)
goto unpack_error;
safe_unpack64(&count64, buffer);
safe_unpack32(&cpu_cnt, buffer);
safe_unpack8(&config_flags, buffer);
safe_unpack32(&plugin_id, buffer);
safe_unpackstr_xmalloc(&tmp_cpus, &utmp32, buffer);
safe_unpackstr_xmalloc(&tmp_links, &utmp32, buffer);
safe_unpackstr_xmalloc(&tmp_name, &utmp32, buffer);
safe_unpackstr_xmalloc(&tmp_type, &utmp32, buffer);
}
if (slurm_get_debug_flags() & DEBUG_FLAG_GRES) {
info("Node:%s Gres:%s Type:%s Flags:%s CPU_IDs:%s CPU#:%u Count:%"
PRIu64" Links:%s",
node_name, tmp_name, tmp_type,
gres_flags2str(config_flags), tmp_cpus, cpu_cnt,
count64, tmp_links);
}
for (j = 0; j < gres_context_cnt; j++) {
bool new_has_file, new_has_type;
bool orig_has_file, orig_has_type;
if (gres_context[j].plugin_id != plugin_id)
continue;
if (xstrcmp(gres_context[j].gres_name, tmp_name)) {
/*
* Should have been caught in
* gres_plugin_init()
*/
error("%s: gres/%s duplicate plugin ID with %s, unable to process",
__func__, tmp_name,
gres_context[j].gres_name);
continue;
}
new_has_file = config_flags & GRES_CONF_HAS_FILE;
orig_has_file = gres_context[j].config_flags &
GRES_CONF_HAS_FILE;
if (orig_has_file && !new_has_file && count64) {
error("%s: gres/%s lacks \"File=\" parameter for node %s",
__func__, tmp_name, node_name);
config_flags |= GRES_CONF_HAS_FILE;
}
if (new_has_file && (count64 > MAX_GRES_BITMAP)) {
/*
* Avoid over-subscribing memory with
* huge bitmaps
*/
error("%s: gres/%s has \"File=\" plus very large "
"\"Count\" (%"PRIu64") for node %s, "
"resetting value to %d",
__func__, tmp_name, count64,
node_name, MAX_GRES_BITMAP);
count64 = MAX_GRES_BITMAP;
}
new_has_type = config_flags & GRES_CONF_HAS_TYPE;
orig_has_type = gres_context[j].config_flags &
GRES_CONF_HAS_TYPE;
if (orig_has_type && !new_has_type && count64) {
error("%s: gres/%s lacks \"Type\" parameter for node %s",
__func__, tmp_name, node_name);
config_flags |= GRES_CONF_HAS_TYPE;
}
gres_context[j].config_flags |= config_flags;
/*
* On the slurmctld we need to load the plugins to
* correctly set env vars. We want to call this only
* after we have the config_flags so we can tell if we
* are CountOnly or not.
*/
if (!(gres_context[j].config_flags &
GRES_CONF_LOADED)) {
(void)_load_gres_plugin(&gres_context[j]);
gres_context[j].config_flags |=
GRES_CONF_LOADED;
}
break;
}
if (j >= gres_context_cnt) {
/*
* GresPlugins is inconsistently configured.
* Not a fatal error, but skip this data.
*/
error("%s: No plugin configured to process GRES data from node %s (Name:%s Type:%s PluginID:%u Count:%"PRIu64")",
__func__, node_name, tmp_name, tmp_type,
plugin_id, count64);
xfree(tmp_cpus);
xfree(tmp_links);
xfree(tmp_name);
xfree(tmp_type);
continue;
}
p = xmalloc(sizeof(gres_slurmd_conf_t));
p->config_flags = config_flags;
p->count = count64;
p->cpu_cnt = cpu_cnt;
p->cpus = tmp_cpus;
tmp_cpus = NULL; /* Nothing left to xfree */
p->links = tmp_links;
tmp_links = NULL; /* Nothing left to xfree */
p->name = tmp_name; /* Preserve for accounting! */
p->type_name = tmp_type;
tmp_type = NULL; /* Nothing left to xfree */
p->plugin_id = plugin_id;
_validate_links(p);
list_append(gres_conf_list, p);
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
unpack_error:
error("%s: unpack error from node %s", __func__, node_name);
xfree(tmp_cpus);
xfree(tmp_links);
xfree(tmp_name);
xfree(tmp_type);
slurm_mutex_unlock(&gres_context_lock);
return SLURM_ERROR;
}
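/*
* The unpack path above must mirror the field order and widths used by the
* packing loop earlier in this file. A sketch of the correspondence
* (illustrative only, not part of the build):
*
*	pack32(magic)			<-> safe_unpack32(&magic)
*	pack64(count)			<-> safe_unpack64(&count64)
*	pack32(cpu_cnt)			<-> safe_unpack32(&cpu_cnt)
*	pack8(config_flags)		<-> safe_unpack8(&config_flags)
*	pack32(plugin_id)		<-> safe_unpack32(&plugin_id)
*	packstr(cpus/links/name/type)	<-> safe_unpackstr_xmalloc(...)
*
* Any change to one side must be made to the other, guarded by
* protocol_version.
*/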
static void _gres_node_state_delete_topo(gres_node_state_t *gres_node_ptr)
{
int i;
for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
if (gres_node_ptr->topo_gres_bitmap)
FREE_NULL_BITMAP(gres_node_ptr->topo_gres_bitmap[i]);
if (gres_node_ptr->topo_core_bitmap)
FREE_NULL_BITMAP(gres_node_ptr->topo_core_bitmap[i]);
xfree(gres_node_ptr->topo_type_name[i]);
}
xfree(gres_node_ptr->topo_gres_bitmap);
xfree(gres_node_ptr->topo_core_bitmap);
xfree(gres_node_ptr->topo_gres_cnt_alloc);
xfree(gres_node_ptr->topo_gres_cnt_avail);
xfree(gres_node_ptr->topo_type_id);
xfree(gres_node_ptr->topo_type_name);
}
static void _gres_node_state_delete(gres_node_state_t *gres_node_ptr)
{
int i;
FREE_NULL_BITMAP(gres_node_ptr->gres_bit_alloc);
xfree(gres_node_ptr->gres_used);
if (gres_node_ptr->links_cnt) {
for (i = 0; i < gres_node_ptr->link_len; i++)
xfree(gres_node_ptr->links_cnt[i]);
xfree(gres_node_ptr->links_cnt);
}
_gres_node_state_delete_topo(gres_node_ptr);
for (i = 0; i < gres_node_ptr->type_cnt; i++) {
xfree(gres_node_ptr->type_name[i]);
}
xfree(gres_node_ptr->type_cnt_alloc);
xfree(gres_node_ptr->type_cnt_avail);
xfree(gres_node_ptr->type_id);
xfree(gres_node_ptr->type_name);
xfree(gres_node_ptr);
}
/*
* Delete an element placed on gres_list by _node_config_validate()
* and free the associated memory
*/
static void _gres_node_list_delete(void *list_element)
{
gres_state_t *gres_ptr;
gres_node_state_t *gres_node_ptr;
gres_ptr = (gres_state_t *) list_element;
gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
_gres_node_state_delete(gres_node_ptr);
xfree(gres_ptr);
}
static void _add_gres_type(char *type, gres_node_state_t *gres_data,
uint64_t tmp_gres_cnt)
{
int i;
uint32_t type_id;
if (!xstrcasecmp(type, "no_consume")) {
gres_data->no_consume = true;
return;
}
type_id = gres_plugin_build_id(type);
for (i = 0; i < gres_data->type_cnt; i++) {
if (gres_data->type_id[i] != type_id)
continue;
gres_data->type_cnt_avail[i] += tmp_gres_cnt;
break;
}
if (i >= gres_data->type_cnt) {
gres_data->type_cnt++;
gres_data->type_cnt_alloc =
xrealloc(gres_data->type_cnt_alloc,
sizeof(uint64_t) * gres_data->type_cnt);
gres_data->type_cnt_avail =
xrealloc(gres_data->type_cnt_avail,
sizeof(uint64_t) * gres_data->type_cnt);
gres_data->type_id =
xrealloc(gres_data->type_id,
sizeof(uint32_t) * gres_data->type_cnt);
gres_data->type_name =
xrealloc(gres_data->type_name,
sizeof(char *) * gres_data->type_cnt);
gres_data->type_cnt_avail[i] += tmp_gres_cnt;
gres_data->type_id[i] = type_id;
gres_data->type_name[i] = xstrdup(type);
}
}
/*
* Compute the total GRES count for a particular gres_name.
* Note that a given gres_name can appear multiple times in the orig_config
* string for multiple types (e.g. "gres=gpu:kepler:1,gpu:tesla:2").
* IN/OUT gres_data - set gres_cnt_config field in this structure
* IN orig_config - gres configuration from slurm.conf
* IN gres_name - name of the gres type (e.g. "gpu")
* IN gres_name_colon - gres name with appended colon
* IN gres_name_colon_len - size of gres_name_colon
* The computed total is stored in gres_data->gres_cnt_config
*/
static void _get_gres_cnt(gres_node_state_t *gres_data, char *orig_config,
char *gres_name, char *gres_name_colon,
int gres_name_colon_len)
{
char *node_gres_config, *tok, *last_tok = NULL;
char *sub_tok, *last_sub_tok = NULL;
char *num, *paren, *last_num = NULL;
uint64_t gres_config_cnt = 0, tmp_gres_cnt = 0, mult;
int i;
xassert(gres_data);
if (orig_config == NULL) {
gres_data->gres_cnt_config = 0;
return;
}
for (i = 0; i < gres_data->type_cnt; i++) {
gres_data->type_cnt_avail[i] = 0;
}
node_gres_config = xstrdup(orig_config);
tok = strtok_r(node_gres_config, ",", &last_tok);
while (tok) {
if (!xstrcmp(tok, gres_name)) {
gres_config_cnt = 1;
break;
}
if (!xstrncmp(tok, gres_name_colon, gres_name_colon_len)) {
paren = strrchr(tok, '(');
if (paren) /* Ignore socket binding info */
paren[0] = '\0';
num = strrchr(tok, ':');
if (!num) {
error("Bad GRES configuration: %s", tok);
break;
}
tmp_gres_cnt = strtoll(num + 1, &last_num, 10);
if ((num[1] < '0') || (num[1] > '9')) {
/*
* Type name without a count (e.g. "gpu:tesla");
* assume a count of 1.
*/
tmp_gres_cnt = 1;
} else if ((mult = suffix_mult(last_num)) != NO_VAL64) {
tmp_gres_cnt *= mult;
} else {
error("Bad GRES configuration: %s", tok);
break;
}
gres_config_cnt += tmp_gres_cnt;
num[0] = '\0';
sub_tok = strtok_r(tok, ":", &last_sub_tok);
if (sub_tok) /* Skip GRES name */
sub_tok = strtok_r(NULL, ":", &last_sub_tok);
while (sub_tok) {
_add_gres_type(sub_tok, gres_data,
tmp_gres_cnt);
sub_tok = strtok_r(NULL, ":", &last_sub_tok);
}
}
tok = strtok_r(NULL, ",", &last_tok);
}
xfree(node_gres_config);
gres_data->gres_cnt_config = gres_config_cnt;
}
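/*
* Worked example for _get_gres_cnt() (illustrative only): with gres_name
* "gpu" and orig_config "gpu:kepler:2,gpu:tesla:1K", both tokens match
* gres_name_colon. The "K" suffix is expanded by suffix_mult() (1K -> 1024),
* _add_gres_type() records Type "kepler" with 2 and Type "tesla" with 1024,
* and gres_data->gres_cnt_config is left at 1026.
*/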
static int _valid_gres_type(char *gres_name, gres_node_state_t *gres_data,
bool config_overrides, char **reason_down)
{
int i, j;
uint64_t model_cnt;
if (gres_data->type_cnt == 0)
return 0;
for (i = 0; i < gres_data->type_cnt; i++) {
model_cnt = 0;
if (gres_data->type_cnt) {
for (j = 0; j < gres_data->type_cnt; j++) {
if (gres_data->type_id[i] ==
gres_data->type_id[j])
model_cnt +=
gres_data->type_cnt_avail[j];
}
} else {
for (j = 0; j < gres_data->topo_cnt; j++) {
if (gres_data->type_id[i] ==
gres_data->topo_type_id[j])
model_cnt +=
gres_data->topo_gres_cnt_avail[j];
}
}
if (config_overrides) {
gres_data->type_cnt_avail[i] = model_cnt;
} else if (model_cnt < gres_data->type_cnt_avail[i]) {
if (reason_down) {
xstrfmtcat(*reason_down,
"%s:%s count too low "
"(%"PRIu64" < %"PRIu64")",
gres_name, gres_data->type_name[i],
model_cnt,
gres_data->type_cnt_avail[i]);
}
return -1;
}
}
return 0;
}
static gres_node_state_t *_build_gres_node_state(void)
{
gres_node_state_t *gres_data;
gres_data = xmalloc(sizeof(gres_node_state_t));
gres_data->gres_cnt_config = NO_VAL64;
gres_data->gres_cnt_found = NO_VAL64;
return gres_data;
}
/*
* Build a node's gres record based only upon the slurm.conf contents
*/
static int _node_config_init(char *node_name, char *orig_config,
slurm_gres_context_t *context_ptr,
gres_state_t *gres_ptr)
{
int rc = SLURM_SUCCESS;
gres_node_state_t *gres_data;
if (!gres_ptr->gres_data)
gres_ptr->gres_data = _build_gres_node_state();
gres_data = (gres_node_state_t *) gres_ptr->gres_data;
/* If the resource isn't configured for use with this node */
if ((orig_config == NULL) || (orig_config[0] == '\0')) {
gres_data->gres_cnt_config = 0;
return rc;
}
_get_gres_cnt(gres_data, orig_config,
context_ptr->gres_name,
context_ptr->gres_name_colon,
context_ptr->gres_name_colon_len);
context_ptr->total_cnt += gres_data->gres_cnt_config;
/* Use count from recovered state, if higher */
gres_data->gres_cnt_avail = MAX(gres_data->gres_cnt_avail,
gres_data->gres_cnt_config);
if ((gres_data->gres_bit_alloc != NULL) &&
(gres_data->gres_cnt_avail >
bit_size(gres_data->gres_bit_alloc)) &&
!_shared_gres(context_ptr->plugin_id)) {
gres_data->gres_bit_alloc =
bit_realloc(gres_data->gres_bit_alloc,
gres_data->gres_cnt_avail);
}
return rc;
}
/*
* Build a node's gres record based only upon the slurm.conf contents
* IN node_name - name of the node for which the gres information applies
* IN orig_config - Gres information supplied from slurm.conf
* IN/OUT gres_list - List of Gres records for this node to track usage
*/
extern int gres_plugin_init_node_config(char *node_name, char *orig_config,
List *gres_list)
{
int i, rc, rc2;
ListIterator gres_iter;
gres_state_t *gres_ptr;
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
*gres_list = list_create(_gres_node_list_delete);
}
for (i = 0; i < gres_context_cnt; i++) {
/* Find or create gres_state entry on the list */
gres_iter = list_iterator_create(*gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
if (gres_ptr->plugin_id == gres_context[i].plugin_id)
break;
}
list_iterator_destroy(gres_iter);
if (gres_ptr == NULL) {
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[i].plugin_id;
list_append(*gres_list, gres_ptr);
}
rc2 = _node_config_init(node_name, orig_config,
&gres_context[i], gres_ptr);
if (rc == SLURM_SUCCESS)
rc = rc2;
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
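/*
* Usage sketch (illustrative; node_ptr is a hypothetical slurmctld node
* record): this is called once per node when slurm.conf is read, e.g.
*
*	rc = gres_plugin_init_node_config(node_ptr->name,
*					  node_ptr->config_ptr->gres,
*					  &node_ptr->gres_list);
*
* A missing or empty Gres= string simply yields gres_cnt_config == 0 for
* each configured GRES type.
*/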
/*
* Determine GRES availability on some node
* plugin_id IN - plugin number to search for
* topo_cnt OUT - count of gres.conf records of this ID found by slurmd
* (each can have different topology)
* config_type_cnt OUT - Count of records for this GRES found in configuration,
* each of these represents a different Type of GRES
* with this name (e.g. GPU model)
* RET - total number of GRES available of this ID on this node (sum
* across all records of this ID)
*/
static uint64_t _get_tot_gres_cnt(uint32_t plugin_id, uint64_t *topo_cnt,
int *config_type_cnt)
{
ListIterator iter;
gres_slurmd_conf_t *gres_slurmd_conf;
uint32_t cpu_set_cnt = 0, rec_cnt = 0;
uint64_t gres_cnt = 0;
xassert(config_type_cnt);
xassert(topo_cnt);
*config_type_cnt = 0;
*topo_cnt = 0;
if (gres_conf_list == NULL)
return gres_cnt;
iter = list_iterator_create(gres_conf_list);
while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
if (gres_slurmd_conf->plugin_id != plugin_id)
continue;
gres_cnt += gres_slurmd_conf->count;
rec_cnt++;
if (gres_slurmd_conf->cpus || gres_slurmd_conf->type_name)
cpu_set_cnt++;
}
list_iterator_destroy(iter);
*config_type_cnt = rec_cnt;
if (cpu_set_cnt)
*topo_cnt = rec_cnt;
return gres_cnt;
}
/*
* Map a given GRES type ID back to a GRES type name.
* gres_id IN - GRES type ID to search for.
* gres_name OUT - Pre-allocated string in which to store the GRES type name.
* gres_name_len - Size of gres_name in bytes
* RET - error code (currently unused; always returns SLURM_SUCCESS)
*/
extern int gres_gresid_to_gresname(uint32_t gres_id, char* gres_name,
int gres_name_len)
{
int rc = SLURM_SUCCESS;
int found = 0;
int i;
/*
* Check GresTypes from slurm.conf (gres_context) for GRES type name
*/
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; ++i) {
if (gres_id == gres_context[i].plugin_id) {
strlcpy(gres_name, gres_context[i].gres_name,
gres_name_len);
found = 1;
break;
}
}
slurm_mutex_unlock(&gres_context_lock);
/*
* If can't find GRES type name, emit error and default to GRES type ID
*/
if (!found) {
error("Could not find GRES type name in slurm.conf that corresponds to GRES type ID `%d`. Using ID as GRES type name instead.",
gres_id);
snprintf(gres_name, gres_name_len, "%u", gres_id);
}
return rc;
}
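/*
* Usage sketch (illustrative only): map a plugin ID from packed state back
* to a printable name, falling back to the numeric ID if no match exists:
*
*	char name[64];
*	(void) gres_gresid_to_gresname(plugin_id, name, sizeof(name));
*	info("GRES %s", name);
*/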
/* Convert a comma-delimited string of link counts to an integer array */
static void _links_str2array(char *links, char *node_name,
gres_node_state_t *gres_data,
int gres_inx, int gres_cnt)
{
char *start_ptr, *end_ptr = NULL;
int i = 0;
if (!links) /* No "Links=" data */
return;
if (gres_inx >= gres_data->link_len) {
error("%s: Invalid GRES index (%d >= %d)", __func__, gres_inx,
gres_cnt);
return;
}
start_ptr = links;
while (1) {
gres_data->links_cnt[gres_inx][i] =
strtol(start_ptr, &end_ptr, 10);
if (gres_data->links_cnt[gres_inx][i] < -2) {
error("%s: Invalid GRES Links value (%s) on node %s:"
"Link value '%d' < -2", __func__, links,
node_name, gres_data->links_cnt[gres_inx][i]);
gres_data->links_cnt[gres_inx][i] = 0;
return;
}
if (end_ptr[0] == '\0')
return;
if (end_ptr[0] != ',') {
error("%s: Invalid GRES Links value (%s) on node %s:"
"end_ptr[0]='%c' != ','", __func__, links,
node_name, end_ptr[0]);
return;
}
if (++i >= gres_data->link_len) {
error("%s: Invalid GRES Links value (%s) on node %s:"
"i=%d >= link_len=%d", __func__, links, node_name,
i, gres_data->link_len);
return;
}
start_ptr = end_ptr + 1;
}
}
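/*
* Worked example for _links_str2array() (illustrative only): with
* Links="-1,0,2,2" from gres.conf and gres_inx 0, the loop above yields
* links_cnt[0] = {-1, 0, 2, 2}. By gres.conf convention the -1 entry marks
* the device itself and larger values indicate faster connections. A value
* below -2, a separator other than ',', or too many entries aborts parsing
* with an error.
*/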
static bool _valid_gres_types(char *gres_name, gres_node_state_t *gres_data,
char **reason_down)
{
bool rc = true;
uint64_t gres_cnt_found = 0, gres_sum;
int topo_inx, type_inx;
if ((gres_data->type_cnt == 0) || (gres_data->topo_cnt == 0))
return rc;
for (type_inx = 0; type_inx < gres_data->type_cnt; type_inx++) {
gres_cnt_found = 0;
for (topo_inx = 0; topo_inx < gres_data->topo_cnt; topo_inx++) {
if (gres_data->topo_type_id[topo_inx] !=
gres_data->type_id[type_inx])
continue;
gres_sum = gres_cnt_found +
gres_data->topo_gres_cnt_avail[topo_inx];
if (gres_sum > gres_data->type_cnt_avail[type_inx]) {
gres_data->topo_gres_cnt_avail[topo_inx] -=
(gres_sum -
gres_data->type_cnt_avail[type_inx]);
}
gres_cnt_found +=
gres_data->topo_gres_cnt_avail[topo_inx];
}
if (gres_cnt_found < gres_data->type_cnt_avail[type_inx]) {
rc = false;
break;
}
}
if (!rc && reason_down && (*reason_down == NULL)) {
xstrfmtcat(*reason_down,
"%s:%s count too low (%"PRIu64" < %"PRIu64")",
gres_name, gres_data->type_name[type_inx],
gres_cnt_found, gres_data->type_cnt_avail[type_inx]);
}
return rc;
}
static void _gres_bit_alloc_resize(gres_node_state_t *gres_data,
uint64_t gres_bits)
{
if (!gres_bits) {
FREE_NULL_BITMAP(gres_data->gres_bit_alloc);
return;
}
if (!gres_data->gres_bit_alloc)
gres_data->gres_bit_alloc = bit_alloc(gres_bits);
else if (gres_bits != bit_size(gres_data->gres_bit_alloc))
gres_data->gres_bit_alloc =
bit_realloc(gres_data->gres_bit_alloc, gres_bits);
}
static int _node_config_validate(char *node_name, char *orig_config,
gres_state_t *gres_ptr,
int cpu_cnt, int core_cnt, int sock_cnt,
bool config_overrides, char **reason_down,
slurm_gres_context_t *context_ptr)
{
int cpus_config = 0, i, j, gres_inx, rc = SLURM_SUCCESS;
int config_type_cnt = 0;
uint64_t dev_cnt, gres_cnt, topo_cnt = 0;
bool cpu_config_err = false, updated_config = false;
gres_node_state_t *gres_data;
ListIterator iter;
gres_slurmd_conf_t *gres_slurmd_conf;
bool has_file, has_type, rebuild_topo = false;
uint32_t type_id;
xassert(core_cnt);
if (gres_ptr->gres_data == NULL)
gres_ptr->gres_data = _build_gres_node_state();
gres_data = (gres_node_state_t *) gres_ptr->gres_data;
if (gres_data->node_feature)
return rc;
gres_cnt = _get_tot_gres_cnt(context_ptr->plugin_id, &topo_cnt,
&config_type_cnt);
if ((gres_data->gres_cnt_config > gres_cnt) && !config_overrides) {
if (reason_down && (*reason_down == NULL)) {
xstrfmtcat(*reason_down,
"%s count reported lower than configured "
"(%"PRIu64" < %"PRIu64")",
context_ptr->gres_type,
gres_cnt, gres_data->gres_cnt_config);
}
rc = EINVAL;
}
if (gres_cnt > gres_data->gres_cnt_config) {
debug("%s: %s: Ignoring excess count on node %s (%"
PRIu64" > %"PRIu64")",
__func__, context_ptr->gres_type, node_name, gres_cnt,
gres_data->gres_cnt_config);
gres_cnt = gres_data->gres_cnt_config;
}
if (gres_data->gres_cnt_found != gres_cnt) {
if (gres_data->gres_cnt_found != NO_VAL64) {
info("%s: %s: Count changed on node %s (%"PRIu64" != %"PRIu64")",
__func__, context_ptr->gres_type, node_name,
gres_data->gres_cnt_found, gres_cnt);
}
if ((gres_data->gres_cnt_found != NO_VAL64) &&
(gres_data->gres_cnt_alloc != 0)) {
if (reason_down && (*reason_down == NULL)) {
xstrfmtcat(*reason_down,
"%s count changed and jobs are using them "
"(%"PRIu64" != %"PRIu64")",
context_ptr->gres_type,
gres_data->gres_cnt_found, gres_cnt);
}
rc = EINVAL;
} else {
gres_data->gres_cnt_found = gres_cnt;
updated_config = true;
}
}
if (!updated_config && gres_data->type_cnt) {
/*
* This is needed to address the case where the GRES specification
* in gres.conf has a Type option, while the GRES specification
* in slurm.conf does not have one.
*/
for (i = 0; i < gres_data->type_cnt; i++) {
if (gres_data->type_cnt_avail[i])
continue;
updated_config = true;
break;
}
}
if (!updated_config)
return rc;
if ((gres_cnt > gres_data->gres_cnt_config) && config_overrides) {
info("%s: %s: count on node %s inconsistent with slurmctld count (%"PRIu64" != %"PRIu64")",
__func__, context_ptr->gres_type, node_name,
gres_cnt, gres_data->gres_cnt_config);
gres_cnt = gres_data->gres_cnt_config; /* Ignore excess GRES */
}
if ((topo_cnt == 0) && (topo_cnt != gres_data->topo_cnt)) {
/* Need to clear topology info */
_gres_node_state_delete_topo(gres_data);
gres_data->topo_cnt = topo_cnt;
}
has_file = context_ptr->config_flags & GRES_CONF_HAS_FILE;
has_type = context_ptr->config_flags & GRES_CONF_HAS_TYPE;
if (_shared_gres(context_ptr->plugin_id))
dev_cnt = topo_cnt;
else
dev_cnt = gres_cnt;
if (has_file && (topo_cnt != gres_data->topo_cnt) && (dev_cnt == 0)) {
/*
* Clear any vestigial GRES node state info.
*/
_gres_node_state_delete_topo(gres_data);
xfree(gres_data->gres_bit_alloc);
gres_data->topo_cnt = 0;
} else if (has_file && (topo_cnt != gres_data->topo_cnt)) {
/*
* Need to rebuild topology info.
* Resize the data structures here.
*/
rebuild_topo = true;
gres_data->topo_gres_cnt_alloc =
xrealloc(gres_data->topo_gres_cnt_alloc,
topo_cnt * sizeof(uint64_t));
gres_data->topo_gres_cnt_avail =
xrealloc(gres_data->topo_gres_cnt_avail,
topo_cnt * sizeof(uint64_t));
for (i = 0; i < gres_data->topo_cnt; i++) {
if (gres_data->topo_gres_bitmap) {
FREE_NULL_BITMAP(gres_data->
topo_gres_bitmap[i]);
}
if (gres_data->topo_core_bitmap) {
FREE_NULL_BITMAP(gres_data->
topo_core_bitmap[i]);
}
xfree(gres_data->topo_type_name[i]);
}
gres_data->topo_gres_bitmap =
xrealloc(gres_data->topo_gres_bitmap,
topo_cnt * sizeof(bitstr_t *));
gres_data->topo_core_bitmap =
xrealloc(gres_data->topo_core_bitmap,
topo_cnt * sizeof(bitstr_t *));
gres_data->topo_type_id = xrealloc(gres_data->topo_type_id,
topo_cnt * sizeof(uint32_t));
gres_data->topo_type_name = xrealloc(gres_data->topo_type_name,
topo_cnt * sizeof(char *));
if (gres_data->gres_bit_alloc)
gres_data->gres_bit_alloc = bit_realloc(
gres_data->gres_bit_alloc, dev_cnt);
gres_data->topo_cnt = topo_cnt;
} else if (_shared_gres(context_ptr->plugin_id) && gres_data->topo_cnt){
/*
* Need to rebuild topology info to recover state after
* slurmctld restart with running jobs.
*/
rebuild_topo = true;
}
if (rebuild_topo) {
iter = list_iterator_create(gres_conf_list);
gres_inx = i = 0;
while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
list_next(iter))) {
if (gres_slurmd_conf->plugin_id !=
context_ptr->plugin_id)
continue;
if ((gres_data->gres_bit_alloc) &&
!_shared_gres(context_ptr->plugin_id))
gres_data->topo_gres_cnt_alloc[i] = 0;
gres_data->topo_gres_cnt_avail[i] =
gres_slurmd_conf->count;
if (gres_slurmd_conf->cpus) {
bitstr_t *tmp_bitmap;
tmp_bitmap =
bit_alloc(gres_slurmd_conf->cpu_cnt);
bit_unfmt(tmp_bitmap, gres_slurmd_conf->cpus);
if (gres_slurmd_conf->cpu_cnt == core_cnt) {
gres_data->topo_core_bitmap[i] =
tmp_bitmap;
tmp_bitmap = NULL; /* Nothing to free */
} else if (gres_slurmd_conf->cpu_cnt ==
cpu_cnt) {
/* Translate CPU to core bitmap */
int cpus_per_core = cpu_cnt / core_cnt;
int j, core_inx;
gres_data->topo_core_bitmap[i] =
bit_alloc(core_cnt);
for (j = 0; j < cpu_cnt; j++) {
if (!bit_test(tmp_bitmap, j))
continue;
core_inx = j / cpus_per_core;
bit_set(gres_data->
topo_core_bitmap[i],
core_inx);
}
} else if (i == 0) {
error("%s: %s: invalid GRES cpu count (%u) on node %s",
__func__, context_ptr->gres_type,
gres_slurmd_conf->cpu_cnt,
node_name);
}
FREE_NULL_BITMAP(tmp_bitmap);
cpus_config = core_cnt;
} else if (cpus_config && !cpu_config_err) {
cpu_config_err = true;
error("%s: %s: has CPUs configured for only some of the records on node %s",
__func__, context_ptr->gres_type,
node_name);
}
if (gres_slurmd_conf->links) {
if (gres_data->links_cnt &&
(gres_data->link_len != gres_cnt)) {
/* Size changed, need to rebuild */
for (j = 0; j < gres_data->link_len;j++)
xfree(gres_data->links_cnt[j]);
xfree(gres_data->links_cnt);
}
if (!gres_data->links_cnt) {
gres_data->link_len = gres_cnt;
gres_data->links_cnt =
xcalloc(gres_cnt,
sizeof(int *));
for (j = 0; j < gres_cnt; j++) {
gres_data->links_cnt[j] =
xcalloc(gres_cnt,
sizeof(int));
}
}
}
if (_shared_gres(gres_slurmd_conf->plugin_id)) {
/* If running jobs were recovered, this is already set */
if (!gres_data->topo_gres_bitmap[i]) {
gres_data->topo_gres_bitmap[i] =
bit_alloc(dev_cnt);
bit_set(gres_data->topo_gres_bitmap[i],
gres_inx);
}
gres_inx++;
} else if (dev_cnt == 0) {
/*
* Slurmd found GRES, but slurmctld can't use
* them. Avoid creating zero-size bitmaps.
*/
has_file = false;
} else {
gres_data->topo_gres_bitmap[i] =
bit_alloc(dev_cnt);
for (j = 0; j < gres_slurmd_conf->count; j++) {
if (gres_inx >= dev_cnt) {
/* Ignore excess GRES on node */
break;
}
bit_set(gres_data->topo_gres_bitmap[i],
gres_inx);
if (gres_data->gres_bit_alloc &&
bit_test(gres_data->gres_bit_alloc,
gres_inx)) {
/* Set by recovered job */
gres_data->topo_gres_cnt_alloc[i]++;
}
_links_str2array(
gres_slurmd_conf->links,
node_name, gres_data,
gres_inx, gres_cnt);
gres_inx++;
}
}
gres_data->topo_type_id[i] =
gres_plugin_build_id(gres_slurmd_conf->
type_name);
gres_data->topo_type_name[i] =
xstrdup(gres_slurmd_conf->type_name);
i++;
if (i >= gres_data->topo_cnt)
break;
}
list_iterator_destroy(iter);
if (cpu_config_err) {
/*
* Some GRES of this type have "CPUs" configured. Set
* topo_core_bitmap for all others with all bits set.
*/
iter = list_iterator_create(gres_conf_list);
while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
list_next(iter))) {
if (gres_slurmd_conf->plugin_id !=
context_ptr->plugin_id)
continue;
for (j = 0; j < i; j++) {
if (gres_data->topo_core_bitmap[j])
continue;
gres_data->topo_core_bitmap[j] =
bit_alloc(cpus_config);
bit_set_all(gres_data->
topo_core_bitmap[j]);
}
}
list_iterator_destroy(iter);
}
} else if (!has_file && has_type) {
/* Add GRES Type information as needed */
iter = list_iterator_create(gres_conf_list);
while ((gres_slurmd_conf = (gres_slurmd_conf_t *)
list_next(iter))) {
if (gres_slurmd_conf->plugin_id !=
context_ptr->plugin_id)
continue;
type_id = gres_plugin_build_id(
gres_slurmd_conf->type_name);
for (i = 0; i < gres_data->type_cnt; i++) {
if (type_id == gres_data->type_id[i])
break;
}
if (i < gres_data->type_cnt) {
/* Update count as needed */
gres_data->type_cnt_avail[i] =
gres_slurmd_conf->count;
} else {
_add_gres_type(gres_slurmd_conf->type_name,
gres_data,
gres_slurmd_conf->count);
}
}
list_iterator_destroy(iter);
}
if ((orig_config == NULL) || (orig_config[0] == '\0'))
gres_data->gres_cnt_config = 0;
else if (gres_data->gres_cnt_config == NO_VAL64) {
/* This should have been filled in by _node_config_init() */
_get_gres_cnt(gres_data, orig_config,
context_ptr->gres_name,
context_ptr->gres_name_colon,
context_ptr->gres_name_colon_len);
}
gres_data->gres_cnt_avail = gres_data->gres_cnt_config;
if (has_file) {
uint64_t gres_bits;
if (_shared_gres(context_ptr->plugin_id)) {
gres_bits = topo_cnt;
} else {
if (gres_data->gres_cnt_avail > MAX_GRES_BITMAP) {
error("%s: %s has \"File\" plus very large \"Count\" "
"(%"PRIu64") for node %s, resetting value to %u",
__func__, context_ptr->gres_type,
gres_data->gres_cnt_avail, node_name,
MAX_GRES_BITMAP);
gres_data->gres_cnt_avail = MAX_GRES_BITMAP;
gres_data->gres_cnt_found = MAX_GRES_BITMAP;
}
gres_bits = gres_data->gres_cnt_avail;
}
_gres_bit_alloc_resize(gres_data, gres_bits);
}
if ((config_type_cnt > 1) &&
!_valid_gres_types(context_ptr->gres_type, gres_data, reason_down)){
rc = EINVAL;
} else if (!config_overrides &&
(gres_data->gres_cnt_found < gres_data->gres_cnt_config)) {
if (reason_down && (*reason_down == NULL)) {
xstrfmtcat(*reason_down,
"%s count too low (%"PRIu64" < %"PRIu64")",
context_ptr->gres_type,
gres_data->gres_cnt_found,
gres_data->gres_cnt_config);
}
rc = EINVAL;
} else if (_valid_gres_type(context_ptr->gres_type, gres_data,
config_overrides, reason_down)) {
rc = EINVAL;
} else if (config_overrides && gres_data->topo_cnt &&
(gres_data->gres_cnt_found != gres_data->gres_cnt_config)) {
error("%s on node %s configured for %"PRIu64" resources but "
"%"PRIu64" found, ignoring topology support",
context_ptr->gres_type, node_name,
gres_data->gres_cnt_config, gres_data->gres_cnt_found);
if (gres_data->topo_core_bitmap) {
for (i = 0; i < gres_data->topo_cnt; i++) {
if (gres_data->topo_core_bitmap) {
FREE_NULL_BITMAP(gres_data->
topo_core_bitmap[i]);
}
if (gres_data->topo_gres_bitmap) {
FREE_NULL_BITMAP(gres_data->
topo_gres_bitmap[i]);
}
xfree(gres_data->topo_type_name[i]);
}
xfree(gres_data->topo_core_bitmap);
xfree(gres_data->topo_gres_bitmap);
xfree(gres_data->topo_gres_cnt_alloc);
xfree(gres_data->topo_gres_cnt_avail);
xfree(gres_data->topo_type_id);
xfree(gres_data->topo_type_name);
}
gres_data->topo_cnt = 0;
}
return rc;
}
/*
* Validate a node's configuration and put a gres record onto a list
* Called immediately after gres_plugin_node_config_unpack().
* IN node_name - name of the node for which the gres information applies
* IN orig_config - Gres information supplied from merged slurm.conf/gres.conf
* IN/OUT new_config - Updated gres info from slurm.conf
* IN/OUT gres_list - List of Gres records for this node to track usage
* IN threads_per_core - Count of CPUs (threads) per core on this node
* IN cores_per_sock - Count of cores per socket on this node
* IN sock_cnt - Count of sockets on this node
* IN config_overrides - true: Don't validate hardware, use slurm.conf
* configuration
* false: Validate hardware config, but use slurm.conf
* config
* OUT reason_down - if non-NULL, set to an explanation of any failure
*/
extern int gres_plugin_node_config_validate(char *node_name,
char *orig_config,
char **new_config,
List *gres_list,
int threads_per_core,
int cores_per_sock, int sock_cnt,
bool config_overrides,
char **reason_down)
{
int i, rc, rc2;
gres_state_t *gres_ptr, *gres_gpu_ptr = NULL, *gres_mps_ptr = NULL;
int core_cnt = sock_cnt * cores_per_sock;
int cpu_cnt = core_cnt * threads_per_core;
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
if ((gres_context_cnt > 0) && (*gres_list == NULL))
*gres_list = list_create(_gres_node_list_delete);
for (i = 0; i < gres_context_cnt; i++) {
/* Find or create gres_state entry on the list */
gres_ptr = list_find_first(*gres_list, _gres_find_id,
&gres_context[i].plugin_id);
if (gres_ptr == NULL) {
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[i].plugin_id;
list_append(*gres_list, gres_ptr);
}
rc2 = _node_config_validate(node_name, orig_config,
gres_ptr, cpu_cnt, core_cnt,
sock_cnt, config_overrides,
reason_down, &gres_context[i]);
rc = MAX(rc, rc2);
if (gres_ptr->plugin_id == gpu_plugin_id)
gres_gpu_ptr = gres_ptr;
else if (gres_ptr->plugin_id == mps_plugin_id)
gres_mps_ptr = gres_ptr;
}
_sync_node_mps_to_gpu(gres_mps_ptr, gres_gpu_ptr);
_build_node_gres_str(gres_list, new_config, cores_per_sock, sock_cnt);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
/* Convert number to new value with suffix (e.g. 2048 -> 2K) */
static void _gres_scale_value(uint64_t gres_size, uint64_t *gres_scaled,
char **suffix)
{
uint64_t tmp_gres_size = gres_size;
int i;
for (i = 0; i < 4; i++) {
if ((tmp_gres_size != 0) && ((tmp_gres_size % 1024) == 0))
tmp_gres_size /= 1024;
else
break;
}
*gres_scaled = tmp_gres_size;
if (i == 0)
*suffix = "";
else if (i == 1)
*suffix = "K";
else if (i == 2)
*suffix = "M";
else if (i == 3)
*suffix = "G";
else
*suffix = "T";
}
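/*
* Illustrative examples for _gres_scale_value() (not part of the build).
* The count is divided only while it remains an exact multiple of 1024,
* so non-multiples pass through unchanged:
*
*	_gres_scale_value(2048, &scaled, &sfx);	    scaled == 2,    sfx == "K"
*	_gres_scale_value(3145728, &scaled, &sfx);  scaled == 3,    sfx == "M"
*	_gres_scale_value(1000, &scaled, &sfx);	    scaled == 1000, sfx == ""
*/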
/*
* Add a GRES from node_feature plugin
* IN node_name - name of the node for which the gres information applies
* IN gres_name - name of the GRES being added or updated from the plugin
* IN gres_size - count of this GRES on this node
* IN/OUT new_config - Updated GRES info from slurm.conf
* IN/OUT gres_list - List of GRES records for this node to track usage
*/
extern void gres_plugin_node_feature(char *node_name,
char *gres_name, uint64_t gres_size,
char **new_config, List *gres_list)
{
char *new_gres = NULL, *tok, *save_ptr = NULL, *sep = "", *suffix = "";
gres_state_t *gres_ptr;
gres_node_state_t *gres_node_ptr;
uint32_t plugin_id;
uint64_t gres_scaled = 0;
int gres_name_len;
xassert(gres_name);
gres_name_len = strlen(gres_name);
plugin_id = gres_plugin_build_id(gres_name);
if (*new_config) {
tok = strtok_r(*new_config, ",", &save_ptr);
while (tok) {
if (!strncmp(tok, gres_name, gres_name_len) &&
((tok[gres_name_len] == ':') ||
(tok[gres_name_len] == '\0'))) {
/* Skip this record */
} else {
xstrfmtcat(new_gres, "%s%s", sep, tok);
sep = ",";
}
tok = strtok_r(NULL, ",", &save_ptr);
}
}
_gres_scale_value(gres_size, &gres_scaled, &suffix);
xstrfmtcat(new_gres, "%s%s:%"PRIu64"%s",
sep, gres_name, gres_scaled, suffix);
xfree(*new_config);
*new_config = new_gres;
slurm_mutex_lock(&gres_context_lock);
if (gres_context_cnt > 0) {
if (*gres_list == NULL)
*gres_list = list_create(_gres_node_list_delete);
gres_ptr = list_find_first(*gres_list, _gres_find_id,
&plugin_id);
if (gres_ptr == NULL) {
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = plugin_id;
gres_ptr->gres_data = _build_gres_node_state();
list_append(*gres_list, gres_ptr);
}
gres_node_ptr = gres_ptr->gres_data;
if (gres_size >= gres_node_ptr->gres_cnt_alloc) {
gres_node_ptr->gres_cnt_avail = gres_size -
gres_node_ptr->gres_cnt_alloc;
} else {
error("%s: Changed size count of GRES %s from %"PRIu64
" to %"PRIu64", resource over allocated",
__func__, gres_name,
gres_node_ptr->gres_cnt_avail, gres_size);
gres_node_ptr->gres_cnt_avail = 0;
}
gres_node_ptr->gres_cnt_config = gres_size;
gres_node_ptr->gres_cnt_found = gres_size;
gres_node_ptr->node_feature = true;
}
slurm_mutex_unlock(&gres_context_lock);
}
/*
* Check the validity of a GRES change. Specifically, if a GRES has "File"
* configured then the only valid new counts are the current count or zero.
*
* RET SLURM_SUCCESS if the requested change is valid,
* ESLURM_INVALID_GRES otherwise
*/
static int _node_reconfig_test(char *node_name, char *new_gres,
gres_state_t *gres_ptr,
slurm_gres_context_t *context_ptr)
{
gres_node_state_t *orig_gres_data, *new_gres_data;
int rc = SLURM_SUCCESS;
xassert(gres_ptr);
if (!(context_ptr->config_flags & GRES_CONF_HAS_FILE))
return SLURM_SUCCESS;
orig_gres_data = gres_ptr->gres_data;
new_gres_data = _build_gres_node_state();
_get_gres_cnt(new_gres_data, new_gres,
context_ptr->gres_name,
context_ptr->gres_name_colon,
context_ptr->gres_name_colon_len);
if ((new_gres_data->gres_cnt_config != 0) &&
(new_gres_data->gres_cnt_config !=
orig_gres_data->gres_cnt_config)) {
error("Attempt to change gres/%s Count on node %s from %"
PRIu64" to %"PRIu64" invalid with File configuration",
context_ptr->gres_name, node_name,
orig_gres_data->gres_cnt_config,
new_gres_data->gres_cnt_config);
rc = ESLURM_INVALID_GRES;
}
_gres_node_state_delete(new_gres_data);
return rc;
}
static int _node_reconfig(char *node_name, char *new_gres, char **gres_str,
gres_state_t *gres_ptr, bool config_overrides,
slurm_gres_context_t *context_ptr,
bool *updated_gpu_cnt)
{
int i;
gres_node_state_t *gres_data;
uint64_t gres_bits, orig_cnt;
xassert(gres_ptr);
xassert(updated_gpu_cnt);
*updated_gpu_cnt = false;
if (gres_ptr->gres_data == NULL)
gres_ptr->gres_data = _build_gres_node_state();
gres_data = gres_ptr->gres_data;
orig_cnt = gres_data->gres_cnt_config;
_get_gres_cnt(gres_data, new_gres,
context_ptr->gres_name,
context_ptr->gres_name_colon,
context_ptr->gres_name_colon_len);
if (gres_data->gres_cnt_config == orig_cnt)
return SLURM_SUCCESS; /* No change in count */
/* Update count */
context_ptr->total_cnt -= orig_cnt;
context_ptr->total_cnt += gres_data->gres_cnt_config;
if (!gres_data->gres_cnt_config)
gres_data->gres_cnt_avail = gres_data->gres_cnt_config;
else if (gres_data->gres_cnt_found != NO_VAL64)
gres_data->gres_cnt_avail = gres_data->gres_cnt_found;
else if (gres_data->gres_cnt_avail == NO_VAL64)
gres_data->gres_cnt_avail = 0;
if (context_ptr->config_flags & GRES_CONF_HAS_FILE) {
if (_shared_gres(context_ptr->plugin_id))
gres_bits = gres_data->topo_cnt;
else
gres_bits = gres_data->gres_cnt_avail;
_gres_bit_alloc_resize(gres_data, gres_bits);
} else if (gres_data->gres_bit_alloc &&
!_shared_gres(context_ptr->plugin_id)) {
/*
* If GRES count changed in configuration between reboots,
* update bitmap sizes as needed.
*/
gres_bits = gres_data->gres_cnt_avail;
if (gres_bits != bit_size(gres_data->gres_bit_alloc)) {
info("gres/%s count changed on node %s to %"PRIu64,
context_ptr->gres_name, node_name, gres_bits);
if (_sharing_gres(context_ptr->plugin_id))
*updated_gpu_cnt = true;
gres_data->gres_bit_alloc =
bit_realloc(gres_data->gres_bit_alloc,
gres_bits);
for (i = 0; i < gres_data->topo_cnt; i++) {
if (gres_data->topo_gres_bitmap &&
gres_data->topo_gres_bitmap[i] &&
(gres_bits !=
bit_size(gres_data->topo_gres_bitmap[i]))){
gres_data->topo_gres_bitmap[i] =
bit_realloc(
gres_data->topo_gres_bitmap[i],
gres_bits);
}
}
}
}
return SLURM_SUCCESS;
}
/* The GPU count on a node changed. Update MPS data structures to match */
static void _sync_node_mps_to_gpu(gres_state_t *mps_gres_ptr,
gres_state_t *gpu_gres_ptr)
{
gres_node_state_t *gpu_gres_data, *mps_gres_data;
uint64_t gpu_cnt, mps_alloc = 0, mps_rem;
int i;
if (!gpu_gres_ptr || !mps_gres_ptr)
return;
gpu_gres_data = gpu_gres_ptr->gres_data;
mps_gres_data = mps_gres_ptr->gres_data;
gpu_cnt = gpu_gres_data->gres_cnt_avail;
if (mps_gres_data->gres_bit_alloc) {
if (gpu_cnt == bit_size(mps_gres_data->gres_bit_alloc))
return; /* No change for gres/mps */
}
if (gpu_cnt == 0)
return; /* Still no GPUs */
/* Free any excess gres/mps topo records */
for (i = gpu_cnt; i < mps_gres_data->topo_cnt; i++) {
if (mps_gres_data->topo_core_bitmap)
FREE_NULL_BITMAP(mps_gres_data->topo_core_bitmap[i]);
if (mps_gres_data->topo_gres_bitmap)
FREE_NULL_BITMAP(mps_gres_data->topo_gres_bitmap[i]);
xfree(mps_gres_data->topo_type_name[i]);
}
if (mps_gres_data->gres_cnt_avail == 0) {
/* No gres/mps on this node */
mps_gres_data->topo_cnt = 0;
return;
}
if (!mps_gres_data->gres_bit_alloc) {
mps_gres_data->gres_bit_alloc = bit_alloc(gpu_cnt);
} else {
mps_gres_data->gres_bit_alloc =
bit_realloc(mps_gres_data->gres_bit_alloc,
gpu_cnt);
}
/* Add any additional required gres/mps topo records */
if (mps_gres_data->topo_cnt) {
mps_gres_data->topo_core_bitmap =
xrealloc(mps_gres_data->topo_core_bitmap,
sizeof(bitstr_t *) * gpu_cnt);
mps_gres_data->topo_gres_bitmap =
xrealloc(mps_gres_data->topo_gres_bitmap,
sizeof(bitstr_t *) * gpu_cnt);
mps_gres_data->topo_gres_cnt_alloc =
xrealloc(mps_gres_data->topo_gres_cnt_alloc,
sizeof(uint64_t) * gpu_cnt);
mps_gres_data->topo_gres_cnt_avail =
xrealloc(mps_gres_data->topo_gres_cnt_avail,
sizeof(uint64_t) * gpu_cnt);
mps_gres_data->topo_type_id =
xrealloc(mps_gres_data->topo_type_id,
sizeof(uint32_t) * gpu_cnt);
mps_gres_data->topo_type_name =
xrealloc(mps_gres_data->topo_type_name,
sizeof(char *) * gpu_cnt);
} else {
mps_gres_data->topo_core_bitmap =
xcalloc(gpu_cnt, sizeof(bitstr_t *));
mps_gres_data->topo_gres_bitmap =
xcalloc(gpu_cnt, sizeof(bitstr_t *));
mps_gres_data->topo_gres_cnt_alloc =
xcalloc(gpu_cnt, sizeof(uint64_t));
mps_gres_data->topo_gres_cnt_avail =
xcalloc(gpu_cnt, sizeof(uint64_t));
mps_gres_data->topo_type_id =
xcalloc(gpu_cnt, sizeof(uint32_t));
mps_gres_data->topo_type_name =
xcalloc(gpu_cnt, sizeof(char *));
}
/*
* Evenly distribute any remaining MPS counts.
* Counts get reset as needed when the node registers.
*/
for (i = 0; i < mps_gres_data->topo_cnt; i++)
mps_alloc += mps_gres_data->topo_gres_cnt_avail[i];
if (mps_alloc >= mps_gres_data->gres_cnt_avail)
mps_rem = 0;
else
mps_rem = mps_gres_data->gres_cnt_avail - mps_alloc;
for (i = mps_gres_data->topo_cnt; i < gpu_cnt; i++) {
mps_gres_data->topo_gres_bitmap[i] = bit_alloc(gpu_cnt);
bit_set(mps_gres_data->topo_gres_bitmap[i], i);
mps_alloc = mps_rem / (gpu_cnt - i);
mps_gres_data->topo_gres_cnt_avail[i] = mps_alloc;
mps_rem -= mps_alloc;
}
mps_gres_data->topo_cnt = gpu_cnt;
for (i = 0; i < mps_gres_data->topo_cnt; i++) {
if (mps_gres_data->topo_gres_bitmap &&
mps_gres_data->topo_gres_bitmap[i] &&
(gpu_cnt != bit_size(mps_gres_data->topo_gres_bitmap[i]))) {
mps_gres_data->topo_gres_bitmap[i] =
bit_realloc(mps_gres_data->topo_gres_bitmap[i],
gpu_cnt);
}
}
}
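/*
* Worked example of the MPS distribution above (illustrative only): with
* mps_gres_data->gres_cnt_avail = 100, no prior topo records and
* gpu_cnt = 3, each pass assigns mps_rem / (gpu_cnt - i):
* 100/3 = 33, then 67/2 = 33, then 34/1 = 34, giving
* topo_gres_cnt_avail = {33, 33, 34} with the full 100 consumed.
*/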
/* Convert core bitmap into socket string; caller must xfree the return value */
static char *_core_bitmap2str(bitstr_t *core_map, int cores_per_sock,
int sock_per_node)
{
char *sock_info = NULL, tmp[256];
bitstr_t *sock_map;
int c, s, core_offset, max_core;
bool any_set = false;
xassert(core_map);
max_core = bit_size(core_map) - 1;
sock_map = bit_alloc(sock_per_node);
for (s = 0; s < sock_per_node; s++) {
core_offset = s * cores_per_sock;
for (c = 0; c < cores_per_sock; c++) {
if (core_offset > max_core) {
error("%s: bad core offset (%d >= %d)",
__func__, core_offset, max_core);
break;
}
if (bit_test(core_map, core_offset++)) {
bit_set(sock_map, s);
any_set = true;
break;
}
}
}
if (any_set) {
bit_fmt(tmp, sizeof(tmp), sock_map);
xstrfmtcat(sock_info, "(S:%s)", tmp);
} else {
/* We have a core bitmap with no bits set */
sock_info = xstrdup("");
}
bit_free(sock_map);
return sock_info;
}
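/*
* Worked example for _core_bitmap2str() (illustrative only): with
* cores_per_sock = 4, sock_per_node = 2 and a core_map with bits 0-3 set
* (all of socket 0, none of socket 1), the result is "(S:0)". An empty
* core_map yields "". The caller must xfree() the returned string.
*/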
/* Given a count, modify it as needed and return suffix (e.g. "M" for mega) */
static char *_get_suffix(uint64_t *count)
{
if (*count == 0)
return "";
if ((*count % ((uint64_t)1024 * 1024 * 1024 * 1024 * 1024)) == 0) {
*count /= ((uint64_t)1024 * 1024 * 1024 * 1024 * 1024);
return "P";
} else if ((*count % ((uint64_t)1024 * 1024 * 1024 * 1024)) == 0) {
*count /= ((uint64_t)1024 * 1024 * 1024 * 1024);
return "T";
} else if ((*count % ((uint64_t)1024 * 1024 * 1024)) == 0) {
*count /= ((uint64_t)1024 * 1024 * 1024);
return "G";
} else if ((*count % (1024 * 1024)) == 0) {
*count /= (1024 * 1024);
return "M";
} else if ((*count % 1024) == 0) {
*count /= 1024;
return "K";
} else {
return "";
}
}
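/*
* Illustrative example for _get_suffix() (not part of the build): a count
* that is an exact multiple of a power of 1024 is reduced in place and the
* matching suffix returned, the reverse of the suffix_mult() parsing used
* when reading configuration:
*
*	uint64_t cnt = 2 * 1024 * 1024;
*	char *sfx = _get_suffix(&cnt);	now cnt == 2, sfx == "M"
*/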
/* Build node's GRES string based upon data in that node's GRES list */
static void _build_node_gres_str(List *gres_list, char **gres_str,
int cores_per_sock, int sock_per_node)
{
gres_state_t *gres_ptr;
gres_node_state_t *gres_node_state;
bitstr_t *done_topo, *core_map;
uint64_t gres_sum;
char *sep = "", *suffix, *sock_info = NULL, *sock_str;
int c, i, j;
xassert(gres_str);
xfree(*gres_str);
for (c = 0; c < gres_context_cnt; c++) {
/* Find gres_state entry on the list */
gres_ptr = list_find_first(*gres_list, _gres_find_id,
&gres_context[c].plugin_id);
if (gres_ptr == NULL)
continue; /* Node has none of this GRES */
gres_node_state = (gres_node_state_t *) gres_ptr->gres_data;
if (gres_node_state->topo_cnt &&
gres_node_state->gres_cnt_avail) {
done_topo = bit_alloc(gres_node_state->topo_cnt);
for (i = 0; i < gres_node_state->topo_cnt; i++) {
if (bit_test(done_topo, i))
continue;
bit_set(done_topo, i);
gres_sum = gres_node_state->
topo_gres_cnt_avail[i];
if (gres_node_state->topo_core_bitmap[i]) {
core_map = bit_copy(
gres_node_state->
topo_core_bitmap[i]);
} else
core_map = NULL;
for (j = 0; j < gres_node_state->topo_cnt; j++){
if (gres_node_state->topo_type_id[i] !=
gres_node_state->topo_type_id[j])
continue;
if (bit_test(done_topo, j))
continue;
bit_set(done_topo, j);
gres_sum += gres_node_state->
topo_gres_cnt_avail[j];
if (core_map &&
gres_node_state->
topo_core_bitmap[j]) {
bit_or(core_map,
gres_node_state->
topo_core_bitmap[j]);
} else if (gres_node_state->
topo_core_bitmap[j]) {
core_map = bit_copy(
gres_node_state->
topo_core_bitmap[j]);
}
}
if (core_map) {
sock_info = _core_bitmap2str(core_map,
cores_per_sock,
sock_per_node);
bit_free(core_map);
sock_str = sock_info;
} else
sock_str = "";
suffix = _get_suffix(&gres_sum);
if (gres_node_state->topo_type_name[i]) {
xstrfmtcat(*gres_str,
"%s%s:%s:%"PRIu64"%s%s", sep,
gres_context[c].gres_name,
gres_node_state->
topo_type_name[i],
gres_sum, suffix, sock_str);
} else {
xstrfmtcat(*gres_str,
"%s%s:%"PRIu64"%s%s", sep,
gres_context[c].gres_name,
gres_sum, suffix, sock_str);
}
xfree(sock_info);
sep = ",";
}
bit_free(done_topo);
} else if (gres_node_state->type_cnt &&
gres_node_state->gres_cnt_avail) {
for (i = 0; i < gres_node_state->type_cnt; i++) {
gres_sum = gres_node_state->type_cnt_avail[i];
suffix = _get_suffix(&gres_sum);
xstrfmtcat(*gres_str, "%s%s:%s:%"PRIu64"%s",
sep, gres_context[c].gres_name,
gres_node_state->type_name[i],
gres_sum, suffix);
sep = ",";
}
} else if (gres_node_state->gres_cnt_avail) {
gres_sum = gres_node_state->gres_cnt_avail;
suffix = _get_suffix(&gres_sum);
xstrfmtcat(*gres_str, "%s%s:%"PRIu64"%s",
sep, gres_context[c].gres_name,
gres_sum, suffix);
sep = ",";
}
}
}
/*
* Note that a node's configuration has been modified (e.g. "scontrol update ...")
* IN node_name - name of the node for which the gres information applies
* IN new_gres - Updated GRES information supplied from slurm.conf or scontrol
* IN/OUT gres_str - Node's current GRES string, updated as needed
* IN/OUT gres_list - List of Gres records for this node to track usage
* IN config_overrides - true: Don't validate hardware, use slurm.conf
* configuration
* false: Validate hardware config, but use slurm.conf
* config
* IN cores_per_sock - Number of cores per socket on this node
* IN sock_per_node - Total count of sockets on this node (on any board)
*/
extern int gres_plugin_node_reconfig(char *node_name,
char *new_gres,
char **gres_str,
List *gres_list,
bool config_overrides,
int cores_per_sock,
int sock_per_node)
{
int i, rc;
ListIterator gres_iter;
gres_state_t *gres_ptr = NULL, **gres_ptr_array;
gres_state_t *gpu_gres_ptr = NULL, *mps_gres_ptr;
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_ptr_array = xcalloc(gres_context_cnt, sizeof(gres_state_t *));
if ((gres_context_cnt > 0) && (*gres_list == NULL))
*gres_list = list_create(_gres_node_list_delete);
/* First validate all of the requested GRES changes */
for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) {
/* Find gres_state entry on the list */
gres_ptr = list_find_first(*gres_list, _gres_find_id,
&gres_context[i].plugin_id);
if (gres_ptr == NULL)
continue;
gres_ptr_array[i] = gres_ptr;
rc = _node_reconfig_test(node_name, new_gres, gres_ptr,
&gres_context[i]);
}
/* Now update the GRES counts */
for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) {
bool updated_gpu_cnt = false;
if (gres_ptr_array[i] == NULL)
continue;
rc = _node_reconfig(node_name, new_gres, gres_str,
gres_ptr_array[i], config_overrides,
&gres_context[i], &updated_gpu_cnt);
if (updated_gpu_cnt)
gpu_gres_ptr = gres_ptr_array[i];
}
/* Now synchronize gres/gpu and gres/mps state */
if (gpu_gres_ptr && have_mps) {
/* Update gres/mps counts and bitmaps to match gres/gpu */
gres_iter = list_iterator_create(*gres_list);
while ((mps_gres_ptr = (gres_state_t *) list_next(gres_iter))) {
if (_shared_gres(mps_gres_ptr->plugin_id))
break;
}
list_iterator_destroy(gres_iter);
_sync_node_mps_to_gpu(mps_gres_ptr, gpu_gres_ptr);
}
/* Build new per-node gres_str */
_build_node_gres_str(gres_list, gres_str, cores_per_sock,sock_per_node);
slurm_mutex_unlock(&gres_context_lock);
xfree(gres_ptr_array);
return rc;
}
/*
* Pack a node's current gres status, called from slurmctld for save/restore
* IN gres_list - generated by gres_plugin_node_config_validate()
* IN/OUT buffer - location to write state to
* IN node_name - name of the node for which the gres information applies
*/
extern int gres_plugin_node_state_pack(List gres_list, Buf buffer,
char *node_name)
{
int rc = SLURM_SUCCESS;
uint32_t top_offset, tail_offset;
uint32_t magic = GRES_MAGIC;
uint16_t gres_bitmap_size, rec_cnt = 0;
ListIterator gres_iter;
gres_state_t *gres_ptr;
gres_node_state_t *gres_node_ptr;
if (gres_list == NULL) {
pack16(rec_cnt, buffer);
return rc;
}
top_offset = get_buf_offset(buffer);
pack16(rec_cnt, buffer); /* placeholder, value rewritten below */
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
pack32(magic, buffer);
pack32(gres_ptr->plugin_id, buffer);
pack64(gres_node_ptr->gres_cnt_avail, buffer);
/*
* Just pack the size of gres_bit_alloc (if it exists).
* The bitmap itself is rebuilt based upon the state of recovered jobs.
*/
if (gres_node_ptr->gres_bit_alloc)
gres_bitmap_size = bit_size(gres_node_ptr->gres_bit_alloc);
else
gres_bitmap_size = 0;
pack16(gres_bitmap_size, buffer);
rec_cnt++;
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
tail_offset = get_buf_offset(buffer);
set_buf_offset(buffer, top_offset);
pack16(rec_cnt, buffer);
set_buf_offset(buffer, tail_offset);
return rc;
}
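/*
* The rec_cnt handling above is a common Slurm pack pattern: write a
* placeholder count, pack the records while counting them, then rewind the
* buffer offset and overwrite the placeholder. Minimal sketch of the
* pattern (illustrative only):
*
*	top_offset = get_buf_offset(buffer);
*	pack16(0, buffer);			placeholder
*	... pack each record, incrementing rec_cnt ...
*	tail_offset = get_buf_offset(buffer);
*	set_buf_offset(buffer, top_offset);
*	pack16(rec_cnt, buffer);		real count
*	set_buf_offset(buffer, tail_offset);
*
* gres_plugin_node_state_unpack() then reads rec_cnt first and loops that
* many times.
*/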
/*
* Unpack a node's current gres status, called from slurmctld for save/restore
* OUT gres_list - restored state stored by gres_plugin_node_state_pack()
* IN/OUT buffer - location to read state from
* IN node_name - name of the node for which the gres information applies
*/
extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
char *node_name,
uint16_t protocol_version)
{
int i, rc;
uint32_t magic = 0, plugin_id = 0;
uint64_t gres_cnt_avail = 0;
uint16_t gres_bitmap_size = 0, rec_cnt = 0;
uint8_t has_bitmap = 0;
gres_state_t *gres_ptr;
gres_node_state_t *gres_node_ptr;
safe_unpack16(&rec_cnt, buffer);
if (rec_cnt == 0)
return SLURM_SUCCESS;
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
if ((gres_context_cnt > 0) && (*gres_list == NULL))
*gres_list = list_create(_gres_node_list_delete);
while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
if ((buffer == NULL) || (remaining_buf(buffer) == 0))
break;
rec_cnt--;
if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
safe_unpack32(&magic, buffer);
if (magic != GRES_MAGIC)
goto unpack_error;
safe_unpack32(&plugin_id, buffer);
safe_unpack64(&gres_cnt_avail, buffer);
safe_unpack16(&gres_bitmap_size, buffer);
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
safe_unpack32(&magic, buffer);
if (magic != GRES_MAGIC)
goto unpack_error;
safe_unpack32(&plugin_id, buffer);
safe_unpack64(&gres_cnt_avail, buffer);
safe_unpack8(&has_bitmap, buffer);
if (has_bitmap)
gres_bitmap_size = gres_cnt_avail;
else
gres_bitmap_size = 0;
} else {
error("%s: protocol_version %hu not supported",
__func__, protocol_version);
goto unpack_error;
}
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].plugin_id == plugin_id)
break;
}
if (i >= gres_context_cnt) {
error("%s: no plugin configured to unpack data type %u from node %s",
__func__, plugin_id, node_name);
/*
* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data.
*/
continue;
}
gres_node_ptr = _build_gres_node_state();
gres_node_ptr->gres_cnt_avail = gres_cnt_avail;
if (gres_bitmap_size) {
gres_node_ptr->gres_bit_alloc =
bit_alloc(gres_bitmap_size);
}
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[i].plugin_id;
gres_ptr->gres_data = gres_node_ptr;
list_append(*gres_list, gres_ptr);
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
unpack_error:
error("%s: unpack error from node %s", __func__, node_name);
slurm_mutex_unlock(&gres_context_lock);
return SLURM_ERROR;
}
static void *_node_state_dup(void *gres_data)
{
int i, j;
gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data;
gres_node_state_t *new_gres;
if (gres_ptr == NULL)
return NULL;
new_gres = xmalloc(sizeof(gres_node_state_t));
new_gres->gres_cnt_found = gres_ptr->gres_cnt_found;
new_gres->gres_cnt_config = gres_ptr->gres_cnt_config;
new_gres->gres_cnt_avail = gres_ptr->gres_cnt_avail;
new_gres->gres_cnt_alloc = gres_ptr->gres_cnt_alloc;
new_gres->no_consume = gres_ptr->no_consume;
if (gres_ptr->gres_bit_alloc)
new_gres->gres_bit_alloc = bit_copy(gres_ptr->gres_bit_alloc);
if (gres_ptr->links_cnt && gres_ptr->link_len) {
new_gres->links_cnt = xcalloc(gres_ptr->link_len,
sizeof(int *));
j = sizeof(int) * gres_ptr->link_len;
for (i = 0; i < gres_ptr->link_len; i++) {
new_gres->links_cnt[i] = xmalloc(j);
memcpy(new_gres->links_cnt[i],gres_ptr->links_cnt[i],j);
}
new_gres->link_len = gres_ptr->link_len;
}
if (gres_ptr->topo_cnt) {
new_gres->topo_cnt = gres_ptr->topo_cnt;
new_gres->topo_core_bitmap = xcalloc(gres_ptr->topo_cnt,
sizeof(bitstr_t *));
new_gres->topo_gres_bitmap = xcalloc(gres_ptr->topo_cnt,
sizeof(bitstr_t *));
new_gres->topo_gres_cnt_alloc = xcalloc(gres_ptr->topo_cnt,
sizeof(uint64_t));
new_gres->topo_gres_cnt_avail = xcalloc(gres_ptr->topo_cnt,
sizeof(uint64_t));
new_gres->topo_type_id = xcalloc(gres_ptr->topo_cnt,
sizeof(uint32_t));
new_gres->topo_type_name = xcalloc(gres_ptr->topo_cnt,
sizeof(char *));
for (i = 0; i < gres_ptr->topo_cnt; i++) {
if (gres_ptr->topo_core_bitmap[i]) {
new_gres->topo_core_bitmap[i] =
bit_copy(gres_ptr->topo_core_bitmap[i]);
}
new_gres->topo_gres_bitmap[i] =
bit_copy(gres_ptr->topo_gres_bitmap[i]);
new_gres->topo_gres_cnt_alloc[i] =
gres_ptr->topo_gres_cnt_alloc[i];
new_gres->topo_gres_cnt_avail[i] =
gres_ptr->topo_gres_cnt_avail[i];
new_gres->topo_type_id[i] = gres_ptr->topo_type_id[i];
new_gres->topo_type_name[i] =
xstrdup(gres_ptr->topo_type_name[i]);
}
}
if (gres_ptr->type_cnt) {
new_gres->type_cnt = gres_ptr->type_cnt;
new_gres->type_cnt_alloc = xcalloc(gres_ptr->type_cnt,
sizeof(uint64_t));
new_gres->type_cnt_avail = xcalloc(gres_ptr->type_cnt,
sizeof(uint64_t));
new_gres->type_id = xcalloc(gres_ptr->type_cnt,
sizeof(uint32_t));
new_gres->type_name = xcalloc(gres_ptr->type_cnt,
sizeof(char *));
for (i = 0; i < gres_ptr->type_cnt; i++) {
new_gres->type_cnt_alloc[i] =
gres_ptr->type_cnt_alloc[i];
new_gres->type_cnt_avail[i] =
gres_ptr->type_cnt_avail[i];
new_gres->type_id[i] = gres_ptr->type_id[i];
new_gres->type_name[i] =
xstrdup(gres_ptr->type_name[i]);
}
}
return new_gres;
}
/*
* Duplicate a node gres status (used for will-run logic)
* IN gres_list - node gres state information
* RET a copy of gres_list or NULL on failure
*/
extern List gres_plugin_node_state_dup(List gres_list)
{
int i;
List new_list = NULL;
ListIterator gres_iter;
gres_state_t *gres_ptr, *new_gres;
void *gres_data;
if (gres_list == NULL)
return new_list;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
if (gres_context_cnt > 0) {
new_list = list_create(_gres_node_list_delete);
}
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (gres_ptr->plugin_id != gres_context[i].plugin_id)
continue;
gres_data = _node_state_dup(gres_ptr->gres_data);
if (gres_data) {
new_gres = xmalloc(sizeof(gres_state_t));
new_gres->plugin_id = gres_ptr->plugin_id;
new_gres->gres_data = gres_data;
list_append(new_list, new_gres);
}
break;
}
if (i >= gres_context_cnt) {
error("Could not find plugin id %u to dup node record",
gres_ptr->plugin_id);
}
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return new_list;
}
static void _node_state_dealloc(gres_state_t *gres_ptr)
{
int i;
gres_node_state_t *gres_node_ptr;
char *gres_name = NULL;
gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
gres_node_ptr->gres_cnt_alloc = 0;
if (gres_node_ptr->gres_bit_alloc) {
int i = bit_size(gres_node_ptr->gres_bit_alloc) - 1;
if (i >= 0)
bit_nclear(gres_node_ptr->gres_bit_alloc, 0, i);
}
if (gres_node_ptr->topo_cnt && !gres_node_ptr->topo_gres_cnt_alloc) {
for (i = 0; i < gres_context_cnt; i++) {
if (gres_ptr->plugin_id == gres_context[i].plugin_id) {
gres_name = gres_context[i].gres_name;
break;
}
}
error("gres_plugin_node_state_dealloc_all: gres/%s topo_cnt!=0 "
"and topo_gres_cnt_alloc is NULL", gres_name);
} else if (gres_node_ptr->topo_cnt) {
for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
gres_node_ptr->topo_gres_cnt_alloc[i] = 0;
}
} else {
/*
* This array can be set at startup if a job has been allocated
* specific GRES and the node has not registered with the
* details needed to track individual GRES (rather than only
* a GRES count).
*/
xfree(gres_node_ptr->topo_gres_cnt_alloc);
}
for (i = 0; i < gres_node_ptr->type_cnt; i++) {
gres_node_ptr->type_cnt_alloc[i] = 0;
}
}
/*
* Deallocate all resources on this node previously allocated to any jobs.
* This function is used to synchronize state after slurmctld restarts or
* is reconfigured.
* IN gres_list - node gres state information
*/
extern void gres_plugin_node_state_dealloc_all(List gres_list)
{
ListIterator gres_iter;
gres_state_t *gres_ptr;
if (gres_list == NULL)
return;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
_node_state_dealloc(gres_ptr);
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
}
static char *_node_gres_used(void *gres_data, char *gres_name)
{
gres_node_state_t *gres_node_ptr;
char *sep = "";
int i, j;
xassert(gres_data);
gres_node_ptr = (gres_node_state_t *) gres_data;
if ((gres_node_ptr->topo_cnt != 0) &&
(gres_node_ptr->no_consume == false)) {
bitstr_t *topo_printed = bit_alloc(gres_node_ptr->topo_cnt);
xfree(gres_node_ptr->gres_used); /* Free any cached value */
for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
bitstr_t *topo_gres_bitmap = NULL;
uint64_t gres_alloc_cnt = 0;
char *gres_alloc_idx, tmp_str[64];
if (bit_test(topo_printed, i))
continue;
bit_set(topo_printed, i);
if (gres_node_ptr->topo_gres_bitmap[i]) {
topo_gres_bitmap =
bit_copy(gres_node_ptr->
topo_gres_bitmap[i]);
}
for (j = i + 1; j < gres_node_ptr->topo_cnt; j++) {
if (bit_test(topo_printed, j))
continue;
if (gres_node_ptr->topo_type_id[i] !=
gres_node_ptr->topo_type_id[j])
continue;
bit_set(topo_printed, j);
if (gres_node_ptr->topo_gres_bitmap[j]) {
if (!topo_gres_bitmap) {
topo_gres_bitmap =
bit_copy(gres_node_ptr->
topo_gres_bitmap[j]);
} else if (bit_size(topo_gres_bitmap) ==
bit_size(gres_node_ptr->
topo_gres_bitmap[j])){
bit_or(topo_gres_bitmap,
gres_node_ptr->
topo_gres_bitmap[j]);
}
}
}
if (gres_node_ptr->gres_bit_alloc && topo_gres_bitmap &&
(bit_size(topo_gres_bitmap) ==
bit_size(gres_node_ptr->gres_bit_alloc))) {
bit_and(topo_gres_bitmap,
gres_node_ptr->gres_bit_alloc);
gres_alloc_cnt = bit_set_count(topo_gres_bitmap);
}
if (gres_alloc_cnt > 0) {
bit_fmt(tmp_str, sizeof(tmp_str),
topo_gres_bitmap);
gres_alloc_idx = tmp_str;
} else {
gres_alloc_idx = "N/A";
}
xstrfmtcat(gres_node_ptr->gres_used,
"%s%s:%s:%"PRIu64"(IDX:%s)", sep, gres_name,
gres_node_ptr->topo_type_name[i],
gres_alloc_cnt, gres_alloc_idx);
sep = ",";
FREE_NULL_BITMAP(topo_gres_bitmap);
}
FREE_NULL_BITMAP(topo_printed);
} else if (gres_node_ptr->gres_used) {
; /* Use the cached value */
} else if (gres_node_ptr->type_cnt == 0) {
if (gres_node_ptr->no_consume) {
xstrfmtcat(gres_node_ptr->gres_used, "%s:0", gres_name);
} else {
xstrfmtcat(gres_node_ptr->gres_used, "%s:%"PRIu64,
gres_name, gres_node_ptr->gres_cnt_alloc);
}
} else {
for (i = 0; i < gres_node_ptr->type_cnt; i++) {
if (gres_node_ptr->no_consume) {
xstrfmtcat(gres_node_ptr->gres_used,
"%s%s:%s:0", sep, gres_name,
gres_node_ptr->type_name[i]);
} else {
xstrfmtcat(gres_node_ptr->gres_used,
"%s%s:%s:%"PRIu64, sep, gres_name,
gres_node_ptr->type_name[i],
gres_node_ptr->type_cnt_alloc[i]);
}
sep = ",";
}
}
return gres_node_ptr->gres_used;
}
static void _node_state_log(void *gres_data, char *node_name, char *gres_name)
{
gres_node_state_t *gres_node_ptr;
int i, j;
char *buf = NULL, *sep, tmp_str[128];
xassert(gres_data);
gres_node_ptr = (gres_node_state_t *) gres_data;
info("gres/%s: state for %s", gres_name, node_name);
if (gres_node_ptr->gres_cnt_found == NO_VAL64) {
snprintf(tmp_str, sizeof(tmp_str), "TBD");
} else {
snprintf(tmp_str, sizeof(tmp_str), "%"PRIu64,
gres_node_ptr->gres_cnt_found);
}
if (gres_node_ptr->no_consume) {
info(" gres_cnt found:%s configured:%"PRIu64" "
"avail:%"PRIu64" no_consume",
tmp_str, gres_node_ptr->gres_cnt_config,
gres_node_ptr->gres_cnt_avail);
} else {
info(" gres_cnt found:%s configured:%"PRIu64" "
"avail:%"PRIu64" alloc:%"PRIu64"",
tmp_str, gres_node_ptr->gres_cnt_config,
gres_node_ptr->gres_cnt_avail,
gres_node_ptr->gres_cnt_alloc);
}
if (gres_node_ptr->gres_bit_alloc) {
bit_fmt(tmp_str, sizeof(tmp_str),gres_node_ptr->gres_bit_alloc);
info(" gres_bit_alloc:%s of %d",
tmp_str, (int) bit_size(gres_node_ptr->gres_bit_alloc));
} else {
info(" gres_bit_alloc:NULL");
}
info(" gres_used:%s", gres_node_ptr->gres_used);
if (gres_node_ptr->links_cnt && gres_node_ptr->link_len) {
for (i = 0; i < gres_node_ptr->link_len; i++) {
sep = "";
for (j = 0; j < gres_node_ptr->link_len; j++) {
xstrfmtcat(buf, "%s%d", sep,
gres_node_ptr->links_cnt[i][j]);
sep = ", ";
}
info(" links[%d]:%s", i, buf);
xfree(buf);
}
}
for (i = 0; i < gres_node_ptr->topo_cnt; i++) {
info(" topo[%d]:%s(%u)", i, gres_node_ptr->topo_type_name[i],
gres_node_ptr->topo_type_id[i]);
if (gres_node_ptr->topo_core_bitmap[i]) {
bit_fmt(tmp_str, sizeof(tmp_str),
gres_node_ptr->topo_core_bitmap[i]);
info(" topo_core_bitmap[%d]:%s of %d", i, tmp_str,
(int)bit_size(gres_node_ptr->topo_core_bitmap[i]));
} else
info(" topo_core_bitmap[%d]:NULL", i);
if (gres_node_ptr->topo_gres_bitmap[i]) {
bit_fmt(tmp_str, sizeof(tmp_str),
gres_node_ptr->topo_gres_bitmap[i]);
info(" topo_gres_bitmap[%d]:%s of %d", i, tmp_str,
(int)bit_size(gres_node_ptr->topo_gres_bitmap[i]));
} else
info(" topo_gres_bitmap[%d]:NULL", i);
info(" topo_gres_cnt_alloc[%d]:%"PRIu64"", i,
gres_node_ptr->topo_gres_cnt_alloc[i]);
info(" topo_gres_cnt_avail[%d]:%"PRIu64"", i,
gres_node_ptr->topo_gres_cnt_avail[i]);
}
for (i = 0; i < gres_node_ptr->type_cnt; i++) {
info(" type[%d]:%s(%u)", i, gres_node_ptr->type_name[i],
gres_node_ptr->type_id[i]);
info(" type_cnt_alloc[%d]:%"PRIu64, i,
gres_node_ptr->type_cnt_alloc[i]);
info(" type_cnt_avail[%d]:%"PRIu64, i,
gres_node_ptr->type_cnt_avail[i]);
}
}
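/*
* Illustrative sketch: with gres_debug set, the info() calls above emit
* output of this general shape (all values hypothetical):
*
*	gres/gpu: state for node001
*	  gres_cnt found:4 configured:4 avail:4 alloc:2
*	  gres_bit_alloc:0-1 of 4
*	  gres_used:gpu:tesla:2(IDX:0-1)
*	  topo[0]:tesla(3034895962)
*	  topo_core_bitmap[0]:0-7 of 16
*	  topo_gres_bitmap[0]:0-3 of 4
*	  topo_gres_cnt_alloc[0]:2
*	  topo_gres_cnt_avail[0]:4
*/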
/*
* Log a node's current gres state
* IN gres_list - generated by gres_plugin_node_config_validate()
* IN node_name - name of the node for which the gres information applies
*/
extern void gres_plugin_node_state_log(List gres_list, char *node_name)
{
int i;
ListIterator gres_iter;
gres_state_t *gres_ptr;
if (!gres_debug || (gres_list == NULL))
return;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (gres_ptr->plugin_id !=
gres_context[i].plugin_id)
continue;
_node_state_log(gres_ptr->gres_data, node_name,
gres_context[i].gres_name);
break;
}
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
}
/*
* Build a string indicating a node's drained GRES
* IN gres_list - generated by gres_plugin_node_config_validate()
* RET - string, must be xfreed by caller
* NOTE: Currently a stub; per-GRES drain state is not tracked, so the
* string "N/A" is always returned
*/
extern char *gres_get_node_drain(List gres_list)
{
char *node_drain = xstrdup("N/A");
return node_drain;
}
/*
* Build a string indicating a node's used GRES
* IN gres_list - generated by gres_plugin_node_config_validate()
* RET - string, must be xfreed by caller
*/
extern char *gres_get_node_used(List gres_list)
{
int i;
ListIterator gres_iter;
gres_state_t *gres_ptr;
char *gres_used = NULL, *tmp;
if (!gres_list)
return gres_used;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (gres_ptr->plugin_id !=
gres_context[i].plugin_id)
continue;
tmp = _node_gres_used(gres_ptr->gres_data,
gres_context[i].gres_name);
if (!tmp)
continue;
if (gres_used)
xstrcat(gres_used, ",");
xstrcat(gres_used, tmp);
break;
}
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return gres_used;
}
/*
* Return the total system count of a given GRES
* Returns NO_VAL64 if the name is not found
*/
extern uint64_t gres_get_system_cnt(char *name)
{
uint64_t count = NO_VAL64;
int i;
if (!name)
return NO_VAL64;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
if (!xstrcmp(gres_context[i].gres_name, name)) {
count = gres_context[i].total_cnt;
break;
}
}
slurm_mutex_unlock(&gres_context_lock);
return count;
}
/*
* Get the count of a node's GRES
* IN gres_list - List of Gres records for this node to track usage
* IN name - name of gres
*/
extern uint64_t gres_plugin_node_config_cnt(List gres_list, char *name)
{
int i;
gres_state_t *gres_ptr;
gres_node_state_t *data_ptr;
uint64_t count = 0;
if (!gres_list || !name || !list_count(gres_list))
return count;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
if (!xstrcmp(gres_context[i].gres_name, name)) {
/* Find or create gres_state entry on the list */
gres_ptr = list_find_first(gres_list, _gres_find_id,
&gres_context[i].plugin_id);
if (!gres_ptr || !gres_ptr->gres_data)
break;
data_ptr = (gres_node_state_t *)gres_ptr->gres_data;
count = data_ptr->gres_cnt_config;
break;
} else if (!xstrncmp(name, gres_context[i].gres_name_colon,
gres_context[i].gres_name_colon_len)) {
int type;
uint32_t type_id;
char *type_str = NULL;
if (!(type_str = strchr(name, ':'))) {
error("Invalid gres name '%s'", name);
break;
}
type_str++;
gres_ptr = list_find_first(gres_list, _gres_find_id,
&gres_context[i].plugin_id);
if (!gres_ptr || !gres_ptr->gres_data)
break;
data_ptr = (gres_node_state_t *)gres_ptr->gres_data;
type_id = gres_plugin_build_id(type_str);
for (type = 0; type < data_ptr->type_cnt; type++) {
if (data_ptr->type_id[type] == type_id) {
count = data_ptr->type_cnt_avail[type];
break;
}
}
break;
}
}
slurm_mutex_unlock(&gres_context_lock);
return count;
}
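/*
* Usage sketch (hypothetical configuration): for a node configured with
* "gpu:tesla:2,gpu:volta:2", the plain name returns the configured total
* while the "name:type" form, matched via gres_name_colon, returns the
* per-type available count:
*
*	uint64_t all_gpus = gres_plugin_node_config_cnt(gres_list, "gpu");
*	uint64_t teslas = gres_plugin_node_config_cnt(gres_list, "gpu:tesla");
*	// all_gpus == 4, teslas == 2
*/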
static void _job_state_delete(void *gres_data)
{
int i;
gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
if (gres_ptr == NULL)
return;
for (i = 0; i < gres_ptr->node_cnt; i++) {
if (gres_ptr->gres_bit_alloc)
FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]);
if (gres_ptr->gres_bit_step_alloc)
FREE_NULL_BITMAP(gres_ptr->gres_bit_step_alloc[i]);
}
xfree(gres_ptr->gres_bit_alloc);
xfree(gres_ptr->gres_cnt_node_alloc);
xfree(gres_ptr->gres_bit_step_alloc);
xfree(gres_ptr->gres_cnt_step_alloc);
if (gres_ptr->gres_bit_select) {
for (i = 0; i < gres_ptr->total_node_cnt; i++)
FREE_NULL_BITMAP(gres_ptr->gres_bit_select[i]);
xfree(gres_ptr->gres_bit_select);
}
xfree(gres_ptr->gres_cnt_node_select);
xfree(gres_ptr->gres_name);
xfree(gres_ptr->type_name);
xfree(gres_ptr);
}
static void _gres_job_list_delete(void *list_element)
{
gres_state_t *gres_ptr;
if (gres_plugin_init() != SLURM_SUCCESS)
return;
gres_ptr = (gres_state_t *) list_element;
slurm_mutex_lock(&gres_context_lock);
_job_state_delete(gres_ptr->gres_data);
xfree(gres_ptr);
slurm_mutex_unlock(&gres_context_lock);
}
static int _clear_cpus_per_gres(void *x, void *arg)
{
gres_state_t *gres_ptr = (gres_state_t *) x;
gres_job_state_t *job_gres_data;
job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
job_gres_data->cpus_per_gres = 0;
return 0;
}
static int _clear_gres_per_job(void *x, void *arg)
{
gres_state_t *gres_ptr = (gres_state_t *) x;
gres_job_state_t *job_gres_data;
job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
job_gres_data->gres_per_job = 0;
return 0;
}
static int _clear_gres_per_node(void *x, void *arg)
{
gres_state_t *gres_ptr = (gres_state_t *) x;
gres_job_state_t *job_gres_data;
job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
job_gres_data->gres_per_node = 0;
return 0;
}
static int _clear_gres_per_socket(void *x, void *arg)
{
gres_state_t *gres_ptr = (gres_state_t *) x;
gres_job_state_t *job_gres_data;
job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
job_gres_data->gres_per_socket = 0;
return 0;
}
static int _clear_gres_per_task(void *x, void *arg)
{
gres_state_t *gres_ptr = (gres_state_t *) x;
gres_job_state_t *job_gres_data;
job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
job_gres_data->gres_per_task = 0;
return 0;
}
static int _clear_mem_per_gres(void *x, void *arg)
{
gres_state_t *gres_ptr = (gres_state_t *) x;
gres_job_state_t *job_gres_data;
job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
job_gres_data->mem_per_gres = 0;
return 0;
}
static int _clear_total_gres(void *x, void *arg)
{
gres_state_t *gres_ptr = (gres_state_t *) x;
gres_job_state_t *job_gres_data;
job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
job_gres_data->total_gres = 0;
return 0;
}
/*
* Ensure consistency of gres_per_* options
* Modify task and node count as needed for consistency with GRES options
* RET -1 on failure, 0 on success
*/
static int _test_gres_cnt(gres_job_state_t *job_gres_data,
uint32_t *num_tasks,
uint32_t *min_nodes, uint32_t *max_nodes,
uint16_t *ntasks_per_node,
uint16_t *ntasks_per_socket,
uint16_t *sockets_per_node,
uint16_t *cpus_per_task)
{
int req_nodes, req_tasks, req_tasks_per_node, req_tasks_per_socket;
int req_sockets, req_cpus_per_task;
uint16_t cpus_per_gres;
/* Ensure gres_per_job >= gres_per_node, gres_per_task and gres_per_socket */
if (job_gres_data->gres_per_job &&
((job_gres_data->gres_per_node &&
(job_gres_data->gres_per_node > job_gres_data->gres_per_job)) ||
(job_gres_data->gres_per_task &&
(job_gres_data->gres_per_task > job_gres_data->gres_per_job)) ||
(job_gres_data->gres_per_socket &&
(job_gres_data->gres_per_socket > job_gres_data->gres_per_job))))
return -1;
/* Ensure gres_per_node >= gres_per_task and gres_per_node >= gres_per_socket */
if (job_gres_data->gres_per_node &&
((job_gres_data->gres_per_task &&
(job_gres_data->gres_per_task > job_gres_data->gres_per_node)) ||
(job_gres_data->gres_per_socket &&
(job_gres_data->gres_per_socket > job_gres_data->gres_per_node))))
return -1;
/* gres_per_socket requires sockets-per-node count specification */
if (job_gres_data->gres_per_socket) {
if (*sockets_per_node == NO_VAL16)
return -1;
}
/*
* Ensure gres_per_job is multiple of gres_per_node
* Ensure node count is consistent with GRES parameters
*/
if (job_gres_data->gres_per_job && job_gres_data->gres_per_node) {
if (job_gres_data->gres_per_job % job_gres_data->gres_per_node){
/* gres_per_job not multiple of gres_per_node */
return -1;
}
req_nodes = job_gres_data->gres_per_job /
job_gres_data->gres_per_node;
if ((req_nodes < *min_nodes) || (req_nodes > *max_nodes))
return -1;
*min_nodes = *max_nodes = req_nodes;
}
/*
* Ensure gres_per_node is multiple of gres_per_socket
* Ensure sockets_per_node is consistent with GRES parameters
*/
if (job_gres_data->gres_per_node && job_gres_data->gres_per_socket) {
if (job_gres_data->gres_per_node %
job_gres_data->gres_per_socket) {
/* gres_per_node not multiple of gres_per_socket */
return -1;
}
req_sockets = job_gres_data->gres_per_node /
job_gres_data->gres_per_socket;
if (*sockets_per_node == NO_VAL16)
*sockets_per_node = req_sockets;
else if (*sockets_per_node != req_sockets)
return -1;
}
/*
* Ensure gres_per_job is multiple of gres_per_task
* Ensure task count is consistent with GRES parameters
*/
if (job_gres_data->gres_per_task) {
if (job_gres_data->gres_per_job) {
if (job_gres_data->gres_per_job %
job_gres_data->gres_per_task) {
/* gres_per_job not multiple of gres_per_task */
return -1;
}
req_tasks = job_gres_data->gres_per_job /
job_gres_data->gres_per_task;
if (*num_tasks == NO_VAL)
*num_tasks = req_tasks;
else if (*num_tasks != req_tasks)
return -1;
} else if (*num_tasks != NO_VAL) {
job_gres_data->gres_per_job = *num_tasks *
job_gres_data->gres_per_task;
} else {
return -1;
}
}
/*
* Ensure gres_per_node is multiple of gres_per_task
* Ensure tasks_per_node is consistent with GRES parameters
*/
if (job_gres_data->gres_per_node && job_gres_data->gres_per_task) {
if (job_gres_data->gres_per_node %
job_gres_data->gres_per_task) {
/* gres_per_node not multiple of gres_per_task */
return -1;
}
req_tasks_per_node = job_gres_data->gres_per_node /
job_gres_data->gres_per_task;
if ((*ntasks_per_node == NO_VAL16) ||
(*ntasks_per_node == 0))
*ntasks_per_node = req_tasks_per_node;
else if (*ntasks_per_node != req_tasks_per_node)
return -1;
}
/*
* Ensure gres_per_socket is multiple of gres_per_task
* Ensure ntasks_per_socket is consistent with GRES parameters
*/
if (job_gres_data->gres_per_socket && job_gres_data->gres_per_task) {
if (job_gres_data->gres_per_socket %
job_gres_data->gres_per_task) {
/* gres_per_socket not multiple of gres_per_task */
return -1;
}
req_tasks_per_socket = job_gres_data->gres_per_socket /
job_gres_data->gres_per_task;
if ((*ntasks_per_socket == NO_VAL16) ||
(*ntasks_per_socket == 0))
*ntasks_per_socket = req_tasks_per_socket;
else if (*ntasks_per_socket != req_tasks_per_socket)
return -1;
}
/* Ensure that cpus_per_gres * gres_per_task == cpus_per_task */
if (job_gres_data->cpus_per_gres)
cpus_per_gres = job_gres_data->cpus_per_gres;
else
cpus_per_gres = job_gres_data->def_cpus_per_gres;
if (cpus_per_gres && job_gres_data->gres_per_task) {
req_cpus_per_task = cpus_per_gres * job_gres_data->gres_per_task;
if ((*cpus_per_task == NO_VAL16) ||
(*cpus_per_task == 0))
*cpus_per_task = req_cpus_per_task;
else if (*cpus_per_task != req_cpus_per_task)
return -1;
}
/* Ensure gres_per_job >= node count */
if (job_gres_data->gres_per_job) {
if (job_gres_data->gres_per_job < *min_nodes)
return -1;
if (job_gres_data->gres_per_job < *max_nodes)
*max_nodes = job_gres_data->gres_per_job;
}
return 0;
}
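/*
* Worked example (hypothetical request): with gres_per_job=8 and
* gres_per_node=4 the checks above force the node count to 8 / 4 = 2
* (failing if 2 falls outside the requested min/max node range). Adding
* gres_per_task=2 then forces num_tasks to 8 / 2 = 4 and ntasks_per_node
* to 4 / 2 = 2. With cpus_per_gres=3, cpus_per_task must be 3 * 2 = 6.
*/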
/*
* Translate a string, with optional suffix, into its equivalent numeric value
* tok IN - the string to translate
* value OUT - the resulting numeric value
* RET true if "tok" is a valid number
*/
static bool _is_valid_number(char *tok, unsigned long long int *value)
{
unsigned long long int tmp_val;
uint64_t mult;
char *end_ptr = NULL;
tmp_val = strtoull(tok, &end_ptr, 10);
if (tmp_val == ULLONG_MAX)
return false;
if ((mult = suffix_mult(end_ptr)) == NO_VAL64)
return false;
tmp_val *= mult;
*value = tmp_val;
return true;
}
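/*
* Behavior sketch, assuming suffix_mult() maps "k"/"K" to 1024, "m"/"M"
* to 1048576, etc., and returns NO_VAL64 for an unrecognized suffix:
*
*	unsigned long long int val;
*	_is_valid_number("2", &val);	// true, val == 2
*	_is_valid_number("4K", &val);	// true, val == 4096
*	_is_valid_number("4Q", &val);	// false (unknown suffix)
*/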
/*
* Reentrant TRES specification parse logic
* in_val IN - initial input string
* type_ptr OUT - GRES type name, must be xfreed by caller
* context_inx_ptr OUT - index of the matching gres_context[] entry
* cnt OUT - count of GRES
* flags OUT - user flags (GRES_NO_CONSUME)
* save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
* RET rc - error code
*/
static int _get_next_gres(char *in_val, char **type_ptr, int *context_inx_ptr,
uint64_t *cnt, uint16_t *flags, char **save_ptr)
{
char *comma, *sep, *sep2, *name = NULL, *type = NULL;
int i, rc = SLURM_SUCCESS;
unsigned long long int value = 0;
xassert(cnt);
xassert(flags);
xassert(save_ptr);
*flags = 0;
if (!in_val && (*save_ptr == NULL)) {
return rc;
}
if (*save_ptr == NULL) {
*save_ptr = in_val;
}
next: if (*save_ptr[0] == '\0') { /* Empty input token */
*save_ptr = NULL;
goto fini;
}
name = xstrdup(*save_ptr);
comma = strchr(name, ',');
if (comma) {
*save_ptr += (comma - name + 1);
comma[0] = '\0';
} else {
*save_ptr += strlen(name);
}
if (name[0] == '\0') {
/* Nothing but a comma */
xfree(name);
goto next;
}
sep = strchr(name, ':');
if (sep) {
sep[0] = '\0';
sep++;
sep2 = strchr(sep, ':');
if (sep2) {
sep2[0] = '\0';
sep2++;
}
} else {
sep2 = NULL;
}
if (sep2) { /* Two colons */
/* We have both type and count */
if ((sep[0] == '\0') || (sep2[0] == '\0')) {
/* Bad format (e.g. "gpu:tesla:" or "gpu::1") */
rc = ESLURM_INVALID_GRES;
goto fini;
}
type = xstrdup(sep);
if (!_is_valid_number(sep2, &value)) {
debug("%s: Invalid count value GRES %s:%s:%s", __func__,
name, type, sep2);
rc = ESLURM_INVALID_GRES;
goto fini;
}
} else if (sep) { /* One colon */
if (sep[0] == '\0') {
/* Bad format (e.g. "gpu:") */
rc = ESLURM_INVALID_GRES;
goto fini;
} else if (_is_valid_number(sep, &value)) {
/* We have count, but no type */
type = NULL;
} else {
/* We have type with implicit count of 1 */
type = xstrdup(sep);
value = 1;
}
} else { /* No colon */
/* We have no type and implicit count of 1 */
type = NULL;
value = 1;
}
if (value == 0) {
xfree(name);
xfree(type);
goto next;
}
for (i = 0; i < gres_context_cnt; i++) {
if (!xstrcmp(name, gres_context[i].gres_name) ||
!xstrncmp(name, gres_context[i].gres_name_colon,
gres_context[i].gres_name_colon_len))
break; /* GRES name match found */
}
if (i >= gres_context_cnt) {
debug("%s: Failed to locate GRES %s", __func__, name);
rc = ESLURM_INVALID_GRES;
goto fini;
}
*context_inx_ptr = i;
fini: if (rc != SLURM_SUCCESS) {
*save_ptr = NULL;
if (rc == ESLURM_INVALID_GRES) {
info("%s: Invalid GRES job specification %s", __func__,
in_val);
}
xfree(type);
*type_ptr = NULL;
} else {
*cnt = value;
*type_ptr = type;
}
xfree(name);
return rc;
}
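/*
* Parse sketch (hypothetical input): successive calls over the string
* "gpu:tesla:2,nic:1,gpu" yield:
*
*	name="gpu", type="tesla", cnt=2  (two colons: type and count)
*	name="nic", type=NULL,    cnt=1  (one colon holding a number)
*	name="gpu", type=NULL,    cnt=1  (no colon: implicit count of 1)
*
* A token such as "gpu:tesla" (one colon, non-numeric) would yield
* type="tesla" with an implicit count of 1.
*/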
/*
* TRES specification parse logic
* in_val IN - initial input string
* cnt OUT - count of values
* gres_list IN/OUT - where to search for (or add) new job TRES record
* save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
* rc OUT - unchanged or an error code
* RET gres - job record to set value in, found or created by this function
*/
static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt,
List gres_list, char **save_ptr,
int *rc)
{
static char *prev_save_ptr = NULL;
int context_inx = NO_VAL, my_rc = SLURM_SUCCESS;
gres_job_state_t *job_gres_data = NULL;
gres_state_t *gres_ptr;
gres_key_t job_search_key;
char *type = NULL, *name = NULL;
uint16_t flags = 0;
xassert(save_ptr);
if (!in_val && (*save_ptr == NULL)) {
return NULL;
}
if (*save_ptr == NULL) {
prev_save_ptr = in_val;
} else if (*save_ptr != prev_save_ptr) {
error("%s: parsing error", __func__);
my_rc = SLURM_ERROR;
goto fini;
}
if (prev_save_ptr[0] == '\0') { /* Empty input token */
*save_ptr = NULL;
return NULL;
}
if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
cnt, &flags, &prev_save_ptr)) ||
(context_inx == NO_VAL)) {
prev_save_ptr = NULL;
goto fini;
}
/* Find the job GRES record */
job_search_key.plugin_id = gres_context[context_inx].plugin_id;
job_search_key.type_id = gres_plugin_build_id(type);
gres_ptr = list_find_first(gres_list, _gres_find_job_by_key,
&job_search_key);
if (gres_ptr) {
job_gres_data = gres_ptr->gres_data;
} else {
job_gres_data = xmalloc(sizeof(gres_job_state_t));
job_gres_data->gres_name =
xstrdup(gres_context[context_inx].gres_name);
job_gres_data->type_id = gres_plugin_build_id(type);
job_gres_data->type_name = type;
type = NULL; /* String moved above */
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[context_inx].plugin_id;
gres_ptr->gres_data = job_gres_data;
list_append(gres_list, gres_ptr);
}
job_gres_data->flags = flags;
fini: xfree(name);
xfree(type);
if (my_rc != SLURM_SUCCESS) {
prev_save_ptr = NULL;
if (my_rc == ESLURM_INVALID_GRES) {
info("%s: Invalid GRES job specification %s", __func__,
in_val);
}
*rc = my_rc;
}
*save_ptr = prev_save_ptr;
return job_gres_data;
}
/* Return true if job specification only includes cpus_per_gres or mem_per_gres
* Return false if any other field set
*/
static bool _generic_job_state(gres_job_state_t *job_state)
{
if (job_state->gres_per_job ||
job_state->gres_per_node ||
job_state->gres_per_socket ||
job_state->gres_per_task)
return false;
return true;
}
/*
* Given a job's requested GRES configuration, validate it and build a GRES list
* Note: This function can be used for a new request with gres_list==NULL or
* used to update an existing job, in which case gres_list is a copy
* of the job's original value (so we can clear fields as needed)
* IN *tres* - job requested gres input string
* IN/OUT num_tasks - requested task count, may be reset to provide
* consistent gres_per_node/task values
* IN/OUT min_nodes - requested minimum node count, may be reset to provide
* consistent gres_per_node/task values
* IN/OUT max_nodes - requested maximum node count, may be reset to provide
* consistent gres_per_node/task values
* IN/OUT ntasks_per_node - requested tasks_per_node count, may be reset to
* provide consistent gres_per_node/task values
* IN/OUT ntasks_per_socket - requested ntasks_per_socket count, may be reset to
* provide consistent gres_per_node/task values
* IN/OUT sockets_per_node - requested sockets_per_node count, may be reset to
* provide consistent gres_per_socket/node values
* IN/OUT cpus_per_task - requested cpus_per_task count, may be reset to
* provide consistent gres_per_task/cpus_per_gres values
* OUT gres_list - List of GRES records for this job to track usage
* RET SLURM_SUCCESS or ESLURM_INVALID_GRES
*/
extern int gres_plugin_job_state_validate(char *cpus_per_tres,
char *tres_freq,
char *tres_per_job,
char *tres_per_node,
char *tres_per_socket,
char *tres_per_task,
char *mem_per_tres,
uint32_t *num_tasks,
uint32_t *min_nodes,
uint32_t *max_nodes,
uint16_t *ntasks_per_node,
uint16_t *ntasks_per_socket,
uint16_t *sockets_per_node,
uint16_t *cpus_per_task,
List *gres_list)
{
typedef struct overlap_check {
gres_job_state_t *without_model_state;
uint32_t plugin_id;
bool with_model;
bool without_model;
} overlap_check_t;
overlap_check_t *over_list;
int i, over_count = 0, rc = SLURM_SUCCESS, size;
bool have_gres_gpu = false, have_gres_mps = false;
bool overlap_merge = false;
gres_state_t *gres_state;
gres_job_state_t *job_gres_data;
uint64_t cnt = 0;
ListIterator iter;
if (!cpus_per_tres && !tres_per_job && !tres_per_node &&
!tres_per_socket && !tres_per_task && !mem_per_tres)
return SLURM_SUCCESS;
if (tres_per_task && (*num_tasks == NO_VAL) &&
(*min_nodes != NO_VAL) && (*min_nodes == *max_nodes)) {
/* Implicitly set task count */
if (*ntasks_per_node != NO_VAL16)
*num_tasks = *min_nodes * *ntasks_per_node;
else if (*cpus_per_task == NO_VAL16)
*num_tasks = *min_nodes;
}
if ((rc = gres_plugin_init()) != SLURM_SUCCESS)
return rc;
if ((select_plugin_type != SELECT_TYPE_CONS_TRES) &&
(cpus_per_tres || tres_per_job || tres_per_socket ||
tres_per_task || mem_per_tres))
return ESLURM_UNSUPPORTED_GRES;
/*
* Clear fields as requested by job update (i.e. input value is "")
*/
if (*gres_list)
(void) list_for_each(*gres_list, _clear_total_gres, NULL);
if (*gres_list && cpus_per_tres && (cpus_per_tres[0] == '\0')) {
(void) list_for_each(*gres_list, _clear_cpus_per_gres, NULL);
cpus_per_tres = NULL;
}
if (*gres_list && tres_per_job && (tres_per_job[0] == '\0')) {
(void) list_for_each(*gres_list, _clear_gres_per_job, NULL);
tres_per_job = NULL;
}
if (*gres_list && tres_per_node && (tres_per_node[0] == '\0')) {
(void) list_for_each(*gres_list, _clear_gres_per_node, NULL);
tres_per_node = NULL;
}
if (*gres_list && tres_per_socket && (tres_per_socket[0] == '\0')) {
(void) list_for_each(*gres_list, _clear_gres_per_socket, NULL);
tres_per_socket = NULL;
}
if (*gres_list && tres_per_task && (tres_per_task[0] == '\0')) {
(void) list_for_each(*gres_list, _clear_gres_per_task, NULL);
tres_per_task = NULL;
}
if (*gres_list && mem_per_tres && (mem_per_tres[0] == '\0')) {
(void) list_for_each(*gres_list, _clear_mem_per_gres, NULL);
mem_per_tres = NULL;
}
/*
* Set new values as requested
*/
if (*gres_list == NULL)
*gres_list = list_create(_gres_job_list_delete);
slurm_mutex_lock(&gres_context_lock);
if (cpus_per_tres) {
char *in_val = cpus_per_tres, *save_ptr = NULL;
while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
*gres_list,
&save_ptr, &rc))) {
job_gres_data->cpus_per_gres = cnt;
in_val = NULL;
}
}
if (tres_per_job) {
char *in_val = tres_per_job, *save_ptr = NULL;
while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
*gres_list,
&save_ptr, &rc))) {
job_gres_data->gres_per_job = cnt;
in_val = NULL;
job_gres_data->total_gres =
MAX(job_gres_data->total_gres, cnt);
}
}
if (tres_per_node) {
char *in_val = tres_per_node, *save_ptr = NULL;
while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
*gres_list,
&save_ptr, &rc))) {
job_gres_data->gres_per_node = cnt;
in_val = NULL;
if (*min_nodes != NO_VAL)
cnt *= *min_nodes;
job_gres_data->total_gres =
MAX(job_gres_data->total_gres, cnt);
}
}
if (tres_per_socket) {
char *in_val = tres_per_socket, *save_ptr = NULL;
while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
*gres_list,
&save_ptr, &rc))) {
job_gres_data->gres_per_socket = cnt;
in_val = NULL;
if ((*min_nodes != NO_VAL) &&
(*sockets_per_node != NO_VAL16)) {
cnt *= (*min_nodes * *sockets_per_node);
} else if ((*num_tasks != NO_VAL) &&
(*ntasks_per_socket != NO_VAL16)) {
cnt *= ((*num_tasks + *ntasks_per_socket - 1) /
*ntasks_per_socket);
}
}
}
if (tres_per_task) {
char *in_val = tres_per_task, *save_ptr = NULL;
while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
*gres_list,
&save_ptr, &rc))) {
job_gres_data->gres_per_task = cnt;
in_val = NULL;
if (*num_tasks != NO_VAL)
cnt *= *num_tasks;
job_gres_data->total_gres =
MAX(job_gres_data->total_gres, cnt);
}
}
if (mem_per_tres) {
char *in_val = mem_per_tres, *save_ptr = NULL;
while ((job_gres_data = _get_next_job_gres(in_val, &cnt,
*gres_list,
&save_ptr, &rc))) {
job_gres_data->mem_per_gres = cnt;
in_val = NULL;
}
}
slurm_mutex_unlock(&gres_context_lock);
if (rc != SLURM_SUCCESS)
return rc;
size = list_count(*gres_list);
if (size == 0) {
FREE_NULL_LIST(*gres_list);
return rc;
}
/*
* Check for record overlap (e.g. "gpu:2,gpu:tesla:1")
* Ensure tres_per_job >= tres_per_node >= tres_per_socket
*/
over_list = xcalloc(size, sizeof(overlap_check_t));
iter = list_iterator_create(*gres_list);
while ((gres_state = (gres_state_t *) list_next(iter))) {
job_gres_data = (gres_job_state_t *) gres_state->gres_data;
if (_test_gres_cnt(job_gres_data, num_tasks, min_nodes,
max_nodes, ntasks_per_node,
ntasks_per_socket, sockets_per_node,
cpus_per_task) != 0) {
rc = ESLURM_INVALID_GRES;
break;
}
if (!have_gres_gpu && !xstrcmp(job_gres_data->gres_name, "gpu"))
have_gres_gpu = true;
if (!xstrcmp(job_gres_data->gres_name, "mps")) {
have_gres_mps = true;
/*
* gres/mps only supports a per-node count,
* set either explicitly or implicitly.
*/
if (job_gres_data->gres_per_job &&
(*max_nodes != 1)) {
rc = ESLURM_INVALID_GRES;
break;
}
if (job_gres_data->gres_per_socket &&
(*sockets_per_node != 1)) {
rc = ESLURM_INVALID_GRES;
break;
}
if (job_gres_data->gres_per_task && (*num_tasks != 1)) {
rc = ESLURM_INVALID_GRES;
break;
}
}
if (have_gres_gpu && have_gres_mps) {
rc = ESLURM_INVALID_GRES;
break;
}
for (i = 0; i < over_count; i++) {
if (over_list[i].plugin_id == gres_state->plugin_id)
break;
}
if (i >= over_count) {
over_list[over_count++].plugin_id =
gres_state->plugin_id;
if (job_gres_data->type_name) {
over_list[i].with_model = true;
} else {
over_list[i].without_model = true;
over_list[i].without_model_state =
job_gres_data;
}
} else if (job_gres_data->type_name) {
over_list[i].with_model = true;
if (over_list[i].without_model)
overlap_merge = true;
} else {
over_list[i].without_model = true;
over_list[i].without_model_state = job_gres_data;
if (over_list[i].with_model)
overlap_merge = true;
}
}
if (have_gres_mps && (rc == SLURM_SUCCESS) && tres_freq &&
strstr(tres_freq, "gpu")) {
rc = ESLURM_INVALID_GRES;
}
if (overlap_merge) { /* Merge generic data if possible */
uint16_t cpus_per_gres;
uint64_t mem_per_gres;
for (i = 0; i < over_count; i++) {
if (!over_list[i].with_model ||
!over_list[i].without_model_state)
continue;
if (!_generic_job_state(
over_list[i].without_model_state)) {
rc = ESLURM_INVALID_GRES_TYPE;
break;
}
/* Propagate generic parameters */
cpus_per_gres =
over_list[i].without_model_state->cpus_per_gres;
mem_per_gres =
over_list[i].without_model_state->mem_per_gres;
list_iterator_reset(iter);
while ((gres_state = (gres_state_t *)list_next(iter))) {
job_gres_data = (gres_job_state_t *)
gres_state->gres_data;
if (over_list[i].plugin_id !=
gres_state->plugin_id)
continue;
if (job_gres_data ==
over_list[i].without_model_state) {
list_remove(iter);
continue;
}
if (job_gres_data->cpus_per_gres == 0) {
job_gres_data->cpus_per_gres =
cpus_per_gres;
}
if (job_gres_data->mem_per_gres == 0) {
job_gres_data->mem_per_gres =
mem_per_gres;
}
}
}
}
list_iterator_destroy(iter);
xfree(over_list);
return rc;
}
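/*
* Overlap-merge sketch: a request mixing typed and untyped records for the
* same plugin (e.g. cpus_per_tres="gpu:4" with tres_per_node="gpu:tesla:1")
* leaves an untyped record holding only generic options. The merge pass
* above removes that record and propagates its cpus_per_gres/mem_per_gres
* values to the typed record(s). If the untyped record also carries
* gres_per_* counts (e.g. "gpu:2,gpu:tesla:1" in tres_per_node), the
* request fails with ESLURM_INVALID_GRES_TYPE.
*/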
/*
* Determine if a job's specified GRES can be supported. This is designed to
* prevent the running of a job using the GRES options only supported by the
* select/cons_tres plugin when switching (on slurmctld restart) from the
* cons_tres plugin to any other select plugin.
*
* IN gres_list - List of GRES records for this job to track usage
* RET SLURM_SUCCESS or ESLURM_INVALID_GRES
*/
extern int gres_plugin_job_revalidate(List gres_list)
{
gres_state_t *gres_state;
gres_job_state_t *job_gres_data;
ListIterator iter;
int rc = SLURM_SUCCESS;
if (!gres_list || (select_plugin_type == SELECT_TYPE_CONS_TRES))
return SLURM_SUCCESS;
iter = list_iterator_create(gres_list);
while ((gres_state = (gres_state_t *) list_next(iter))) {
job_gres_data = (gres_job_state_t *) gres_state->gres_data;
if (job_gres_data->gres_per_job ||
job_gres_data->gres_per_socket ||
job_gres_data->gres_per_task) {
rc = ESLURM_UNSUPPORTED_GRES;
break;
}
}
list_iterator_destroy(iter);
return rc;
}
/*
* Return TRUE if any of this job's GRES has a populated gres_bit_alloc element.
* This indicates the allocated GRES has a File configuration parameter and is
* tracking individual file assignments.
*/
static bool _job_has_gres_bits(List job_gres_list)
{
ListIterator job_gres_iter;
gres_state_t *gres_ptr;
gres_job_state_t *job_gres_ptr;
bool rc = false;
int i;
if (!job_gres_list)
return false;
job_gres_iter = list_iterator_create(job_gres_list);
while ((gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
job_gres_ptr = gres_ptr->gres_data;
if (!job_gres_ptr)
continue;
for (i = 0; i < job_gres_ptr->node_cnt; i++) {
if (job_gres_ptr->gres_bit_alloc &&
job_gres_ptr->gres_bit_alloc[i]) {
rc = true;
break;
}
}
if (rc)
break;
}
list_iterator_destroy(job_gres_iter);
return rc;
}
/*
* Return the count of configured GRES
* NOTE: For gres/mps, the count of gres/gpu is returned instead
*/
static int _get_node_gres_cnt(List node_gres_list, uint32_t plugin_id)
{
ListIterator node_gres_iter;
gres_node_state_t *gres_node_ptr;
gres_state_t *gres_ptr;
int gres_cnt = 0;
if (!node_gres_list)
return 0;
if (plugin_id == mps_plugin_id)
plugin_id = gpu_plugin_id;
node_gres_iter = list_iterator_create(node_gres_list);
while ((gres_ptr = (gres_state_t *) list_next(node_gres_iter))) {
if (gres_ptr->plugin_id != plugin_id)
continue;
gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
gres_cnt = (int) gres_node_ptr->gres_cnt_config;
break;
}
list_iterator_destroy(node_gres_iter);
return gres_cnt;
}
/*
* Return TRUE if the identified node in the job allocation can satisfy the
* job's GRES specification without change in its bitmaps. In other words,
* return FALSE if the job allocation identifies specific GRES devices and the
* count of those devices on this node has changed.
*
* IN job_gres_list - List of GRES records for this job to track usage
* IN node_inx - zero-origin index into this job's node allocation
* IN node_gres_list - List of GRES records for this node
*/
static bool _validate_node_gres_cnt(uint32_t job_id, List job_gres_list,
int node_inx, List node_gres_list,
char *node_name)
{
ListIterator job_gres_iter;
gres_state_t *gres_ptr;
gres_job_state_t *job_gres_ptr;
bool rc = true;
int job_gres_cnt, node_gres_cnt;
if (!job_gres_list)
return true;
(void) gres_plugin_init();
job_gres_iter = list_iterator_create(job_gres_list);
while ((gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
job_gres_ptr = gres_ptr->gres_data;
if (!job_gres_ptr || !job_gres_ptr->gres_bit_alloc)
continue;
if ((node_inx >= job_gres_ptr->node_cnt) ||
!job_gres_ptr->gres_bit_alloc[node_inx])
continue;
job_gres_cnt = bit_size(job_gres_ptr->gres_bit_alloc[node_inx]);
node_gres_cnt = _get_node_gres_cnt(node_gres_list,
gres_ptr->plugin_id);
if (job_gres_cnt != node_gres_cnt) {
error("%s: Killing job %u: gres/%s count mismatch on node "
"%s (%d != %d)",
__func__, job_id, job_gres_ptr->gres_name,
node_name, job_gres_cnt, node_gres_cnt);
rc = false;
break;
}
}
list_iterator_destroy(job_gres_iter);
return rc;
}
/*
* Determine if a job's specified GRES are currently valid. This is designed to
* manage jobs allocated GRES which are either no longer supported or a GRES
* configured with the "File" option in gres.conf where the count has changed,
* in which case we don't know how to map the job's old GRES bitmap onto the
* current GRES bitmaps.
*
* IN job_id - ID of job being validated (used for logging)
* IN job_gres_list - List of GRES records for this job to track usage
* IN node_bitmap - bitmap of nodes allocated to the job
* RET SLURM_SUCCESS or ESLURM_INVALID_GRES
*/
extern int gres_plugin_job_revalidate2(uint32_t job_id, List job_gres_list,
bitstr_t *node_bitmap)
{
node_record_t *node_ptr;
int rc = SLURM_SUCCESS;
int i_first, i_last, i;
int node_inx = -1;
if (!job_gres_list || !node_bitmap ||
!_job_has_gres_bits(job_gres_list))
return SLURM_SUCCESS;
i_first = bit_ffs(node_bitmap);
if (i_first >= 0)
i_last = bit_fls(node_bitmap);
else
i_last = -2;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(node_bitmap, i))
continue;
node_ptr = node_record_table_ptr + i;
node_inx++;
if (!_validate_node_gres_cnt(job_id, job_gres_list, node_inx,
node_ptr->gres_list,
node_ptr->name)) {
rc = ESLURM_INVALID_GRES;
break;
}
}
return rc;
}
/*
* Find a sock_gres_t record in a list by matching the plugin_id and type_id
* from a gres_state_t job record
* IN x - a sock_gres_t record to test
* IN key - the gres_state_t record (from a job) we want to match
* RET 1 on match, otherwise 0
*/
static int _find_sock_by_job_gres(void *x, void *key)
{
sock_gres_t *sock_data = (sock_gres_t *) x;
gres_state_t *job_gres_state = (gres_state_t *) key;
gres_job_state_t *job_data;
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if ((sock_data->plugin_id == job_gres_state->plugin_id) &&
(sock_data->type_id == job_data->type_id))
return 1;
return 0;
}
/*
* Find a gres_state_t job record in a list by matching the plugin_id and
* type_id from a sock_gres_t record
* IN x - a gres_state_t record (from a job) to test
* IN key - the sock_gres_t record we want to match
* RET 1 on match, otherwise 0
*/
static int _find_job_by_sock_gres(void *x, void *key)
{
gres_state_t *job_gres_state = (gres_state_t *) x;
gres_job_state_t *job_data;
sock_gres_t *sock_data = (sock_gres_t *) key;
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if ((sock_data->plugin_id == job_gres_state->plugin_id) &&
(sock_data->type_id == job_data->type_id))
return 1;
return 0;
}
/*
* Clear GRES allocation info for all job GRES at start of scheduling cycle
* Return TRUE if there are any gres_per_job constraints to satisfy
*/
extern bool gres_plugin_job_sched_init(List job_gres_list)
{
ListIterator iter;
gres_state_t *job_gres_state;
gres_job_state_t *job_data;
bool rc = false;
if (!job_gres_list)
return rc;
iter = list_iterator_create(job_gres_list);
while ((job_gres_state = (gres_state_t *) list_next(iter))) {
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if (!job_data->gres_per_job)
continue;
job_data->total_gres = 0;
rc = true;
}
list_iterator_destroy(iter);
return rc;
}
/*
* Return TRUE if all gres_per_job specifications are satisfied
*/
extern bool gres_plugin_job_sched_test(List job_gres_list, uint32_t job_id)
{
ListIterator iter;
gres_state_t *job_gres_state;
gres_job_state_t *job_data;
bool rc = true;
if (!job_gres_list)
return rc;
iter = list_iterator_create(job_gres_list);
while ((job_gres_state = (gres_state_t *) list_next(iter))) {
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if (job_data->gres_per_job &&
(job_data->gres_per_job > job_data->total_gres)) {
rc = false;
break;
}
}
list_iterator_destroy(iter);
return rc;
}
/*
* Return TRUE if all gres_per_job specifications will be satisfied with
* the additional resources provided by a single node
* IN job_gres_list - List of job's GRES requirements (gres_job_state_t)
* IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
* IN job_id - The job being tested
*/
extern bool gres_plugin_job_sched_test2(List job_gres_list, List sock_gres_list,
uint32_t job_id)
{
ListIterator iter;
gres_state_t *job_gres_state;
gres_job_state_t *job_data;
sock_gres_t *sock_data;
bool rc = true;
if (!job_gres_list)
return rc;
iter = list_iterator_create(job_gres_list);
while ((job_gres_state = (gres_state_t *) list_next(iter))) {
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if ((job_data->gres_per_job == 0) ||
(job_data->gres_per_job < job_data->total_gres))
continue;
sock_data = list_find_first(sock_gres_list,
_find_sock_by_job_gres,
job_gres_state);
if (!sock_data ||
(job_data->gres_per_job >
(job_data->total_gres + sock_data->total_cnt))) {
rc = false;
break;
}
}
list_iterator_destroy(iter);
return rc;
}
/*
* Update a job's total_gres counter as we add a node to potential allocation
* IN job_gres_list - List of job's GRES requirements (gres_job_state_t)
* IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
* IN avail_cpus - CPUs currently available on this node
*/
extern void gres_plugin_job_sched_add(List job_gres_list, List sock_gres_list,
uint16_t avail_cpus)
{
ListIterator iter;
gres_state_t *job_gres_state;
gres_job_state_t *job_data;
sock_gres_t *sock_data;
uint64_t gres_limit;
if (!job_gres_list)
return;
iter = list_iterator_create(job_gres_list);
while ((job_gres_state = (gres_state_t *) list_next(iter))) {
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if (!job_data->gres_per_job) /* Don't care about totals */
continue;
sock_data = list_find_first(sock_gres_list,
_find_sock_by_job_gres,
job_gres_state);
if (!sock_data) /* None of this GRES available */
continue;
if (job_data->cpus_per_gres) {
gres_limit = avail_cpus / job_data->cpus_per_gres;
gres_limit = MIN(gres_limit, sock_data->total_cnt);
} else
gres_limit = sock_data->total_cnt;
job_data->total_gres += gres_limit;
}
list_iterator_destroy(iter);
}
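/*
* Accumulation sketch (hypothetical values): with cpus_per_gres=4,
* avail_cpus=10 and sock_data->total_cnt=3, the CPU-derived limit is
* 10 / 4 = 2, so MIN(2, 3) == 2 is added to total_gres for this node;
* without a cpus_per_gres specification the full 3 would be added.
*/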
/*
* Create/update a List of GRES that can be made available on the
* specified node
* IN/OUT consec_gres - List of sock_gres_t that can be made available on
* a set of nodes
* IN job_gres_list - List of job's GRES requirements (gres_job_state_t)
* IN sock_gres_list - Per socket GRES availability on this node (sock_gres_t)
*/
extern void gres_plugin_job_sched_consec(List *consec_gres, List job_gres_list,
List sock_gres_list)
{
ListIterator iter;
gres_state_t *job_gres_state;
gres_job_state_t *job_data;
sock_gres_t *sock_data, *consec_data;
if (!job_gres_list)
return;
iter = list_iterator_create(job_gres_list);
while ((job_gres_state = (gres_state_t *) list_next(iter))) {
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if (!job_data->gres_per_job) /* Don't care about totals */
continue;
sock_data = list_find_first(sock_gres_list,
_find_sock_by_job_gres,
job_gres_state);
if (!sock_data) /* None of this GRES available */
continue;
if (*consec_gres == NULL)
*consec_gres = list_create(_sock_gres_del);
consec_data = list_find_first(*consec_gres,
_find_sock_by_job_gres,
job_gres_state);
if (!consec_data) {
consec_data = xmalloc(sizeof(sock_gres_t));
consec_data->plugin_id = sock_data->plugin_id;
consec_data->type_id = sock_data->type_id;
list_append(*consec_gres, consec_data);
}
consec_data->total_cnt += sock_data->total_cnt;
}
list_iterator_destroy(iter);
}
/*
* Determine if the additional sock_gres_list resources will result in
* satisfying the job's gres_per_job constraints
* IN job_gres_list - job's GRES requirements
* IN sock_gres_list - available GRES in a set of nodes, data structure built
* by gres_plugin_job_sched_consec()
*/
extern bool gres_plugin_job_sched_sufficient(List job_gres_list,
List sock_gres_list)
{
ListIterator iter;
gres_state_t *job_gres_state;
gres_job_state_t *job_data;
sock_gres_t *sock_data;
bool rc = true;
if (!job_gres_list)
return true;
if (!sock_gres_list)
return false;
iter = list_iterator_create(job_gres_list);
while ((job_gres_state = (gres_state_t *) list_next(iter))) {
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if (!job_data->gres_per_job) /* Don't care about totals */
continue;
if (job_data->total_gres >= job_data->gres_per_job)
continue;
sock_data = list_find_first(sock_gres_list,
_find_sock_by_job_gres,
job_gres_state);
if (!sock_data) { /* None of this GRES available */
rc = false;
break;
}
if ((job_data->total_gres + sock_data->total_cnt) <
job_data->gres_per_job) {
rc = false;
break;
}
}
list_iterator_destroy(iter);
return rc;
}
/*
* Given a List of sock_gres_t entries, return a string identifying the
* count of each GRES available on this set of nodes
* IN sock_gres_list - count of GRES available in this group of nodes
* IN job_gres_list - job GRES specification, used only to get GRES name/type
* RET - string, must be xfreed by caller
*/
extern char *gres_plugin_job_sched_str(List sock_gres_list, List job_gres_list)
{
ListIterator iter;
sock_gres_t *sock_data;
gres_state_t *job_gres_state;
gres_job_state_t *job_data;
char *out_str = NULL, *sep;
if (!sock_gres_list)
return NULL;
iter = list_iterator_create(sock_gres_list);
while ((sock_data = (sock_gres_t *) list_next(iter))) {
job_gres_state = list_find_first(job_gres_list,
_find_job_by_sock_gres, sock_data);
if (!job_gres_state) { /* Should never happen */
error("%s: Could not find job GRES for type %u:%u",
__func__, sock_data->plugin_id,
sock_data->type_id);
continue;
}
job_data = (gres_job_state_t *) job_gres_state->gres_data;
if (out_str)
sep = ",";
else
sep = "GRES:";
if (job_data->type_name) {
xstrfmtcat(out_str, "%s%s:%s:%"PRIu64, sep,
job_data->gres_name, job_data->type_name,
sock_data->total_cnt);
} else {
xstrfmtcat(out_str, "%s%s:%"PRIu64, sep,
job_data->gres_name, sock_data->total_cnt);
}
}
list_iterator_destroy(iter);
return out_str;
}
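/*
* Output sketch (hypothetical counts): for a job requesting typed gpus
* and untyped nics, the string built above takes the form:
*
*	GRES:gpu:tesla:4,nic:2
*/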
/*
* Create a (partial) copy of a job's gres state for job binding
* IN gres_list - List of Gres records for this job to track usage
* RET The copy or NULL on failure
* NOTE: Only job details are copied, NOT the job step details
*/
extern List gres_plugin_job_state_dup(List gres_list)
{
return gres_plugin_job_state_extract(gres_list, -1);
}
/* Copy gres_job_state_t record for ALL nodes */
static void *_job_state_dup(void *gres_data)
{
int i;
gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
gres_job_state_t *new_gres_ptr;
if (gres_ptr == NULL)
return NULL;
new_gres_ptr = xmalloc(sizeof(gres_job_state_t));
new_gres_ptr->cpus_per_gres = gres_ptr->cpus_per_gres;
new_gres_ptr->gres_name = xstrdup(gres_ptr->gres_name);
new_gres_ptr->gres_per_job = gres_ptr->gres_per_job;
new_gres_ptr->gres_per_node = gres_ptr->gres_per_node;
new_gres_ptr->gres_per_socket = gres_ptr->gres_per_socket;
new_gres_ptr->gres_per_task = gres_ptr->gres_per_task;
new_gres_ptr->mem_per_gres = gres_ptr->mem_per_gres;
new_gres_ptr->node_cnt = gres_ptr->node_cnt;
new_gres_ptr->total_gres = gres_ptr->total_gres;
new_gres_ptr->type_id = gres_ptr->type_id;
new_gres_ptr->type_name = xstrdup(gres_ptr->type_name);
if (gres_ptr->gres_cnt_node_alloc) {
i = sizeof(uint64_t) * gres_ptr->node_cnt;
new_gres_ptr->gres_cnt_node_alloc = xmalloc(i);
memcpy(new_gres_ptr->gres_cnt_node_alloc,
gres_ptr->gres_cnt_node_alloc, i);
}
if (gres_ptr->gres_bit_alloc) {
new_gres_ptr->gres_bit_alloc = xcalloc(gres_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_ptr->node_cnt; i++) {
if (gres_ptr->gres_bit_alloc[i] == NULL)
continue;
new_gres_ptr->gres_bit_alloc[i] =
bit_copy(gres_ptr->gres_bit_alloc[i]);
}
}
return new_gres_ptr;
}
/* Copy gres_job_state_t record for one specific node */
static void *_job_state_dup2(void *gres_data, int node_index)
{
gres_job_state_t *gres_ptr = (gres_job_state_t *) gres_data;
gres_job_state_t *new_gres_ptr;
if (gres_ptr == NULL)
return NULL;
new_gres_ptr = xmalloc(sizeof(gres_job_state_t));
new_gres_ptr->cpus_per_gres = gres_ptr->cpus_per_gres;
new_gres_ptr->gres_name = xstrdup(gres_ptr->gres_name);
new_gres_ptr->gres_per_job = gres_ptr->gres_per_job;
new_gres_ptr->gres_per_node = gres_ptr->gres_per_node;
new_gres_ptr->gres_per_socket = gres_ptr->gres_per_socket;
new_gres_ptr->gres_per_task = gres_ptr->gres_per_task;
new_gres_ptr->mem_per_gres = gres_ptr->mem_per_gres;
new_gres_ptr->node_cnt = 1;
new_gres_ptr->total_gres = gres_ptr->total_gres;
new_gres_ptr->type_id = gres_ptr->type_id;
new_gres_ptr->type_name = xstrdup(gres_ptr->type_name);
if (gres_ptr->gres_cnt_node_alloc) {
new_gres_ptr->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t));
new_gres_ptr->gres_cnt_node_alloc[0] =
gres_ptr->gres_cnt_node_alloc[node_index];
}
if (gres_ptr->gres_bit_alloc && gres_ptr->gres_bit_alloc[node_index]) {
new_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *));
new_gres_ptr->gres_bit_alloc[0] =
bit_copy(gres_ptr->gres_bit_alloc[node_index]);
}
return new_gres_ptr;
}
/*
* Create a (partial) copy of a job's gres state for a particular node index
* IN gres_list - List of Gres records for this job to track usage
* IN node_index - zero-origin index to the node
* RET The copy or NULL on failure
*/
extern List gres_plugin_job_state_extract(List gres_list, int node_index)
{
ListIterator gres_iter;
gres_state_t *gres_ptr, *new_gres_state;
List new_gres_list = NULL;
void *new_gres_data;
if (gres_list == NULL)
return new_gres_list;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
if (node_index == -1)
new_gres_data = _job_state_dup(gres_ptr->gres_data);
else {
new_gres_data = _job_state_dup2(gres_ptr->gres_data,
node_index);
}
if (new_gres_data == NULL)
break;
if (new_gres_list == NULL) {
new_gres_list = list_create(_gres_job_list_delete);
}
new_gres_state = xmalloc(sizeof(gres_state_t));
new_gres_state->plugin_id = gres_ptr->plugin_id;
new_gres_state->gres_data = new_gres_data;
list_append(new_gres_list, new_gres_state);
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return new_gres_list;
}
/*
* Pack a job's current gres status, called from slurmctld for save/restore
* IN gres_list - generated by gres_plugin_job_state_validate()
* IN/OUT buffer - location to write state to
* IN job_id - job's ID
* IN details - if set then pack job step allocation details (only needed to
* save/restore job state, not needed in job credential for
* slurmd task binding)
*
* NOTE: A job's allocation to steps is not recorded here, but recovered with
* the job step state information upon slurmctld restart.
*/
extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
uint32_t job_id, bool details,
uint16_t protocol_version)
{
int i, rc = SLURM_SUCCESS;
uint32_t top_offset, tail_offset;
uint32_t magic = GRES_MAGIC;
uint16_t rec_cnt = 0;
ListIterator gres_iter;
gres_state_t *gres_ptr;
gres_job_state_t *gres_job_ptr;
top_offset = get_buf_offset(buffer);
pack16(rec_cnt, buffer); /* placeholder, actual value written below */
if (gres_list == NULL)
return rc;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;
if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
pack32(magic, buffer);
pack32(gres_ptr->plugin_id, buffer);
pack16(gres_job_ptr->cpus_per_gres, buffer);
pack16(gres_job_ptr->flags, buffer);
pack64(gres_job_ptr->gres_per_job, buffer);
pack64(gres_job_ptr->gres_per_node, buffer);
pack64(gres_job_ptr->gres_per_socket, buffer);
pack64(gres_job_ptr->gres_per_task, buffer);
pack64(gres_job_ptr->mem_per_gres, buffer);
pack64(gres_job_ptr->total_gres, buffer);
packstr(gres_job_ptr->type_name, buffer);
pack32(gres_job_ptr->node_cnt, buffer);
if (gres_job_ptr->gres_cnt_node_alloc) {
pack8((uint8_t) 1, buffer);
pack64_array(gres_job_ptr->gres_cnt_node_alloc,
gres_job_ptr->node_cnt, buffer);
} else {
pack8((uint8_t) 0, buffer);
}
if (gres_job_ptr->gres_bit_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
pack_bit_str_hex(gres_job_ptr->
gres_bit_alloc[i],
buffer);
}
} else {
pack8((uint8_t) 0, buffer);
}
if (details && gres_job_ptr->gres_bit_step_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
pack_bit_str_hex(gres_job_ptr->
gres_bit_step_alloc[i],
buffer);
}
} else {
pack8((uint8_t) 0, buffer);
}
if (details && gres_job_ptr->gres_cnt_step_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
pack64(gres_job_ptr->
gres_cnt_step_alloc[i],
buffer);
}
} else {
pack8((uint8_t) 0, buffer);
}
rec_cnt++;
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
pack32(magic, buffer);
pack32(gres_ptr->plugin_id, buffer);
pack16(gres_job_ptr->cpus_per_gres, buffer);
pack64(gres_job_ptr->gres_per_job, buffer);
pack64(gres_job_ptr->gres_per_node, buffer);
pack64(gres_job_ptr->gres_per_socket, buffer);
pack64(gres_job_ptr->gres_per_task, buffer);
pack64(gres_job_ptr->mem_per_gres, buffer);
pack64(gres_job_ptr->total_gres, buffer);
packstr(gres_job_ptr->type_name, buffer);
pack32(gres_job_ptr->node_cnt, buffer);
if (gres_job_ptr->gres_cnt_node_alloc) {
pack8((uint8_t) 1, buffer);
pack64_array(gres_job_ptr->gres_cnt_node_alloc,
gres_job_ptr->node_cnt, buffer);
} else {
pack8((uint8_t) 0, buffer);
}
if (gres_job_ptr->gres_bit_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
pack_bit_str_hex(gres_job_ptr->
gres_bit_alloc[i],
buffer);
}
} else {
pack8((uint8_t) 0, buffer);
}
if (details && gres_job_ptr->gres_bit_step_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
pack_bit_str_hex(gres_job_ptr->
gres_bit_step_alloc[i],
buffer);
}
} else {
pack8((uint8_t) 0, buffer);
}
if (details && gres_job_ptr->gres_cnt_step_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
pack64(gres_job_ptr->
gres_cnt_step_alloc[i],
buffer);
}
} else {
pack8((uint8_t) 0, buffer);
}
rec_cnt++;
} else {
error("%s: protocol_version %hu not supported",
__func__, protocol_version);
break;
}
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
tail_offset = get_buf_offset(buffer);
set_buf_offset(buffer, top_offset);
pack16(rec_cnt, buffer);
set_buf_offset(buffer, tail_offset);
return rc;
}
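/*
* Round-trip sketch, assuming the init_buf()/free_buf() helpers from
* src/common/pack.h: state written by gres_plugin_job_state_pack() is
* recovered by gres_plugin_job_state_unpack() below:
*
*	Buf buf = init_buf(BUF_SIZE);
*	List new_list = NULL;
*	gres_plugin_job_state_pack(job_gres_list, buf, job_id, true,
*				   SLURM_PROTOCOL_VERSION);
*	set_buf_offset(buf, 0);	// rewind before unpacking
*	gres_plugin_job_state_unpack(&new_list, buf, job_id,
*				     SLURM_PROTOCOL_VERSION);
*	free_buf(buf);
*/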
/*
* Unpack a job's current gres status, called from slurmctld for save/restore
* OUT gres_list - restored state stored by gres_plugin_job_state_pack()
* IN/OUT buffer - location to read state from
* IN job_id - job's ID
*/
extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
uint32_t job_id,
uint16_t protocol_version)
{
int i = 0, rc;
uint32_t magic = 0, plugin_id = 0, utmp32 = 0;
uint16_t rec_cnt = 0;
uint8_t has_more = 0;
gres_state_t *gres_ptr;
gres_job_state_t *gres_job_ptr = NULL;
safe_unpack16(&rec_cnt, buffer);
if (rec_cnt == 0)
return SLURM_SUCCESS;
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
*gres_list = list_create(_gres_job_list_delete);
}
while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
if ((buffer == NULL) || (remaining_buf(buffer) == 0))
break;
rec_cnt--;
if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
safe_unpack32(&magic, buffer);
if (magic != GRES_MAGIC)
goto unpack_error;
safe_unpack32(&plugin_id, buffer);
gres_job_ptr = xmalloc(sizeof(gres_job_state_t));
safe_unpack16(&gres_job_ptr->cpus_per_gres, buffer);
safe_unpack16(&gres_job_ptr->flags, buffer);
safe_unpack64(&gres_job_ptr->gres_per_job, buffer);
safe_unpack64(&gres_job_ptr->gres_per_node, buffer);
safe_unpack64(&gres_job_ptr->gres_per_socket, buffer);
safe_unpack64(&gres_job_ptr->gres_per_task, buffer);
safe_unpack64(&gres_job_ptr->mem_per_gres, buffer);
safe_unpack64(&gres_job_ptr->total_gres, buffer);
safe_unpackstr_xmalloc(&gres_job_ptr->type_name,
&utmp32, buffer);
gres_job_ptr->type_id =
gres_plugin_build_id(gres_job_ptr->type_name);
safe_unpack32(&gres_job_ptr->node_cnt, buffer);
if (gres_job_ptr->node_cnt > NO_VAL)
goto unpack_error;
safe_unpack8(&has_more, buffer);
if (has_more) {
safe_unpack64_array(
&gres_job_ptr->gres_cnt_node_alloc,
&utmp32, buffer);
}
safe_unpack8(&has_more, buffer);
if (has_more) {
safe_xcalloc(gres_job_ptr->gres_bit_alloc,
gres_job_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
unpack_bit_str_hex(&gres_job_ptr->
gres_bit_alloc[i],
buffer);
}
}
safe_unpack8(&has_more, buffer);
if (has_more) {
safe_xcalloc(gres_job_ptr->gres_bit_step_alloc,
gres_job_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
unpack_bit_str_hex(&gres_job_ptr->
gres_bit_step_alloc[i],
buffer);
}
}
safe_unpack8(&has_more, buffer);
if (has_more) {
safe_xcalloc(gres_job_ptr->gres_cnt_step_alloc,
gres_job_ptr->node_cnt,
sizeof(uint64_t));
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
safe_unpack64(&gres_job_ptr->
gres_cnt_step_alloc[i],
buffer);
}
}
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
safe_unpack32(&magic, buffer);
if (magic != GRES_MAGIC)
goto unpack_error;
safe_unpack32(&plugin_id, buffer);
gres_job_ptr = xmalloc(sizeof(gres_job_state_t));
safe_unpack16(&gres_job_ptr->cpus_per_gres, buffer);
safe_unpack64(&gres_job_ptr->gres_per_job, buffer);
safe_unpack64(&gres_job_ptr->gres_per_node, buffer);
safe_unpack64(&gres_job_ptr->gres_per_socket, buffer);
safe_unpack64(&gres_job_ptr->gres_per_task, buffer);
safe_unpack64(&gres_job_ptr->mem_per_gres, buffer);
safe_unpack64(&gres_job_ptr->total_gres, buffer);
safe_unpackstr_xmalloc(&gres_job_ptr->type_name,
&utmp32, buffer);
gres_job_ptr->type_id =
gres_plugin_build_id(gres_job_ptr->type_name);
safe_unpack32(&gres_job_ptr->node_cnt, buffer);
if (gres_job_ptr->node_cnt > NO_VAL)
goto unpack_error;
safe_unpack8(&has_more, buffer);
if (has_more) {
safe_unpack64_array(
&gres_job_ptr->gres_cnt_node_alloc,
&utmp32, buffer);
}
safe_unpack8(&has_more, buffer);
if (has_more) {
safe_xcalloc(gres_job_ptr->gres_bit_alloc,
gres_job_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
unpack_bit_str_hex(&gres_job_ptr->
gres_bit_alloc[i],
buffer);
}
}
safe_unpack8(&has_more, buffer);
if (has_more) {
safe_xcalloc(gres_job_ptr->gres_bit_step_alloc,
gres_job_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
unpack_bit_str_hex(&gres_job_ptr->
gres_bit_step_alloc[i],
buffer);
}
}
safe_unpack8(&has_more, buffer);
if (has_more) {
safe_xcalloc(gres_job_ptr->gres_cnt_step_alloc,
gres_job_ptr->node_cnt,
sizeof(uint64_t));
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
safe_unpack64(&gres_job_ptr->
gres_cnt_step_alloc[i],
buffer);
}
}
} else {
error("%s: protocol_version %hu not supported",
__func__, protocol_version);
goto unpack_error;
}
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].plugin_id == plugin_id)
break;
}
if (i >= gres_context_cnt) {
/*
* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data.
*/
error("%s: no plugin configured to unpack data type %u from job %u",
__func__, plugin_id, job_id);
_job_state_delete(gres_job_ptr);
continue;
}
gres_job_ptr->gres_name = xstrdup(gres_context[i].gres_name);
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[i].plugin_id;
gres_ptr->gres_data = gres_job_ptr;
gres_job_ptr = NULL; /* nothing left to free on error */
list_append(*gres_list, gres_ptr);
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
unpack_error:
error("%s: unpack error from job %u", __func__, job_id);
if (gres_job_ptr)
_job_state_delete(gres_job_ptr);
slurm_mutex_unlock(&gres_context_lock);
return SLURM_ERROR;
}
/*
* Pack a job's allocated gres information for use by prolog/epilog
* IN gres_list - generated by gres_plugin_epilog_build_env()
* IN/OUT buffer - location to write state to
*/
extern int gres_plugin_job_alloc_pack(List gres_list, Buf buffer,
uint16_t protocol_version)
{
int i, rc = SLURM_SUCCESS;
uint32_t top_offset, tail_offset;
uint32_t magic = GRES_MAGIC;
uint16_t rec_cnt = 0;
ListIterator gres_iter;
gres_epilog_info_t *gres_job_ptr;
top_offset = get_buf_offset(buffer);
pack16(rec_cnt, buffer); /* placeholder, actual value written below */
if (gres_list == NULL)
return rc;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_job_ptr = (gres_epilog_info_t *) list_next(gres_iter))) {
if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
pack32(magic, buffer);
pack32(gres_job_ptr->plugin_id, buffer);
pack32(gres_job_ptr->node_cnt, buffer);
if (gres_job_ptr->gres_cnt_node_alloc) {
pack8((uint8_t) 1, buffer);
pack64_array(gres_job_ptr->gres_cnt_node_alloc,
gres_job_ptr->node_cnt, buffer);
} else {
pack8((uint8_t) 0, buffer);
}
if (gres_job_ptr->gres_bit_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
pack_bit_str_hex(gres_job_ptr->
gres_bit_alloc[i],
buffer);
}
} else {
pack8((uint8_t) 0, buffer);
}
rec_cnt++;
} else {
error("%s: protocol_version %hu not supported",
__func__, protocol_version);
break;
}
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
tail_offset = get_buf_offset(buffer);
set_buf_offset(buffer, top_offset);
pack16(rec_cnt, buffer);
set_buf_offset(buffer, tail_offset);
return rc;
}
static void _epilog_list_del(void *x)
{
gres_epilog_info_t *epilog_info = (gres_epilog_info_t *) x;
int i;
if (!epilog_info)
return;
if (epilog_info->gres_bit_alloc) {
for (i = 0; i < epilog_info->node_cnt; i++)
FREE_NULL_BITMAP(epilog_info->gres_bit_alloc[i]);
xfree(epilog_info->gres_bit_alloc);
}
xfree(epilog_info->gres_cnt_node_alloc);
xfree(epilog_info->node_list);
xfree(epilog_info);
}
/*
* Unpack a job's allocated gres information for use by prolog/epilog
* OUT gres_list - restored state stored by gres_plugin_job_alloc_pack()
* IN/OUT buffer - location to read state from
*/
extern int gres_plugin_job_alloc_unpack(List *gres_list, Buf buffer,
uint16_t protocol_version)
{
int i = 0, rc;
uint32_t magic = 0, utmp32 = 0;
uint16_t rec_cnt = 0;
uint8_t filled = 0;
gres_epilog_info_t *gres_job_ptr = NULL;
safe_unpack16(&rec_cnt, buffer);
if (rec_cnt == 0)
return SLURM_SUCCESS;
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
*gres_list = list_create(_epilog_list_del);
}
while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
if ((buffer == NULL) || (remaining_buf(buffer) == 0))
break;
rec_cnt--;
if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
safe_unpack32(&magic, buffer);
if (magic != GRES_MAGIC)
goto unpack_error;
gres_job_ptr = xmalloc(sizeof(gres_epilog_info_t));
safe_unpack32(&gres_job_ptr->plugin_id, buffer);
safe_unpack32(&gres_job_ptr->node_cnt, buffer);
if (gres_job_ptr->node_cnt > NO_VAL)
goto unpack_error;
safe_unpack8(&filled, buffer);
if (filled) {
safe_unpack64_array(
&gres_job_ptr->gres_cnt_node_alloc,
&utmp32, buffer);
}
safe_unpack8(&filled, buffer);
if (filled) {
safe_xcalloc(gres_job_ptr->gres_bit_alloc,
gres_job_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_job_ptr->node_cnt; i++) {
unpack_bit_str_hex(&gres_job_ptr->
gres_bit_alloc[i],
buffer);
}
}
} else {
error("%s: protocol_version %hu not supported",
__func__, protocol_version);
goto unpack_error;
}
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].plugin_id ==
gres_job_ptr->plugin_id)
break;
}
if (i >= gres_context_cnt) {
/*
* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data.
*/
error("%s: no plugin configured to unpack data type %u",
__func__, gres_job_ptr->plugin_id);
_epilog_list_del(gres_job_ptr);
continue;
}
list_append(*gres_list, gres_job_ptr);
gres_job_ptr = NULL;
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
unpack_error:
error("%s: unpack error", __func__);
if (gres_job_ptr)
_epilog_list_del(gres_job_ptr);
slurm_mutex_unlock(&gres_context_lock);
return SLURM_ERROR;
}
/*
* Build List of information needed to set job's Prolog or Epilog environment
* variables
*
* IN job_gres_list - job's GRES allocation info
 * IN node_list - list of nodes associated with the job
* RET information about the job's GRES allocation needed by Prolog or Epilog
*/
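/*
 * Illustrative call sequence (hypothetical variable names): build the list,
 * then pack it for transmission to the compute nodes:
 *
 *	List epilog_gres_list =
 *		gres_plugin_epilog_build_env(job_gres_list, node_list);
 *	if (epilog_gres_list)
 *		gres_plugin_job_alloc_pack(epilog_gres_list, buffer,
 *					   protocol_version);
 */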
extern List gres_plugin_epilog_build_env(List job_gres_list, char *node_list)
{
int i;
ListIterator gres_iter;
gres_state_t *gres_ptr = NULL;
gres_epilog_info_t *epilog_info;
List epilog_gres_list = NULL;
if (!job_gres_list)
return NULL;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(job_gres_list);
while ((gres_ptr = list_next(gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (gres_ptr->plugin_id == gres_context[i].plugin_id)
break;
}
if (i >= gres_context_cnt) {
error("%s: gres not found in context. This should never happen",
__func__);
continue;
}
if (!gres_context[i].ops.epilog_build_env)
continue; /* No plugin to call */
epilog_info = (*(gres_context[i].ops.epilog_build_env))
(gres_ptr->gres_data);
if (!epilog_info)
continue; /* No info to add for this plugin */
if (!epilog_gres_list)
epilog_gres_list = list_create(_epilog_list_del);
epilog_info->plugin_id = gres_context[i].plugin_id;
epilog_info->node_list = xstrdup(node_list);
list_append(epilog_gres_list, epilog_info);
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return epilog_gres_list;
}
/*
 * Set environment variables as appropriate for a job's prolog or epilog
 * based upon the GRES allocated to the job.
*
* IN/OUT epilog_env_ptr - environment variable array
 * IN epilog_gres_list - generated by gres_plugin_epilog_build_env()
* IN node_inx - zero origin node index
*/
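/*
 * Illustrative use (hypothetical names): given a list restored by
 * gres_plugin_job_alloc_unpack(), build the environment for this node:
 *
 *	char **epilog_env = NULL;
 *	gres_plugin_epilog_set_env(&epilog_env, epilog_gres_list, node_inx);
 */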
extern void gres_plugin_epilog_set_env(char ***epilog_env_ptr,
List epilog_gres_list, int node_inx)
{
int i;
ListIterator epilog_iter;
gres_epilog_info_t *epilog_info;
*epilog_env_ptr = NULL;
if (!epilog_gres_list)
return;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
epilog_iter = list_iterator_create(epilog_gres_list);
while ((epilog_info = list_next(epilog_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (epilog_info->plugin_id == gres_context[i].plugin_id)
break;
}
if (i >= gres_context_cnt) {
error("%s: GRES ID %u not found in context",
__func__, epilog_info->plugin_id);
continue;
}
if (!gres_context[i].ops.epilog_set_env)
continue; /* No plugin to call */
(*(gres_context[i].ops.epilog_set_env))
(epilog_env_ptr, epilog_info, node_inx);
}
list_iterator_destroy(epilog_iter);
slurm_mutex_unlock(&gres_context_lock);
}
/*
* If core bitmap from slurmd differs in size from that in slurmctld,
* then modify bitmap from slurmd so we can use bit_and, bit_or, etc.
*/
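/*
 * Worked example (illustrative): shrinking 8 bits to 4 gives ratio 2, so
 * old bits {0,1} OR into new bit 0, {2,3} into new bit 1, etc.; an old set
 * of {1,5} becomes {0,2}.  Growing 4 bits to 8 maps old bit 1 to new bits
 * {2,3}.
 */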
static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size)
{
int i, j, old_size, ratio;
bitstr_t *new_core_bitmap;
new_core_bitmap = bit_alloc(new_size);
old_size = bit_size(old_core_bitmap);
if (old_size > new_size) {
ratio = old_size / new_size;
for (i = 0; i < new_size; i++) {
for (j = 0; j < ratio; j++) {
if (bit_test(old_core_bitmap, i*ratio+j)) {
bit_set(new_core_bitmap, i);
break;
}
}
}
} else {
ratio = new_size / old_size;
for (i = 0; i < old_size; i++) {
if (!bit_test(old_core_bitmap, i))
continue;
for (j = 0; j < ratio; j++) {
bit_set(new_core_bitmap, i*ratio+j);
}
}
}
return new_core_bitmap;
}
static void _validate_gres_node_cores(gres_node_state_t *node_gres_ptr,
int cores_ctld, char *node_name)
{
int i, cores_slurmd;
bitstr_t *new_core_bitmap;
	bool log_mismatch = true;
if (node_gres_ptr->topo_cnt == 0)
return;
if (node_gres_ptr->topo_core_bitmap == NULL) {
error("Gres topo_core_bitmap is NULL on node %s", node_name);
return;
}
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
if (!node_gres_ptr->topo_core_bitmap[i])
continue;
cores_slurmd = bit_size(node_gres_ptr->topo_core_bitmap[i]);
if (cores_slurmd == cores_ctld)
continue;
if (log_mismatch) {
debug("Rebuilding node %s gres core bitmap (%d != %d)",
node_name, cores_slurmd, cores_ctld);
log_mismatch = false;
}
new_core_bitmap = _core_bitmap_rebuild(
node_gres_ptr->topo_core_bitmap[i],
cores_ctld);
FREE_NULL_BITMAP(node_gres_ptr->topo_core_bitmap[i]);
node_gres_ptr->topo_core_bitmap[i] = new_core_bitmap;
}
}
static void _job_core_filter(void *job_gres_data, void *node_gres_data,
bool use_total_gres, bitstr_t *core_bitmap,
int core_start_bit, int core_end_bit,
char *gres_name, char *node_name,
uint32_t plugin_id)
{
int i, j, core_ctld;
gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
bitstr_t *avail_core_bitmap = NULL;
bool use_busy_dev = false;
if (!node_gres_ptr->topo_cnt || !core_bitmap || /* No topology info */
!job_gres_ptr->gres_per_node) /* No job GRES */
return;
if (!use_total_gres &&
(plugin_id == mps_plugin_id) &&
(node_gres_ptr->gres_cnt_alloc != 0)) {
/* We must use the ONE already active GRES of this type */
use_busy_dev = true;
}
/* Determine which specific cores can be used */
avail_core_bitmap = bit_copy(core_bitmap);
bit_nclear(avail_core_bitmap, core_start_bit, core_end_bit);
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
if (node_gres_ptr->topo_gres_cnt_avail[i] == 0)
continue;
if (!use_total_gres &&
(node_gres_ptr->topo_gres_cnt_alloc[i] >=
node_gres_ptr->topo_gres_cnt_avail[i]))
continue;
if (use_busy_dev &&
(node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
continue;
if (job_gres_ptr->type_name &&
(!node_gres_ptr->topo_type_name[i] ||
(job_gres_ptr->type_id != node_gres_ptr->topo_type_id[i])))
continue;
if (!node_gres_ptr->topo_core_bitmap[i]) {
FREE_NULL_BITMAP(avail_core_bitmap); /* No filter */
return;
}
core_ctld = core_end_bit - core_start_bit + 1;
_validate_gres_node_cores(node_gres_ptr, core_ctld, node_name);
core_ctld = bit_size(node_gres_ptr->topo_core_bitmap[i]);
for (j = 0; j < core_ctld; j++) {
if (bit_test(node_gres_ptr->topo_core_bitmap[i], j)) {
bit_set(avail_core_bitmap, core_start_bit + j);
}
}
}
bit_and(core_bitmap, avail_core_bitmap);
FREE_NULL_BITMAP(avail_core_bitmap);
}
static uint32_t _job_test(void *job_gres_data, void *node_gres_data,
bool use_total_gres, bitstr_t *core_bitmap,
int core_start_bit, int core_end_bit, bool *topo_set,
uint32_t job_id, char *node_name, char *gres_name,
uint32_t plugin_id, bool disable_binding)
{
int i, j, core_size, core_ctld, top_inx = -1;
uint64_t gres_avail = 0, gres_max = 0, gres_total, gres_tmp;
uint64_t min_gres_node = 0;
gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
uint32_t *cores_addnt = NULL; /* Additional cores avail from this GRES */
uint32_t *cores_avail = NULL; /* cores initially avail from this GRES */
uint32_t core_cnt = 0;
bitstr_t *alloc_core_bitmap = NULL;
bitstr_t *avail_core_bitmap = NULL;
bool shared_gres = _shared_gres(plugin_id);
bool use_busy_dev = false;
if (node_gres_ptr->no_consume)
use_total_gres = true;
if (!use_total_gres &&
(plugin_id == mps_plugin_id) &&
(node_gres_ptr->gres_cnt_alloc != 0)) {
/* We must use the ONE already active GRES of this type */
use_busy_dev = true;
}
/* Determine minimum GRES count needed on this node */
if (job_gres_ptr->gres_per_job)
min_gres_node = 1;
min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_node);
min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_socket);
min_gres_node = MAX(min_gres_node, job_gres_ptr->gres_per_task);
if (min_gres_node && node_gres_ptr->topo_cnt && *topo_set) {
/*
* Need to determine how many GRES available for these
* specific cores
*/
if (core_bitmap) {
core_ctld = core_end_bit - core_start_bit + 1;
if (core_ctld < 1) {
error("gres/%s: job %u cores on node %s < 1",
gres_name, job_id, node_name);
return (uint32_t) 0;
}
_validate_gres_node_cores(node_gres_ptr, core_ctld,
node_name);
}
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
if (job_gres_ptr->type_name &&
(!node_gres_ptr->topo_type_name[i] ||
(node_gres_ptr->topo_type_id[i] !=
job_gres_ptr->type_id)))
continue;
if (use_busy_dev &&
(node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
continue;
if (!node_gres_ptr->topo_core_bitmap[i]) {
gres_avail += node_gres_ptr->
topo_gres_cnt_avail[i];
if (!use_total_gres) {
gres_avail -= node_gres_ptr->
topo_gres_cnt_alloc[i];
}
if (shared_gres)
gres_max = MAX(gres_max, gres_avail);
continue;
}
core_ctld = bit_size(node_gres_ptr->
topo_core_bitmap[i]);
for (j = 0; j < core_ctld; j++) {
if (core_bitmap &&
!bit_test(core_bitmap, core_start_bit + j))
continue;
if (!bit_test(node_gres_ptr->
topo_core_bitmap[i], j))
continue; /* not avail for this gres */
gres_avail += node_gres_ptr->
topo_gres_cnt_avail[i];
if (!use_total_gres) {
gres_avail -= node_gres_ptr->
topo_gres_cnt_alloc[i];
}
if (shared_gres)
gres_max = MAX(gres_max, gres_avail);
break;
}
}
if (shared_gres)
gres_avail = gres_max;
if (min_gres_node > gres_avail)
return (uint32_t) 0; /* insufficient GRES avail */
return NO_VAL;
} else if (min_gres_node && node_gres_ptr->topo_cnt &&
!disable_binding) {
/* Need to determine which specific cores can be used */
gres_avail = node_gres_ptr->gres_cnt_avail;
if (!use_total_gres)
gres_avail -= node_gres_ptr->gres_cnt_alloc;
if (min_gres_node > gres_avail)
return (uint32_t) 0; /* insufficient GRES avail */
core_ctld = core_end_bit - core_start_bit + 1;
if (core_bitmap) {
if (core_ctld < 1) {
error("gres/%s: job %u cores on node %s < 1",
gres_name, job_id, node_name);
return (uint32_t) 0;
}
_validate_gres_node_cores(node_gres_ptr, core_ctld,
node_name);
} else {
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
if (!node_gres_ptr->topo_core_bitmap[i])
continue;
core_ctld = bit_size(node_gres_ptr->
topo_core_bitmap[i]);
break;
}
}
alloc_core_bitmap = bit_alloc(core_ctld);
if (core_bitmap) {
for (j = 0; j < core_ctld; j++) {
if (bit_test(core_bitmap, core_start_bit + j))
bit_set(alloc_core_bitmap, j);
}
} else {
bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
}
avail_core_bitmap = bit_copy(alloc_core_bitmap);
cores_addnt = xcalloc(node_gres_ptr->topo_cnt,
sizeof(uint32_t));
cores_avail = xcalloc(node_gres_ptr->topo_cnt,
sizeof(uint32_t));
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
if (node_gres_ptr->topo_gres_cnt_avail[i] == 0)
continue;
if (use_busy_dev &&
(node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
continue;
if (!use_total_gres &&
(node_gres_ptr->topo_gres_cnt_alloc[i] >=
node_gres_ptr->topo_gres_cnt_avail[i]))
continue;
if (job_gres_ptr->type_name &&
(!node_gres_ptr->topo_type_name[i] ||
(node_gres_ptr->topo_type_id[i] !=
job_gres_ptr->type_id)))
continue;
if (!node_gres_ptr->topo_core_bitmap[i]) {
cores_avail[i] = core_end_bit -
core_start_bit + 1;
continue;
}
core_size = bit_size(node_gres_ptr->topo_core_bitmap[i]);
for (j = 0; j < core_size; j++) {
if (core_bitmap &&
!bit_test(core_bitmap, core_start_bit + j))
continue;
if (bit_test(node_gres_ptr->
topo_core_bitmap[i], j)) {
cores_avail[i]++;
}
}
}
/* Pick the topology entries with the most cores available */
gres_avail = 0;
gres_total = 0;
while (gres_avail < min_gres_node) {
top_inx = -1;
for (j = 0; j < node_gres_ptr->topo_cnt; j++) {
if ((gres_avail == 0) || (cores_avail[j] == 0) ||
!node_gres_ptr->topo_core_bitmap[j]) {
cores_addnt[j] = cores_avail[j];
} else {
cores_addnt[j] = cores_avail[j] -
bit_overlap(alloc_core_bitmap,
node_gres_ptr->
topo_core_bitmap[j]);
}
if (top_inx == -1) {
if (cores_avail[j])
top_inx = j;
} else if (cores_addnt[j] > cores_addnt[top_inx])
top_inx = j;
}
if ((top_inx < 0) || (cores_avail[top_inx] == 0)) {
if (gres_total < min_gres_node)
core_cnt = 0;
break;
}
cores_avail[top_inx] = 0; /* Flag as used */
gres_tmp = node_gres_ptr->topo_gres_cnt_avail[top_inx];
if (!use_total_gres &&
(gres_tmp >=
node_gres_ptr->topo_gres_cnt_alloc[top_inx])) {
gres_tmp -= node_gres_ptr->
topo_gres_cnt_alloc[top_inx];
} else if (!use_total_gres) {
gres_tmp = 0;
}
if (gres_tmp == 0) {
error("gres/%s: topology allocation error on node %s",
gres_name, node_name);
break;
}
/* update counts of allocated cores and GRES */
if (shared_gres) {
/*
* Process outside of loop after specific
* device selected
*/
} else if (!node_gres_ptr->topo_core_bitmap[top_inx]) {
bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
} else if (gres_avail) {
bit_or(alloc_core_bitmap,
node_gres_ptr->
topo_core_bitmap[top_inx]);
if (core_bitmap)
bit_and(alloc_core_bitmap,
avail_core_bitmap);
} else {
bit_and(alloc_core_bitmap,
node_gres_ptr->
topo_core_bitmap[top_inx]);
}
if (shared_gres) {
gres_total = MAX(gres_total, gres_tmp);
gres_avail = gres_total;
} else {
/*
* Available GRES count is up to gres_tmp,
* but take 1 per loop to maximize available
* core count
*/
gres_avail += 1;
gres_total += gres_tmp;
core_cnt = bit_set_count(alloc_core_bitmap);
}
}
if (shared_gres && (top_inx >= 0) &&
(gres_avail >= min_gres_node)) {
if (!node_gres_ptr->topo_core_bitmap[top_inx]) {
bit_nset(alloc_core_bitmap, 0, core_ctld - 1);
} else {
bit_or(alloc_core_bitmap,
node_gres_ptr->
topo_core_bitmap[top_inx]);
if (core_bitmap)
bit_and(alloc_core_bitmap,
avail_core_bitmap);
}
core_cnt = bit_set_count(alloc_core_bitmap);
}
if (core_bitmap && (core_cnt > 0)) {
*topo_set = true;
for (i = 0; i < core_ctld; i++) {
if (!bit_test(alloc_core_bitmap, i)) {
bit_clear(core_bitmap,
core_start_bit + i);
}
}
}
FREE_NULL_BITMAP(alloc_core_bitmap);
FREE_NULL_BITMAP(avail_core_bitmap);
xfree(cores_addnt);
xfree(cores_avail);
return core_cnt;
} else if (job_gres_ptr->type_name) {
for (i = 0; i < node_gres_ptr->type_cnt; i++) {
if (node_gres_ptr->type_name[i] &&
(node_gres_ptr->type_id[i] ==
job_gres_ptr->type_id))
break;
}
if (i >= node_gres_ptr->type_cnt)
return (uint32_t) 0; /* no such type */
gres_avail = node_gres_ptr->type_cnt_avail[i];
if (!use_total_gres)
gres_avail -= node_gres_ptr->type_cnt_alloc[i];
gres_tmp = node_gres_ptr->gres_cnt_avail;
if (!use_total_gres)
gres_tmp -= node_gres_ptr->gres_cnt_alloc;
gres_avail = MIN(gres_avail, gres_tmp);
if (min_gres_node > gres_avail)
return (uint32_t) 0; /* insufficient GRES avail */
return NO_VAL;
} else {
gres_avail = node_gres_ptr->gres_cnt_avail;
if (!use_total_gres)
gres_avail -= node_gres_ptr->gres_cnt_alloc;
if (min_gres_node > gres_avail)
return (uint32_t) 0; /* insufficient GRES avail */
return NO_VAL;
}
}
/*
* Clear the core_bitmap for cores which are not usable by this job (i.e. for
* cores which are already bound to other jobs or lack GRES)
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN node_gres_list - node's gres_list built by
* gres_plugin_node_config_validate()
* IN use_total_gres - if set then consider all GRES resources as available,
 *		       and none are committed to running jobs
* IN/OUT core_bitmap - Identification of available cores (NULL if no restriction)
* IN core_start_bit - index into core_bitmap for this node's first cores
* IN core_end_bit - index into core_bitmap for this node's last cores
*/
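/*
 * Illustrative bit layout (hypothetical): if one core_bitmap spans two
 * 8-core nodes, the second node's cores occupy bits 8..15 and would be
 * filtered with core_start_bit=8 and core_end_bit=15.
 */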
extern void gres_plugin_job_core_filter(List job_gres_list, List node_gres_list,
bool use_total_gres,
bitstr_t *core_bitmap,
int core_start_bit, int core_end_bit,
char *node_name)
{
int i;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr, *node_gres_ptr;
if ((job_gres_list == NULL) || (core_bitmap == NULL))
return;
if (node_gres_list == NULL) {
bit_nclear(core_bitmap, core_start_bit, core_end_bit);
return;
}
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
&job_gres_ptr->plugin_id);
if (node_gres_ptr == NULL) {
			/* node lacks resources required by the job */
bit_nclear(core_bitmap, core_start_bit, core_end_bit);
break;
}
for (i = 0; i < gres_context_cnt; i++) {
if (job_gres_ptr->plugin_id !=
gres_context[i].plugin_id)
continue;
_job_core_filter(job_gres_ptr->gres_data,
node_gres_ptr->gres_data,
use_total_gres, core_bitmap,
core_start_bit, core_end_bit,
gres_context[i].gres_name, node_name,
job_gres_ptr->plugin_id);
break;
}
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return;
}
/*
* Determine how many cores on the node can be used by this job
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN node_gres_list - node's gres_list built by gres_plugin_node_config_validate()
* IN use_total_gres - if set then consider all gres resources as available,
 *		       and none are committed to running jobs
* IN core_bitmap - Identification of available cores (NULL if no restriction)
* IN core_start_bit - index into core_bitmap for this node's first core
* IN core_end_bit - index into core_bitmap for this node's last core
* IN job_id - job's ID (for logging)
* IN node_name - name of the node (for logging)
 * IN disable_binding - --gres-flags=disable-binding
* RET: NO_VAL - All cores on node are available
* otherwise - Count of available cores
*/
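/*
 * Illustrative interpretation of the return value:
 *	0      - the node cannot satisfy the job's GRES request
 *	NO_VAL - every core on the node is usable by this job
 *	other  - count of cores usable given the GRES topology
 */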
extern uint32_t gres_plugin_job_test(List job_gres_list, List node_gres_list,
bool use_total_gres, bitstr_t *core_bitmap,
int core_start_bit, int core_end_bit,
uint32_t job_id, char *node_name,
bool disable_binding)
{
int i;
uint32_t core_cnt, tmp_cnt;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr, *node_gres_ptr;
bool topo_set = false;
if (job_gres_list == NULL)
return NO_VAL;
if (node_gres_list == NULL)
return 0;
core_cnt = NO_VAL;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
&job_gres_ptr->plugin_id);
if (node_gres_ptr == NULL) {
			/* node lacks resources required by the job */
core_cnt = 0;
break;
}
for (i = 0; i < gres_context_cnt; i++) {
if (job_gres_ptr->plugin_id !=
gres_context[i].plugin_id)
continue;
tmp_cnt = _job_test(job_gres_ptr->gres_data,
node_gres_ptr->gres_data,
use_total_gres, core_bitmap,
core_start_bit, core_end_bit,
&topo_set, job_id, node_name,
gres_context[i].gres_name,
gres_context[i].plugin_id,
disable_binding);
if (tmp_cnt != NO_VAL) {
if (core_cnt == NO_VAL)
core_cnt = tmp_cnt;
else
core_cnt = MIN(tmp_cnt, core_cnt);
}
break;
}
if (core_cnt == 0)
break;
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return core_cnt;
}
static void _sock_gres_del(void *x)
{
sock_gres_t *sock_gres = (sock_gres_t *) x;
int s;
if (sock_gres) {
FREE_NULL_BITMAP(sock_gres->bits_any_sock);
if (sock_gres->bits_by_sock) {
for (s = 0; s < sock_gres->sock_cnt; s++)
FREE_NULL_BITMAP(sock_gres->bits_by_sock[s]);
xfree(sock_gres->bits_by_sock);
}
xfree(sock_gres->cnt_by_sock);
xfree(sock_gres->gres_name);
/* NOTE: sock_gres->job_specs is just a pointer, do not free */
xfree(sock_gres->type_name);
xfree(sock_gres);
}
}
/*
* Build a string containing the GRES details for a given node and socket
* sock_gres_list IN - List of sock_gres_t entries
* sock_inx IN - zero-origin socket for which information is to be returned
* if value < 0, then report GRES unconstrained by core
* RET string, must call xfree() to release memory
*/
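/*
 * Example output (illustrative): for sock_inx=1 this might return
 * "gpu:tesla:2 nic:mlx:1" (name:type:count, space separated); with
 * sock_inx < 0 only the socket-unconstrained counts are reported.
 */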
extern char *gres_plugin_sock_str(List sock_gres_list, int sock_inx)
{
ListIterator iter;
sock_gres_t *sock_gres;
char *gres_str = NULL, *sep = "";
if (!sock_gres_list)
return NULL;
iter = list_iterator_create(sock_gres_list);
while ((sock_gres = (sock_gres_t *) list_next(iter))) {
if (sock_inx < 0) {
if (sock_gres->cnt_any_sock) {
if (sock_gres->type_name) {
xstrfmtcat(gres_str, "%s%s:%s:%"PRIu64,
sep, sock_gres->gres_name,
sock_gres->type_name,
sock_gres->cnt_any_sock);
} else {
xstrfmtcat(gres_str, "%s%s:%"PRIu64,
sep, sock_gres->gres_name,
sock_gres->cnt_any_sock);
}
sep = " ";
}
continue;
}
if (!sock_gres->cnt_by_sock ||
(sock_gres->cnt_by_sock[sock_inx] == 0))
continue;
if (sock_gres->type_name) {
xstrfmtcat(gres_str, "%s%s:%s:%"PRIu64, sep,
sock_gres->gres_name, sock_gres->type_name,
sock_gres->cnt_by_sock[sock_inx]);
} else {
xstrfmtcat(gres_str, "%s%s:%"PRIu64, sep,
sock_gres->gres_name,
sock_gres->cnt_by_sock[sock_inx]);
}
sep = " ";
}
list_iterator_destroy(iter);
return gres_str;
}
/*
* Determine how many GRES of a given type can be used by this job on a
* given node and return a structure with the details. Note that multiple
* GRES of a given type model can be distributed over multiple topo structures,
* so we need to OR the core_bitmap over all of them.
*/
static sock_gres_t *_build_sock_gres_by_topo(gres_job_state_t *job_gres_ptr,
gres_node_state_t *node_gres_ptr,
bool use_total_gres, bitstr_t *core_bitmap,
uint16_t sockets, uint16_t cores_per_sock,
uint32_t job_id, char *node_name,
bool enforce_binding, uint32_t s_p_n,
bitstr_t **req_sock_map,
uint32_t main_plugin_id, uint32_t alt_plugin_id,
gres_node_state_t *alt_node_gres_ptr,
uint32_t user_id, const uint32_t node_inx)
{
int i, j, s, c, tot_cores;
sock_gres_t *sock_gres;
int64_t add_gres;
uint64_t avail_gres, min_gres = 1;
bool match = false;
bool use_busy_dev = false;
if (node_gres_ptr->gres_cnt_avail == 0)
return NULL;
if (!use_total_gres &&
(main_plugin_id == mps_plugin_id) &&
(node_gres_ptr->gres_cnt_alloc != 0)) {
/* We must use the ONE already active GRES of this type */
use_busy_dev = true;
}
sock_gres = xmalloc(sizeof(sock_gres_t));
sock_gres->sock_cnt = sockets;
sock_gres->bits_by_sock = xcalloc(sockets, sizeof(bitstr_t *));
sock_gres->cnt_by_sock = xcalloc(sockets, sizeof(uint64_t));
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
bool use_all_sockets = false;
if (job_gres_ptr->type_name &&
(job_gres_ptr->type_id != node_gres_ptr->topo_type_id[i]))
continue; /* Wrong type_model */
if (use_busy_dev &&
(node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
continue;
if (!use_total_gres && !node_gres_ptr->no_consume &&
(node_gres_ptr->topo_gres_cnt_alloc[i] >=
node_gres_ptr->topo_gres_cnt_avail[i])) {
continue; /* No GRES remaining */
}
if (!use_total_gres && !node_gres_ptr->no_consume) {
avail_gres = node_gres_ptr->topo_gres_cnt_avail[i] -
node_gres_ptr->topo_gres_cnt_alloc[i];
} else {
avail_gres = node_gres_ptr->topo_gres_cnt_avail[i];
}
if (avail_gres == 0)
continue;
/*
* Job requested GPUs or MPS. Filter out resources already
* allocated to the other GRES type.
*/
if (alt_node_gres_ptr && alt_node_gres_ptr->gres_bit_alloc &&
node_gres_ptr->topo_gres_bitmap[i]) {
c = bit_overlap(node_gres_ptr->topo_gres_bitmap[i],
alt_node_gres_ptr->gres_bit_alloc);
if ((alt_plugin_id == gpu_plugin_id) && (c > 0))
continue;
if ((alt_plugin_id == mps_plugin_id) && (c > 0)) {
avail_gres -= c;
if (avail_gres == 0)
continue;
}
}
/* gres/mps can only use one GPU per node */
if ((main_plugin_id == mps_plugin_id) &&
(avail_gres > sock_gres->max_node_gres))
sock_gres->max_node_gres = avail_gres;
/*
* If some GRES is available on every socket,
		 * treat it as though no topo_core_bitmap were specified
*/
tot_cores = sockets * cores_per_sock;
if (node_gres_ptr->topo_core_bitmap &&
node_gres_ptr->topo_core_bitmap[i]) {
use_all_sockets = true;
for (s = 0; s < sockets; s++) {
bool use_this_socket = false;
for (c = 0; c < cores_per_sock; c++) {
j = (s * cores_per_sock) + c;
if (bit_test(node_gres_ptr->
topo_core_bitmap[i], j)) {
use_this_socket = true;
break;
}
}
if (!use_this_socket) {
use_all_sockets = false;
break;
}
}
}
if (!node_gres_ptr->topo_core_bitmap ||
!node_gres_ptr->topo_core_bitmap[i] ||
use_all_sockets) {
/*
* Not constrained by core, but only specific
* GRES may be available (save their bitmap)
*/
sock_gres->cnt_any_sock += avail_gres;
sock_gres->total_cnt += avail_gres;
if (!sock_gres->bits_any_sock) {
sock_gres->bits_any_sock =
bit_copy(node_gres_ptr->
topo_gres_bitmap[i]);
} else {
bit_or(sock_gres->bits_any_sock,
node_gres_ptr->topo_gres_bitmap[i]);
}
match = true;
continue;
}
/* Constrained by core */
if (core_bitmap)
tot_cores = MIN(tot_cores, bit_size(core_bitmap));
if (node_gres_ptr->topo_core_bitmap[i]) {
tot_cores = MIN(tot_cores,
bit_size(node_gres_ptr->
topo_core_bitmap[i]));
}
for (s = 0; ((s < sockets) && avail_gres); s++) {
if (enforce_binding && core_bitmap) {
for (c = 0; c < cores_per_sock; c++) {
j = (s * cores_per_sock) + c;
if (bit_test(core_bitmap, j))
break;
}
if (c >= cores_per_sock) {
/* No available cores on this socket */
continue;
}
}
for (c = 0; c < cores_per_sock; c++) {
j = (s * cores_per_sock) + c;
if (j >= tot_cores)
break; /* Off end of core bitmap */
if (node_gres_ptr->topo_core_bitmap[i] &&
!bit_test(node_gres_ptr->topo_core_bitmap[i],
j))
continue;
if (!node_gres_ptr->topo_gres_bitmap[i]) {
error("%s: topo_gres_bitmap NULL on node %s",
__func__, node_name);
continue;
}
if (!sock_gres->bits_by_sock[s]) {
sock_gres->bits_by_sock[s] =
bit_copy(node_gres_ptr->
topo_gres_bitmap[i]);
} else {
bit_or(sock_gres->bits_by_sock[s],
node_gres_ptr->topo_gres_bitmap[i]);
}
sock_gres->cnt_by_sock[s] += avail_gres;
sock_gres->total_cnt += avail_gres;
avail_gres = 0;
match = true;
break;
}
}
}
/* Process per-GRES limits */
if (match && job_gres_ptr->gres_per_socket) {
/*
* Clear core bitmap on sockets with insufficient GRES
* and disable excess GRES per socket
*/
for (s = 0; s < sockets; s++) {
if (sock_gres->cnt_by_sock[s] <
job_gres_ptr->gres_per_socket) {
/* Insufficient GRES, clear count */
sock_gres->total_cnt -=
sock_gres->cnt_by_sock[s];
sock_gres->cnt_by_sock[s] = 0;
if (enforce_binding && core_bitmap) {
i = s * cores_per_sock;
bit_nclear(core_bitmap, i,
i + cores_per_sock - 1);
}
} else if (sock_gres->cnt_by_sock[s] >
job_gres_ptr->gres_per_socket) {
/* Excess GRES, reduce count */
i = sock_gres->cnt_by_sock[s] -
job_gres_ptr->gres_per_socket;
sock_gres->cnt_by_sock[s] =
job_gres_ptr->gres_per_socket;
sock_gres->total_cnt -= i;
}
}
}
/*
* Satisfy sockets-per-node (s_p_n) limit by selecting the sockets with
* the most GRES. Sockets with low GRES counts have their core_bitmap
* cleared so that _allocate_sc() in cons_tres/job_test.c does not
* remove sockets needed to satisfy the job's GRES specification.
*/
if (match && enforce_binding && core_bitmap && (s_p_n < sockets)) {
int avail_sock = 0;
bool *avail_sock_flag = xcalloc(sockets, sizeof(bool));
for (s = 0; s < sockets; s++) {
if (sock_gres->cnt_by_sock[s] == 0)
continue;
for (c = 0; c < cores_per_sock; c++) {
i = (s * cores_per_sock) + c;
if (!bit_test(core_bitmap, i))
continue;
avail_sock++;
avail_sock_flag[s] = true;
break;
}
}
while (avail_sock > s_p_n) {
int low_gres_sock_inx = -1;
for (s = 0; s < sockets; s++) {
if (!avail_sock_flag[s])
continue;
if ((low_gres_sock_inx == -1) ||
(sock_gres->cnt_by_sock[s] <
sock_gres->cnt_by_sock[low_gres_sock_inx]))
low_gres_sock_inx = s;
}
if (low_gres_sock_inx == -1)
break;
s = low_gres_sock_inx;
i = s * cores_per_sock;
bit_nclear(core_bitmap, i, i + cores_per_sock - 1);
sock_gres->total_cnt -= sock_gres->cnt_by_sock[s];
sock_gres->cnt_by_sock[s] = 0;
avail_sock--;
avail_sock_flag[s] = false;
}
xfree(avail_sock_flag);
}
if (match) {
if (job_gres_ptr->gres_per_node)
min_gres = job_gres_ptr->gres_per_node;
if (job_gres_ptr->gres_per_task)
min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
if (sock_gres->total_cnt < min_gres)
match = false;
}
/*
* If sockets-per-node (s_p_n) not specified then identify sockets
* which are required to satisfy gres_per_node or task specification
* so that allocated tasks can be distributed over multiple sockets
* if necessary.
*/
add_gres = min_gres - sock_gres->cnt_any_sock;
if (match && core_bitmap && (s_p_n == NO_VAL) && (add_gres > 0) &&
job_gres_ptr->gres_per_node) {
int avail_sock = 0, best_sock_inx = -1;
bool *avail_sock_flag = xcalloc(sockets, sizeof(bool));
for (s = 0; s < sockets; s++) {
if (sock_gres->cnt_by_sock[s] == 0)
continue;
for (c = 0; c < cores_per_sock; c++) {
i = (s * cores_per_sock) + c;
if (!bit_test(core_bitmap, i))
continue;
avail_sock++;
avail_sock_flag[s] = true;
if ((best_sock_inx == -1) ||
(sock_gres->cnt_by_sock[s] >
sock_gres->cnt_by_sock[best_sock_inx])) {
best_sock_inx = s;
}
break;
}
}
while ((best_sock_inx != -1) && (add_gres > 0)) {
if (*req_sock_map == NULL)
*req_sock_map = bit_alloc(sockets);
bit_set(*req_sock_map, best_sock_inx);
add_gres -= sock_gres->cnt_by_sock[best_sock_inx];
avail_sock_flag[best_sock_inx] = false;
if (add_gres <= 0)
break;
/* Find next best socket */
best_sock_inx = -1;
for (s = 0; s < sockets; s++) {
if ((sock_gres->cnt_by_sock[s] == 0) ||
!avail_sock_flag[s])
continue;
if ((best_sock_inx == -1) ||
(sock_gres->cnt_by_sock[s] >
sock_gres->cnt_by_sock[best_sock_inx])) {
best_sock_inx = s;
}
}
}
xfree(avail_sock_flag);
}
if (match) {
sock_gres->type_id = job_gres_ptr->type_id;
sock_gres->type_name = xstrdup(job_gres_ptr->type_name);
} else {
_sock_gres_del(sock_gres);
sock_gres = NULL;
}
return sock_gres;
}
/*
* Determine how many GRES of a given type can be used by this job on a
* given node and return a structure with the details. Note that multiple
* GRES of a given type model can be configured, so pick the right one.
*/
static sock_gres_t *_build_sock_gres_by_type(gres_job_state_t *job_gres_ptr,
gres_node_state_t *node_gres_ptr,
bool use_total_gres, bitstr_t *core_bitmap,
uint16_t sockets, uint16_t cores_per_sock,
uint32_t job_id, char *node_name)
{
int i;
sock_gres_t *sock_gres;
uint64_t avail_gres, min_gres = 1, gres_tmp;
bool match = false;
if (job_gres_ptr->gres_per_node)
		min_gres = job_gres_ptr->gres_per_node;
if (job_gres_ptr->gres_per_socket)
min_gres = MAX(min_gres, job_gres_ptr->gres_per_socket);
if (job_gres_ptr->gres_per_task)
min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
sock_gres = xmalloc(sizeof(sock_gres_t));
for (i = 0; i < node_gres_ptr->type_cnt; i++) {
if (job_gres_ptr->type_name &&
(job_gres_ptr->type_id != node_gres_ptr->type_id[i]))
continue; /* Wrong type_model */
if (!use_total_gres &&
(node_gres_ptr->type_cnt_alloc[i] >=
node_gres_ptr->type_cnt_avail[i])) {
continue; /* No GRES remaining */
} else if (!use_total_gres) {
avail_gres = node_gres_ptr->type_cnt_avail[i] -
node_gres_ptr->type_cnt_alloc[i];
} else {
avail_gres = node_gres_ptr->type_cnt_avail[i];
}
gres_tmp = node_gres_ptr->gres_cnt_avail;
if (!use_total_gres)
gres_tmp -= node_gres_ptr->gres_cnt_alloc;
avail_gres = MIN(avail_gres, gres_tmp);
if (avail_gres < min_gres)
continue; /* Insufficient GRES remaining */
sock_gres->cnt_any_sock += avail_gres;
sock_gres->total_cnt += avail_gres;
match = true;
}
if (match) {
sock_gres->type_id = job_gres_ptr->type_id;
sock_gres->type_name = xstrdup(job_gres_ptr->type_name);
} else
xfree(sock_gres);
return sock_gres;
}
/*
* Determine how many GRES of a given type can be used by this job on a
* given node and return a structure with the details. No GRES type.
*/
static sock_gres_t *_build_sock_gres_basic(gres_job_state_t *job_gres_ptr,
gres_node_state_t *node_gres_ptr,
bool use_total_gres, bitstr_t *core_bitmap,
uint16_t sockets, uint16_t cores_per_sock,
uint32_t job_id, char *node_name)
{
sock_gres_t *sock_gres;
uint64_t avail_gres, min_gres = 1;
if (job_gres_ptr->type_name)
return NULL;
if (!use_total_gres &&
(node_gres_ptr->gres_cnt_alloc >= node_gres_ptr->gres_cnt_avail))
return NULL; /* No GRES remaining */
if (job_gres_ptr->gres_per_node)
		min_gres = job_gres_ptr->gres_per_node;
if (job_gres_ptr->gres_per_socket)
min_gres = MAX(min_gres, job_gres_ptr->gres_per_socket);
if (job_gres_ptr->gres_per_task)
min_gres = MAX(min_gres, job_gres_ptr->gres_per_task);
if (!use_total_gres) {
avail_gres = node_gres_ptr->gres_cnt_avail -
node_gres_ptr->gres_cnt_alloc;
} else
avail_gres = node_gres_ptr->gres_cnt_avail;
if (avail_gres < min_gres)
return NULL; /* Insufficient GRES remaining */
sock_gres = xmalloc(sizeof(sock_gres_t));
sock_gres->cnt_any_sock += avail_gres;
sock_gres->total_cnt += avail_gres;
return sock_gres;
}
static void _sock_gres_log(List sock_gres_list, char *node_name)
{
sock_gres_t *sock_gres;
ListIterator iter;
int i, len = -1;
char tmp[32] = "";
if (!sock_gres_list)
return;
info("Sock_gres state for %s", node_name);
iter = list_iterator_create(sock_gres_list);
while ((sock_gres = (sock_gres_t *) list_next(iter))) {
info("Gres:%s Type:%s TotalCnt:%"PRIu64" MaxNodeGres:%"PRIu64,
sock_gres->gres_name, sock_gres->type_name,
sock_gres->total_cnt, sock_gres->max_node_gres);
if (sock_gres->bits_any_sock) {
bit_fmt(tmp, sizeof(tmp), sock_gres->bits_any_sock);
len = bit_size(sock_gres->bits_any_sock);
}
info(" Sock[ANY]Cnt:%"PRIu64" Bits:%s of %d",
sock_gres->cnt_any_sock, tmp, len);
for (i = 0; i < sock_gres->sock_cnt; i++) {
if (sock_gres->cnt_by_sock[i] == 0)
continue;
tmp[0] = '\0';
len = -1;
if (sock_gres->bits_by_sock &&
sock_gres->bits_by_sock[i]) {
bit_fmt(tmp, sizeof(tmp),
sock_gres->bits_by_sock[i]);
len = bit_size(sock_gres->bits_by_sock[i]);
}
info(" Sock[%d]Cnt:%"PRIu64" Bits:%s of %d", i,
sock_gres->cnt_by_sock[i], tmp, len);
}
}
list_iterator_destroy(iter);
}
/*
* Determine how many cores on each socket of a node can be used by this job
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN node_gres_list - node's gres_list built by gres_plugin_node_config_validate()
* IN use_total_gres - if set then consider all gres resources as available,
 *		       and none are committed to running jobs
* IN/OUT core_bitmap - Identification of available cores on this node
* IN sockets - Count of sockets on the node
* IN cores_per_sock - Count of cores per socket on this node
* IN job_id - job's ID (for logging)
* IN node_name - name of the node (for logging)
* IN enforce_binding - if true then only use GRES with direct access to cores
* IN s_p_n - Expected sockets_per_node (NO_VAL if not limited)
 * OUT req_sock_map - bitmap of specific required sockets
* IN user_id - job's user ID
* IN node_inx - index of node to be evaluated
* RET: List of sock_gres_t entries identifying what resources are available on
* each socket. Returns NULL if none available. Call FREE_NULL_LIST() to
* release memory.
*/
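/*
 * Illustrative call (hypothetical variable names):
 *
 *	List sock_gres_list = gres_plugin_job_test2(job_gres_list,
 *				node_gres_list, false, core_bitmap,
 *				sockets, cores_per_sock, job_id, node_name,
 *				false, NO_VAL, &req_sock_map, user_id,
 *				node_inx);
 *	...
 *	FREE_NULL_LIST(sock_gres_list);
 */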
extern List gres_plugin_job_test2(List job_gres_list, List node_gres_list,
bool use_total_gres, bitstr_t *core_bitmap,
uint16_t sockets, uint16_t cores_per_sock,
uint32_t job_id, char *node_name,
bool enforce_binding, uint32_t s_p_n,
bitstr_t **req_sock_map, uint32_t user_id,
const uint32_t node_inx)
{
List sock_gres_list = NULL;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr, *node_gres_ptr;
gres_job_state_t *job_data_ptr;
gres_node_state_t *node_data_ptr;
uint32_t local_s_p_n;
if (!job_gres_list || (list_count(job_gres_list) == 0))
return sock_gres_list;
if (!node_gres_list) /* Node lacks GRES to match */
return sock_gres_list;
(void) gres_plugin_init();
sock_gres_list = list_create(_sock_gres_del);
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
sock_gres_t *sock_gres = NULL;
node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
&job_gres_ptr->plugin_id);
if (node_gres_ptr == NULL) {
			/* node lacks GRES of a type required by the job */
FREE_NULL_LIST(sock_gres_list);
break;
}
job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
node_data_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;
if (job_data_ptr->gres_per_job &&
!job_data_ptr->gres_per_socket)
local_s_p_n = s_p_n; /* Maximize GRES per node */
else
local_s_p_n = NO_VAL; /* No need to optimize socket */
if (core_bitmap && (bit_ffs(core_bitmap) == -1)) {
sock_gres = NULL; /* No cores available */
} else if (node_data_ptr->topo_cnt) {
uint32_t alt_plugin_id = 0;
gres_node_state_t *alt_node_data_ptr = NULL;
if (!use_total_gres && have_gpu && have_mps) {
if (job_gres_ptr->plugin_id == gpu_plugin_id)
alt_plugin_id = mps_plugin_id;
if (job_gres_ptr->plugin_id == mps_plugin_id)
alt_plugin_id = gpu_plugin_id;
}
if (alt_plugin_id) {
node_gres_ptr = list_find_first(node_gres_list,
_gres_find_id,
&alt_plugin_id);
}
if (alt_plugin_id && node_gres_ptr) {
alt_node_data_ptr = (gres_node_state_t *)
node_gres_ptr->gres_data;
} else {
/* GRES of interest not on this node */
alt_plugin_id = 0;
}
sock_gres = _build_sock_gres_by_topo(job_data_ptr,
node_data_ptr, use_total_gres,
core_bitmap, sockets, cores_per_sock,
job_id, node_name, enforce_binding,
local_s_p_n, req_sock_map,
job_gres_ptr->plugin_id,
alt_plugin_id, alt_node_data_ptr,
user_id, node_inx);
} else if (node_data_ptr->type_cnt) {
sock_gres = _build_sock_gres_by_type(job_data_ptr,
node_data_ptr, use_total_gres,
core_bitmap, sockets, cores_per_sock,
job_id, node_name);
} else {
sock_gres = _build_sock_gres_basic(job_data_ptr,
node_data_ptr, use_total_gres,
core_bitmap, sockets, cores_per_sock,
job_id, node_name);
}
if (!sock_gres) {
			/* node lacks available resources required by the job */
bit_clear_all(core_bitmap);
FREE_NULL_LIST(sock_gres_list);
break;
}
sock_gres->job_specs = job_data_ptr;
sock_gres->gres_name = xstrdup(job_data_ptr->gres_name);
sock_gres->node_specs = node_data_ptr;
sock_gres->plugin_id = job_gres_ptr->plugin_id;
list_append(sock_gres_list, sock_gres);
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
if (gres_debug)
_sock_gres_log(sock_gres_list, node_name);
return sock_gres_list;
}
static bool *_build_avail_cores_by_sock(bitstr_t *core_bitmap,
uint16_t sockets,
uint16_t cores_per_sock)
{
bool *avail_cores_by_sock = xcalloc(sockets, sizeof(bool));
int s, c, i, lim = 0;
lim = bit_size(core_bitmap);
for (s = 0; s < sockets; s++) {
for (c = 0; c < cores_per_sock; c++) {
i = (s * cores_per_sock) + c;
if (i >= lim)
goto fini; /* should never happen */
if (bit_test(core_bitmap, i)) {
avail_cores_by_sock[s] = true;
break;
}
}
}
fini: return avail_cores_by_sock;
}
/*
* Determine which GRES can be used on this node given the available cores.
* Filter out unusable GRES.
* IN sock_gres_list - list of sock_gres_t entries built by gres_plugin_job_test2()
* IN avail_mem - memory available for the job
* IN max_cpus - maximum CPUs available on this node (limited by
* specialized cores and partition CPUs-per-node)
* IN enforce_binding - GRES must be co-allocated with cores
* IN core_bitmap - Identification of available cores on this node
* IN sockets - Count of sockets on the node
* IN cores_per_sock - Count of cores per socket on this node
* IN cpus_per_core - Count of CPUs per core on this node
* IN sock_per_node - sockets requested by job per node or NO_VAL
* IN task_per_node - tasks requested by job per node or NO_VAL16
* IN whole_node - we are requesting the whole node or not
* OUT avail_gpus - Count of available GPUs on this node
* OUT near_gpus - Count of GPUs available on sockets with available CPUs
* RET - 0 if job can use this node, -1 otherwise (some GRES limit prevents use)
*/
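/*
 * Worked limit example (illustrative): with cpus_per_gres=4 and max_cpus=16
 * at most 16/4 = 4 GRES are usable; with mem_per_gres=8192 and
 * avail_mem=20480 at most 20480/8192 = 2 GRES are usable, which becomes
 * the record's max_node_gres.
 */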
extern int gres_plugin_job_core_filter2(List sock_gres_list, uint64_t avail_mem,
uint16_t max_cpus,
bool enforce_binding,
bitstr_t *core_bitmap,
uint16_t sockets,
uint16_t cores_per_sock,
uint16_t cpus_per_core,
uint32_t sock_per_node,
uint16_t task_per_node,
bool whole_node,
uint16_t *avail_gpus,
uint16_t *near_gpus)
{
ListIterator sock_gres_iter;
sock_gres_t *sock_gres;
bool *avail_cores_by_sock = NULL;
uint64_t max_gres, mem_per_gres = 0, near_gres_cnt = 0;
uint16_t cpus_per_gres;
int s, rc = 0;
*avail_gpus = 0;
*near_gpus = 0;
if (!core_bitmap || !sock_gres_list ||
(list_count(sock_gres_list) == 0))
return rc;
sock_gres_iter = list_iterator_create(sock_gres_list);
while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
uint64_t min_gres = 1, tmp_u64;
if (sock_gres->job_specs) {
gres_job_state_t *job_gres_ptr = sock_gres->job_specs;
if (whole_node)
min_gres = sock_gres->total_cnt;
else if (job_gres_ptr->gres_per_node)
				min_gres = job_gres_ptr->gres_per_node;
if (job_gres_ptr->gres_per_socket) {
tmp_u64 = job_gres_ptr->gres_per_socket;
if (sock_per_node != NO_VAL)
tmp_u64 *= sock_per_node;
min_gres = MAX(min_gres, tmp_u64);
}
if (job_gres_ptr->gres_per_task) {
tmp_u64 = job_gres_ptr->gres_per_task;
if (task_per_node != NO_VAL16)
tmp_u64 *= task_per_node;
min_gres = MAX(min_gres, tmp_u64);
}
}
if (!sock_gres->job_specs)
cpus_per_gres = 0;
else if (sock_gres->job_specs->cpus_per_gres)
cpus_per_gres = sock_gres->job_specs->cpus_per_gres;
else
cpus_per_gres = sock_gres->job_specs->def_cpus_per_gres;
if (cpus_per_gres) {
max_gres = max_cpus / cpus_per_gres;
if ((max_gres == 0) ||
(sock_gres->job_specs->gres_per_node > max_gres) ||
(sock_gres->job_specs->gres_per_task > max_gres) ||
(sock_gres->job_specs->gres_per_socket > max_gres)){
/* Insufficient CPUs for any GRES */
rc = -1;
break;
}
}
if (!sock_gres->job_specs)
mem_per_gres = 0;
else if (sock_gres->job_specs->mem_per_gres)
mem_per_gres = sock_gres->job_specs->mem_per_gres;
else
mem_per_gres = sock_gres->job_specs->def_mem_per_gres;
if (mem_per_gres && avail_mem) {
if (mem_per_gres <= avail_mem) {
sock_gres->max_node_gres = avail_mem /
mem_per_gres;
} else { /* Insufficient memory for any GRES */
rc = -1;
break;
}
}
if (sock_gres->cnt_by_sock || enforce_binding) {
if (!avail_cores_by_sock) {
				avail_cores_by_sock = _build_avail_cores_by_sock(
core_bitmap, sockets,
cores_per_sock);
}
}
/*
* NOTE: gres_per_socket enforcement is performed by
* _build_sock_gres_by_topo(), called by gres_plugin_job_test2()
*/
if (sock_gres->cnt_by_sock && enforce_binding) {
for (s = 0; s < sockets; s++) {
if (avail_cores_by_sock[s] == 0) {
sock_gres->total_cnt -=
sock_gres->cnt_by_sock[s];
sock_gres->cnt_by_sock[s] = 0;
}
}
near_gres_cnt = sock_gres->total_cnt;
} else if (sock_gres->cnt_by_sock) { /* NO enforce_binding */
near_gres_cnt = sock_gres->total_cnt;
for (s = 0; s < sockets; s++) {
if (avail_cores_by_sock[s] == 0) {
near_gres_cnt -=
sock_gres->cnt_by_sock[s];
}
}
} else {
near_gres_cnt = sock_gres->total_cnt;
}
if (sock_gres->job_specs && !whole_node &&
sock_gres->job_specs->gres_per_node) {
if ((sock_gres->max_node_gres == 0) ||
(sock_gres->max_node_gres >
sock_gres->job_specs->gres_per_node)) {
sock_gres->max_node_gres =
sock_gres->job_specs->gres_per_node;
}
}
if (cpus_per_gres) {
int cpu_cnt;
cpu_cnt = bit_set_count(core_bitmap);
cpu_cnt *= cpus_per_core;
max_gres = cpu_cnt / cpus_per_gres;
if (max_gres == 0) {
rc = -1;
break;
} else if ((sock_gres->max_node_gres == 0) ||
(sock_gres->max_node_gres > max_gres)) {
sock_gres->max_node_gres = max_gres;
}
}
if (mem_per_gres) {
max_gres = avail_mem / mem_per_gres;
sock_gres->total_cnt = MIN(sock_gres->total_cnt,
max_gres);
}
if ((sock_gres->total_cnt < min_gres) ||
((sock_gres->max_node_gres != 0) &&
(sock_gres->max_node_gres < min_gres))) {
rc = -1;
break;
}
if (_sharing_gres(sock_gres->plugin_id)) {
*avail_gpus += sock_gres->total_cnt;
if (sock_gres->max_node_gres &&
(sock_gres->max_node_gres < near_gres_cnt))
near_gres_cnt = sock_gres->max_node_gres;
if (*near_gpus < 0xff) /* avoid overflow */
*near_gpus += near_gres_cnt;
}
}
list_iterator_destroy(sock_gres_iter);
xfree(avail_cores_by_sock);
return rc;
}
/* Order GRES scheduling. Schedule GRES requiring specific sockets first */
static int _sock_gres_sort(void *x, void *y)
{
sock_gres_t *sock_gres1 = *(sock_gres_t **) x;
sock_gres_t *sock_gres2 = *(sock_gres_t **) y;
int weight1 = 0, weight2 = 0;
if (sock_gres1->node_specs && !sock_gres1->node_specs->topo_cnt)
weight1 += 0x02;
if (sock_gres1->job_specs && !sock_gres1->job_specs->gres_per_socket)
weight1 += 0x01;
if (sock_gres2->node_specs && !sock_gres2->node_specs->topo_cnt)
weight2 += 0x02;
if (sock_gres2->job_specs && !sock_gres2->job_specs->gres_per_socket)
weight2 += 0x01;
return weight1 - weight2;
}
static int _sort_sockets_by_avail_cores(const void *x, const void *y,
void *socket_avail_cores)
{
uint16_t *sockets = (uint16_t *)socket_avail_cores;
return (sockets[*(int *)y] - sockets[*(int *)x]);
}
/*
* Determine how many tasks can be started on a given node and which
* sockets/cores are required
* IN mc_ptr - job's multi-core specs, NO_VAL and INFINITE mapped to zero
* IN sock_gres_list - list of sock_gres_t entries built by gres_plugin_job_test2()
* IN sockets - Count of sockets on the node
* IN cores_per_socket - Count of cores per socket on the node
* IN cpus_per_core - Count of CPUs per core on the node
* IN avail_cpus - Count of available CPUs on the node, UPDATED
* IN min_tasks_this_node - Minimum count of tasks that can be started on this
* node, UPDATED
* IN max_tasks_this_node - Maximum count of tasks that can be started on this
* node or NO_VAL, UPDATED
* IN rem_nodes - desired additional node count to allocate, including this node
* IN enforce_binding - GRES must be co-allocated with cores
* IN first_pass - set if first scheduling attempt for this job, use
* co-located GRES and cores if possible
* IN avail_core - cores available on this node, UPDATED
*/
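/*
 * Worked rounding example (illustrative): task_cnt_incr is the incoming
 * *min_tasks_this_node, e.g. 4 for a 4-tasks-per-node request.  If GRES
 * limits reduce *max_tasks_this_node to 7, it is rounded down to 4 so the
 * task count remains a multiple of the original minimum.
 */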
extern void gres_plugin_job_core_filter3(gres_mc_data_t *mc_ptr,
List sock_gres_list,
uint16_t sockets,
uint16_t cores_per_socket,
uint16_t cpus_per_core,
uint16_t *avail_cpus,
uint32_t *min_tasks_this_node,
uint32_t *max_tasks_this_node,
int rem_nodes,
bool enforce_binding,
bool first_pass,
bitstr_t *avail_core)
{
static uint16_t select_type_param = NO_VAL16;
ListIterator sock_gres_iter;
sock_gres_t *sock_gres;
gres_job_state_t *job_specs;
int i, j, c, s, sock_cnt = 0, req_cores, rem_sockets, full_socket;
int tot_core_cnt = 0, min_core_cnt = 1;
uint64_t cnt_avail_sock, cnt_avail_total, max_gres = 0, rem_gres = 0;
uint64_t tot_gres_sock, max_tasks;
uint32_t task_cnt_incr;
bool *req_sock = NULL; /* Required socket */
int *socket_index = NULL; /* Socket indexes */
uint16_t *avail_cores_per_sock, cpus_per_gres;
uint16_t avail_cores_tot;
if (*max_tasks_this_node == 0)
return;
xassert(avail_core);
avail_cores_per_sock = xcalloc(sockets, sizeof(uint16_t));
for (s = 0; s < sockets; s++) {
for (c = 0; c < cores_per_socket; c++) {
i = (s * cores_per_socket) + c;
if (bit_test(avail_core, i))
avail_cores_per_sock[s]++;
}
tot_core_cnt += avail_cores_per_sock[s];
}
task_cnt_incr = *min_tasks_this_node;
req_sock = xcalloc(sockets, sizeof(bool));
socket_index = xcalloc(sockets, sizeof(int));
list_sort(sock_gres_list, _sock_gres_sort);
sock_gres_iter = list_iterator_create(sock_gres_list);
while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
bool sufficient_gres;
job_specs = sock_gres->job_specs;
if (!job_specs)
continue;
if (job_specs->gres_per_job &&
(job_specs->total_gres < job_specs->gres_per_job)) {
rem_gres = job_specs->gres_per_job -
job_specs->total_gres;
}
/*
* gres_plugin_job_core_filter2() sets sock_gres->max_node_gres
* for mem_per_gres enforcement; use it to set GRES limit for
* this node (max_gres).
*/
if (sock_gres->max_node_gres) {
if (rem_gres && (rem_gres < sock_gres->max_node_gres))
max_gres = rem_gres;
else
max_gres = sock_gres->max_node_gres;
}
rem_nodes = MAX(rem_nodes, 1);
rem_sockets = MAX(1, mc_ptr->sockets_per_node);
if (max_gres &&
((job_specs->gres_per_node > max_gres) ||
((job_specs->gres_per_socket * rem_sockets) > max_gres))) {
*max_tasks_this_node = 0;
break;
}
if (job_specs->gres_per_node && job_specs->gres_per_task) {
max_tasks = job_specs->gres_per_node /
job_specs->gres_per_task;
if ((max_tasks == 0) ||
(max_tasks > *max_tasks_this_node) ||
(max_tasks < *min_tasks_this_node)) {
*max_tasks_this_node = 0;
break;
}
if ((*max_tasks_this_node == NO_VAL) ||
(*max_tasks_this_node > max_tasks))
				*max_tasks_this_node = max_tasks;
}
min_core_cnt = MAX(*min_tasks_this_node, 1) *
MAX(mc_ptr->cpus_per_task, 1);
min_core_cnt = (min_core_cnt + cpus_per_core - 1) /
cpus_per_core;
if (job_specs->cpus_per_gres)
cpus_per_gres = job_specs->cpus_per_gres;
else
cpus_per_gres = job_specs->def_cpus_per_gres;
/* Filter out unusable GRES by socket */
avail_cores_tot = 0;
cnt_avail_total = sock_gres->cnt_any_sock;
sufficient_gres = false;
for (s = 0; s < sockets; s++)
socket_index[s] = s;
qsort_r(socket_index, sockets, sizeof(int),
_sort_sockets_by_avail_cores, avail_cores_per_sock);
for (j = 0; j < sockets; j++) {
/*
* Test for sufficient gres_per_socket
*
* Start with socket with most cores available,
* so we know that we have max number of cores on socket
* with allocated GRES.
*/
s = socket_index[j];
if (sock_gres->cnt_by_sock) {
cnt_avail_sock = sock_gres->cnt_by_sock[s];
} else
cnt_avail_sock = 0;
/*
			 * If enforce-binding is set, the number of GRES
			 * allocated per socket has to be limited by
			 * cpus_per_gres
*/
if ((enforce_binding || first_pass) && cpus_per_gres) {
int max_gres_socket = (avail_cores_per_sock[s] *
cpus_per_core) /
cpus_per_gres;
cnt_avail_sock = MIN(cnt_avail_sock,
max_gres_socket);
}
tot_gres_sock = sock_gres->cnt_any_sock +
cnt_avail_sock;
if ((job_specs->gres_per_socket > tot_gres_sock) ||
(tot_gres_sock == 0)) {
/*
* Insufficient GRES on this socket
* GRES removed here won't be used in 2nd pass
*/
if (((job_specs->gres_per_socket >
tot_gres_sock) ||
enforce_binding) &&
sock_gres->cnt_by_sock) {
sock_gres->total_cnt -=
sock_gres->cnt_by_sock[s];
sock_gres->cnt_by_sock[s] = 0;
}
if (first_pass &&
(tot_core_cnt > min_core_cnt)) {
for (c = cores_per_socket - 1;
c >= 0; c--) {
i = (s * cores_per_socket) + c;
if (!bit_test(avail_core, i))
continue;
bit_clear(avail_core, i);
avail_cores_per_sock[s]--;
if (bit_set_count(avail_core) *
cpus_per_core <
*avail_cpus) {
*avail_cpus -=
cpus_per_core;
}
if (--tot_core_cnt <=
min_core_cnt)
break;
}
}
}
avail_cores_tot += avail_cores_per_sock[s];
/* Test for available cores on this socket */
if ((enforce_binding || first_pass) &&
(avail_cores_per_sock[s] == 0))
continue;
cnt_avail_total += cnt_avail_sock;
if (!sufficient_gres) {
req_sock[s] = true;
sock_cnt++;
}
if (job_specs->gres_per_node &&
(cnt_avail_total >= job_specs->gres_per_node) &&
!sock_gres->cnt_any_sock) {
/*
				 * Sufficient GRES have been collected, so
				 * leave the remaining sockets as !req_sock.
				 * We do this only once enough GRES have been
				 * gathered and all gathered GRES of the
				 * considered type are bound to a socket.
*/
sufficient_gres = true;
}
}
if (cpus_per_gres) {
max_gres = *avail_cpus / cpus_per_gres;
cnt_avail_total = MIN(cnt_avail_total, max_gres);
}
if ((cnt_avail_total == 0) ||
(job_specs->gres_per_node > cnt_avail_total) ||
(job_specs->gres_per_task > cnt_avail_total)) {
*max_tasks_this_node = 0;
}
if (job_specs->gres_per_task) {
max_tasks = cnt_avail_total / job_specs->gres_per_task;
*max_tasks_this_node = MIN(*max_tasks_this_node,
max_tasks);
}
/*
		 * min_tasks_this_node and max_tasks_this_node must be
		 * multiples of the original min_tasks_this_node value.
		 * This is needed to support the ntasks_per_* options;
		 * we just need to select a count of tasks, sockets, etc.
		 * Round the values down.
*/
*min_tasks_this_node = (*min_tasks_this_node / task_cnt_incr) *
task_cnt_incr;
*max_tasks_this_node = (*max_tasks_this_node / task_cnt_incr) *
task_cnt_incr;
if (*max_tasks_this_node == 0)
break;
/*
		 * Remove cores from sockets that are not required when
		 * enforce-binding is set; this must happen even when
		 * max_tasks_this_node == NO_VAL
*/
if (enforce_binding || first_pass) {
for (s = 0; s < sockets; s++) {
if (req_sock[s])
continue;
for (c = cores_per_socket - 1; c >= 0; c--) {
i = (s * cores_per_socket) + c;
if (!bit_test(avail_core, i))
continue;
bit_clear(avail_core, i);
if (bit_set_count(avail_core) *
cpus_per_core < *avail_cpus) {
*avail_cpus -= cpus_per_core;
}
avail_cores_tot--;
avail_cores_per_sock[s]--;
}
}
}
if (*max_tasks_this_node == NO_VAL) {
if (cpus_per_gres) {
i = *avail_cpus / cpus_per_gres;
sock_gres->total_cnt =
MIN(i, sock_gres->total_cnt);
}
log_flag(GRES, "%s: max_tasks_this_node is set to NO_VAL, won't clear non-needed cores",
__func__);
continue;
}
if (*max_tasks_this_node < *min_tasks_this_node) {
error("%s: min_tasks_this_node:%u > max_tasks_this_node:%u",
__func__,
*min_tasks_this_node,
*max_tasks_this_node);
}
/*
* Determine how many cores are needed for this job.
* Consider rounding errors if cpus_per_task not divisible
* by cpus_per_core
*/
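		/*
		 * Worked example (illustrative): with max_tasks_this_node=4,
		 * cpus_per_task=3 and threads_per_core=2, the loop below
		 * computes req_cores = (4 * 3 + 1) / 2 = 6 cores.
		 */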
req_cores = *max_tasks_this_node;
if (mc_ptr->cpus_per_task) {
int threads_per_core, removed_tasks = 0;
if (mc_ptr->threads_per_core)
threads_per_core =
MIN(cpus_per_core,
mc_ptr->threads_per_core);
else
threads_per_core = cpus_per_core;
req_cores *= mc_ptr->cpus_per_task;
while (*max_tasks_this_node >= *min_tasks_this_node) {
/* round up by full threads per core */
req_cores += threads_per_core - 1;
req_cores /= threads_per_core;
if (req_cores <= avail_cores_tot) {
if (removed_tasks)
log_flag(GRES, "%s: settings required_cores=%d by max_tasks_this_node=%u(reduced=%d) cpus_per_task=%d cpus_per_core=%d threads_per_core:%d",
__func__,
req_cores,
*max_tasks_this_node,
removed_tasks,
mc_ptr->cpus_per_task,
cpus_per_core,
mc_ptr->
threads_per_core);
break;
}
removed_tasks++;
(*max_tasks_this_node)--;
req_cores = *max_tasks_this_node;
}
}
if (cpus_per_gres) {
if (job_specs->gres_per_node) {
i = job_specs->gres_per_node;
log_flag(GRES, "%s: estimating req_cores gres_per_node=%"PRIu64,
__func__, job_specs->gres_per_node);
} else if (job_specs->gres_per_socket) {
i = job_specs->gres_per_socket * sock_cnt;
log_flag(GRES, "%s: estimating req_cores gres_per_socket=%"PRIu64,
__func__, job_specs->gres_per_socket);
} else if (job_specs->gres_per_task) {
i = job_specs->gres_per_task *
*max_tasks_this_node;
log_flag(GRES, "%s: estimating req_cores max_tasks_this_node=%u gres_per_task=%"PRIu64,
__func__,
*max_tasks_this_node,
job_specs->gres_per_task);
} else if (cnt_avail_total) {
i = cnt_avail_total;
log_flag(GRES, "%s: estimating req_cores cnt_avail_total=%"PRIu64,
__func__, cnt_avail_total);
} else {
i = 1;
log_flag(GRES, "%s: estimating req_cores default to 1 task",
__func__);
}
i *= cpus_per_gres;
i = (i + cpus_per_core - 1) / cpus_per_core;
if (req_cores < i)
log_flag(GRES, "%s: Increasing req_cores=%d from cpus_per_gres=%d cpus_per_core=%"PRIu16,
__func__, i, cpus_per_gres,
cpus_per_core);
req_cores = MAX(req_cores, i);
}
if (req_cores > avail_cores_tot) {
log_flag(GRES, "%s: Job cannot run on node req_cores:%d > aval_cores_tot:%d",
__func__, req_cores, avail_cores_tot);
*max_tasks_this_node = 0;
break;
}
/*
* Clear extra avail_core bits on sockets we don't need
* up to required number of cores based on max_tasks_this_node.
* In case of enforce-binding those are already cleared.
*/
if ((avail_cores_tot > req_cores) &&
!enforce_binding && !first_pass) {
for (s = 0; s < sockets; s++) {
if (avail_cores_tot == req_cores)
break;
if (req_sock[s])
continue;
for (c = cores_per_socket - 1; c >= 0; c--) {
i = (s * cores_per_socket) + c;
if (!bit_test(avail_core, i))
continue;
bit_clear(avail_core, i);
if (bit_set_count(avail_core) *
cpus_per_core < *avail_cpus) {
*avail_cpus -= cpus_per_core;
}
avail_cores_tot--;
avail_cores_per_sock[s]--;
if (avail_cores_tot == req_cores)
break;
}
}
}
/*
* Clear extra avail_core bits on sockets we do need, but
* spread them out so that every socket has some cores
* available to use with the nearby GRES that we do need.
*/
while (avail_cores_tot > req_cores) {
full_socket = -1;
for (s = 0; s < sockets; s++) {
if (avail_cores_tot == req_cores)
break;
if (!req_sock[s] ||
(avail_cores_per_sock[s] == 0))
continue;
if ((full_socket == -1) ||
(avail_cores_per_sock[full_socket] <
avail_cores_per_sock[s])) {
full_socket = s;
}
}
if (full_socket == -1)
break;
s = full_socket;
for (c = cores_per_socket - 1; c >= 0; c--) {
i = (s * cores_per_socket) + c;
if (!bit_test(avail_core, i))
continue;
bit_clear(avail_core, i);
if (bit_set_count(avail_core) * cpus_per_core <
*avail_cpus) {
*avail_cpus -= cpus_per_core;
}
avail_cores_per_sock[s]--;
avail_cores_tot--;
break;
}
}
if (cpus_per_gres) {
i = *avail_cpus / cpus_per_gres;
sock_gres->total_cnt = MIN(i, sock_gres->total_cnt);
if ((job_specs->gres_per_node > sock_gres->total_cnt) ||
(job_specs->gres_per_task > sock_gres->total_cnt)) {
*max_tasks_this_node = 0;
}
}
}
list_iterator_destroy(sock_gres_iter);
xfree(avail_cores_per_sock);
xfree(req_sock);
xfree(socket_index);
if (select_type_param == NO_VAL16)
select_type_param = slurm_get_select_type_param();
if ((mc_ptr->cpus_per_task > 1) ||
((select_type_param & CR_ONE_TASK_PER_CORE) == 0)) {
/*
* Only adjust *avail_cpus for the maximum task count if
* cpus_per_task is explicitly set. There is currently no way
* to tell if cpus_per_task==1 is explicitly set by the job
* when SelectTypeParameters includes CR_ONE_TASK_PER_CORE.
*/
*avail_cpus = MIN(*avail_cpus,
*max_tasks_this_node * mc_ptr->cpus_per_task);
}
}
/*
* Return the maximum number of tasks that can be started on a node with
* sock_gres_list (per-socket GRES details for some node)
*/
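/*
 * For example, a node offering total_cnt=7 of some GRES to a job
 * with gres_per_task=2 yields task_limit = 7 / 2 = 3 tasks; the
 * minimum across all GRES types in the list is returned.
 */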
extern uint32_t gres_plugin_get_task_limit(List sock_gres_list)
{
ListIterator sock_gres_iter;
sock_gres_t *sock_gres;
uint32_t max_tasks = NO_VAL;
uint64_t task_limit;
sock_gres_iter = list_iterator_create(sock_gres_list);
while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))) {
xassert(sock_gres->job_specs);
if (sock_gres->job_specs->gres_per_task == 0)
continue;
task_limit = sock_gres->total_cnt /
sock_gres->job_specs->gres_per_task;
max_tasks = MIN(max_tasks, task_limit);
}
list_iterator_destroy(sock_gres_iter);
return max_tasks;
}
/*
* Return count of sockets allocated to this job on this node
* job_res IN - job resource allocation
* node_inx IN - global node index
* job_node_inx IN - node index for this job's allocation
* RET socket count
*/
static int _get_sock_cnt(struct job_resources *job_res, int node_inx,
int job_node_inx)
{
int core_offset, used_sock_cnt = 0;
uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
int c, i, rc, s;
rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
&cores_per_socket_cnt);
if (rc != SLURM_SUCCESS) {
error("%s: Invalid socket/core count", __func__);
return 1;
}
core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
if (core_offset < 0) {
error("%s: Invalid core offset", __func__);
return 1;
}
for (s = 0; s < sock_cnt; s++) {
for (c = 0; c < cores_per_socket_cnt; c++) {
i = (s * cores_per_socket_cnt) + c;
if (bit_test(job_res->core_bitmap, (core_offset + i)))
used_sock_cnt++;
}
}
if (used_sock_cnt == 0) {
error("%s: No allocated cores found", __func__);
return 1;
}
return used_sock_cnt;
}
/*
* Select specific GRES (set GRES bitmap) for this job on this node based upon
* per-job resource specification. Use only socket-local GRES
* job_res IN - job resource allocation
* node_inx IN - global node index
* job_node_inx IN - node index for this job's allocation
* rem_nodes IN - count of nodes remaining to place resources on
* job_specs IN - job request specifications, UPDATED: set bits in
* gres_bit_select
* node_specs IN - node resource request specifications
* job_id IN - job ID for logging
* tres_mc_ptr IN - job's multi-core options
* cpus_per_core IN - CPUs per core on this node
* RET 0:more work, 1:fini
*/
static int _set_job_bits1(struct job_resources *job_res, int node_inx,
int job_node_inx, int rem_nodes,
sock_gres_t *sock_gres, uint32_t job_id,
gres_mc_data_t *tres_mc_ptr, uint16_t cpus_per_core)
{
int core_offset, gres_cnt;
uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
int c, i, g, rc, s;
gres_job_state_t *job_specs;
gres_node_state_t *node_specs;
int *cores_on_sock = NULL, alloc_gres_cnt = 0;
int max_gres, pick_gres, total_cores = 0;
int fini = 0;
job_specs = sock_gres->job_specs;
node_specs = sock_gres->node_specs;
if (job_specs->gres_per_job == job_specs->total_gres)
fini = 1;
rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
&cores_per_socket_cnt);
if (rc != SLURM_SUCCESS) {
error("%s: Invalid socket/core count for job %u on node %d",
__func__, job_id, node_inx);
return rc;
}
core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
if (core_offset < 0) {
error("%s: Invalid core offset for job %u on node %d",
__func__, job_id, node_inx);
return rc;
}
i = sock_gres->sock_cnt;
if ((i != 0) && (i != sock_cnt)) {
error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
__func__, i, sock_cnt, job_id, node_inx);
sock_cnt = MIN(sock_cnt, i);
}
xassert(job_res->core_bitmap);
if (job_node_inx == 0)
job_specs->total_gres = 0;
max_gres = job_specs->gres_per_job - job_specs->total_gres -
(rem_nodes - 1);
cores_on_sock = xcalloc(sock_cnt, sizeof(int));
gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
for (s = 0; s < sock_cnt; s++) {
for (c = 0; c < cores_per_socket_cnt; c++) {
i = (s * cores_per_socket_cnt) + c;
if (bit_test(job_res->core_bitmap, (core_offset + i))) {
cores_on_sock[s]++;
total_cores++;
}
}
}
if (job_specs->cpus_per_gres) {
max_gres = MIN(max_gres,
((total_cores * cpus_per_core) /
job_specs->cpus_per_gres));
}
if ((max_gres > 1) && (node_specs->link_len == gres_cnt))
pick_gres = NO_VAL16;
else
pick_gres = max_gres;
/*
* Now pick specific GRES for these sockets.
* First select all GRES that we might possibly use, starting with
* those not constrained by socket, then those constrained by socket.
* Then remove those which are not required and not "best".
*/
for (s = -1; /* Socket == - 1 if GRES avail from any socket */
((s < sock_cnt) && (alloc_gres_cnt < pick_gres)); s++) {
if ((s >= 0) && !cores_on_sock[s])
continue;
for (g = 0; ((g < gres_cnt) && (alloc_gres_cnt < pick_gres));
g++) {
if ((s == -1) &&
(!sock_gres->bits_any_sock ||
!bit_test(sock_gres->bits_any_sock, g)))
continue; /* GRES not avail any socket */
if ((s >= 0) &&
(!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g)))
continue; /* GRES not on this socket */
if (bit_test(node_specs->gres_bit_alloc, g) ||
bit_test(job_specs->gres_bit_select[node_inx], g))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx], g);
job_specs->gres_cnt_node_select[node_inx]++;
alloc_gres_cnt++;
job_specs->total_gres++;
}
}
if (alloc_gres_cnt == 0) {
for (s = 0; ((s < sock_cnt) && (alloc_gres_cnt == 0)); s++) {
if (cores_on_sock[s])
continue;
for (g = 0; g < gres_cnt; g++) {
if (!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g))
continue; /* GRES not on this socket */
if (bit_test(node_specs->gres_bit_alloc, g) ||
bit_test(job_specs->
gres_bit_select[node_inx], g))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx],g);
job_specs->gres_cnt_node_select[node_inx]++;
alloc_gres_cnt++;
job_specs->total_gres++;
break;
}
}
}
if (alloc_gres_cnt == 0) {
error("%s: job %u failed to find any available GRES on node %d",
__func__, job_id, node_inx);
}
/* Now pick the "best" max_gres GRES with respect to link counts. */
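/*
 * links_cnt[s][g] holds the configured connectivity between devices
 * s and g (populated from the node's GRES "Links" data, e.g. NVLink
 * counts on GPU nodes); larger values mean better-connected pairs.
 * The code below keeps the max_gres devices most tightly linked to
 * the best-connected one and drops the rest.
 */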
if (alloc_gres_cnt > max_gres) {
int best_link_cnt = -1, best_inx = -1;
for (s = 0; s < gres_cnt; s++) {
if (!bit_test(job_specs->gres_bit_select[node_inx], s))
continue;
for (g = s + 1; g < gres_cnt; g++) {
if (!bit_test(job_specs->
gres_bit_select[node_inx], g))
continue;
if (node_specs->links_cnt[s][g] <=
best_link_cnt)
continue;
best_link_cnt = node_specs->links_cnt[s][g];
best_inx = s;
}
}
while ((alloc_gres_cnt > max_gres) && (best_link_cnt != -1)) {
int worst_inx = -1, worst_link_cnt = NO_VAL16;
for (g = 0; g < gres_cnt; g++) {
if (g == best_inx)
continue;
if (!bit_test(job_specs->
gres_bit_select[node_inx], g))
continue;
if (node_specs->links_cnt[best_inx][g] >=
worst_link_cnt)
continue;
worst_link_cnt =
node_specs->links_cnt[best_inx][g];
worst_inx = g;
}
if (worst_inx == -1) {
error("%s: error managing links_cnt", __func__);
break;
}
bit_clear(job_specs->gres_bit_select[node_inx],
worst_inx);
job_specs->gres_cnt_node_select[node_inx]--;
alloc_gres_cnt--;
job_specs->total_gres--;
}
}
xfree(cores_on_sock);
if (job_specs->total_gres >= job_specs->gres_per_job)
fini = 1;
return fini;
}
/*
* Select specific GRES (set GRES bitmap) for this job on this node based upon
* per-job resource specification. Use any GRES on the node
* job_res IN - job resource allocation
* node_inx IN - global node index
* job_node_inx IN - node index for this job's allocation
* job_specs IN - job request specifications, UPDATED: set bits in
* gres_bit_select
* node_specs IN - node resource request specifications
* job_id IN - job ID for logging
* tres_mc_ptr IN - job's multi-core options
* RET 0:more work, 1:fini
*/
static int _set_job_bits2(struct job_resources *job_res, int node_inx,
int job_node_inx, sock_gres_t *sock_gres,
uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
{
int core_offset, gres_cnt;
uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
int i, g, l, rc, s;
gres_job_state_t *job_specs;
gres_node_state_t *node_specs;
int fini = 0;
int best_link_cnt = 0, best_inx = -1;
job_specs = sock_gres->job_specs;
node_specs = sock_gres->node_specs;
if (job_specs->gres_per_job == job_specs->total_gres) {
fini = 1;
return fini;
}
if (!job_specs->gres_bit_select ||
!job_specs->gres_bit_select[node_inx]) {
error("%s: gres_bit_select NULL for job %u on node %d",
__func__, job_id, node_inx);
return SLURM_ERROR;
}
rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
&cores_per_socket_cnt);
if (rc != SLURM_SUCCESS) {
error("%s: Invalid socket/core count for job %u on node %d",
__func__, job_id, node_inx);
return rc;
}
core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
if (core_offset < 0) {
error("%s: Invalid core offset for job %u on node %d",
__func__, job_id, node_inx);
return rc;
}
i = sock_gres->sock_cnt;
if ((i != 0) && (i != sock_cnt)) {
error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
__func__, i, sock_cnt, job_id, node_inx);
sock_cnt = MIN(sock_cnt, i);
}
/*
* Identify the GRES (if any) that we want to use as a basis for
* maximizing link count (connectivity of the GRES).
*/
xassert(job_res->core_bitmap);
gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
if ((job_specs->gres_per_job > job_specs->total_gres) &&
(node_specs->link_len == gres_cnt)) {
for (g = 0; g < gres_cnt; g++) {
if (!bit_test(job_specs->gres_bit_select[node_inx], g))
continue;
best_inx = g;
for (s = 0; s < gres_cnt; s++) {
best_link_cnt = MAX(node_specs->links_cnt[s][g],
best_link_cnt);
}
break;
}
}
/*
* Now pick specific GRES for these sockets.
* Start with GRES available from any socket, then specific sockets
*/
for (l = best_link_cnt;
((l >= 0) && (job_specs->gres_per_job > job_specs->total_gres));
l--) {
for (s = -1; /* Socket == - 1 if GRES avail from any socket */
((s < sock_cnt) &&
(job_specs->gres_per_job > job_specs->total_gres)); s++) {
for (g = 0;
((g < gres_cnt) &&
(job_specs->gres_per_job > job_specs->total_gres));
g++) {
if ((l > 0) &&
(node_specs->links_cnt[best_inx][g] < l))
continue; /* Want better link count */
if ((s == -1) &&
(!sock_gres->bits_any_sock ||
!bit_test(sock_gres->bits_any_sock, g)))
continue; /* GRES not avail any sock */
if ((s >= 0) &&
(!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g)))
continue; /* GRES not on this socket */
if (bit_test(node_specs->gres_bit_alloc, g) ||
bit_test(job_specs->gres_bit_select[node_inx],
g))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx],g);
job_specs->gres_cnt_node_select[node_inx]++;
job_specs->total_gres++;
}
}
}
if (job_specs->gres_per_job == job_specs->total_gres)
fini = 1;
return fini;
}
/*
* Select specific GRES (set GRES bitmap) for this job on this node based upon
* per-node resource specification
* job_res IN - job resource allocation
* node_inx IN - global node index
* job_node_inx IN - node index for this job's allocation
* job_specs IN - job request specifications, UPDATED: set bits in
* gres_bit_select
* node_specs IN - node resource request specifications
* job_id IN - job ID for logging
* tres_mc_ptr IN - job's multi-core options
*/
static void _set_node_bits(struct job_resources *job_res, int node_inx,
int job_node_inx, sock_gres_t *sock_gres,
uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
{
int core_offset, gres_cnt;
uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
int c, i, g, l, rc, s;
gres_job_state_t *job_specs;
gres_node_state_t *node_specs;
int *used_sock = NULL, alloc_gres_cnt = 0;
int *links_cnt = NULL, best_link_cnt = 0;
uint64_t gres_per_bit = 1;
job_specs = sock_gres->job_specs;
node_specs = sock_gres->node_specs;
rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
&cores_per_socket_cnt);
if (rc != SLURM_SUCCESS) {
error("%s: Invalid socket/core count for job %u on node %d",
__func__, job_id, node_inx);
return;
}
core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
if (core_offset < 0) {
error("%s: Invalid core offset for job %u on node %d",
__func__, job_id, node_inx);
return;
}
i = sock_gres->sock_cnt;
if ((i != 0) && (i != sock_cnt)) {
error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
__func__, i, sock_cnt, job_id, node_inx);
sock_cnt = MIN(sock_cnt, i);
}
xassert(job_res->core_bitmap);
used_sock = xcalloc(sock_cnt, sizeof(int));
gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
for (s = 0; s < sock_cnt; s++) {
for (c = 0; c < cores_per_socket_cnt; c++) {
i = (s * cores_per_socket_cnt) + c;
if (bit_test(job_res->core_bitmap, (core_offset + i))) {
used_sock[s]++;
break;
}
}
}
/*
* Now pick specific GRES for these sockets.
* First: Try to place one GRES per socket in this job's allocation.
* Second: Try to place additional GRES on allocated sockets.
* Third: Use any additional available GRES.
*/
if (node_specs->link_len == gres_cnt)
links_cnt = xcalloc(gres_cnt, sizeof(int));
if (_shared_gres(sock_gres->plugin_id))
gres_per_bit = job_specs->gres_per_node;
for (s = -1; /* Socket == - 1 if GRES avail from any socket */
((s < sock_cnt) && (alloc_gres_cnt < job_specs->gres_per_node));
s++) {
if ((s >= 0) && !used_sock[s])
continue;
for (g = 0; g < gres_cnt; g++) {
if ((s == -1) &&
(!sock_gres->bits_any_sock ||
!bit_test(sock_gres->bits_any_sock, g)))
continue; /* GRES not avail any socket */
if ((s >= 0) &&
(!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g)))
continue; /* GRES not on this socket */
if (bit_test(job_specs->gres_bit_select[node_inx], g) ||
((gres_per_bit == 1) &&
bit_test(node_specs->gres_bit_alloc, g)))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx], g);
job_specs->gres_cnt_node_select[node_inx] +=
gres_per_bit;
alloc_gres_cnt += gres_per_bit;
for (l = 0; links_cnt && (l < gres_cnt); l++) {
if ((l == g) ||
bit_test(node_specs->gres_bit_alloc, l))
continue;
links_cnt[l] += node_specs->links_cnt[g][l];
}
break;
}
}
if (links_cnt) {
for (l = 0; l < gres_cnt; l++)
best_link_cnt = MAX(links_cnt[l], best_link_cnt);
if (best_link_cnt > 4) {
/* Scale down to reasonable iteration count (<= 4) */
g = (best_link_cnt + 3) / 4;
best_link_cnt = 0;
for (l = 0; l < gres_cnt; l++) {
links_cnt[l] /= g;
best_link_cnt = MAX(links_cnt[l],best_link_cnt);
}
}
}
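/*
 * Scaling example: best_link_cnt=10 gives divisor g = (10 + 3) / 4
 * = 3; after links_cnt[l] /= 3 the new best_link_cnt is at most 4,
 * so the link-preference loops below run at most five times
 * (l = 4..0).
 */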
/*
* Try to place additional GRES on allocated sockets. Favor use of
* GRES which are best linked to GRES which have already been selected.
*/
for (l = best_link_cnt;
((l >= 0) && (alloc_gres_cnt < job_specs->gres_per_node)); l--) {
for (s = -1; /* Socket == - 1 if GRES avail from any socket */
((s < sock_cnt) &&
(alloc_gres_cnt < job_specs->gres_per_node)); s++) {
if ((s >= 0) && !used_sock[s])
continue;
for (g = 0; g < gres_cnt; g++) {
if (links_cnt && (links_cnt[g] < l))
continue;
if ((s == -1) &&
(!sock_gres->bits_any_sock ||
!bit_test(sock_gres->bits_any_sock, g)))
continue;/* GRES not avail any socket */
if ((s >= 0) &&
(!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g)))
continue; /* GRES not on this socket */
if (bit_test(job_specs->gres_bit_select[node_inx],
g) ||
((gres_per_bit == 1) &&
bit_test(node_specs->gres_bit_alloc, g)))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx],g);
job_specs->gres_cnt_node_select[node_inx] +=
gres_per_bit;
alloc_gres_cnt += gres_per_bit;
if (alloc_gres_cnt >= job_specs->gres_per_node)
break;
}
}
}
/*
* Use any additional available GRES. Again, favor use of GRES
* which are best linked to GRES which have already been selected.
*/
for (l = best_link_cnt;
((l >= 0) && (alloc_gres_cnt < job_specs->gres_per_node)); l--) {
for (s = 0;
((s < sock_cnt) &&
(alloc_gres_cnt < job_specs->gres_per_node)); s++) {
if (used_sock[s])
continue;
for (g = 0; g < gres_cnt; g++) {
if (links_cnt && (links_cnt[g] < l))
continue;
if (!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g))
continue; /* GRES not on this socket */
if (bit_test(job_specs->gres_bit_select[node_inx],
g) ||
((gres_per_bit == 1) &&
bit_test(node_specs->gres_bit_alloc, g)))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx],g);
job_specs->gres_cnt_node_select[node_inx] +=
gres_per_bit;
alloc_gres_cnt += gres_per_bit;
if (alloc_gres_cnt >= job_specs->gres_per_node)
break;
}
}
}
xfree(links_cnt);
xfree(used_sock);
}
/*
* Select one specific GRES topo entry (set GRES bitmap) for this job on this
* node based upon per-node resource specification
* job_res IN - job resource allocation
* node_inx IN - global node index
* job_node_inx IN - node index for this job's allocation
* job_specs IN - job request specifications, UPDATED: set bits in
* gres_bit_select
* node_specs IN - node resource request specifications
* job_id IN - job ID for logging
* tres_mc_ptr IN - job's multi-core options
*/
static void _pick_specific_topo(struct job_resources *job_res, int node_inx,
int job_node_inx, sock_gres_t *sock_gres,
uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
{
int core_offset;
uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
int c, i, rc, s, t;
gres_job_state_t *job_specs;
gres_node_state_t *node_specs;
int *used_sock = NULL, alloc_gres_cnt = 0;
uint64_t gres_per_bit;
bool use_busy_dev = false;
job_specs = sock_gres->job_specs;
gres_per_bit = job_specs->gres_per_node;
node_specs = sock_gres->node_specs;
rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
&cores_per_socket_cnt);
if (rc != SLURM_SUCCESS) {
error("%s: Invalid socket/core count for job %u on node %d",
__func__, job_id, node_inx);
return;
}
core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
if (core_offset < 0) {
error("%s: Invalid core offset for job %u on node %d",
__func__, job_id, node_inx);
return;
}
i = sock_gres->sock_cnt;
if ((i != 0) && (i != sock_cnt)) {
error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
__func__, i, sock_cnt, job_id, node_inx);
sock_cnt = MIN(sock_cnt, i);
}
xassert(job_res->core_bitmap);
used_sock = xcalloc(sock_cnt, sizeof(int));
for (s = 0; s < sock_cnt; s++) {
for (c = 0; c < cores_per_socket_cnt; c++) {
i = (s * cores_per_socket_cnt) + c;
if (bit_test(job_res->core_bitmap, (core_offset + i))) {
used_sock[s]++;
break;
}
}
}
if ((sock_gres->plugin_id == mps_plugin_id) &&
(node_specs->gres_cnt_alloc != 0)) {
/* We must use the ONE already active GRES of this type */
use_busy_dev = true;
}
/*
* Now pick specific GRES for these sockets.
* First: Try to select a GRES local to allocated socket with
* sufficient resources.
* Second: Use available GRES with sufficient resources.
* Third: Use any available GRES.
*/
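/*
 * Note: gres_per_bit was set above to the job's full gres_per_node
 * count because for a shared GRES (e.g. gres/mps) one topo entry
 * represents a single device whose capacity is consumed in per-job
 * units, so selecting a single bit satisfies the whole per-node
 * request.
 */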
for (s = -1; /* Socket == - 1 if GRES avail from any socket */
(s < sock_cnt) && (alloc_gres_cnt == 0); s++) {
if ((s >= 0) && !used_sock[s])
continue;
for (t = 0; t < node_specs->topo_cnt; t++) {
if (use_busy_dev &&
(node_specs->topo_gres_cnt_alloc[t] == 0))
continue;
if (node_specs->topo_gres_cnt_alloc &&
node_specs->topo_gres_cnt_avail &&
((node_specs->topo_gres_cnt_avail[t] -
node_specs->topo_gres_cnt_alloc[t]) <
gres_per_bit))
continue; /* Insufficient resources */
if ((s == -1) &&
(!sock_gres->bits_any_sock ||
!bit_test(sock_gres->bits_any_sock, t)))
continue; /* GRES not avail any socket */
if ((s >= 0) &&
(!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], t)))
continue; /* GRES not on this socket */
bit_set(job_specs->gres_bit_select[node_inx], t);
job_specs->gres_cnt_node_select[node_inx] +=
gres_per_bit;
alloc_gres_cnt += gres_per_bit;
break;
}
}
/* Select available GRES with sufficient resources */
for (t = 0; (t < node_specs->topo_cnt) && (alloc_gres_cnt == 0); t++) {
if (use_busy_dev &&
(node_specs->topo_gres_cnt_alloc[t] == 0))
continue;
if (node_specs->topo_gres_cnt_alloc &&
node_specs->topo_gres_cnt_avail &&
node_specs->topo_gres_cnt_avail[t] &&
((node_specs->topo_gres_cnt_avail[t] -
node_specs->topo_gres_cnt_alloc[t]) < gres_per_bit))
continue; /* Insufficient resources */
bit_set(job_specs->gres_bit_select[node_inx], t);
job_specs->gres_cnt_node_select[node_inx] += gres_per_bit;
alloc_gres_cnt += gres_per_bit;
break;
}
/* Select available GRES with any resources */
for (t = 0; (t < node_specs->topo_cnt) && (alloc_gres_cnt == 0); t++) {
if (node_specs->topo_gres_cnt_alloc &&
node_specs->topo_gres_cnt_avail &&
node_specs->topo_gres_cnt_avail[t])
continue; /* No resources */
bit_set(job_specs->gres_bit_select[node_inx], t);
job_specs->gres_cnt_node_select[node_inx] += gres_per_bit;
alloc_gres_cnt += gres_per_bit;
}
xfree(used_sock);
}
/*
* Select specific GRES (set GRES bitmap) for this job on this node based upon
* per-socket resource specification
* job_res IN - job resource allocation
* node_inx IN - global node index
* job_node_inx IN - node index for this job's allocation
* job_specs IN - job request specifications, UPDATED: set bits in
* gres_bit_select
* node_specs IN - node resource request specifications
* job_id IN - job ID for logging
* tres_mc_ptr IN - job's multi-core options
*/
static void _set_sock_bits(struct job_resources *job_res, int node_inx,
int job_node_inx, sock_gres_t *sock_gres,
uint32_t job_id, gres_mc_data_t *tres_mc_ptr)
{
int core_offset, gres_cnt;
uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
int c, i, g, l, rc, s;
gres_job_state_t *job_specs;
gres_node_state_t *node_specs;
int *used_sock = NULL, used_sock_cnt = 0;
int *links_cnt = NULL, best_link_cnt = 0;
job_specs = sock_gres->job_specs;
node_specs = sock_gres->node_specs;
rc = get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
&cores_per_socket_cnt);
if (rc != SLURM_SUCCESS) {
error("%s: Invalid socket/core count for job %u on node %d",
__func__, job_id, node_inx);
return;
}
core_offset = get_job_resources_offset(job_res, job_node_inx, 0, 0);
if (core_offset < 0) {
error("%s: Invalid core offset for job %u on node %d",
__func__, job_id, node_inx);
return;
}
i = sock_gres->sock_cnt;
if ((i != 0) && (i != sock_cnt)) {
error("%s: Inconsistent socket count (%d != %d) for job %u on node %d",
__func__, i, sock_cnt, job_id, node_inx);
sock_cnt = MIN(sock_cnt, i);
}
xassert(job_res->core_bitmap);
used_sock = xcalloc(sock_cnt, sizeof(int));
gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
for (s = 0; s < sock_cnt; s++) {
for (c = 0; c < cores_per_socket_cnt; c++) {
i = (s * cores_per_socket_cnt) + c;
if (bit_test(job_res->core_bitmap, (core_offset + i))) {
used_sock[s]++;
used_sock_cnt++;
break;
}
}
}
if (tres_mc_ptr && tres_mc_ptr->sockets_per_node &&
(tres_mc_ptr->sockets_per_node != used_sock_cnt) &&
node_specs->gres_bit_alloc && sock_gres->bits_by_sock) {
if (tres_mc_ptr->sockets_per_node > used_sock_cnt) {
/* Somehow we have too few sockets in job allocation */
error("%s: Inconsistent requested/allocated socket count "
"(%d > %d) for job %u on node %d",
__func__, tres_mc_ptr->sockets_per_node,
used_sock_cnt, job_id, node_inx);
for (s = 0; s < sock_cnt; s++) {
if (used_sock[s] || !sock_gres->bits_by_sock[s])
continue;
/* Determine currently free GRES by socket */
used_sock[s] = bit_set_count(
sock_gres->bits_by_sock[s]) -
bit_overlap(
sock_gres->bits_by_sock[s],
node_specs->gres_bit_alloc);
if ((used_sock[s] == 0) ||
(used_sock[s] < job_specs->gres_per_socket)){
used_sock[s] = 0;
} else if (++used_sock_cnt ==
tres_mc_ptr->sockets_per_node) {
break;
}
}
} else {
/* May have needed extra CPUs, exceeding socket count */
debug("%s: Inconsistent requested/allocated socket count "
"(%d < %d) for job %u on node %d",
__func__, tres_mc_ptr->sockets_per_node,
used_sock_cnt, job_id, node_inx);
for (s = 0; s < sock_cnt; s++) {
if (!used_sock[s] ||
!sock_gres->bits_by_sock[s])
continue;
/* Determine currently free GRES by socket */
used_sock[s] = bit_set_count(
sock_gres->bits_by_sock[s]) -
bit_overlap(
sock_gres->bits_by_sock[s],
node_specs->gres_bit_alloc);
if (used_sock[s] == 0)
used_sock_cnt--;
}
/* Exclude sockets with low GRES counts */
while (tres_mc_ptr->sockets_per_node > used_sock_cnt) {
int low_sock_inx = -1;
for (s = sock_cnt - 1; s >= 0; s--) {
if (used_sock[s] == 0)
continue;
if ((low_sock_inx == -1) ||
(used_sock[s] <
used_sock[low_sock_inx]))
low_sock_inx = s;
}
if (low_sock_inx == -1)
break;
used_sock[low_sock_inx] = 0;
used_sock_cnt--;
}
}
}
/*
* Identify the available GRES with best connectivity
* (i.e. higher link_cnt)
*/
if (node_specs->link_len == gres_cnt) {
links_cnt = xcalloc(gres_cnt, sizeof(int));
for (g = 0; g < gres_cnt; g++) {
if (bit_test(node_specs->gres_bit_alloc, g))
continue;
for (l = 0; l < gres_cnt; l++) {
if ((l == g) ||
bit_test(node_specs->gres_bit_alloc, l))
continue;
links_cnt[l] += node_specs->links_cnt[g][l];
}
}
for (l = 0; l < gres_cnt; l++)
best_link_cnt = MAX(links_cnt[l], best_link_cnt);
if (best_link_cnt > 4) {
/* Scale down to reasonable iteration count (<= 4) */
g = (best_link_cnt + 3) / 4;
best_link_cnt = 0;
for (l = 0; l < gres_cnt; l++) {
links_cnt[l] /= g;
best_link_cnt = MAX(links_cnt[l],best_link_cnt);
}
}
}
/*
* Now pick specific GRES for these sockets.
* Try to use GRES with best connectivity (higher link_cnt values)
*/
for (s = 0; s < sock_cnt; s++) {
if (!used_sock[s])
continue;
i = 0;
for (l = best_link_cnt;
((l >= 0) && (i < job_specs->gres_per_socket)); l--) {
for (g = 0; g < gres_cnt; g++) {
if (!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g))
continue; /* GRES not on this socket */
if (node_specs->gres_bit_alloc &&
bit_test(node_specs->gres_bit_alloc, g))
continue; /* Already allocated GRES */
if (job_specs->gres_bit_select[node_inx] &&
bit_test(job_specs->gres_bit_select[node_inx],
g))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx],g);
job_specs->gres_cnt_node_select[node_inx]++;
if (++i == job_specs->gres_per_socket)
break;
}
}
if ((i < job_specs->gres_per_socket) &&
sock_gres->bits_any_sock) {
/* Add GRES unconstrained by socket as needed */
for (g = 0; g < gres_cnt; g++) {
if (!sock_gres->bits_any_sock ||
!bit_test(sock_gres->bits_any_sock, g))
continue; /* GRES not avail any socket */
if (node_specs->gres_bit_alloc &&
bit_test(node_specs->gres_bit_alloc, g))
continue; /* Already allocated GRES */
if (job_specs->gres_bit_select[node_inx] &&
bit_test(job_specs->gres_bit_select[node_inx],
g))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx],g);
job_specs->gres_cnt_node_select[node_inx]++;
if (++i == job_specs->gres_per_socket)
break;
}
}
}
xfree(links_cnt);
xfree(used_sock);
}
/*
* Select specific GRES (set GRES bitmap) for this job on this node based upon
* per-task resource specification
* job_res IN - job resource allocation
* node_inx IN - global node index
* job_node_inx IN - node index for this job's allocation
* job_specs IN - job request specifications, UPDATED: set bits in
* gres_bit_select
* node_specs IN - node resource request specifications
* job_id IN - job ID for logging
* tres_mc_ptr IN - job's multi-core options
*/
static void _set_task_bits(struct job_resources *job_res, int node_inx,
int job_node_inx, sock_gres_t *sock_gres,
uint32_t job_id, gres_mc_data_t *tres_mc_ptr,
uint32_t **tasks_per_node_socket)
{
uint16_t sock_cnt = 0;
int gres_cnt, g, l, s;
gres_job_state_t *job_specs;
gres_node_state_t *node_specs;
uint32_t total_tasks = 0;
uint64_t total_gres_cnt = 0, total_gres_goal;
int *links_cnt = NULL, best_link_cnt = 0;
job_specs = sock_gres->job_specs;
node_specs = sock_gres->node_specs;
sock_cnt = sock_gres->sock_cnt;
gres_cnt = bit_size(job_specs->gres_bit_select[node_inx]);
if (node_specs->link_len == gres_cnt)
links_cnt = xcalloc(gres_cnt, sizeof(int));
/* First pick GRES for active sockets */
for (s = -1; /* Socket == - 1 if GRES avail from any socket */
s < sock_cnt; s++) {
if ((s >= 0) &&
(!tasks_per_node_socket[node_inx] ||
(tasks_per_node_socket[node_inx][s] == 0)))
continue; /* No tasks on this socket */
if (s >= 0) /* s == -1 must not index the socket array */
total_tasks += tasks_per_node_socket[node_inx][s];
total_gres_goal = total_tasks * job_specs->gres_per_task;
for (g = 0; g < gres_cnt; g++) {
if (total_gres_cnt >= total_gres_goal)
break;
if ((s == -1) &&
(!sock_gres->bits_any_sock ||
!bit_test(sock_gres->bits_any_sock, g)))
continue; /* GRES not avail any sock */
if ((s >= 0) &&
(!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g)))
continue; /* GRES not on this socket */
if (bit_test(node_specs->gres_bit_alloc, g) ||
bit_test(job_specs->gres_bit_select[node_inx], g))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx], g);
job_specs->gres_cnt_node_select[node_inx]++;
total_gres_cnt++;
for (l = 0; links_cnt && (l < gres_cnt); l++) {
if ((l == g) ||
bit_test(node_specs->gres_bit_alloc, l))
continue;
links_cnt[l] += node_specs->links_cnt[g][l];
}
}
}
if (links_cnt) {
for (l = 0; l < gres_cnt; l++)
best_link_cnt = MAX(links_cnt[l], best_link_cnt);
if (best_link_cnt > 4) {
/* Scale down to reasonable iteration count (<= 4) */
g = (best_link_cnt + 3) / 4;
best_link_cnt = 0;
for (l = 0; l < gres_cnt; l++) {
links_cnt[l] /= g;
best_link_cnt = MAX(links_cnt[l],best_link_cnt);
}
}
}
/*
* Next pick additional GRES as needed. Favor use of GRES which
* are best linked to GRES which have already been selected.
*/
total_gres_goal = total_tasks * job_specs->gres_per_task;
for (l = best_link_cnt;
((l >= 0) && (total_gres_cnt < total_gres_goal)); l--) {
for (s = -1; /* Socket == - 1 if GRES avail from any socket */
((s < sock_cnt) && (total_gres_cnt < total_gres_goal));
s++) {
for (g = 0;
((g < gres_cnt) &&
(total_gres_cnt < total_gres_goal)); g++) {
if (links_cnt && (links_cnt[g] < l))
continue;
if ((s == -1) &&
(!sock_gres->bits_any_sock ||
!bit_test(sock_gres->bits_any_sock, g)))
continue; /* GRES not avail any sock */
if ((s >= 0) &&
(!sock_gres->bits_by_sock ||
!sock_gres->bits_by_sock[s] ||
!bit_test(sock_gres->bits_by_sock[s], g)))
continue; /* GRES not on this socket */
if (bit_test(node_specs->gres_bit_alloc, g) ||
bit_test(job_specs->gres_bit_select[node_inx],
g))
continue; /* Already allocated GRES */
bit_set(job_specs->gres_bit_select[node_inx],g);
job_specs->gres_cnt_node_select[node_inx]++;
total_gres_cnt++;
}
}
}
xfree(links_cnt);
if (total_gres_cnt < total_gres_goal) {
/* Something bad happened on task layout for this GRES type */
error("%s: Insufficient gres/%s allocated for job %u on node_inx %u "
"(%"PRIu64" < %"PRIu64")", __func__,
sock_gres->gres_name, job_id, node_inx,
total_gres_cnt, total_gres_goal);
}
}
/* Build array to identify task count for each node-socket pair */
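/*
 * Layout sketch: the returned array is indexed by global node index,
 * then socket; tasks_per_node_socket[i][s] is this job's task count
 * on socket s of node i, and entries for nodes outside the job's
 * allocation remain NULL. For example, three tasks on socket 0 and
 * one on socket 1 of node 7 give tasks_per_node_socket[7] = {3, 1}.
 */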
static uint32_t **_build_tasks_per_node_sock(struct job_resources *job_res,
uint8_t overcommit,
gres_mc_data_t *tres_mc_ptr,
node_record_t *node_table_ptr)
{
uint32_t **tasks_per_node_socket;
int i, i_first, i_last, j, node_cnt, job_node_inx = 0;
int c, s, core_offset;
int cpus_per_task = 1, cpus_per_node, cpus_per_core;
int task_per_node_limit = 0;
int32_t rem_tasks, excess_tasks;
uint16_t sock_cnt = 0, cores_per_socket_cnt = 0;
rem_tasks = tres_mc_ptr->ntasks_per_job;
node_cnt = bit_size(job_res->node_bitmap);
tasks_per_node_socket = xcalloc(node_cnt, sizeof(uint32_t *));
i_first = bit_ffs(job_res->node_bitmap);
if (i_first != -1)
i_last = bit_fls(job_res->node_bitmap);
else
i_last = -2;
for (i = i_first; i <= i_last; i++) {
int tasks_per_node = 0;
if (!bit_test(job_res->node_bitmap, i))
continue;
if (get_job_resources_cnt(job_res, job_node_inx, &sock_cnt,
&cores_per_socket_cnt)) {
error("%s: failed to get socket/core count", __func__);
/* Set default of 1 task on socket 0 */
tasks_per_node_socket[i] = xmalloc(sizeof(uint32_t));
tasks_per_node_socket[i][0] = 1;
rem_tasks--;
continue;
}
tasks_per_node_socket[i] = xcalloc(sock_cnt, sizeof(uint32_t));
if (tres_mc_ptr->ntasks_per_node) {
task_per_node_limit = tres_mc_ptr->ntasks_per_node;
} else if (job_res->tasks_per_node &&
job_res->tasks_per_node[job_node_inx]) {
task_per_node_limit =
job_res->tasks_per_node[job_node_inx];
} else {
/*
 * NOTE: We should never get here.
 * cpus_per_node reports the CPUs actually used by this
 * job on this node. Divide by cpus_per_task to yield a
 * valid task count on this node. This can be inaccurate
 * on cores with more than one thread if the job fails
 * to use all of the threads.
 */
error("%s: tasks_per_node not set", __func__);
cpus_per_node = get_job_resources_cpus(job_res,
job_node_inx);
if (cpus_per_node < 1) {
error("%s: failed to get cpus_per_node count",
__func__);
/* Set default of 1 task on socket 0 */
tasks_per_node_socket[i][0] = 1;
rem_tasks--;
continue;
}
if (tres_mc_ptr->cpus_per_task)
cpus_per_task = tres_mc_ptr->cpus_per_task;
else
cpus_per_task = 1;
task_per_node_limit = cpus_per_node / cpus_per_task;
}
core_offset = get_job_resources_offset(job_res, job_node_inx++,
0, 0);
if (node_table_ptr[i].cores) {
cpus_per_core = node_table_ptr[i].cpus /
node_table_ptr[i].cores;
} else
cpus_per_core = 1;
for (s = 0; s < sock_cnt; s++) {
int tasks_per_socket = 0, tpc, skip_cores = 0;
for (c = 0; c < cores_per_socket_cnt; c++) {
j = (s * cores_per_socket_cnt) + c;
j += core_offset;
if (!bit_test(job_res->core_bitmap, j))
continue;
if (skip_cores > 0) {
skip_cores--;
continue;
}
if (tres_mc_ptr->ntasks_per_core) {
tpc = tres_mc_ptr->ntasks_per_core;
} else {
tpc = cpus_per_core / cpus_per_task;
if (tpc < 1) {
tpc = 1;
skip_cores = cpus_per_task /
cpus_per_core;
skip_cores--; /* This core */
}
/* Start with 1 task per core */
}
tasks_per_node_socket[i][s] += tpc;
tasks_per_node += tpc;
tasks_per_socket += tpc;
rem_tasks -= tpc;
if (task_per_node_limit) {
if (tasks_per_node >
task_per_node_limit) {
excess_tasks = tasks_per_node -
task_per_node_limit;
tasks_per_node_socket[i][s] -=
excess_tasks;
rem_tasks += excess_tasks;
}
if (tasks_per_node >=
task_per_node_limit) {
s = sock_cnt;
break;
}
}
/* NOTE: No support for ntasks_per_board */
if (tres_mc_ptr->ntasks_per_socket) {
if (tasks_per_socket >
tres_mc_ptr->ntasks_per_socket) {
excess_tasks = tasks_per_socket-
tres_mc_ptr->ntasks_per_socket;
tasks_per_node_socket[i][s] -=
excess_tasks;
rem_tasks += excess_tasks;
}
if (tasks_per_socket >=
tres_mc_ptr->ntasks_per_socket) {
break;
}
}
}
}
}
while ((rem_tasks > 0) && overcommit) {
for (i = i_first; (rem_tasks > 0) && (i <= i_last); i++) {
if (!bit_test(job_res->node_bitmap, i))
continue;
for (s = 0; (rem_tasks > 0) && (s < sock_cnt); s++) {
for (c = 0; c < cores_per_socket_cnt; c++) {
j = (s * cores_per_socket_cnt) + c;
if (!bit_test(job_res->core_bitmap, j))
continue;
tasks_per_node_socket[i][s]++;
rem_tasks--;
break;
}
}
}
}
if (rem_tasks > 0) /* This should never happen */
error("%s: rem_tasks not zero (%d > 0)", __func__, rem_tasks);
return tasks_per_node_socket;
}
static void _free_tasks_per_node_sock(uint32_t **tasks_per_node_socket,
int node_cnt)
{
int n;
if (!tasks_per_node_socket)
return;
for (n = 0; n < node_cnt; n++)
xfree(tasks_per_node_socket[n]);
xfree(tasks_per_node_socket);
}
/* Return the count of tasks for a job on a given node */
static uint32_t _get_task_cnt_node(uint32_t **tasks_per_node_socket,
int node_inx, int sock_cnt)
{
uint32_t task_cnt = 0;
int s;
if (!tasks_per_node_socket || !tasks_per_node_socket[node_inx]) {
error("%s: tasks_per_node_socket is NULL", __func__);
return 1; /* Best guess if no data structure */
}
for (s = 0; s < sock_cnt; s++)
task_cnt += tasks_per_node_socket[node_inx][s];
return task_cnt;
}
/* Determine maximum GRES allocation count on this node; no topology */
static uint64_t _get_job_cnt(sock_gres_t *sock_gres,
gres_node_state_t *node_specs, int rem_node_cnt)
{
uint64_t avail_gres, max_gres;
gres_job_state_t *job_specs = sock_gres->job_specs;
avail_gres = node_specs->gres_cnt_avail - node_specs->gres_cnt_alloc;
/* Ensure at least one GRES per node on remaining nodes */
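/*
 * e.g. gres_per_job=10 with total_gres=4 already selected and
 * rem_node_cnt=3 gives max_gres = 10 - 4 - (3 - 1) = 4 for this
 * node, reserving one GRES for each of the two remaining nodes.
 */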
max_gres = job_specs->gres_per_job - job_specs->total_gres -
(rem_node_cnt - 1);
max_gres = MIN(avail_gres, max_gres);
return max_gres;
}
/* Return count of GRES on this node */
static int _get_gres_node_cnt(gres_node_state_t *node_specs, int node_inx)
{
int i, gres_cnt = 0;
if (node_specs->gres_bit_alloc) {
gres_cnt = bit_size(node_specs->gres_bit_alloc);
return gres_cnt;
}
/* This logic should be redundant */
if (node_specs->topo_gres_bitmap && node_specs->topo_gres_bitmap[0]) {
gres_cnt = bit_size(node_specs->topo_gres_bitmap[0]);
return gres_cnt;
}
/* This logic should also be redundant */
gres_cnt = 0;
for (i = 0; i < node_specs->topo_cnt; i++)
gres_cnt += node_specs->topo_gres_cnt_avail[i];
return gres_cnt;
}
/*
* Make final GRES selection for the job
* sock_gres_list IN - per-socket GRES details, one record per allocated node
* job_id IN - job ID for logging
* job_res IN - job resource allocation
* overcommit IN - job's ability to overcommit resources
* tres_mc_ptr IN - job's multi-core options
* node_table_ptr IN - slurmctld's node records
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_job_core_filter4(List *sock_gres_list, uint32_t job_id,
struct job_resources *job_res,
uint8_t overcommit,
gres_mc_data_t *tres_mc_ptr,
node_record_t *node_table_ptr)
{
ListIterator sock_gres_iter;
sock_gres_t *sock_gres;
gres_job_state_t *job_specs;
gres_node_state_t *node_specs;
int i, i_first, i_last, node_inx = -1, gres_cnt;
int node_cnt, rem_node_cnt;
int job_fini = -1; /* -1: not applicable, 0: more work, 1: fini */
uint32_t **tasks_per_node_socket = NULL;
int rc = SLURM_SUCCESS;
if (!job_res || !job_res->node_bitmap)
return SLURM_ERROR;
node_cnt = bit_size(job_res->node_bitmap);
rem_node_cnt = bit_set_count(job_res->node_bitmap);
i_first = bit_ffs(job_res->node_bitmap);
if (i_first != -1)
i_last = bit_fls(job_res->node_bitmap);
else
i_last = -2;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_res->node_bitmap, i))
continue;
sock_gres_iter =
list_iterator_create(sock_gres_list[++node_inx]);
while ((sock_gres = (sock_gres_t *) list_next(sock_gres_iter))){
job_specs = sock_gres->job_specs;
node_specs = sock_gres->node_specs;
if (!job_specs || !node_specs)
continue;
if (job_specs->gres_per_task && /* Data needed */
!tasks_per_node_socket) { /* Not built yet */
tasks_per_node_socket =
_build_tasks_per_node_sock(job_res,
overcommit,
tres_mc_ptr,
node_table_ptr);
}
if (job_specs->total_node_cnt == 0) {
job_specs->total_node_cnt = node_cnt;
job_specs->total_gres = 0;
}
if (!job_specs->gres_cnt_node_select) {
job_specs->gres_cnt_node_select =
xcalloc(node_cnt, sizeof(uint64_t));
}
if (i == i_first) /* Reinitialize counter */
job_specs->total_gres = 0;
if (node_specs->topo_cnt == 0) {
/* No topology, just set a count */
if (job_specs->gres_per_node) {
job_specs->gres_cnt_node_select[i] =
job_specs->gres_per_node;
} else if (job_specs->gres_per_socket) {
job_specs->gres_cnt_node_select[i] =
job_specs->gres_per_socket;
job_specs->gres_cnt_node_select[i] *=
_get_sock_cnt(job_res, i,
node_inx);
} else if (job_specs->gres_per_task) {
job_specs->gres_cnt_node_select[i] =
job_specs->gres_per_task;
job_specs->gres_cnt_node_select[i] *=
_get_task_cnt_node(
tasks_per_node_socket, i,
node_table_ptr[i].sockets);
} else if (job_specs->gres_per_job) {
job_specs->gres_cnt_node_select[i] =
_get_job_cnt(sock_gres,
node_specs,
rem_node_cnt);
}
job_specs->total_gres +=
job_specs->gres_cnt_node_select[i];
continue;
}
/* Working with topology, need to pick specific GRES */
if (!job_specs->gres_bit_select) {
job_specs->gres_bit_select =
xcalloc(node_cnt, sizeof(bitstr_t *));
}
gres_cnt = _get_gres_node_cnt(node_specs, node_inx);
FREE_NULL_BITMAP(job_specs->gres_bit_select[i]);
job_specs->gres_bit_select[i] = bit_alloc(gres_cnt);
job_specs->gres_cnt_node_select[i] = 0;
if (job_specs->gres_per_node &&
_shared_gres(sock_gres->plugin_id)) {
/* gres/mps: select specific topo bit for job */
_pick_specific_topo(job_res, i, node_inx,
sock_gres, job_id,
tres_mc_ptr);
} else if (job_specs->gres_per_node) {
_set_node_bits(job_res, i, node_inx,
sock_gres, job_id, tres_mc_ptr);
} else if (job_specs->gres_per_socket) {
_set_sock_bits(job_res, i, node_inx,
sock_gres, job_id, tres_mc_ptr);
} else if (job_specs->gres_per_task) {
_set_task_bits(job_res, i, node_inx,
sock_gres, job_id, tres_mc_ptr,
tasks_per_node_socket);
} else if (job_specs->gres_per_job) {
uint16_t cpus_per_core;
cpus_per_core = node_table_ptr[i].cpus /
node_table_ptr[i].boards /
node_table_ptr[i].sockets /
node_table_ptr[i].cores;
job_fini = _set_job_bits1(job_res, i, node_inx,
rem_node_cnt, sock_gres,
job_id, tres_mc_ptr,
cpus_per_core);
} else {
error("%s job %u job_spec lacks GRES counter",
__func__, job_id);
}
if (job_fini == -1) {
/*
* _set_job_bits1() updates the total_gres counter;
* this handles the other cases.
*/
job_specs->total_gres +=
job_specs->gres_cnt_node_select[i];
}
}
rem_node_cnt--;
list_iterator_destroy(sock_gres_iter);
}
if (job_fini == 0) {
/*
* Need more GRES to satisfy gres-per-job option with bitmaps.
* This logic will make use of GRES that are not on allocated
* sockets and are thus generally less desirable to use.
*/
node_inx = -1;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_res->node_bitmap, i))
continue;
sock_gres_iter =
list_iterator_create(sock_gres_list[++node_inx]);
while ((sock_gres = (sock_gres_t *)
list_next(sock_gres_iter))) {
job_specs = sock_gres->job_specs;
node_specs = sock_gres->node_specs;
if (!job_specs || !node_specs)
continue;
job_fini = _set_job_bits2(job_res, i, node_inx,
sock_gres, job_id,
tres_mc_ptr);
if (job_fini == 1)
break;
}
list_iterator_destroy(sock_gres_iter);
if (job_fini == 1)
break;
}
if (job_fini == 0) {
error("%s job %u failed to satisfy gres-per-job counter",
__func__, job_id);
rc = ESLURM_NODE_NOT_AVAIL;
}
}
_free_tasks_per_node_sock(tasks_per_node_socket, node_cnt);
return rc;
}
/*
* Determine if job GRES specification includes a tres-per-task specification
* RET TRUE if any GRES requested by the job include a tres-per-task option
*/
extern bool gres_plugin_job_tres_per_task(List job_gres_list)
{
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_data_ptr;
bool have_gres_per_task = false;
if (!job_gres_list)
return false;
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
if (job_data_ptr->gres_per_task == 0)
continue;
have_gres_per_task = true;
break;
}
list_iterator_destroy(job_gres_iter);
return have_gres_per_task;
}
/*
* Determine if the job GRES specification includes a mem-per-tres specification
* RET largest mem-per-tres specification found
*/
extern uint64_t gres_plugin_job_mem_max(List job_gres_list)
{
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_data_ptr;
uint64_t mem_max = 0, mem_per_gres;
if (!job_gres_list)
return 0;
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
if (job_data_ptr->mem_per_gres)
mem_per_gres = job_data_ptr->mem_per_gres;
else
mem_per_gres = job_data_ptr->def_mem_per_gres;
mem_max = MAX(mem_max, mem_per_gres);
}
list_iterator_destroy(job_gres_iter);
return mem_max;
}
/*
* Set per-node memory limits based upon GRES assignments
* RET TRUE if mem-per-tres specification used to set memory limits
*/
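/*
 * For example, a job submitted with --mem-per-gpu=4096 that has two
 * GPUs selected on a node gets mem_size = 4096 * 2 = 8192 added to
 * that node's memory_allocated entry below.
 */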
extern bool gres_plugin_job_mem_set(List job_gres_list,
job_resources_t *job_res)
{
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_data_ptr;
bool rc = false, first_set = true;
uint64_t gres_cnt, mem_size, mem_per_gres;
int i, i_first, i_last, node_off;
if (!job_gres_list)
return false;
i_first = bit_ffs(job_res->node_bitmap);
if (i_first < 0)
return false;
i_last = bit_fls(job_res->node_bitmap);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
if (job_data_ptr->mem_per_gres)
mem_per_gres = job_data_ptr->mem_per_gres;
else
mem_per_gres = job_data_ptr->def_mem_per_gres;
/*
 * The logic below is correct because the only mem_per_gres
 * option is --mem-per-gpu. Adding another such option will
 * require a change to take the MAX of mem_per_gres across
 * all types.
 */
if ((mem_per_gres == 0) || !job_data_ptr->gres_cnt_node_select)
continue;
rc = true;
node_off = -1;
for (i = i_first; i <= i_last; i++) {
if (!bit_test(job_res->node_bitmap, i))
continue;
node_off++;
if (job_res->whole_node == 1) {
gres_state_t *node_gres_ptr;
gres_node_state_t *node_state_ptr;
node_gres_ptr = list_find_first(
node_record_table_ptr[i].gres_list,
_gres_find_id,
&job_gres_ptr->plugin_id);
if (!node_gres_ptr)
continue;
node_state_ptr = node_gres_ptr->gres_data;
gres_cnt = node_state_ptr->gres_cnt_avail;
} else
gres_cnt =
job_data_ptr->gres_cnt_node_select[i];
mem_size = mem_per_gres * gres_cnt;
if (first_set)
job_res->memory_allocated[node_off] = mem_size;
else
job_res->memory_allocated[node_off] += mem_size;
}
first_set = false;
}
list_iterator_destroy(job_gres_iter);
return rc;
}
/*
* Determine the minimum number of CPUs required to satisfy the job's GRES
* request (based upon total GRES times cpus_per_gres value)
* node_count IN - count of nodes in job allocation
* sockets_per_node IN - count of sockets per node in job allocation
* task_count IN - count of tasks in job allocation
* job_gres_list IN - job GRES specification
* RET count of required CPUs for the job
*/
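/*
 * For example, gres_per_task=2 with task_count=8 implies
 * total_gres=16; at cpus_per_gres=4 the job needs at least
 * 16 * 4 = 64 CPUs.
 */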
extern int gres_plugin_job_min_cpus(uint32_t node_count,
uint32_t sockets_per_node,
uint32_t task_count,
List job_gres_list)
{
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_data_ptr;
int tmp, min_cpus = 0;
uint16_t cpus_per_gres;
if (!job_gres_list || (list_count(job_gres_list) == 0))
return 0;
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
uint64_t total_gres = 0;
job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
if (job_data_ptr->cpus_per_gres)
cpus_per_gres = job_data_ptr->cpus_per_gres;
else
cpus_per_gres = job_data_ptr->def_cpus_per_gres;
if (cpus_per_gres == 0)
continue;
if (job_data_ptr->gres_per_job) {
total_gres = job_data_ptr->gres_per_job;
} else if (job_data_ptr->gres_per_node) {
total_gres = job_data_ptr->gres_per_node *
node_count;
} else if (job_data_ptr->gres_per_socket) {
total_gres = job_data_ptr->gres_per_socket *
node_count * sockets_per_node;
} else if (job_data_ptr->gres_per_task) {
total_gres = job_data_ptr->gres_per_task * task_count;
} else
continue;
tmp = cpus_per_gres * total_gres;
min_cpus = MAX(min_cpus, tmp);
}
list_iterator_destroy(job_gres_iter);
return min_cpus;
}
/*
* Determine the minimum number of CPUs required to satisfy the job's GRES
* request on one node
* sockets_per_node IN - count of sockets per node in job allocation
* tasks_per_node IN - count of tasks per node in job allocation
* job_gres_list IN - job GRES specification
* RET count of required CPUs for the job
*/
extern int gres_plugin_job_min_cpu_node(uint32_t sockets_per_node,
uint32_t tasks_per_node,
List job_gres_list)
{
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_data_ptr;
int tmp, min_cpus = 0;
uint16_t cpus_per_gres;
if (!job_gres_list || (list_count(job_gres_list) == 0))
return 0;
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
uint64_t total_gres = 0;
job_data_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
if (job_data_ptr->cpus_per_gres)
cpus_per_gres = job_data_ptr->cpus_per_gres;
else
cpus_per_gres = job_data_ptr->def_cpus_per_gres;
if (cpus_per_gres == 0)
continue;
if (job_data_ptr->gres_per_node) {
total_gres = job_data_ptr->gres_per_node;
} else if (job_data_ptr->gres_per_socket) {
total_gres = job_data_ptr->gres_per_socket *
sockets_per_node;
} else if (job_data_ptr->gres_per_task) {
total_gres = job_data_ptr->gres_per_task *
tasks_per_node;
} else
total_gres = 1;
tmp = cpus_per_gres * total_gres;
min_cpus = MAX(min_cpus, tmp);
}
list_iterator_destroy(job_gres_iter);
return min_cpus;
}
/*
* Determine if specific GRES index on node is available to a job's allocated
* cores
* IN core_bitmap - bitmap of cores allocated to the job on this node
* IN/OUT alloc_core_bitmap - cores already allocated, NULL if don't care,
* updated when the function returns true
* IN node_gres_ptr - GRES data for this node
* IN gres_inx - index of GRES being considered for use
* IN job_gres_ptr - GRES data for this job
* RET true if available to those cores, false otherwise
*/
static bool _cores_on_gres(bitstr_t *core_bitmap, bitstr_t *alloc_core_bitmap,
gres_node_state_t *node_gres_ptr, int gres_inx,
gres_job_state_t *job_gres_ptr)
{
int i, avail_cores;
if ((core_bitmap == NULL) || (node_gres_ptr->topo_cnt == 0))
return true;
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
if (!node_gres_ptr->topo_gres_bitmap[i])
continue;
if (bit_size(node_gres_ptr->topo_gres_bitmap[i]) <= gres_inx)
continue; /* gres_inx out of range for this bitmap */
if (!bit_test(node_gres_ptr->topo_gres_bitmap[i], gres_inx))
continue;
if (job_gres_ptr->type_name &&
(!node_gres_ptr->topo_type_name[i] ||
(job_gres_ptr->type_id != node_gres_ptr->topo_type_id[i])))
continue;
if (!node_gres_ptr->topo_core_bitmap[i])
return true;
if (bit_size(node_gres_ptr->topo_core_bitmap[i]) !=
bit_size(core_bitmap))
break;
avail_cores = bit_overlap(node_gres_ptr->topo_core_bitmap[i],
core_bitmap);
if (avail_cores && alloc_core_bitmap) {
avail_cores -= bit_overlap(node_gres_ptr->
topo_core_bitmap[i],
alloc_core_bitmap);
if (avail_cores) {
bit_or(alloc_core_bitmap,
node_gres_ptr->topo_core_bitmap[i]);
}
}
if (avail_cores)
return true;
}
return false;
}
/* Clear any vestigial job gres state. This may be needed on job requeue. */
extern void gres_plugin_job_clear(List job_gres_list)
{
int i;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_state_ptr;
if (job_gres_list == NULL)
return;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
job_state_ptr = (gres_job_state_t *) job_gres_ptr->gres_data;
for (i = 0; i < job_state_ptr->node_cnt; i++) {
if (job_state_ptr->gres_bit_alloc) {
FREE_NULL_BITMAP(job_state_ptr->
gres_bit_alloc[i]);
}
if (job_state_ptr->gres_bit_step_alloc) {
FREE_NULL_BITMAP(job_state_ptr->
gres_bit_step_alloc[i]);
}
}
xfree(job_state_ptr->gres_bit_alloc);
xfree(job_state_ptr->gres_bit_step_alloc);
xfree(job_state_ptr->gres_cnt_step_alloc);
xfree(job_state_ptr->gres_cnt_node_alloc);
job_state_ptr->node_cnt = 0;
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
}
static int _job_alloc(void *job_gres_data, void *node_gres_data, int node_cnt,
int node_index, int node_offset, char *gres_name,
uint32_t job_id, char *node_name,
bitstr_t *core_bitmap, uint32_t plugin_id,
uint32_t user_id)
{
int j, sz1, sz2;
int64_t gres_cnt, i;
gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
bool type_array_updated = false;
bitstr_t *alloc_core_bitmap = NULL;
uint64_t gres_per_bit = 1;
bool log_cnt_err = true;
char *log_type;
bool shared_gres = false, use_busy_dev = false;
/*
* Validate data structures. Either job_gres_data->node_cnt and
* job_gres_data->gres_bit_alloc are both set or both zero/NULL.
*/
xassert(node_cnt);
xassert(node_offset >= 0);
xassert(job_gres_ptr);
xassert(node_gres_ptr);
if (node_gres_ptr->no_consume) {
job_gres_ptr->total_gres = NO_CONSUME_VAL64;
return SLURM_SUCCESS;
}
if (_shared_gres(plugin_id)) {
shared_gres = true;
gres_per_bit = job_gres_ptr->gres_per_node;
}
if ((plugin_id == mps_plugin_id) &&
(node_gres_ptr->gres_cnt_alloc != 0)) {
/* We must use the ONE already active GRES of this type */
use_busy_dev = true;
}
if (job_gres_ptr->type_name && !job_gres_ptr->type_name[0])
xfree(job_gres_ptr->type_name);
xfree(node_gres_ptr->gres_used); /* Clear cache */
if (job_gres_ptr->node_cnt == 0) {
job_gres_ptr->node_cnt = node_cnt;
if (job_gres_ptr->gres_bit_alloc) {
error("gres/%s: job %u node_cnt==0 and gres_bit_alloc is set",
gres_name, job_id);
xfree(job_gres_ptr->gres_bit_alloc);
}
}
/*
* These next 2 checks were added long before job resizing was allowed.
* They are not errors as we need to keep the original size around for
* any steps that might still be out there with the larger size. If the
* job was sized up the gres_plugin_job_merge() function handles the
* resize so we are set there.
*/
else if (job_gres_ptr->node_cnt < node_cnt) {
debug2("gres/%s: job %u node_cnt is now larger than it was when allocated from %u to %d",
gres_name, job_id, job_gres_ptr->node_cnt, node_cnt);
if (node_offset >= job_gres_ptr->node_cnt)
return SLURM_ERROR;
} else if (job_gres_ptr->node_cnt > node_cnt) {
debug2("gres/%s: job %u node_cnt is now smaller than it was when allocated %u to %d",
gres_name, job_id, job_gres_ptr->node_cnt, node_cnt);
}
if (!job_gres_ptr->gres_bit_alloc) {
job_gres_ptr->gres_bit_alloc = xcalloc(node_cnt,
sizeof(bitstr_t *));
}
if (!job_gres_ptr->gres_cnt_node_alloc) {
job_gres_ptr->gres_cnt_node_alloc = xcalloc(node_cnt,
sizeof(uint64_t));
}
/*
* select/cons_tres pre-selects the resources and we just need to update
* the data structures to reflect the selected GRES.
*/
if (job_gres_ptr->total_node_cnt) {
/* Resuming job */
if (job_gres_ptr->gres_cnt_node_alloc[node_offset]) {
gres_cnt = job_gres_ptr->
gres_cnt_node_alloc[node_offset];
} else if (job_gres_ptr->gres_bit_alloc[node_offset]) {
gres_cnt = bit_set_count(
job_gres_ptr->gres_bit_alloc[node_offset]);
gres_cnt *= gres_per_bit;
/* Using pre-selected GRES */
} else if (job_gres_ptr->gres_cnt_node_select &&
job_gres_ptr->gres_cnt_node_select[node_index]) {
gres_cnt = job_gres_ptr->
gres_cnt_node_select[node_index];
} else if (job_gres_ptr->gres_bit_select &&
job_gres_ptr->gres_bit_select[node_index]) {
gres_cnt = bit_set_count(
job_gres_ptr->gres_bit_select[node_index]);
gres_cnt *= gres_per_bit;
} else {
error("gres/%s: job %u node %s no resources selected",
gres_name, job_id, node_name);
return SLURM_ERROR;
}
} else {
gres_cnt = job_gres_ptr->gres_per_node;
}
/*
* Check that sufficient resources exist on this node
*/
job_gres_ptr->gres_cnt_node_alloc[node_offset] = gres_cnt;
i = node_gres_ptr->gres_cnt_alloc + gres_cnt;
if (i > node_gres_ptr->gres_cnt_avail) {
error("gres/%s: job %u node %s overallocated resources by %"
PRIu64", (%"PRIu64" > %"PRIu64")",
gres_name, job_id, node_name,
i - node_gres_ptr->gres_cnt_avail,
i, node_gres_ptr->gres_cnt_avail);
/* proceed with request, give job what is available */
}
if (!node_offset && job_gres_ptr->gres_cnt_step_alloc) {
uint64_t *tmp = xcalloc(job_gres_ptr->node_cnt,
sizeof(uint64_t));
memcpy(tmp, job_gres_ptr->gres_cnt_step_alloc,
sizeof(uint64_t) * MIN(node_cnt,
job_gres_ptr->node_cnt));
xfree(job_gres_ptr->gres_cnt_step_alloc);
job_gres_ptr->gres_cnt_step_alloc = tmp;
}
if (job_gres_ptr->gres_cnt_step_alloc == NULL) {
job_gres_ptr->gres_cnt_step_alloc =
xcalloc(job_gres_ptr->node_cnt, sizeof(uint64_t));
}
/*
* Select and/or allocate specific resources for this job.
*/
if (job_gres_ptr->gres_bit_alloc[node_offset]) {
/*
 * Restarted slurmctld with an active job, or resuming a
 * suspended job. In either case, the resources were
 * already selected.
 */
if (node_gres_ptr->gres_bit_alloc == NULL) {
node_gres_ptr->gres_bit_alloc =
bit_copy(job_gres_ptr->
gres_bit_alloc[node_offset]);
node_gres_ptr->gres_cnt_alloc +=
bit_set_count(node_gres_ptr->gres_bit_alloc);
node_gres_ptr->gres_cnt_alloc *= gres_per_bit;
} else if (node_gres_ptr->gres_bit_alloc) {
gres_cnt = (int64_t)MIN(
bit_size(node_gres_ptr->gres_bit_alloc),
bit_size(job_gres_ptr->
gres_bit_alloc[node_offset]));
for (i = 0; i < gres_cnt; i++) {
if (bit_test(job_gres_ptr->
gres_bit_alloc[node_offset], i) &&
(shared_gres ||
!bit_test(node_gres_ptr->gres_bit_alloc,
i))) {
bit_set(node_gres_ptr->gres_bit_alloc, i);
node_gres_ptr->gres_cnt_alloc +=
gres_per_bit;
}
}
}
} else if (job_gres_ptr->total_node_cnt &&
job_gres_ptr->gres_bit_select &&
job_gres_ptr->gres_bit_select[node_index] &&
job_gres_ptr->gres_cnt_node_select) {
/* Specific GRES already selected, update the node record */
bool job_mod = false;
sz1 = bit_size(job_gres_ptr->gres_bit_select[node_index]);
sz2 = bit_size(node_gres_ptr->gres_bit_alloc);
if (sz1 > sz2) {
error("gres/%s: job %u node %s gres bitmap size bad (%d > %d)",
gres_name, job_id, node_name, sz1, sz2);
job_gres_ptr->gres_bit_select[node_index] =
bit_realloc(
job_gres_ptr->gres_bit_select[node_index], sz2);
job_mod = true;
} else if (sz1 < sz2) {
error("gres/%s: job %u node %s gres bitmap size bad (%d < %d)",
gres_name, job_id, node_name, sz1, sz2);
job_gres_ptr->gres_bit_select[node_index] =
bit_realloc(
job_gres_ptr->gres_bit_select[node_index], sz2);
}
if (!shared_gres &&
bit_overlap_any(job_gres_ptr->gres_bit_select[node_index],
node_gres_ptr->gres_bit_alloc)) {
error("gres/%s: job %u node %s gres bitmap overlap",
gres_name, job_id, node_name);
bit_and_not(job_gres_ptr->gres_bit_select[node_index],
node_gres_ptr->gres_bit_alloc);
}
job_gres_ptr->gres_bit_alloc[node_offset] =
bit_copy(job_gres_ptr->gres_bit_select[node_index]);
job_gres_ptr->gres_cnt_node_alloc[node_offset] =
job_gres_ptr->gres_cnt_node_select[node_index];
if (!node_gres_ptr->gres_bit_alloc) {
node_gres_ptr->gres_bit_alloc =
bit_copy(job_gres_ptr->
gres_bit_alloc[node_offset]);
} else {
bit_or(node_gres_ptr->gres_bit_alloc,
job_gres_ptr->gres_bit_alloc[node_offset]);
}
if (job_mod) {
node_gres_ptr->gres_cnt_alloc =
bit_set_count(node_gres_ptr->gres_bit_alloc);
node_gres_ptr->gres_cnt_alloc *= gres_per_bit;
} else {
node_gres_ptr->gres_cnt_alloc += gres_cnt;
}
} else if (node_gres_ptr->gres_bit_alloc) {
int64_t gres_avail = node_gres_ptr->gres_cnt_avail;
i = bit_size(node_gres_ptr->gres_bit_alloc);
if (plugin_id == mps_plugin_id)
gres_avail = i;
else if (i < gres_avail) {
error("gres/%s: node %s gres bitmap size bad (%"PRIi64" < %"PRIi64")",
gres_name, node_name,
i, gres_avail);
node_gres_ptr->gres_bit_alloc =
bit_realloc(node_gres_ptr->gres_bit_alloc,
gres_avail);
}
job_gres_ptr->gres_bit_alloc[node_offset] =
bit_alloc(gres_avail);
if (core_bitmap)
alloc_core_bitmap = bit_alloc(bit_size(core_bitmap));
/* Pass 1: Allocate GRES overlapping all allocated cores */
for (i = 0; i < gres_avail && gres_cnt > 0; i++) {
if (bit_test(node_gres_ptr->gres_bit_alloc, i))
continue;
if (!_cores_on_gres(core_bitmap, alloc_core_bitmap,
node_gres_ptr, i, job_gres_ptr))
continue;
bit_set(node_gres_ptr->gres_bit_alloc, i);
bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
node_gres_ptr->gres_cnt_alloc += gres_per_bit;
gres_cnt -= gres_per_bit;
}
FREE_NULL_BITMAP(alloc_core_bitmap);
/* Pass 2: Allocate GRES overlapping any allocated cores */
for (i = 0; i < gres_avail && gres_cnt > 0; i++) {
if (bit_test(node_gres_ptr->gres_bit_alloc, i))
continue;
if (!_cores_on_gres(core_bitmap, NULL, node_gres_ptr, i,
job_gres_ptr))
continue;
bit_set(node_gres_ptr->gres_bit_alloc, i);
bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
node_gres_ptr->gres_cnt_alloc += gres_per_bit;
gres_cnt -= gres_per_bit;
}
if (gres_cnt) {
verbose("gres/%s topology sub-optimal for job %u",
gres_name, job_id);
}
/* Pass 3: Allocate any available GRES */
for (i = 0; i < gres_avail && gres_cnt > 0; i++) {
if (bit_test(node_gres_ptr->gres_bit_alloc, i))
continue;
bit_set(node_gres_ptr->gres_bit_alloc, i);
bit_set(job_gres_ptr->gres_bit_alloc[node_offset], i);
node_gres_ptr->gres_cnt_alloc += gres_per_bit;
gres_cnt -= gres_per_bit;
}
} else {
node_gres_ptr->gres_cnt_alloc += gres_cnt;
}
if (job_gres_ptr->gres_bit_alloc[node_offset] &&
node_gres_ptr->topo_gres_bitmap &&
node_gres_ptr->topo_gres_cnt_alloc) {
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
if (job_gres_ptr->type_name &&
(!node_gres_ptr->topo_type_name[i] ||
(job_gres_ptr->type_id !=
node_gres_ptr->topo_type_id[i])))
continue;
if (use_busy_dev &&
(node_gres_ptr->topo_gres_cnt_alloc[i] == 0))
continue;
sz1 = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
sz2 = bit_size(node_gres_ptr->topo_gres_bitmap[i]);
if ((sz1 != sz2) && log_cnt_err) {
if (_shared_gres(plugin_id))
log_type = "File";
else
log_type = "Count";
/* Avoid abort on bit_overlap below */
error("gres/%s %s mismatch for node %s (%d != %d)",
gres_name, log_type, node_name, sz1, sz2);
log_cnt_err = false;
}
if (sz1 != sz2)
continue; /* See error above */
gres_cnt = bit_overlap(job_gres_ptr->
gres_bit_alloc[node_offset],
node_gres_ptr->
topo_gres_bitmap[i]);
gres_cnt *= gres_per_bit;
node_gres_ptr->topo_gres_cnt_alloc[i] += gres_cnt;
if ((node_gres_ptr->type_cnt == 0) ||
(node_gres_ptr->topo_type_name == NULL) ||
(node_gres_ptr->topo_type_name[i] == NULL))
continue;
for (j = 0; j < node_gres_ptr->type_cnt; j++) {
if (!node_gres_ptr->type_name[j] ||
(node_gres_ptr->topo_type_id[i] !=
node_gres_ptr->type_id[j]))
continue;
node_gres_ptr->type_cnt_alloc[j] += gres_cnt;
break;
}
}
type_array_updated = true;
} else if (job_gres_ptr->gres_bit_alloc[node_offset]) {
int len; /* length of the gres bitmap on this node */
len = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
if (!node_gres_ptr->topo_gres_cnt_alloc) {
node_gres_ptr->topo_gres_cnt_alloc =
xcalloc(len, sizeof(uint64_t));
} else {
len = MIN(len, node_gres_ptr->gres_cnt_config);
}
if ((node_gres_ptr->topo_cnt == 0) && shared_gres) {
/*
* Need to add node topo arrays for slurmctld restart
* and job state recovery (with GRES counts per topo)
*/
node_gres_ptr->topo_cnt =
bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
node_gres_ptr->topo_core_bitmap =
xcalloc(node_gres_ptr->topo_cnt,
sizeof(bitstr_t *));
node_gres_ptr->topo_gres_bitmap =
xcalloc(node_gres_ptr->topo_cnt,
sizeof(bitstr_t *));
node_gres_ptr->topo_gres_cnt_alloc =
xcalloc(node_gres_ptr->topo_cnt,
sizeof(uint64_t));
node_gres_ptr->topo_gres_cnt_avail =
xcalloc(node_gres_ptr->topo_cnt,
sizeof(uint64_t));
node_gres_ptr->topo_type_id =
xcalloc(node_gres_ptr->topo_cnt,
sizeof(uint32_t));
node_gres_ptr->topo_type_name =
xcalloc(node_gres_ptr->topo_cnt,
sizeof(char *));
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
node_gres_ptr->topo_gres_bitmap[i] =
bit_alloc(node_gres_ptr->topo_cnt);
bit_set(node_gres_ptr->topo_gres_bitmap[i], i);
}
}
for (i = 0; i < len; i++) {
gres_cnt = 0;
if (!bit_test(job_gres_ptr->
gres_bit_alloc[node_offset], i))
continue;
/*
* NOTE: Immediately after slurmctld restart and before
* the node's registration, the GRES type and topology
* information will not be available and we will be
* unable to update topo_gres_cnt_alloc or
* type_cnt_alloc. This results in some incorrect
* internal bookkeeping, but does not cause failures
* in terms of allocating GRES to jobs.
*/
for (j = 0; j < node_gres_ptr->topo_cnt; j++) {
if (use_busy_dev &&
(node_gres_ptr->topo_gres_cnt_alloc[j] == 0))
continue;
if (node_gres_ptr->topo_gres_bitmap &&
node_gres_ptr->topo_gres_bitmap[j] &&
bit_test(node_gres_ptr->topo_gres_bitmap[j],
i)) {
node_gres_ptr->topo_gres_cnt_alloc[i] +=
gres_per_bit;
gres_cnt += gres_per_bit;
}
}
if ((node_gres_ptr->type_cnt == 0) ||
(node_gres_ptr->topo_type_name == NULL) ||
(node_gres_ptr->topo_type_name[i] == NULL))
continue;
for (j = 0; j < node_gres_ptr->type_cnt; j++) {
if (!node_gres_ptr->type_name[j] ||
(node_gres_ptr->topo_type_id[i] !=
node_gres_ptr->type_id[j]))
continue;
node_gres_ptr->type_cnt_alloc[j] += gres_cnt;
break;
}
}
type_array_updated = true;
if (job_gres_ptr->type_name && job_gres_ptr->type_name[0]) {
/*
* We may not know how many GRES of this type will be
* available on this node, but need to track how many
* are allocated to this job from here to avoid
* underflows when this job is deallocated
*/
_add_gres_type(job_gres_ptr->type_name, node_gres_ptr,
0);
for (j = 0; j < node_gres_ptr->type_cnt; j++) {
if (job_gres_ptr->type_id !=
node_gres_ptr->type_id[j])
continue;
node_gres_ptr->type_cnt_alloc[j] +=
job_gres_ptr->gres_per_node;
break;
}
}
}
if (!type_array_updated && job_gres_ptr->type_name) {
gres_cnt = job_gres_ptr->gres_per_node;
for (j = 0; j < node_gres_ptr->type_cnt; j++) {
int64_t k;
if (job_gres_ptr->type_id !=
node_gres_ptr->type_id[j])
continue;
k = node_gres_ptr->type_cnt_avail[j] -
node_gres_ptr->type_cnt_alloc[j];
k = MIN(gres_cnt, k);
node_gres_ptr->type_cnt_alloc[j] += k;
gres_cnt -= k;
if (gres_cnt == 0)
break;
}
}
return SLURM_SUCCESS;
}
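/*
 * Worked example (illustrative only, hypothetical values): for a shared
 * GRES such as gres/mps, gres_per_bit is set to the job's gres_per_node,
 * so one bit in a GRES bitmap accounts for many units. With
 * gres_per_node=50:
 *
 *	gres_cnt = bit_set_count(bitmap);	// e.g. 1 bit set
 *	gres_cnt *= gres_per_bit;		// 1 * 50 = 50 MPS units
 *
 * For a non-shared GRES (e.g. gres/gpu), gres_per_bit is 1 and each bit
 * represents one device.
 */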
static void _job_select_whole_node_internal(
gres_key_t *job_search_key, gres_node_state_t *node_state_ptr,
int type_inx, int context_inx, List job_gres_list)
{
gres_state_t *job_gres_ptr;
gres_job_state_t *job_state_ptr;
if (!(job_gres_ptr = list_find_first(job_gres_list,
_gres_find_job_by_key,
job_search_key))) {
job_state_ptr = xmalloc(sizeof(gres_job_state_t));
job_gres_ptr = xmalloc(sizeof(gres_state_t));
job_gres_ptr->plugin_id = job_search_key->plugin_id;
job_gres_ptr->gres_data = job_state_ptr;
job_state_ptr->gres_name =
xstrdup(gres_context[context_inx].gres_name);
if (type_inx != -1)
job_state_ptr->type_name =
xstrdup(node_state_ptr->type_name[type_inx]);
job_state_ptr->type_id = job_search_key->type_id;
list_append(job_gres_list, job_gres_ptr);
} else
job_state_ptr = job_gres_ptr->gres_data;
/*
* Add the total_gres here but no count, that will be done after
* allocation.
*/
if (node_state_ptr->no_consume) {
job_state_ptr->total_gres = NO_CONSUME_VAL64;
} else if (type_inx != -1)
job_state_ptr->total_gres +=
node_state_ptr->type_cnt_avail[type_inx];
else
job_state_ptr->total_gres += node_state_ptr->gres_cnt_avail;
}
static int _job_alloc_whole_node_internal(
gres_key_t *job_search_key, gres_node_state_t *node_state_ptr,
List job_gres_list, int node_cnt, int node_index, int node_offset,
int type_index, uint32_t job_id, char *node_name,
bitstr_t *core_bitmap, uint32_t user_id)
{
gres_state_t *job_gres_ptr;
gres_job_state_t *job_state_ptr;
if (!(job_gres_ptr = list_find_first(job_gres_list,
_gres_find_job_by_key,
job_search_key))) {
error("%s: This should never happen, we couldn't find the gres %u:%u",
__func__,
job_search_key->plugin_id,
job_search_key->type_id);
return SLURM_ERROR;
}
job_state_ptr = (gres_job_state_t *)job_gres_ptr->gres_data;
/*
* Since the amount of gres on each node could differ, we need to
* set gres_per_node correctly here to avoid problems on
* heterogeneous nodes.
*/
if (type_index != -1)
job_state_ptr->gres_per_node =
node_state_ptr->type_cnt_avail[type_index];
else
job_state_ptr->gres_per_node = node_state_ptr->gres_cnt_avail;
return _job_alloc(job_state_ptr, node_state_ptr,
node_cnt, node_index, node_offset,
job_state_ptr->gres_name,
job_id, node_name, core_bitmap,
job_gres_ptr->plugin_id,
user_id);
}
/*
* Select and allocate GRES to a job and update node and job GRES information
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN node_gres_list - node's gres_list built by
* gres_plugin_node_config_validate()
* IN node_cnt - total number of nodes originally allocated to the job
* IN node_index - zero-origin global node index
* IN node_offset - zero-origin index in job allocation to the node of interest
* IN job_id - job's ID (for logging)
* IN node_name - name of the node (for logging)
* IN core_bitmap - cores allocated to this job on this node (NULL if not
* available)
* IN user_id - job's user ID
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_job_alloc(List job_gres_list, List node_gres_list,
int node_cnt, int node_index, int node_offset,
uint32_t job_id, char *node_name,
bitstr_t *core_bitmap, uint32_t user_id)
{
int i, rc, rc2;
ListIterator job_gres_iter, node_gres_iter;
gres_state_t *job_gres_ptr, *node_gres_ptr;
if (job_gres_list == NULL)
return SLURM_SUCCESS;
if (node_gres_list == NULL) {
error("%s: job %u has gres specification while node %s has none",
__func__, job_id, node_name);
return SLURM_ERROR;
}
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (job_gres_ptr->plugin_id ==
gres_context[i].plugin_id)
break;
}
if (i >= gres_context_cnt) {
error("%s: no plugin configured for data type %u for job %u and node %s",
__func__, job_gres_ptr->plugin_id, job_id,
node_name);
/* A likely sign that GresPlugins has changed */
continue;
}
node_gres_iter = list_iterator_create(node_gres_list);
while ((node_gres_ptr = (gres_state_t *)
list_next(node_gres_iter))) {
if (job_gres_ptr->plugin_id == node_gres_ptr->plugin_id)
break;
}
list_iterator_destroy(node_gres_iter);
if (node_gres_ptr == NULL) {
error("%s: job %u allocated gres/%s on node %s lacking that gres",
__func__, job_id, gres_context[i].gres_name,
node_name);
continue;
}
rc2 = _job_alloc(job_gres_ptr->gres_data,
node_gres_ptr->gres_data, node_cnt, node_index,
node_offset, gres_context[i].gres_name,
job_id, node_name, core_bitmap,
job_gres_ptr->plugin_id, user_id);
if (rc2 != SLURM_SUCCESS)
rc = rc2;
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
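/*
 * Usage sketch (illustrative only, not part of the build): a caller such
 * as the select plugin would invoke this once per node of the
 * allocation. All variable names below are hypothetical:
 *
 *	for (inx = 0; inx < job_node_cnt; inx++) {
 *		rc = gres_plugin_job_alloc(job_ptr->gres_list,
 *					   node_gres_lists[inx],
 *					   job_node_cnt, node_inx[inx], inx,
 *					   job_ptr->job_id, node_names[inx],
 *					   core_bitmaps[inx],
 *					   job_ptr->user_id);
 *		if (rc != SLURM_SUCCESS)
 *			break;
 *	}
 */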
/*
* Fill in job_gres_list with the total amount of GRES on a node.
* OUT job_gres_list - This list will be destroyed and remade with all GRES on
* node.
* IN node_gres_list - node's gres_list built by
* gres_plugin_node_config_validate()
* IN job_id - job's ID (for logging)
* IN node_name - name of the node (for logging)
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_job_select_whole_node(
List *job_gres_list, List node_gres_list,
uint32_t job_id, char *node_name)
{
int i;
ListIterator node_gres_iter;
gres_state_t *node_gres_ptr;
gres_node_state_t *node_state_ptr;
if (job_gres_list == NULL)
return SLURM_SUCCESS;
if (node_gres_list == NULL) {
error("%s: job %u has gres specification while node %s has none",
__func__, job_id, node_name);
return SLURM_ERROR;
}
if (!*job_gres_list)
*job_gres_list = list_create(_gres_job_list_delete);
if (gres_plugin_init() != SLURM_SUCCESS)
return SLURM_ERROR;
slurm_mutex_lock(&gres_context_lock);
node_gres_iter = list_iterator_create(node_gres_list);
while ((node_gres_ptr = list_next(node_gres_iter))) {
gres_key_t job_search_key;
node_state_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;
/*
* Don't check for no_consume here; we need these records added
* here and will filter them out in
* gres_plugin_job_alloc_whole_node().
*/
if (!node_state_ptr->gres_cnt_config)
continue;
for (i = 0; i < gres_context_cnt; i++) {
if (node_gres_ptr->plugin_id ==
gres_context[i].plugin_id)
break;
}
if (i >= gres_context_cnt) {
error("%s: no plugin configured for data type %u for job %u and node %s",
__func__, node_gres_ptr->plugin_id, job_id,
node_name);
/* A likely sign that GresPlugins has changed */
continue;
}
job_search_key.plugin_id = node_gres_ptr->plugin_id;
if (!node_state_ptr->type_cnt) {
job_search_key.type_id = 0;
_job_select_whole_node_internal(
&job_search_key, node_state_ptr,
-1, i, *job_gres_list);
} else {
for (int j = 0; j < node_state_ptr->type_cnt; j++) {
job_search_key.type_id = gres_plugin_build_id(
node_state_ptr->type_name[j]);
_job_select_whole_node_internal(
&job_search_key, node_state_ptr,
j, i, *job_gres_list);
}
}
}
list_iterator_destroy(node_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return SLURM_SUCCESS;
}
/*
* Select and allocate all GRES on a node to a job and update node and job GRES
* information
* IN job_gres_list - job's gres_list built by gres_plugin_job_whole_node().
* IN node_gres_list - node's gres_list built by
* gres_plugin_node_config_validate()
* IN node_cnt - total number of nodes originally allocated to the job
* IN node_index - zero-origin global node index
* IN node_offset - zero-origin index in job allocation to the node of interest
* IN job_id - job's ID (for logging)
* IN node_name - name of the node (for logging)
* IN core_bitmap - cores allocated to this job on this node (NULL if not
* available)
* IN user_id - job's user ID
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_job_alloc_whole_node(
List job_gres_list, List node_gres_list,
int node_cnt, int node_index, int node_offset,
uint32_t job_id, char *node_name,
bitstr_t *core_bitmap, uint32_t user_id)
{
int i, rc, rc2;
ListIterator node_gres_iter;
gres_state_t *node_gres_ptr;
gres_node_state_t *node_state_ptr;
if (job_gres_list == NULL)
return SLURM_SUCCESS;
if (node_gres_list == NULL) {
error("%s: job %u has gres specification while node %s has none",
__func__, job_id, node_name);
return SLURM_ERROR;
}
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
node_gres_iter = list_iterator_create(node_gres_list);
while ((node_gres_ptr = list_next(node_gres_iter))) {
gres_key_t job_search_key;
node_state_ptr = (gres_node_state_t *) node_gres_ptr->gres_data;
if (node_state_ptr->no_consume ||
!node_state_ptr->gres_cnt_config)
continue;
for (i = 0; i < gres_context_cnt; i++) {
if (node_gres_ptr->plugin_id ==
gres_context[i].plugin_id)
break;
}
if (i >= gres_context_cnt) {
error("%s: no plugin configured for data type %u for job %u and node %s",
__func__, node_gres_ptr->plugin_id, job_id,
node_name);
/* A likely sign that GresPlugins has changed */
continue;
}
job_search_key.plugin_id = node_gres_ptr->plugin_id;
if (!node_state_ptr->type_cnt) {
job_search_key.type_id = 0;
rc2 = _job_alloc_whole_node_internal(
&job_search_key, node_state_ptr,
job_gres_list, node_cnt, node_index,
node_offset, -1, job_id, node_name,
core_bitmap, user_id);
if (rc2 != SLURM_SUCCESS)
rc = rc2;
} else {
for (int j = 0; j < node_state_ptr->type_cnt; j++) {
job_search_key.type_id = gres_plugin_build_id(
node_state_ptr->type_name[j]);
rc2 = _job_alloc_whole_node_internal(
&job_search_key, node_state_ptr,
job_gres_list, node_cnt, node_index,
node_offset, j, job_id, node_name,
core_bitmap, user_id);
if (rc2 != SLURM_SUCCESS)
rc = rc2;
}
}
}
list_iterator_destroy(node_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
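/*
 * Usage sketch (illustrative only, hypothetical variables): whole-node
 * allocations first build the job's gres list from the node's
 * configuration, then allocate everything on it:
 *
 *	(void) gres_plugin_job_select_whole_node(&job_ptr->gres_list,
 *						 node_gres_list,
 *						 job_ptr->job_id, node_name);
 *	rc = gres_plugin_job_alloc_whole_node(job_ptr->gres_list,
 *					      node_gres_list, node_cnt,
 *					      node_index, node_offset,
 *					      job_ptr->job_id, node_name,
 *					      core_bitmap, job_ptr->user_id);
 */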
static int _job_dealloc(void *job_gres_data, void *node_gres_data,
int node_offset, char *gres_name, uint32_t job_id,
char *node_name, bool old_job, uint32_t plugin_id,
uint32_t user_id, bool job_fini)
{
int i, j, len, sz1, sz2;
gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
gres_node_state_t *node_gres_ptr = (gres_node_state_t *) node_gres_data;
bool type_array_updated = false;
uint64_t gres_cnt = 0, k;
uint64_t gres_per_bit = 1;
/*
* Validate data structures. Either job_gres_data->node_cnt and
* job_gres_data->gres_bit_alloc are both set or both zero/NULL.
*/
xassert(node_offset >= 0);
xassert(job_gres_ptr);
xassert(node_gres_ptr);
if (node_gres_ptr->no_consume)
return SLURM_SUCCESS;
if (job_gres_ptr->node_cnt <= node_offset) {
error("gres/%s: job %u dealloc of node %s bad node_offset %d "
"count is %u", gres_name, job_id, node_name, node_offset,
job_gres_ptr->node_cnt);
return SLURM_ERROR;
}
if (_shared_gres(plugin_id))
gres_per_bit = job_gres_ptr->gres_per_node;
xfree(node_gres_ptr->gres_used); /* Clear cache */
if (node_gres_ptr->gres_bit_alloc && job_gres_ptr->gres_bit_alloc &&
job_gres_ptr->gres_bit_alloc[node_offset]) {
len = bit_size(job_gres_ptr->gres_bit_alloc[node_offset]);
i = bit_size(node_gres_ptr->gres_bit_alloc);
if (i != len) {
error("gres/%s: job %u and node %s bitmap sizes differ "
"(%d != %d)", gres_name, job_id, node_name, len,
i);
len = MIN(len, i);
/* proceed with request, make best effort */
}
for (i = 0; i < len; i++) {
if (!bit_test(job_gres_ptr->gres_bit_alloc[node_offset],
i)) {
continue;
}
bit_clear(node_gres_ptr->gres_bit_alloc, i);
/*
* NOTE: Do not clear bit from
* job_gres_ptr->gres_bit_alloc[node_offset]
* since this may only be an emulated deallocate
*/
if (node_gres_ptr->gres_cnt_alloc >= gres_per_bit) {
node_gres_ptr->gres_cnt_alloc -= gres_per_bit;
} else {
error("gres/%s: job %u dealloc node %s GRES count underflow (%"PRIu64" < %"PRIu64")",
gres_name, job_id, node_name,
node_gres_ptr->gres_cnt_alloc,
gres_per_bit);
node_gres_ptr->gres_cnt_alloc = 0;
}
}
} else if (job_gres_ptr->gres_cnt_node_alloc) {
gres_cnt = job_gres_ptr->gres_cnt_node_alloc[node_offset];
} else {
gres_cnt = job_gres_ptr->gres_per_node;
}
if (gres_cnt && (node_gres_ptr->gres_cnt_alloc >= gres_cnt))
node_gres_ptr->gres_cnt_alloc -= gres_cnt;
else if (gres_cnt) {
error("gres/%s: job %u node %s GRES count underflow (%"PRIu64" < %"PRIu64")",
gres_name, job_id, node_name,
node_gres_ptr->gres_cnt_alloc, gres_cnt);
node_gres_ptr->gres_cnt_alloc = 0;
}
if (job_gres_ptr->gres_bit_alloc &&
job_gres_ptr->gres_bit_alloc[node_offset] &&
node_gres_ptr->topo_gres_bitmap &&
node_gres_ptr->topo_gres_cnt_alloc) {
for (i = 0; i < node_gres_ptr->topo_cnt; i++) {
sz1 = bit_size(
job_gres_ptr->gres_bit_alloc[node_offset]);
sz2 = bit_size(node_gres_ptr->topo_gres_bitmap[i]);
if (sz1 != sz2)
continue;
gres_cnt = (uint64_t)bit_overlap(
job_gres_ptr->gres_bit_alloc[node_offset],
node_gres_ptr->topo_gres_bitmap[i]);
gres_cnt *= gres_per_bit;
if (node_gres_ptr->topo_gres_cnt_alloc[i] >= gres_cnt) {
node_gres_ptr->topo_gres_cnt_alloc[i] -=
gres_cnt;
} else if (old_job) {
node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
} else {
error("gres/%s: job %u dealloc node %s topo gres count underflow "
"(%"PRIu64" %"PRIu64")",
gres_name, job_id, node_name,
node_gres_ptr->topo_gres_cnt_alloc[i],
gres_cnt);
node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
}
if ((node_gres_ptr->type_cnt == 0) ||
(node_gres_ptr->topo_type_name == NULL) ||
(node_gres_ptr->topo_type_name[i] == NULL))
continue;
for (j = 0; j < node_gres_ptr->type_cnt; j++) {
if (!node_gres_ptr->type_name[j] ||
(node_gres_ptr->topo_type_id[i] !=
node_gres_ptr->type_id[j]))
continue;
if (node_gres_ptr->type_cnt_alloc[j] >=
gres_cnt) {
node_gres_ptr->type_cnt_alloc[j] -=
gres_cnt;
} else if (old_job) {
node_gres_ptr->type_cnt_alloc[j] = 0;
} else {
error("gres/%s: job %u dealloc node %s type %s gres count underflow "
"(%"PRIu64" %"PRIu64")",
gres_name, job_id, node_name,
node_gres_ptr->type_name[j],
node_gres_ptr->type_cnt_alloc[j],
gres_cnt);
node_gres_ptr->type_cnt_alloc[j] = 0;
}
}
}
type_array_updated = true;
} else if (job_gres_ptr->gres_bit_alloc &&
job_gres_ptr->gres_bit_alloc[node_offset] &&
node_gres_ptr->topo_gres_cnt_alloc) {
/* Avoid crash if configuration inconsistent */
len = MIN(node_gres_ptr->gres_cnt_config,
bit_size(job_gres_ptr->
gres_bit_alloc[node_offset]));
for (i = 0; i < len; i++) {
if (!bit_test(job_gres_ptr->
gres_bit_alloc[node_offset], i) ||
!node_gres_ptr->topo_gres_cnt_alloc[i])
continue;
if (node_gres_ptr->topo_gres_cnt_alloc[i] >=
gres_per_bit) {
node_gres_ptr->topo_gres_cnt_alloc[i] -=
gres_per_bit;
} else {
error("gres/%s: job %u dealloc node %s "
"topo_gres_cnt_alloc[%d] count underflow "
"(%"PRIu64" %"PRIu64")",
gres_name, job_id, node_name, i,
node_gres_ptr->topo_gres_cnt_alloc[i],
gres_per_bit);
node_gres_ptr->topo_gres_cnt_alloc[i] = 0;
}
if ((node_gres_ptr->type_cnt == 0) ||
(node_gres_ptr->topo_type_name == NULL) ||
(node_gres_ptr->topo_type_name[i] == NULL))
continue;
for (j = 0; j < node_gres_ptr->type_cnt; j++) {
if (!node_gres_ptr->type_name[j] ||
(node_gres_ptr->topo_type_id[i] !=
node_gres_ptr->type_id[j]))
continue;
if (node_gres_ptr->type_cnt_alloc[j] >=
gres_per_bit) {
node_gres_ptr->type_cnt_alloc[j] -=
gres_per_bit;
} else {
error("gres/%s: job %u dealloc node %s "
"type %s type_cnt_alloc count underflow "
"(%"PRIu64" %"PRIu64")",
gres_name, job_id, node_name,
node_gres_ptr->type_name[j],
node_gres_ptr->type_cnt_alloc[j],
gres_per_bit);
node_gres_ptr->type_cnt_alloc[j] = 0;
}
}
}
type_array_updated = true;
}
if (!type_array_updated && job_gres_ptr->type_name) {
gres_cnt = job_gres_ptr->gres_per_node;
for (j = 0; j < node_gres_ptr->type_cnt; j++) {
if (job_gres_ptr->type_id !=
node_gres_ptr->type_id[j])
continue;
k = MIN(gres_cnt, node_gres_ptr->type_cnt_alloc[j]);
node_gres_ptr->type_cnt_alloc[j] -= k;
gres_cnt -= k;
if (gres_cnt == 0)
break;
}
}
return SLURM_SUCCESS;
}
/*
* Deallocate resource from a job and update node and job gres information
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN node_gres_list - node's gres_list built by
* gres_plugin_node_config_validate()
* IN node_offset - zero-origin index to the node of interest
* IN job_id - job's ID (for logging)
* IN node_name - name of the node (for logging)
* IN old_job - true if job started before last slurmctld reboot.
* Immediately after slurmctld restart and before the node's
* registration, the GRES type and topology information is not
* available. This results in some incorrect internal
* bookkeeping, but does not cause failures in terms of
* allocating GRES to jobs.
* IN user_id - job's user ID
* IN: job_fini - job fully terminating on this node (not just a test)
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list,
int node_offset, uint32_t job_id,
char *node_name, bool old_job,
uint32_t user_id, bool job_fini)
{
int i, rc, rc2;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr, *node_gres_ptr;
char *gres_name = NULL;
if (job_gres_list == NULL)
return SLURM_SUCCESS;
if (node_gres_list == NULL) {
error("%s: job %u has gres specification while node %s has none",
__func__, job_id, node_name);
return SLURM_ERROR;
}
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (job_gres_ptr->plugin_id ==
gres_context[i].plugin_id)
break;
}
if (i >= gres_context_cnt) {
error("%s: no plugin configured for data type %u for job %u and node %s",
__func__, job_gres_ptr->plugin_id, job_id,
node_name);
/* A likely sign that GresPlugins has changed */
gres_name = "UNKNOWN";
} else
gres_name = gres_context[i].gres_name;
node_gres_ptr = list_find_first(node_gres_list, _gres_find_id,
&job_gres_ptr->plugin_id);
if (node_gres_ptr == NULL) {
error("%s: node %s lacks gres/%s for job %u", __func__,
node_name, gres_name, job_id);
continue;
}
rc2 = _job_dealloc(job_gres_ptr->gres_data,
node_gres_ptr->gres_data, node_offset,
gres_name, job_id, node_name, old_job,
job_gres_ptr->plugin_id, user_id, job_fini);
if (rc2 != SLURM_SUCCESS)
rc = rc2;
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
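/*
 * Usage sketch (illustrative only, hypothetical variables): called once
 * per node as a job releases its resources there:
 *
 *	rc = gres_plugin_job_dealloc(job_ptr->gres_list,
 *				     node_ptr->gres_list, node_offset,
 *				     job_ptr->job_id, node_ptr->name,
 *				     old_job, job_ptr->user_id, true);
 */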
/*
* Merge one job's gres allocation into another job's gres allocation.
* IN from_job_gres_list - List of gres records for the job being merged
* into another job
* IN from_job_node_bitmap - bitmap of nodes for the job being merged into
* another job
* IN/OUT to_job_gres_list - List of gres records for the job being merged
* into job
* IN to_job_node_bitmap - bitmap of nodes for the job being merged into
*/
extern void gres_plugin_job_merge(List from_job_gres_list,
bitstr_t *from_job_node_bitmap,
List to_job_gres_list,
bitstr_t *to_job_node_bitmap)
{
static int select_hetero = -1;
ListIterator gres_iter;
gres_state_t *gres_ptr, *gres_ptr2;
gres_job_state_t *gres_job_ptr, *gres_job_ptr2;
int new_node_cnt;
int i_first, i_last, i;
int from_inx, to_inx, new_inx;
bitstr_t **new_gres_bit_alloc, **new_gres_bit_step_alloc;
uint64_t *new_gres_cnt_step_alloc, *new_gres_cnt_node_alloc;
if (select_hetero == -1) {
/*
* Determine if the select plugin supports heterogeneous
* GRES allocations (counts may differ by node): 1=yes, 0=no
*/
char *select_type = slurm_get_select_type();
if (select_type &&
(strstr(select_type, "cons_tres") ||
(strstr(select_type, "cray_aries") &&
(slurm_get_select_type_param() & CR_OTHER_CONS_TRES)))) {
select_hetero = 1;
} else
select_hetero = 0;
xfree(select_type);
}
(void) gres_plugin_init();
new_node_cnt = bit_set_count(from_job_node_bitmap) +
bit_set_count(to_job_node_bitmap) -
bit_overlap(from_job_node_bitmap, to_job_node_bitmap);
i_first = MIN(bit_ffs(from_job_node_bitmap),
bit_ffs(to_job_node_bitmap));
i_first = MAX(i_first, 0);
i_last = MAX(bit_fls(from_job_node_bitmap),
bit_fls(to_job_node_bitmap));
if (i_last == -1) {
error("%s: node_bitmaps are empty", __func__);
return;
}
slurm_mutex_lock(&gres_context_lock);
/* Step one - Expand the gres data structures in "to" job */
if (!to_job_gres_list)
goto step2;
gres_iter = list_iterator_create(to_job_gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;
new_gres_bit_alloc = xcalloc(new_node_cnt, sizeof(bitstr_t *));
new_gres_cnt_node_alloc = xcalloc(new_node_cnt,
sizeof(uint64_t));
new_gres_bit_step_alloc = xcalloc(new_node_cnt,
sizeof(bitstr_t *));
new_gres_cnt_step_alloc = xcalloc(new_node_cnt,
sizeof(uint64_t));
from_inx = to_inx = new_inx = -1;
for (i = i_first; i <= i_last; i++) {
bool from_match = false, to_match = false;
if (bit_test(to_job_node_bitmap, i)) {
to_match = true;
to_inx++;
}
if (bit_test(from_job_node_bitmap, i)) {
from_match = true;
from_inx++;
}
if (from_match || to_match)
new_inx++;
if (to_match) {
if (gres_job_ptr->gres_bit_alloc) {
new_gres_bit_alloc[new_inx] =
gres_job_ptr->
gres_bit_alloc[to_inx];
}
if (gres_job_ptr->gres_cnt_node_alloc) {
new_gres_cnt_node_alloc[new_inx] =
gres_job_ptr->
gres_cnt_node_alloc[to_inx];
}
if (gres_job_ptr->gres_bit_step_alloc) {
new_gres_bit_step_alloc[new_inx] =
gres_job_ptr->
gres_bit_step_alloc[to_inx];
}
if (gres_job_ptr->gres_cnt_step_alloc) {
new_gres_cnt_step_alloc[new_inx] =
gres_job_ptr->
gres_cnt_step_alloc[to_inx];
}
}
}
gres_job_ptr->node_cnt = new_node_cnt;
xfree(gres_job_ptr->gres_bit_alloc);
gres_job_ptr->gres_bit_alloc = new_gres_bit_alloc;
xfree(gres_job_ptr->gres_cnt_node_alloc);
gres_job_ptr->gres_cnt_node_alloc = new_gres_cnt_node_alloc;
xfree(gres_job_ptr->gres_bit_step_alloc);
gres_job_ptr->gres_bit_step_alloc = new_gres_bit_step_alloc;
xfree(gres_job_ptr->gres_cnt_step_alloc);
gres_job_ptr->gres_cnt_step_alloc = new_gres_cnt_step_alloc;
}
list_iterator_destroy(gres_iter);
/*
* Step two - Merge the gres information from the "from" job into the
* existing gres information for the "to" job
*/
step2: if (!from_job_gres_list)
goto step3;
if (!to_job_gres_list) {
to_job_gres_list = list_create(_gres_job_list_delete);
}
gres_iter = list_iterator_create(from_job_gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
gres_job_ptr = (gres_job_state_t *) gres_ptr->gres_data;
gres_ptr2 = list_find_first(to_job_gres_list, _gres_find_id,
&gres_ptr->plugin_id);
if (gres_ptr2) {
gres_job_ptr2 = gres_ptr2->gres_data;
} else {
gres_ptr2 = xmalloc(sizeof(gres_state_t));
gres_job_ptr2 = xmalloc(sizeof(gres_job_state_t));
gres_ptr2->plugin_id = gres_ptr->plugin_id;
gres_ptr2->gres_data = gres_job_ptr2;
gres_job_ptr2->gres_name =
xstrdup(gres_job_ptr->gres_name);
gres_job_ptr2->cpus_per_gres =
gres_job_ptr->cpus_per_gres;
gres_job_ptr2->gres_per_job =
gres_job_ptr->gres_per_job;
gres_job_ptr2->gres_per_node =
gres_job_ptr->gres_per_node;
gres_job_ptr2->gres_per_socket =
gres_job_ptr->gres_per_socket;
gres_job_ptr2->gres_per_task =
gres_job_ptr->gres_per_task;
gres_job_ptr2->mem_per_gres =
gres_job_ptr->mem_per_gres;
gres_job_ptr2->node_cnt = new_node_cnt;
gres_job_ptr2->gres_bit_alloc =
xcalloc(new_node_cnt, sizeof(bitstr_t *));
gres_job_ptr2->gres_cnt_node_alloc =
xcalloc(new_node_cnt, sizeof(uint64_t));
gres_job_ptr2->gres_bit_step_alloc =
xcalloc(new_node_cnt, sizeof(bitstr_t *));
gres_job_ptr2->gres_cnt_step_alloc =
xcalloc(new_node_cnt, sizeof(uint64_t));
list_append(to_job_gres_list, gres_ptr2);
}
from_inx = to_inx = new_inx = -1;
for (i = i_first; i <= i_last; i++) {
bool from_match = false, to_match = false;
if (bit_test(to_job_node_bitmap, i)) {
to_match = true;
to_inx++;
}
if (bit_test(from_job_node_bitmap, i)) {
from_match = true;
from_inx++;
}
if (from_match || to_match)
new_inx++;
if (from_match) {
if (!gres_job_ptr->gres_bit_alloc) {
;
} else if (select_hetero &&
gres_job_ptr2->
gres_bit_alloc[new_inx] &&
gres_job_ptr->gres_bit_alloc &&
gres_job_ptr->
gres_bit_alloc[from_inx]) {
/* Merge job's GRES bitmaps */
bit_or(gres_job_ptr2->
gres_bit_alloc[new_inx],
gres_job_ptr->
gres_bit_alloc[from_inx]);
} else if (gres_job_ptr2->
gres_bit_alloc[new_inx]) {
/* Keep original job's GRES bitmap */
} else {
gres_job_ptr2->gres_bit_alloc[new_inx] =
gres_job_ptr->
gres_bit_alloc[from_inx];
gres_job_ptr->
gres_bit_alloc
[from_inx] = NULL;
}
if (!gres_job_ptr->gres_cnt_node_alloc) {
;
} else if (select_hetero &&
gres_job_ptr2->
gres_cnt_node_alloc[new_inx] &&
gres_job_ptr->gres_cnt_node_alloc &&
gres_job_ptr->
gres_cnt_node_alloc[from_inx]) {
gres_job_ptr2->
gres_cnt_node_alloc[new_inx] +=
gres_job_ptr->
gres_cnt_node_alloc[from_inx];
} else if (gres_job_ptr2->
gres_cnt_node_alloc[new_inx]) {
/* Keep original job's GRES bitmap */
} else {
gres_job_ptr2->
gres_cnt_node_alloc[new_inx] =
gres_job_ptr->
gres_cnt_node_alloc[from_inx];
gres_job_ptr->
gres_cnt_node_alloc[from_inx] = 0;
}
if (gres_job_ptr->gres_cnt_step_alloc &&
gres_job_ptr->
gres_cnt_step_alloc[from_inx]) {
error("Attempt to merge gres, from "
"job has active steps");
}
}
}
}
list_iterator_destroy(gres_iter);
step3: slurm_mutex_unlock(&gres_context_lock);
return;
}
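/*
 * Worked example (illustrative only): if the "from" job ran on nodes
 * {tux1,tux2} and the "to" job on {tux2,tux3}, then
 * new_node_cnt = 2 + 2 - 1 (one overlapping node) = 3, and the merged
 * per-node arrays are indexed over the union {tux1,tux2,tux3}.
 */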
/*
* Set environment variables as required for a batch job
* IN/OUT job_env_ptr - environment variable array
* IN gres_list - generated by gres_plugin_job_alloc()
* IN node_inx - zero origin node index
*/
extern void gres_plugin_job_set_env(char ***job_env_ptr, List job_gres_list,
int node_inx)
{
int i;
ListIterator gres_iter;
gres_state_t *gres_ptr = NULL;
bool found;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].ops.job_set_env == NULL)
continue; /* No plugin to call */
found = false;
if (job_gres_list) {
gres_iter = list_iterator_create(job_gres_list);
while ((gres_ptr = (gres_state_t *)
list_next(gres_iter))) {
if (gres_ptr->plugin_id !=
gres_context[i].plugin_id)
continue;
(*(gres_context[i].ops.job_set_env))
(job_env_ptr, gres_ptr->gres_data,
node_inx);
found = true;
}
list_iterator_destroy(gres_iter);
}
/*
* We call the gres plugin's job_set_env even if this gres is not
* requested in the job. This may be convenient for certain
* plugins, e.g. setting an env variable to say the GRES is not
* available.
*/
if (!found) {
(*(gres_context[i].ops.job_set_env))
(job_env_ptr, NULL, node_inx);
}
}
slurm_mutex_unlock(&gres_context_lock);
}
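/*
 * Usage sketch (illustrative only, hypothetical variables): called when
 * building a batch job's environment on its first node:
 *
 *	gres_plugin_job_set_env(&env_array, job_ptr->gres_list, 0);
 */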
/*
* Set job default parameters in a given element of a list
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN gres_name - name of gres, apply defaults to all elements (e.g. updates to
* gres_name="gpu" would apply to "gpu:tesla", "gpu:volta", etc.)
* IN cpu_per_gpu - value to set as default
* IN mem_per_gpu - value to set as default
*/
extern void gres_plugin_job_set_defs(List job_gres_list, char *gres_name,
uint64_t cpu_per_gpu,
uint64_t mem_per_gpu)
{
uint32_t plugin_id;
ListIterator gres_iter;
gres_state_t *gres_ptr = NULL;
gres_job_state_t *job_gres_data;
if (!job_gres_list)
return;
plugin_id = gres_plugin_build_id(gres_name);
gres_iter = list_iterator_create(job_gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
if (gres_ptr->plugin_id != plugin_id)
continue;
job_gres_data = (gres_job_state_t *) gres_ptr->gres_data;
if (!job_gres_data)
continue;
job_gres_data->def_cpus_per_gres = cpu_per_gpu;
job_gres_data->def_mem_per_gres = mem_per_gpu;
}
list_iterator_destroy(gres_iter);
}
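/*
 * Usage sketch (illustrative only, hypothetical values): apply partition
 * defaults of DefCpuPerGPU=2 and DefMemPerGPU=8192 to every gres/gpu
 * record in the job's list, typed records such as "gpu:tesla" included:
 *
 *	gres_plugin_job_set_defs(job_ptr->gres_list, "gpu", 2, 8192);
 */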
/*
* Translate GRES flag to string.
* NOT reentrant
*/
static char *_gres_flags_str(uint16_t flags)
{
if (flags & GRES_NO_CONSUME)
return "no_consume";
return "";
}
static void _job_state_log(void *gres_data, uint32_t job_id, uint32_t plugin_id)
{
gres_job_state_t *gres_ptr;
char *sparse_msg = "", tmp_str[128];
int i;
xassert(gres_data);
gres_ptr = (gres_job_state_t *) gres_data;
info("gres:%s(%u) type:%s(%u) job:%u flags:%s state",
gres_ptr->gres_name, plugin_id, gres_ptr->type_name,
gres_ptr->type_id, job_id, _gres_flags_str(gres_ptr->flags));
if (gres_ptr->cpus_per_gres)
info(" cpus_per_gres:%u", gres_ptr->cpus_per_gres);
else if (gres_ptr->def_cpus_per_gres)
info(" def_cpus_per_gres:%u", gres_ptr->def_cpus_per_gres);
if (gres_ptr->gres_per_job)
info(" gres_per_job:%"PRIu64, gres_ptr->gres_per_job);
if (gres_ptr->gres_per_node) {
info(" gres_per_node:%"PRIu64" node_cnt:%u",
gres_ptr->gres_per_node, gres_ptr->node_cnt);
}
if (gres_ptr->gres_per_socket)
info(" gres_per_socket:%"PRIu64, gres_ptr->gres_per_socket);
if (gres_ptr->gres_per_task)
info(" gres_per_task:%"PRIu64, gres_ptr->gres_per_task);
if (gres_ptr->mem_per_gres)
info(" mem_per_gres:%"PRIu64, gres_ptr->mem_per_gres);
else if (gres_ptr->def_mem_per_gres)
info(" def_mem_per_gres:%"PRIu64, gres_ptr->def_mem_per_gres);
if (gres_ptr->node_cnt == 0)
return;
if (gres_ptr->gres_bit_alloc == NULL)
info(" gres_bit_alloc:NULL");
if (gres_ptr->gres_cnt_node_alloc == NULL)
info(" gres_cnt_node_alloc:NULL");
if (gres_ptr->gres_bit_step_alloc == NULL)
info(" gres_bit_step_alloc:NULL");
if (gres_ptr->gres_cnt_step_alloc == NULL)
info(" gres_cnt_step_alloc:NULL");
if (gres_ptr->gres_bit_select == NULL)
info(" gres_bit_select:NULL");
if (gres_ptr->gres_cnt_node_select == NULL)
info(" gres_cnt_node_select:NULL");
for (i = 0; i < gres_ptr->node_cnt; i++) {
if (gres_ptr->gres_cnt_node_alloc &&
gres_ptr->gres_cnt_node_alloc[i]) {
info(" gres_cnt_node_alloc[%d]:%"PRIu64,
i, gres_ptr->gres_cnt_node_alloc[i]);
} else if (gres_ptr->gres_cnt_node_alloc)
info(" gres_cnt_node_alloc[%d]:NULL", i);
if (gres_ptr->gres_bit_alloc && gres_ptr->gres_bit_alloc[i]) {
bit_fmt(tmp_str, sizeof(tmp_str),
gres_ptr->gres_bit_alloc[i]);
info(" gres_bit_alloc[%d]:%s of %d", i, tmp_str,
(int) bit_size(gres_ptr->gres_bit_alloc[i]));
} else if (gres_ptr->gres_bit_alloc)
info(" gres_bit_alloc[%d]:NULL", i);
if (gres_ptr->gres_bit_step_alloc &&
gres_ptr->gres_bit_step_alloc[i]) {
bit_fmt(tmp_str, sizeof(tmp_str),
gres_ptr->gres_bit_step_alloc[i]);
info(" gres_bit_step_alloc[%d]:%s of %d", i, tmp_str,
(int) bit_size(gres_ptr->gres_bit_step_alloc[i]));
} else if (gres_ptr->gres_bit_step_alloc)
info(" gres_bit_step_alloc[%d]:NULL", i);
if (gres_ptr->gres_cnt_step_alloc) {
info(" gres_cnt_step_alloc[%d]:%"PRIu64"", i,
gres_ptr->gres_cnt_step_alloc[i]);
}
}
/*
* These arrays are only used for resource selection and may include
* data for many nodes not used in the resources eventually allocated
* to this job.
*/
if (gres_ptr->total_node_cnt)
sparse_msg = " (sparsely populated for resource selection)";
info(" total_node_cnt:%u%s", gres_ptr->total_node_cnt, sparse_msg);
for (i = 0; i < gres_ptr->total_node_cnt; i++) {
if (gres_ptr->gres_cnt_node_select &&
gres_ptr->gres_cnt_node_select[i]) {
info(" gres_cnt_node_select[%d]:%"PRIu64,
i, gres_ptr->gres_cnt_node_select[i]);
}
if (gres_ptr->gres_bit_select &&
gres_ptr->gres_bit_select[i]) {
bit_fmt(tmp_str, sizeof(tmp_str),
gres_ptr->gres_bit_select[i]);
info(" gres_bit_select[%d]:%s of %d", i, tmp_str,
(int) bit_size(gres_ptr->gres_bit_select[i]));
}
}
}
/*
* Extract from the job record's gres_list the count of allocated resources of
* the named gres type.
* IN job_gres_list - job record's gres_list.
* IN gres_name_type - the name of the gres type to retrieve the associated
* value from.
* RET The value associated with the gres type or NO_VAL64 if not found.
*/
extern uint64_t gres_plugin_get_job_value_by_type(List job_gres_list,
char *gres_name_type)
{
uint64_t gres_val;
uint32_t gres_name_type_id;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
if (job_gres_list == NULL)
return NO_VAL64;
slurm_mutex_lock(&gres_context_lock);
gres_name_type_id = gres_plugin_build_id(gres_name_type);
gres_val = NO_VAL64;
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
if (job_gres_ptr->plugin_id == gres_name_type_id) {
gres_val = ((gres_job_state_t *)
(job_gres_ptr->gres_data))->gres_per_node;
break;
}
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return gres_val;
}
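/*
 * Usage sketch (illustrative only): look up a job's per-node GPU count;
 * NO_VAL64 indicates the job requested no such gres:
 *
 *	uint64_t gpu_cnt = gres_plugin_get_job_value_by_type(
 *				job_ptr->gres_list, "gpu");
 *	if (gpu_cnt != NO_VAL64)
 *		... account for gpu_cnt GPUs per node ...
 */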
/*
* Log a job's current gres state
* IN gres_list - generated by gres_plugin_job_state_validate()
* IN job_id - job's ID
*/
extern void gres_plugin_job_state_log(List gres_list, uint32_t job_id)
{
ListIterator gres_iter;
gres_state_t *gres_ptr;
if (!gres_debug || (gres_list == NULL))
return;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
_job_state_log(gres_ptr->gres_data, job_id,
gres_ptr->plugin_id);
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
}
static int _find_device(void *x, void *key)
{
gres_device_t *device_x = (gres_device_t *)x;
gres_device_t *device_key = (gres_device_t *)key;
if (!xstrcmp(device_x->path, device_key->path))
return 1;
return 0;
}
extern List gres_plugin_get_allocated_devices(List gres_list, bool is_job)
{
int i, j;
ListIterator gres_itr, dev_itr;
gres_state_t *gres_ptr;
bitstr_t **local_bit_alloc = NULL;
uint32_t node_cnt;
gres_device_t *gres_device;
List gres_devices;
List device_list = NULL;
(void) gres_plugin_init();
/*
* Create a unique device list of all possible GRES device files.
* Initialize each device to deny.
*/
for (j = 0; j < gres_context_cnt; j++) {
if (!gres_context[j].ops.get_devices)
continue;
gres_devices = (*(gres_context[j].ops.get_devices))();
if (!gres_devices || !list_count(gres_devices))
continue;
dev_itr = list_iterator_create(gres_devices);
while ((gres_device = list_next(dev_itr))) {
if (!device_list)
device_list = list_create(NULL);
gres_device->alloc = 0;
/*
* Keep the list unique by not adding duplicates (in the
* case of MPS and GPU)
*/
if (!list_find_first(device_list, _find_device,
gres_device))
list_append(device_list, gres_device);
}
list_iterator_destroy(dev_itr);
}
if (!gres_list)
return device_list;
slurm_mutex_lock(&gres_context_lock);
gres_itr = list_iterator_create(gres_list);
while ((gres_ptr = list_next(gres_itr))) {
for (j = 0; j < gres_context_cnt; j++) {
if (gres_ptr->plugin_id == gres_context[j].plugin_id)
break;
}
if (j >= gres_context_cnt) {
error("We were unable to find the gres in the context!!! This should never happen");
continue;
}
if (!gres_ptr->gres_data)
continue;
if (is_job) {
gres_job_state_t *gres_data_ptr =
(gres_job_state_t *)gres_ptr->gres_data;
local_bit_alloc = gres_data_ptr->gres_bit_alloc;
node_cnt = gres_data_ptr->node_cnt;
} else {
gres_step_state_t *gres_data_ptr =
(gres_step_state_t *)gres_ptr->gres_data;
local_bit_alloc = gres_data_ptr->gres_bit_alloc;
node_cnt = gres_data_ptr->node_cnt;
}
if ((node_cnt != 1) ||
!local_bit_alloc ||
!local_bit_alloc[0] ||
!gres_context[j].ops.get_devices)
continue;
gres_devices = (*(gres_context[j].ops.get_devices))();
if (!gres_devices) {
error("We should had got gres_devices, but for some reason none were set in the plugin.");
continue;
} else if ((int)bit_size(local_bit_alloc[0]) !=
list_count(gres_devices)) {
error("We got %d gres devices when we were only told about %d. This should never happen.",
list_count(gres_devices),
(int)bit_size(local_bit_alloc[0]));
continue;
}
dev_itr = list_iterator_create(gres_devices);
i = 0;
while ((gres_device = list_next(dev_itr))) {
if (bit_test(local_bit_alloc[0], i)) {
gres_device_t *gres_device2;
/*
* search for the device among the unique
* devices list (since two plugins could have
* device records that point to the same file,
* like with GPU and MPS)
*/
gres_device2 = list_find_first(device_list,
_find_device,
gres_device);
/*
* Set both, in case they point to different
* records
*/
gres_device->alloc = 1;
if (gres_device2)
gres_device2->alloc = 1;
}
//info("%d is %d", i, gres_device->alloc);
i++;
}
list_iterator_destroy(dev_itr);
}
list_iterator_destroy(gres_itr);
slurm_mutex_unlock(&gres_context_lock);
return device_list;
}
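/*
 * Usage sketch (illustrative only, hypothetical variables): build the
 * device list for a job, then allow or deny each device file based on
 * its alloc flag (e.g. when constructing a devices cgroup):
 *
 *	List dev_list = gres_plugin_get_allocated_devices(job_gres_list,
 *							  true);
 *	if (dev_list) {
 *		ListIterator itr = list_iterator_create(dev_list);
 *		gres_device_t *dev;
 *		while ((dev = list_next(itr))) {
 *			if (dev->alloc)
 *				... allow dev->path ...
 *			else
 *				... deny dev->path ...
 *		}
 *		list_iterator_destroy(itr);
 *	}
 */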
static void _step_state_delete(void *gres_data)
{
int i;
gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
if (gres_ptr == NULL)
return;
FREE_NULL_BITMAP(gres_ptr->node_in_use);
if (gres_ptr->gres_bit_alloc) {
for (i = 0; i < gres_ptr->node_cnt; i++)
FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]);
xfree(gres_ptr->gres_bit_alloc);
}
xfree(gres_ptr->gres_cnt_node_alloc);
xfree(gres_ptr->type_name);
xfree(gres_ptr);
}
static void _gres_step_list_delete(void *list_element)
{
gres_state_t *gres_ptr = (gres_state_t *) list_element;
_step_state_delete(gres_ptr->gres_data);
xfree(gres_ptr);
}
static uint64_t _step_test(void *step_gres_data, void *job_gres_data,
int node_offset, bool first_step_node,
uint16_t cpus_per_task, int max_rem_nodes,
bool ignore_alloc,
uint32_t job_id, uint32_t step_id,
uint32_t plugin_id)
{
gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data;
uint64_t core_cnt, gres_cnt, min_gres = 1, task_cnt;
xassert(job_gres_ptr);
xassert(step_gres_ptr);
if ((node_offset >= job_gres_ptr->node_cnt) &&
(job_gres_ptr->node_cnt != 0)) { /* GRES is type no_consume */
error("gres/%s: %s %u.%u node offset invalid (%d >= %u)",
job_gres_ptr->gres_name, __func__, job_id,
step_id, node_offset,
job_gres_ptr->node_cnt);
return 0;
}
if (first_step_node) {
if (ignore_alloc)
step_gres_ptr->gross_gres = 0;
else
step_gres_ptr->total_gres = 0;
}
if (step_gres_ptr->gres_per_node)
min_gres = step_gres_ptr->gres_per_node;
if (step_gres_ptr->gres_per_socket)
min_gres = MAX(min_gres, step_gres_ptr->gres_per_socket);
if (step_gres_ptr->gres_per_task)
min_gres = MAX(min_gres, step_gres_ptr->gres_per_task);
if (step_gres_ptr->gres_per_step &&
(step_gres_ptr->gres_per_step > step_gres_ptr->total_gres) &&
(max_rem_nodes == 1)) {
gres_cnt = step_gres_ptr->gres_per_step;
if (ignore_alloc)
gres_cnt -= step_gres_ptr->gross_gres;
else
gres_cnt -= step_gres_ptr->total_gres;
min_gres = MAX(min_gres, gres_cnt);
}
if (!_shared_gres(plugin_id) &&
job_gres_ptr->gres_bit_alloc &&
job_gres_ptr->gres_bit_alloc[node_offset]) {
gres_cnt = bit_set_count(job_gres_ptr->
gres_bit_alloc[node_offset]);
if (!ignore_alloc &&
job_gres_ptr->gres_bit_step_alloc &&
job_gres_ptr->gres_bit_step_alloc[node_offset]) {
gres_cnt -= bit_set_count(job_gres_ptr->
gres_bit_step_alloc
[node_offset]);
}
if (min_gres > gres_cnt) {
core_cnt = 0;
} else if (step_gres_ptr->gres_per_task) {
task_cnt = (gres_cnt + step_gres_ptr->gres_per_task - 1)
/ step_gres_ptr->gres_per_task;
core_cnt = task_cnt * cpus_per_task;
} else
core_cnt = NO_VAL64;
} else if (job_gres_ptr->gres_cnt_node_alloc &&
job_gres_ptr->gres_cnt_step_alloc) {
gres_cnt = job_gres_ptr->gres_cnt_node_alloc[node_offset];
if (!ignore_alloc) {
gres_cnt -= job_gres_ptr->
gres_cnt_step_alloc[node_offset];
}
if (min_gres > gres_cnt) {
core_cnt = 0;
} else if (step_gres_ptr->gres_per_task) {
task_cnt = (gres_cnt + step_gres_ptr->gres_per_task - 1)
/ step_gres_ptr->gres_per_task;
core_cnt = task_cnt * cpus_per_task;
} else
core_cnt = NO_VAL64;
} else {
debug3("gres/%s: %s %u.%u gres_bit_alloc and gres_cnt_node_alloc are NULL",
job_gres_ptr->gres_name, __func__, job_id, step_id);
gres_cnt = 0;
core_cnt = NO_VAL64;
}
if (core_cnt != 0) {
if (ignore_alloc)
step_gres_ptr->gross_gres += gres_cnt;
else
step_gres_ptr->total_gres += gres_cnt;
}
return core_cnt;
}
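/*
 * Worked example (illustrative only, hypothetical values) of the
 * core-count math above: with 8 usable GRES on the node,
 * gres_per_task=3 and cpus_per_task=4, the rounded-up task count is
 * (8 + 3 - 1) / 3 = 3, so the step could use core_cnt = 3 * 4 = 12.
 */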
/*
* TRES specification parse logic
* in_val IN - initial input string
* cnt OUT - count of values
* gres_list IN/OUT - where to search for (or add) new step TRES record
* save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
* rc OUT - unchanged or an error code
* RET gres - step record to set value in, found or created by this function
*/
static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt,
List gres_list, char **save_ptr,
int *rc)
{
static char *prev_save_ptr = NULL;
int context_inx = NO_VAL, my_rc = SLURM_SUCCESS;
gres_step_state_t *step_gres_data = NULL;
gres_state_t *gres_ptr;
gres_key_t step_search_key;
char *type = NULL, *name = NULL;
uint16_t flags = 0;
xassert(save_ptr);
if (!in_val && (*save_ptr == NULL)) {
return NULL;
}
if (*save_ptr == NULL) {
prev_save_ptr = in_val;
} else if (*save_ptr != prev_save_ptr) {
error("%s: parsing error", __func__);
my_rc = SLURM_ERROR;
goto fini;
}
if (prev_save_ptr[0] == '\0') { /* Empty input token */
*save_ptr = NULL;
return NULL;
}
if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
cnt, &flags, &prev_save_ptr)) ||
(context_inx == NO_VAL)) {
prev_save_ptr = NULL;
goto fini;
}
/* Find the step GRES record */
step_search_key.plugin_id = gres_context[context_inx].plugin_id;
step_search_key.type_id = gres_plugin_build_id(type);
gres_ptr = list_find_first(gres_list, _gres_find_step_by_key,
&step_search_key);
if (gres_ptr) {
step_gres_data = gres_ptr->gres_data;
} else {
step_gres_data = xmalloc(sizeof(gres_step_state_t));
step_gres_data->type_id = gres_plugin_build_id(type);
step_gres_data->type_name = type;
type = NULL; /* String moved above */
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[context_inx].plugin_id;
gres_ptr->gres_data = step_gres_data;
list_append(gres_list, gres_ptr);
}
step_gres_data->flags = flags;
fini: xfree(name);
xfree(type);
if (my_rc != SLURM_SUCCESS) {
prev_save_ptr = NULL;
if (my_rc == ESLURM_INVALID_GRES)
info("Invalid GRES job specification %s", in_val);
*rc = my_rc;
}
*save_ptr = prev_save_ptr;
return step_gres_data;
}
/* Test that the step does not request more GRES than the job contains */
static void _validate_step_counts(List step_gres_list, List job_gres_list,
int *rc)
{
ListIterator iter;
gres_state_t *job_gres_ptr, *step_gres_ptr;
gres_job_state_t *job_gres_data;
gres_step_state_t *step_gres_data;
gres_key_t job_search_key;
uint16_t cpus_per_gres;
uint64_t mem_per_gres;
if (!step_gres_list || (list_count(step_gres_list) == 0))
return;
if (!job_gres_list || (list_count(job_gres_list) == 0)) {
*rc = ESLURM_INVALID_GRES;
return;
}
iter = list_iterator_create(step_gres_list);
while ((step_gres_ptr = (gres_state_t *) list_next(iter))) {
step_gres_data = (gres_step_state_t *) step_gres_ptr->gres_data;
job_search_key.plugin_id = step_gres_ptr->plugin_id;
if (step_gres_data->type_id == 0)
job_search_key.type_id = NO_VAL;
else
job_search_key.type_id = step_gres_data->type_id;
job_gres_ptr = list_find_first(job_gres_list,
_gres_find_job_by_key,
&job_search_key);
if (!job_gres_ptr || !job_gres_ptr->gres_data) {
*rc = ESLURM_INVALID_GRES;
break;
}
job_gres_data = (gres_job_state_t *) job_gres_ptr->gres_data;
if (job_gres_data->cpus_per_gres)
cpus_per_gres = job_gres_data->cpus_per_gres;
else
cpus_per_gres = job_gres_data->def_cpus_per_gres;
if (cpus_per_gres && step_gres_data->cpus_per_gres &&
(cpus_per_gres < step_gres_data->cpus_per_gres)) {
*rc = ESLURM_INVALID_GRES;
break;
}
if (job_gres_data->gres_per_job &&
step_gres_data->gres_per_step &&
(job_gres_data->gres_per_job <
step_gres_data->gres_per_step)) {
*rc = ESLURM_INVALID_GRES;
break;
}
if (job_gres_data->gres_per_node &&
step_gres_data->gres_per_node &&
(job_gres_data->gres_per_node <
step_gres_data->gres_per_node)) {
*rc = ESLURM_INVALID_GRES;
break;
}
if (job_gres_data->gres_per_socket &&
step_gres_data->gres_per_socket &&
(job_gres_data->gres_per_socket <
step_gres_data->gres_per_socket)) {
*rc = ESLURM_INVALID_GRES;
break;
}
if (job_gres_data->gres_per_task &&
step_gres_data->gres_per_task &&
(job_gres_data->gres_per_task <
step_gres_data->gres_per_task)) {
*rc = ESLURM_INVALID_GRES;
break;
}
if (job_gres_data->mem_per_gres)
mem_per_gres = job_gres_data->mem_per_gres;
else
mem_per_gres = job_gres_data->def_mem_per_gres;
if (mem_per_gres && step_gres_data->mem_per_gres &&
(mem_per_gres < step_gres_data->mem_per_gres)) {
*rc = ESLURM_INVALID_GRES;
break;
}
}
list_iterator_destroy(iter);
}
/*
* Given a step's requested gres configuration, validate it and build gres list
* IN *tres* - step's requested gres input string
* OUT step_gres_list - List of Gres records for this step to track usage
* IN job_gres_list - List of Gres records for this job
* IN job_id, step_id - ID of the step being allocated.
* RET SLURM_SUCCESS or ESLURM_INVALID_GRES
*/
extern int gres_plugin_step_state_validate(char *cpus_per_tres,
char *tres_per_step,
char *tres_per_node,
char *tres_per_socket,
char *tres_per_task,
char *mem_per_tres,
List *step_gres_list,
List job_gres_list, uint32_t job_id,
uint32_t step_id)
{
int rc;
gres_step_state_t *step_gres_data;
List new_step_list;
uint64_t cnt = 0;
*step_gres_list = NULL;
if ((rc = gres_plugin_init()) != SLURM_SUCCESS)
return rc;
slurm_mutex_lock(&gres_context_lock);
new_step_list = list_create(_gres_step_list_delete);
if (cpus_per_tres) {
char *in_val = cpus_per_tres, *save_ptr = NULL;
while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
new_step_list,
&save_ptr, &rc))) {
step_gres_data->cpus_per_gres = cnt;
in_val = NULL;
}
}
if (tres_per_step) {
char *in_val = tres_per_step, *save_ptr = NULL;
while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
new_step_list,
&save_ptr, &rc))) {
step_gres_data->gres_per_step = cnt;
in_val = NULL;
}
}
if (tres_per_node) {
char *in_val = tres_per_node, *save_ptr = NULL;
while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
new_step_list,
&save_ptr, &rc))) {
step_gres_data->gres_per_node = cnt;
in_val = NULL;
}
}
if (tres_per_socket) {
char *in_val = tres_per_socket, *save_ptr = NULL;
while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
new_step_list,
&save_ptr, &rc))) {
step_gres_data->gres_per_socket = cnt;
in_val = NULL;
}
}
if (tres_per_task) {
char *in_val = tres_per_task, *save_ptr = NULL;
while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
new_step_list,
&save_ptr, &rc))) {
step_gres_data->gres_per_task = cnt;
in_val = NULL;
}
}
if (mem_per_tres) {
char *in_val = mem_per_tres, *save_ptr = NULL;
while ((step_gres_data = _get_next_step_gres(in_val, &cnt,
new_step_list,
&save_ptr, &rc))) {
step_gres_data->mem_per_gres = cnt;
in_val = NULL;
}
}
if (list_count(new_step_list) == 0) {
FREE_NULL_LIST(new_step_list);
} else {
if (rc == SLURM_SUCCESS)
_validate_step_counts(new_step_list, job_gres_list,
&rc);
if (rc == SLURM_SUCCESS)
*step_gres_list = new_step_list;
else
FREE_NULL_LIST(new_step_list);
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
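/*
 * Usage sketch (illustrative only, hypothetical values): validate a step
 * requesting one GPU per node against its job's gres list:
 *
 *	List step_gres_list = NULL;
 *	rc = gres_plugin_step_state_validate(NULL, NULL, "gpu:1", NULL,
 *					     NULL, NULL, &step_gres_list,
 *					     job_ptr->gres_list,
 *					     job_ptr->job_id, step_id);
 */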
static void *_step_state_dup(void *gres_data)
{
int i;
gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
gres_step_state_t *new_gres_ptr;
xassert(gres_ptr);
new_gres_ptr = xmalloc(sizeof(gres_step_state_t));
new_gres_ptr->cpus_per_gres = gres_ptr->cpus_per_gres;
new_gres_ptr->gres_per_step = gres_ptr->gres_per_step;
new_gres_ptr->gres_per_node = gres_ptr->gres_per_node;
new_gres_ptr->gres_per_socket = gres_ptr->gres_per_socket;
new_gres_ptr->gres_per_task = gres_ptr->gres_per_task;
new_gres_ptr->mem_per_gres = gres_ptr->mem_per_gres;
new_gres_ptr->node_cnt = gres_ptr->node_cnt;
new_gres_ptr->total_gres = gres_ptr->total_gres;
if (gres_ptr->node_in_use)
new_gres_ptr->node_in_use = bit_copy(gres_ptr->node_in_use);
if (gres_ptr->gres_cnt_node_alloc) {
i = sizeof(uint64_t) * gres_ptr->node_cnt;
new_gres_ptr->gres_cnt_node_alloc = xmalloc(i);
memcpy(new_gres_ptr->gres_cnt_node_alloc,
gres_ptr->gres_cnt_node_alloc, i);
}
if (gres_ptr->gres_bit_alloc) {
new_gres_ptr->gres_bit_alloc = xcalloc(gres_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_ptr->node_cnt; i++) {
if (gres_ptr->gres_bit_alloc[i] == NULL)
continue;
new_gres_ptr->gres_bit_alloc[i] =
bit_copy(gres_ptr->gres_bit_alloc[i]);
}
}
return new_gres_ptr;
}
static void *_step_state_dup2(void *gres_data, int node_index)
{
gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
gres_step_state_t *new_gres_ptr;
xassert(gres_ptr);
new_gres_ptr = xmalloc(sizeof(gres_step_state_t));
new_gres_ptr->cpus_per_gres = gres_ptr->cpus_per_gres;
new_gres_ptr->gres_per_step = gres_ptr->gres_per_step;
new_gres_ptr->gres_per_node = gres_ptr->gres_per_node;
new_gres_ptr->gres_per_socket = gres_ptr->gres_per_socket;
new_gres_ptr->gres_per_task = gres_ptr->gres_per_task;
new_gres_ptr->mem_per_gres = gres_ptr->mem_per_gres;
new_gres_ptr->node_cnt = 1;
new_gres_ptr->total_gres = gres_ptr->total_gres;
if (gres_ptr->node_in_use)
new_gres_ptr->node_in_use = bit_copy(gres_ptr->node_in_use);
if (gres_ptr->gres_cnt_node_alloc) {
new_gres_ptr->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t));
new_gres_ptr->gres_cnt_node_alloc[0] =
gres_ptr->gres_cnt_node_alloc[node_index];
}
if ((node_index < gres_ptr->node_cnt) && gres_ptr->gres_bit_alloc &&
gres_ptr->gres_bit_alloc[node_index]) {
new_gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *));
new_gres_ptr->gres_bit_alloc[0] =
bit_copy(gres_ptr->gres_bit_alloc[node_index]);
}
return new_gres_ptr;
}
/*
* Create a copy of a step's gres state
* IN gres_list - List of Gres records for this step to track usage
* RET The copy or NULL on failure
*/
List gres_plugin_step_state_dup(List gres_list)
{
return gres_plugin_step_state_extract(gres_list, -1);
}
/*
* Create a copy of a step's gres state for a particular node index
* IN gres_list - List of Gres records for this step to track usage
* IN node_index - zero-origin index to the node, or -1 to copy the state
*	for all nodes in the step's allocation
* RET The copy or NULL on failure
*/
List gres_plugin_step_state_extract(List gres_list, int node_index)
{
ListIterator gres_iter;
gres_state_t *gres_ptr, *new_gres_state;
List new_gres_list = NULL;
void *new_gres_data;
if (gres_list == NULL)
return new_gres_list;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
if (node_index == -1)
new_gres_data = _step_state_dup(gres_ptr->gres_data);
else {
new_gres_data = _step_state_dup2(gres_ptr->gres_data,
node_index);
}
if (new_gres_list == NULL) {
new_gres_list = list_create(_gres_step_list_delete);
}
new_gres_state = xmalloc(sizeof(gres_state_t));
new_gres_state->plugin_id = gres_ptr->plugin_id;
new_gres_state->gres_data = new_gres_data;
list_append(new_gres_list, new_gres_state);
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return new_gres_list;
}
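/*
 * Editorial usage sketch (hypothetical caller, not code from this file):
 * duplicating a step's full gres state versus extracting the state for a
 * single node of its allocation.
 *
 *	List whole_copy = gres_plugin_step_state_dup(step_gres_list);
 *	List node0_copy = gres_plugin_step_state_extract(step_gres_list, 0);
 *	...
 *	FREE_NULL_LIST(whole_copy);
 *	FREE_NULL_LIST(node0_copy);
 */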
/*
* A job allocation size has changed. Update the job step gres information
* bitmaps and other data structures.
* IN gres_list - List of Gres records for this step to track usage
* IN orig_job_node_bitmap - bitmap of nodes in the original job allocation
* IN new_job_node_bitmap - bitmap of nodes in the new job allocation
*/
void gres_plugin_step_state_rebase(List gres_list,
bitstr_t *orig_job_node_bitmap,
bitstr_t *new_job_node_bitmap)
{
ListIterator gres_iter;
gres_state_t *gres_ptr;
gres_step_state_t *gres_step_ptr;
int new_node_cnt;
int i_first, i_last, i;
int old_inx, new_inx;
bitstr_t *new_node_in_use;
bitstr_t **new_gres_bit_alloc = NULL;
if (gres_list == NULL)
return;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;
if (!gres_step_ptr)
continue;
if (!gres_step_ptr->node_in_use) {
error("gres_plugin_step_state_rebase: node_in_use is NULL");
continue;
}
new_node_cnt = bit_set_count(new_job_node_bitmap);
i_first = MIN(bit_ffs(orig_job_node_bitmap),
bit_ffs(new_job_node_bitmap));
i_first = MAX(i_first, 0);
i_last = MAX(bit_fls(orig_job_node_bitmap),
bit_fls(new_job_node_bitmap));
if (i_last == -1) {
error("gres_plugin_step_state_rebase: node_bitmaps "
"are empty");
continue;
}
new_node_in_use = bit_alloc(new_node_cnt);
old_inx = new_inx = -1;
for (i = i_first; i <= i_last; i++) {
bool old_match = false, new_match = false;
if (bit_test(orig_job_node_bitmap, i)) {
old_match = true;
old_inx++;
}
if (bit_test(new_job_node_bitmap, i)) {
new_match = true;
new_inx++;
}
if (old_match && new_match) {
bit_set(new_node_in_use, new_inx);
if (gres_step_ptr->gres_bit_alloc) {
if (!new_gres_bit_alloc) {
new_gres_bit_alloc =
xcalloc(new_node_cnt,
sizeof(bitstr_t *));
}
new_gres_bit_alloc[new_inx] =
gres_step_ptr->gres_bit_alloc[old_inx];
}
} else if (old_match &&
gres_step_ptr->gres_bit_alloc &&
gres_step_ptr->gres_bit_alloc[old_inx]) {
/* Node removed from job allocation,
* release step's resources */
bit_free(gres_step_ptr->
gres_bit_alloc[old_inx]);
}
}
gres_step_ptr->node_cnt = new_node_cnt;
bit_free(gres_step_ptr->node_in_use);
gres_step_ptr->node_in_use = new_node_in_use;
xfree(gres_step_ptr->gres_bit_alloc);
gres_step_ptr->gres_bit_alloc = new_gres_bit_alloc;
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return;
}
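/*
 * Worked example for the rebase loop above (illustrative node numbers):
 * with orig_job_node_bitmap = {1,2,3} and new_job_node_bitmap = {2,3,4},
 * the scan over node indices 1..4 proceeds as:
 *	node 1: in old only -> step's gres_bit_alloc[0] is released
 *	node 2: in both, old_inx=1, new_inx=0 -> bitmap moves to slot 0
 *	node 3: in both, old_inx=2, new_inx=1 -> bitmap moves to slot 1
 *	node 4: in new only -> no step GRES carried over to that node
 */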
/*
* Pack a step's current gres status, called from slurmctld for save/restore
* IN gres_list - generated by gres_plugin_step_alloc()
* IN/OUT buffer - location to write state to
* IN job_id, step_id - job and step ID for logging
*/
extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
uint32_t job_id, uint32_t step_id,
uint16_t protocol_version)
{
int i, rc = SLURM_SUCCESS;
uint32_t top_offset, tail_offset, magic = GRES_MAGIC;
uint16_t rec_cnt = 0;
ListIterator gres_iter;
gres_state_t *gres_ptr;
gres_step_state_t *gres_step_ptr;
top_offset = get_buf_offset(buffer);
pack16(rec_cnt, buffer); /* placeholder, overwritten below */
if (gres_list == NULL)
return rc;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;
if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
pack32(magic, buffer);
pack32(gres_ptr->plugin_id, buffer);
pack16(gres_step_ptr->cpus_per_gres, buffer);
pack16(gres_step_ptr->flags, buffer);
pack64(gres_step_ptr->gres_per_step, buffer);
pack64(gres_step_ptr->gres_per_node, buffer);
pack64(gres_step_ptr->gres_per_socket, buffer);
pack64(gres_step_ptr->gres_per_task, buffer);
pack64(gres_step_ptr->mem_per_gres, buffer);
pack64(gres_step_ptr->total_gres, buffer);
pack32(gres_step_ptr->node_cnt, buffer);
pack_bit_str_hex(gres_step_ptr->node_in_use, buffer);
if (gres_step_ptr->gres_cnt_node_alloc) {
pack8((uint8_t) 1, buffer);
pack64_array(gres_step_ptr->gres_cnt_node_alloc,
gres_step_ptr->node_cnt, buffer);
} else {
pack8((uint8_t) 0, buffer);
}
if (gres_step_ptr->gres_bit_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_step_ptr->node_cnt; i++)
pack_bit_str_hex(gres_step_ptr->
gres_bit_alloc[i],
buffer);
} else {
pack8((uint8_t) 0, buffer);
}
rec_cnt++;
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
pack32(magic, buffer);
pack32(gres_ptr->plugin_id, buffer);
pack16(gres_step_ptr->cpus_per_gres, buffer);
pack64(gres_step_ptr->gres_per_step, buffer);
pack64(gres_step_ptr->gres_per_node, buffer);
pack64(gres_step_ptr->gres_per_socket, buffer);
pack64(gres_step_ptr->gres_per_task, buffer);
pack64(gres_step_ptr->mem_per_gres, buffer);
pack64(gres_step_ptr->total_gres, buffer);
pack32(gres_step_ptr->node_cnt, buffer);
pack_bit_str_hex(gres_step_ptr->node_in_use, buffer);
if (gres_step_ptr->gres_cnt_node_alloc) {
pack8((uint8_t) 1, buffer);
pack64_array(gres_step_ptr->gres_cnt_node_alloc,
gres_step_ptr->node_cnt, buffer);
} else {
pack8((uint8_t) 0, buffer);
}
if (gres_step_ptr->gres_bit_alloc) {
pack8((uint8_t) 1, buffer);
for (i = 0; i < gres_step_ptr->node_cnt; i++)
pack_bit_str_hex(gres_step_ptr->
gres_bit_alloc[i],
buffer);
} else {
pack8((uint8_t) 0, buffer);
}
rec_cnt++;
} else {
error("%s: protocol_version %hu not supported",
__func__, protocol_version);
break;
}
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
tail_offset = get_buf_offset(buffer);
set_buf_offset(buffer, top_offset);
pack16(rec_cnt, buffer);
set_buf_offset(buffer, tail_offset);
return rc;
}
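/*
 * A minimal sketch of the record-count placeholder pattern used above
 * (hypothetical standalone buffer; assumes only the pack API declared in
 * src/common/pack.h):
 *
 *	uint16_t cnt = 0;
 *	Buf buf = init_buf(1024);
 *	uint32_t top = get_buf_offset(buf);
 *	pack16(cnt, buf);		// placeholder, count not yet known
 *	// ... pack each record, incrementing cnt ...
 *	uint32_t tail = get_buf_offset(buf);
 *	set_buf_offset(buf, top);
 *	pack16(cnt, buf);		// overwrite placeholder with real count
 *	set_buf_offset(buf, tail);
 *	free_buf(buf);
 */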
/*
* Unpack a step's current gres status, called from slurmctld for save/restore
* OUT gres_list - restored state stored by gres_plugin_step_state_pack()
* IN/OUT buffer - location to read state from
* IN job_id, step_id - job and step ID for logging
*/
extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
uint32_t job_id, uint32_t step_id,
uint16_t protocol_version)
{
int i, rc;
uint32_t magic = 0, plugin_id = 0, uint32_tmp = 0;
uint16_t rec_cnt = 0;
uint8_t data_flag = 0;
gres_state_t *gres_ptr;
gres_step_state_t *gres_step_ptr = NULL;
safe_unpack16(&rec_cnt, buffer);
if (rec_cnt == 0)
return SLURM_SUCCESS;
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
*gres_list = list_create(_gres_step_list_delete);
}
while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
if ((buffer == NULL) || (remaining_buf(buffer) == 0))
break;
rec_cnt--;
if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
safe_unpack32(&magic, buffer);
if (magic != GRES_MAGIC)
goto unpack_error;
safe_unpack32(&plugin_id, buffer);
gres_step_ptr = xmalloc(sizeof(gres_step_state_t));
safe_unpack16(&gres_step_ptr->cpus_per_gres, buffer);
safe_unpack16(&gres_step_ptr->flags, buffer);
safe_unpack64(&gres_step_ptr->gres_per_step, buffer);
safe_unpack64(&gres_step_ptr->gres_per_node, buffer);
safe_unpack64(&gres_step_ptr->gres_per_socket, buffer);
safe_unpack64(&gres_step_ptr->gres_per_task, buffer);
safe_unpack64(&gres_step_ptr->mem_per_gres, buffer);
safe_unpack64(&gres_step_ptr->total_gres, buffer);
safe_unpack32(&gres_step_ptr->node_cnt, buffer);
if (gres_step_ptr->node_cnt > NO_VAL)
goto unpack_error;
unpack_bit_str_hex(&gres_step_ptr->node_in_use, buffer);
safe_unpack8(&data_flag, buffer);
if (data_flag) {
safe_unpack64_array(
&gres_step_ptr->gres_cnt_node_alloc,
&uint32_tmp, buffer);
}
safe_unpack8(&data_flag, buffer);
if (data_flag) {
gres_step_ptr->gres_bit_alloc =
xcalloc(gres_step_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_step_ptr->node_cnt; i++) {
unpack_bit_str_hex(&gres_step_ptr->
gres_bit_alloc[i],
buffer);
}
}
} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
safe_unpack32(&magic, buffer);
if (magic != GRES_MAGIC)
goto unpack_error;
safe_unpack32(&plugin_id, buffer);
gres_step_ptr = xmalloc(sizeof(gres_step_state_t));
safe_unpack16(&gres_step_ptr->cpus_per_gres, buffer);
safe_unpack64(&gres_step_ptr->gres_per_step, buffer);
safe_unpack64(&gres_step_ptr->gres_per_node, buffer);
safe_unpack64(&gres_step_ptr->gres_per_socket, buffer);
safe_unpack64(&gres_step_ptr->gres_per_task, buffer);
safe_unpack64(&gres_step_ptr->mem_per_gres, buffer);
safe_unpack64(&gres_step_ptr->total_gres, buffer);
safe_unpack32(&gres_step_ptr->node_cnt, buffer);
if (gres_step_ptr->node_cnt > NO_VAL)
goto unpack_error;
unpack_bit_str_hex(&gres_step_ptr->node_in_use, buffer);
safe_unpack8(&data_flag, buffer);
if (data_flag) {
safe_unpack64_array(
&gres_step_ptr->gres_cnt_node_alloc,
&uint32_tmp, buffer);
}
safe_unpack8(&data_flag, buffer);
if (data_flag) {
gres_step_ptr->gres_bit_alloc =
xcalloc(gres_step_ptr->node_cnt,
sizeof(bitstr_t *));
for (i = 0; i < gres_step_ptr->node_cnt; i++) {
unpack_bit_str_hex(&gres_step_ptr->
gres_bit_alloc[i],
buffer);
}
}
} else {
error("%s: protocol_version %hu not supported",
__func__, protocol_version);
goto unpack_error;
}
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].plugin_id == plugin_id)
break;
}
if (i >= gres_context_cnt) {
/*
* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data.
*/
info("%s: no plugin configured to unpack data type %u from step %u.%u",
__func__, plugin_id, job_id, step_id);
_step_state_delete(gres_step_ptr);
gres_step_ptr = NULL;
continue;
}
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[i].plugin_id;
gres_ptr->gres_data = gres_step_ptr;
gres_step_ptr = NULL;
list_append(*gres_list, gres_ptr);
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
unpack_error:
error("%s: unpack error from step %u.%u", __func__, job_id, step_id);
if (gres_step_ptr)
_step_state_delete(gres_step_ptr);
slurm_mutex_unlock(&gres_context_lock);
return SLURM_ERROR;
}
/* Return the per-node count of GRES of a specific name requested by a step
* IN step_gres_list - generated by gres_plugin_step_alloc()
* IN gres_name - name of the GRES to match
* RET summed gres_per_node of this specific GRES name or NO_VAL64 if not found
*/
extern uint64_t gres_plugin_step_count(List step_gres_list, char *gres_name)
{
uint64_t gres_cnt = NO_VAL64;
gres_state_t *gres_ptr = NULL;
gres_step_state_t *gres_step_ptr = NULL;
ListIterator gres_iter;
int i;
if (!step_gres_list)
return gres_cnt;
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
if (xstrcmp(gres_context[i].gres_name, gres_name))
continue;
gres_iter = list_iterator_create(step_gres_list);
while ((gres_ptr = (gres_state_t *)list_next(gres_iter))) {
if (gres_ptr->plugin_id != gres_context[i].plugin_id)
continue;
gres_step_ptr = (gres_step_state_t*)gres_ptr->gres_data;
if (gres_cnt == NO_VAL64)
gres_cnt = gres_step_ptr->gres_per_node;
else
gres_cnt += gres_step_ptr->gres_per_node;
}
list_iterator_destroy(gres_iter);
break;
}
slurm_mutex_unlock(&gres_context_lock);
return gres_cnt;
}
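/*
 * Example (illustrative counts): with two "gpu" records in step_gres_list
 * having gres_per_node values of 2 and 1 (e.g. a typed and an untyped
 * request), the function above returns 3; with no matching records it
 * returns NO_VAL64.
 */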
/*
* Given a GRES context index, return a bitmap representing those GRES
* which are available from the CPUs currently allocated to this process.
* This function only works with task/cgroup and constrained devices or
* if the job step has access to the entire node's resources.
*/
static bitstr_t * _get_usable_gres(int context_inx)
{
#if defined(__APPLE__)
return NULL;
#else
#ifdef __NetBSD__
// On NetBSD, cpuset_t is an opaque data type
cpuset_t *mask = cpuset_create();
#else
cpu_set_t mask;
#endif
bitstr_t *usable_gres = NULL;
int i, i_last, rc;
ListIterator iter;
gres_slurmd_conf_t *gres_slurmd_conf;
int gres_inx = 0;
if (!gres_conf_list) {
error("gres_conf_list is null!");
return NULL;
}
CPU_ZERO(&mask);
#ifdef __FreeBSD__
rc = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
sizeof(mask), &mask);
#else
rc = sched_getaffinity(0, sizeof(mask), &mask);
#endif
if (rc) {
error("sched_getaffinity error: %m");
return usable_gres;
}
usable_gres = bit_alloc(MAX_GRES_BITMAP);
iter = list_iterator_create(gres_conf_list);
while ((gres_slurmd_conf = (gres_slurmd_conf_t *) list_next(iter))) {
if (gres_slurmd_conf->plugin_id !=
gres_context[context_inx].plugin_id)
continue;
if ((gres_inx + gres_slurmd_conf->count) >= MAX_GRES_BITMAP) {
error("GRES %s bitmap overflow ((%d + %"PRIu64") >= %d)",
gres_slurmd_conf->name, gres_inx,
gres_slurmd_conf->count, MAX_GRES_BITMAP);
continue;
}
if (!gres_slurmd_conf->cpus_bitmap) {
bit_nset(usable_gres, gres_inx,
gres_inx + gres_slurmd_conf->count - 1);
} else {
i_last = bit_fls(gres_slurmd_conf->cpus_bitmap);
for (i = 0; i <= i_last; i++) {
if (!bit_test(gres_slurmd_conf->cpus_bitmap, i))
continue;
if (!CPU_ISSET(i, &mask))
continue;
bit_nset(usable_gres, gres_inx,
gres_inx + gres_slurmd_conf->count - 1);
break;
}
}
gres_inx += gres_slurmd_conf->count;
}
list_iterator_destroy(iter);
#ifdef __NetBSD__
cpuset_destroy(mask);
#endif
return usable_gres;
#endif
}
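/*
 * Worked example (illustrative gres.conf layout): if one record covers
 * GRES 0-1 with Cores=0-7 and a second covers GRES 2-3 with Cores=8-15,
 * and this process's affinity mask contains only CPUs 4-5, the returned
 * bitmap has bits 0-1 set and bits 2-3 clear. A record without a
 * cpus_bitmap is always marked usable.
 */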
/*
* Configure the GRES hardware allocated to the current step while privileged
*
* IN step_gres_list - Step's GRES specification
* IN node_id - relative position of this node in step
* IN settings - string containing configuration settings for the hardware
*/
extern void gres_plugin_step_hardware_init(List step_gres_list,
uint32_t node_id, char *settings)
{
int i;
ListIterator iter;
gres_state_t *gres_ptr;
gres_step_state_t *gres_step_ptr;
bitstr_t *devices;
if (!step_gres_list)
return;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].ops.step_hardware_init == NULL)
continue;
iter = list_iterator_create(step_gres_list);
while ((gres_ptr = list_next(iter))) {
if (gres_ptr->plugin_id == gres_context[i].plugin_id)
break;
}
list_iterator_destroy(iter);
if (!gres_ptr || !gres_ptr->gres_data)
continue;
gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;
if ((gres_step_ptr->node_cnt != 1) ||
!gres_step_ptr->gres_bit_alloc ||
!gres_step_ptr->gres_bit_alloc[0])
continue;
devices = gres_step_ptr->gres_bit_alloc[0];
if (settings)
debug2("settings: %s", settings);
if (devices) {
char *dev_str = bit_fmt_full(devices);
info("devices: %s", dev_str);
xfree(dev_str);
}
(*(gres_context[i].ops.step_hardware_init))(devices, settings);
}
slurm_mutex_unlock(&gres_context_lock);
}
/*
* Optionally undo GRES hardware configuration while privileged
*/
extern void gres_plugin_step_hardware_fini(void)
{
int i;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].ops.step_hardware_fini == NULL) {
continue;
}
(*(gres_context[i].ops.step_hardware_fini)) ();
}
slurm_mutex_unlock(&gres_context_lock);
}
/*
* Given a set of GRES maps and the local process ID, return the bitmap of
* GRES that should be available to this task.
*/
static bitstr_t *_get_gres_map(char *map_gres, int local_proc_id)
{
bitstr_t *usable_gres = NULL;
char *tmp, *tok, *save_ptr = NULL, *mult;
int task_offset = 0, task_mult;
int map_value;
if (!map_gres || !map_gres[0])
return NULL;
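/*
 * Loop until a mapping is found: task_offset keeps growing across passes
 * over the map, so a local_proc_id beyond the sum of all task multipliers
 * wraps around to the start of the map.
 */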
while (usable_gres == NULL) {
tmp = xstrdup(map_gres);
tok = strtok_r(tmp, ",", &save_ptr);
while (tok) {
if ((mult = strchr(tok, '*'))) {
mult[0] = '\0';
task_mult = atoi(mult + 1);
} else
task_mult = 1;
if (task_mult == 0)
task_mult = 1;
if ((local_proc_id >= task_offset) &&
(local_proc_id <= (task_offset + task_mult - 1))) {
map_value = strtol(tok, NULL, 0);
if ((map_value < 0) ||
(map_value >= MAX_GRES_BITMAP)) {
xfree(tmp);
goto end; /* Bad value */
}
usable_gres = bit_alloc(MAX_GRES_BITMAP);
bit_set(usable_gres, map_value);
break; /* All done */
} else {
task_offset += task_mult;
}
tok = strtok_r(NULL, ",", &save_ptr);
}
xfree(tmp);
}
end:
return usable_gres;
}
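/*
 * Example (illustrative map string): with map_gres="0,1,2*2", tasks are
 * bound as local_proc_id 0 -> GRES 0, 1 -> GRES 1, and 2 and 3 -> GRES 2;
 * local_proc_id 4 wraps around to GRES 0 again.
 */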
/*
* Given a set of GRES masks and the local process ID, return the bitmap of
* GRES that should be available to this task.
*/
static bitstr_t * _get_gres_mask(char *mask_gres, int local_proc_id)
{
bitstr_t *usable_gres = NULL;
char *tmp, *tok, *save_ptr = NULL, *mult;
int i, task_offset = 0, task_mult;
uint64_t mask_value;
if (!mask_gres || !mask_gres[0])
return NULL;
tmp = xstrdup(mask_gres);
tok = strtok_r(tmp, ",", &save_ptr);
while (tok) {
if ((mult = strchr(tok, '*')))
task_mult = atoi(mult + 1);
else
task_mult = 1;
if ((local_proc_id >= task_offset) &&
(local_proc_id <= (task_offset + task_mult - 1))) {
mask_value = strtol(tok, NULL, 0);
if ((mask_value <= 0) || (mask_value >= 0xffffffff))
break; /* Bad value */
usable_gres = bit_alloc(MAX_GRES_BITMAP);
for (i = 0; i < 64; i++) {
if ((mask_value >> i) & 0x1)
bit_set(usable_gres, i);
}
break; /* All done */
} else {
task_offset += task_mult;
}
tok = strtok_r(NULL, ",", &save_ptr);
}
xfree(tmp);
return usable_gres;
}
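/*
 * Example (illustrative mask string): with mask_gres="0x3,0xc", task 0 is
 * given GRES {0,1} and task 1 GRES {2,3}. Unlike _get_gres_map() there is
 * no outer retry loop, so a local_proc_id beyond the listed tasks yields
 * NULL rather than wrapping around.
 */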
/*
* Set environment as required for all tasks of a job step
* IN/OUT job_env_ptr - environment variable array
* IN step_gres_list - generated by gres_plugin_step_alloc()
* IN accel_bind_type - GRES binding options (old format, a bitmap)
* IN tres_bind - TRES binding directives (new format, a string)
* IN local_proc_id - task rank, local to this compute node only
*/
extern void gres_plugin_step_set_env(char ***job_env_ptr, List step_gres_list,
uint16_t accel_bind_type, char *tres_bind,
int local_proc_id)
{
int i;
ListIterator gres_iter;
gres_state_t *gres_ptr = NULL;
bool bind_gpu = accel_bind_type & ACCEL_BIND_CLOSEST_GPU;
bool bind_nic = accel_bind_type & ACCEL_BIND_CLOSEST_NIC;
bool bind_mic = accel_bind_type & ACCEL_BIND_CLOSEST_MIC;
char *sep, *map_gpu = NULL, *mask_gpu = NULL;
bitstr_t *usable_gres = NULL;
bool found;
if (!bind_gpu && tres_bind && (sep = strstr(tres_bind, "gpu:"))) {
sep += 4;
if (!strncasecmp(sep, "closest", 7))
bind_gpu = true;
else if (!strncasecmp(sep, "map_gpu:", 8))
map_gpu = sep + 8;
else if (!strncasecmp(sep, "mask_gpu:", 9))
mask_gpu = sep + 9;
}
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
if (!gres_context[i].ops.step_set_env)
continue; /* No plugin to call */
if (bind_gpu || bind_mic || bind_nic || map_gpu || mask_gpu) {
if (!xstrcmp(gres_context[i].gres_name, "gpu")) {
if (map_gpu) {
usable_gres = _get_gres_map(map_gpu,
local_proc_id);
} else if (mask_gpu) {
usable_gres = _get_gres_mask(mask_gpu,
local_proc_id);
} else if (bind_gpu)
usable_gres = _get_usable_gres(i);
else
continue;
} else if (!xstrcmp(gres_context[i].gres_name,
"mic")) {
if (bind_mic)
usable_gres = _get_usable_gres(i);
else
continue;
} else if (!xstrcmp(gres_context[i].gres_name,
"nic")) {
if (bind_nic)
usable_gres = _get_usable_gres(i);
else
continue;
} else {
continue;
}
}
found = false;
if (step_gres_list) {
gres_iter = list_iterator_create(step_gres_list);
while ((gres_ptr = (gres_state_t *)
list_next(gres_iter))) {
if (gres_ptr->plugin_id !=
gres_context[i].plugin_id)
continue;
if (accel_bind_type || tres_bind) {
(*(gres_context[i].ops.step_reset_env))
(job_env_ptr,
gres_ptr->gres_data,
usable_gres);
} else {
(*(gres_context[i].ops.step_set_env))
(job_env_ptr,
gres_ptr->gres_data);
}
found = true;
}
list_iterator_destroy(gres_iter);
}
if (!found) { /* No data found */
if (accel_bind_type || tres_bind) {
(*(gres_context[i].ops.step_reset_env))
(job_env_ptr, NULL, NULL);
} else {
(*(gres_context[i].ops.step_set_env))
(job_env_ptr, NULL);
}
}
FREE_NULL_BITMAP(usable_gres);
}
slurm_mutex_unlock(&gres_context_lock);
FREE_NULL_BITMAP(usable_gres);
}
static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id,
char *gres_name)
{
gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data;
char tmp_str[128];
int i;
xassert(gres_ptr);
info("gres:%s type:%s(%u) step:%u.%u flags:%s state", gres_name,
gres_ptr->type_name, gres_ptr->type_id, job_id, step_id,
_gres_flags_str(gres_ptr->flags));
if (gres_ptr->cpus_per_gres)
info(" cpus_per_gres:%u", gres_ptr->cpus_per_gres);
if (gres_ptr->gres_per_step)
info(" gres_per_step:%"PRIu64, gres_ptr->gres_per_step);
if (gres_ptr->gres_per_node) {
info(" gres_per_node:%"PRIu64" node_cnt:%u",
gres_ptr->gres_per_node, gres_ptr->node_cnt);
}
if (gres_ptr->gres_per_socket)
info(" gres_per_socket:%"PRIu64, gres_ptr->gres_per_socket);
if (gres_ptr->gres_per_task)
info(" gres_per_task:%"PRIu64, gres_ptr->gres_per_task);
if (gres_ptr->mem_per_gres)
info(" mem_per_gres:%"PRIu64, gres_ptr->mem_per_gres);
if (gres_ptr->node_in_use == NULL)
info(" node_in_use:NULL");
else if (gres_ptr->gres_bit_alloc == NULL)
info(" gres_bit_alloc:NULL");
else {
for (i = 0; i < gres_ptr->node_cnt; i++) {
if (!bit_test(gres_ptr->node_in_use, i))
continue;
if (gres_ptr->gres_bit_alloc[i]) {
bit_fmt(tmp_str, sizeof(tmp_str),
gres_ptr->gres_bit_alloc[i]);
info(" gres_bit_alloc[%d]:%s of %d", i,
tmp_str,
(int)bit_size(gres_ptr->gres_bit_alloc[i]));
} else
info(" gres_bit_alloc[%d]:NULL", i);
}
}
}
/*
* Log a step's current gres state
* IN gres_list - generated by gres_plugin_step_alloc()
* IN job_id - job's ID
*/
extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id,
uint32_t step_id)
{
int i;
ListIterator gres_iter;
gres_state_t *gres_ptr;
if (!gres_debug || (gres_list == NULL))
return;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (gres_ptr->plugin_id != gres_context[i].plugin_id)
continue;
_step_state_log(gres_ptr->gres_data, job_id, step_id,
gres_context[i].gres_name);
break;
}
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
}
/*
* Determine how many cores of a job's allocation can be allocated to a step
* on a specific node
* IN job_gres_list - a running job's gres info
* IN/OUT step_gres_list - a pending job step's gres requirements
* IN node_offset - index into the job's node allocation
* IN first_step_node - true if this is node zero of the step (do initialization)
* IN cpus_per_task - number of CPUs required per task
* IN max_rem_nodes - maximum nodes remaining for step (including this one)
* IN ignore_alloc - if set ignore resources already allocated to running steps
* IN job_id, step_id - ID of the step being allocated.
* RET Count of available cores on this node, as limited by the most
* constrained GRES: NO_VAL64 if no limit or 0 if node is not usable
*/
extern uint64_t gres_plugin_step_test(List step_gres_list, List job_gres_list,
int node_offset, bool first_step_node,
uint16_t cpus_per_task, int max_rem_nodes,
bool ignore_alloc,
uint32_t job_id, uint32_t step_id)
{
uint64_t core_cnt, tmp_cnt;
ListIterator step_gres_iter;
gres_state_t *job_gres_ptr, *step_gres_ptr;
gres_step_state_t *step_data_ptr = NULL;
if (step_gres_list == NULL)
return NO_VAL64;
if (job_gres_list == NULL)
return 0;
if (cpus_per_task == 0)
cpus_per_task = 1;
core_cnt = NO_VAL64;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
step_gres_iter = list_iterator_create(step_gres_list);
while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
gres_key_t job_search_key;
step_data_ptr = (gres_step_state_t *)step_gres_ptr->gres_data;
job_search_key.plugin_id = step_gres_ptr->plugin_id;
if (step_data_ptr->type_name)
job_search_key.type_id = step_data_ptr->type_id;
else
job_search_key.type_id = NO_VAL;
job_search_key.node_offset = node_offset;
if (!(job_gres_ptr = list_find_first(
job_gres_list,
_gres_find_job_by_key_with_cnt,
&job_search_key))) {
/* job lacks resources required by the step */
core_cnt = 0;
break;
}
tmp_cnt = _step_test(step_data_ptr,
job_gres_ptr->gres_data,
node_offset, first_step_node,
cpus_per_task, max_rem_nodes,
ignore_alloc,
job_id, step_id,
step_gres_ptr->plugin_id);
if ((tmp_cnt != NO_VAL64) && (tmp_cnt < core_cnt))
core_cnt = tmp_cnt;
if (core_cnt == 0)
break;
}
list_iterator_destroy(step_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return core_cnt;
}
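/*
 * Illustrative outcome of the loop above: if a step requests both gpu and
 * nic GRES and _step_test() reports 8 usable cores for the gpu request but
 * only 4 for the nic request, core_cnt is 4; a GRES missing from the job's
 * allocation forces core_cnt to 0.
 */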
/*
* Return TRUE if this plugin ID consumes GRES count > 1 for a single device
* file (e.g. MPS)
*/
static bool _shared_gres(uint32_t plugin_id)
{
if (plugin_id == mps_plugin_id)
return true;
return false;
}
/*
* Return TRUE if this plugin ID shares resources with another GRES that
* consumes subsets of its resources (e.g. GPU)
*/
static bool _sharing_gres(uint32_t plugin_id)
{
if (plugin_id == gpu_plugin_id)
return true;
return false;
}
static int _step_alloc(void *step_gres_data, void *job_gres_data,
uint32_t plugin_id, int node_offset,
bool first_step_node,
uint32_t job_id, uint32_t step_id,
uint16_t tasks_on_node, uint32_t rem_nodes)
{
gres_job_state_t *job_gres_ptr = (gres_job_state_t *) job_gres_data;
gres_step_state_t *step_gres_ptr = (gres_step_state_t *) step_gres_data;
uint64_t gres_needed, gres_avail, max_gres = 0;
bitstr_t *gres_bit_alloc;
int i, len;
xassert(job_gres_ptr);
xassert(step_gres_ptr);
if (job_gres_ptr->node_cnt == 0) /* no_consume */
return SLURM_SUCCESS;
if (node_offset >= job_gres_ptr->node_cnt) {
error("gres/%s: %s for %u.%u, node offset invalid (%d >= %u)",
job_gres_ptr->gres_name, __func__, job_id,
step_id, node_offset,
job_gres_ptr->node_cnt);
return SLURM_ERROR;
}
if (first_step_node)
step_gres_ptr->total_gres = 0;
if (step_gres_ptr->gres_per_node) {
gres_needed = step_gres_ptr->gres_per_node;
} else if (step_gres_ptr->gres_per_task) {
gres_needed = step_gres_ptr->gres_per_task * tasks_on_node;
} else if (step_gres_ptr->gres_per_step && (rem_nodes == 1)) {
gres_needed = step_gres_ptr->gres_per_step -
step_gres_ptr->total_gres;
} else if (step_gres_ptr->gres_per_step) {
/* Leave at least one GRES per remaining node */
max_gres = step_gres_ptr->gres_per_step -
step_gres_ptr->total_gres - (rem_nodes - 1);
gres_needed = 1;
} else {
/*
* No explicit step GRES specification.
* Note that gres_per_socket is not supported for steps
*/
gres_needed = job_gres_ptr->gres_cnt_node_alloc[node_offset];
}
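/*
 * Worked example for the gres_per_step spread above (illustrative):
 * gres_per_step=5, total_gres=0 and rem_nodes=3 give max_gres=3 and
 * gres_needed=1; after the availability checks below, gres_needed becomes
 * MIN(gres_avail, max_gres), leaving at least one GRES for each of the
 * two remaining nodes.
 */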
if (step_gres_ptr->node_cnt == 0)
step_gres_ptr->node_cnt = job_gres_ptr->node_cnt;
if (!step_gres_ptr->gres_cnt_node_alloc) {
step_gres_ptr->gres_cnt_node_alloc =
xcalloc(step_gres_ptr->node_cnt, sizeof(uint64_t));
}
if (job_gres_ptr->gres_cnt_node_alloc &&
job_gres_ptr->gres_cnt_node_alloc[node_offset])
gres_avail = job_gres_ptr->gres_cnt_node_alloc[node_offset];
else if (job_gres_ptr->gres_bit_select &&
job_gres_ptr->gres_bit_select[node_offset])
gres_avail = bit_set_count(
job_gres_ptr->gres_bit_select[node_offset]);
else if (job_gres_ptr->gres_cnt_node_alloc)
gres_avail = job_gres_ptr->gres_cnt_node_alloc[node_offset];
else
gres_avail = job_gres_ptr->gres_per_node;
if (gres_needed > gres_avail) {
error("gres/%s: %s for %u.%u, step's > job's "
"for node %d (%"PRIu64" > %"PRIu64")",
job_gres_ptr->gres_name, __func__, job_id,
step_id, node_offset, gres_needed, gres_avail);
return SLURM_ERROR;
}
if (!job_gres_ptr->gres_cnt_step_alloc) {
job_gres_ptr->gres_cnt_step_alloc =
xcalloc(job_gres_ptr->node_cnt, sizeof(uint64_t));
}
if (gres_needed >
(gres_avail - job_gres_ptr->gres_cnt_step_alloc[node_offset])) {
error("gres/%s: %s for %u.%u, step's > job's "
"remaining for node %d (%"PRIu64" > "
"(%"PRIu64" - %"PRIu64"))",
job_gres_ptr->gres_name, __func__, job_id,
step_id, node_offset, gres_needed, gres_avail,
job_gres_ptr->gres_cnt_step_alloc[node_offset]);
return SLURM_ERROR;
}
gres_avail -= job_gres_ptr->gres_cnt_step_alloc[node_offset];
if (max_gres)
gres_needed = MIN(gres_avail, max_gres);
if (step_gres_ptr->gres_cnt_node_alloc &&
(node_offset < step_gres_ptr->node_cnt))
step_gres_ptr->gres_cnt_node_alloc[node_offset] = gres_needed;
step_gres_ptr->total_gres += gres_needed;
if (step_gres_ptr->node_in_use == NULL) {
step_gres_ptr->node_in_use = bit_alloc(job_gres_ptr->node_cnt);
}
bit_set(step_gres_ptr->node_in_use, node_offset);
job_gres_ptr->gres_cnt_step_alloc[node_offset] += gres_needed;
if ((job_gres_ptr->gres_bit_alloc == NULL) ||
(job_gres_ptr->gres_bit_alloc[node_offset] == NULL)) {
debug3("gres/%s: %s gres_bit_alloc for %u.%u is NULL",
job_gres_ptr->gres_name, __func__, job_id, step_id);
return SLURM_SUCCESS;
}
gres_bit_alloc = bit_copy(job_gres_ptr->gres_bit_alloc[node_offset]);
len = bit_size(gres_bit_alloc);
if (_shared_gres(plugin_id)) {
for (i = 0; i < len; i++) {
if (gres_needed > 0) {
if (bit_test(gres_bit_alloc, i))
gres_needed = 0;
} else {
bit_clear(gres_bit_alloc, i);
}
}
} else {
if (job_gres_ptr->gres_bit_step_alloc &&
job_gres_ptr->gres_bit_step_alloc[node_offset]) {
bit_and_not(gres_bit_alloc,
job_gres_ptr->gres_bit_step_alloc[node_offset]);
}
for (i = 0; i < len; i++) {
if (gres_needed > 0) {
if (bit_test(gres_bit_alloc, i))
gres_needed--;
} else {
bit_clear(gres_bit_alloc, i);
}
}
}
if (gres_needed) {
error("gres/%s: %s step %u.%u oversubscribed resources on node %d",
job_gres_ptr->gres_name, __func__,
job_id, step_id, node_offset);
}
if (job_gres_ptr->gres_bit_step_alloc == NULL) {
job_gres_ptr->gres_bit_step_alloc =
xcalloc(job_gres_ptr->node_cnt, sizeof(bitstr_t *));
}
if (job_gres_ptr->gres_bit_step_alloc[node_offset]) {
bit_or(job_gres_ptr->gres_bit_step_alloc[node_offset],
gres_bit_alloc);
} else {
job_gres_ptr->gres_bit_step_alloc[node_offset] =
bit_copy(gres_bit_alloc);
}
if (step_gres_ptr->gres_bit_alloc == NULL) {
step_gres_ptr->gres_bit_alloc = xcalloc(job_gres_ptr->node_cnt,
sizeof(bitstr_t *));
}
if (step_gres_ptr->gres_bit_alloc[node_offset]) {
error("gres/%s: %s step %u.%u bit_alloc already exists",
job_gres_ptr->gres_name, __func__, job_id, step_id);
bit_or(step_gres_ptr->gres_bit_alloc[node_offset],
gres_bit_alloc);
FREE_NULL_BITMAP(gres_bit_alloc);
} else {
step_gres_ptr->gres_bit_alloc[node_offset] = gres_bit_alloc;
}
return SLURM_SUCCESS;
}
/*
* Allocate resources to a step and update job and step gres information
* IN step_gres_list - step's gres_list built by
* gres_plugin_step_state_validate()
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN node_offset - job's zero-origin index to the node of interest
* IN first_step_node - true if this is the first node in the step's allocation
* IN tasks_on_node - number of tasks to be launched on this node
* IN rem_nodes - desired additional node count to allocate, including this node
* IN job_id, step_id - ID of the step being allocated.
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_step_alloc(List step_gres_list, List job_gres_list,
int node_offset, bool first_step_node,
uint16_t tasks_on_node, uint32_t rem_nodes,
uint32_t job_id, uint32_t step_id)
{
int rc, rc2;
ListIterator step_gres_iter;
gres_state_t *step_gres_ptr, *job_gres_ptr;
if (step_gres_list == NULL)
return SLURM_SUCCESS;
if (job_gres_list == NULL) {
error("%s: step allocates GRES, but job %u has none",
__func__, job_id);
return SLURM_ERROR;
}
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
step_gres_iter = list_iterator_create(step_gres_list);
while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
gres_step_state_t *step_data_ptr =
(gres_step_state_t *) step_gres_ptr->gres_data;
gres_key_t job_search_key;
job_search_key.plugin_id = step_gres_ptr->plugin_id;
if (step_data_ptr->type_name)
job_search_key.type_id = step_data_ptr->type_id;
else
job_search_key.type_id = NO_VAL;
job_search_key.node_offset = node_offset;
if (!(job_gres_ptr = list_find_first(
job_gres_list,
_gres_find_job_by_key_with_cnt,
&job_search_key))) {
/* job lacks resources required by the step */
rc = ESLURM_INVALID_GRES;
break;
}
rc2 = _step_alloc(step_data_ptr,
job_gres_ptr->gres_data,
step_gres_ptr->plugin_id, node_offset,
first_step_node,
job_id, step_id, tasks_on_node, rem_nodes);
if (rc2 != SLURM_SUCCESS)
rc = rc2;
}
list_iterator_destroy(step_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
static int _step_dealloc(gres_state_t *step_gres_ptr, List job_gres_list,
uint32_t job_id, uint32_t step_id)
{
gres_state_t *job_gres_ptr;
gres_step_state_t *step_data_ptr =
(gres_step_state_t *)step_gres_ptr->gres_data;
gres_job_state_t *job_data_ptr;
uint32_t i, j;
uint64_t gres_cnt;
int len_j, len_s;
gres_key_t job_search_key;
xassert(job_gres_list);
xassert(step_data_ptr);
job_search_key.plugin_id = step_gres_ptr->plugin_id;
if (step_data_ptr->type_name)
job_search_key.type_id = step_data_ptr->type_id;
else
job_search_key.type_id = NO_VAL;
for (i = 0; i < step_data_ptr->node_cnt; i++) {
job_search_key.node_offset = i;
if (!(job_gres_ptr = list_find_first(
job_gres_list,
_gres_find_job_by_key_with_cnt,
&job_search_key)))
continue;
job_data_ptr = (gres_job_state_t *)job_gres_ptr->gres_data;
if (job_data_ptr->node_cnt == 0) { /* no_consume */
xassert(!step_data_ptr->node_in_use);
xassert(!step_data_ptr->gres_bit_alloc);
return SLURM_SUCCESS;
} else if (job_data_ptr->node_cnt < i)
return SLURM_SUCCESS;
if (!step_data_ptr->node_in_use) {
error("gres/%s: %s step %u.%u dealloc, node_in_use is NULL",
job_data_ptr->gres_name, __func__,
job_id, step_id);
return SLURM_ERROR;
}
if (!bit_test(step_data_ptr->node_in_use, i))
continue;
if (step_data_ptr->gres_cnt_node_alloc)
gres_cnt = step_data_ptr->gres_cnt_node_alloc[i];
else
gres_cnt = step_data_ptr->gres_per_node;
if (job_data_ptr->gres_cnt_step_alloc) {
if (job_data_ptr->gres_cnt_step_alloc[i] >=
gres_cnt) {
job_data_ptr->gres_cnt_step_alloc[i] -=
gres_cnt;
} else {
error("gres/%s: %s step %u.%u dealloc count underflow",
job_data_ptr->gres_name, __func__,
job_id, step_id);
job_data_ptr->gres_cnt_step_alloc[i] = 0;
}
}
if ((step_data_ptr->gres_bit_alloc == NULL) ||
(step_data_ptr->gres_bit_alloc[i] == NULL))
continue;
if (job_data_ptr->gres_bit_alloc[i] == NULL) {
error("gres/%s: %s job %u gres_bit_alloc[%d] is NULL",
job_data_ptr->gres_name, __func__, job_id, i);
continue;
}
len_j = bit_size(job_data_ptr->gres_bit_alloc[i]);
len_s = bit_size(step_data_ptr->gres_bit_alloc[i]);
if (len_j != len_s) {
error("gres/%s: %s step %u.%u dealloc, bit_alloc[%d] size mis-match (%d != %d)",
job_data_ptr->gres_name, __func__,
job_id, step_id, i, len_j, len_s);
len_j = MIN(len_j, len_s);
}
for (j = 0; j < len_j; j++) {
if (!bit_test(step_data_ptr->gres_bit_alloc[i], j))
continue;
if (job_data_ptr->gres_bit_step_alloc &&
job_data_ptr->gres_bit_step_alloc[i]) {
bit_clear(job_data_ptr->gres_bit_step_alloc[i],
j);
}
}
FREE_NULL_BITMAP(step_data_ptr->gres_bit_alloc[i]);
}
return SLURM_SUCCESS;
}
/*
* Deallocate resources from a step and update job and step gres information
* IN step_gres_list - step's gres_list built by
* gres_plugin_step_state_validate()
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN job_id, step_id - ID of the step being deallocated.
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_step_dealloc(List step_gres_list, List job_gres_list,
uint32_t job_id, uint32_t step_id)
{
int rc, rc2;
ListIterator step_gres_iter;
gres_state_t *step_gres_ptr;
if (step_gres_list == NULL)
return SLURM_SUCCESS;
if (job_gres_list == NULL) {
error("%s: step deallocates gres, but job %u has none",
__func__, job_id);
return SLURM_ERROR;
}
rc = gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
step_gres_iter = list_iterator_create(step_gres_list);
while ((step_gres_ptr = list_next(step_gres_iter))) {
rc2 = _step_dealloc(step_gres_ptr,
job_gres_list,
job_id, step_id);
if (rc2 != SLURM_SUCCESS)
rc = rc2;
}
list_iterator_destroy(step_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
/*
* Determine the total count of GRES of a given type allocated to a job
* across all nodes
* IN job_gres_list - job's gres_list built by gres_plugin_job_state_validate()
* IN gres_name - name of a GRES type
* RET count of this GRES allocated to this job
*/
extern uint64_t gres_get_value_by_type(List job_gres_list, char *gres_name)
{
int i;
uint32_t plugin_id;
uint64_t gres_cnt = 0;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_gres_data;
if (job_gres_list == NULL)
return NO_VAL64;
gres_cnt = NO_VAL64;
(void) gres_plugin_init();
plugin_id = gres_plugin_build_id(gres_name);
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (job_gres_ptr->plugin_id != plugin_id)
continue;
job_gres_data = (gres_job_state_t *)
job_gres_ptr->gres_data;
gres_cnt = job_gres_data->gres_per_node;
break;
}
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return gres_cnt;
}
/*
* Fill in an array of GRES type IDs contained within the given job gres_list
* and an array of corresponding counts of those GRES types.
* IN gres_list - a List of GRES types allocated to a job.
* IN arr_len - Length of the arrays (the number of elements in the gres_list).
* IN gres_count_ids, gres_count_vals - the GRES type IDs and values found
* in the gres_list.
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_job_count(List gres_list, int arr_len,
uint32_t *gres_count_ids,
uint64_t *gres_count_vals)
{
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
void *job_gres_data;
int rc, ix = 0;
rc = gres_plugin_init();
if ((rc == SLURM_SUCCESS) && (arr_len <= 0))
rc = EINVAL;
if (rc != SLURM_SUCCESS)
return rc;
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(gres_list);
while ((job_gres_ptr = (gres_state_t*) list_next(job_gres_iter))) {
gres_job_state_t *job_gres_state_ptr;
job_gres_data = job_gres_ptr->gres_data;
job_gres_state_ptr = (gres_job_state_t *) job_gres_data;
xassert(job_gres_state_ptr);
gres_count_ids[ix] = job_gres_ptr->plugin_id;
if (job_gres_state_ptr->total_gres == NO_CONSUME_VAL64)
gres_count_vals[ix] = 0;
else
gres_count_vals[ix] = job_gres_state_ptr->total_gres;
if (++ix >= arr_len)
break;
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
/*
* Build a string identifying total GRES counts of each type
* IN gres_list - a List of GRES types allocated to a job.
* RET string containing comma-separated list of gres type:model:count
* must release memory using xfree()
*/
extern char *gres_plugin_job_alloc_count(List gres_list)
{
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
void *job_gres_data;
char *gres_alloc = NULL, *gres_name, *sep = "";
int i;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(gres_list);
while ((job_gres_ptr = (gres_state_t*) list_next(job_gres_iter))) {
gres_job_state_t *job_gres_state_ptr;
uint64_t total_gres;
job_gres_data = job_gres_ptr->gres_data;
job_gres_state_ptr = (gres_job_state_t *) job_gres_data;
if (!job_gres_state_ptr) {
error("%s: job gres_data is NULL", __func__);
continue;
}
gres_name = "UNKNOWN";
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].plugin_id !=
job_gres_ptr->plugin_id)
continue;
gres_name = gres_context[i].gres_name;
}
if (job_gres_state_ptr->total_gres == NO_CONSUME_VAL64)
total_gres = 0;
else
total_gres = job_gres_state_ptr->total_gres;
if (job_gres_state_ptr->type_name) {
xstrfmtcat(gres_alloc, "%s%s:%s:%"PRIu64, sep,
gres_name, job_gres_state_ptr->type_name,
total_gres);
} else {
xstrfmtcat(gres_alloc, "%s%s:%"PRIu64, sep, gres_name,
total_gres);
}
sep = ",";
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return gres_alloc;
}
/*
* Fill in an array of GRES type ids contained within the given node gres_list
* and an array of corresponding counts of those GRES types.
* IN gres_list - a List of GRES types found on a node.
* IN arr_len - Length of the arrays (the number of elements in the gres_list).
* IN gres_count_ids, gres_count_vals - the GRES type IDs and values found
* in the gres_list.
* IN val_type - Type of value desired, see GRES_VAL_TYPE_*
* RET SLURM_SUCCESS or error code
*/
extern int gres_plugin_node_count(List gres_list, int arr_len,
uint32_t *gres_count_ids,
uint64_t *gres_count_vals,
int val_type)
{
ListIterator node_gres_iter;
gres_state_t* node_gres_ptr;
void* node_gres_data;
uint64_t val;
int rc, ix = 0;
rc = gres_plugin_init();
if ((rc == SLURM_SUCCESS) && (arr_len <= 0))
rc = EINVAL;
if (rc != SLURM_SUCCESS)
return rc;
slurm_mutex_lock(&gres_context_lock);
node_gres_iter = list_iterator_create(gres_list);
while ((node_gres_ptr = (gres_state_t*) list_next(node_gres_iter))) {
gres_node_state_t *node_gres_state_ptr;
val = 0;
node_gres_data = node_gres_ptr->gres_data;
node_gres_state_ptr = (gres_node_state_t *) node_gres_data;
xassert(node_gres_state_ptr);
switch (val_type) {
case (GRES_VAL_TYPE_FOUND):
val = node_gres_state_ptr->gres_cnt_found;
break;
case (GRES_VAL_TYPE_CONFIG):
val = node_gres_state_ptr->gres_cnt_config;
break;
case (GRES_VAL_TYPE_AVAIL):
val = node_gres_state_ptr->gres_cnt_avail;
break;
case (GRES_VAL_TYPE_ALLOC):
val = node_gres_state_ptr->gres_cnt_alloc;
}
gres_count_ids[ix] = node_gres_ptr->plugin_id;
gres_count_vals[ix] = val;
if (++ix >= arr_len)
break;
}
list_iterator_destroy(node_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
/* Send GRES information to slurmstepd on the specified file descriptor */
extern void gres_plugin_send_stepd(int fd)
{
int i;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
safe_write(fd, &gres_context[i].config_flags, sizeof(uint8_t));
if (gres_context[i].ops.send_stepd == NULL)
continue; /* No plugin to call */
(*(gres_context[i].ops.send_stepd)) (fd);
}
slurm_mutex_unlock(&gres_context_lock);
return;
rwfail:
error("%s: failed", __func__);
slurm_mutex_unlock(&gres_context_lock);
}
/* Receive GRES information from slurmd on the specified file descriptor */
extern void gres_plugin_recv_stepd(int fd)
{
int i;
(void) gres_plugin_init();
slurm_mutex_lock(&gres_context_lock);
for (i = 0; i < gres_context_cnt; i++) {
safe_read(fd, &gres_context[i].config_flags, sizeof(uint8_t));
(void)_load_gres_plugin(&gres_context[i]);
if (gres_context[i].ops.recv_stepd == NULL)
continue; /* No plugin to call */
(*(gres_context[i].ops.recv_stepd)) (fd);
}
slurm_mutex_unlock(&gres_context_lock);
return;
rwfail:
error("%s: failed", __func__);
slurm_mutex_unlock(&gres_context_lock);
}
/* Get generic GRES data types here. Call the plugin for others */
static int _get_job_info(int gres_inx, gres_job_state_t *job_gres_data,
uint32_t node_inx, enum gres_job_data_type data_type,
void *data)
{
uint64_t *u64_data = (uint64_t *) data;
bitstr_t **bit_data = (bitstr_t **) data;
int rc = SLURM_SUCCESS;
if (!job_gres_data || !data)
return EINVAL;
if (node_inx >= job_gres_data->node_cnt)
return ESLURM_INVALID_NODE_COUNT;
if (data_type == GRES_JOB_DATA_COUNT) {
*u64_data = job_gres_data->gres_per_node;
} else if (data_type == GRES_JOB_DATA_BITMAP) {
if (job_gres_data->gres_bit_alloc)
*bit_data = job_gres_data->gres_bit_alloc[node_inx];
else
*bit_data = NULL;
} else {
/* Support here for plugin-specific data types */
rc = (*(gres_context[gres_inx].ops.job_info))
(job_gres_data, node_inx, data_type, data);
}
return rc;
}
/*
* get data from a job's GRES data structure
* IN job_gres_list - job's GRES data structure
* IN gres_name - name of a GRES type
* IN node_inx - zero-origin index of the node within the job's allocation
* for which data is desired
* IN data_type - type of data to get from the job's data
* OUT data - pointer to the data from job's GRES data structure
* DO NOT FREE: This is a pointer into the job's data structure
* RET - SLURM_SUCCESS or error code
*/
extern int gres_get_job_info(List job_gres_list, char *gres_name,
uint32_t node_inx,
enum gres_job_data_type data_type, void *data)
{
int i, rc = ESLURM_INVALID_GRES;
uint32_t plugin_id;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_gres_data;
if (data == NULL)
return EINVAL;
if (job_gres_list == NULL) /* No GRES allocated */
return ESLURM_INVALID_GRES;
(void) gres_plugin_init();
plugin_id = gres_plugin_build_id(gres_name);
slurm_mutex_lock(&gres_context_lock);
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (job_gres_ptr->plugin_id != plugin_id)
continue;
job_gres_data = (gres_job_state_t *)
job_gres_ptr->gres_data;
rc = _get_job_info(i, job_gres_data, node_inx,
data_type, data);
break;
}
}
list_iterator_destroy(job_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
/* Given a job's GRES data structure, build strings describing the allocated
 * GRES (including device indices) on each node
* IN job_gres_list - job's GRES data structure
* OUT gres_detail_cnt - Number of elements (nodes) in gres_detail_str
* OUT gres_detail_str - Description of GRES on each node
* OUT total_gres_str - String containing all gres in the job and counts.
*/
extern void gres_build_job_details(List job_gres_list,
uint32_t *gres_detail_cnt,
char ***gres_detail_str,
char **total_gres_str)
{
int i, j;
ListIterator job_gres_iter;
gres_state_t *job_gres_ptr;
gres_job_state_t *job_gres_data;
char *sep1, *sep2, tmp_str[128], *type, **my_gres_details = NULL;
uint32_t my_gres_cnt = 0;
char *gres_name, *gres_str = NULL;
uint64_t gres_cnt;
/* Release any vestigial data (e.g. from job requeue) */
for (i = 0; i < *gres_detail_cnt; i++)
xfree(gres_detail_str[0][i]);
xfree(*gres_detail_str);
xfree(*total_gres_str);
*gres_detail_cnt = 0;
if (job_gres_list == NULL) /* No GRES allocated */
return;
(void) gres_plugin_init();
job_gres_iter = list_iterator_create(job_gres_list);
while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) {
job_gres_data = (gres_job_state_t *) job_gres_ptr->gres_data;
if (job_gres_data->gres_bit_alloc == NULL)
continue;
if (my_gres_details == NULL) {
my_gres_cnt = job_gres_data->node_cnt;
my_gres_details = xcalloc(my_gres_cnt, sizeof(char *));
}
if (job_gres_data->type_name) {
sep2 = ":";
type = job_gres_data->type_name;
} else {
sep2 = "";
type = "";
}
gres_name = xstrdup_printf(
"%s%s%s",
job_gres_data->gres_name, sep2, type);
gres_cnt = 0;
for (j = 0; j < my_gres_cnt; j++) {
if (j >= job_gres_data->node_cnt)
break; /* node count mismatch */
if (my_gres_details[j])
sep1 = ",";
else
sep1 = "";
gres_cnt += job_gres_data->gres_cnt_node_alloc[j];
if (job_gres_data->gres_bit_alloc[j]) {
bit_fmt(tmp_str, sizeof(tmp_str),
job_gres_data->gres_bit_alloc[j]);
xstrfmtcat(my_gres_details[j],
"%s%s:%"PRIu64"(IDX:%s)",
sep1, gres_name,
job_gres_data->
gres_cnt_node_alloc[j],
tmp_str);
} else if (job_gres_data->gres_cnt_node_alloc[j]) {
xstrfmtcat(my_gres_details[j],
"%s%s(CNT:%"PRIu64")",
sep1, gres_name,
job_gres_data->
gres_cnt_node_alloc[j]);
}
}
xstrfmtcat(gres_str, "%s%s:%"PRIu64,
gres_str ? "," : "", gres_name, gres_cnt);
xfree(gres_name);
}
list_iterator_destroy(job_gres_iter);
*gres_detail_cnt = my_gres_cnt;
*gres_detail_str = my_gres_details;
*total_gres_str = gres_str;
}
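/*
 * Example output (illustrative): for a 2-node job with two "tesla" GPUs
 * allocated on each node, each gres_detail_str entry looks like
 * "gpu:tesla:2(IDX:0-1)" and total_gres_str like "gpu:tesla:4".
 */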
/* Get generic GRES data types here. Call the plugin for others */
static int _get_step_info(int gres_inx, gres_step_state_t *step_gres_data,
uint32_t node_inx, enum gres_step_data_type data_type,
void *data)
{
uint64_t *u64_data = (uint64_t *) data;
bitstr_t **bit_data = (bitstr_t **) data;
int rc = SLURM_SUCCESS;
if (!step_gres_data || !data)
return EINVAL;
if (node_inx >= step_gres_data->node_cnt)
return ESLURM_INVALID_NODE_COUNT;
if (data_type == GRES_STEP_DATA_COUNT) {
*u64_data = step_gres_data->gres_per_node;
} else if (data_type == GRES_STEP_DATA_BITMAP) {
if (step_gres_data->gres_bit_alloc)
*bit_data = step_gres_data->gres_bit_alloc[node_inx];
else
*bit_data = NULL;
} else {
/* Support here for plugin-specific data types */
rc = (*(gres_context[gres_inx].ops.step_info))
(step_gres_data, node_inx, data_type, data);
}
return rc;
}
/*
* get data from a step's GRES data structure
* IN step_gres_list - step's GRES data structure
* IN gres_name - name of a GRES type
* IN node_inx - zero-origin index of the node within the job's allocation
* for which data is desired. Note this can differ from the step's
* node allocation index.
* IN data_type - type of data to get from the step's data
* OUT data - pointer to the data from step's GRES data structure
* DO NOT FREE: This is a pointer into the step's data structure
* RET - SLURM_SUCCESS or error code
*/
extern int gres_get_step_info(List step_gres_list, char *gres_name,
uint32_t node_inx,
enum gres_step_data_type data_type, void *data)
{
int i, rc = ESLURM_INVALID_GRES;
uint32_t plugin_id;
ListIterator step_gres_iter;
gres_state_t *step_gres_ptr;
gres_step_state_t *step_gres_data;
if (data == NULL)
return EINVAL;
if (step_gres_list == NULL) /* No GRES allocated */
return ESLURM_INVALID_GRES;
(void) gres_plugin_init();
plugin_id = gres_plugin_build_id(gres_name);
slurm_mutex_lock(&gres_context_lock);
step_gres_iter = list_iterator_create(step_gres_list);
while ((step_gres_ptr = (gres_state_t *) list_next(step_gres_iter))) {
for (i = 0; i < gres_context_cnt; i++) {
if (step_gres_ptr->plugin_id != plugin_id)
continue;
step_gres_data = (gres_step_state_t *)
step_gres_ptr->gres_data;
rc = _get_step_info(i, step_gres_data, node_inx,
data_type, data);
break;
}
}
list_iterator_destroy(step_gres_iter);
slurm_mutex_unlock(&gres_context_lock);
return rc;
}
extern gres_step_state_t *gres_get_step_state(List gres_list, char *name)
{
gres_state_t *gres_state_ptr;
if (!gres_list || !name || !list_count(gres_list))
return NULL;
slurm_mutex_lock(&gres_context_lock);
gres_state_ptr = list_find_first(gres_list, _gres_step_find_name, name);
slurm_mutex_unlock(&gres_context_lock);
if (!gres_state_ptr)
return NULL;
return (gres_step_state_t *)gres_state_ptr->gres_data;
}
extern gres_job_state_t *gres_get_job_state(List gres_list, char *name)
{
gres_state_t *gres_state_ptr;
if (!gres_list || !name || !list_count(gres_list))
return NULL;
slurm_mutex_lock(&gres_context_lock);
gres_state_ptr = list_find_first(gres_list, _gres_job_find_name, name);
slurm_mutex_unlock(&gres_context_lock);
if (!gres_state_ptr)
return NULL;
return (gres_job_state_t *)gres_state_ptr->gres_data;
}
extern uint32_t gres_get_autodetect_types(void)
{
return autodetect_types;
}
extern char *gres_2_tres_str(List gres_list, bool is_job, bool locked)
{
ListIterator itr;
slurmdb_tres_rec_t *tres_rec;
gres_state_t *gres_state_ptr;
int i;
uint64_t count;
char *col_name = NULL;
char *tres_str = NULL;
static bool first_run = 1;
static slurmdb_tres_rec_t tres_req;
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
/* we only need to init this once */
if (first_run) {
first_run = 0;
memset(&tres_req, 0, sizeof(slurmdb_tres_rec_t));
tres_req.type = "gres";
}
if (!gres_list)
return NULL;
/* assoc_mgr locks must be taken before gres_context_lock */
if (!locked)
assoc_mgr_lock(&locks);
slurm_mutex_lock(&gres_context_lock);
itr = list_iterator_create(gres_list);
while ((gres_state_ptr = list_next(itr))) {
if (is_job) {
gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
gres_state_ptr->gres_data;
col_name = gres_data_ptr->type_name;
count = gres_data_ptr->total_gres;
} else {
gres_step_state_t *gres_data_ptr = (gres_step_state_t *)
gres_state_ptr->gres_data;
col_name = gres_data_ptr->type_name;
count = gres_data_ptr->total_gres;
}
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].plugin_id ==
gres_state_ptr->plugin_id) {
tres_req.name = gres_context[i].gres_name;
break;
}
}
if (!tres_req.name) {
debug("%s: couldn't find name", __func__);
continue;
}
/* If we are no_consume, print a 0 */
if (count == NO_CONSUME_VAL64)
count = 0;
tres_rec = assoc_mgr_find_tres_rec(&tres_req);
if (tres_rec &&
slurmdb_find_tres_count_in_string(
tres_str, tres_rec->id) == INFINITE64)
/* New gres */
xstrfmtcat(tres_str, "%s%u=%"PRIu64,
tres_str ? "," : "",
tres_rec->id, count);
if (i < gres_context_cnt) {
if (col_name) {
/*
* Now add the TRES with the :<type> suffix if
* we are tracking it as well. This would be handy
* for GRES like "gpu:tesla", where you might
* want to track both as TRES.
*/
tres_req.name = xstrdup_printf(
"%s%s",
gres_context[i].gres_name_colon,
col_name);
tres_rec = assoc_mgr_find_tres_rec(&tres_req);
xfree(tres_req.name);
if (tres_rec &&
slurmdb_find_tres_count_in_string(
tres_str, tres_rec->id) == INFINITE64)
/* New GRES */
xstrfmtcat(tres_str, "%s%u=%"PRIu64,
tres_str ? "," : "",
tres_rec->id, count);
} else {
/*
* Job allocated GRES without "type"
* specification, but Slurm is only accounting
* for this GRES by specific "type", so pick
* some valid "type" to get some accounting.
* Although the reported "type" may not be
* accurate, it is better than nothing...
*/
tres_req.name = xstrdup_printf(
"%s", gres_context[i].gres_name);
tres_rec = assoc_mgr_find_tres_rec2(&tres_req);
xfree(tres_req.name);
if (tres_rec &&
slurmdb_find_tres_count_in_string(
tres_str, tres_rec->id) == INFINITE64)
/* New GRES */
xstrfmtcat(tres_str, "%s%u=%"PRIu64,
tres_str ? "," : "",
tres_rec->id, count);
}
}
}
list_iterator_destroy(itr);
slurm_mutex_unlock(&gres_context_lock);
if (!locked)
assoc_mgr_unlock(&locks);
return tres_str;
}
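/*
 * Example output (hypothetical TRES ids): a job allocated two GPUs tracked
 * both as generic "gres/gpu" and typed "gres/gpu:tesla" might yield
 * "1001=2,1002=2", where 1001 and 1002 are whatever ids the assoc_mgr
 * assigned to those TRES records.
 */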
/* Fill in job/node TRES arrays with allocated GRES. */
static void _set_type_tres_cnt(gres_state_type_enum_t state_type,
List gres_list,
uint32_t node_cnt,
uint64_t *tres_cnt,
bool locked)
{
ListIterator itr;
gres_state_t *gres_state_ptr;
static bool first_run = 1;
static slurmdb_tres_rec_t tres_rec;
char *col_name = NULL;
uint64_t count;
int i, tres_pos;
assoc_mgr_lock_t locks = { .tres = READ_LOCK };
/* we only need to init this once */
if (first_run) {
first_run = 0;
memset(&tres_rec, 0, sizeof(slurmdb_tres_rec_t));
tres_rec.type = "gres";
}
if (!gres_list || !tres_cnt ||
((state_type == GRES_STATE_TYPE_JOB) &&
(!node_cnt || (node_cnt == NO_VAL))))
return;
/* assoc_mgr locks must be taken before gres_context_lock */
if (!locked)
assoc_mgr_lock(&locks);
slurm_mutex_lock(&gres_context_lock);
/* Initialize all GRES counters to zero. Increment them later. */
for (i = 0; i < gres_context_cnt; i++) {
tres_rec.name = gres_context[i].gres_name;
if (tres_rec.name &&
((tres_pos = assoc_mgr_find_tres_pos(&tres_rec, true)) != -1))
tres_cnt[tres_pos] = 0;
}
itr = list_iterator_create(gres_list);
while ((gres_state_ptr = list_next(itr))) {
bool set_total = false;
for (i = 0; i < gres_context_cnt; i++) {
if (gres_context[i].plugin_id ==
gres_state_ptr->plugin_id) {
tres_rec.name = gres_context[i].gres_name;
break;
}
}
if (!tres_rec.name) {
debug("%s: couldn't find name", __func__);
continue;
}
/* Get alloc count for main GRES. */
switch (state_type) {
case GRES_STATE_TYPE_JOB:
{
gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
gres_state_ptr->gres_data;
count = gres_data_ptr->total_gres;
break;
}
case GRES_STATE_TYPE_NODE:
{
gres_node_state_t *gres_data_ptr = (gres_node_state_t *)
gres_state_ptr->gres_data;
count = gres_data_ptr->gres_cnt_alloc;
break;
}
default:
error("%s: unsupported state type %d", __func__,
state_type);
continue;
}
/*
* Set main TRES's count (i.e. if no GRES "type" is being
* accounted for). We need to increment counter since the job
* may have been allocated multiple GRES types, but Slurm is
* only configured to track the total count. For example, a job
* allocated 1 GPU of type "tesla" and 1 GPU of type "volta",
* but we want to record that the job was allocated a total of
* 2 GPUs.
*/
if ((tres_pos = assoc_mgr_find_tres_pos(&tres_rec, true)) != -1) {
if (count == NO_CONSUME_VAL64)
tres_cnt[tres_pos] = NO_CONSUME_VAL64;
else
tres_cnt[tres_pos] += count;
set_total = true;
}
/*
* Set TRES count for GRES model types. This would be handy for
* GRES like "gpu:tesla", where you might want to track both as
* TRES.
*/
switch (state_type) {
case GRES_STATE_TYPE_JOB:
{
gres_job_state_t *gres_data_ptr = (gres_job_state_t *)
gres_state_ptr->gres_data;
col_name = gres_data_ptr->type_name;
if (col_name) {
tres_rec.name = xstrdup_printf(
"%s%s",
gres_context[i].gres_name_colon,
col_name);
if ((tres_pos = assoc_mgr_find_tres_pos(
&tres_rec, true)) != -1)
tres_cnt[tres_pos] = count;
xfree(tres_rec.name);
} else if (!set_total) {
/*
* Job allocated GRES without "type"
* specification, but Slurm is only accounting
* for this GRES by specific "type", so pick
* some valid "type" to get some accounting.
* Although the reported "type" may not be
* accurate, it is better than nothing...
*/
tres_rec.name = xstrdup(gres_context[i].gres_name);
if ((tres_pos = assoc_mgr_find_tres_pos2(
&tres_rec, true)) != -1)
tres_cnt[tres_pos] = count;
xfree(tres_rec.name);
}
break;
}
case GRES_STATE_TYPE_NODE:
{
int type;
gres_node_state_t *gres_data_ptr = (gres_node_state_t *)
gres_state_ptr->gres_data;
for (type = 0; type < gres_data_ptr->type_cnt; type++) {
col_name = gres_data_ptr->type_name[type];
if (!col_name)
continue;
tres_rec.name = xstrdup_printf(
"%s%s",
gres_context[i].gres_name_colon,
col_name);
count = gres_data_ptr->type_cnt_alloc[type];
if ((tres_pos = assoc_mgr_find_tres_pos(
&tres_rec, true)) != -1)
tres_cnt[tres_pos] = count;
xfree(tres_rec.name);
}
break;
}
default:
error("%s: unsupported state type %d", __func__,
state_type);
continue;
}
}
list_iterator_destroy(itr);
slurm_mutex_unlock(&gres_context_lock);
if (!locked)
assoc_mgr_unlock(&locks);
return;
}
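/*
 * Fill in the job's TRES count array based upon its allocated GRES.
 * IN gres_list - the job's GRES list
 * IN node_cnt - number of nodes allocated to the job
 * OUT tres_cnt - TRES counts to fill in
 * IN locked - true if the caller already holds the assoc_mgr TRES read lock
 */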
extern void gres_set_job_tres_cnt(List gres_list,
uint32_t node_cnt,
uint64_t *tres_cnt,
bool locked)
{
_set_type_tres_cnt(GRES_STATE_TYPE_JOB,
gres_list, node_cnt, tres_cnt, locked);
}
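/*
 * Fill in the node's TRES count array based upon its allocated GRES.
 * IN gres_list - the node's GRES list
 * OUT tres_cnt - TRES counts to fill in
 * IN locked - true if the caller already holds the assoc_mgr TRES read lock
 */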
extern void gres_set_node_tres_cnt(List gres_list,
uint64_t *tres_cnt,
bool locked)
{
_set_type_tres_cnt(GRES_STATE_TYPE_NODE,
gres_list, 0, tres_cnt, locked);
}
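/*
 * Build a cgroup devices-style specification for the device at dev_path:
 * "b" (block) or "c" (character), "major:minor", and "rwm" access flags.
 * For example, a character device with major 195 and minor 0 yields
 * "c 195:0 rwm". Returns an xmalloc'd string the caller must xfree(),
 * or NULL if dev_path cannot be stat'd.
 */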
extern char *gres_device_major(char *dev_path)
{
int loc_major, loc_minor;
char *ret_major = NULL;
struct stat fs;
if (stat(dev_path, &fs) < 0) {
error("%s: stat(%s): %m", __func__, dev_path);
return NULL;
}
loc_major = (int)major(fs.st_rdev);
loc_minor = (int)minor(fs.st_rdev);
debug3("%s : %s major %d, minor %d",
__func__, dev_path, loc_major, loc_minor);
if (S_ISBLK(fs.st_mode)) {
xstrfmtcat(ret_major, "b %d:", loc_major);
//info("device is block ");
}
if (S_ISCHR(fs.st_mode)) {
xstrfmtcat(ret_major, "c %d:", loc_major);
//info("device is character ");
}
xstrfmtcat(ret_major, "%d rwm", loc_minor);
return ret_major;
}
/*
 * Free memory for a gres_device_t record; suitable as the delete
 * function for a List of gres_device_t.
 */
extern void destroy_gres_device(void *gres_device_ptr)
{
gres_device_t *gres_device = (gres_device_t *) gres_device_ptr;
if (!gres_device)
return;
xfree(gres_device->path);
xfree(gres_device->major);
xfree(gres_device);
}
/* Destroy a gres_slurmd_conf_t record, freeing its memory */
extern void destroy_gres_slurmd_conf(void *x)
{
gres_slurmd_conf_t *p = (gres_slurmd_conf_t *) x;
xassert(p);
xfree(p->cpus);
FREE_NULL_BITMAP(p->cpus_bitmap);
xfree(p->file); /* Only used by slurmd */
xfree(p->links);
xfree(p->name);
xfree(p->type_name);
xfree(p);
}
/*
 * Convert GRES config_flags to a comma-separated string. The returned
 * pointer references static local storage within this function, so it is
 * not re-entrant: use or copy the result before the next call.
 */
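/*
 * For example, a config_flags value of
 * (GRES_CONF_HAS_FILE | GRES_CONF_HAS_TYPE) yields "HAS_FILE,HAS_TYPE".
 */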
extern char *gres_flags2str(uint8_t config_flags)
{
static char flag_str[128];
char *sep = "";
flag_str[0] = '\0';
if (config_flags & GRES_CONF_COUNT_ONLY) {
strcat(flag_str, sep);
strcat(flag_str, "CountOnly");
sep = ",";
}
if (config_flags & GRES_CONF_HAS_FILE) {
strcat(flag_str, sep);
strcat(flag_str, "HAS_FILE");
sep = ",";
}
if (config_flags & GRES_CONF_LOADED) {
strcat(flag_str, sep);
strcat(flag_str, "LOADED");
sep = ",";
}
if (config_flags & GRES_CONF_HAS_TYPE) {
strcat(flag_str, sep);
strcat(flag_str, "HAS_TYPE");
sep = ",";
}
return flag_str;
}
/*
 * Create a gres_slurmd_conf_t record and append it to a list of
 * gres_slurmd_conf_t records, reusing the zero-count placeholder record
 * created by _merge_config() when it is first in the list.
 */
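/*
 * Illustrative call (hypothetical values): register one "gpu" of type
 * "tesla" backed by /dev/nvidia0, with affinity to CPUs 0-7 on an
 * 8-CPU node and no links:
 *   add_gres_to_list(gres_list, "gpu", 1, 8, "0-7", NULL,
 *                    "/dev/nvidia0", "tesla", NULL);
 */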
extern void add_gres_to_list(List gres_list, char *name, uint64_t device_cnt,
int cpu_cnt, char *cpu_aff_abs_range,
bitstr_t *cpu_aff_mac_bitstr, char *device_file,
char *type, char *links)
{
gres_slurmd_conf_t *gpu_record;
bool use_empty_first_record = false;
/*
 * If the first record already exists with a count of 0, overwrite it;
 * it is a placeholder record created in _merge_config().
 */
gpu_record = list_peek(gres_list);
if (gpu_record && (gpu_record->count == 0))
use_empty_first_record = true;
else
gpu_record = xmalloc(sizeof(gres_slurmd_conf_t));
gpu_record->cpu_cnt = cpu_cnt;
if (cpu_aff_mac_bitstr)
gpu_record->cpus_bitmap = bit_copy(cpu_aff_mac_bitstr);
if (device_file)
gpu_record->config_flags |= GRES_CONF_HAS_FILE;
if (type)
gpu_record->config_flags |= GRES_CONF_HAS_TYPE;
gpu_record->cpus = xstrdup(cpu_aff_abs_range);
gpu_record->type_name = xstrdup(type);
gpu_record->name = xstrdup(name);
gpu_record->file = xstrdup(device_file);
gpu_record->links = xstrdup(links);
gpu_record->count = device_cnt;
gpu_record->plugin_id = gres_plugin_build_id(name);
if (!use_empty_first_record)
list_append(gres_list, gpu_record);
}