| /*****************************************************************************\ |
| * gres.c - driver for gres plugin |
| ***************************************************************************** |
| * Copyright (C) 2010 Lawrence Livermore National Security. |
| * Copyright (C) SchedMD LLC. |
| * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). |
| * Written by Morris Jette <jette1@llnl.gov> |
| * CODE-OCEC-09-009. All rights reserved. |
| * |
| * This file is part of Slurm, a resource management program. |
| * For details, see <https://slurm.schedmd.com/>. |
| * Please also read the included file: DISCLAIMER. |
| * |
| * Slurm is free software; you can redistribute it and/or modify it under |
| * the terms of the GNU General Public License as published by the Free |
| * Software Foundation; either version 2 of the License, or (at your option) |
| * any later version. |
| * |
| * In addition, as a special exception, the copyright holders give permission |
| * to link the code of portions of this program with the OpenSSL library under |
| * certain conditions as described in each individual source file, and |
| * distribute linked combinations including the two. You must obey the GNU |
| * General Public License in all respects for all of the code used other than |
| * OpenSSL. If you modify file(s) with this exception, you may extend this |
| * exception to your version of the file(s), but you are not obligated to do |
| * so. If you do not wish to do so, delete this exception statement from your |
| * version. If you delete this exception statement from all source files in |
| * the program, then also delete it here. |
| * |
| * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY |
| * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more |
| * details. |
| * |
| * You should have received a copy of the GNU General Public License along |
| * with Slurm; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| \*****************************************************************************/ |
| |
| #include "config.h" |
| |
| #define _GNU_SOURCE |
| |
| #include <ctype.h> |
| #include <inttypes.h> |
| #include <limits.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <math.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| |
| #ifdef MAJOR_IN_MKDEV |
| # include <sys/mkdev.h> |
| #endif |
| #ifdef MAJOR_IN_SYSMACROS |
| # include <sys/sysmacros.h> |
| #endif |
| |
| #include "slurm/slurm.h" |
| #include "slurm/slurm_errno.h" |
| #include "src/common/assoc_mgr.h" |
| #include "src/common/bitstring.h" |
| #include "src/interfaces/cgroup.h" |
| #include "src/interfaces/gres.h" |
| #include "src/interfaces/gpu.h" |
| #include "src/common/job_resources.h" |
| #include "src/common/list.h" |
| #include "src/common/log.h" |
| #include "src/common/macros.h" |
| #include "src/common/node_conf.h" |
| #include "src/common/pack.h" |
| #include "src/common/parse_config.h" |
| #include "src/common/plugin.h" |
| #include "src/common/plugrack.h" |
| #include "src/common/read_config.h" |
| #include "src/interfaces/select.h" |
| #include "src/common/slurm_protocol_api.h" |
| #include "src/common/slurm_protocol_pack.h" |
| #include "src/common/strlcpy.h" |
| #include "src/common/xmalloc.h" |
| #include "src/common/xsched.h" |
| #include "src/common/xstring.h" |
| |
/* NOTE(review): not referenced in this part of the file; confirm usage */
#define MAX_GRES_BITMAP 1024

/*
 * Export selected symbols under a "slurm_" prefixed alias as well.
 */
strong_alias(gres_find_id,			slurm_gres_find_id);
strong_alias(gres_find_job_by_key_exact_type,	slurm_gres_find_job_by_key_exact_type);
strong_alias(gres_find_sock_by_job_state,	slurm_gres_find_sock_by_job_state);
strong_alias(gres_get_node_used,		slurm_gres_get_node_used);
strong_alias(gres_get_system_cnt,		slurm_gres_get_system_cnt);
strong_alias(gres_get_step_info,		slurm_gres_get_step_info);
strong_alias(gres_sock_delete,			slurm_gres_sock_delete);
strong_alias(gres_job_list_delete,		slurm_gres_job_list_delete);
strong_alias(destroy_gres_device,		slurm_destroy_gres_device);
strong_alias(destroy_gres_slurmd_conf,		slurm_destroy_gres_slurmd_conf);
| |
| static s_p_options_t _gres_options[] = { |
| {"AutoDetect", S_P_STRING}, |
| {"Count", S_P_STRING}, /* Number of Gres available */ |
| {"CPUs" , S_P_STRING}, /* CPUs to bind to Gres resource |
| * (deprecated, use Cores) */ |
| {"Cores", S_P_STRING}, /* Cores to bind to Gres resource */ |
| {"File", S_P_STRING}, /* Path to Gres device */ |
| {"Files", S_P_STRING}, /* Path to Gres device */ |
| {"Flags", S_P_STRING}, /* GRES Flags */ |
| {"Link", S_P_STRING}, /* Communication link IDs */ |
| {"Links", S_P_STRING}, /* Communication link IDs */ |
| {"MultipleFiles", S_P_STRING}, /* list of GRES device files */ |
| {"Name", S_P_STRING}, /* Gres name */ |
| {"Type", S_P_STRING}, /* Gres type (e.g. model name) */ |
| {NULL} |
| }; |
| |
| /* Gres symbols provided by the plugin */ |
| typedef struct slurm_gres_ops { |
| int (*node_config_load) ( list_t *gres_conf_list, |
| node_config_load_t *node_conf); |
| void (*job_set_env) ( char ***job_env_ptr, |
| bitstr_t *gres_bit_alloc, |
| uint64_t gres_cnt, |
| gres_internal_flags_t flags); |
| void (*step_set_env) ( char ***step_env_ptr, |
| bitstr_t *gres_bit_alloc, |
| uint64_t gres_cnt, |
| gres_internal_flags_t flags); |
| void (*task_set_env) ( char ***task_env_ptr, |
| bitstr_t *gres_bit_alloc, |
| uint64_t gres_cnt, |
| bitstr_t *usable_gres, |
| gres_internal_flags_t flags); |
| void (*send_stepd) ( buf_t *buffer ); |
| void (*recv_stepd) ( buf_t *buffer ); |
| list_t *(*get_devices)(void); |
| void (*step_hardware_init) ( bitstr_t *, char * ); |
| void (*step_hardware_fini) ( void ); |
| gres_prep_t *(*prep_build_env)(gres_job_state_t *gres_js); |
| void (*prep_set_env) ( char ***prep_env_ptr, |
| gres_prep_t *gres_prep, |
| int node_inx ); |
| } slurm_gres_ops_t; |
| |
| /* |
| * Gres plugin context, one for each gres type. |
| * Add to gres_context through _add_gres_context(). |
| */ |
| typedef struct slurm_gres_context { |
| plugin_handle_t cur_plugin; |
| uint32_t config_flags; /* See GRES_CONF_* in gres.h */ |
| char * gres_name; /* name (e.g. "gpu") */ |
| char * gres_name_colon; /* name + colon (e.g. "gpu:") */ |
| int gres_name_colon_len; /* size of gres_name_colon */ |
| char * gres_type; /* plugin name (e.g. "gres/gpu") */ |
| list_t *np_gres_devices; /* list of devices when we don't have a plugin */ |
| slurm_gres_ops_t ops; /* pointers to plugin symbols */ |
| uint32_t plugin_id; /* key for searches */ |
| plugrack_t *plugin_list; /* plugrack info */ |
| uint64_t total_cnt; /* Total GRES across all nodes */ |
| } slurm_gres_context_t; |
| |
| typedef struct { |
| uint32_t plugin_id; |
| bool with_type; |
| bool without_type; |
| void *without_type_state; /* gres_[job|step]_state_t */ |
| } overlap_check_t; |
| |
| typedef struct { |
| slurm_gres_context_t *gres_ctx; |
| int new_has_file; |
| int new_has_type; |
| int rec_count; |
| } foreach_gres_conf_t; |
| |
| typedef struct { |
| bitstr_t **gres_bit_alloc; |
| uint64_t gres_cnt; |
| uint64_t **gres_per_bit; |
| bool is_job; |
| int node_inx; |
| uint32_t plugin_id; |
| bool sharing_gres_allocated; |
| } foreach_gres_accumulate_device_t; |
| |
| typedef struct { |
| node_config_load_t *config; |
| list_t **gres_devices; |
| int index; |
| int max_dev_num; |
| list_t *names_list; |
| int rc; |
| } foreach_fill_in_gres_devices_t; |
| |
| typedef struct { |
| char *node_list; |
| list_t *prep_gres_list; |
| } foreach_prep_build_env_t; |
| |
| typedef struct { |
| int node_inx; |
| char ***prep_env_ptr; |
| } foreach_prep_set_env_t; |
| |
| typedef struct { |
| uint32_t core_cnt; |
| int core_end_bit; |
| int core_start_bit; |
| uint32_t job_id; |
| list_t *node_gres_list; |
| char *node_name; |
| bool use_total_gres; |
| } foreach_job_test_t; |
| |
| typedef struct { |
| void *data; |
| enum gres_step_data_type data_type; |
| uint32_t node_inx; |
| uint32_t plugin_id; |
| int rc; |
| } foreach_step_info_t; |
| |
| typedef struct { |
| char *gres_str; |
| char *sep; |
| int sock_inx; |
| } foreach_sock_str_t; |
| |
| typedef struct { |
| list_t *device_list; |
| bitstr_t *gres_bit_alloc; |
| bitstr_t *usable_gres; |
| } foreach_alloc_gres_device_t; |
| |
| typedef struct { |
| bool filter_type; |
| uint64_t gres_cnt; |
| char *gres_type; |
| bool is_job; |
| uint32_t plugin_id; |
| } foreach_gres_list_cnt_t; |
| |
| typedef struct { |
| int job_node_index; |
| list_t *new_gres_list; |
| } foreach_state_list_dup_t; |
| |
| typedef struct { |
| int bitmap_size; |
| int gres_inx; |
| uint32_t plugin_id; |
| bitstr_t *task_cpus_bitmap; |
| bitstr_t *usable_gres; |
| } foreach_closest_usable_gres_t; |
| |
| typedef struct { |
| int best_slot; |
| int gres_inx; |
| bitstr_t *gres_slots; |
| int ntasks_per_gres; |
| bool overlap; |
| uint32_t plugin_id; |
| bitstr_t *task_cpus_bitmap; |
| } foreach_gres_to_task_t; |
| |
| typedef struct { |
| int array_len; |
| uint32_t *gres_count_ids; |
| uint64_t *gres_count_vals; |
| int index; |
| int val_type; |
| } foreach_node_count_t; |
| |
/*
 * Pointers to functions in src/slurmd/common/xcpuinfo.h that we may use.
 * xcpuinfo_ops has external linkage.
 * NOTE(review): where xcpuinfo_ops is populated is not visible in this part
 * of the file.
 */
typedef struct xcpuinfo_funcs {
	int (*xcpuinfo_abs_to_mac)(char *abs, char **mac);
} xcpuinfo_funcs_t;

xcpuinfo_funcs_t xcpuinfo_ops;
| |
| typedef struct { |
| uint32_t flags; |
| uint32_t name_hash; |
| bool no_gpu_env; |
| } prev_gres_flags_t; |
| |
| typedef struct { |
| uint32_t config_flags; |
| int config_type_cnt; |
| uint32_t cpu_set_cnt; |
| uint64_t gres_cnt; |
| uint32_t plugin_id; |
| uint32_t rec_cnt; |
| uint64_t topo_cnt; |
| } tot_from_slurmd_conf_t; |
| |
| typedef struct { |
| int core_cnt; |
| int cores_per_sock; |
| bool cpu_config_err; |
| int cpus_config; |
| uint64_t dev_cnt; |
| slurm_gres_context_t *gres_ctx; |
| gres_node_state_t *gres_ns; |
| int gres_inx; |
| int topo_cnt; |
| bool has_file; |
| char *node_name; |
| int rc; |
| char **reason_down; |
| int sock_cnt; |
| uint64_t tot_gres_cnt; |
| } rebuild_topo_t; |
| |
| typedef struct { |
| slurm_gres_context_t *gres_ctx; |
| gres_node_state_t *gres_ns; |
| } add_gres_info_t; |
| |
| typedef struct { |
| uint64_t count; |
| slurm_gres_context_t *gres_ctx; |
| char *type_name; |
| } conf_cnt_t; |
| |
| typedef struct { |
| list_t *gres_conf_list; |
| slurm_gres_context_t *gres_ctx; |
| } check_conf_t; |
| |
| typedef struct { |
| uint64_t cpu_cnt; |
| list_t *gres_conf_list; |
| slurm_gres_context_t *gres_ctx; |
| list_t *new_list; |
| } merge_gres_t; |
| |
| typedef struct { |
| void *generic_gres_data; |
| bool is_job; |
| uint32_t plugin_id; |
| } merge_generic_t; |
| |
| typedef struct { |
| uint32_t cpus_per_gres; |
| gres_job_state_validate_t *gres_js_val; |
| bool have_gres_shared; |
| bool have_gres_sharing; |
| bool is_job; |
| bool overlap_merge; |
| int over_count; |
| overlap_check_t *over_array; |
| int rc; |
| uint32_t tmp_min_cpus; |
| } job_validate_t; |
| |
| typedef struct { |
| uint32_t job_id; |
| list_t *node_gres_list; |
| int node_inx; |
| char *node_name; |
| } validate_job_gres_cnt_t; |
| |
| typedef struct { |
| int job_node_index; |
| list_t *new_list; |
| } job_state_extract_t; |
| |
| typedef struct { |
| buf_t *buffer; |
| bool details; |
| uint32_t magic; |
| uint16_t protocol_version; |
| } pack_state_t; |
| |
| /* Local variables */ |
| static int gres_context_cnt = -1; |
| static uint32_t gres_cpu_cnt = 0; |
| static slurm_gres_context_t *gres_context = NULL; |
| static char *gres_node_name = NULL; |
| static char *local_plugins_str = NULL; |
| static pthread_mutex_t gres_context_lock = PTHREAD_MUTEX_INITIALIZER; |
| static list_t *gres_conf_list = NULL; |
| static uint32_t gpu_plugin_id = NO_VAL; |
| static volatile uint32_t autodetect_flags = GRES_AUTODETECT_UNSET; |
| static buf_t *gres_context_buf = NULL; |
| static buf_t *gres_conf_buf = NULL; |
| static bool reset_prev = true; |
| static bool use_local_index = false; |
| static bool dev_index_mode_set = false; |
| |
| /* Local functions */ |
| static void _accumulate_job_gres_alloc(gres_job_state_t *gres_js, |
| int node_inx, |
| bitstr_t **gres_bit_alloc, |
| uint64_t *gres_cnt); |
| static void _accumulate_step_gres_alloc(gres_state_t *gres_state_step, |
| bitstr_t **gres_bit_alloc, |
| uint64_t *gres_cnt, |
| uint64_t **gres_per_bit); |
| static void _add_gres_context(char *gres_name); |
| static gres_node_state_t *_build_gres_node_state(void); |
| static void _build_node_gres_str(list_t **gres_list, char **gres_str, |
| int cores_per_sock, int sock_per_node); |
| static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size); |
| static void _prep_list_del(void *x); |
| static void _get_gres_cnt(gres_node_state_t *gres_ns, char *orig_config, |
| char *gres_name, char *gres_name_colon, |
| int gres_name_colon_len); |
| static uint64_t _get_job_gres_list_cnt(list_t *gres_list, char *gres_name, |
| char *gres_type); |
| static void * _job_state_dup2(gres_job_state_t *gres_js, int job_node_index); |
| static int _load_plugin(slurm_gres_context_t *gres_ctx); |
| static int _log_gres_slurmd_conf(void *x, void *arg); |
| static void _my_stat(char *file_name); |
| static void _node_config_init(char *orig_config, |
| slurm_gres_context_t *gres_ctx, |
| gres_state_t *gres_state_node); |
| static char * _node_gres_used(gres_node_state_t *gres_ns, char *gres_name); |
| static int _node_reconfig(char *node_name, char *new_gres, char **gres_str, |
| gres_state_t *gres_state_node, |
| bool config_overrides, |
| slurm_gres_context_t *gres_ctx, |
| bool *updated_gpu_cnt); |
| static int _node_reconfig_test(char *node_name, char *new_gres, |
| gres_state_t *gres_state_node, |
| slurm_gres_context_t *gres_ctx); |
| static void * _node_state_dup(gres_node_state_t *gres_ns); |
| static int _parse_gres_config(void **dest, slurm_parser_enum_t type, |
| const char *key, const char *value, |
| const char *line, char **leftover); |
| static int _parse_gres_config_node(void **dest, slurm_parser_enum_t type, |
| const char *key, const char *value, |
| const char *line, char **leftover); |
| static int _post_plugin_gres_conf(void *x, void *arg); |
| static void * _step_state_dup(gres_step_state_t *gres_ss); |
| static void * _step_state_dup2(gres_step_state_t *gres_ss, |
| int job_node_index); |
| static int _unload_plugin(slurm_gres_context_t *gres_ctx); |
| static void _validate_slurm_conf(list_t *slurm_conf_list, |
| slurm_gres_context_t *gres_ctx); |
| static void _validate_gres_conf(list_t *gres_conf_list, |
| slurm_gres_context_t *gres_ctx); |
| static int _validate_file(char *path_name, char *gres_name); |
| static int _valid_gres_type(char *gres_name, gres_node_state_t *gres_ns, |
| bool config_overrides, char **reason_down); |
| static void _parse_accel_bind_type(uint16_t accel_bind_type, |
| char *tres_bind_str); |
| static int _get_usable_gres(int context_inx, int proc_id, |
| char *tres_bind_str, bitstr_t **usable_gres_ptr, |
| bitstr_t *gres_bit_alloc, bool get_devices, |
| stepd_step_rec_t *step, uint64_t *gres_per_bit, |
| gres_internal_flags_t *flags); |
| |
/*
 * Build a 32-bit identifier from a GRES name.
 *
 * Every character of the name is added into the result at a rotating byte
 * position (shift of 0, 8, 16, 24, then 0 again, ...), so characters beyond
 * the fourth wrap around and accumulate onto the earlier ones.
 *
 * IN name - NUL-terminated GRES name, may be NULL
 * RET the identifier; 0 if name is NULL or empty
 */
extern uint32_t gres_build_id(char *name)
{
	uint32_t id = 0;
	unsigned int shift = 0;

	if (!name)
		return id;

	for (const char *c = name; *c; c++) {
		/*
		 * Shift as uint32_t, not as a promoted (signed) int: left
		 * shifting a negative char, or shifting a bit into the sign
		 * position, is undefined behavior. The conversion keeps the
		 * same bit pattern a two's complement int shift produces, so
		 * existing identifiers are unchanged.
		 */
		id += ((uint32_t) *c) << shift;
		shift = (shift + 8) % 32;
	}

	return id;
}
| |
| extern int gres_find_id(void *x, void *key) |
| { |
| uint32_t *plugin_id = (uint32_t *)key; |
| gres_state_t *state_ptr = (gres_state_t *) x; |
| if (state_ptr->plugin_id == *plugin_id) |
| return 1; |
| return 0; |
| } |
| |
| extern int gres_find_flags(void *x, void *key) |
| { |
| gres_state_t *state_ptr = x; |
| uint32_t flags = *(uint32_t *)key; |
| if (state_ptr->config_flags & flags) |
| return 1; |
| return 0; |
| } |
| |
| /* Find job record with matching name and type */ |
| extern int gres_find_job_by_key_exact_type(void *x, void *key) |
| { |
| gres_state_t *gres_state_job = (gres_state_t *) x; |
| gres_key_t *job_key = (gres_key_t *) key; |
| gres_job_state_t *gres_js; |
| gres_js = (gres_job_state_t *)gres_state_job->gres_data; |
| |
| if ((gres_state_job->plugin_id == job_key->plugin_id) && |
| (gres_js->type_id == job_key->type_id)) |
| return 1; |
| return 0; |
| } |
| |
| /* Find job record with matching name and type */ |
| extern int gres_find_job_by_key(void *x, void *key) |
| { |
| gres_state_t *gres_state_job = (gres_state_t *) x; |
| gres_key_t *job_key = (gres_key_t *) key; |
| gres_job_state_t *gres_js; |
| gres_js = (gres_job_state_t *)gres_state_job->gres_data; |
| |
| if ((gres_state_job->plugin_id == job_key->plugin_id) && |
| ((job_key->type_id == NO_VAL) || |
| (gres_js->type_id == job_key->type_id))) |
| return 1; |
| return 0; |
| } |
| |
| /* Find job record with matching name and type */ |
| extern int gres_find_job_by_key_with_cnt(void *x, void *key) |
| { |
| gres_state_t *gres_state_job = (gres_state_t *) x; |
| gres_key_t *job_key = (gres_key_t *) key; |
| gres_job_state_t *gres_js; |
| gres_js = (gres_job_state_t *)gres_state_job->gres_data; |
| |
| if (!gres_find_job_by_key(x, key)) |
| return 0; |
| |
| /* This gres has been allocated on this node */ |
| if (!gres_js->node_cnt || |
| ((job_key->node_offset < gres_js->node_cnt) && |
| gres_js->gres_cnt_node_alloc[job_key->node_offset])) |
| return 1; |
| |
| return 0; |
| } |
| |
| extern int gres_find_step_by_key(void *x, void *key) |
| { |
| gres_state_t *state_ptr = (gres_state_t *) x; |
| gres_key_t *step_key = (gres_key_t *) key; |
| gres_step_state_t *gres_ss = (gres_step_state_t *)state_ptr->gres_data; |
| |
| if ((state_ptr->plugin_id == step_key->plugin_id) && |
| (gres_ss->type_id == step_key->type_id)) |
| return 1; |
| return 0; |
| } |
| |
| |
| extern bool gres_use_local_device_index(void) |
| { |
| bool use_cgroup = false; |
| |
| if (dev_index_mode_set) |
| return use_local_index; |
| dev_index_mode_set = true; |
| |
| if (!slurm_conf.task_plugin) |
| return use_local_index; |
| |
| if (xstrstr(slurm_conf.task_plugin, "cgroup")) |
| use_cgroup = true; |
| if (!use_cgroup) |
| return use_local_index; |
| |
| cgroup_conf_init(); |
| if (slurm_cgroup_conf.constrain_devices) |
| use_local_index = true; |
| |
| return use_local_index; |
| } |
| |
| extern gres_state_t *gres_create_state(void *src_ptr, |
| gres_state_src_t state_src, |
| gres_state_type_enum_t state_type, |
| void *gres_data) |
| { |
| gres_state_t *new_gres_state = xmalloc(sizeof(gres_state_t)); |
| |
| new_gres_state->gres_data = gres_data; |
| new_gres_state->state_type = state_type; |
| |
| switch (state_src) { |
| case GRES_STATE_SRC_STATE_PTR: |
| { |
| gres_state_t *gres_state = src_ptr; |
| new_gres_state->config_flags = gres_state->config_flags; |
| new_gres_state->plugin_id = gres_state->plugin_id; |
| new_gres_state->gres_name = xstrdup(gres_state->gres_name); |
| break; |
| } |
| case GRES_STATE_SRC_CONTEXT_PTR: |
| { |
| slurm_gres_context_t *gres_ctx = src_ptr; |
| new_gres_state->config_flags = gres_ctx->config_flags; |
| new_gres_state->plugin_id = gres_ctx->plugin_id; |
| new_gres_state->gres_name = xstrdup(gres_ctx->gres_name); |
| break; |
| } |
| case GRES_STATE_SRC_KEY_PTR: |
| { |
| gres_key_t *search_key = src_ptr; |
| new_gres_state->config_flags = search_key->config_flags; |
| new_gres_state->plugin_id = search_key->plugin_id; |
| /* |
| * gres_name should be handled after this since search_key |
| * doesn't have that |
| */ |
| break; |
| } |
| default: |
| error("%s: No way to create gres_state given", __func__); |
| xfree(new_gres_state); |
| break; |
| } |
| |
| return new_gres_state; |
| } |
| |
| /* |
| * Find a gres_context by plugin_id |
| * Must hold gres_context_lock before calling. |
| */ |
| static slurm_gres_context_t *_find_context_by_id(uint32_t plugin_id) |
| { |
| for (int j = 0; j < gres_context_cnt; j++) |
| if (gres_context[j].plugin_id == plugin_id) |
| return &gres_context[j]; |
| return NULL; |
| } |
| |
| static int _load_plugin(slurm_gres_context_t *gres_ctx) |
| { |
| /* |
| * Must be synchronized with slurm_gres_ops_t above. |
| */ |
| static const char *syms[] = { |
| "gres_p_node_config_load", |
| "gres_p_job_set_env", |
| "gres_p_step_set_env", |
| "gres_p_task_set_env", |
| "gres_p_send_stepd", |
| "gres_p_recv_stepd", |
| "gres_p_get_devices", |
| "gres_p_step_hardware_init", |
| "gres_p_step_hardware_fini", |
| "gres_p_prep_build_env", |
| "gres_p_prep_set_env" |
| }; |
| int n_syms = sizeof(syms) / sizeof(char *); |
| |
| /* Find the correct plugin */ |
| if (gres_ctx->config_flags & GRES_CONF_COUNT_ONLY) { |
| debug("Plugin of type %s only tracks gres counts", |
| gres_ctx->gres_type); |
| return SLURM_SUCCESS; |
| } |
| |
| gres_ctx->cur_plugin = plugin_load_and_link( |
| gres_ctx->gres_type, |
| n_syms, syms, |
| (void **) &gres_ctx->ops); |
| if (gres_ctx->cur_plugin != PLUGIN_INVALID_HANDLE) |
| return SLURM_SUCCESS; |
| |
| if (errno != ESLURM_PLUGIN_NOTFOUND) { |
| error("Couldn't load specified plugin name for %s: %s", |
| gres_ctx->gres_type, slurm_strerror(errno)); |
| return SLURM_ERROR; |
| } |
| |
| debug("gres: Couldn't find the specified plugin name for %s looking " |
| "at all files", gres_ctx->gres_type); |
| |
| /* Get plugin list */ |
| if (gres_ctx->plugin_list == NULL) { |
| gres_ctx->plugin_list = plugrack_create("gres"); |
| plugrack_read_dir(gres_ctx->plugin_list, |
| slurm_conf.plugindir); |
| } |
| |
| gres_ctx->cur_plugin = plugrack_use_by_type( |
| gres_ctx->plugin_list, |
| gres_ctx->gres_type ); |
| if (gres_ctx->cur_plugin == PLUGIN_INVALID_HANDLE) { |
| debug("Cannot find plugin of type %s, just track gres counts", |
| gres_ctx->gres_type); |
| gres_ctx->config_flags |= GRES_CONF_COUNT_ONLY; |
| return SLURM_ERROR; |
| } |
| |
| /* Dereference the API. */ |
| if (plugin_get_syms(gres_ctx->cur_plugin, |
| n_syms, syms, |
| (void **) &gres_ctx->ops ) < n_syms ) { |
| error("Incomplete %s plugin detected", |
| gres_ctx->gres_type); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _unload_plugin(slurm_gres_context_t *gres_ctx) |
| { |
| int rc; |
| |
| /* |
| * Must check return code here because plugins might still |
| * be loaded and active. |
| */ |
| if (gres_ctx->plugin_list) |
| rc = plugrack_destroy(gres_ctx->plugin_list); |
| else { |
| rc = SLURM_SUCCESS; |
| plugin_unload(gres_ctx->cur_plugin); |
| } |
| xfree(gres_ctx->gres_name); |
| xfree(gres_ctx->gres_name_colon); |
| xfree(gres_ctx->gres_type); |
| FREE_NULL_LIST(gres_ctx->np_gres_devices); |
| |
| return rc; |
| } |
| |
/*
 * Test whether a GRES name denotes a shared GRES.
 * IN name - GRES name
 * RET true if name is "mps" or "shard", otherwise false
 */
extern bool gres_is_shared_name(char *name)
{
	return (!xstrcmp(name, "mps") || !xstrcmp(name, "shard"));
}
| |
| static void _set_shared_flag(char *name, uint32_t *config_flags) |
| { |
| if (gres_is_shared_name(name)) |
| *config_flags |= GRES_CONF_SHARED; |
| } |
| |
| /* |
| * Add new gres context to gres_context array and load the plugin. |
| * Must hold gres_context_lock before calling. |
| */ |
| static void _add_gres_context(char *gres_name) |
| { |
| slurm_gres_context_t *gres_ctx; |
| |
| if (!gres_name || !gres_name[0]) |
| fatal("%s: invalid empty gres_name", __func__); |
| |
| xrecalloc(gres_context, (gres_context_cnt + 1), |
| sizeof(slurm_gres_context_t)); |
| |
| gres_ctx = &gres_context[gres_context_cnt]; |
| _set_shared_flag(gres_name, &gres_ctx->config_flags); |
| gres_ctx->gres_name = xstrdup(gres_name); |
| gres_ctx->plugin_id = gres_build_id(gres_name); |
| gres_ctx->gres_type = xstrdup_printf("gres/%s", gres_name); |
| gres_ctx->plugin_list = NULL; |
| gres_ctx->cur_plugin = PLUGIN_INVALID_HANDLE; |
| |
| gres_context_cnt++; |
| } |
| |
| /* |
| * Initialize the GRES plugins. |
| * |
| * Returns a Slurm errno. |
| */ |
| extern int gres_init(void) |
| { |
| int i, j, rc = SLURM_SUCCESS; |
| char *last = NULL, *names, *one_name, *full_name; |
| char *sorted_names = NULL, *sep = "", *shared_names = NULL; |
| bool have_gpu = false, have_shared = false; |
| char *shared_sep = ""; |
| |
| slurm_mutex_lock(&gres_context_lock); |
| |
| if (gres_context_cnt >= 0) |
| goto fini; |
| |
| local_plugins_str = xstrdup(slurm_conf.gres_plugins); |
| gres_context_cnt = 0; |
| if ((local_plugins_str == NULL) || (local_plugins_str[0] == '\0')) |
| goto fini; |
| |
| /* Ensure that "gres/'shared'" follows "gres/gpu" */ |
| have_gpu = false; |
| have_shared = false; |
| names = xstrdup(local_plugins_str); |
| one_name = strtok_r(names, ",", &last); |
| while (one_name) { |
| bool skip_name = false; |
| if (gres_is_shared_name(one_name)) { |
| have_shared = true; |
| if (!have_gpu) { |
| /* "shared" must follow "gpu" */ |
| skip_name = true; |
| xstrfmtcat(shared_names, "%s%s", |
| shared_sep, one_name); |
| shared_sep = ","; |
| } |
| } else if (!xstrcmp(one_name, "gpu")) { |
| have_gpu = true; |
| gpu_plugin_id = gres_build_id("gpu"); |
| } |
| if (!skip_name) { |
| xstrfmtcat(sorted_names, "%s%s", sep, one_name); |
| sep = ","; |
| } |
| one_name = strtok_r(NULL, ",", &last); |
| } |
| if (shared_names) { |
| if (!have_gpu) |
| fatal("GresTypes: gres/'shared' requires that gres/gpu also be configured"); |
| xstrfmtcat(sorted_names, "%s%s", sep, shared_names); |
| xfree(shared_names); |
| } |
| xfree(names); |
| |
| gres_context_cnt = 0; |
| one_name = strtok_r(sorted_names, ",", &last); |
| while (one_name) { |
| full_name = xstrdup("gres/"); |
| xstrcat(full_name, one_name); |
| for (i = 0; i < gres_context_cnt; i++) { |
| if (!xstrcmp(full_name, gres_context[i].gres_type)) |
| break; |
| } |
| xfree(full_name); |
| if (i < gres_context_cnt) { |
| error("Duplicate plugin %s ignored", |
| gres_context[i].gres_type); |
| } else { |
| _add_gres_context(one_name); |
| } |
| one_name = strtok_r(NULL, ",", &last); |
| } |
| xfree(sorted_names); |
| |
| /* Ensure that plugin_id is valid and unique */ |
| for (i = 0; i < gres_context_cnt; i++) { |
| for (j = i + 1; j < gres_context_cnt; j++) { |
| if (gres_context[i].plugin_id != |
| gres_context[j].plugin_id) |
| continue; |
| fatal("Gres: Duplicate plugin_id %u for %s and %s, " |
| "change gres name for one of them", |
| gres_context[i].plugin_id, |
| gres_context[i].gres_type, |
| gres_context[j].gres_type); |
| } |
| xassert(gres_context[i].gres_name); |
| |
| gres_context[i].gres_name_colon = |
| xstrdup_printf("%s:", gres_context[i].gres_name); |
| gres_context[i].gres_name_colon_len = |
| strlen(gres_context[i].gres_name_colon); |
| } |
| |
| fini: |
| if (have_shared && running_in_slurmctld() && !running_cons_tres()) { |
| fatal("Use of shared gres requires the use of select/cons_tres"); |
| } |
| |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return rc; |
| } |
| |
| extern int gres_get_gres_cnt(void) |
| { |
| static int cnt = -1; |
| |
| if (cnt != -1) |
| return cnt; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| cnt = gres_context_cnt; |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return cnt; |
| } |
| |
| /* |
| * Add a GRES record. This is used by the node_features plugin after the |
| * slurm.conf file is read and the initial GRES records are built by |
| * gres_init(). |
| */ |
| extern void gres_add(char *gres_name) |
| { |
| int i; |
| |
| slurm_mutex_lock(&gres_context_lock); |
| for (i = 0; i < gres_context_cnt; i++) { |
| if (!xstrcmp(gres_context[i].gres_name, gres_name)) |
| goto fini; |
| } |
| |
| _add_gres_context(gres_name); |
| fini: slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| /* Given a gres_name, return its context index or -1 if not found */ |
| static int _gres_name_context(char *gres_name) |
| { |
| int i; |
| |
| for (i = 0; i < gres_context_cnt; i++) { |
| if (!xstrcmp(gres_context[i].gres_name, gres_name)) |
| return i; |
| } |
| |
| return -1; |
| } |
| |
| /* |
| * Takes a GRES config line (typically from slurm.conf) and remove any |
| * records for GRES which are not defined in GresTypes. |
| * RET string of valid GRES, Release memory using xfree() |
| */ |
| extern char *gres_name_filter(char *orig_gres, char *nodes) |
| { |
| char *new_gres = NULL, *save_ptr = NULL; |
| char *colon, *sep = "", *tmp, *tok, *name; |
| |
| slurm_mutex_lock(&gres_context_lock); |
| if (!orig_gres || !orig_gres[0] || !gres_context_cnt) { |
| slurm_mutex_unlock(&gres_context_lock); |
| return new_gres; |
| } |
| |
| tmp = xstrdup(orig_gres); |
| tok = strtok_r(tmp, ",", &save_ptr); |
| while (tok) { |
| name = xstrdup(tok); |
| if ((colon = strchr(name, ':'))) |
| colon[0] = '\0'; |
| if (_gres_name_context(name) != -1) { |
| xstrfmtcat(new_gres, "%s%s", sep, tok); |
| sep = ","; |
| } else { |
| /* Logging may not be initialized at this point */ |
| error("Invalid GRES configured on node %s: %s", nodes, |
| tok); |
| } |
| xfree(name); |
| tok = strtok_r(NULL, ",", &save_ptr); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| xfree(tmp); |
| |
| return new_gres; |
| } |
| |
| /* |
| * Terminate the gres plugin. Free memory. |
| * |
| * Returns a Slurm errno. |
| */ |
| extern int gres_fini(void) |
| { |
| int i, j, rc = SLURM_SUCCESS; |
| |
| slurm_mutex_lock(&gres_context_lock); |
| xfree(gres_node_name); |
| if (gres_context_cnt < 0) |
| goto fini; |
| |
| for (i = 0; i < gres_context_cnt; i++) { |
| j = _unload_plugin(gres_context + i); |
| if (j != SLURM_SUCCESS) |
| rc = j; |
| } |
| xfree(gres_context); |
| xfree(local_plugins_str); |
| FREE_NULL_LIST(gres_conf_list); |
| FREE_NULL_BUFFER(gres_context_buf); |
| FREE_NULL_BUFFER(gres_conf_buf); |
| gres_context_cnt = -1; |
| |
| fini: slurm_mutex_unlock(&gres_context_lock); |
| return rc; |
| } |
| |
| /* |
| * ************************************************************************ |
| * P L U G I N C A L L S * |
| * ************************************************************************ |
| */ |
| |
| /* |
| * Return a plugin-specific help message for salloc, sbatch and srun |
| * Result must be xfree()'d. |
| * |
| * NOTE: GRES "type" (e.g. model) information is only available from slurmctld |
| * after slurmd registers. It is not readily available from srun (as used here). |
| */ |
| extern char *gres_help_msg(void) |
| { |
| int i; |
| char *msg = xstrdup("Valid gres options are:\n"); |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| for (i = 0; i < gres_context_cnt; i++) { |
| xstrcat(msg, gres_context[i].gres_name); |
| xstrcat(msg, "[[:type]:count]\n"); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return msg; |
| } |
| |
| /* |
| * Perform reconfig, re-read any configuration files |
| * OUT did_change - set if gres configuration changed |
| */ |
| extern int gres_reconfig(void) |
| { |
| int rc = SLURM_SUCCESS; |
| bool plugin_change; |
| |
| slurm_mutex_lock(&gres_context_lock); |
| |
| if (xstrcmp(slurm_conf.gres_plugins, local_plugins_str)) |
| plugin_change = true; |
| else |
| plugin_change = false; |
| |
| reset_prev = true; |
| |
| /* Reset the flags so when the node checks in we believe that */ |
| for (int i = 0; i < gres_context_cnt; i++) |
| gres_context[i].config_flags |= GRES_CONF_FROM_STATE; |
| |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| if (plugin_change) { |
| error("GresPlugins changed from %s to %s ignored", |
| local_plugins_str, slurm_conf.gres_plugins); |
| error("Restart the slurmctld daemon to change GresPlugins"); |
| #if 0 |
| /* This logic would load new plugins, but we need the old |
| * plugins to persist in order to process old state |
| * information. */ |
| rc = gres_fini(); |
| if (rc == SLURM_SUCCESS) |
| rc = gres_init(); |
| #endif |
| } |
| |
| return rc; |
| } |
| |
| /* Return 1 if a gres_conf record is the correct plugin_id and has no file */ |
| static int _find_fileless_gres(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = (gres_slurmd_conf_t *)x; |
| uint32_t plugin_id = *(uint32_t *)arg; |
| |
| if ((gres_slurmd_conf->plugin_id == plugin_id) && |
| !gres_slurmd_conf->file) { |
| warning("Ignoring file-less GPU %s:%s from final GRES list", |
| gres_slurmd_conf->name, gres_slurmd_conf->type_name); |
| return 1; |
| } |
| return 0; |
| |
| } |
| |
| /* |
| * Log the contents of a gres_slurmd_conf_t record |
| */ |
| static int _log_gres_slurmd_conf(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *p; |
| int index = -1, offset, mult = 1; |
| |
| p = (gres_slurmd_conf_t *) x; |
| xassert(p); |
| |
| if (!(slurm_conf.debug_flags & DEBUG_FLAG_GRES)) { |
| verbose("Gres Name=%s Type=%s Count=%"PRIu64" Flags=%s", |
| p->name, p->type_name, p->count, |
| gres_flags2str(p->config_flags)); |
| return 0; |
| } |
| |
| if (p->file) { |
| index = 0; |
| offset = strlen(p->file); |
| while (offset > 0) { |
| offset--; |
| if ((p->file[offset] < '0') || (p->file[offset] > '9')) |
| break; |
| index += (p->file[offset] - '0') * mult; |
| mult *= 10; |
| } |
| } |
| |
| if (p->cpus && (index != -1)) { |
| info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u File=%s Cores=%s CoreCnt=%u Links=%s Flags=%s", |
| p->name, |
| p->type_name, |
| p->count, |
| index, |
| p->plugin_id, |
| p->file, |
| p->cpus, |
| p->cpu_cnt, |
| p->links, |
| gres_flags2str(p->config_flags)); |
| } else if (index != -1) { |
| info("Gres Name=%s Type=%s Count=%"PRIu64" Index=%d ID=%u File=%s Links=%s Flags=%s", |
| p->name, |
| p->type_name, |
| p->count, |
| index, |
| p->plugin_id, |
| p->file, |
| p->links, |
| gres_flags2str(p->config_flags)); |
| } else if (p->file) { |
| info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u File=%s Links=%s Flags=%s", |
| p->name, |
| p->type_name, |
| p->count, |
| p->plugin_id, |
| p->file, |
| p->links, |
| gres_flags2str(p->config_flags)); |
| } else { |
| info("Gres Name=%s Type=%s Count=%"PRIu64" ID=%u Links=%s Flags=%s", |
| p->name, |
| p->type_name, |
| p->count, |
| p->plugin_id, |
| p->links, |
| gres_flags2str(p->config_flags)); |
| } |
| |
| return 0; |
| } |
| |
| |
| static int _post_plugin_gres_conf(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| slurm_gres_context_t *gres_ctx = arg; |
| |
| if (gres_slurmd_conf->plugin_id != gres_ctx->plugin_id) |
| return 0; |
| |
| if (gres_slurmd_conf->config_flags & GRES_CONF_GLOBAL_INDEX) |
| gres_ctx->config_flags |= GRES_CONF_GLOBAL_INDEX; |
| |
| return 1; |
| } |
| |
/*
 * Make sure the named device file exists.
 *
 * Does nothing unless running_in_slurmd_stepd(). Otherwise stat() is tried up
 * to 20 times, one second apart (about 19 seconds of waiting), for as long as
 * the failure is ENOENT. Any other stat() error, or running out of attempts,
 * ends in fatal().
 *
 * file_name - (in) path to check
 */
static void _my_stat(char *file_name)
{
	struct stat stat_buf;
	bool warned = false;

	if (!running_in_slurmd_stepd())
		return;

	for (int attempt = 0; attempt < 20; attempt++) {
		if (attempt > 0)
			sleep(1);

		if (!stat(file_name, &stat_buf)) {
			/* Only report success if we complained earlier */
			if (warned)
				info("gres.conf file %s now exists", file_name);
			return;
		}

		/* Only a missing file is worth waiting for */
		if (errno != ENOENT)
			break;

		if (!warned) {
			error("Waiting for gres.conf file %s", file_name);
			warned = true;
		}
	}

	fatal("can't stat gres.conf file %s: %m", file_name);
}
| |
| static int _validate_file(char *filenames, char *gres_name) |
| { |
| char *one_name; |
| hostlist_t *hl; |
| int file_count = 0; |
| |
| if (!(hl = hostlist_create(filenames))) |
| fatal("can't parse File=%s", filenames); |
| |
| while ((one_name = hostlist_shift(hl))) { |
| _my_stat(one_name); |
| file_count++; |
| free(one_name); |
| } |
| |
| hostlist_destroy(hl); |
| |
| return file_count; |
| } |
| |
| /* |
| * Create and return a comma-separated zeroed-out links string with a -1 in the |
| * given GPU position indicated by index. Caller must xfree() the returned |
| * string. |
| * |
| * Used to record the enumeration order (PCI bus ID order) of GPUs for sorting, |
| * even when the GPU does not support nvlinks. E.g. for three total GPUs, their |
| * links strings would look like this: |
| * |
| * GPU at index 0: -1,0,0 |
| * GPU at index 1: 0,-1,0 |
| * GPU at index 2: -0,0,-1 |
| */ |
| extern char *gres_links_create_empty(unsigned int index, |
| unsigned int device_count) |
| { |
| char *links_str = NULL; |
| |
| for (unsigned int i = 0; i < device_count; ++i) { |
| xstrfmtcat(links_str, "%s%d", |
| i ? "," : "", |
| (i == index) ? -1 : 0); |
| } |
| |
| return links_str; |
| } |
| |
| /* |
| * Check that we have a comma-delimited list of numbers, and return the index of |
| * the GPU (-1) in the links string. |
| * |
| * Returns a non-zero-based index of the GPU in the links string, if found. |
| * If not found, returns a negative value. |
| * Return values: |
| * 0+: GPU index |
| * -1: links string is NULL. |
| * -2: links string is not NULL, but is invalid. Possible invalid reasons: |
| * * error parsing the comma-delimited links string |
| * * links string is an empty string |
| * * the 'self' GPU identifier isn't found (i.e. no -1) |
| * * there is more than one 'self' GPU identifier found |
| */ |
| extern int gres_links_validate(char *links) |
| { |
| char *tmp, *tok, *save_ptr = NULL, *end_ptr = NULL; |
| long int val; |
| int rc; |
| int i; |
| |
| if (!links) |
| return -1; |
| if (links[0] == '\0') { |
| error("%s: Links is an empty string", __func__); |
| return -2; |
| } |
| |
| tmp = xstrdup(links); |
| tok = strtok_r(tmp, ",", &save_ptr); |
| rc = -1; |
| i = 0; |
| while (tok) { |
| val = strtol(tok, &end_ptr, 10); |
| if ((val < -2) || (val > GRES_MAX_LINK) || (val == LONG_MIN) || |
| (end_ptr[0] != '\0')) { |
| error("%s: Failed to parse token '%s' in links string '%s'", |
| __func__, tok, links); |
| rc = -2; |
| break; |
| } |
| if (val == -1) { |
| if (rc != -1) { |
| error("%s: links string '%s' has more than one -1", |
| __func__, links); |
| rc = -2; |
| break; |
| } |
| rc = i; |
| } |
| i++; |
| tok = strtok_r(NULL, ",", &save_ptr); |
| } |
| xfree(tmp); |
| |
| /* If the current GPU (-1) wasn't found, that's an error */ |
| if (rc == -1) { |
| error("%s: -1 wasn't found in links string '%s'", __func__, |
| links); |
| rc = -2; |
| } |
| |
| return rc; |
| } |
| |
| static char *_get_autodetect_flags_str(void) |
| { |
| char *flags = NULL; |
| |
| if (!(autodetect_flags & GRES_AUTODETECT_GPU_FLAGS)) |
| xstrfmtcat(flags, "%sunset", flags ? "," : ""); |
| else { |
| if (autodetect_flags & GRES_AUTODETECT_GPU_NVML) |
| xstrfmtcat(flags, "%snvml", flags ? "," : ""); |
| else if (autodetect_flags & GRES_AUTODETECT_GPU_RSMI) |
| xstrfmtcat(flags, "%srsmi", flags ? "," : ""); |
| else if (autodetect_flags & GRES_AUTODETECT_GPU_ONEAPI) |
| xstrfmtcat(flags, "%soneapi", flags ? "," : ""); |
| else if (autodetect_flags & GRES_AUTODETECT_GPU_NRT) |
| xstrfmtcat(flags, "%snrt", flags ? "," : ""); |
| else if (autodetect_flags & GRES_AUTODETECT_GPU_NVIDIA) |
| xstrfmtcat(flags, "%snvidia", flags ? "," : ""); |
| else if (autodetect_flags & GRES_AUTODETECT_GPU_OFF) |
| xstrfmtcat(flags, "%soff", flags ? "," : ""); |
| } |
| |
| return flags; |
| } |
| |
| static uint32_t _handle_autodetect_flags(char *str) |
| { |
| uint32_t flags = 0; |
| |
| /* Set the node-local gpus value of autodetect_flags */ |
| if (xstrcasestr(str, "nvml")) |
| flags |= GRES_AUTODETECT_GPU_NVML; |
| else if (xstrcasestr(str, "rsmi")) |
| flags |= GRES_AUTODETECT_GPU_RSMI; |
| else if (xstrcasestr(str, "oneapi")) |
| flags |= GRES_AUTODETECT_GPU_ONEAPI; |
| else if (xstrcasestr(str, "nrt")) |
| flags |= GRES_AUTODETECT_GPU_NRT; |
| else if (xstrcasestr(str, "nvidia")) |
| flags |= GRES_AUTODETECT_GPU_NVIDIA; |
| else if (!xstrcasecmp(str, "off")) |
| flags |= GRES_AUTODETECT_GPU_OFF; |
| else |
| error("unknown autodetect flag '%s'", str); |
| |
| return flags; |
| } |
| |
| static void _handle_local_autodetect(char *str) |
| { |
| uint32_t autodetect_flags_local = _handle_autodetect_flags(str); |
| |
| /* Only set autodetect_flags once locally, unless it's the same val */ |
| if ((autodetect_flags != GRES_AUTODETECT_UNSET) && |
| (autodetect_flags != autodetect_flags_local)) { |
| fatal("gres.conf: duplicate node-local AutoDetect specification does not match the first"); |
| return; |
| } |
| |
| /* Set the node-local gpus value of autodetect_flags */ |
| autodetect_flags |= autodetect_flags_local; |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_GRES) { |
| char *flags = _get_autodetect_flags_str(); |
| log_flag(GRES, "Using node-local AutoDetect=%s(%d)", |
| flags, autodetect_flags); |
| xfree(flags); |
| } |
| } |
| |
| static void _handle_global_autodetect(char *str) |
| { |
| /* If GPU flags exist, node-local value was already specified */ |
| if (autodetect_flags & GRES_AUTODETECT_GPU_FLAGS) |
| debug2("gres.conf: AutoDetect GPU flags were locally set, so ignoring global flags"); |
| else |
| autodetect_flags |= _handle_autodetect_flags(str); |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_GRES) { |
| char *flags = _get_autodetect_flags_str(); |
| log_flag(GRES, "Global AutoDetect=%s(%d)", |
| flags, autodetect_flags); |
| xfree(flags); |
| } |
| } |
| |
| static int _get_match(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf1 = x; |
| gres_slurmd_conf_t *gres_slurmd_conf2 = arg; |
| |
| /* We only need to check type name because they should all be gpus */ |
| if (!gres_slurmd_conf1->type_name && !gres_slurmd_conf2->type_name) |
| return 1; |
| |
| if (!gres_slurmd_conf1->type_name || !gres_slurmd_conf2->type_name) |
| return 0; |
| |
| if (!xstrcmp(gres_slurmd_conf1->type_name, |
| gres_slurmd_conf2->type_name)) |
| return 1; |
| |
| return 0; |
| } |
| |
| static int _merge_by_type(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x, *merged_gres_slurmd_conf; |
| list_t *gres_list_merged = arg; |
| |
| merged_gres_slurmd_conf = list_find_first(gres_list_merged, _get_match, |
| gres_slurmd_conf); |
| |
| /* We are merging types and don't care about files or links */ |
| if (merged_gres_slurmd_conf) |
| merged_gres_slurmd_conf->count++; |
| else |
| list_append(gres_list_merged, gres_slurmd_conf); |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static int _slurm_conf_gres_str(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| char **gres_str = arg; |
| if (gres_slurmd_conf && gres_slurmd_conf->name) { |
| bool has_type = gres_slurmd_conf->type_name && |
| gres_slurmd_conf->type_name[0]; |
| xstrfmtcat(*gres_str, "%s%s:%s%s%ld", |
| gres_str && gres_str[0] ? "," : "", |
| gres_slurmd_conf->name, |
| has_type ? gres_slurmd_conf->type_name : "", |
| has_type ? ":" : "", |
| gres_slurmd_conf->count); |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| extern void gres_get_autodetected_gpus(node_config_load_t node_conf, |
| char **first_gres_str, |
| char **autodetect_str) |
| { |
| list_t *gres_list_system = NULL, *gres_list_merged = NULL; |
| |
| char *gres_str = NULL; |
| char *autodetect_option_name = NULL; |
| |
| int autodetect_options[] = { |
| GRES_AUTODETECT_GPU_NVML, |
| GRES_AUTODETECT_GPU_NVIDIA, |
| GRES_AUTODETECT_GPU_RSMI, |
| GRES_AUTODETECT_GPU_ONEAPI, |
| GRES_AUTODETECT_GPU_NRT, |
| GRES_AUTODETECT_UNSET /* For loop is done */ |
| }; |
| |
| for (int i = 0; autodetect_options[i] != GRES_AUTODETECT_UNSET; i++) { |
| autodetect_flags = autodetect_options[i]; |
| if (gpu_plugin_init() != SLURM_SUCCESS) |
| continue; |
| gres_list_system = gpu_g_get_system_gpu_list(&node_conf); |
| if (gres_list_system) { |
| gres_list_merged = list_create(NULL); |
| list_for_each(gres_list_system, _merge_by_type, |
| gres_list_merged); |
| list_for_each(gres_list_merged, _slurm_conf_gres_str, |
| &gres_str); |
| } |
| FREE_NULL_LIST(gres_list_merged); |
| FREE_NULL_LIST(gres_list_system); |
| gpu_plugin_fini(); |
| |
| if (!gres_str) |
| continue; |
| |
| if (autodetect_flags == GRES_AUTODETECT_GPU_NVML) |
| i++; /* Skip NVIDIA if NVML finds gpus */ |
| |
| autodetect_option_name = _get_autodetect_flags_str(); |
| xstrfmtcat(*autodetect_str, "%sFound %s with Autodetect=%s (Substring of gpu name may be used instead)", |
| (*autodetect_str ? "\n" : ""), |
| gres_str, |
| autodetect_option_name); |
| xfree(autodetect_option_name); |
| |
| if (!*first_gres_str){ |
| *first_gres_str = gres_str; |
| gres_str = NULL; |
| } else { |
| xfree(gres_str); |
| } |
| } |
| } |
| |
| /* |
| * Check to see if current GRES record matches the name of the previous GRES |
| * record that set env flags. |
| */ |
| static bool _same_gres_name_as_prev(prev_gres_flags_t *prev_gres, |
| gres_slurmd_conf_t *p) |
| { |
| if ((gres_build_id(p->name) == prev_gres->name_hash)) |
| return true; |
| else |
| return false; |
| } |
| |
| /* |
| * Save off env flags, GRES name, and no_gpu_env (for the next gres.conf line to |
| * possibly inherit or to check against). |
| */ |
| static void _set_prev_gres_flags(prev_gres_flags_t *prev_gres, |
| gres_slurmd_conf_t *p, uint32_t env_flags, |
| bool no_gpu_env) |
| { |
| prev_gres->flags = env_flags; |
| prev_gres->name_hash = gres_build_id(p->name); |
| prev_gres->no_gpu_env = no_gpu_env; |
| } |
| |
| /* |
| * Parse a gres.conf Flags string |
| */ |
| extern uint32_t gres_flags_parse(char *input, bool *no_gpu_env, |
| bool *sharing_mentioned) |
| { |
| uint32_t flags = 0; |
| if (xstrcasestr(input, "CountOnly")) |
| flags |= GRES_CONF_COUNT_ONLY; |
| if (xstrcasestr(input, "nvidia_gpu_env")) |
| flags |= GRES_CONF_ENV_NVML; |
| if (xstrcasestr(input, "amd_gpu_env")) |
| flags |= GRES_CONF_ENV_RSMI; |
| if (xstrcasestr(input, "intel_gpu_env")) |
| flags |= GRES_CONF_ENV_ONEAPI; |
| if (xstrcasestr(input, "opencl_env")) |
| flags |= GRES_CONF_ENV_OPENCL; |
| if (xstrcasestr(input, "one_sharing")) |
| flags |= GRES_CONF_ONE_SHARING; |
| if (xstrcasestr(input, "explicit")) |
| flags |= GRES_CONF_EXPLICIT; |
| /* String 'no_gpu_env' will clear all GPU env vars */ |
| if (no_gpu_env) |
| *no_gpu_env = xstrcasestr(input, "no_gpu_env"); |
| if (sharing_mentioned) { |
| if ((flags & GRES_CONF_ONE_SHARING) || |
| xstrcasestr(input, "all_sharing")) |
| *sharing_mentioned = true; |
| } |
| return flags; |
| } |
| |
| /* |
| * Build gres_slurmd_conf_t record based upon a line from the gres.conf file |
| */ |
| static int _parse_gres_config(void **dest, slurm_parser_enum_t type, |
| const char *key, const char *value, |
| const char *line, char **leftover) |
| { |
| int i; |
| s_p_hashtbl_t *tbl; |
| gres_slurmd_conf_t *p; |
| uint64_t tmp_uint64, mult; |
| char *tmp_str, *last; |
| bool cores_flag = false, cpus_flag = false; |
| char *type_str = NULL; |
| char *autodetect_string = NULL; |
| bool autodetect = false, set_default_envs = true; |
| /* Remember the last-set Flags value */ |
| static prev_gres_flags_t prev_gres = { 0 }; |
| |
| if (reset_prev) { |
| memset(&prev_gres, 0, sizeof(prev_gres)); |
| reset_prev = false; |
| } |
| |
| tbl = s_p_hashtbl_create(_gres_options); |
| s_p_parse_line(tbl, *leftover, leftover); |
| |
| p = xmalloc(sizeof(gres_slurmd_conf_t)); |
| |
| /* |
| * Detect and set the node-local AutoDetect option only if |
| * NodeName is specified. |
| */ |
| if (s_p_get_string(&autodetect_string, "AutoDetect", tbl)) { |
| if (value) |
| error("gres.conf: In-line AutoDetect requires NodeName to take effect"); |
| else { |
| _handle_local_autodetect(autodetect_string); |
| /* AutoDetect was specified w/ NodeName */ |
| autodetect = true; |
| } |
| xfree(autodetect_string); |
| } |
| |
| if (!value) { |
| if (!s_p_get_string(&p->name, "Name", tbl)) { |
| if (!autodetect) |
| error("Invalid GRES data, no type name (%s)", |
| line); |
| xfree(p); |
| s_p_hashtbl_destroy(tbl); |
| return 0; |
| } |
| } else { |
| p->name = xstrdup(value); |
| } |
| |
| if (s_p_get_string(&p->type_name, "Type", tbl)) { |
| p->config_flags |= GRES_CONF_HAS_TYPE; |
| } |
| |
| p->cpu_cnt = gres_cpu_cnt; |
| if (s_p_get_string(&p->cpus, "Cores", tbl)) { |
| cores_flag = true; |
| type_str = "Cores"; |
| } else if (s_p_get_string(&p->cpus, "CPUs", tbl)) { |
| cpus_flag = true; |
| type_str = "CPUs"; |
| } |
| if (cores_flag || cpus_flag) { |
| char *local_cpus = NULL; |
| if (xcpuinfo_ops.xcpuinfo_abs_to_mac) { |
| i = (xcpuinfo_ops.xcpuinfo_abs_to_mac) |
| (p->cpus, &local_cpus); |
| if (i != SLURM_SUCCESS) { |
| error("Invalid GRES data for %s, %s=%s", |
| p->name, type_str, p->cpus); |
| } |
| } else { |
| /* |
| * Not converting Cores into machine format is only for |
| * testing or if we don't care about cpus_bitmap. The |
| * slurmd should always convert to machine format. |
| */ |
| debug("%s: %s=%s is not being converted to machine-local format", |
| __func__, type_str, p->cpus); |
| local_cpus = xstrdup(p->cpus); |
| i = SLURM_SUCCESS; |
| } |
| if (i == SLURM_SUCCESS) { |
| p->cpus_bitmap = bit_alloc(gres_cpu_cnt); |
| if (!bit_size(p->cpus_bitmap) || |
| bit_unfmt(p->cpus_bitmap, local_cpus)) { |
| fatal("Invalid GRES data for %s, %s=%s (only %u CPUs are available)", |
| p->name, type_str, p->cpus, gres_cpu_cnt); |
| } |
| } |
| xfree(local_cpus); |
| } |
| |
| if (s_p_get_string(&p->file, "File", tbl) || |
| s_p_get_string(&p->file, "Files", tbl)) { |
| p->count = _validate_file(p->file, p->name); |
| p->config_flags |= GRES_CONF_HAS_FILE; |
| } |
| |
| if (s_p_get_string(&p->file, "MultipleFiles", tbl)) { |
| int file_count = 0; |
| if (p->config_flags & GRES_CONF_HAS_FILE) |
| fatal("File and MultipleFiles options are mutually exclusive"); |
| p->count = 1; |
| file_count = _validate_file(p->file, p->name); |
| if (file_count < 2) |
| fatal("MultipleFiles does not contain multiple files. Use File instead"); |
| p->config_flags |= GRES_CONF_HAS_FILE; |
| p->config_flags |= GRES_CONF_HAS_MULT; |
| } |
| |
| if (s_p_get_string(&tmp_str, "Flags", tbl)) { |
| uint32_t env_flags = 0; |
| bool no_gpu_env = false; |
| bool sharing_mentioned = false; |
| uint32_t flags = gres_flags_parse(tmp_str, &no_gpu_env, |
| &sharing_mentioned); |
| |
| /* The default for MPS is to have only one gpu sharing */ |
| if (!sharing_mentioned && !xstrcasecmp(p->name, "mps")) |
| flags |= GRES_CONF_ONE_SHARING; |
| |
| /* Break out flags into env flags and non-env flags */ |
| env_flags = flags & GRES_CONF_ENV_SET; |
| p->config_flags |= flags; |
| |
| if (env_flags && no_gpu_env) |
| fatal("Invalid GRES record name=%s type=%s: Flags (%s) contains \"no_gpu_env\", which must be mutually exclusive to all other GRES env flags of same node and name", |
| p->name, p->type_name, tmp_str); |
| |
| set_default_envs = false; |
| /* |
| * Make sure that Flags are consistent with each other |
| * if set for multiple lines of the same GRES. |
| */ |
| if (prev_gres.name_hash && |
| _same_gres_name_as_prev(&prev_gres, p) && |
| ((prev_gres.flags != flags) || |
| (prev_gres.no_gpu_env != no_gpu_env))) |
| fatal("Invalid GRES record name=%s type=%s: Flags (%s) does not match env flags for previous GRES of same node and name", |
| p->name, p->type_name, tmp_str); |
| |
| _set_prev_gres_flags(&prev_gres, p, flags, |
| no_gpu_env); |
| |
| xfree(tmp_str); |
| } else if ((prev_gres.flags || prev_gres.no_gpu_env) && |
| _same_gres_name_as_prev(&prev_gres, p)) { |
| /* Inherit flags from previous GRES line with same name */ |
| set_default_envs = false; |
| p->config_flags |= prev_gres.flags; |
| } else { |
| if (!xstrcasecmp(p->name, "mps")) |
| p->config_flags |= GRES_CONF_ONE_SHARING; |
| } |
| |
| /* Flags not set. By default, all env vars are set for GPUs */ |
| if (set_default_envs && !xstrcasecmp(p->name, "gpu")) { |
| uint32_t env_flags = GRES_CONF_ENV_SET | GRES_CONF_ENV_DEF; |
| p->config_flags |= env_flags; |
| _set_prev_gres_flags(&prev_gres, p, env_flags, false); |
| } |
| |
| if (s_p_get_string(&p->links, "Link", tbl) || |
| s_p_get_string(&p->links, "Links", tbl)) { |
| if (gres_links_validate(p->links) < -1) { |
| error("gres.conf: Ignoring invalid Links=%s for Name=%s", |
| p->links, p->name); |
| xfree(p->links); |
| } |
| |
| } |
| |
| _set_shared_flag(p->name, &p->config_flags); |
| |
| if (s_p_get_string(&tmp_str, "Count", tbl)) { |
| tmp_uint64 = strtoll(tmp_str, &last, 10); |
| if ((tmp_uint64 == LONG_MIN) || (tmp_uint64 == LONG_MAX)) { |
| fatal("Invalid GRES record for %s, invalid count %s", |
| p->name, tmp_str); |
| } |
| if ((mult = suffix_mult(last)) != NO_VAL64) { |
| tmp_uint64 *= mult; |
| } else { |
| fatal("Invalid GRES record for %s, invalid count %s", |
| p->name, tmp_str); |
| } |
| /* |
| * Some GRES can have count > 1 for a given file. For example, |
| * each GPU can have arbitrary count of MPS elements. |
| */ |
| if (p->count && (p->count != tmp_uint64) && |
| !gres_id_shared(p->config_flags)) { |
| fatal("Invalid GRES record for %s, count does not match File value", |
| p->name); |
| } |
| if (tmp_uint64 >= NO_VAL64) { |
| fatal("GRES %s has invalid count value %"PRIu64, |
| p->name, tmp_uint64); |
| } |
| p->count = tmp_uint64; |
| xfree(tmp_str); |
| } else if (p->count == 0) |
| p->count = 1; |
| |
| s_p_hashtbl_destroy(tbl); |
| |
| for (i = 0; i < gres_context_cnt; i++) { |
| if (xstrcasecmp(p->name, gres_context[i].gres_name) == 0) |
| break; |
| } |
| if (i >= gres_context_cnt) { |
| error("Ignoring gres.conf record, invalid name: %s", p->name); |
| destroy_gres_slurmd_conf(p); |
| return 0; |
| } |
| p->plugin_id = gres_context[i].plugin_id; |
| *dest = (void *)p; |
| return 1; |
| } |
| static int _parse_gres_config_node(void **dest, slurm_parser_enum_t type, |
| const char *key, const char *value, |
| const char *line, char **leftover) |
| { |
| s_p_hashtbl_t *tbl; |
| |
| if (gres_node_name && value) { |
| bool match = false; |
| hostlist_t *hl; |
| hl = hostlist_create(value); |
| if (hl) { |
| match = (hostlist_find(hl, gres_node_name) >= 0); |
| hostlist_destroy(hl); |
| } |
| if (!match) { |
| debug("skipping GRES for NodeName=%s %s", value, line); |
| tbl = s_p_hashtbl_create(_gres_options); |
| s_p_parse_line(tbl, *leftover, leftover); |
| s_p_hashtbl_destroy(tbl); |
| return 0; |
| } |
| } |
| return _parse_gres_config(dest, type, key, NULL, line, leftover); |
| } |
| |
| static int _foreach_slurm_conf(void *x, void *arg) |
| { |
| gres_state_t *gres_state_node = (gres_state_t *)x; |
| slurm_gres_context_t *gres_ctx = (slurm_gres_context_t *)arg; |
| gres_node_state_t *gres_ns; |
| uint64_t tmp_count = 0; |
| |
| /* Only look at the GRES under the current plugin (same name) */ |
| if (gres_state_node->plugin_id != gres_ctx->plugin_id) |
| return 0; |
| |
| gres_ns = (gres_node_state_t *)gres_state_node->gres_data; |
| |
| /* |
| * gres_cnt_config should equal the combined count from |
| * type_cnt_avail if there are no untyped GRES |
| */ |
| for (uint16_t i = 0; i < gres_ns->type_cnt; i++) |
| tmp_count += gres_ns->type_cnt_avail[i]; |
| |
| /* Forbid mixing typed and untyped GRES under the same name */ |
| if (gres_ns->type_cnt && |
| gres_ns->gres_cnt_config > tmp_count) |
| fatal("%s: Some %s GRES in slurm.conf have a type while others do not (gres_ns->gres_cnt_config (%"PRIu64") > tmp_count (%"PRIu64"))", |
| __func__, gres_ctx->gres_name, |
| gres_ns->gres_cnt_config, tmp_count); |
| return 1; |
| } |
| |
| static void _validate_slurm_conf(list_t *slurm_conf_list, |
| slurm_gres_context_t *gres_ctx) |
| { |
| if (!slurm_conf_list) |
| return; |
| |
| (void)list_for_each_nobreak(slurm_conf_list, _foreach_slurm_conf, |
| gres_ctx); |
| } |
| |
| static int _foreach_gres_conf(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = (gres_slurmd_conf_t *)x; |
| foreach_gres_conf_t *foreach_gres_conf = (foreach_gres_conf_t *)arg; |
| slurm_gres_context_t *gres_ctx = foreach_gres_conf->gres_ctx; |
| bool orig_has_file, orig_has_type; |
| |
| /* Only look at the GRES under the current plugin (same name) */ |
| if (gres_slurmd_conf->plugin_id != gres_ctx->plugin_id) |
| return 0; |
| |
| /* |
| * If any plugin of this type has this set it will virally set |
| * any other to be the same as we use the gres_ctx from here |
| * on out. |
| */ |
| if (gres_slurmd_conf->config_flags & GRES_CONF_EXPLICIT) |
| gres_ctx->config_flags |= GRES_CONF_EXPLICIT; |
| |
| if (gres_slurmd_conf->config_flags & GRES_CONF_COUNT_ONLY) |
| gres_ctx->config_flags |= GRES_CONF_COUNT_ONLY; |
| |
| if (gres_slurmd_conf->config_flags & GRES_CONF_HAS_FILE) |
| gres_ctx->config_flags |= GRES_CONF_HAS_FILE; |
| |
| if (gres_slurmd_conf->config_flags & GRES_CONF_ONE_SHARING) |
| gres_ctx->config_flags |= GRES_CONF_ONE_SHARING; |
| /* |
| * Since there could be multiple types of the same plugin we |
| * need to only make sure we load it once. |
| */ |
| if (!(gres_ctx->config_flags & GRES_CONF_LOADED)) { |
| /* |
| * Ignore return code, as we will still support the gres |
| * with or without the plugin. |
| */ |
| if (_load_plugin(gres_ctx) == SLURM_SUCCESS) |
| gres_ctx->config_flags |= GRES_CONF_LOADED; |
| } |
| |
| foreach_gres_conf->rec_count++; |
| orig_has_file = gres_slurmd_conf->config_flags & GRES_CONF_HAS_FILE; |
| if (foreach_gres_conf->new_has_file == -1) { |
| if (gres_slurmd_conf->config_flags & GRES_CONF_HAS_FILE) |
| foreach_gres_conf->new_has_file = 1; |
| else |
| foreach_gres_conf->new_has_file = 0; |
| } else if ((foreach_gres_conf->new_has_file && !orig_has_file) || |
| (!foreach_gres_conf->new_has_file && orig_has_file)) { |
| fatal("gres.conf for %s, some records have \"File\" specification while others do not", |
| gres_ctx->gres_name); |
| } |
| orig_has_type = gres_slurmd_conf->config_flags & |
| GRES_CONF_HAS_TYPE; |
| if (foreach_gres_conf->new_has_type == -1) { |
| if (gres_slurmd_conf->config_flags & |
| GRES_CONF_HAS_TYPE) { |
| foreach_gres_conf->new_has_type = 1; |
| } else |
| foreach_gres_conf->new_has_type = 0; |
| } else if ((foreach_gres_conf->new_has_type && !orig_has_type) || |
| (!foreach_gres_conf->new_has_type && orig_has_type)) { |
| fatal("gres.conf for %s, some records have \"Type=\" specification while others do not", |
| gres_ctx->gres_name); |
| } |
| |
| if (!foreach_gres_conf->new_has_file && |
| !foreach_gres_conf->new_has_type && |
| (foreach_gres_conf->rec_count > 1)) { |
| fatal("gres.conf duplicate records for %s", |
| gres_ctx->gres_name); |
| } |
| |
| if (foreach_gres_conf->new_has_file) |
| gres_ctx->config_flags |= GRES_CONF_HAS_FILE; |
| |
| return 0; |
| } |
| |
| static void _validate_gres_conf(list_t *gres_conf_list, |
| slurm_gres_context_t *gres_ctx) |
| { |
| foreach_gres_conf_t gres_conf = { |
| .gres_ctx = gres_ctx, |
| .new_has_file = -1, |
| .new_has_type = -1, |
| .rec_count = 0, |
| }; |
| |
| (void)list_for_each_nobreak(gres_conf_list, _foreach_gres_conf, |
| &gres_conf); |
| |
| if (!(gres_ctx->config_flags & GRES_CONF_LOADED)) { |
| /* |
| * This means there was no gres.conf line for this gres found. |
| * We still need to try to load it for AutoDetect's sake. |
| * If we fail loading we will treat it as a count |
| * only GRES since the stepd will try to load it elsewise. |
| */ |
| if (_load_plugin(gres_ctx) != SLURM_SUCCESS) |
| gres_ctx->config_flags |= GRES_CONF_COUNT_ONLY; |
| } else |
| /* Remove as this is only really used locally */ |
| gres_ctx->config_flags &= (~GRES_CONF_LOADED); |
| } |
| |
| /* |
| * Keep track of which gres.conf lines have a count greater than expected |
| * according to the current slurm.conf GRES. Modify the count of |
| * gres_slurmd_conf to keep track of this. Any gres.conf records |
| * with a count > 0 means that slurm.conf did not account for it completely. |
| * |
| * gres_slurmd_conf - (in/out) pointer to conf we are looking at. |
| * This should be a temporary copy that we can modify. |
| * conf_cnt->count - (in) The count of the current slurm.conf GRES record. |
| * conf_cnt->type_name - (in) The type of the current slurm.conf GRES record. |
| */ |
| static int _foreach_compare_conf_counts(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| conf_cnt_t *conf_cnt = arg; |
| |
| /* Note: plugin type filter already applied */ |
| /* Check that type is the same */ |
| if (gres_slurmd_conf->type_name && |
| xstrcasecmp(gres_slurmd_conf->type_name, conf_cnt->type_name)) |
| return 0; |
| /* Keep track of counts */ |
| if (gres_slurmd_conf->count > conf_cnt->count) { |
| gres_slurmd_conf->count -= conf_cnt->count; |
| /* This slurm.conf GRES specification is now used up */ |
| return -1; |
| } else { |
| conf_cnt->count -= gres_slurmd_conf->count; |
| gres_slurmd_conf->count = 0; |
| } |
| return 0; |
| } |
| |
| static int _lite_copy_gres_slurmd_conf(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| check_conf_t *check_conf = arg; |
| gres_slurmd_conf_t *gres_slurmd_conf_tmp; |
| |
| if (gres_slurmd_conf->plugin_id != check_conf->gres_ctx->plugin_id) |
| return 0; |
| |
| gres_slurmd_conf_tmp = xmalloc(sizeof(*gres_slurmd_conf_tmp)); |
| gres_slurmd_conf_tmp->name = xstrdup(gres_slurmd_conf->name); |
| gres_slurmd_conf_tmp->type_name = xstrdup(gres_slurmd_conf->type_name); |
| gres_slurmd_conf_tmp->count = gres_slurmd_conf->count; |
| list_append(check_conf->gres_conf_list, gres_slurmd_conf_tmp); |
| |
| return 0; |
| } |
| |
| static int _foreach_slurm_conf_mismatch_comp(void *x, void *arg) |
| { |
| gres_state_t *gres_state_node = x; |
| check_conf_t *check_conf = arg; |
| gres_node_state_t *gres_ns; |
| conf_cnt_t conf_cnt = { 0 }; |
| |
| if (gres_state_node->plugin_id != check_conf->gres_ctx->plugin_id) |
| return 0; |
| |
| /* Determine if typed or untyped, and act accordingly */ |
| gres_ns = gres_state_node->gres_data; |
| if (!gres_ns->type_name) { |
| conf_cnt.count = gres_ns->gres_cnt_config; |
| conf_cnt.type_name = NULL; |
| (void) list_for_each(check_conf->gres_conf_list, |
| _foreach_compare_conf_counts, |
| &conf_cnt); |
| return 0; |
| } |
| |
| for (int i = 0; i < gres_ns->type_cnt; ++i) { |
| conf_cnt.count = gres_ns->type_cnt_avail[i]; |
| conf_cnt.type_name = gres_ns->type_name[i]; |
| (void) list_for_each(check_conf->gres_conf_list, |
| _foreach_compare_conf_counts, |
| &conf_cnt); |
| } |
| |
| return 0; |
| } |
| |
| int _print_slurm_conf_mismatch(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| |
| if (gres_slurmd_conf->count > 0) |
| warning("A line in gres.conf for GRES %s%s%s has %"PRIu64" more configured than expected in slurm.conf. Ignoring extra GRES.", |
| gres_slurmd_conf->name, |
| (gres_slurmd_conf->type_name) ? ":" : "", |
| (gres_slurmd_conf->type_name) ? |
| gres_slurmd_conf->type_name : "", |
| gres_slurmd_conf->count); |
| return 0; |
| } |
| |
| /* |
| * Loop through each entry in gres.conf and see if there is a corresponding |
| * entry in slurm.conf. If so, see if the counts line up. If there are more |
| * devices specified in gres.conf than in slurm.conf, emit errors. |
| * |
| * slurm_conf_list - (in) The slurm.conf GRES list. |
| * gres_conf_list - (in) The gres.conf GRES list. |
| * gres_ctx - (in) Which GRES plugin we are currently working in. |
| */ |
| static void _check_conf_mismatch(list_t *slurm_conf_list, list_t *gres_conf_list, |
| slurm_gres_context_t *gres_ctx) |
| { |
| check_conf_t check_conf = { |
| .gres_ctx = gres_ctx, |
| }; |
| |
| /* E.g. slurm_conf_list will be NULL in the case of --gpu-bind */ |
| if (!slurm_conf_list || !gres_conf_list) |
| return; |
| |
| /* |
| * Duplicate the gres.conf list with records relevant to this GRES |
| * plugin only so we can mangle records. Only add records under the |
| * current plugin. |
| */ |
| check_conf.gres_conf_list = list_create(destroy_gres_slurmd_conf); |
| (void) list_for_each(gres_conf_list, |
| _lite_copy_gres_slurmd_conf, |
| &check_conf); |
| |
| /* |
| * Loop through the slurm.conf list and see if there are more gres.conf |
| * GRES than expected. |
| */ |
| (void) list_for_each(slurm_conf_list, |
| _foreach_slurm_conf_mismatch_comp, |
| &check_conf); |
| |
| /* |
| * Loop through gres_conf_list_tmp to print errors for gres.conf |
| * records that were not completely accounted for in slurm.conf. |
| */ |
| (void) list_for_each(check_conf.gres_conf_list, |
| _print_slurm_conf_mismatch, |
| NULL); |
| |
| FREE_NULL_LIST(check_conf.gres_conf_list); |
| } |
| |
| /* |
| * Match the type of a GRES from slurm.conf to a GRES in the gres.conf list. If |
| * a match is found, pop it off the gres.conf list and return it. |
| * |
| * gres_context - (in) Which GRES plugin we are currently working in. |
| * type_name - (in) The type of the slurm.conf GRES record. If null, then |
| * it's an untyped GRES. |
| * |
| * Returns the first gres.conf record from gres_conf_list with the same type |
| * name as the slurm.conf record. |
| */ |
| static int _match_type(void *x, void *key) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| conf_cnt_t *conf_cnt = key; |
| |
| if (gres_slurmd_conf->plugin_id != conf_cnt->gres_ctx->plugin_id) |
| return 0; |
| |
| /* |
| * If type_name is NULL we will take the first matching |
| * gres_slurmd_conf that we find. This means we also will |
| * remove the type from the gres_slurmd_conf to match 18.08 |
| * stylings. |
| */ |
| if (!conf_cnt->type_name) { |
| xfree(gres_slurmd_conf->type_name); |
| gres_slurmd_conf->config_flags &= ~GRES_CONF_HAS_TYPE; |
| } else if (xstrcasecmp(gres_slurmd_conf->type_name, |
| conf_cnt->type_name)) |
| return 0; |
| |
| return 1; |
| } |
| |
| /* |
| * Add a GRES conf record with count == 0 to gres_list. |
| * |
| * new_list - (in/out) The gres list to add to. |
| * gres_ctx - (in) The GRES plugin to add a GRES record for. |
| * count - (in) The cpu count configured for the node. |
| */ |
| static void _add_gres_config_empty(merge_gres_t *merge_gres) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = |
| xmalloc(sizeof(*gres_slurmd_conf)); |
| gres_slurmd_conf->cpu_cnt = merge_gres->cpu_cnt; |
| gres_slurmd_conf->name = xstrdup(merge_gres->gres_ctx->gres_name); |
| gres_slurmd_conf->plugin_id = merge_gres->gres_ctx->plugin_id; |
| list_append(merge_gres->new_list, gres_slurmd_conf); |
| } |
| |
| /* |
| * Truncate the File hostrange string of a GRES record to be at most |
| * new_count entries. The extra entries will be removed. |
| * |
| * gres_slurmd_conf - (in/out) The GRES record to modify. |
| * count - (in) The new number of entries in File |
| */ |
| static void _set_file_subset(gres_slurmd_conf_t *gres_slurmd_conf, |
| uint64_t new_count) |
| { |
| /* Convert file to hostrange */ |
| hostlist_t *hl = hostlist_create(gres_slurmd_conf->file); |
| unsigned long old_count = hostlist_count(hl); |
| |
| if (new_count >= old_count) { |
| hostlist_destroy(hl); |
| /* Nothing to do */ |
| return; |
| } |
| |
| /* Remove all but the first entries */ |
| for (int i = old_count; i > new_count; --i) { |
| free(hostlist_pop(hl)); |
| } |
| |
| debug3("%s: Truncating %s:%s File from (%ld) %s", __func__, |
| gres_slurmd_conf->name, gres_slurmd_conf->type_name, old_count, |
| gres_slurmd_conf->file); |
| |
| /* Set file to the new subset */ |
| xfree(gres_slurmd_conf->file); |
| gres_slurmd_conf->file = hostlist_ranged_string_xmalloc(hl); |
| |
| debug3("%s: to (%"PRIu64") %s", __func__, new_count, |
| gres_slurmd_conf->file); |
| hostlist_destroy(hl); |
| } |
| |
| /* |
| * A continuation of _merge_gres() depending on if the slurm.conf GRES is typed |
| * or not. |
| * |
| * gres_conf_list - (in) The gres.conf list. |
| * new_list - (out) The new merged [slurm|gres].conf list. |
| * count - (in) The count of the slurm.conf GRES record. |
| * type_name - (in) The type of the slurm.conf GRES record, if it exists. |
| * gres_context - (in) Which GRES plugin we are working in. |
| * cpu_cnt - (in) A count of CPUs on the node. |
| */ |
| static void _merge_gres2(merge_gres_t *merge_gres, |
| uint64_t count, char *type_name) |
| { |
| gres_slurmd_conf_t *match; |
| gres_slurmd_conf_t gres_slurmd_conf = { |
| .cpu_cnt = merge_gres->cpu_cnt, |
| .name = merge_gres->gres_ctx->gres_name, |
| .type_name = type_name, |
| }; |
| conf_cnt_t conf_cnt = { |
| .count = count, |
| .gres_ctx = merge_gres->gres_ctx, |
| .type_name = type_name, |
| }; |
| |
| /* If slurm.conf count is initially 0, don't waste time on it */ |
| if (count == 0) |
| return; |
| |
| /* |
| * There can be multiple gres.conf GRES lines contained within a |
| * single slurm.conf GRES line, due to different values of Cores |
| * and Links. Append them to the list where possible. |
| */ |
| while ((match = list_remove_first( |
| merge_gres->gres_conf_list, _match_type, &conf_cnt))) { |
| list_append(merge_gres->new_list, match); |
| |
| debug3("%s: From gres.conf, using %s:%s:%"PRIu64":%s", __func__, |
| match->name, match->type_name, match->count, |
| match->file); |
| |
| /* |
| * See if we need to merge with any more gres.conf records. |
| * NOTE: _set_file_subset() won't run on a MultipleFiles GRES, |
| * since match->count will always be 1 and count is always >= 1 |
| */ |
| if (match->count > count) { |
| /* |
| * Truncate excess count of gres.conf to match total |
| * count of slurm.conf. |
| */ |
| match->count = count; |
| /* |
| * Truncate excess file of gres.conf to match total |
| * count of slurm.conf. |
| */ |
| if (match->file) |
| _set_file_subset(match, count); |
| /* Floor to 0 to break out of loop. */ |
| count = 0; |
| } else |
| /* |
| * Subtract this gres.conf line count from the |
| * slurm.conf total. |
| */ |
| count -= match->count; |
| |
| /* |
| * All devices outlined by this slurm.conf record have now been |
| * merged with gres.conf records and added to new_list, so exit. |
| */ |
| if (count == 0) |
| break; |
| } |
| |
| if (count == 0) |
| return; |
| |
| /* |
| * There are leftover GRES specified in this slurm.conf record that are |
| * not accounted for in gres.conf that still need to be added. |
| */ |
| |
| /* Set default env flags, and allow AutoDetect to override */ |
| if (!xstrcasecmp(merge_gres->gres_ctx->gres_name, "gpu")) |
| gres_slurmd_conf.config_flags |= |
| (GRES_CONF_ENV_SET | GRES_CONF_ENV_DEF); |
| if (merge_gres->gres_ctx->config_flags & GRES_CONF_COUNT_ONLY) |
| gres_slurmd_conf.config_flags |= GRES_CONF_COUNT_ONLY; |
| |
| gres_slurmd_conf.count = count; |
| |
| add_gres_to_list(merge_gres->new_list, &gres_slurmd_conf); |
| } |
| |
| /* |
| * Merge a single slurm.conf GRES specification with any relevant gres.conf |
| * records and append the result to new_list. |
| * |
| * gres_conf_list - (in) The gres.conf list. |
| * new_list - (out) The new merged [slurm|gres].conf list. |
| * ptr - (in) A slurm.conf GRES record. |
| * gres_ctx - (in) Which GRES plugin we are working in. |
| * cpu_cnt - (in) A count of CPUs on the node. |
| */ |
| static int _merge_gres(void *x, void *arg) |
| { |
| gres_state_t *gres_state_node = x; |
| merge_gres_t *merge_gres = arg; |
| gres_node_state_t *gres_ns; |
| |
| if (gres_state_node->plugin_id != merge_gres->gres_ctx->plugin_id) |
| return 0; |
| |
| gres_ns = gres_state_node->gres_data; |
| /* If this GRES has no types, merge in the single untyped GRES */ |
| if (gres_ns->type_cnt == 0) { |
| _merge_gres2(merge_gres, |
| gres_ns->gres_cnt_config, NULL); |
| return 0; |
| } |
| |
| /* If this GRES has types, merge in each typed GRES */ |
| for (int i = 0; i < gres_ns->type_cnt; i++) { |
| _merge_gres2(merge_gres, |
| gres_ns->type_cnt_avail[i], |
| gres_ns->type_name[i]); |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Merge slurm.conf and gres.conf GRES configuration. |
| * gres.conf can only work within what is outlined in slurm.conf. Every |
| * gres.conf device that does not match up to a device in slurm.conf is |
| * discarded with an error. If no gres conf found for what is specified in |
| * slurm.conf, create a zero-count conf record. |
| * |
| * node_conf - (in) node configuration info (cpu count). |
| * gres_conf_list - (in/out) GRES data from gres.conf. This becomes the new |
| * merged slurm.conf/gres.conf list. |
| * slurm_conf_list - (in) GRES data from slurm.conf. |
| */ |
| static void _merge_config(node_config_load_t *node_conf, list_t *gres_conf_list, |
| list_t *slurm_conf_list) |
| { |
| merge_gres_t merge_gres = { |
| .cpu_cnt = node_conf->cpu_cnt, |
| .gres_conf_list = gres_conf_list, |
| .new_list = list_create(destroy_gres_slurmd_conf), |
| }; |
| |
| for (int i = 0; i < gres_context_cnt; i++) { |
| merge_gres.gres_ctx = &gres_context[i]; |
| |
| /* Copy GRES configuration from slurm.conf */ |
| if (slurm_conf_list) { |
| if (list_for_each(slurm_conf_list, |
| _merge_gres, |
| &merge_gres) > 0) |
| continue; |
| } |
| |
| /* Add GRES record with zero count */ |
| _add_gres_config_empty(&merge_gres); |
| } |
| /* Set gres_conf_list to be the new merged list */ |
| list_flush(gres_conf_list); |
| list_transfer(gres_conf_list, merge_gres.new_list); |
| FREE_NULL_LIST(merge_gres.new_list); |
| } |
| |
| static void _pack_gres_context(slurm_gres_context_t *gres_ctx, buf_t *buffer) |
| { |
| /* gres_ctx->cur_plugin: DON'T PACK will be filled in on the other |
| * side */ |
| pack32(gres_ctx->config_flags, buffer); |
| packstr(gres_ctx->gres_name, buffer); |
| packstr(gres_ctx->gres_name_colon, buffer); |
| pack32((uint32_t)gres_ctx->gres_name_colon_len, buffer); |
| packstr(gres_ctx->gres_type, buffer); |
| gres_send_stepd(buffer, gres_ctx->np_gres_devices); |
| /* gres_ctx->ops: DON'T PACK will be filled in on the other side */ |
| pack32(gres_ctx->plugin_id, buffer); |
| /* gres_ctx->plugin_list: DON'T PACK will be filled in on the other |
| * side */ |
| pack64(gres_ctx->total_cnt, buffer); |
| } |
| |
| static int _unpack_gres_context(slurm_gres_context_t *gres_ctx, buf_t *buffer) |
| { |
| uint32_t uint32_tmp; |
| |
| /* gres_ctx->cur_plugin: filled in later with _load_plugin() */ |
| safe_unpack32(&gres_ctx->config_flags, buffer); |
| safe_unpackstr(&gres_ctx->gres_name, buffer); |
| safe_unpackstr(&gres_ctx->gres_name_colon, buffer); |
| safe_unpack32(&uint32_tmp, buffer); |
| gres_ctx->gres_name_colon_len = (int)uint32_tmp; |
| safe_unpackstr(&gres_ctx->gres_type, buffer); |
| gres_recv_stepd(buffer, &gres_ctx->np_gres_devices); |
| /* gres_ctx->ops: filled in later with _load_plugin() */ |
| safe_unpack32(&gres_ctx->plugin_id, buffer); |
| /* gres_ctx->plugin_list: filled in later with _load_plugin() */ |
| safe_unpack64(&gres_ctx->total_cnt, buffer); |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| error("%s: unpack_error", __func__); |
| return SLURM_ERROR; |
| } |
| |
| static void _pack_gres_slurmd_conf(void *in, uint16_t protocol_version, |
| buf_t *buffer) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = (gres_slurmd_conf_t *)in; |
| |
| /* |
| * Ignore protocol_version at the time of writing this only deals with |
| * communication from the slurmd to a new stepd which should always be |
| * the same version. This function is called from slurm_pack_list which |
| * requires protocol_version. |
| */ |
| |
| /* Pack gres_slurmd_conf_t */ |
| pack32(gres_slurmd_conf->config_flags, buffer); |
| pack64(gres_slurmd_conf->count, buffer); |
| pack32(gres_slurmd_conf->cpu_cnt, buffer); |
| packstr(gres_slurmd_conf->cpus, buffer); |
| pack_bit_str_hex(gres_slurmd_conf->cpus_bitmap, buffer); |
| packstr(gres_slurmd_conf->file, buffer); |
| packstr(gres_slurmd_conf->links, buffer); |
| packstr(gres_slurmd_conf->name, buffer); |
| packstr(gres_slurmd_conf->type_name, buffer); |
| packstr(gres_slurmd_conf->unique_id, buffer); |
| pack32(gres_slurmd_conf->plugin_id, buffer); |
| } |
| |
| static int _unpack_gres_slurmd_conf(void **object, uint16_t protocol_version, |
| buf_t *buffer) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = |
| xmalloc(sizeof(*gres_slurmd_conf)); |
| |
| /* |
| * Ignore protocol_version at the time of writing this only deals with |
| * communication from the slurmd to a new stepd which should always be |
| * the same version. This function is called from slurm_unpack_list |
| * which requires protocol_version. |
| */ |
| |
| /* Unpack gres_slurmd_conf_t */ |
| safe_unpack32(&gres_slurmd_conf->config_flags, buffer); |
| safe_unpack64(&gres_slurmd_conf->count, buffer); |
| safe_unpack32(&gres_slurmd_conf->cpu_cnt, buffer); |
| safe_unpackstr(&gres_slurmd_conf->cpus, buffer); |
| unpack_bit_str_hex(&gres_slurmd_conf->cpus_bitmap, buffer); |
| safe_unpackstr(&gres_slurmd_conf->file, buffer); |
| safe_unpackstr(&gres_slurmd_conf->links, buffer); |
| safe_unpackstr(&gres_slurmd_conf->name, buffer); |
| safe_unpackstr(&gres_slurmd_conf->type_name, buffer); |
| safe_unpackstr(&gres_slurmd_conf->unique_id, buffer); |
| safe_unpack32(&gres_slurmd_conf->plugin_id, buffer); |
| |
| *object = gres_slurmd_conf; |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| destroy_gres_slurmd_conf(gres_slurmd_conf); |
| *object = NULL; |
| return SLURM_ERROR; |
| } |
| |
| /* gres_context_lock should be locked before this */ |
| static void _pack_context_buf(void) |
| { |
| FREE_NULL_BUFFER(gres_context_buf); |
| |
| gres_context_buf = init_buf(0); |
| pack32(gres_context_cnt, gres_context_buf); |
| if (gres_context_cnt <= 0) { |
| debug3("%s: No GRES context count sent to stepd", __func__); |
| return; |
| } |
| |
| for (int i = 0; i < gres_context_cnt; i++) { |
| slurm_gres_context_t *gres_ctx = &gres_context[i]; |
| _pack_gres_context(gres_ctx, gres_context_buf); |
| if (gres_ctx->ops.send_stepd) |
| (*(gres_ctx->ops.send_stepd))(gres_context_buf); |
| } |
| } |
| |
| static int _unpack_context_buf(buf_t *buffer) |
| { |
| uint32_t cnt; |
| safe_unpack32(&cnt, buffer); |
| |
| gres_context_cnt = cnt; |
| |
| if (!gres_context_cnt) |
| return SLURM_SUCCESS; |
| |
| xrecalloc(gres_context, gres_context_cnt, sizeof(slurm_gres_context_t)); |
| for (int i = 0; i < gres_context_cnt; i++) { |
| slurm_gres_context_t *gres_ctx = &gres_context[i]; |
| if (_unpack_gres_context(gres_ctx, buffer) != SLURM_SUCCESS) |
| goto unpack_error; |
| (void)_load_plugin(gres_ctx); |
| if (gres_ctx->ops.recv_stepd) |
| (*(gres_ctx->ops.recv_stepd))(buffer); |
| } |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| error("%s: failed", __func__); |
| return SLURM_ERROR; |
| } |
| |
| /* gres_context_lock should be locked before this */ |
| static void _pack_gres_conf(void) |
| { |
| int len = 0; |
| FREE_NULL_BUFFER(gres_conf_buf); |
| |
| gres_conf_buf = init_buf(0); |
| pack32(autodetect_flags, gres_conf_buf); |
| |
| /* If there is no list to send, let the stepd know */ |
| if (!gres_conf_list || !(len = list_count(gres_conf_list))) { |
| pack32(len, gres_conf_buf); |
| return; |
| } |
| pack32(len, gres_conf_buf); |
| |
| if (slurm_pack_list(gres_conf_list, _pack_gres_slurmd_conf, |
| gres_conf_buf, SLURM_PROTOCOL_VERSION) |
| != SLURM_SUCCESS) { |
| error("%s: Failed to pack gres_conf_list", __func__); |
| return; |
| } |
| } |
| |
| static int _unpack_gres_conf(buf_t *buffer) |
| { |
| uint32_t cnt; |
| safe_unpack32(&cnt, buffer); |
| autodetect_flags = cnt; |
| safe_unpack32(&cnt, buffer); |
| |
| if (!cnt) |
| return SLURM_SUCCESS; |
| |
| if (slurm_unpack_list(&gres_conf_list, _unpack_gres_slurmd_conf, |
| destroy_gres_slurmd_conf, buffer, |
| SLURM_PROTOCOL_VERSION) != SLURM_SUCCESS) |
| goto unpack_error; |
| |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| error("%s: failed", __func__); |
| return SLURM_ERROR; |
| } |
| |
/*
 * List destructor for the names_list built by gres_node_config_load().
 * Entries are released with plain free() rather than xfree().
 */
static void _free_name_list(void *x)
{
	char *name = x;

	free(name);
}
| |
| /* Fills major and minor information for a gres_device_t dev */ |
| static int _set_gres_device_desc(gres_device_t *dev) |
| { |
| struct stat fs; |
| |
| dev->dev_desc.type = DEV_TYPE_NONE; |
| dev->dev_desc.major = NO_VAL; |
| dev->dev_desc.minor = NO_VAL; |
| |
| if (stat(dev->path, &fs) < 0) { |
| error("%s: stat(%s): %m", __func__, dev->path); |
| return SLURM_ERROR; |
| } |
| |
| dev->dev_desc.major = major(fs.st_rdev); |
| dev->dev_desc.minor = minor(fs.st_rdev); |
| log_flag(GRES, "%s : %s major %d, minor %d", __func__, dev->path, |
| dev->dev_desc.major, dev->dev_desc.minor); |
| |
| if (S_ISBLK(fs.st_mode)) |
| dev->dev_desc.type = DEV_TYPE_BLOCK; |
| else if (S_ISCHR(fs.st_mode)) |
| dev->dev_desc.type = DEV_TYPE_CHAR; |
| else { |
| error("%s is not a valid character or block device, fix your gres.conf", |
| dev->path); |
| return SLURM_ERROR; |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| |
| /* |
| * Creates and initializes a gres_device_t from a path, an bitmap index and a |
| * unique_id. At failure return NULL. |
| */ |
| static gres_device_t *_init_gres_device(int index, char *one_name, |
| char *unique_id) |
| { |
| int tmp, digit = -1; |
| gres_device_t *gres_device = xmalloc(sizeof(gres_device_t)); |
| |
| gres_device->dev_num = -1; |
| gres_device->index = index; |
| gres_device->path = xstrdup(one_name); |
| gres_device->unique_id = xstrdup(unique_id); |
| |
| if (_set_gres_device_desc(gres_device) != SLURM_SUCCESS) { |
| xfree(gres_device); |
| return NULL; |
| } |
| |
| tmp = strlen(one_name); |
| for (int i = 1; i <= tmp; i++) { |
| if (isdigit(one_name[tmp - i])) { |
| digit = tmp - i; |
| continue; |
| } |
| break; |
| } |
| if (digit >= 0) |
| gres_device->dev_num = atoi(one_name + digit); |
| else |
| gres_device->dev_num = -1; |
| |
| return gres_device; |
| } |
| |
/*
 * Load the specific GRES plugins here. For now that is only the gpu plugin,
 * whose init result is handed straight back to the caller.
 */
static int _load_specific_gres_plugins(void)
{
	return gpu_plugin_init();
}
| |
| static int _foreach_fill_in_gres_devices(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| foreach_fill_in_gres_devices_t *fill_in_gres_devices = arg; |
| node_config_load_t *config = fill_in_gres_devices->config; |
| hostlist_t *hl; |
| char *one_name; |
| |
| if (!(gres_slurmd_conf->config_flags & GRES_CONF_HAS_FILE) || |
| !gres_slurmd_conf->file || |
| xstrcmp(gres_slurmd_conf->name, config->gres_name)) |
| return 0; |
| |
| if (!(hl = hostlist_create(gres_slurmd_conf->file))) { |
| error("can't parse gres.conf file record (%s)", |
| gres_slurmd_conf->file); |
| return 0; |
| } |
| |
| while ((one_name = hostlist_shift(hl))) { |
| /* We don't care about gres_devices in slurmctld */ |
| if (config->in_slurmd) { |
| gres_device_t *gres_device; |
| if (!*fill_in_gres_devices->gres_devices) |
| *fill_in_gres_devices->gres_devices = |
| list_create(destroy_gres_device); |
| |
| if (!(gres_device = _init_gres_device( |
| fill_in_gres_devices->index, one_name, |
| gres_slurmd_conf->unique_id))) { |
| free(one_name); |
| continue; |
| } |
| |
| if (gres_device->dev_num > |
| fill_in_gres_devices->max_dev_num) |
| fill_in_gres_devices->max_dev_num = |
| gres_device->dev_num; |
| |
| list_append(*fill_in_gres_devices->gres_devices, |
| gres_device); |
| } |
| |
| /* |
| * Don't check for file duplicates or increment the |
| * device bitmap index if this is a MultipleFiles GRES |
| */ |
| if (gres_slurmd_conf->config_flags & GRES_CONF_HAS_MULT) { |
| free(one_name); |
| continue; |
| } |
| |
| if ((fill_in_gres_devices->rc == SLURM_SUCCESS) && |
| list_find_first(fill_in_gres_devices->names_list, |
| slurm_find_char_exact_in_list, |
| one_name)) { |
| error("%s duplicate device file name (%s)", |
| config->gres_name, one_name); |
| fill_in_gres_devices->rc = SLURM_ERROR; |
| } |
| |
| list_append(fill_in_gres_devices->names_list, one_name); |
| |
| /* Increment device bitmap index */ |
| fill_in_gres_devices->index++; |
| } |
| hostlist_destroy(hl); |
| |
| if (gres_slurmd_conf->config_flags & GRES_CONF_HAS_MULT) |
| fill_in_gres_devices->index++; |
| |
| return 0; |
| } |
| |
| static int _foreach_fill_in_gres_devices_dev_id(void *x, void *arg) |
| { |
| gres_device_t *gres_device = x; |
| foreach_fill_in_gres_devices_t *fill_in_gres_devices = arg; |
| |
| if (gres_device->dev_num == -1) |
| gres_device->dev_num = ++fill_in_gres_devices->max_dev_num; |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_GRES) { |
| char *dev_id_str = gres_device_id2str(&gres_device->dev_desc); |
| log_flag(GRES, "%s device number %d(%s):%s", |
| fill_in_gres_devices->config->gres_name, |
| gres_device->dev_num, |
| gres_device->path, |
| dev_id_str); |
| xfree(dev_id_str); |
| } |
| |
| return 0; |
| } |
| |
| extern int gres_node_config_load(list_t *gres_conf_list, |
| node_config_load_t *config, |
| list_t **gres_devices) |
| { |
| foreach_fill_in_gres_devices_t fill_in_gres_devices = { |
| .config = config, |
| .gres_devices = gres_devices, |
| .index = 0, |
| .max_dev_num = -1, |
| .names_list = list_create(_free_name_list), |
| .rc = SLURM_SUCCESS, |
| }; |
| xassert(gres_conf_list); |
| xassert(gres_devices); |
| |
| (void) list_for_each(gres_conf_list, _foreach_fill_in_gres_devices, |
| &fill_in_gres_devices); |
| FREE_NULL_LIST(fill_in_gres_devices.names_list); |
| |
| if (*gres_devices) |
| (void) list_for_each(*gres_devices, |
| _foreach_fill_in_gres_devices_dev_id, |
| &fill_in_gres_devices); |
| |
| return fill_in_gres_devices.rc; |
| } |
| |
| /* |
| * Load this node's configuration (how many resources it has, topology, etc.) |
| * IN cpu_cnt - Number of CPUs configured for node node_name. |
| * IN node_name - Name of the node to load the GRES config for. |
| * IN gres_list - Node's GRES information as loaded from slurm.conf by slurmd |
| * IN xcpuinfo_abs_to_mac - Pointer to xcpuinfo_abs_to_mac() funct. If |
| * specified, Slurm will convert gres_slurmd_conf->cpus_bitmap (a bitmap |
| * derived from gres.conf's "Cores" range string) into machine format |
| * (normal slrumd/stepd operation). If not, it will remain unconverted (for |
| * testing purposes or when unused). |
| * IN xcpuinfo_mac_to_abs - Pointer to xcpuinfo_mac_to_abs() funct. Used to |
| * convert CPU affinities from machine format (as collected from NVML and |
| * others) into abstract format, for sanity checking purposes. |
| * NOTE: Called from slurmd (and from slurmctld for each cloud node) |
| */ |
| extern int gres_g_node_config_load(uint32_t cpu_cnt, char *node_name, |
| list_t *gres_list, |
| void *xcpuinfo_abs_to_mac, |
| void *xcpuinfo_mac_to_abs) |
| { |
| static s_p_options_t _gres_conf_options[] = { |
| {"AutoDetect", S_P_STRING}, |
| {"Name", S_P_ARRAY, _parse_gres_config, NULL}, |
| {"NodeName", S_P_ARRAY, _parse_gres_config_node, NULL}, |
| {NULL} |
| }; |
| list_t *tmp_gres_conf_list = NULL; |
| |
| int count = 0, i, rc, rc2; |
| struct stat config_stat; |
| s_p_hashtbl_t *tbl; |
| gres_slurmd_conf_t **gres_array; |
| char *gres_conf_file = NULL; |
| char *autodetect_string = NULL; |
| bool in_slurmd = running_in_slurmd(); |
| |
| node_config_load_t node_conf = { |
| .cpu_cnt = cpu_cnt, |
| .in_slurmd = in_slurmd, |
| .xcpuinfo_mac_to_abs = xcpuinfo_mac_to_abs |
| }; |
| |
| if (cpu_cnt == 0) { |
| error("%s: Invalid cpu_cnt of 0 for node %s", |
| __func__, node_name); |
| return ESLURM_INVALID_CPU_COUNT; |
| } |
| |
| if (xcpuinfo_abs_to_mac) |
| xcpuinfo_ops.xcpuinfo_abs_to_mac = xcpuinfo_abs_to_mac; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| |
| if (gres_context_cnt == 0) { |
| rc = SLURM_SUCCESS; |
| goto fini; |
| } |
| |
| tmp_gres_conf_list = list_create(destroy_gres_slurmd_conf); |
| gres_conf_file = get_extra_conf_path("gres.conf"); |
| if (stat(gres_conf_file, &config_stat) < 0) { |
| info("Can not stat gres.conf file (%s), using slurm.conf data", |
| gres_conf_file); |
| } else { |
| if (xstrcmp(gres_node_name, node_name)) { |
| xfree(gres_node_name); |
| gres_node_name = xstrdup(node_name); |
| } |
| |
| gres_cpu_cnt = cpu_cnt; |
| tbl = s_p_hashtbl_create(_gres_conf_options); |
| if (s_p_parse_file(tbl, NULL, gres_conf_file, 0, NULL) == |
| SLURM_ERROR) |
| fatal("error opening/reading %s", gres_conf_file); |
| |
| /* Overwrite unspecified local AutoDetect with global default */ |
| if (s_p_get_string(&autodetect_string, "Autodetect", tbl)) { |
| _handle_global_autodetect(autodetect_string); |
| xfree(autodetect_string); |
| } |
| |
| /* AutoDetect cannot run on the slurmctld node */ |
| if (running_in_slurmctld() && |
| autodetect_flags && |
| !((autodetect_flags & GRES_AUTODETECT_GPU_FLAGS) & |
| GRES_AUTODETECT_GPU_OFF)) { |
| rc = ESLURM_UNSUPPORTED_GRES; |
| error("Cannot use AutoDetect on cloud/dynamic node \"%s\"", |
| gres_node_name); |
| s_p_hashtbl_destroy(tbl); |
| goto fini; |
| } |
| |
| if (s_p_get_array((void ***) &gres_array, |
| &count, "Name", tbl)) { |
| for (i = 0; i < count; i++) { |
| list_append(tmp_gres_conf_list, gres_array[i]); |
| gres_array[i] = NULL; |
| } |
| } |
| if (s_p_get_array((void ***) &gres_array, |
| &count, "NodeName", tbl)) { |
| for (i = 0; i < count; i++) { |
| list_append(tmp_gres_conf_list, gres_array[i]); |
| gres_array[i] = NULL; |
| } |
| } |
| s_p_hashtbl_destroy(tbl); |
| } |
| FREE_NULL_LIST(gres_conf_list); |
| gres_conf_list = tmp_gres_conf_list; |
| tmp_gres_conf_list = NULL; |
| |
| /* Validate gres.conf and slurm.conf somewhat before merging */ |
| for (i = 0; i < gres_context_cnt; i++) { |
| _validate_slurm_conf(gres_list, &gres_context[i]); |
| _validate_gres_conf(gres_conf_list, &gres_context[i]); |
| _check_conf_mismatch(gres_list, gres_conf_list, |
| &gres_context[i]); |
| } |
| |
| /* Merge slurm.conf and gres.conf together into gres_conf_list */ |
| _merge_config(&node_conf, gres_conf_list, gres_list); |
| |
| if ((rc = _load_specific_gres_plugins()) != SLURM_SUCCESS) { |
| goto fini; |
| } |
| |
| for (i = 0; i < gres_context_cnt; i++) { |
| node_conf.gres_name = gres_context[i].gres_name; |
| if (gres_context[i].ops.node_config_load) |
| rc2 = (*(gres_context[i].ops.node_config_load))( |
| gres_conf_list, &node_conf); |
| else if (gres_context[i].config_flags & GRES_CONF_HAS_FILE) { |
| rc2 = gres_node_config_load( |
| gres_conf_list, &node_conf, |
| &gres_context[i].np_gres_devices); |
| } else |
| continue; |
| |
| if (rc == SLURM_SUCCESS) |
| rc = rc2; |
| } |
| |
| /* Postprocess gres_conf_list after all plugins' node_config_load */ |
| |
| /* Remove every GPU with an empty File */ |
| (void) list_delete_all(gres_conf_list, _find_fileless_gres, |
| &gpu_plugin_id); |
| |
| list_for_each(gres_conf_list, _log_gres_slurmd_conf, NULL); |
| |
| for (i = 0; i < gres_context_cnt; i++) { |
| list_for_each(gres_conf_list, _post_plugin_gres_conf, |
| &gres_context[i]); |
| } |
| |
| fini: |
| /* |
| * We no longer need the gpu plugin unless this option is set: |
| * AcctGatherEnergyType=acct_gather_energy/gpu |
| * Note: slurmstepds may still load gpu plugin for gpu_g_usage_read() |
| * unless JobAcctGatherParams=DisableGPUAcct is set |
| */ |
| if (!in_slurmd || !xstrstr(slurm_conf.acct_gather_energy_type, "gpu")) |
| gpu_plugin_fini(); |
| xfree(gres_conf_file); |
| FREE_NULL_LIST(tmp_gres_conf_list); |
| _pack_context_buf(); |
| _pack_gres_conf(); |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return rc; |
| } |
| |
| /* |
| * Pack this node's gres configuration into a buffer |
| * IN/OUT buffer - message buffer to pack |
| */ |
| extern int gres_node_config_pack(buf_t *buffer) |
| { |
| int rc = SLURM_SUCCESS; |
| uint32_t magic = GRES_MAGIC; |
| uint16_t rec_cnt = 0, version = SLURM_PROTOCOL_VERSION; |
| list_itr_t *iter; |
| gres_slurmd_conf_t *gres_slurmd_conf; |
| |
| pack16(version, buffer); |
| if (gres_conf_list) |
| rec_cnt = list_count(gres_conf_list); |
| pack16(rec_cnt, buffer); |
| if (rec_cnt) { |
| /* |
| * It might be tempting to convert this to slurm_pack_list, |
| * The problem with that is how we unpack things in the function |
| * below this. It uses 'node_name' all throughout which can not |
| * be passed to slurm_unpack_list. This function is not called |
| * very often (only when the slurmd registers). The efforts to |
| * make this work are just not worth it. |
| */ |
| iter = list_iterator_create(gres_conf_list); |
| while ((gres_slurmd_conf = |
| (gres_slurmd_conf_t *) list_next(iter))) { |
| pack32(magic, buffer); |
| pack64(gres_slurmd_conf->count, buffer); |
| pack32(gres_slurmd_conf->cpu_cnt, buffer); |
| pack32(gres_slurmd_conf->config_flags, buffer); |
| pack32(gres_slurmd_conf->plugin_id, buffer); |
| packstr(gres_slurmd_conf->cpus, buffer); |
| packstr(gres_slurmd_conf->links, buffer); |
| packstr(gres_slurmd_conf->name, buffer); |
| packstr(gres_slurmd_conf->type_name, buffer); |
| packstr(gres_slurmd_conf->unique_id, buffer); |
| } |
| list_iterator_destroy(iter); |
| } |
| |
| return rc; |
| } |
| |
| /* |
| * Unpack this node's configuration from a buffer (built/packed by slurmd) |
| * IN/OUT buffer - message buffer to unpack |
| * IN node_name - name of node whose data is being unpacked |
| */ |
| extern int gres_node_config_unpack(buf_t *buffer, char *node_name) |
| { |
| int i, rc = SLURM_SUCCESS; |
| uint32_t cpu_cnt = 0, magic = 0, plugin_id = 0; |
| uint64_t count64 = 0; |
| uint16_t rec_cnt = 0, protocol_version = 0; |
| uint32_t config_flags = 0; |
| char *tmp_cpus = NULL, *tmp_links = NULL, *tmp_name = NULL; |
| char *tmp_type = NULL; |
| char *tmp_unique_id = NULL; |
| gres_slurmd_conf_t *p; |
| bool locked = false; |
| slurm_gres_context_t *gres_ctx; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| FREE_NULL_LIST(gres_conf_list); |
| gres_conf_list = list_create(destroy_gres_slurmd_conf); |
| |
| safe_unpack16(&protocol_version, buffer); |
| |
| safe_unpack16(&rec_cnt, buffer); |
| if (rec_cnt == 0) |
| return SLURM_SUCCESS; |
| if (rec_cnt > NO_VAL16) |
| goto unpack_error; |
| |
| slurm_mutex_lock(&gres_context_lock); |
| locked = true; |
| if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) { |
| error("%s: protocol_version %hu not supported", |
| __func__, protocol_version); |
| goto unpack_error; |
| } |
| for (i = 0; i < rec_cnt; i++) { |
| bool new_has_file; |
| bool orig_has_file; |
| if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| safe_unpack32(&magic, buffer); |
| if (magic != GRES_MAGIC) |
| goto unpack_error; |
| |
| safe_unpack64(&count64, buffer); |
| safe_unpack32(&cpu_cnt, buffer); |
| safe_unpack32(&config_flags, buffer); |
| safe_unpack32(&plugin_id, buffer); |
| safe_unpackstr(&tmp_cpus, buffer); |
| safe_unpackstr(&tmp_links, buffer); |
| safe_unpackstr(&tmp_name, buffer); |
| safe_unpackstr(&tmp_type, buffer); |
| safe_unpackstr(&tmp_unique_id, buffer); |
| } |
| |
| if (!count64) |
| goto empty; |
| |
| log_flag(GRES, "Node:%s Gres:%s Type:%s UniqueId:%s Flags:%s CPU_IDs:%s CPU#:%u Count:%"PRIu64" Links:%s", |
| node_name, tmp_name, tmp_type, tmp_unique_id, |
| gres_flags2str(config_flags), tmp_cpus, cpu_cnt, |
| count64, tmp_links); |
| |
| if (!(gres_ctx = _find_context_by_id(plugin_id))) { |
| /* |
| * GresPlugins is inconsistently configured. |
| * Not a fatal error, but skip this data. |
| */ |
| error("%s: No plugin configured to process GRES data from node %s (Name:%s Type:%s PluginID:%u Count:%"PRIu64")", |
| __func__, node_name, tmp_name, tmp_type, |
| plugin_id, count64); |
| xfree(tmp_cpus); |
| xfree(tmp_links); |
| xfree(tmp_name); |
| xfree(tmp_type); |
| xfree(tmp_unique_id); |
| continue; |
| } |
| |
| if (xstrcmp(gres_ctx->gres_name, tmp_name)) { |
| /* |
| * Should have been caught in |
| * gres_init() |
| */ |
| error("%s: gres/%s duplicate plugin ID with %s, unable to process", |
| __func__, tmp_name, |
| gres_ctx->gres_name); |
| continue; |
| } |
| new_has_file = config_flags & GRES_CONF_HAS_FILE; |
| orig_has_file = gres_ctx->config_flags & |
| GRES_CONF_HAS_FILE; |
| if (orig_has_file && !new_has_file && count64) { |
| error("%s: gres/%s lacks \"File=\" parameter for node %s", |
| __func__, tmp_name, node_name); |
| config_flags |= GRES_CONF_HAS_FILE; |
| } |
| if (new_has_file && (count64 > MAX_GRES_BITMAP) && |
| !gres_id_shared(config_flags)) { |
| /* |
| * Avoid over-subscribing memory with |
| * huge bitmaps |
| */ |
| error("%s: gres/%s has \"File=\" plus very large " |
| "\"Count\" (%"PRIu64") for node %s, " |
| "resetting value to %d", |
| __func__, tmp_name, count64, |
| node_name, MAX_GRES_BITMAP); |
| count64 = MAX_GRES_BITMAP; |
| } |
| |
| /* |
| * If one node in the bunch said a gres has removed |
| * GRES_CONF_ONE_SHARING then remove it from the |
| * context. |
| */ |
| if ((gres_ctx->config_flags & GRES_CONF_LOADED) && |
| gres_id_shared(config_flags)) { |
| bool gc_one_sharing = |
| gres_ctx->config_flags & |
| GRES_CONF_ONE_SHARING; |
| bool got_one_sharing = |
| config_flags & GRES_CONF_ONE_SHARING; |
| if (gc_one_sharing == got_one_sharing) { |
| } else if (!gc_one_sharing && got_one_sharing) { |
| log_flag(GRES, "gres/%s was already set up to share all ignoring one_sharing from %s", |
| tmp_name, node_name); |
| config_flags &= ~GRES_CONF_ONE_SHARING; |
| } else if (!got_one_sharing) { |
| log_flag(GRES, "gres/%s was already set up to only share one, but we just found the opposite from %s. Removing flag.", |
| tmp_name, node_name); |
| gres_ctx->config_flags &= |
| ~GRES_CONF_ONE_SHARING; |
| } |
| } |
| |
| /* |
| * If we read in from state we want to take the slurmd's view |
| * over our state. |
| */ |
| if (gres_ctx->config_flags & GRES_CONF_FROM_STATE) |
| gres_ctx->config_flags = config_flags; |
| else |
| gres_ctx->config_flags |= config_flags; |
| |
| /* |
| * On the slurmctld we need to load the plugins to |
| * correctly set env vars. We want to call this only |
| * after we have the config_flags so we can tell if we |
| * are CountOnly or not. |
| */ |
| if (!(gres_ctx->config_flags & |
| GRES_CONF_LOADED)) { |
| (void)_load_plugin(gres_ctx); |
| gres_ctx->config_flags |= |
| GRES_CONF_LOADED; |
| } |
| empty: |
| p = xmalloc(sizeof(gres_slurmd_conf_t)); |
| p->config_flags = config_flags; |
| p->count = count64; |
| p->cpu_cnt = cpu_cnt; |
| p->cpus = tmp_cpus; |
| tmp_cpus = NULL; /* Nothing left to xfree */ |
| p->links = tmp_links; |
| tmp_links = NULL; /* Nothing left to xfree */ |
| p->name = tmp_name; /* Preserve for accounting! */ |
| p->type_name = tmp_type; |
| tmp_type = NULL; /* Nothing left to xfree */ |
| p->plugin_id = plugin_id; |
| p->unique_id = tmp_unique_id; |
| tmp_unique_id = NULL; |
| if (gres_links_validate(p->links) < -1) { |
| error("%s: Ignoring invalid Links=%s for Name=%s", |
| __func__, p->links, p->name); |
| xfree(p->links); |
| } |
| list_append(gres_conf_list, p); |
| } |
| |
| slurm_mutex_unlock(&gres_context_lock); |
| return rc; |
| |
| unpack_error: |
| error("%s: unpack error from node %s", __func__, node_name); |
| xfree(tmp_cpus); |
| xfree(tmp_links); |
| xfree(tmp_name); |
| xfree(tmp_type); |
| if (locked) |
| slurm_mutex_unlock(&gres_context_lock); |
| return SLURM_ERROR; |
| } |
| |
| static void _gres_state_delete_members(void *x) |
| { |
| gres_state_t *gres_ptr = (gres_state_t *) x; |
| |
| if (!gres_ptr) |
| return; |
| |
| xfree(gres_ptr->gres_name); |
| xassert(!gres_ptr->gres_data); /* This must be freed beforehand */ |
| xfree(gres_ptr); |
| } |
| |
| static void _gres_node_state_delete_topo(gres_node_state_t *gres_ns) |
| { |
| int i; |
| |
| for (i = 0; i < gres_ns->topo_cnt; i++) { |
| if (gres_ns->topo_gres_bitmap) |
| FREE_NULL_BITMAP(gres_ns->topo_gres_bitmap[i]); |
| if (gres_ns->topo_core_bitmap) |
| FREE_NULL_BITMAP(gres_ns->topo_core_bitmap[i]); |
| if (gres_ns->topo_res_core_bitmap) |
| FREE_NULL_BITMAP(gres_ns->topo_res_core_bitmap[i]); |
| xfree(gres_ns->topo_type_name[i]); |
| } |
| xfree(gres_ns->topo_gres_bitmap); |
| xfree(gres_ns->topo_core_bitmap); |
| xfree(gres_ns->topo_gres_cnt_alloc); |
| xfree(gres_ns->topo_gres_cnt_avail); |
| xfree(gres_ns->topo_res_core_bitmap); |
| xfree(gres_ns->topo_type_id); |
| xfree(gres_ns->topo_type_name); |
| } |
| |
| static void _gres_node_state_delete(gres_node_state_t *gres_ns) |
| { |
| int i; |
| |
| FREE_NULL_BITMAP(gres_ns->gres_bit_alloc); |
| xfree(gres_ns->gres_used); |
| if (gres_ns->links_cnt) { |
| for (i = 0; i < gres_ns->link_len; i++) |
| xfree(gres_ns->links_cnt[i]); |
| xfree(gres_ns->links_cnt); |
| } |
| |
| _gres_node_state_delete_topo(gres_ns); |
| |
| for (i = 0; i < gres_ns->type_cnt; i++) { |
| xfree(gres_ns->type_name[i]); |
| } |
| xfree(gres_ns->type_cnt_alloc); |
| xfree(gres_ns->type_cnt_avail); |
| xfree(gres_ns->type_id); |
| xfree(gres_ns->type_name); |
| xfree(gres_ns); |
| } |
| |
| /* |
| * Delete an element placed on gres_list by _node_config_validate() |
| * free associated memory |
| */ |
| static void _gres_node_list_delete(void *list_element) |
| { |
| gres_state_t *gres_state_node; |
| gres_node_state_t *gres_ns; |
| |
| gres_state_node = (gres_state_t *) list_element; |
| gres_ns = (gres_node_state_t *) gres_state_node->gres_data; |
| _gres_node_state_delete(gres_ns); |
| gres_state_node->gres_data = NULL; |
| _gres_state_delete_members(gres_state_node); |
| } |
| |
| extern void gres_add_type(char *type, gres_node_state_t *gres_ns, |
| uint64_t tmp_gres_cnt) |
| { |
| int i; |
| uint32_t type_id; |
| |
| if (!xstrcasecmp(type, "no_consume")) { |
| gres_ns->no_consume = true; |
| return; |
| } |
| |
| type_id = gres_build_id(type); |
| for (i = 0; i < gres_ns->type_cnt; i++) { |
| if (gres_ns->type_id[i] != type_id) |
| continue; |
| gres_ns->type_cnt_avail[i] += tmp_gres_cnt; |
| break; |
| } |
| |
| if (i >= gres_ns->type_cnt) { |
| gres_ns->type_cnt++; |
| gres_ns->type_cnt_alloc = |
| xrealloc(gres_ns->type_cnt_alloc, |
| sizeof(uint64_t) * gres_ns->type_cnt); |
| gres_ns->type_cnt_avail = |
| xrealloc(gres_ns->type_cnt_avail, |
| sizeof(uint64_t) * gres_ns->type_cnt); |
| gres_ns->type_id = |
| xrealloc(gres_ns->type_id, |
| sizeof(uint32_t) * gres_ns->type_cnt); |
| gres_ns->type_name = |
| xrealloc(gres_ns->type_name, |
| sizeof(char *) * gres_ns->type_cnt); |
| gres_ns->type_cnt_avail[i] += tmp_gres_cnt; |
| gres_ns->type_id[i] = type_id; |
| gres_ns->type_name[i] = xstrdup(type); |
| } |
| } |
| |
| /* |
| * Compute the total GRES count for a particular gres_name. |
| * Note that a given gres_name can appear multiple times in the orig_config |
| * string for multiple types (e.g. "gres=gpu:kepler:1,gpu:tesla:2"). |
| * IN/OUT gres_ns - set gres_cnt_config field in this structure |
| * IN orig_config - gres configuration from slurm.conf |
| * IN gres_name - name of the gres type (e.g. "gpu") |
| * IN gres_name_colon - gres name with appended colon |
| * IN gres_name_colon_len - size of gres_name_colon |
| * RET - Total configured count for this GRES type |
| */ |
| static void _get_gres_cnt(gres_node_state_t *gres_ns, char *orig_config, |
| char *gres_name, char *gres_name_colon, |
| int gres_name_colon_len) |
| { |
| char *node_gres_config, *tok, *last_tok = NULL; |
| char *sub_tok, *last_sub_tok = NULL; |
| char *num, *paren, *last_num = NULL; |
| uint64_t gres_config_cnt = 0, tmp_gres_cnt = 0, mult; |
| int i; |
| |
| xassert(gres_ns); |
| if (orig_config == NULL) { |
| gres_ns->gres_cnt_config = 0; |
| return; |
| } |
| |
| for (i = 0; i < gres_ns->type_cnt; i++) { |
| gres_ns->type_cnt_avail[i] = 0; |
| } |
| |
| node_gres_config = xstrdup(orig_config); |
| tok = strtok_r(node_gres_config, ",", &last_tok); |
| while (tok) { |
| if (!xstrcmp(tok, gres_name)) { |
| gres_config_cnt = 1; |
| break; |
| } |
| if (!xstrncmp(tok, gres_name_colon, gres_name_colon_len)) { |
| paren = strrchr(tok, '('); |
| if (paren) /* Ignore socket binding info */ |
| paren[0] = '\0'; |
| num = strrchr(tok, ':'); |
| if (!num) { |
| error("Bad GRES configuration: %s", tok); |
| break; |
| } |
| tmp_gres_cnt = strtoll(num + 1, &last_num, 10); |
| if ((num[1] < '0') || (num[1] > '9')) { |
| /* |
| * Type name, no count (e.g. "gpu:tesla"). |
| * assume count of 1. |
| */ |
| tmp_gres_cnt = 1; |
| } else if ((mult = suffix_mult(last_num)) != NO_VAL64) { |
| tmp_gres_cnt *= mult; |
| num[0] = '\0'; |
| } else { |
| error("Bad GRES configuration: %s", tok); |
| break; |
| } |
| |
| gres_config_cnt += tmp_gres_cnt; |
| |
| sub_tok = strtok_r(tok, ":", &last_sub_tok); |
| if (sub_tok) /* Skip GRES name */ |
| sub_tok = strtok_r(NULL, ":", &last_sub_tok); |
| while (sub_tok) { |
| gres_add_type(sub_tok, gres_ns, |
| tmp_gres_cnt); |
| sub_tok = strtok_r(NULL, ":", &last_sub_tok); |
| } |
| } |
| tok = strtok_r(NULL, ",", &last_tok); |
| } |
| xfree(node_gres_config); |
| |
| gres_ns->gres_cnt_config = gres_config_cnt; |
| } |
| |
| static int _find_gres_type(gres_node_state_t *gres_ns, uint32_t type_id) |
| { |
| int type_index = -1; |
| for (int i = 0; i < gres_ns->type_cnt; i++) { |
| if(type_id == gres_ns->type_id[i]) { |
| type_index = i; |
| break; |
| } |
| } |
| return type_index; |
| } |
| |
| static int _valid_gres_type(char *gres_name, gres_node_state_t *gres_ns, |
| bool config_overrides, char **reason_down) |
| { |
| int i, j; |
| uint64_t model_cnt; |
| int num_type_rem = 0; |
| |
| if (gres_ns->type_cnt == 0) |
| return SLURM_SUCCESS; |
| |
| for (i = 0; i < gres_ns->type_cnt; i++) { |
| model_cnt = 0; |
| if (gres_ns->type_cnt) { |
| for (j = 0; j < gres_ns->type_cnt; j++) { |
| if (gres_ns->type_id[i] == |
| gres_ns->type_id[j]) |
| model_cnt += |
| gres_ns->type_cnt_avail[j]; |
| } |
| } else { |
| for (j = 0; j < gres_ns->topo_cnt; j++) { |
| if (gres_ns->topo_type_id[i] == |
| gres_ns->topo_type_id[j]) |
| model_cnt += gres_ns-> |
| topo_gres_cnt_avail[j]; |
| } |
| } |
| if (config_overrides) { |
| gres_ns->type_cnt_avail[i] = model_cnt; |
| } else if (model_cnt < gres_ns->type_cnt_avail[i]) { |
| if (reason_down) { |
| xstrfmtcat(*reason_down, |
| "%s:%s count too low " |
| "(%"PRIu64" < %"PRIu64")", |
| gres_name, gres_ns->type_name[i], |
| model_cnt, |
| gres_ns->type_cnt_avail[i]); |
| } |
| return SLURM_ERROR; |
| } |
| } |
| |
| /* |
| * Remove types with 0 available. This happens when updating the type |
| * of a gres in slurm.conf during a reconfig |
| */ |
| for (int i = 0; i < gres_ns->type_cnt; i++) { |
| if (gres_ns->type_cnt_avail[i]) |
| continue; |
| num_type_rem++; |
| } |
| |
| if (num_type_rem) { |
| int tmp_cnt; |
| uint64_t *tmp_type_cnt_alloc, *tmp_type_cnt_avail; |
| uint32_t *tmp_type_id; |
| char **tmp_type_name; |
| |
| tmp_cnt = gres_ns->type_cnt - num_type_rem; |
| tmp_type_id = xcalloc(tmp_cnt, sizeof(*tmp_type_id)); |
| tmp_type_cnt_alloc = |
| xcalloc(tmp_cnt, sizeof(*tmp_type_cnt_alloc)); |
| tmp_type_cnt_avail = |
| xcalloc(tmp_cnt, sizeof(*tmp_type_cnt_avail)); |
| tmp_type_name = |
| xcalloc(tmp_cnt, sizeof(*tmp_type_name)); |
| |
| for (int j = 0, i = 0; i < gres_ns->type_cnt; i++) { |
| if (!gres_ns->type_cnt_avail[i]) { |
| xfree(gres_ns->type_name[i]); |
| continue; |
| } |
| tmp_type_cnt_alloc[j] = |
| gres_ns->type_cnt_alloc[i]; |
| tmp_type_cnt_avail[j] = |
| gres_ns->type_cnt_avail[i]; |
| tmp_type_id[j] = gres_ns->type_id[i]; |
| tmp_type_name[j] = gres_ns->type_name[i]; |
| j++; |
| } |
| |
| xfree(gres_ns->type_cnt_alloc); |
| xfree(gres_ns->type_cnt_avail); |
| xfree(gres_ns->type_id); |
| xfree(gres_ns->type_name); |
| |
| gres_ns->type_cnt_alloc = tmp_type_cnt_alloc; |
| gres_ns->type_cnt_avail = tmp_type_cnt_avail; |
| gres_ns->type_id = tmp_type_id; |
| gres_ns->type_name = tmp_type_name; |
| gres_ns->type_cnt -= num_type_rem; |
| } |
| |
| for (int i = 0; i < gres_ns->topo_cnt; i++) { |
| if (_find_gres_type(gres_ns, gres_ns->topo_type_id[i]) < 0) { |
| if (reason_down && (*reason_down == NULL)) { |
| xstrfmtcat(*reason_down, |
| "%s type (%s) reported but not configured", |
| gres_name, |
| gres_ns->topo_type_name[i]); |
| } |
| return SLURM_ERROR; |
| } |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| static gres_node_state_t *_build_gres_node_state(void) |
| { |
| gres_node_state_t *gres_ns; |
| |
| gres_ns = xmalloc(sizeof(gres_node_state_t)); |
| gres_ns->gres_cnt_config = NO_VAL64; |
| gres_ns->gres_cnt_found = NO_VAL64; |
| |
| return gres_ns; |
| } |
| |
| /* |
| * Build a node's gres record based only upon the slurm.conf contents |
| */ |
| static void _node_config_init(char *orig_config, slurm_gres_context_t *gres_ctx, |
| gres_state_t *gres_state_node) |
| { |
| gres_node_state_t *gres_ns; |
| |
| if (!gres_state_node->gres_data) |
| gres_state_node->gres_data = _build_gres_node_state(); |
| gres_ns = (gres_node_state_t *) gres_state_node->gres_data; |
| |
| /* If the resource isn't configured for use with this node */ |
| if ((orig_config == NULL) || (orig_config[0] == '\0')) { |
| gres_ns->gres_cnt_config = 0; |
| return; |
| } |
| |
| _get_gres_cnt(gres_ns, orig_config, |
| gres_ctx->gres_name, |
| gres_ctx->gres_name_colon, |
| gres_ctx->gres_name_colon_len); |
| |
| gres_ctx->total_cnt += gres_ns->gres_cnt_config; |
| |
| /* Use count from recovered state, if higher */ |
| gres_ns->gres_cnt_avail = MAX(gres_ns->gres_cnt_avail, |
| gres_ns->gres_cnt_config); |
| if ((gres_ns->gres_bit_alloc != NULL) && |
| (gres_ns->gres_cnt_avail > |
| bit_size(gres_ns->gres_bit_alloc)) && |
| !gres_id_shared(gres_ctx->config_flags)) { |
| bit_realloc(gres_ns->gres_bit_alloc, |
| gres_ns->gres_cnt_avail); |
| } |
| } |
| |
| /* Set up the shared/sharing pointers for easy look up later */ |
| static void _set_alt_gres(gres_state_t *gres_state_node_shared, |
| gres_state_t *gres_state_node_sharing) |
| { |
| if (gres_state_node_shared) { |
| if (!gres_state_node_sharing) { |
| error("we have a shared gres of '%s' but no gres that is sharing", |
| gres_state_node_shared->gres_name); |
| } else { |
| gres_node_state_t *gres_ns_shared = |
| gres_state_node_shared->gres_data; |
| gres_node_state_t *gres_ns_sharing = |
| gres_state_node_sharing->gres_data; |
| gres_ns_shared->alt_gres = gres_state_node_sharing; |
| gres_ns_sharing->alt_gres = gres_state_node_shared; |
| } |
| } |
| } |
| |
| /* |
| * Build a node's gres record based only upon the slurm.conf contents |
| * IN orig_config - Gres information supplied from slurm.conf |
| * IN/OUT gres_list - List of Gres records for this node to track usage |
| */ |
| extern void gres_init_node_config(char *orig_config, list_t **gres_list) |
| { |
| gres_state_t *gres_state_node, *gres_state_node_sharing = NULL, |
| *gres_state_node_shared = NULL; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| if ((gres_context_cnt > 0) && (*gres_list == NULL)) { |
| *gres_list = list_create(_gres_node_list_delete); |
| } |
| for (int i = 0; i < gres_context_cnt; i++) { |
| gres_node_state_t *gres_ns; |
| /* Find or create gres_state entry on the list */ |
| gres_state_node = list_find_first(*gres_list, gres_find_id, |
| &gres_context[i].plugin_id); |
| if (gres_state_node == NULL) { |
| gres_state_node = gres_create_state( |
| &gres_context[i], GRES_STATE_SRC_CONTEXT_PTR, |
| GRES_STATE_TYPE_NODE, _build_gres_node_state()); |
| list_append(*gres_list, gres_state_node); |
| } |
| |
| _node_config_init(orig_config, &gres_context[i], |
| gres_state_node); |
| |
| gres_ns = gres_state_node->gres_data; |
| if (gres_ns && gres_ns->gres_cnt_config) { |
| if (gres_id_sharing(gres_state_node->plugin_id)) |
| gres_state_node_sharing = gres_state_node; |
| else if (gres_id_shared(gres_state_node->config_flags)) |
| gres_state_node_shared = gres_state_node; |
| } |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| _set_alt_gres(gres_state_node_shared, gres_state_node_sharing); |
| } |
| |
| static int _foreach_get_tot_from_slurmd_conf(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| tot_from_slurmd_conf_t *slurmd_conf_tot = arg; |
| |
| if (gres_slurmd_conf->plugin_id != slurmd_conf_tot->plugin_id) |
| return 0; |
| |
| slurmd_conf_tot->config_flags |= gres_slurmd_conf->config_flags; |
| |
| slurmd_conf_tot->gres_cnt += gres_slurmd_conf->count; |
| slurmd_conf_tot->rec_cnt++; |
| |
| if (gres_slurmd_conf->cpus || gres_slurmd_conf->type_name) |
| slurmd_conf_tot->cpu_set_cnt++; |
| |
| return 0; |
| } |
| |
| /* |
| * Determine GRES availability on some node |
| * |
| * tot_from_slurmd_conf_t: |
| * plugin_id IN - plugin number to search for |
| * config_flags OUT - config flags from slurmd |
| * topo_cnt OUT - count of gres.conf records of this ID found by slurmd |
| * (each can have different topology) |
| * config_type_cnt OUT - Count of records for this GRES found in configuration, |
| * each of this represents a different Type of of GRES with |
| * this name (e.g. GPU model) |
| * gres_cnt OUT - total number of GRES available of this ID on this node in (sum |
| * across all records of this ID) |
| */ |
| static void _get_tot_from_slurmd_conf(tot_from_slurmd_conf_t *slurmd_conf_tot) |
| { |
| xassert(slurmd_conf_tot); |
| |
| slurmd_conf_tot->config_flags = 0; |
| slurmd_conf_tot->cpu_set_cnt = 0; |
| slurmd_conf_tot->config_type_cnt = 0; |
| slurmd_conf_tot->topo_cnt = 0; |
| slurmd_conf_tot->gres_cnt = 0; |
| slurmd_conf_tot->rec_cnt = 0; |
| |
| if (gres_conf_list == NULL) |
| return; |
| |
| (void) list_for_each(gres_conf_list, _foreach_get_tot_from_slurmd_conf, |
| slurmd_conf_tot); |
| |
| slurmd_conf_tot->config_type_cnt = slurmd_conf_tot->rec_cnt; |
| } |
| |
| /* Convert comma-delimited array of link counts to an integer array */ |
| static int _links_str2array(char *links, char *node_name, |
| gres_node_state_t *gres_ns, |
| int gres_inx, int gres_cnt, |
| char **reason_down) |
| { |
| char *start_ptr, *end_ptr = NULL, *tmp = NULL; |
| int i = 0, rc = SLURM_SUCCESS; |
| |
| if (!links) /* No "Links=" data */ |
| return SLURM_SUCCESS; |
| if (gres_inx >= gres_ns->link_len) { |
| tmp = xstrdup_printf("Invalid GRES index (%d >= %d)", |
| gres_inx, gres_cnt); |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| |
| start_ptr = links; |
| while (1) { |
| gres_ns->links_cnt[gres_inx][i] = |
| strtol(start_ptr, &end_ptr, 10); |
| if (gres_ns->links_cnt[gres_inx][i] < -2) { |
| tmp = xstrdup_printf("Invalid GRES Links value (%s) on node %s: Link value '%d' < -2", |
| links, node_name, |
| gres_ns->links_cnt[gres_inx][i]); |
| |
| gres_ns->links_cnt[gres_inx][i] = 0; |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| if (end_ptr[0] == '\0') |
| return SLURM_SUCCESS; |
| if (end_ptr[0] != ',') { |
| tmp = xstrdup_printf("Invalid GRES Links value (%s) on node %s: end_ptr[0]='%c' != ','", |
| links, node_name, end_ptr[0]); |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| if (++i >= gres_ns->link_len) { |
| tmp = xstrdup_printf("Invalid GRES Links value (%s) on node %s: i=%d >= link_len=%d.", |
| links, node_name, |
| i, gres_ns->link_len); |
| rc = SLURM_ERROR; |
| goto end_it; |
| } |
| start_ptr = end_ptr + 1; |
| } |
| |
| end_it: |
| if (rc) { |
| error("%s: %s If using AutoDetect the amount of GPUs configured in slurm.conf does not match what was detected. If this is intentional, please turn off AutoDetect and manually specify them in gres.conf.", |
| __func__, tmp); |
| if (reason_down && !(*reason_down)) { |
| *reason_down = tmp; |
| tmp = NULL; |
| } else |
| xfree(tmp); |
| |
| /* create zeroed-out links array (NVLINK_NONE == 0) */ |
| memset(gres_ns->links_cnt[gres_inx], 0, gres_cnt * sizeof(int)); |
| } |
| |
| return rc; |
| } |
| |
| static bool _valid_gres_types(char *gres_name, gres_node_state_t *gres_ns, |
| char **reason_down) |
| { |
| bool rc = true; |
| uint64_t gres_cnt_found = 0, gres_sum; |
| int topo_inx, type_inx; |
| |
| if ((gres_ns->type_cnt == 0) || (gres_ns->topo_cnt == 0)) |
| return rc; |
| |
| for (type_inx = 0; type_inx < gres_ns->type_cnt; type_inx++) { |
| gres_cnt_found = 0; |
| for (topo_inx = 0; topo_inx < gres_ns->topo_cnt; topo_inx++) { |
| if (gres_ns->topo_type_id[topo_inx] != |
| gres_ns->type_id[type_inx]) |
| continue; |
| gres_sum = gres_cnt_found + |
| gres_ns->topo_gres_cnt_avail[topo_inx]; |
| if (gres_sum > gres_ns->type_cnt_avail[type_inx]) { |
| gres_ns->topo_gres_cnt_avail[topo_inx] -= |
| (gres_sum - |
| gres_ns->type_cnt_avail[type_inx]); |
| } |
| gres_cnt_found += |
| gres_ns->topo_gres_cnt_avail[topo_inx]; |
| } |
| if (gres_cnt_found < gres_ns->type_cnt_avail[type_inx]) { |
| rc = false; |
| break; |
| } |
| } |
| if (!rc && reason_down && (*reason_down == NULL)) { |
| xstrfmtcat(*reason_down, |
| "%s:%s count too low (%"PRIu64" < %"PRIu64")", |
| gres_name, gres_ns->type_name[type_inx], |
| gres_cnt_found, gres_ns->type_cnt_avail[type_inx]); |
| } |
| |
| return rc; |
| } |
| |
| static void _gres_bit_alloc_resize(gres_node_state_t *gres_ns, |
| uint64_t gres_bits) |
| { |
| if (!gres_bits) { |
| FREE_NULL_BITMAP(gres_ns->gres_bit_alloc); |
| return; |
| } |
| |
| if (!gres_ns->gres_bit_alloc) |
| gres_ns->gres_bit_alloc = bit_alloc(gres_bits); |
| else if (gres_bits != bit_size(gres_ns->gres_bit_alloc)) |
| bit_realloc(gres_ns->gres_bit_alloc, gres_bits); |
| } |
| |
| /* |
| * Job scheduling handles gres affinity on a socket basis internally. |
| * However, the interface for setting affinity is to specify cores. This can |
| * lead to the faulty expectation that the core affinity will be respected by |
| * the Slurm scheduler. |
| * |
| * Therefore this check was added to avoid users setting the cores limit and |
| * expecting Slurm to respect it (which it doesn't and never has). |
| * |
| * In addition to misleading users, a bug can arise where steps and jobs don't |
| * line up because steps do look at the cores rather than the sockets like the |
| * jobs. (i.e. job allocates a core the the step rejects), if we just wanted to |
| * solve this bug we would just expand the cpu list to fill the socket here |
| * instead of throwing an error. |
| */ |
| static int _check_core_range_matches_sock(bitstr_t *tmp_bitmap, |
| rebuild_topo_t *rebuild_topo, |
| gres_slurmd_conf_t *gres_slurmd_conf) |
| { |
| for (int i = 0; (i < rebuild_topo->sock_cnt); i++) { |
| int first = i * rebuild_topo->cores_per_sock; |
| int last = (i + 1) * rebuild_topo->cores_per_sock; |
| int core_cnt = bit_set_count_range(tmp_bitmap, first, last); |
| |
| if (core_cnt && (core_cnt != rebuild_topo->cores_per_sock)) { |
| slurm_gres_context_t *gres_ctx = rebuild_topo->gres_ctx; |
| gres_node_state_t *gres_ns = rebuild_topo->gres_ns; |
| char *gres_cores_str = bit_fmt_full(tmp_bitmap); |
| char *tmp; |
| |
| if (gres_slurmd_conf->config_flags & |
| GRES_CONF_AUTODETECT) { |
| tmp = xstrdup_printf( |
| "%s GRES autodetected core affinity %s on node %s doesn't match socket boundaries. (Socket %d is cores %d-%d). " |
| "Consider setting SlurmdParameters=l3cache_as_socket (recommended) or override this by manually specifying core affinity in gres.conf.", |
| gres_ctx->gres_type, gres_cores_str, |
| rebuild_topo->node_name, i, first, |
| (last - 1)); |
| } else { |
| tmp = xstrdup_printf( |
| "%s GRES core specification %s for node %s doesn't match socket boundaries. (Socket %d is cores %d-%d)", |
| gres_ctx->gres_type, gres_cores_str, |
| rebuild_topo->node_name, i, first, |
| (last - 1)); |
| } |
| xfree(gres_cores_str); |
| FREE_NULL_BITMAP(gres_ns->topo_core_bitmap[ |
| rebuild_topo->topo_cnt]); |
| rebuild_topo->rc = EINVAL; |
| error("%s: %s", __func__, tmp); |
| if (rebuild_topo->reason_down && |
| !(*rebuild_topo->reason_down)) |
| xstrfmtcat(*rebuild_topo->reason_down, "%s", |
| tmp); |
| xfree(tmp); |
| return SLURM_ERROR; |
| } |
| } |
| return SLURM_SUCCESS; |
| } |
| |
| static int _foreach_rebuild_topo(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| rebuild_topo_t *rebuild_topo = arg; |
| slurm_gres_context_t *gres_ctx = rebuild_topo->gres_ctx; |
| gres_node_state_t *gres_ns = rebuild_topo->gres_ns; |
| int topo_cnt = rebuild_topo->topo_cnt; |
| |
| if (gres_slurmd_conf->plugin_id != gres_ctx->plugin_id) |
| return 0; |
| |
| if (gres_ns->gres_bit_alloc && !gres_id_shared(gres_ctx->config_flags)) |
| gres_ns->topo_gres_cnt_alloc[topo_cnt] = 0; |
| gres_ns->topo_gres_cnt_avail[topo_cnt] = gres_slurmd_conf->count; |
| if (gres_slurmd_conf->cpus) { |
| /* NOTE: gres_slurmd_conf->cpus is cores */ |
| bitstr_t *tmp_bitmap = bit_alloc(rebuild_topo->core_cnt); |
| int ret = bit_unfmt(tmp_bitmap, gres_slurmd_conf->cpus); |
| if (ret != SLURM_SUCCESS) { |
| error("%s: %s: invalid GRES core specification (%s) on node %s", |
| __func__, gres_ctx->gres_type, |
| gres_slurmd_conf->cpus, |
| rebuild_topo->node_name); |
| FREE_NULL_BITMAP(tmp_bitmap); |
| rebuild_topo->rc = ESLURM_INVALID_GRES; |
| return -1; |
| } else { |
| FREE_NULL_BITMAP( |
| gres_ns->topo_core_bitmap[topo_cnt]); |
| gres_ns->topo_core_bitmap[topo_cnt] = tmp_bitmap; |
| } |
| if (_check_core_range_matches_sock(tmp_bitmap, rebuild_topo, |
| gres_slurmd_conf)) |
| return -1; |
| |
| rebuild_topo->cpus_config = rebuild_topo->core_cnt; |
| } else if (rebuild_topo->cpus_config && !rebuild_topo->cpu_config_err) { |
| rebuild_topo->cpu_config_err = true; |
| error("%s: %s: has CPUs configured for only some of the records on node %s", |
| __func__, gres_ctx->gres_type, rebuild_topo->node_name); |
| } |
| |
| if (gres_slurmd_conf->links) { |
| if (gres_ns->links_cnt && |
| (gres_ns->link_len != rebuild_topo->tot_gres_cnt)) { |
| /* Size changed, need to rebuild */ |
| for (int j = 0; j < gres_ns->link_len; j++) |
| xfree(gres_ns->links_cnt[j]); |
| xfree(gres_ns->links_cnt); |
| } |
| if (!gres_ns->links_cnt) { |
| gres_ns->link_len = rebuild_topo->tot_gres_cnt; |
| gres_ns->links_cnt = xcalloc(rebuild_topo->tot_gres_cnt, |
| sizeof(int *)); |
| for (int j = 0; j < rebuild_topo->tot_gres_cnt; j++) { |
| gres_ns->links_cnt[j] = |
| xcalloc(rebuild_topo->tot_gres_cnt, |
| sizeof(int)); |
| } |
| } |
| } |
| if (gres_id_shared(gres_slurmd_conf->config_flags)) { |
| /* If running jobs recovered then already set */ |
| if (!gres_ns->topo_gres_bitmap[topo_cnt]) { |
| gres_ns->topo_gres_bitmap[topo_cnt] = |
| bit_alloc(rebuild_topo->dev_cnt); |
| bit_set(gres_ns->topo_gres_bitmap[topo_cnt], |
| rebuild_topo->gres_inx); |
| } |
| rebuild_topo->gres_inx++; |
| } else if (!rebuild_topo->dev_cnt) { |
| /* |
| * Slurmd found GRES, but slurmctld can't use |
| * them. Avoid creating zero-size bitmaps. |
| */ |
| rebuild_topo->has_file = false; |
| } else { |
| FREE_NULL_BITMAP(gres_ns->topo_gres_bitmap[topo_cnt]); |
| gres_ns->topo_gres_bitmap[topo_cnt] = |
| bit_alloc(rebuild_topo->dev_cnt); |
| for (int j = 0; j < gres_slurmd_conf->count; j++) { |
| if (rebuild_topo->gres_inx >= rebuild_topo->dev_cnt) { |
| /* Ignore excess GRES on node */ |
| break; |
| } |
| bit_set(gres_ns->topo_gres_bitmap[topo_cnt], |
| rebuild_topo->gres_inx); |
| if (gres_ns->gres_bit_alloc && |
| bit_test(gres_ns->gres_bit_alloc, |
| rebuild_topo->gres_inx)) { |
| /* Set by recovered job */ |
| gres_ns->topo_gres_cnt_alloc[topo_cnt]++; |
| } |
| if (_links_str2array( |
| gres_slurmd_conf->links, |
| rebuild_topo->node_name, gres_ns, |
| rebuild_topo->gres_inx, |
| rebuild_topo->tot_gres_cnt, |
| rebuild_topo->reason_down) != SLURM_SUCCESS) |
| rebuild_topo->rc = EINVAL; |
| |
| rebuild_topo->gres_inx++; |
| } |
| } |
| gres_ns->topo_type_id[topo_cnt] = |
| gres_build_id(gres_slurmd_conf->type_name); |
| xfree(gres_ns->topo_type_name[topo_cnt]); |
| gres_ns->topo_type_name[topo_cnt] = |
| xstrdup(gres_slurmd_conf->type_name); |
| rebuild_topo->topo_cnt++; |
| if (rebuild_topo->topo_cnt >= gres_ns->topo_cnt) |
| return -1; |
| |
| return 0; |
| } |
| |
| static int _foreach_rebuild_topo_no_cpus(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| rebuild_topo_t *rebuild_topo = arg; |
| slurm_gres_context_t *gres_ctx = rebuild_topo->gres_ctx; |
| gres_node_state_t *gres_ns = rebuild_topo->gres_ns; |
| |
| if (gres_slurmd_conf->plugin_id != gres_ctx->plugin_id) |
| return 0; |
| |
| for (int j = 0; j < rebuild_topo->topo_cnt; j++) { |
| if (gres_ns->topo_core_bitmap[j]) |
| continue; |
| gres_ns->topo_core_bitmap[j] = |
| bit_alloc(rebuild_topo->core_cnt); |
| bit_set_all(gres_ns->topo_core_bitmap[j]); |
| } |
| |
| return 0; |
| } |
| |
| static int _foreach_add_gres_info(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| add_gres_info_t *add_gres_info = arg; |
| slurm_gres_context_t *gres_ctx = add_gres_info->gres_ctx; |
| gres_node_state_t *gres_ns = add_gres_info->gres_ns; |
| uint32_t type_id; |
| int i; |
| |
| if (gres_slurmd_conf->plugin_id != gres_ctx->plugin_id) |
| return 0; |
| |
| type_id = gres_build_id(gres_slurmd_conf->type_name); |
| |
| for (i = 0; i < gres_ns->type_cnt; i++) { |
| if (type_id == gres_ns->type_id[i]) |
| break; |
| } |
| if (i < gres_ns->type_cnt) { |
| /* Update count as needed */ |
| gres_ns->type_cnt_avail[i] = gres_slurmd_conf->count; |
| } else { |
| gres_add_type(gres_slurmd_conf->type_name, |
| gres_ns, |
| gres_slurmd_conf->count); |
| } |
| |
| return 0; |
| } |
| |
| static int _node_config_validate(node_record_t *node_ptr, |
| gres_state_t *gres_state_node, int cpu_cnt, |
| int core_cnt, int sock_cnt, int cores_per_sock, |
| bool config_overrides, char **reason_down, |
| slurm_gres_context_t *gres_ctx) |
| { |
| int i, rc = SLURM_SUCCESS; |
| uint64_t dev_cnt; |
| bool updated_config = false; |
| gres_node_state_t *gres_ns; |
| bool has_file, has_type, first_time = false, rebuild_topo = false; |
| tot_from_slurmd_conf_t slurmd_conf_tot = { |
| .plugin_id = gres_ctx->plugin_id, |
| }; |
| char *orig_config = node_ptr->config_ptr->gres; |
| char *node_name = node_ptr->name; |
| xassert(core_cnt); |
| if (gres_state_node->gres_data == NULL) |
| gres_state_node->gres_data = _build_gres_node_state(); |
| gres_ns = (gres_node_state_t *) gres_state_node->gres_data; |
| if (gres_ns->node_feature) |
| return rc; |
| |
| _get_tot_from_slurmd_conf(&slurmd_conf_tot); |
| |
| /* If the gres is sharing we need to have topo configured. */ |
| if (slurmd_conf_tot.cpu_set_cnt || |
| (gres_id_sharing(slurmd_conf_tot.plugin_id) && gres_ns->alt_gres)) |
| slurmd_conf_tot.topo_cnt = slurmd_conf_tot.rec_cnt; |
| |
| /* |
| * Check existing config_flags before overriding from |
| * slurmd_conf_tot.config_flags. |
| */ |
| if (gres_state_node->config_flags & GRES_CONF_UPDATE_CONFIG) |
| updated_config = true; |
| |
| /* Make sure these are insync after we get it from the slurmd */ |
| gres_state_node->config_flags = slurmd_conf_tot.config_flags; |
| |
| if (gres_ns->gres_cnt_config > slurmd_conf_tot.gres_cnt) { |
| if (reason_down && (*reason_down == NULL)) { |
| xstrfmtcat(*reason_down, |
| "%s count reported lower than configured " |
| "(%"PRIu64" < %"PRIu64")", |
| gres_ctx->gres_type, |
| slurmd_conf_tot.gres_cnt, |
| gres_ns->gres_cnt_config); |
| } |
| rc = EINVAL; |
| } |
| if ((slurmd_conf_tot.gres_cnt > gres_ns->gres_cnt_config)) { |
| debug("%s: %s: Ignoring excess count on node %s (%" |
| PRIu64" > %"PRIu64")", |
| __func__, gres_ctx->gres_type, node_name, |
| slurmd_conf_tot.gres_cnt, |
| gres_ns->gres_cnt_config); |
| slurmd_conf_tot.gres_cnt = gres_ns->gres_cnt_config; |
| } |
| if (gres_ns->gres_cnt_found != slurmd_conf_tot.gres_cnt) { |
| if (gres_ns->gres_cnt_found != NO_VAL64) { |
| info("%s: %s: Count changed on node %s (%"PRIu64" != %"PRIu64")", |
| __func__, gres_ctx->gres_type, node_name, |
| gres_ns->gres_cnt_found, |
| slurmd_conf_tot.gres_cnt); |
| } |
| if ((gres_ns->gres_cnt_found != NO_VAL64) && |
| (gres_ns->gres_cnt_alloc != 0)) { |
| if (reason_down && (*reason_down == NULL)) { |
| xstrfmtcat(*reason_down, |
| "%s count changed and jobs are using them " |
| "(%"PRIu64" != %"PRIu64")", |
| gres_ctx->gres_type, |
| gres_ns->gres_cnt_found, |
| slurmd_conf_tot.gres_cnt); |
| } |
| rc = EINVAL; |
| } else { |
| gres_ns->gres_cnt_found = slurmd_conf_tot.gres_cnt; |
| updated_config = true; |
| first_time = true; |
| } |
| } |
| if (!updated_config && gres_ns->type_cnt) { |
| /* |
| * This is needed to address the GRES specification in |
| * gres.conf having a Type option, while the GRES specification |
| * in slurm.conf does not. |
| */ |
| for (i = 0; i < gres_ns->type_cnt; i++) { |
| if (gres_ns->type_cnt_avail[i]) |
| continue; |
| updated_config = true; |
| break; |
| } |
| } |
| |
| if (!first_time && gres_ns->type_cnt && gres_ns->topo_cnt) { |
| for (i = 0; i < gres_ns->topo_cnt; i++) { |
| int type_index = _find_gres_type(gres_ns, |
| gres_ns->topo_type_id[i]); |
| /* |
| * On a reconfig if a type was removed from slurm.conf |
| * its type_cnt_avail will be set to 0. If the type is |
| * not found then the topo is from a previous invalid |
| * registration. |
| */ |
| if ((type_index < 0) || |
| (gres_ns->type_cnt_avail[type_index] == 0 && |
| gres_ns->topo_gres_cnt_avail[i])) { |
| if (gres_ns->gres_cnt_alloc != 0) { |
| if (reason_down && |
| (*reason_down == NULL)) { |
| xstrfmtcat(*reason_down, |
| "%s type changed and jobs are using them", |
| gres_ctx->gres_type); |
| } |
| rc = EINVAL; |
| updated_config = false; |
| } else { |
| updated_config = true; |
| } |
| } |
| |
| } |
| } |
| |
| if (!updated_config && !(IS_NODE_INVALID_REG(node_ptr))) |
| return rc; |
| |
| if (gres_id_sharing(slurmd_conf_tot.plugin_id) && gres_ns->alt_gres) { |
| /* |
| * Tell the shared gres to update itself if the sharing gres is |
| * updated -- which will happen in a subsequent call to |
| * _node_config_validate() since gres_node_config_validate() is |
| * looping on all gres_contexts. |
| */ |
| gres_ns->alt_gres->config_flags |= GRES_CONF_UPDATE_CONFIG; |
| } |
| |
| if ((slurmd_conf_tot.gres_cnt > gres_ns->gres_cnt_config) && |
| config_overrides) { |
| info("%s: %s: count on node %s inconsistent with slurmctld count (%"PRIu64" != %"PRIu64")", |
| __func__, gres_ctx->gres_type, node_name, |
| slurmd_conf_tot.gres_cnt, gres_ns->gres_cnt_config); |
| slurmd_conf_tot.gres_cnt = gres_ns->gres_cnt_config; |
| /* Ignore excess GRES */ |
| } |
| if ((slurmd_conf_tot.topo_cnt == 0) && |
| (slurmd_conf_tot.topo_cnt != gres_ns->topo_cnt)) { |
| /* Need to clear topology info */ |
| _gres_node_state_delete_topo(gres_ns); |
| |
| gres_ns->topo_cnt = slurmd_conf_tot.topo_cnt; |
| } |
| |
| has_file = gres_ctx->config_flags & GRES_CONF_HAS_FILE; |
| has_type = gres_ctx->config_flags & GRES_CONF_HAS_TYPE; |
| if (gres_id_shared(gres_ctx->config_flags)) |
| dev_cnt = slurmd_conf_tot.topo_cnt; |
| else |
| dev_cnt = slurmd_conf_tot.gres_cnt; |
| if (has_file && (slurmd_conf_tot.topo_cnt != gres_ns->topo_cnt) && |
| (dev_cnt == 0)) { |
| /* |
| * Clear any vestigial GRES node state info. |
| */ |
| _gres_node_state_delete_topo(gres_ns); |
| |
| xfree(gres_ns->gres_bit_alloc); |
| |
| gres_ns->topo_cnt = 0; |
| } else if (has_file && |
| (slurmd_conf_tot.topo_cnt != gres_ns->topo_cnt)) { |
| /* |
| * Need to rebuild topology info. |
| * Resize the data structures here. |
| */ |
| rebuild_topo = true; |
| /* |
| * Clear any vestigial GRES node state info. |
| */ |
| _gres_node_state_delete_topo(gres_ns); |
| |
| gres_ns->topo_gres_cnt_alloc = |
| xrealloc(gres_ns->topo_gres_cnt_alloc, |
| slurmd_conf_tot.topo_cnt * sizeof(uint64_t)); |
| gres_ns->topo_gres_cnt_avail = |
| xrealloc(gres_ns->topo_gres_cnt_avail, |
| slurmd_conf_tot.topo_cnt * sizeof(uint64_t)); |
| gres_ns->topo_gres_bitmap = |
| xrealloc(gres_ns->topo_gres_bitmap, |
| slurmd_conf_tot.topo_cnt * |
| sizeof(bitstr_t *)); |
| gres_ns->topo_core_bitmap = |
| xrealloc(gres_ns->topo_core_bitmap, |
| slurmd_conf_tot.topo_cnt * |
| sizeof(bitstr_t *)); |
| gres_ns->topo_res_core_bitmap = |
| xrealloc(gres_ns->topo_res_core_bitmap, |
| slurmd_conf_tot.topo_cnt * |
| sizeof(bitstr_t *)); |
| gres_ns->topo_type_id = xrealloc(gres_ns->topo_type_id, |
| slurmd_conf_tot.topo_cnt * |
| sizeof(uint32_t)); |
| gres_ns->topo_type_name = xrealloc(gres_ns->topo_type_name, |
| slurmd_conf_tot.topo_cnt * |
| sizeof(char *)); |
| if (gres_ns->gres_bit_alloc) |
| bit_realloc(gres_ns->gres_bit_alloc, dev_cnt); |
| gres_ns->topo_cnt = slurmd_conf_tot.topo_cnt; |
| } else if (gres_ns->topo_cnt) { |
| /* |
| * Need to rebuild topology info to recover state after |
| * slurmctld restart with running jobs. The number of gpus, |
| * cores, and type might have changed in slurm.conf |
| */ |
| rebuild_topo = true; |
| } |
| |
| if (rebuild_topo) { |
| rebuild_topo_t rebuild_topo = { |
| .core_cnt = core_cnt, |
| .cores_per_sock = cores_per_sock, |
| .dev_cnt = dev_cnt, |
| .gres_ctx = gres_ctx, |
| .gres_ns = gres_ns, |
| .has_file = has_file, |
| .node_name = node_name, |
| .rc = rc, |
| .reason_down = reason_down, |
| .sock_cnt = sock_cnt, |
| .tot_gres_cnt = slurmd_conf_tot.gres_cnt, |
| }; |
| (void) list_for_each(gres_conf_list, _foreach_rebuild_topo, |
| &rebuild_topo); |
| rc = rebuild_topo.rc; |
| has_file = rebuild_topo.has_file; |
| |
| if (rebuild_topo.cpu_config_err) { |
| /* |
| * Some GRES of this type have "CPUs" configured. Set |
| * topo_core_bitmap for all others with all bits set. |
| */ |
| (void) list_for_each(gres_conf_list, |
| _foreach_rebuild_topo_no_cpus, |
| &rebuild_topo); |
| } |
| } else if (!has_file && has_type) { |
| add_gres_info_t add_gres_info = { |
| .gres_ctx = gres_ctx, |
| .gres_ns = gres_ns, |
| }; |
| /* Add GRES Type information as needed */ |
| (void) list_for_each(gres_conf_list, |
| _foreach_add_gres_info, |
| &add_gres_info); |
| } |
| |
| if ((orig_config == NULL) || (orig_config[0] == '\0')) |
| gres_ns->gres_cnt_config = 0; |
| else if (gres_ns->gres_cnt_config == NO_VAL64) { |
| /* This should have been filled in by _node_config_init() */ |
| _get_gres_cnt(gres_ns, orig_config, |
| gres_ctx->gres_name, |
| gres_ctx->gres_name_colon, |
| gres_ctx->gres_name_colon_len); |
| } |
| |
| gres_ns->gres_cnt_avail = gres_ns->gres_cnt_config; |
| |
| if (has_file) { |
| uint64_t gres_bits; |
| if (gres_id_shared(gres_ctx->config_flags)) { |
| gres_bits = slurmd_conf_tot.topo_cnt; |
| } else { |
| if (gres_ns->gres_cnt_avail > MAX_GRES_BITMAP) { |
| error("%s: %s has \"File\" plus very large \"Count\" " |
| "(%"PRIu64") for node %s, resetting value to %u", |
| __func__, gres_ctx->gres_type, |
| gres_ns->gres_cnt_avail, node_name, |
| MAX_GRES_BITMAP); |
| gres_ns->gres_cnt_avail = MAX_GRES_BITMAP; |
| gres_ns->gres_cnt_found = MAX_GRES_BITMAP; |
| } |
| gres_bits = gres_ns->gres_cnt_avail; |
| } |
| |
| _gres_bit_alloc_resize(gres_ns, gres_bits); |
| } |
| |
| gres_validate_node_cores(gres_ns, core_cnt, node_name); |
| |
| if ((slurmd_conf_tot.config_type_cnt > 1) && |
| !_valid_gres_types(gres_ctx->gres_type, gres_ns, reason_down)){ |
| rc = EINVAL; |
| } else if (!config_overrides && |
| (gres_ns->gres_cnt_found < gres_ns->gres_cnt_config)) { |
| if (reason_down && (*reason_down == NULL)) { |
| xstrfmtcat(*reason_down, |
| "%s count too low (%"PRIu64" < %"PRIu64")", |
| gres_ctx->gres_type, |
| gres_ns->gres_cnt_found, |
| gres_ns->gres_cnt_config); |
| } |
| rc = EINVAL; |
| } else if (_valid_gres_type(gres_ctx->gres_type, gres_ns, |
| config_overrides, reason_down)) { |
| rc = EINVAL; |
| } else if (config_overrides && gres_ns->topo_cnt && |
| (gres_ns->gres_cnt_found != gres_ns->gres_cnt_config)) { |
| error("%s on node %s configured for %"PRIu64" resources but " |
| "%"PRIu64" found, ignoring topology support", |
| gres_ctx->gres_type, node_name, |
| gres_ns->gres_cnt_config, gres_ns->gres_cnt_found); |
| if (gres_ns->topo_core_bitmap) { |
| for (i = 0; i < gres_ns->topo_cnt; i++) { |
| if (gres_ns->topo_core_bitmap) { |
| FREE_NULL_BITMAP(gres_ns-> |
| topo_core_bitmap[i]); |
| } |
| if (gres_ns->topo_gres_bitmap) { |
| FREE_NULL_BITMAP(gres_ns-> |
| topo_gres_bitmap[i]); |
| } |
| xfree(gres_ns->topo_type_name[i]); |
| } |
| xfree(gres_ns->topo_core_bitmap); |
| xfree(gres_ns->topo_gres_bitmap); |
| xfree(gres_ns->topo_gres_cnt_alloc); |
| xfree(gres_ns->topo_gres_cnt_avail); |
| xfree(gres_ns->topo_type_id); |
| xfree(gres_ns->topo_type_name); |
| } |
| gres_ns->topo_cnt = 0; |
| } |
| |
| return rc; |
| } |
| |
| /* The GPU count on a node changed. Update SHARED data structures to match */ |
| static void _sync_node_shared_to_sharing(gres_state_t *sharing_gres_state_node) |
| { |
| gres_node_state_t *sharing_gres_ns, *shared_gres_ns; |
| uint64_t sharing_cnt, shared_alloc = 0, shared_rem; |
| int i; |
| |
| if (!sharing_gres_state_node) |
| return; |
| |
| sharing_gres_ns = sharing_gres_state_node->gres_data; |
| |
| if (!sharing_gres_ns->alt_gres) |
| return; |
| |
| shared_gres_ns = sharing_gres_ns->alt_gres->gres_data; |
| |
| sharing_cnt = sharing_gres_ns->gres_cnt_avail; |
| if (shared_gres_ns->gres_bit_alloc) { |
| if ((sharing_cnt == bit_size(shared_gres_ns->gres_bit_alloc)) && |
| (sharing_cnt == shared_gres_ns->topo_cnt)) { |
| debug3("No change for gres/'shared'"); |
| return; |
| } |
| } |
| |
| if (sharing_cnt == 0) |
| return; /* Still no SHARINGs */ |
| |
| /* Free any excess gres/'shared' topo records */ |
| for (i = sharing_cnt; i < shared_gres_ns->topo_cnt; i++) { |
| if (shared_gres_ns->topo_core_bitmap) |
| FREE_NULL_BITMAP(shared_gres_ns->topo_core_bitmap[i]); |
| if (shared_gres_ns->topo_gres_bitmap) |
| FREE_NULL_BITMAP(shared_gres_ns->topo_gres_bitmap[i]); |
| xfree(shared_gres_ns->topo_type_name[i]); |
| } |
| |
| if (shared_gres_ns->gres_cnt_avail == 0) { |
| /* No gres/'shared' on this node */ |
| shared_gres_ns->topo_cnt = 0; |
| return; |
| } |
| |
| if (!shared_gres_ns->gres_bit_alloc) { |
| shared_gres_ns->gres_bit_alloc = bit_alloc(sharing_cnt); |
| } else { |
| bit_realloc(shared_gres_ns->gres_bit_alloc, sharing_cnt); |
| } |
| |
| /* Add any additional required gres/'shared' topo records */ |
| if (shared_gres_ns->topo_cnt) { |
| shared_gres_ns->topo_core_bitmap = |
| xrealloc(shared_gres_ns->topo_core_bitmap, |
| sizeof(bitstr_t *) * sharing_cnt); |
| shared_gres_ns->topo_res_core_bitmap = |
| xrealloc(shared_gres_ns->topo_res_core_bitmap, |
| sizeof(bitstr_t *) * sharing_cnt); |
| shared_gres_ns->topo_gres_bitmap = |
| xrealloc(shared_gres_ns->topo_gres_bitmap, |
| sizeof(bitstr_t *) * sharing_cnt); |
| shared_gres_ns->topo_gres_cnt_alloc = |
| xrealloc(shared_gres_ns->topo_gres_cnt_alloc, |
| sizeof(uint64_t) * sharing_cnt); |
| shared_gres_ns->topo_gres_cnt_avail = |
| xrealloc(shared_gres_ns->topo_gres_cnt_avail, |
| sizeof(uint64_t) * sharing_cnt); |
| shared_gres_ns->topo_type_id = |
| xrealloc(shared_gres_ns->topo_type_id, |
| sizeof(uint32_t) * sharing_cnt); |
| shared_gres_ns->topo_type_name = |
| xrealloc(shared_gres_ns->topo_type_name, |
| sizeof(char *) * sharing_cnt); |
| } else { |
| shared_gres_ns->topo_core_bitmap = |
| xcalloc(sharing_cnt, sizeof(bitstr_t *)); |
| shared_gres_ns->topo_res_core_bitmap = |
| xcalloc(sharing_cnt, sizeof(bitstr_t *)); |
| shared_gres_ns->topo_gres_bitmap = |
| xcalloc(sharing_cnt, sizeof(bitstr_t *)); |
| shared_gres_ns->topo_gres_cnt_alloc = |
| xcalloc(sharing_cnt, sizeof(uint64_t)); |
| shared_gres_ns->topo_gres_cnt_avail = |
| xcalloc(sharing_cnt, sizeof(uint64_t)); |
| shared_gres_ns->topo_type_id = |
| xcalloc(sharing_cnt, sizeof(uint32_t)); |
| shared_gres_ns->topo_type_name = |
| xcalloc(sharing_cnt, sizeof(char *)); |
| } |
| |
| /* |
| * Evenly distribute any remaining SHARED counts. |
| * Counts get reset as needed when the node registers. |
| */ |
| for (i = 0; i < shared_gres_ns->topo_cnt; i++) |
| shared_alloc += shared_gres_ns->topo_gres_cnt_avail[i]; |
| if (shared_alloc >= shared_gres_ns->gres_cnt_avail) |
| shared_rem = 0; |
| else |
| shared_rem = shared_gres_ns->gres_cnt_avail - shared_alloc; |
| for (i = shared_gres_ns->topo_cnt; i < sharing_cnt; i++) { |
| shared_gres_ns->topo_gres_bitmap[i] = bit_alloc(sharing_cnt); |
| bit_set(shared_gres_ns->topo_gres_bitmap[i], i); |
| shared_alloc = shared_rem / (sharing_cnt - i); |
| shared_gres_ns->topo_gres_cnt_avail[i] = shared_alloc; |
| shared_rem -= shared_alloc; |
| } |
| shared_gres_ns->topo_cnt = sharing_cnt; |
| |
| for (i = 0; i < shared_gres_ns->topo_cnt; i++) { |
| if (shared_gres_ns->topo_gres_bitmap && |
| shared_gres_ns->topo_gres_bitmap[i] && |
| (sharing_cnt != |
| bit_size(shared_gres_ns->topo_gres_bitmap[i]))) { |
| bit_realloc(shared_gres_ns->topo_gres_bitmap[i], |
| sharing_cnt); |
| } |
| } |
| } |
| |
| /* |
| * Validate a node's configuration and put a gres record onto a list |
| * Called immediately after gres_node_config_unpack(). |
| * IN node_ptr - With the relevant attributes for this function being: |
| * ->name - name of the node for which the gres information applies |
| * ->config_ptr->gres - Gres information supplied from merged |
| * slurm.conf/gres.conf |
| * ->gres - Updated gres info from slurm.conf |
| * ->gres_list - List of Gres records for this node to track usage |
| * IN threads_per_core - Count of CPUs (threads) per core on this node |
| * IN cores_per_sock - Count of cores per socket on this node |
| * IN sock_cnt - Count of sockets on this node |
| * IN config_overrides - true: Don't validate hardware, use slurm.conf |
| * configuration |
| * false: Validate hardware config, but use slurm.conf |
| * config |
| * OUT reason_down - set to an explanation of failure, if any, don't set if NULL |
| */ |
| extern int gres_node_config_validate(node_record_t *node_ptr, |
| int threads_per_core, int cores_per_sock, |
| int sock_cnt, bool config_overrides, |
| char **reason_down) |
| { |
| int i, rc = SLURM_SUCCESS, rc2; |
| gres_state_t *gres_state_node, *gres_gpu_ptr = NULL; |
| int core_cnt = sock_cnt * cores_per_sock; |
| int cpu_cnt = core_cnt * threads_per_core; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| if ((gres_context_cnt > 0) && (node_ptr->gres_list == NULL)) |
| node_ptr->gres_list = list_create(_gres_node_list_delete); |
| for (i = 0; i < gres_context_cnt; i++) { |
| /* Find or create gres_state entry on the list */ |
| gres_state_node = |
| list_find_first(node_ptr->gres_list, gres_find_id, |
| &gres_context[i].plugin_id); |
| if (gres_state_node == NULL) { |
| gres_state_node = gres_create_state( |
| &gres_context[i], GRES_STATE_SRC_CONTEXT_PTR, |
| GRES_STATE_TYPE_NODE, _build_gres_node_state()); |
| list_append(node_ptr->gres_list, gres_state_node); |
| } |
| rc2 = _node_config_validate(node_ptr, gres_state_node, cpu_cnt, |
| core_cnt, sock_cnt, cores_per_sock, |
| config_overrides, reason_down, |
| &gres_context[i]); |
| rc = MAX(rc, rc2); |
| if (gres_id_sharing(gres_state_node->plugin_id)) |
| gres_gpu_ptr = gres_state_node; |
| } |
| _sync_node_shared_to_sharing(gres_gpu_ptr); |
| _build_node_gres_str(&node_ptr->gres_list, &node_ptr->gres, |
| cores_per_sock, sock_cnt); |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return rc; |
| } |
| |
/*
 * Convert number to new value with suffix (e.g. 2048 -> 2K).
 * The value is divided by 1024 for as long as it is a non-zero exact multiple
 * of 1024, at most four times (suffix "T").
 * IN gres_size - value to scale
 * OUT gres_scaled - scaled value
 * OUT suffix - "", "K", "M", "G" or "T"; points to static storage, do not free
 */
static void _gres_scale_value(uint64_t gres_size, uint64_t *gres_scaled,
			      char **suffix)
{
	static char *unit_names[] = { "", "K", "M", "G", "T" };
	uint64_t value = gres_size;
	int steps = 0;

	while ((steps < 4) && (value != 0) && ((value % 1024) == 0)) {
		value /= 1024;
		steps++;
	}

	*gres_scaled = value;
	*suffix = unit_names[steps];
}
| |
| /* |
| * Add a GRES from node_feature plugin |
| * IN node_name - name of the node for which the gres information applies |
| * IN gres_name - name of the GRES being added or updated from the plugin |
| * IN gres_size - count of this GRES on this node |
| * IN/OUT new_config - Updated GRES info from slurm.conf |
| * IN/OUT gres_list - List of GRES records for this node to track usage |
| */ |
| extern void gres_node_feature(char *node_name, |
| char *gres_name, uint64_t gres_size, |
| char **new_config, list_t **gres_list) |
| { |
| char *new_gres = NULL, *tok, *save_ptr = NULL, *sep = "", *suffix = ""; |
| gres_state_t *gres_state_node; |
| gres_node_state_t *gres_ns; |
| uint32_t plugin_id; |
| uint64_t gres_scaled = 0; |
| int gres_name_len; |
| |
| xassert(gres_name); |
| gres_name_len = strlen(gres_name); |
| plugin_id = gres_build_id(gres_name); |
| if (*new_config) { |
| tok = strtok_r(*new_config, ",", &save_ptr); |
| while (tok) { |
| if (!strncmp(tok, gres_name, gres_name_len) && |
| ((tok[gres_name_len] == ':') || |
| (tok[gres_name_len] == '\0'))) { |
| /* Skip this record */ |
| } else { |
| xstrfmtcat(new_gres, "%s%s", sep, tok); |
| sep = ","; |
| } |
| tok = strtok_r(NULL, ",", &save_ptr); |
| } |
| } |
| _gres_scale_value(gres_size, &gres_scaled, &suffix); |
| xstrfmtcat(new_gres, "%s%s:%"PRIu64"%s", |
| sep, gres_name, gres_scaled, suffix); |
| xfree(*new_config); |
| *new_config = new_gres; |
| |
| slurm_mutex_lock(&gres_context_lock); |
| if (gres_context_cnt > 0) { |
| if (*gres_list == NULL) |
| *gres_list = list_create(_gres_node_list_delete); |
| gres_state_node = list_find_first(*gres_list, gres_find_id, |
| &plugin_id); |
| if (gres_state_node == NULL) { |
| gres_state_node = xmalloc(sizeof(gres_state_t)); |
| /* FIXME: no config_flags known at this moment */ |
| /* gres_state_node->config_flags = ; */ |
| gres_state_node->plugin_id = plugin_id; |
| gres_state_node->gres_data = _build_gres_node_state(); |
| gres_state_node->gres_name = xstrdup(gres_name); |
| gres_state_node->state_type = GRES_STATE_TYPE_NODE; |
| list_append(*gres_list, gres_state_node); |
| } |
| gres_ns = gres_state_node->gres_data; |
| if (gres_size >= gres_ns->gres_cnt_alloc) { |
| gres_ns->gres_cnt_avail = gres_size - |
| gres_ns->gres_cnt_alloc; |
| } else { |
| error("%s: Changed size count of GRES %s from %"PRIu64 |
| " to %"PRIu64", resource over allocated", |
| __func__, gres_name, |
| gres_ns->gres_cnt_avail, gres_size); |
| gres_ns->gres_cnt_avail = 0; |
| } |
| gres_ns->gres_cnt_config = gres_size; |
| gres_ns->gres_cnt_found = gres_size; |
| gres_ns->node_feature = true; |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| /* |
| * Check validity of a GRES change. Specifically if a GRES type has "Files" |
| * configured then the only valid new counts are the current count or zero |
| * |
| * RET true of the requested change is valid |
| */ |
| static int _node_reconfig_test(char *node_name, char *new_gres, |
| gres_state_t *gres_state_node, |
| slurm_gres_context_t *gres_ctx) |
| { |
| gres_node_state_t *orig_gres_ns, *new_gres_ns; |
| int rc = SLURM_SUCCESS; |
| |
| xassert(gres_state_node); |
| if (!(gres_ctx->config_flags & GRES_CONF_HAS_FILE)) |
| return SLURM_SUCCESS; |
| |
| orig_gres_ns = gres_state_node->gres_data; |
| new_gres_ns = _build_gres_node_state(); |
| _get_gres_cnt(new_gres_ns, new_gres, |
| gres_ctx->gres_name, |
| gres_ctx->gres_name_colon, |
| gres_ctx->gres_name_colon_len); |
| if ((new_gres_ns->gres_cnt_config != 0) && |
| (new_gres_ns->gres_cnt_config != |
| orig_gres_ns->gres_cnt_config)) { |
| error("Attempt to change gres/%s Count on node %s from %" |
| PRIu64" to %"PRIu64" invalid with File configuration", |
| gres_ctx->gres_name, node_name, |
| orig_gres_ns->gres_cnt_config, |
| new_gres_ns->gres_cnt_config); |
| rc = ESLURM_INVALID_GRES; |
| } |
| _gres_node_state_delete(new_gres_ns); |
| |
| return rc; |
| } |
| |
| static int _node_reconfig(char *node_name, char *new_gres, char **gres_str, |
| gres_state_t *gres_state_node, bool config_overrides, |
| slurm_gres_context_t *gres_ctx, |
| bool *updated_gpu_cnt) |
| { |
| int i; |
| gres_node_state_t *gres_ns; |
| uint64_t gres_bits, orig_cnt; |
| |
| xassert(gres_state_node); |
| xassert(updated_gpu_cnt); |
| *updated_gpu_cnt = false; |
| if (gres_state_node->gres_data == NULL) |
| gres_state_node->gres_data = _build_gres_node_state(); |
| gres_ns = gres_state_node->gres_data; |
| orig_cnt = gres_ns->gres_cnt_config; |
| |
| _get_gres_cnt(gres_ns, new_gres, |
| gres_ctx->gres_name, |
| gres_ctx->gres_name_colon, |
| gres_ctx->gres_name_colon_len); |
| |
| if (gres_ns->gres_cnt_config == orig_cnt) |
| return SLURM_SUCCESS; /* No change in count */ |
| |
| /* Update count */ |
| gres_ctx->total_cnt -= orig_cnt; |
| gres_ctx->total_cnt += gres_ns->gres_cnt_config; |
| |
| gres_ns->gres_cnt_avail = gres_ns->gres_cnt_config; |
| |
| if (gres_ctx->config_flags & GRES_CONF_HAS_FILE) { |
| if (gres_id_shared(gres_ctx->config_flags)) |
| gres_bits = gres_ns->topo_cnt; |
| else |
| gres_bits = gres_ns->gres_cnt_avail; |
| |
| _gres_bit_alloc_resize(gres_ns, gres_bits); |
| } else if (gres_ns->gres_bit_alloc && |
| !gres_id_shared(gres_ctx->config_flags)) { |
| /* |
| * If GRES count changed in configuration between reboots, |
| * update bitmap sizes as needed. |
| */ |
| gres_bits = gres_ns->gres_cnt_avail; |
| if (gres_bits != bit_size(gres_ns->gres_bit_alloc)) { |
| info("gres/%s count changed on node %s to %"PRIu64, |
| gres_ctx->gres_name, node_name, gres_bits); |
| if (gres_id_sharing(gres_ctx->plugin_id)) |
| *updated_gpu_cnt = true; |
| bit_realloc(gres_ns->gres_bit_alloc, gres_bits); |
| for (i = 0; i < gres_ns->topo_cnt; i++) { |
| if (gres_ns->topo_gres_bitmap && |
| gres_ns->topo_gres_bitmap[i] && |
| (gres_bits != |
| bit_size(gres_ns->topo_gres_bitmap[i]))){ |
| bit_realloc(gres_ns->topo_gres_bitmap[i], |
| gres_bits); |
| } |
| } |
| } |
| } |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* Convert core bitmap into socket string, xfree return value */ |
| static char *_core_bitmap2str(bitstr_t *core_map, int cores_per_sock, |
| int sock_per_node) |
| { |
| char *sock_info = NULL, tmp[256]; |
| bitstr_t *sock_map; |
| int c, s, core_offset, max_core; |
| bool any_set = false; |
| |
| xassert(core_map); |
| max_core = bit_size(core_map) - 1; |
| sock_map = bit_alloc(sock_per_node); |
| for (s = 0; s < sock_per_node; s++) { |
| core_offset = s * cores_per_sock; |
| for (c = 0; c < cores_per_sock; c++) { |
| if (core_offset > max_core) { |
| error("%s: bad core offset (%d >= %d)", |
| __func__, core_offset, max_core); |
| break; |
| } |
| if (bit_test(core_map, core_offset++)) { |
| bit_set(sock_map, s); |
| any_set = true; |
| break; |
| } |
| } |
| } |
| if (any_set) { |
| bit_fmt(tmp, sizeof(tmp), sock_map); |
| xstrfmtcat(sock_info, "(S:%s)", tmp); |
| } else { |
| /* We have a core bitmap with no bits set */ |
| sock_info = xstrdup(""); |
| } |
| FREE_NULL_BITMAP(sock_map); |
| |
| return sock_info; |
| } |
| |
/*
 * Given a count, modify it as needed and return suffix (e.g. "M" for mega).
 * The largest power of 1024 (up to 1024^5, "P") that divides *count exactly
 * is divided out. A zero count is left untouched.
 * IN/OUT count - value to scale in place
 * RET "", "K", "M", "G", "T" or "P"; static storage, do not free
 */
static char *_get_suffix(uint64_t *count)
{
	static const struct {
		uint64_t factor;
		char *name;
	} units[] = {
		{ (uint64_t) 1 << 50, "P" },
		{ (uint64_t) 1 << 40, "T" },
		{ (uint64_t) 1 << 30, "G" },
		{ (uint64_t) 1 << 20, "M" },
		{ (uint64_t) 1 << 10, "K" },
	};
	const int unit_cnt = sizeof(units) / sizeof(units[0]);

	if (*count == 0)
		return "";

	/* Largest unit first so the first exact divisor wins */
	for (int u = 0; u < unit_cnt; u++) {
		if ((*count % units[u].factor) != 0)
			continue;
		*count /= units[u].factor;
		return units[u].name;
	}

	return "";
}
| |
| /* Build node's GRES string based upon data in that node's GRES list */ |
| static void _build_node_gres_str(list_t **gres_list, char **gres_str, |
| int cores_per_sock, int sock_per_node) |
| { |
| gres_state_t *gres_state_node; |
| gres_node_state_t *gres_ns; |
| bitstr_t *done_topo, *core_map; |
| uint64_t gres_sum; |
| char *sep = "", *suffix, *sock_info = NULL, *sock_str, *no_consume_str; |
| int c, i, j; |
| |
| xassert(gres_str); |
| xfree(*gres_str); |
| for (c = 0; c < gres_context_cnt; c++) { |
| /* Find gres_state entry on the list */ |
| gres_state_node = list_find_first(*gres_list, gres_find_id, |
| &gres_context[c].plugin_id); |
| if (gres_state_node == NULL) |
| continue; /* Node has none of this GRES */ |
| |
| gres_ns = (gres_node_state_t *) gres_state_node->gres_data; |
| no_consume_str = gres_ns->no_consume ? ":no_consume" : ""; |
| if (gres_ns->topo_cnt && |
| gres_ns->gres_cnt_avail) { |
| done_topo = bit_alloc(gres_ns->topo_cnt); |
| for (i = 0; i < gres_ns->topo_cnt; i++) { |
| if (bit_test(done_topo, i)) |
| continue; |
| bit_set(done_topo, i); |
| gres_sum = gres_ns-> |
| topo_gres_cnt_avail[i]; |
| if (gres_ns->topo_core_bitmap[i]) { |
| core_map = bit_copy( |
| gres_ns-> |
| topo_core_bitmap[i]); |
| } else |
| core_map = NULL; |
| for (j = 0; j < gres_ns->topo_cnt; j++){ |
| if (gres_ns->topo_type_id[i] != |
| gres_ns->topo_type_id[j]) |
| continue; |
| if (bit_test(done_topo, j)) |
| continue; |
| bit_set(done_topo, j); |
| gres_sum += gres_ns-> |
| topo_gres_cnt_avail[j]; |
| if (core_map && |
| gres_ns-> |
| topo_core_bitmap[j]) { |
| bit_or(core_map, |
| gres_ns-> |
| topo_core_bitmap[j]); |
| } else if (gres_ns-> |
| topo_core_bitmap[j]) { |
| core_map = bit_copy( |
| gres_ns-> |
| topo_core_bitmap[j]); |
| } |
| } |
| if (core_map) { |
| sock_info = _core_bitmap2str( |
| core_map, |
| cores_per_sock, |
| sock_per_node); |
| FREE_NULL_BITMAP(core_map); |
| sock_str = sock_info; |
| } else |
| sock_str = ""; |
| suffix = _get_suffix(&gres_sum); |
| if (gres_ns->topo_type_name[i]) { |
| xstrfmtcat(*gres_str, |
| "%s%s:%s%s:%"PRIu64"%s%s", sep, |
| gres_context[c].gres_name, |
| gres_ns-> |
| topo_type_name[i], |
| no_consume_str, gres_sum, |
| suffix, sock_str); |
| } else { |
| xstrfmtcat(*gres_str, |
| "%s%s%s:%"PRIu64"%s%s", sep, |
| gres_context[c].gres_name, |
| no_consume_str, gres_sum, |
| suffix, sock_str); |
| } |
| xfree(sock_info); |
| sep = ","; |
| } |
| FREE_NULL_BITMAP(done_topo); |
| } else if (gres_ns->type_cnt && |
| gres_ns->gres_cnt_avail) { |
| for (i = 0; i < gres_ns->type_cnt; i++) { |
| gres_sum = gres_ns->type_cnt_avail[i]; |
| suffix = _get_suffix(&gres_sum); |
| xstrfmtcat(*gres_str, "%s%s:%s%s:%"PRIu64"%s", |
| sep, gres_context[c].gres_name, |
| gres_ns->type_name[i], |
| no_consume_str, gres_sum, suffix); |
| sep = ","; |
| } |
| } else if (gres_ns->gres_cnt_avail) { |
| gres_sum = gres_ns->gres_cnt_avail; |
| suffix = _get_suffix(&gres_sum); |
| xstrfmtcat(*gres_str, "%s%s%s:%"PRIu64"%s", |
| sep, gres_context[c].gres_name, |
| no_consume_str, gres_sum, suffix); |
| sep = ","; |
| } |
| } |
| } |
| |
| static int _foreach_node_state_pack(void *x, void *arg) |
| { |
| gres_state_t *gres_state_node = x; |
| pack_state_t *pack_state = arg; |
| gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| uint16_t gres_bitmap_size; |
| |
| if (pack_state->protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| pack32(pack_state->magic, pack_state->buffer); |
| pack32(gres_state_node->plugin_id, pack_state->buffer); |
| pack32(gres_state_node->config_flags, pack_state->buffer); |
| pack64(gres_ns->gres_cnt_avail, pack_state->buffer); |
| /* |
| * Just note if gres_bit_alloc exists. |
| * Rebuild it based upon the state of recovered jobs |
| */ |
| if (gres_ns->gres_bit_alloc) |
| gres_bitmap_size = bit_size(gres_ns->gres_bit_alloc); |
| else |
| gres_bitmap_size = 0; |
| pack16(gres_bitmap_size, pack_state->buffer); |
| |
| pack16(gres_ns->topo_cnt, pack_state->buffer); |
| for (int i = 0; i < gres_ns->topo_cnt; i++) { |
| pack_bit_str_hex(gres_ns->topo_core_bitmap[i], |
| pack_state->buffer); |
| pack_bit_str_hex(gres_ns->topo_gres_bitmap[i], |
| pack_state->buffer); |
| pack_bit_str_hex(gres_ns->topo_res_core_bitmap[i], |
| pack_state->buffer); |
| } |
| pack64_array(gres_ns->topo_gres_cnt_alloc, gres_ns->topo_cnt, |
| pack_state->buffer); |
| pack64_array(gres_ns->topo_gres_cnt_avail, gres_ns->topo_cnt, |
| pack_state->buffer); |
| pack32_array(gres_ns->topo_type_id, gres_ns->topo_cnt, |
| pack_state->buffer); |
| packstr_array(gres_ns->topo_type_name, gres_ns->topo_cnt, |
| pack_state->buffer); |
| } else { |
| error("%s: protocol_version %hu not supported", |
| __func__, pack_state->protocol_version); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| static int _foreach_job_state_pack(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| pack_state_t *pack_state = arg; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| int i; |
| |
| if (pack_state->protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| pack32(pack_state->magic, pack_state->buffer); |
| pack32(gres_state_job->plugin_id, pack_state->buffer); |
| pack16(gres_js->cpus_per_gres, pack_state->buffer); |
| pack16(gres_js->flags, pack_state->buffer); |
| pack64(gres_js->gres_per_job, pack_state->buffer); |
| pack64(gres_js->gres_per_node, pack_state->buffer); |
| pack64(gres_js->gres_per_socket, pack_state->buffer); |
| pack64(gres_js->gres_per_task, pack_state->buffer); |
| pack64(gres_js->mem_per_gres, pack_state->buffer); |
| pack16(gres_js->ntasks_per_gres, pack_state->buffer); |
| pack64(gres_js->total_gres, pack_state->buffer); |
| packstr(gres_js->type_name, pack_state->buffer); |
| pack32(gres_js->node_cnt, pack_state->buffer); |
| |
| if (gres_js->gres_cnt_node_alloc) { |
| pack8((uint8_t) 1, pack_state->buffer); |
| pack64_array(gres_js->gres_cnt_node_alloc, |
| gres_js->node_cnt, pack_state->buffer); |
| } else { |
| pack8((uint8_t) 0, pack_state->buffer); |
| } |
| |
| if (gres_js->gres_bit_alloc) { |
| pack8((uint8_t) 1, pack_state->buffer); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| pack_bit_str_hex(gres_js-> |
| gres_bit_alloc[i], |
| pack_state->buffer); |
| } |
| } else { |
| pack8((uint8_t) 0, pack_state->buffer); |
| } |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| if (!gres_js->gres_per_bit_alloc || |
| !gres_js->gres_per_bit_alloc[i] || |
| !gres_js->gres_bit_alloc || |
| !gres_js->gres_bit_alloc[i]) { |
| pack8((uint8_t)0, pack_state->buffer); |
| continue; |
| } |
| pack8((uint8_t)1, pack_state->buffer); |
| pack64_array( |
| gres_js->gres_per_bit_alloc[i], |
| bit_size(gres_js->gres_bit_alloc[i]), |
| pack_state->buffer); |
| } |
| if (pack_state->details && gres_js->gres_bit_step_alloc) { |
| pack8((uint8_t) 1, pack_state->buffer); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| pack_bit_str_hex(gres_js-> |
| gres_bit_step_alloc[i], |
| pack_state->buffer); |
| } |
| } else { |
| pack8((uint8_t) 0, pack_state->buffer); |
| } |
| if (pack_state->details && gres_js->gres_cnt_step_alloc) { |
| pack8((uint8_t) 1, pack_state->buffer); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| pack64(gres_js-> |
| gres_cnt_step_alloc[i], |
| pack_state->buffer); |
| } |
| } else { |
| pack8((uint8_t) 0, pack_state->buffer); |
| } |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| if (!pack_state->details || |
| !gres_js->gres_per_bit_step_alloc || |
| !gres_js->gres_per_bit_step_alloc[i] || |
| !gres_js->gres_bit_step_alloc || |
| !gres_js->gres_bit_step_alloc[i]) { |
| pack8((uint8_t)0, pack_state->buffer); |
| continue; |
| } |
| pack8((uint8_t)1, pack_state->buffer); |
| pack64_array( |
| gres_js->gres_per_bit_step_alloc[i], |
| bit_size(gres_js->gres_bit_step_alloc[i]), |
| pack_state->buffer); |
| } |
| } else { |
| error("%s: protocol_version %hu not supported", |
| __func__, pack_state->protocol_version); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * list_for_each() callback that serializes one step GRES record. |
| * IN x - gres_state_t of the step; its gres_data is a gres_step_state_t |
| * IN arg - pack_state_t holding the target buffer, magic and protocol version |
| * RET 0 on success, -1 if the protocol version is not supported |
| */ |
| static int _foreach_step_state_pack(void *x, void *arg) |
| { |
| 	gres_state_t *gres_state_step = x; |
| 	pack_state_t *pack_state = arg; |
| 	gres_step_state_t *gres_ss = gres_state_step->gres_data; |
| 	buf_t *buffer = pack_state->buffer; |
| 	int node_inx; |
| |
| 	if (pack_state->protocol_version < SLURM_MIN_PROTOCOL_VERSION) { |
| 		error("%s: protocol_version %hu not supported", |
| 		      __func__, pack_state->protocol_version); |
| 		return -1; |
| 	} |
| |
| 	/* Scalar fields; the order below defines the packed layout */ |
| 	pack32(pack_state->magic, buffer); |
| 	pack32(gres_state_step->plugin_id, buffer); |
| 	pack16(gres_ss->cpus_per_gres, buffer); |
| 	pack16(gres_ss->flags, buffer); |
| 	pack64(gres_ss->gres_per_step, buffer); |
| 	pack64(gres_ss->gres_per_node, buffer); |
| 	pack64(gres_ss->gres_per_socket, buffer); |
| 	pack64(gres_ss->gres_per_task, buffer); |
| 	pack64(gres_ss->mem_per_gres, buffer); |
| 	pack64(gres_ss->total_gres, buffer); |
| 	packstr(gres_ss->type_name, buffer); |
| 	pack32(gres_ss->node_cnt, buffer); |
| 	pack_bit_str_hex(gres_ss->node_in_use, buffer); |
| |
| 	/* Each optional array is preceded by a one byte presence flag */ |
| 	if (!gres_ss->gres_cnt_node_alloc) { |
| 		pack8((uint8_t) 0, buffer); |
| 	} else { |
| 		pack8((uint8_t) 1, buffer); |
| 		pack64_array(gres_ss->gres_cnt_node_alloc, |
| 			     gres_ss->node_cnt, buffer); |
| 	} |
| |
| 	if (!gres_ss->gres_bit_alloc) { |
| 		pack8((uint8_t) 0, buffer); |
| 	} else { |
| 		pack8((uint8_t) 1, buffer); |
| 		for (node_inx = 0; node_inx < gres_ss->node_cnt; node_inx++) |
| 			pack_bit_str_hex(gres_ss->gres_bit_alloc[node_inx], |
| 					 buffer); |
| 	} |
| |
| 	/* |
| 	 * Per-bit counts get one presence flag per node. The array length |
| 	 * is the size of that node's allocation bitmap, so both are needed. |
| 	 */ |
| 	for (node_inx = 0; node_inx < gres_ss->node_cnt; node_inx++) { |
| 		bool have_per_bit = gres_ss->gres_per_bit_alloc && |
| 				    gres_ss->gres_per_bit_alloc[node_inx] && |
| 				    gres_ss->gres_bit_alloc && |
| 				    gres_ss->gres_bit_alloc[node_inx]; |
| |
| 		if (have_per_bit) { |
| 			pack8((uint8_t)1, buffer); |
| 			pack64_array(gres_ss->gres_per_bit_alloc[node_inx], |
| 				     bit_size(gres_ss->gres_bit_alloc[node_inx]), |
| 				     buffer); |
| 		} else { |
| 			pack8((uint8_t)0, buffer); |
| 		} |
| 	} |
| |
| 	return 0; |
| } |
| |
| /* |
| * Pack every record of a GRES list with the supplied per-record callback. |
| * A 16-bit record count is written first and patched once the records have |
| * been packed. |
| * IN gres_list - list to pack; if NULL only a zero record count is packed |
| * IN/OUT pack_state - target buffer, magic and protocol version |
| * IN pack_function - list_for_each() callback packing one record, returns a |
| *	negative value on failure |
| * RET SLURM_SUCCESS, or SLURM_ERROR if pack_function failed |
| */ |
| static int _pack_state(list_t *gres_list, pack_state_t *pack_state, |
| 		       int (*pack_function) (void *x, void *key)) |
| { |
| 	uint32_t top_offset, tail_offset; |
| 	int rec_cnt; |
| |
| 	top_offset = get_buf_offset(pack_state->buffer); |
| 	pack16((uint16_t) 0, pack_state->buffer); /* placeholder if data */ |
| |
| 	if (!gres_list) |
| 		return SLURM_SUCCESS; |
| |
| 	/* |
| 	 * Keep the result signed: list_for_each() reports a failed callback |
| 	 * as a negative value, which must not be stored as a record count. |
| 	 */ |
| 	rec_cnt = list_for_each(gres_list, pack_function, pack_state); |
| 	if (rec_cnt < 0) { |
| 		/* Drop whatever was packed so the buffer holds an empty list */ |
| 		set_buf_offset(pack_state->buffer, top_offset); |
| 		pack16((uint16_t) 0, pack_state->buffer); |
| 		return SLURM_ERROR; |
| 	} |
| |
| 	if (rec_cnt > 0) { |
| 		/* Go back and overwrite the placeholder with the real count */ |
| 		tail_offset = get_buf_offset(pack_state->buffer); |
| 		set_buf_offset(pack_state->buffer, top_offset); |
| 		pack16((uint16_t) rec_cnt, pack_state->buffer); |
| 		set_buf_offset(pack_state->buffer, tail_offset); |
| 	} |
| |
| 	return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Note that a node's configuration has been modified (e.g. "scontrol update ..") |
| * IN node_name - name of the node for which the gres information applies |
| * IN new_gres - Updated GRES information supplied from slurm.conf or scontrol |
| * IN/OUT gres_str - Node's current GRES string, updated as needed |
| * IN/OUT gres_list - List of Gres records for this node to track usage |
| * IN config_overrides - true: Don't validate hardware, use slurm.conf |
| * configuration |
| * false: Validate hardware config, but use slurm.conf |
| * config |
| * IN cores_per_sock - Number of cores per socket on this node |
| * IN sock_per_node - Total count of sockets on this node (on any board) |
| * RET SLURM_SUCCESS, or the first error returned while validating or applying |
| *	the change |
| */ |
| extern int gres_node_reconfig(char *node_name, |
| 			      char *new_gres, |
| 			      char **gres_str, |
| 			      list_t **gres_list, |
| 			      bool config_overrides, |
| 			      int cores_per_sock, |
| 			      int sock_per_node) |
| { |
| 	int i, rc = SLURM_SUCCESS; |
| 	gres_state_t **gres_state_node_array; |
| 	gres_state_t *gpu_gres_state_node = NULL; |
| |
| 	xassert(gres_context_cnt >= 0); |
| 	slurm_mutex_lock(&gres_context_lock); |
| 	gres_state_node_array = xcalloc(gres_context_cnt, |
| 					sizeof(gres_state_t *)); |
| 	if ((gres_context_cnt > 0) && (*gres_list == NULL)) |
| 		*gres_list = list_create(_gres_node_list_delete); |
| |
| 	/* First validate all of the requested GRES changes */ |
| 	for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) { |
| 		/* Find gres_state entry on the list */ |
| 		gres_state_t *gres_state_node = |
| 			list_find_first(*gres_list, gres_find_id, |
| 					&gres_context[i].plugin_id); |
| |
| 		if (gres_state_node == NULL) |
| 			continue; |
| 		gres_state_node_array[i] = gres_state_node; |
| 		rc = _node_reconfig_test(node_name, new_gres, gres_state_node, |
| 					 &gres_context[i]); |
| 	} |
| |
| 	/* Now update the GRES counts */ |
| 	for (i = 0; (rc == SLURM_SUCCESS) && (i < gres_context_cnt); i++) { |
| 		bool updated_gpu_cnt = false; |
| 		if (gres_state_node_array[i] == NULL) |
| 			continue; |
| 		rc = _node_reconfig(node_name, new_gres, gres_str, |
| 				    gres_state_node_array[i], config_overrides, |
| 				    &gres_context[i], &updated_gpu_cnt); |
| 		/* |
| 		 * Record the state that was just updated, not whatever the |
| 		 * last lookup of the validation loop happened to return. |
| 		 */ |
| 		if (updated_gpu_cnt) |
| 			gpu_gres_state_node = gres_state_node_array[i]; |
| 	} |
| |
| 	/* Now synchronize gres/gpu and gres/'shared' state */ |
| 	if (gpu_gres_state_node) { |
| 		/* Update gres/'shared' counts and bitmaps to match gres/gpu */ |
| 		_sync_node_shared_to_sharing(gpu_gres_state_node); |
| 	} |
| |
| 	/* Build new per-node gres_str */ |
| 	_build_node_gres_str(gres_list, gres_str, cores_per_sock, |
| 			     sock_per_node); |
| 	slurm_mutex_unlock(&gres_context_lock); |
| 	xfree(gres_state_node_array); |
| |
| 	return rc; |
| } |
| |
| /* |
| * Subtract a node's configured GRES counts from the per-plugin totals kept |
| * in gres_context[]. |
| * IN node_ptr - node record; nothing is done if it has no gres_list |
| */ |
| extern void gres_node_remove(node_record_t *node_ptr) |
| { |
| 	int ctx_inx; |
| |
| 	if (node_ptr->gres_list == NULL) |
| 		return; |
| |
| 	slurm_mutex_lock(&gres_context_lock); |
| 	for (ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) { |
| 		gres_node_state_t *gres_ns; |
| 		gres_state_t *gres_state_node = |
| 			list_find_first(node_ptr->gres_list, gres_find_id, |
| 					&gres_context[ctx_inx].plugin_id); |
| |
| 		/* Plugins without a record (or without data) are skipped */ |
| 		if (!gres_state_node || !gres_state_node->gres_data) |
| 			continue; |
| |
| 		gres_ns = gres_state_node->gres_data; |
| 		gres_context[ctx_inx].total_cnt -= gres_ns->gres_cnt_config; |
| 	} |
| 	slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| /* |
| * Pack a node's current gres status, called from slurmctld for save/restore |
| * IN gres_list - generated by gres_node_config_validate() |
| * IN/OUT buffer - location to write state to |
| * IN protocol_version - version handed to the per-record pack callback |
| * RET value returned by _pack_state() |
| */ |
| extern int gres_node_state_pack(list_t *gres_list, buf_t *buffer, |
| 				uint16_t protocol_version) |
| { |
| 	pack_state_t node_pack_args = { 0 }; |
| |
| 	node_pack_args.buffer = buffer; |
| 	node_pack_args.magic = GRES_MAGIC; |
| 	node_pack_args.protocol_version = protocol_version; |
| |
| 	return _pack_state(gres_list, &node_pack_args, |
| 			   _foreach_node_state_pack); |
| } |
| |
| /* |
| * Unpack a node's current gres status, called from slurmctld for save/restore |
| * OUT gres_list - restored state stored by gres_node_state_pack() |
| * IN/OUT buffer - location to read state from |
| * IN node_name - name of the node for which the gres information applies |
| * IN protocol_version - version of the packed data |
| * RET SLURM_SUCCESS or SLURM_ERROR on unpack failure |
| */ |
| extern int gres_node_state_unpack(list_t **gres_list, buf_t *buffer, |
| 				  char *node_name, |
| 				  uint16_t protocol_version) |
| { |
| 	gres_node_state_t *gres_ns = NULL; |
| 	uint16_t rec_cnt = 0; |
| 	bool locked = false; |
| |
| 	safe_unpack16(&rec_cnt, buffer); |
| 	if (!rec_cnt) |
| 		return SLURM_SUCCESS; |
| |
| 	xassert(gres_context_cnt >= 0); |
| |
| 	slurm_mutex_lock(&gres_context_lock); |
| 	locked = true; |
| 	if (!*gres_list && (gres_context_cnt > 0)) |
| 		*gres_list = list_create(_gres_node_list_delete); |
| |
| 	while (rec_cnt) { |
| 		uint32_t magic = 0, plugin_id = 0, config_flags = 0; |
| 		uint32_t array_len, ctx_config_flags; |
| 		uint16_t gres_bitmap_size = 0; |
| 		slurm_gres_context_t *gres_ctx; |
| 		gres_state_t *gres_state_node; |
| |
| 		if (!buffer || !remaining_buf(buffer)) |
| 			break; |
| 		rec_cnt--; |
| |
| 		gres_ns = _build_gres_node_state(); |
| |
| 		if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) { |
| 			error("%s: protocol_version %hu not supported", |
| 			      __func__, protocol_version); |
| 			goto unpack_error; |
| 		} |
| |
| 		safe_unpack32(&magic, buffer); |
| 		if (magic != GRES_MAGIC) |
| 			goto unpack_error; |
| 		safe_unpack32(&plugin_id, buffer); |
| 		safe_unpack32(&config_flags, buffer); |
| 		safe_unpack64(&gres_ns->gres_cnt_avail, buffer); |
| 		safe_unpack16(&gres_bitmap_size, buffer); |
| |
| 		safe_unpack16(&gres_ns->topo_cnt, buffer); |
| 		if (gres_ns->topo_cnt) { |
| 			gres_ns->topo_core_bitmap = |
| 				xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *)); |
| 			gres_ns->topo_gres_bitmap = |
| 				xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *)); |
| 			gres_ns->topo_res_core_bitmap = |
| 				xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *)); |
| 			/* Three bitmaps are stored per topology record */ |
| 			for (int t = 0; t < gres_ns->topo_cnt; t++) { |
| 				unpack_bit_str_hex( |
| 					&gres_ns->topo_core_bitmap[t], buffer); |
| 				unpack_bit_str_hex( |
| 					&gres_ns->topo_gres_bitmap[t], buffer); |
| 				unpack_bit_str_hex( |
| 					&gres_ns->topo_res_core_bitmap[t], |
| 					buffer); |
| 			} |
| 		} |
| 		/* The unpacked array lengths are read but not used here */ |
| 		safe_unpack64_array(&gres_ns->topo_gres_cnt_alloc, |
| 				    &array_len, buffer); |
| 		safe_unpack64_array(&gres_ns->topo_gres_cnt_avail, |
| 				    &array_len, buffer); |
| 		safe_unpack32_array(&gres_ns->topo_type_id, &array_len, |
| 				    buffer); |
| 		safe_unpackstr_array(&gres_ns->topo_type_name, |
| 				     &array_len, buffer); |
| |
| 		gres_ctx = _find_context_by_id(plugin_id); |
| 		if (!gres_ctx) { |
| 			error("%s: no plugin configured to unpack data type %u from node %s", |
| 			      __func__, plugin_id, node_name); |
| 			/* |
| 			 * A likely sign that GresPlugins has changed. |
| 			 * Not a fatal error, skip over the data. |
| 			 */ |
| 			_gres_node_state_delete(gres_ns); |
| 			continue; |
| 		} |
| |
| 		if (gres_bitmap_size) |
| 			gres_ns->gres_bit_alloc = bit_alloc(gres_bitmap_size); |
| |
| 		/* We don't want to lose flags from gres_ctx */ |
| 		ctx_config_flags = gres_ctx->config_flags; |
| |
| 		/* |
| 		 * Flag this as flags read from state so we only use them until |
| 		 * the node checks in. |
| 		 */ |
| 		gres_ctx->config_flags = config_flags | GRES_CONF_FROM_STATE; |
| |
| 		gres_state_node = gres_create_state( |
| 			gres_ctx, GRES_STATE_SRC_CONTEXT_PTR, |
| 			GRES_STATE_TYPE_NODE, gres_ns); |
| 		list_append(*gres_list, gres_state_node); |
| 		gres_ctx->config_flags |= ctx_config_flags; |
| 	} |
| 	slurm_mutex_unlock(&gres_context_lock); |
| 	return SLURM_SUCCESS; |
| |
| unpack_error: |
| 	error("%s: unpack error from node %s", __func__, node_name); |
| 	_gres_node_state_delete(gres_ns); |
| 	if (locked) |
| 		slurm_mutex_unlock(&gres_context_lock); |
| 	return SLURM_ERROR; |
| } |
| |
| /* |
| * Build a deep copy of a node's GRES state. |
| * Copies the counters, no_consume, the allocation bitmap, the links matrix |
| * and the topology and type tables. |
| * IN gres_ns - state to copy, may be NULL |
| * RET newly allocated gres_node_state_t, or NULL if gres_ns is NULL |
| */ |
| static void *_node_state_dup(gres_node_state_t *gres_ns) |
| { |
| 	gres_node_state_t *dup_ns; |
| |
| 	if (!gres_ns) |
| 		return NULL; |
| |
| 	dup_ns = xmalloc(sizeof(gres_node_state_t)); |
| 	dup_ns->gres_cnt_found = gres_ns->gres_cnt_found; |
| 	dup_ns->gres_cnt_config = gres_ns->gres_cnt_config; |
| 	dup_ns->gres_cnt_avail = gres_ns->gres_cnt_avail; |
| 	dup_ns->gres_cnt_alloc = gres_ns->gres_cnt_alloc; |
| 	dup_ns->no_consume = gres_ns->no_consume; |
| 	if (gres_ns->gres_bit_alloc) |
| 		dup_ns->gres_bit_alloc = bit_copy(gres_ns->gres_bit_alloc); |
| |
| 	/* links_cnt is a link_len x link_len matrix of int */ |
| 	if (gres_ns->links_cnt && gres_ns->link_len) { |
| 		int row, row_bytes = sizeof(int) * gres_ns->link_len; |
| |
| 		dup_ns->links_cnt = xcalloc(gres_ns->link_len, sizeof(int *)); |
| 		for (row = 0; row < gres_ns->link_len; row++) { |
| 			dup_ns->links_cnt[row] = xmalloc(row_bytes); |
| 			memcpy(dup_ns->links_cnt[row], |
| 			       gres_ns->links_cnt[row], row_bytes); |
| 		} |
| 		dup_ns->link_len = gres_ns->link_len; |
| 	} |
| |
| 	if (gres_ns->topo_cnt) { |
| 		int t; |
| |
| 		dup_ns->topo_cnt = gres_ns->topo_cnt; |
| 		dup_ns->topo_core_bitmap = |
| 			xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *)); |
| 		dup_ns->topo_gres_bitmap = |
| 			xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *)); |
| 		dup_ns->topo_res_core_bitmap = |
| 			xcalloc(gres_ns->topo_cnt, sizeof(bitstr_t *)); |
| 		dup_ns->topo_gres_cnt_alloc = |
| 			xcalloc(gres_ns->topo_cnt, sizeof(uint64_t)); |
| 		dup_ns->topo_gres_cnt_avail = |
| 			xcalloc(gres_ns->topo_cnt, sizeof(uint64_t)); |
| 		dup_ns->topo_type_id = |
| 			xcalloc(gres_ns->topo_cnt, sizeof(uint32_t)); |
| 		dup_ns->topo_type_name = |
| 			xcalloc(gres_ns->topo_cnt, sizeof(char *)); |
| |
| 		for (t = 0; t < gres_ns->topo_cnt; t++) { |
| 			/* Missing bitmaps stay NULL in the copy */ |
| 			if (gres_ns->topo_core_bitmap[t]) |
| 				dup_ns->topo_core_bitmap[t] = |
| 					bit_copy(gres_ns->topo_core_bitmap[t]); |
| 			if (gres_ns->topo_res_core_bitmap[t]) |
| 				dup_ns->topo_res_core_bitmap[t] = bit_copy( |
| 					gres_ns->topo_res_core_bitmap[t]); |
| 			if (gres_ns->topo_gres_bitmap[t]) |
| 				dup_ns->topo_gres_bitmap[t] = |
| 					bit_copy(gres_ns->topo_gres_bitmap[t]); |
| |
| 			dup_ns->topo_gres_cnt_alloc[t] = |
| 				gres_ns->topo_gres_cnt_alloc[t]; |
| 			dup_ns->topo_gres_cnt_avail[t] = |
| 				gres_ns->topo_gres_cnt_avail[t]; |
| 			dup_ns->topo_type_id[t] = gres_ns->topo_type_id[t]; |
| 			dup_ns->topo_type_name[t] = |
| 				xstrdup(gres_ns->topo_type_name[t]); |
| 		} |
| 	} |
| |
| 	if (gres_ns->type_cnt) { |
| 		int k; |
| |
| 		dup_ns->type_cnt = gres_ns->type_cnt; |
| 		dup_ns->type_cnt_alloc = |
| 			xcalloc(gres_ns->type_cnt, sizeof(uint64_t)); |
| 		dup_ns->type_cnt_avail = |
| 			xcalloc(gres_ns->type_cnt, sizeof(uint64_t)); |
| 		dup_ns->type_id = xcalloc(gres_ns->type_cnt, sizeof(uint32_t)); |
| 		dup_ns->type_name = xcalloc(gres_ns->type_cnt, sizeof(char *)); |
| |
| 		for (k = 0; k < gres_ns->type_cnt; k++) { |
| 			dup_ns->type_cnt_alloc[k] = gres_ns->type_cnt_alloc[k]; |
| 			dup_ns->type_cnt_avail[k] = gres_ns->type_cnt_avail[k]; |
| 			dup_ns->type_id[k] = gres_ns->type_id[k]; |
| 			dup_ns->type_name[k] = xstrdup(gres_ns->type_name[k]); |
| 		} |
| 	} |
| |
| 	return dup_ns; |
| } |
| |
| /* |
| * list_for_each() callback: duplicate one node GRES record onto a new list. |
| * IN x - gres_state_t to duplicate |
| * IN arg - list_t receiving the copy |
| * RET always 0, records that cannot be duplicated are skipped |
| */ |
| static int _foreach_node_state_dup(void *x, void *arg) |
| { |
| 	gres_state_t *src_state = x; |
| 	list_t *new_list = arg; |
| 	gres_state_t *dup_state; |
| 	void *dup_ns; |
| |
| 	if (_find_context_by_id(src_state->plugin_id) == NULL) { |
| 		error("Could not find plugin id %u to dup node record", |
| 		      src_state->plugin_id); |
| 		return 0; |
| 	} |
| |
| 	dup_ns = _node_state_dup(src_state->gres_data); |
| 	if (dup_ns == NULL) |
| 		return 0; |
| |
| 	dup_state = gres_create_state(src_state, GRES_STATE_SRC_STATE_PTR, |
| 				      GRES_STATE_TYPE_NODE, dup_ns); |
| |
| 	/* |
| 	 * Because "gres/'shared'" follows "gres/gpu" (see gres_init) |
| 	 * the sharing gres will be in new list already. |
| 	 * gres_id_sharing currently only includes gpus so we can just |
| 	 * search for that. |
| 	 */ |
| 	if (gres_id_shared(dup_state->config_flags)) { |
| 		gres_state_t *gpu_state = |
| 			list_find_first(new_list, gres_find_id, |
| 					&gpu_plugin_id); |
| |
| 		_set_alt_gres(dup_state, gpu_state); |
| 	} |
| 	list_append(new_list, dup_state); |
| |
| 	return 0; |
| } |
| |
| /* |
| * Duplicate a node gres status (used for will-run logic) |
| * IN gres_list - node gres state information |
| * RET a copy of gres_list, or NULL if gres_list is NULL or no gres context |
| *	is configured |
| */ |
| extern list_t *gres_node_state_list_dup(list_t *gres_list) |
| { |
| 	list_t *dup_list = NULL; |
| |
| 	if (!gres_list) |
| 		return NULL; |
| |
| 	xassert(gres_context_cnt >= 0); |
| |
| 	slurm_mutex_lock(&gres_context_lock); |
| 	if (gres_context_cnt > 0) { |
| 		dup_list = list_create(_gres_node_list_delete); |
| 		(void) list_for_each(gres_list, _foreach_node_state_dup, |
| 				     dup_list); |
| 	} |
| 	slurm_mutex_unlock(&gres_context_lock); |
| |
| 	return dup_list; |
| } |
| |
| /* |
| * list_for_each() callback: reset every allocation counter and bitmap of one |
| * node GRES record. |
| * IN x - gres_state_t whose gres_data is a gres_node_state_t |
| * IN arg - unused |
| * RET always 0 |
| */ |
| static int _node_state_dealloc(void *x, void *arg) |
| { |
| 	gres_state_t *gres_state_node = x; |
| 	gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| |
| 	gres_ns->gres_cnt_alloc = 0; |
| 	if (gres_ns->gres_bit_alloc) |
| 		bit_clear_all(gres_ns->gres_bit_alloc); |
| |
| 	if (!gres_ns->topo_cnt) { |
| 		/* |
| 		 * This array can be set at startup if a job has been allocated |
| 		 * specific GRES and the node has not registered with the |
| 		 * details needed to track individual GRES (rather than only |
| 		 * a GRES count). |
| 		 */ |
| 		xfree(gres_ns->topo_gres_cnt_alloc); |
| 	} else if (!gres_ns->topo_gres_cnt_alloc) { |
| 		error("gres_node_state_dealloc_all: gres/%s topo_cnt!=0 " |
| 		      "and topo_gres_cnt_alloc is NULL", |
| 		      gres_state_node->gres_name); |
| 	} else { |
| 		for (int t = 0; t < gres_ns->topo_cnt; t++) |
| 			gres_ns->topo_gres_cnt_alloc[t] = 0; |
| 	} |
| |
| 	for (int k = 0; k < gres_ns->type_cnt; k++) |
| 		gres_ns->type_cnt_alloc[k] = 0; |
| |
| 	return 0; |
| } |
| |
| /* |
| * Deallocate all resources on this node previously allocated to any jobs. |
| * This function is used to synchronize state after slurmctld restarts or |
| * is reconfigured. |
| * IN gres_list - node gres state information, may be NULL |
| */ |
| extern void gres_node_state_dealloc_all(list_t *gres_list) |
| { |
| 	if (!gres_list) |
| 		return; |
| |
| 	xassert(gres_context_cnt >= 0); |
| |
| 	/* The callback always returns 0, so the count is of no interest */ |
| 	(void) list_for_each(gres_list, _node_state_dealloc, NULL); |
| } |
| |
| /* |
| * Build (or reuse) the string describing the GRES in use on a node. |
| * The string is stored in gres_ns->gres_used. |
| * IN/OUT gres_ns - node GRES state; gres_used is rebuilt on every call when |
| *	topology data is present and the GRES is consumable, otherwise it is |
| *	built once and then reused |
| * IN gres_name - name of the GRES |
| * RET gres_ns->gres_used (do not free), or NULL if gres_cnt_avail is zero |
| */ |
| static char *_node_gres_used(gres_node_state_t *gres_ns, char *gres_name) |
| { |
| 	char *sep = ""; |
| 	bitstr_t *topo_printed; |
| 	int i, j; |
| |
| 	xassert(gres_ns); |
| |
| 	if (!gres_ns->gres_cnt_avail) |
| 		return NULL; |
| |
| 	if (!gres_ns->topo_cnt || gres_ns->no_consume) { |
| 		/* No per-topology report: reuse or build the summary */ |
| 		if (gres_ns->gres_used) { |
| 			; /* Used cached value */ |
| 		} else if (!gres_ns->type_cnt) { |
| 			if (gres_ns->no_consume) { |
| 				xstrfmtcat(gres_ns->gres_used, "%s:0", |
| 					   gres_name); |
| 			} else { |
| 				xstrfmtcat(gres_ns->gres_used, "%s:%"PRIu64, |
| 					   gres_name, gres_ns->gres_cnt_alloc); |
| 			} |
| 		} else { |
| 			for (i = 0; i < gres_ns->type_cnt; i++) { |
| 				if (gres_ns->no_consume) { |
| 					xstrfmtcat(gres_ns->gres_used, |
| 						   "%s%s:%s:0", sep, gres_name, |
| 						   gres_ns->type_name[i]); |
| 				} else { |
| 					xstrfmtcat(gres_ns->gres_used, |
| 						   "%s%s:%s:%"PRIu64, sep, |
| 						   gres_name, |
| 						   gres_ns->type_name[i], |
| 						   gres_ns->type_cnt_alloc[i]); |
| 				} |
| 				sep = ","; |
| 			} |
| 		} |
| 		return gres_ns->gres_used; |
| 	} |
| |
| 	/* topo_printed marks topology records already folded into an entry */ |
| 	topo_printed = bit_alloc(gres_ns->topo_cnt); |
| 	xfree(gres_ns->gres_used); /* Free any cached value */ |
| 	for (i = 0; i < gres_ns->topo_cnt; i++) { |
| 		/* |
| 		 * For non-shared gres, we record which indices have |
| 		 * gres allocated. For shared gres, we record the count |
| 		 * of allocated gres at each index (may be >1, as |
| 		 * opposed to non-shared gres which is never >1) |
| 		 * |
| 		 * merged_bitmap is used for non-shared gres, while |
| 		 * shared_cnt_str is used for shared gres (shard, mps). |
| 		 */ |
| 		bitstr_t *merged_bitmap = NULL; |
| 		char *shared_cnt_str = NULL; |
| 		uint64_t gres_alloc_cnt = 0; |
| 		char *gres_alloc_idx, idx_str[64]; |
| 		bool is_shared; |
| |
| 		if (bit_test(topo_printed, i)) |
| 			continue; |
| 		bit_set(topo_printed, i); |
| |
| 		is_shared = gres_is_shared_name(gres_name); |
| 		if (is_shared) { |
| 			uint64_t alloc = gres_ns->topo_gres_cnt_alloc[i]; |
| 			uint64_t avail = gres_ns->topo_gres_cnt_avail[i]; |
| |
| 			xstrfmtcat(shared_cnt_str, "%"PRIu64"/%"PRIu64, |
| 				   alloc, avail); |
| 			gres_alloc_cnt += alloc; |
| 		} else if (gres_ns->topo_gres_bitmap[i]) { |
| 			merged_bitmap = bit_copy(gres_ns->topo_gres_bitmap[i]); |
| 		} |
| |
| 		/* Fold in every later record that has the same type id */ |
| 		for (j = i + 1; j < gres_ns->topo_cnt; j++) { |
| 			if (bit_test(topo_printed, j) || |
| 			    (gres_ns->topo_type_id[i] != |
| 			     gres_ns->topo_type_id[j])) |
| 				continue; |
| 			bit_set(topo_printed, j); |
| |
| 			if (is_shared) { |
| 				uint64_t alloc = |
| 					gres_ns->topo_gres_cnt_alloc[j]; |
| 				uint64_t avail = |
| 					gres_ns->topo_gres_cnt_avail[j]; |
| |
| 				xstrfmtcat(shared_cnt_str, |
| 					   ",%"PRIu64"/%"PRIu64, |
| 					   alloc, avail); |
| 				gres_alloc_cnt += alloc; |
| 			} else if (gres_ns->topo_gres_bitmap[j]) { |
| 				if (!merged_bitmap) { |
| 					merged_bitmap = bit_copy( |
| 						gres_ns->topo_gres_bitmap[j]); |
| 				} else if (bit_size(merged_bitmap) == |
| 					   bit_size(gres_ns-> |
| 						    topo_gres_bitmap[j])) { |
| 					bit_or(merged_bitmap, |
| 					       gres_ns->topo_gres_bitmap[j]); |
| 				} |
| 			} |
| 		} |
| |
| 		/* Keep only the bits of this type that are allocated */ |
| 		if (!is_shared && gres_ns->gres_bit_alloc && merged_bitmap && |
| 		    (bit_size(merged_bitmap) == |
| 		     bit_size(gres_ns->gres_bit_alloc))) { |
| 			bit_and(merged_bitmap, gres_ns->gres_bit_alloc); |
| 			gres_alloc_cnt = bit_set_count(merged_bitmap); |
| 		} |
| |
| 		if (is_shared) { |
| 			gres_alloc_idx = shared_cnt_str; |
| 		} else if (gres_alloc_cnt > 0) { |
| 			bit_fmt(idx_str, sizeof(idx_str), merged_bitmap); |
| 			gres_alloc_idx = idx_str; |
| 		} else { |
| 			gres_alloc_idx = "N/A"; |
| 		} |
| 		xstrfmtcat(gres_ns->gres_used, |
| 			   "%s%s:%s:%"PRIu64"(%s%s)", sep, gres_name, |
| 			   gres_ns->topo_type_name[i], gres_alloc_cnt, |
| 			   is_shared ? "" : "IDX:", gres_alloc_idx); |
| 		sep = ","; |
| 		FREE_NULL_BITMAP(merged_bitmap); |
| 		xfree(shared_cnt_str); |
| 	} |
| 	FREE_NULL_BITMAP(topo_printed); |
| |
| 	return gres_ns->gres_used; |
| } |
| |
| /* |
| * list_for_each() callback: write one node GRES record to the log with info(). |
| * IN x - gres_state_t whose gres_data is a gres_node_state_t |
| * IN arg - name of the node (char *) |
| * RET always 0 |
| */ |
| static int _foreach_node_state_log(void *x, void *arg) |
| { |
| 	gres_state_t *gres_state_node = x; |
| 	gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| 	char *node_name = arg; |
| 	char found_str[128], bits_str[128]; |
| 	int i; |
| |
| 	xassert(gres_ns); |
| |
| 	info("gres/%s: state for %s", gres_state_node->gres_name, node_name); |
| |
| 	/* NO_VAL64 is reported as "TBD" rather than as a number */ |
| 	if (gres_ns->gres_cnt_found != NO_VAL64) { |
| 		snprintf(found_str, sizeof(found_str), "%"PRIu64, |
| 			 gres_ns->gres_cnt_found); |
| 	} else { |
| 		snprintf(found_str, sizeof(found_str), "TBD"); |
| 	} |
| |
| 	if (!gres_ns->no_consume) { |
| 		info(" gres_cnt found:%s configured:%"PRIu64" " |
| 		     "avail:%"PRIu64" alloc:%"PRIu64"", |
| 		     found_str, gres_ns->gres_cnt_config, |
| 		     gres_ns->gres_cnt_avail, |
| 		     gres_ns->gres_cnt_alloc); |
| 	} else { |
| 		info(" gres_cnt found:%s configured:%"PRIu64" " |
| 		     "avail:%"PRIu64" no_consume", |
| 		     found_str, gres_ns->gres_cnt_config, |
| 		     gres_ns->gres_cnt_avail); |
| 	} |
| |
| 	if (!gres_ns->gres_bit_alloc) { |
| 		info(" gres_bit_alloc:NULL"); |
| 	} else { |
| 		bit_fmt(bits_str, sizeof(bits_str), gres_ns->gres_bit_alloc); |
| 		info(" gres_bit_alloc:%s of %d", |
| 		     bits_str, (int) bit_size(gres_ns->gres_bit_alloc)); |
| 	} |
| |
| 	info(" gres_used:%s", gres_ns->gres_used); |
| |
| 	/* One log line per row of the link_len x link_len links matrix */ |
| 	if (gres_ns->links_cnt && gres_ns->link_len) { |
| 		for (i = 0; i < gres_ns->link_len; i++) { |
| 			char *row_str = NULL, *sep = ""; |
| |
| 			for (int col = 0; col < gres_ns->link_len; col++) { |
| 				xstrfmtcat(row_str, "%s%d", sep, |
| 					   gres_ns->links_cnt[i][col]); |
| 				sep = ", "; |
| 			} |
| 			info(" links[%d]:%s", i, row_str); |
| 			xfree(row_str); |
| 		} |
| 	} |
| |
| 	for (i = 0; i < gres_ns->topo_cnt; i++) { |
| 		info(" topo[%d]:%s(%u)", i, gres_ns->topo_type_name[i], |
| 		     gres_ns->topo_type_id[i]); |
| |
| 		if (!gres_ns->topo_core_bitmap[i]) { |
| 			info(" topo_core_bitmap[%d]:NULL", i); |
| 		} else { |
| 			bit_fmt(bits_str, sizeof(bits_str), |
| 				gres_ns->topo_core_bitmap[i]); |
| 			info(" topo_core_bitmap[%d]:%s of %d", i, bits_str, |
| 			     (int)bit_size(gres_ns->topo_core_bitmap[i])); |
| 		} |
| |
| 		if (!gres_ns->topo_gres_bitmap[i]) { |
| 			info(" topo_gres_bitmap[%d]:NULL", i); |
| 		} else { |
| 			bit_fmt(bits_str, sizeof(bits_str), |
| 				gres_ns->topo_gres_bitmap[i]); |
| 			info(" topo_gres_bitmap[%d]:%s of %d", i, bits_str, |
| 			     (int)bit_size(gres_ns->topo_gres_bitmap[i])); |
| 		} |
| |
| 		info(" topo_gres_cnt_alloc[%d]:%"PRIu64"", i, |
| 		     gres_ns->topo_gres_cnt_alloc[i]); |
| 		info(" topo_gres_cnt_avail[%d]:%"PRIu64"", i, |
| 		     gres_ns->topo_gres_cnt_avail[i]); |
| 	} |
| |
| 	for (i = 0; i < gres_ns->type_cnt; i++) { |
| 		info(" type[%d]:%s(%u)", i, gres_ns->type_name[i], |
| 		     gres_ns->type_id[i]); |
| 		info(" type_cnt_alloc[%d]:%"PRIu64, i, |
| 		     gres_ns->type_cnt_alloc[i]); |
| 		info(" type_cnt_avail[%d]:%"PRIu64, i, |
| 		     gres_ns->type_cnt_avail[i]); |
| 	} |
| |
| 	return 0; |
| } |
| |
| /* |
| * Log a node's current gres state |
| * Nothing is logged unless DEBUG_FLAG_GRES is set in slurm_conf.debug_flags. |
| * IN gres_list - generated by gres_node_config_validate() |
| * IN node_name - name of the node for which the gres information applies |
| */ |
| extern void gres_node_state_log(list_t *gres_list, char *node_name) |
| { |
| 	bool gres_debug = (slurm_conf.debug_flags & DEBUG_FLAG_GRES) != 0; |
| |
| 	if (!gres_debug || (gres_list == NULL)) |
| 		return; |
| |
| 	xassert(gres_context_cnt >= 0); |
| |
| 	(void) list_for_each(gres_list, _foreach_node_state_log, node_name); |
| } |
| |
| /* |
| * list_find_first() callback matching a node gres record that has any |
| * allocated gres. |
| * IN x - gres_state_t whose gres_data is a gres_node_state_t |
| * IN key - unused |
| * RET 1 if gres_cnt_alloc is non-zero, otherwise 0 |
| */ |
| static int _find_node_state_with_alloc_gres(void *x, void *key) |
| { |
| 	gres_state_t *gres_state_node = x; |
| 	gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| |
| 	return gres_ns->gres_cnt_alloc ? 1 : 0; |
| } |
| |
| /* |
| * Test whether any record on a node's gres list has allocated gres. |
| * IN gres_list - node gres state information, may be NULL |
| * RET true if at least one record has a non-zero gres_cnt_alloc |
| */ |
| extern bool gres_node_state_list_has_alloc_gres(list_t *gres_list) |
| { |
| 	void *match; |
| |
| 	if (gres_list == NULL) |
| 		return false; |
| |
| 	match = list_find_first(gres_list, _find_node_state_with_alloc_gres, |
| 				NULL); |
| 	return (match != NULL); |
| } |
| |
| /* |
| * Build a string indicating a node's drained GRES |
| * The current implementation ignores gres_list and always reports "N/A". |
| * IN gres_list - generated by gres_node_config_validate() |
| * RET - string, must be xfreed by caller |
| */ |
| extern char *gres_get_node_drain(list_t *gres_list) |
| { |
| 	return xstrdup("N/A"); |
| } |
| |
| /* |
| * list_for_each() callback: append one record's "used" string to a comma |
| * separated result. |
| * IN x - gres_state_t of the node |
| * IN/OUT arg - char ** pointing at the string being built |
| * RET always 0 |
| */ |
| static int _foreach_get_node_used(void *x, void *arg) |
| { |
| 	gres_state_t *gres_state_node = x; |
| 	char **gres_usedp = arg; |
| 	char *node_used = _node_gres_used(gres_state_node->gres_data, |
| 					  gres_state_node->gres_name); |
| |
| 	/* Records that report nothing add no entry and no separator */ |
| 	if (node_used == NULL) |
| 		return 0; |
| |
| 	if (*gres_usedp) |
| 		xstrcat(*gres_usedp, ","); |
| 	xstrcat(*gres_usedp, node_used); |
| |
| 	return 0; |
| } |
| |
| /* |
| * Build a string indicating a node's used GRES |
| * IN gres_list - generated by gres_node_config_validate() |
| * RET - string, must be xfreed by caller; NULL if gres_list is NULL or no |
| *	record reported anything |
| */ |
| extern char *gres_get_node_used(list_t *gres_list) |
| { |
| 	char *used_str = NULL; |
| |
| 	if (gres_list == NULL) |
| 		return used_str; |
| |
| 	(void) list_for_each(gres_list, _foreach_get_node_used, &used_str); |
| |
| 	return used_str; |
| } |
| |
| /* |
| * Give the total system count of a given GRES |
| * IN name - GRES name to look up, may be NULL |
| * IN case_insensitive - true to ignore case when comparing names |
| * Returns NO_VAL64 if name not found |
| */ |
| extern uint64_t gres_get_system_cnt(char *name, bool case_insensitive) |
| { |
| 	uint64_t total = NO_VAL64; |
| |
| 	if (name == NULL) |
| 		return NO_VAL64; |
| |
| 	xassert(gres_context_cnt >= 0); |
| |
| 	slurm_mutex_lock(&gres_context_lock); |
| 	for (int ctx_inx = 0; ctx_inx < gres_context_cnt; ctx_inx++) { |
| 		int diff; |
| |
| 		if (case_insensitive) |
| 			diff = xstrcasecmp(gres_context[ctx_inx].gres_name, |
| 					   name); |
| 		else |
| 			diff = xstrcmp(gres_context[ctx_inx].gres_name, name); |
| |
| 		if (diff) |
| 			continue; |
| |
| 		/* First matching context wins */ |
| 		total = gres_context[ctx_inx].total_cnt; |
| 		break; |
| 	} |
| 	slurm_mutex_unlock(&gres_context_lock); |
| |
| 	return total; |
| } |
| |
| |
| /* |
| * Get the count of a node's GRES |
| * IN gres_list - List of Gres records for this node to track usage |
| * IN name - name of gres, either "<name>" or "<name>:<type>" |
| * RET gres_cnt_config for a plain name, type_cnt_avail of the matching type |
| *	for a typed name, 0 if nothing matches |
| */ |
| extern uint64_t gres_node_config_cnt(list_t *gres_list, char *name) |
| { |
| 	uint64_t count = 0; |
| |
| 	if (!gres_list || !name || !list_count(gres_list)) |
| 		return count; |
| |
| 	xassert(gres_context_cnt >= 0); |
| |
| 	slurm_mutex_lock(&gres_context_lock); |
| 	for (int i = 0; i < gres_context_cnt; i++) { |
| 		gres_state_t *gres_state_node; |
| 		gres_node_state_t *gres_ns; |
| 		char *type_str = NULL; |
| 		bool plain_name = |
| 			!xstrcasecmp(gres_context[i].gres_name, name); |
| |
| 		/* Neither "<name>" nor "<name>:..." of this context */ |
| 		if (!plain_name && |
| 		    xstrncasecmp(name, gres_context[i].gres_name_colon, |
| 				 gres_context[i].gres_name_colon_len)) |
| 			continue; |
| |
| 		if (!plain_name) { |
| 			type_str = strchr(name, ':'); |
| 			if (!type_str) { |
| 				error("Invalid gres name '%s'", name); |
| 				break; |
| 			} |
| 			type_str++; |
| 		} |
| |
| 		/* Find the gres_state entry on the list */ |
| 		gres_state_node = list_find_first(gres_list, gres_find_id, |
| 						  &gres_context[i].plugin_id); |
| 		if (!gres_state_node || !gres_state_node->gres_data) |
| 			break; |
| 		gres_ns = gres_state_node->gres_data; |
| |
| 		if (plain_name) { |
| 			count = gres_ns->gres_cnt_config; |
| 		} else { |
| 			uint32_t type_id = gres_build_id(type_str); |
| |
| 			for (int type = 0; type < gres_ns->type_cnt; type++) { |
| 				if (gres_ns->type_id[type] != type_id) |
| 					continue; |
| 				count = gres_ns->type_cnt_avail[type]; |
| 				break; |
| 			} |
| 		} |
| 		/* Only the first context matching the name is examined */ |
| 		break; |
| 	} |
| 	slurm_mutex_unlock(&gres_context_lock); |
| |
| 	return count; |
| } |
| |
| /* |
| * Free a job GRES state record and everything it owns. |
| * IN gres_js - record to free, may be NULL |
| */ |
| extern void gres_job_state_delete(gres_job_state_t *gres_js) |
| { |
| 	int inx; |
| |
| 	if (!gres_js) |
| 		return; |
| |
| 	/* Allocation arrays are released by the shared helper */ |
| 	gres_job_clear_alloc(gres_js); |
| |
| 	/* The selection arrays are sized by total_node_cnt */ |
| 	if (gres_js->gres_bit_select) { |
| 		for (inx = 0; inx < gres_js->total_node_cnt; inx++) |
| 			FREE_NULL_BITMAP(gres_js->gres_bit_select[inx]); |
| 		xfree(gres_js->gres_bit_select); |
| 	} |
| |
| 	if (gres_js->gres_per_bit_select) { |
| 		for (inx = 0; inx < gres_js->total_node_cnt; inx++) |
| 			xfree(gres_js->gres_per_bit_select[inx]); |
| 		xfree(gres_js->gres_per_bit_select); |
| 	} |
| |
| 	if (gres_js->res_gpu_cores) { |
| 		for (inx = 0; inx < gres_js->res_array_size; inx++) |
| 			FREE_NULL_BITMAP(gres_js->res_gpu_cores[inx]); |
| 		xfree(gres_js->res_gpu_cores); |
| 	} |
| |
| 	xfree(gres_js->gres_cnt_node_alloc); |
| 	xfree(gres_js->gres_cnt_node_select); |
| 	xfree(gres_js->type_name); |
| 	xfree(gres_js); |
| } |
| |
| /* |
| * Release the allocation arrays of a job GRES record and reset node_cnt to 0. |
| * IN/OUT gres_js - job GRES state, must not be NULL |
| */ |
| extern void gres_job_clear_alloc(gres_job_state_t *gres_js) |
| { |
| 	int inx; |
| |
| 	/* Per-node elements first, each array is node_cnt entries long */ |
| 	if (gres_js->gres_bit_alloc) { |
| 		for (inx = 0; inx < gres_js->node_cnt; inx++) |
| 			FREE_NULL_BITMAP(gres_js->gres_bit_alloc[inx]); |
| 	} |
| 	if (gres_js->gres_bit_step_alloc) { |
| 		for (inx = 0; inx < gres_js->node_cnt; inx++) |
| 			FREE_NULL_BITMAP(gres_js->gres_bit_step_alloc[inx]); |
| 	} |
| 	if (gres_js->gres_per_bit_alloc) { |
| 		for (inx = 0; inx < gres_js->node_cnt; inx++) |
| 			xfree(gres_js->gres_per_bit_alloc[inx]); |
| 	} |
| 	if (gres_js->gres_per_bit_step_alloc) { |
| 		for (inx = 0; inx < gres_js->node_cnt; inx++) |
| 			xfree(gres_js->gres_per_bit_step_alloc[inx]); |
| 	} |
| |
| 	/* Then the arrays themselves */ |
| 	xfree(gres_js->gres_bit_alloc); |
| 	xfree(gres_js->gres_bit_step_alloc); |
| 	xfree(gres_js->gres_per_bit_alloc); |
| 	xfree(gres_js->gres_per_bit_step_alloc); |
| 	xfree(gres_js->gres_cnt_step_alloc); |
| 	xfree(gres_js->gres_cnt_node_alloc); |
| 	gres_js->node_cnt = 0; |
| } |
| |
| /* |
| * Delete one job gres_state_t; the signature suits a list delete function. |
| * IN list_element - gres_state_t whose gres_data is a gres_job_state_t |
| */ |
| extern void gres_job_list_delete(void *list_element) |
| { |
| 	gres_state_t *gres_state_job = list_element; |
| |
| 	gres_job_state_delete(gres_state_job->gres_data); |
| 	/* Clear the freed pointer before the generic teardown runs */ |
| 	gres_state_job->gres_data = NULL; |
| 	_gres_state_delete_members(gres_state_job); |
| } |
| |
| /* |
| * Ensure consistency of gres_per_* options |
| * Modify task and node count as needed for consistency with GRES options |
| * RET -1 on failure, 0 on success |
| */ |
| static int _test_gres_cnt(gres_state_t *gres_state_job, |
| gres_job_state_validate_t *gres_js_val) |
| { |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| int req_nodes, req_tasks, req_tasks_per_node, req_tasks_per_socket; |
| int req_sockets, req_cpus_per_task; |
| uint16_t cpus_per_gres; |
| |
| /* Ensure gres_per_job >= gres_per_node >= gres_per_socket */ |
| if (gres_js->gres_per_job && |
| ((gres_js->gres_per_node && |
| (gres_js->gres_per_node > gres_js->gres_per_job)) || |
| (gres_js->gres_per_task && |
| (gres_js->gres_per_task > gres_js->gres_per_job)) || |
| (gres_js->gres_per_socket && |
| (gres_js->gres_per_socket > |
| gres_js->gres_per_job)))) { |
| error("Failed to ensure --%ss >= --gres=%s/--%ss-per-node >= --%ss-per-socket", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| |
| /* Ensure gres_per_job >= gres_per_task */ |
| if (gres_js->gres_per_node && |
| ((gres_js->gres_per_task && |
| (gres_js->gres_per_task > gres_js->gres_per_node)) || |
| (gres_js->gres_per_socket && |
| (gres_js->gres_per_socket > |
| gres_js->gres_per_node)))) { |
| error("Failed to ensure --%ss >= --%ss-per-task", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| |
| /* gres_per_socket requires sockets-per-node count specification */ |
| if (gres_js->gres_per_socket) { |
| if (*gres_js_val->sockets_per_node == NO_VAL16) { |
| error("--%ss-per-socket option requires --sockets-per-node specification", |
| gres_state_job->gres_name); |
| return -1; |
| } |
| } |
| |
| /* make sure --cpu-per-gres is not combined with --cpus-per-task */ |
| if (!running_in_slurmctld() && gres_js->cpus_per_gres && |
| (*gres_js_val->cpus_per_task != NO_VAL16)) { |
| error("--cpus-per-%s is mutually exclusive with --cpus-per-task", |
| gres_state_job->gres_name); |
| return -1; |
| } |
| |
| |
| /* |
| * Ensure gres_per_job is multiple of gres_per_node |
| * Ensure node count is consistent with GRES parameters |
| */ |
| if (gres_js->gres_per_job && gres_js->gres_per_node) { |
| if (gres_js->gres_per_job % gres_js->gres_per_node){ |
| /* gres_per_job not multiple of gres_per_node */ |
| error("Failed to validate job spec, --%ss is not multiple of --gres=%s/--%ss-per-node", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| req_nodes = gres_js->gres_per_job / |
| gres_js->gres_per_node; |
| if (((*gres_js_val->min_nodes != NO_VAL) && |
| (req_nodes < *gres_js_val->min_nodes)) || |
| (req_nodes > *gres_js_val->max_nodes)) { |
| error("Failed to validate job spec. Based on --%s and --gres=%s/--%ss-per-node required nodes (%u) doesn't fall between min_nodes (%u) and max_nodes (%u) boundaries.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| req_nodes, |
| *gres_js_val->min_nodes, |
| *gres_js_val->max_nodes); |
| return -1; |
| } |
| *gres_js_val->min_nodes = *gres_js_val->max_nodes = req_nodes; |
| } |
| |
| /* |
| * Ensure gres_per_node is multiple of gres_per_socket |
| * Ensure task count is consistent with GRES parameters |
| */ |
| if (gres_js->gres_per_node && gres_js->gres_per_socket) { |
| if (gres_js->gres_per_node % |
| gres_js->gres_per_socket) { |
| /* gres_per_node not multiple of gres_per_socket */ |
| error("Failed to validate job spec, --gres=%s/--%ss-per-node not multiple of --%ss-per-socket.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| req_sockets = gres_js->gres_per_node / |
| gres_js->gres_per_socket; |
| if (*gres_js_val->sockets_per_node == NO_VAL16) |
| *gres_js_val->sockets_per_node = req_sockets; |
| else if (*gres_js_val->sockets_per_node != req_sockets) { |
| error("Failed to validate job spec. Based on --gres=%s/--%ss-per-node and --%ss-per-socket required number of sockets differ from --sockets-per-node.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| } |
| |
| /* |
| * Ensure ntasks_per_tres is multiple of num_tasks |
| */ |
| if (gres_js->ntasks_per_gres && |
| (gres_js->ntasks_per_gres != NO_VAL16) && |
| (*gres_js_val->num_tasks != NO_VAL)) { |
| int tmp = *gres_js_val->num_tasks / gres_js->ntasks_per_gres; |
| if ((tmp * gres_js->ntasks_per_gres) != |
| *gres_js_val->num_tasks) { |
| error("Failed to validate job spec, -n/--ntasks has to be a multiple of --ntasks-per-%s.", |
| gres_state_job->gres_name); |
| return -1; |
| } |
| } |
| |
| /* |
| * Ensure gres_per_job is multiple of gres_per_task |
| * Ensure task count is consistent with GRES parameters |
| */ |
| if (gres_js->gres_per_task) { |
| if(gres_js->gres_per_job) { |
| if (gres_js->gres_per_job % |
| gres_js->gres_per_task) { |
| /* gres_per_job not multiple of gres_per_task */ |
| error("Failed to validate job spec, --%ss not multiple of --%ss-per-task", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| req_tasks = gres_js->gres_per_job / |
| gres_js->gres_per_task; |
| if (*gres_js_val->num_tasks == NO_VAL) |
| *gres_js_val->num_tasks = req_tasks; |
| else if (*gres_js_val->num_tasks != req_tasks) { |
| if (running_in_slurmctld()) { |
| /* requesting new task count */ |
| gres_js->total_gres = |
| gres_js->gres_per_job = |
| *gres_js_val->num_tasks * |
| gres_js->gres_per_task; |
| } else { |
| /* |
| * Anywhere outside of the slurmctld we |
| * are asking for something incorrect. |
| */ |
| error("Failed to validate job spec. Based on --%ss and --%ss-per-task number of requested tasks differ from -n/--ntasks.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| } |
| } else if (*gres_js_val->num_tasks != NO_VAL) { |
| gres_js->gres_per_job = *gres_js_val->num_tasks * |
| gres_js->gres_per_task; |
| } else if (!xstrcmp(gres_state_job->gres_name, "gpu")) { |
| error("Failed to validate job spec. --%ss-per-task or --tres-per-task used without either --%ss or -n/--ntasks is not allowed.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } else { |
| error("Failed to validate job spec. --tres-per-task used without -n/--ntasks is not allowed."); |
| return -1; |
| } |
| } |
| |
| /* |
| * Ensure gres_per_node is multiple of gres_per_task |
| * Ensure tasks_per_node is consistent with GRES parameters |
| */ |
| if (gres_js->gres_per_node && gres_js->gres_per_task) { |
| if (gres_js->gres_per_node % |
| gres_js->gres_per_task) { |
| /* gres_per_node not multiple of gres_per_task */ |
| error("Failed to validate job spec, --gres=%s/--%ss-per-node not multiple of --%ss-per-task.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| req_tasks_per_node = gres_js->gres_per_node / |
| gres_js->gres_per_task; |
| if ((*gres_js_val->ntasks_per_node == NO_VAL16) || |
| (*gres_js_val->ntasks_per_node == 0)) |
| *gres_js_val->ntasks_per_node = req_tasks_per_node; |
| else if (*gres_js_val->ntasks_per_node != req_tasks_per_node) { |
| error("Failed to validate job spec. Based on --gres=%s/--%ss-per-node and --%ss-per-task requested number of tasks per node differ from --ntasks-per-node.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| } |
| |
| /* |
| * Ensure gres_per_socket is multiple of gres_per_task |
| * Ensure ntasks_per_socket is consistent with GRES parameters |
| */ |
| if (gres_js->gres_per_socket && gres_js->gres_per_task) { |
| if (gres_js->gres_per_socket % |
| gres_js->gres_per_task) { |
| /* gres_per_socket not multiple of gres_per_task */ |
| error("Failed to validate job spec, --%ss-per-socket not multiple of --%ss-per-task.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| req_tasks_per_socket = gres_js->gres_per_socket / |
| gres_js->gres_per_task; |
| if ((*gres_js_val->ntasks_per_socket == NO_VAL16) || |
| (*gres_js_val->ntasks_per_socket == 0)) |
| *gres_js_val->ntasks_per_socket = req_tasks_per_socket; |
| else if (*gres_js_val->ntasks_per_socket != |
| req_tasks_per_socket) { |
| error("Failed to validate job spec. Based on --%ss-per-socket and --%ss-per-task requested number of tasks per sockets differ from --ntasks-per-socket.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| } |
| |
| /* Ensure that cpus_per_gres * gres_per_task == cpus_per_task */ |
| if (gres_js->cpus_per_gres) |
| cpus_per_gres = gres_js->cpus_per_gres; |
| else |
| cpus_per_gres = gres_js->def_cpus_per_gres; |
| if (cpus_per_gres && gres_js->gres_per_task) { |
| req_cpus_per_task = cpus_per_gres * gres_js->gres_per_task; |
| if ((*gres_js_val->cpus_per_task == NO_VAL16) || |
| (*gres_js_val->cpus_per_task == 0)) |
| *gres_js_val->cpus_per_task = req_cpus_per_task; |
| else if (*gres_js_val->cpus_per_task != req_cpus_per_task) { |
| error("Failed to validate job spec. Based on --cpus-per-%s and --%ss-per-task requested number of cpus differ from -c/--cpus-per-task.", |
| gres_state_job->gres_name, |
| gres_state_job->gres_name); |
| return -1; |
| } |
| } |
| |
| /* Ensure tres_per_job >= node count */ |
| if (gres_js->gres_per_job) { |
| if ((*gres_js_val->min_nodes != NO_VAL) && |
| (gres_js->gres_per_job < *gres_js_val->min_nodes)) { |
| error("Failed to validate job spec, --%ss < -N", |
| gres_state_job->gres_name); |
| return -1; |
| } |
| if ((*gres_js_val->max_nodes != NO_VAL) && |
| (gres_js->gres_per_job < *gres_js_val->max_nodes)) { |
| *gres_js_val->max_nodes = gres_js->gres_per_job; |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Reentrant TRES specification parse logic |
| * in_val IN - initial input string |
| * type OUT - must be xfreed by caller |
| * cnt OUT - count of values |
| * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call |
| * RET rc - error code |
| */ |
| static int _get_next_gres(char *in_val, char **type_ptr, int *context_inx_ptr, |
| uint64_t *cnt, char **save_ptr) |
| { |
| char *name = NULL, *type = NULL, *tres_type = "gres"; |
| int i, rc = SLURM_SUCCESS; |
| uint64_t value = 0; |
| |
| xassert(cnt); |
| xassert(save_ptr); |
| |
| rc = slurm_get_next_tres(&tres_type, in_val, &name, &type, |
| &value, save_ptr); |
| if (name) { |
| for (i = 0; i < gres_context_cnt; i++) { |
| if (!xstrcmp(name, gres_context[i].gres_name) || |
| !xstrncmp(name, gres_context[i].gres_name_colon, |
| gres_context[i].gres_name_colon_len)) |
| break; /* GRES name match found */ |
| } |
| if (i >= gres_context_cnt) { |
| debug("%s: Failed to locate GRES %s", __func__, name); |
| rc = ESLURM_INVALID_GRES; |
| } else |
| *context_inx_ptr = i; |
| xfree(name); |
| } |
| |
| if (rc != SLURM_SUCCESS) { |
| *save_ptr = NULL; |
| if ((rc == ESLURM_INVALID_TRES) && running_in_slurmctld()) { |
| info("%s: Invalid GRES job specification %s", __func__, |
| in_val); |
| } |
| xfree(type); |
| *type_ptr = NULL; |
| } else { |
| *cnt = value; |
| *type_ptr = type; |
| } |
| xfree(name); |
| |
| return rc; |
| } |
| |
| /* |
| * TRES specification parse logic |
| * in_val IN - initial input string |
| * cnt OUT - count of values |
| * gres_list IN/OUT - where to search for (or add) new job TRES record |
| * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call |
| * rc OUT - unchanged or an error code |
| * RET gres - job record to set value in, found or created by this function |
| */ |
| static gres_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt, |
| list_t *gres_list, char **save_ptr, |
| int *rc) |
| { |
| static char *prev_save_ptr = NULL; |
| int context_inx = NO_VAL, my_rc = SLURM_SUCCESS; |
| gres_job_state_t *gres_js = NULL; |
| gres_state_t *gres_state_job = NULL; |
| gres_key_t job_search_key; |
| char *type = NULL, *name = NULL; |
| |
| xassert(save_ptr); |
| if (!in_val && (*save_ptr == NULL)) { |
| return NULL; |
| } |
| |
| if (*save_ptr == NULL) { |
| prev_save_ptr = in_val; |
| } else if (*save_ptr != prev_save_ptr) { |
| error("%s: parsing error", __func__); |
| my_rc = SLURM_ERROR; |
| goto fini; |
| } |
| |
| if (prev_save_ptr[0] == '\0') { /* Empty input token */ |
| *save_ptr = NULL; |
| return NULL; |
| } |
| |
| if ((my_rc = _get_next_gres(in_val, &type, &context_inx, |
| cnt, &prev_save_ptr)) || |
| (context_inx == NO_VAL)) { |
| prev_save_ptr = NULL; |
| goto fini; |
| } |
| |
| /* Find the job GRES record */ |
| job_search_key.config_flags = gres_context[context_inx].config_flags; |
| job_search_key.plugin_id = gres_context[context_inx].plugin_id; |
| job_search_key.type_id = gres_build_id(type); |
| gres_state_job = list_find_first(gres_list, gres_find_job_by_key, |
| &job_search_key); |
| |
| if (gres_state_job) { |
| gres_js = gres_state_job->gres_data; |
| } else { |
| gres_js = xmalloc(sizeof(gres_job_state_t)); |
| gres_js->type_id = job_search_key.type_id; |
| gres_js->type_name = type; |
| type = NULL; /* String moved above */ |
| |
| gres_state_job = gres_create_state( |
| &gres_context[context_inx], GRES_STATE_SRC_CONTEXT_PTR, |
| GRES_STATE_TYPE_JOB, gres_js); |
| list_append(gres_list, gres_state_job); |
| } |
| |
| fini: xfree(name); |
| xfree(type); |
| if (my_rc != SLURM_SUCCESS) { |
| prev_save_ptr = NULL; |
| if ((my_rc == ESLURM_INVALID_GRES) && running_in_slurmctld()) { |
| info("%s: Invalid GRES job specification %s", __func__, |
| in_val); |
| } |
| *rc = my_rc; |
| } |
| *save_ptr = prev_save_ptr; |
| return gres_state_job; |
| } |
| |
| /* Return true if job specification only includes cpus_per_gres or mem_per_gres |
| * Return false if any other field set |
| */ |
| static bool _generic_state(void *gres_data, bool is_job) |
| { |
| if (is_job) { |
| gres_job_state_t *gres_js = gres_data; |
| if (gres_js->gres_per_job || |
| gres_js->gres_per_node || |
| gres_js->gres_per_socket || |
| gres_js->gres_per_task) |
| return false; |
| } else { |
| gres_step_state_t *gres_ss = gres_data; |
| if (gres_ss->gres_per_step || |
| gres_ss->gres_per_node || |
| gres_ss->gres_per_socket || |
| gres_ss->gres_per_task) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| /* |
| * Setup over_array to mark if we have gres of the same type. |
| */ |
| static void _set_over_array(gres_state_t *gres_state, |
| job_validate_t *job_validate) |
| { |
| char *type_name = job_validate->is_job ? |
| ((gres_job_state_t *) gres_state->gres_data)->type_name: |
| ((gres_step_state_t *) gres_state->gres_data)->type_name; |
| int i; |
| overlap_check_t *overlap_check = NULL; |
| |
| xassert(job_validate->over_array); |
| |
| for (i = 0; i < job_validate->over_count; i++) { |
| if (job_validate->over_array[i].plugin_id == |
| gres_state->plugin_id) |
| break; |
| } |
| |
| /* |
| * Set overlap_check after the loop since when over_count is 0 the loop |
| * won't happen. |
| */ |
| overlap_check = &job_validate->over_array[i]; |
| xassert(overlap_check); |
| |
| if (i >= job_validate->over_count) { |
| job_validate->over_count++; |
| overlap_check->plugin_id = gres_state->plugin_id; |
| if (type_name) { |
| overlap_check->with_type = true; |
| } else { |
| overlap_check->without_type = true; |
| overlap_check->without_type_state = |
| gres_state->gres_data; |
| } |
| } else if (type_name) { |
| overlap_check->with_type = true; |
| if (overlap_check->without_type) |
| job_validate->overlap_merge = true; |
| } else { |
| overlap_check->without_type = true; |
| overlap_check->without_type_state = gres_state->gres_data; |
| if (overlap_check->with_type) |
| job_validate->overlap_merge = true; |
| } |
| |
| return; |
| } |
| |
| static int _foreach_merge_generic_data(void *x, void *arg) |
| { |
| gres_state_t *gres_state = x; |
| merge_generic_t *merge_generic = arg; |
| |
| if (merge_generic->plugin_id != gres_state->plugin_id) |
| return 0; |
| |
| if (merge_generic->generic_gres_data == gres_state->gres_data) |
| return 1; |
| |
| if (merge_generic->is_job) { |
| gres_job_state_t *gres_js_in = merge_generic->generic_gres_data; |
| gres_job_state_t *gres_js = gres_state->gres_data; |
| |
| if (!gres_js->cpus_per_gres) |
| gres_js->cpus_per_gres = gres_js_in->cpus_per_gres; |
| if (!gres_js->mem_per_gres) |
| gres_js->mem_per_gres = gres_js_in->mem_per_gres; |
| } else { |
| gres_step_state_t *gres_ss_in = |
| merge_generic->generic_gres_data; |
| gres_step_state_t *gres_ss = gres_state->gres_data; |
| |
| if (!gres_ss->cpus_per_gres) |
| gres_ss->cpus_per_gres = gres_ss_in->cpus_per_gres; |
| if (!gres_ss->mem_per_gres) |
| gres_ss->mem_per_gres = gres_ss_in->mem_per_gres; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Put generic data (*_per_gres) on other gres of the same kind. |
| */ |
| static int _merge_generic_data( |
| list_t *gres_list, job_validate_t *job_validate) |
| { |
| int rc = SLURM_SUCCESS; |
| merge_generic_t merge_generic = { |
| .is_job = job_validate->is_job, |
| }; |
| |
| for (int i = 0; i < job_validate->over_count; i++) { |
| overlap_check_t *overlap_check = &job_validate->over_array[i]; |
| if (!overlap_check->with_type || |
| !overlap_check->without_type_state) |
| continue; |
| if (!_generic_state(overlap_check->without_type_state, |
| job_validate->is_job)) { |
| rc = ESLURM_INVALID_GRES_TYPE; |
| break; |
| } |
| |
| /* Propagate generic parameters */ |
| merge_generic.generic_gres_data = |
| overlap_check->without_type_state; |
| merge_generic.plugin_id = overlap_check->plugin_id; |
| |
| (void) list_delete_all(gres_list, |
| _foreach_merge_generic_data, |
| &merge_generic); |
| } |
| |
| return rc; |
| } |
| |
| static int _foreach_set_over_array(void *x, void *arg) |
| { |
| _set_over_array(x, arg); |
| |
| return 0; |
| } |
| |
| static int _foreach_job_state_validate(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| job_validate_t *job_validate = arg; |
| |
| if (_test_gres_cnt(gres_state_job, job_validate->gres_js_val) != 0) { |
| job_validate->rc = ESLURM_INVALID_GRES; |
| return -1; |
| } |
| if (!job_validate->have_gres_sharing && |
| gres_id_sharing(gres_state_job->plugin_id)) |
| job_validate->have_gres_sharing = true; |
| if (gres_id_shared(gres_state_job->config_flags)) { |
| job_validate->have_gres_shared = true; |
| } |
| if (job_validate->have_gres_sharing && job_validate->have_gres_shared) { |
| job_validate->rc = ESLURM_INVALID_GRES; |
| return -1; |
| } |
| |
| if (job_validate->cpus_per_gres && |
| (gres_state_job->plugin_id == gres_get_gpu_plugin_id())) |
| job_validate->tmp_min_cpus += |
| job_validate->cpus_per_gres * gres_js->total_gres; |
| |
| (void) _foreach_set_over_array(gres_state_job, job_validate); |
| |
| return 0; |
| } |
| |
| extern int gres_job_state_validate(gres_job_state_validate_t *gres_js_val) |
| { |
| int rc = SLURM_SUCCESS, size; |
| bool requested_gpu = false; |
| gres_state_t *gres_state_job; |
| gres_job_state_t *gres_js; |
| uint64_t cnt = 0; |
| char *cpus_per_tres; |
| char *mem_per_tres; |
| char *tres_freq; |
| char *tres_per_job; |
| char *tres_per_node; |
| char *tres_per_socket; |
| char *tres_per_task; |
| job_validate_t job_validate = { |
| .gres_js_val = gres_js_val, |
| .is_job = true, |
| .rc = SLURM_SUCCESS, |
| }; |
| |
| xassert(gres_js_val); |
| xassert(gres_js_val->gres_list); |
| xassert(!*gres_js_val->gres_list); |
| |
| cpus_per_tres = gres_js_val->cpus_per_tres; |
| mem_per_tres = gres_js_val->mem_per_tres; |
| tres_freq = gres_js_val->tres_freq; |
| tres_per_job = gres_js_val->tres_per_job; |
| tres_per_node = gres_js_val->tres_per_node; |
| tres_per_socket = gres_js_val->tres_per_socket; |
| tres_per_task = gres_js_val->tres_per_task; |
| |
| if (tres_per_task && running_in_slurmctld() && !running_cons_tres()) { |
| char *tmp = xstrdup(tres_per_task); |
| /* |
| * Check if cpus_per_task is the only part of tres_per_task. If |
| * so, continue with validation. If not, then the request is |
| * invalid: reject the request. |
| */ |
| slurm_option_update_tres_per_task(0, "cpu", &tmp); |
| if (tmp) { |
| xfree(tmp); |
| return ESLURM_UNSUPPORTED_GRES; |
| } |
| } |
| |
| if (running_in_slurmctld() && !running_cons_tres() && |
| (cpus_per_tres || tres_per_job || tres_per_socket || mem_per_tres)) |
| return ESLURM_UNSUPPORTED_GRES; |
| |
| if (!cpus_per_tres && !tres_per_job && !tres_per_node && |
| !tres_per_socket && !tres_per_task && !mem_per_tres && |
| !gres_js_val->ntasks_per_tres) |
| return SLURM_SUCCESS; |
| |
| if ((tres_per_task || (*gres_js_val->ntasks_per_tres != NO_VAL16)) && |
| (*gres_js_val->num_tasks == NO_VAL) && |
| (*gres_js_val->min_nodes != NO_VAL) && |
| (*gres_js_val->min_nodes == *gres_js_val->max_nodes)) { |
| /* Implicitly set task count */ |
| if (*gres_js_val->ntasks_per_tres != NO_VAL16) |
| *gres_js_val->num_tasks = *gres_js_val->min_nodes * |
| *gres_js_val->ntasks_per_tres; |
| else if (*gres_js_val->ntasks_per_node != NO_VAL16) |
| *gres_js_val->num_tasks = *gres_js_val->min_nodes * |
| *gres_js_val->ntasks_per_node; |
| else if (*gres_js_val->cpus_per_task == NO_VAL16) |
| *gres_js_val->num_tasks = *gres_js_val->min_nodes; |
| } |
| |
| xassert(gres_context_cnt >= 0); |
| |
| /* |
| * Set new values as requested |
| */ |
| *gres_js_val->gres_list = list_create(gres_job_list_delete); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| if (cpus_per_tres) { |
| char *in_val = cpus_per_tres, *save_ptr = NULL; |
| while ((gres_state_job = _get_next_job_gres(in_val, &cnt, |
| *gres_js_val-> |
| gres_list, |
| &save_ptr, &rc))) { |
| gres_js = gres_state_job->gres_data; |
| gres_js->cpus_per_gres = cnt; |
| in_val = NULL; |
| gres_js->ntasks_per_gres = |
| *gres_js_val->ntasks_per_tres; |
| |
| /* |
| * In theory MAX(cpus_per_gres) shouldn't matter because |
| * we should only allow one gres name to have |
| * cpus_per_gres and it should be the same for all types |
| * (e.g., gpu:k80 vs gpu:tesla) of that same gres (gpu) |
| */ |
| job_validate.cpus_per_gres = |
| MAX(job_validate.cpus_per_gres, cnt); |
| } |
| } |
| if (tres_per_job) { |
| char *in_val = tres_per_job, *save_ptr = NULL; |
| while ((gres_state_job = _get_next_job_gres(in_val, &cnt, |
| *gres_js_val-> |
| gres_list, |
| &save_ptr, &rc))) { |
| if (!requested_gpu && |
| (!xstrcmp(gres_state_job->gres_name, "gpu"))) |
| requested_gpu = true; |
| gres_js = gres_state_job->gres_data; |
| gres_js->gres_per_job = cnt; |
| in_val = NULL; |
| gres_js->total_gres = |
| MAX(gres_js->total_gres, cnt); |
| gres_js->ntasks_per_gres = |
| *gres_js_val->ntasks_per_tres; |
| } |
| } |
| if (tres_per_node) { |
| char *in_val = tres_per_node, *save_ptr = NULL; |
| while ((gres_state_job = _get_next_job_gres(in_val, &cnt, |
| *gres_js_val-> |
| gres_list, |
| &save_ptr, &rc))) { |
| if (!requested_gpu && |
| (!xstrcmp(gres_state_job->gres_name, "gpu"))) |
| requested_gpu = true; |
| gres_js = gres_state_job->gres_data; |
| gres_js->gres_per_node = cnt; |
| in_val = NULL; |
| if (*gres_js_val->min_nodes != NO_VAL) |
| cnt *= *gres_js_val->min_nodes; |
| gres_js->total_gres = |
| MAX(gres_js->total_gres, cnt); |
| gres_js->ntasks_per_gres = |
| *gres_js_val->ntasks_per_tres; |
| } |
| } |
| if (tres_per_socket) { |
| char *in_val = tres_per_socket, *save_ptr = NULL; |
| while ((gres_state_job = _get_next_job_gres(in_val, &cnt, |
| *gres_js_val-> |
| gres_list, |
| &save_ptr, &rc))) { |
| if (!requested_gpu && |
| (!xstrcmp(gres_state_job->gres_name, "gpu"))) |
| requested_gpu = true; |
| gres_js = gres_state_job->gres_data; |
| gres_js->gres_per_socket = cnt; |
| in_val = NULL; |
| if ((*gres_js_val->min_nodes != NO_VAL) && |
| (*gres_js_val->sockets_per_node != NO_VAL16)) { |
| cnt *= (*gres_js_val->min_nodes * |
| *gres_js_val->sockets_per_node); |
| } else if ((*gres_js_val->num_tasks != NO_VAL) && |
| (*gres_js_val->ntasks_per_socket != |
| NO_VAL16)) { |
| cnt *= ROUNDUP(*gres_js_val->num_tasks, |
| *gres_js_val->ntasks_per_socket); |
| } else if (*gres_js_val->sockets_per_node != NO_VAL16) { |
| /* default 1 node */ |
| cnt *= *gres_js_val->sockets_per_node; |
| } |
| gres_js->total_gres = |
| MAX(gres_js->total_gres, cnt); |
| gres_js->ntasks_per_gres = |
| *gres_js_val->ntasks_per_tres; |
| } |
| } |
| if (tres_per_task) { |
| char *in_val = tres_per_task, *save_ptr = NULL; |
| while ((gres_state_job = _get_next_job_gres(in_val, &cnt, |
| *gres_js_val-> |
| gres_list, |
| &save_ptr, &rc))) { |
| if (!requested_gpu && |
| (!xstrcmp(gres_state_job->gres_name, "gpu"))) |
| requested_gpu = true; |
| gres_js = gres_state_job->gres_data; |
| gres_js->gres_per_task = cnt; |
| in_val = NULL; |
| if (*gres_js_val->num_tasks != NO_VAL) |
| cnt *= *gres_js_val->num_tasks; |
| gres_js->total_gres = |
| MAX(gres_js->total_gres, cnt); |
| gres_js->ntasks_per_gres = |
| *gres_js_val->ntasks_per_tres; |
| } |
| } |
| if (mem_per_tres) { |
| char *in_val = mem_per_tres, *save_ptr = NULL; |
| while ((gres_state_job = _get_next_job_gres(in_val, &cnt, |
| *gres_js_val-> |
| gres_list, |
| &save_ptr, &rc))) { |
| gres_js = gres_state_job->gres_data; |
| gres_js->mem_per_gres = cnt; |
| in_val = NULL; |
| gres_js->ntasks_per_gres = |
| *gres_js_val->ntasks_per_tres; |
| } |
| } |
| |
| /* |
| * *gres_js_val->num_tasks and *gres_js_val->ntasks_per_tres could be 0 |
| * on requeue |
| */ |
| if (!gres_js_val->ntasks_per_tres || |
| !*gres_js_val->ntasks_per_tres || |
| (*gres_js_val->ntasks_per_tres == NO_VAL16)) { |
| /* do nothing */ |
| } else if (requested_gpu && list_count(*gres_js_val->gres_list)) { |
| /* Set num_tasks = gpus * ntasks/gpu */ |
| uint64_t gpus = _get_job_gres_list_cnt( |
| *gres_js_val->gres_list, "gpu", NULL); |
| if (gpus != NO_VAL64) |
| *gres_js_val->num_tasks = |
| gpus * *gres_js_val->ntasks_per_tres; |
| else { |
| error("%s: Can't set num_tasks = gpus * *ntasks_per_tres because there are no allocated GPUs", |
| __func__); |
| rc = ESLURM_INVALID_GRES; |
| } |
| } else if (*gres_js_val->num_tasks && |
| (*gres_js_val->num_tasks != NO_VAL)) { |
| /* |
| * If job_gres_list empty, and ntasks_per_tres is specified, |
| * then derive GPUs according to how many tasks there are. |
| * GPU GRES = [ntasks / (ntasks_per_tres)] |
| * For now, only generate type-less GPUs. |
| */ |
| uint32_t gpus = *gres_js_val->num_tasks / |
| *gres_js_val->ntasks_per_tres; |
| char *save_ptr = NULL, *gres = NULL, *in_val; |
| xstrfmtcat(gres, "gres/gpu:%u", gpus); |
| in_val = gres; |
| while ((gres_state_job = _get_next_job_gres(in_val, &cnt, |
| *gres_js_val-> |
| gres_list, |
| &save_ptr, &rc))) { |
| gres_js = gres_state_job->gres_data; |
| gres_js->ntasks_per_gres = |
| *gres_js_val->ntasks_per_tres; |
| /* Simulate a tres_per_job specification */ |
| gres_js->gres_per_job = cnt; |
| gres_js->total_gres = |
| MAX(gres_js->total_gres, cnt); |
| in_val = NULL; |
| } |
| if (list_count(*gres_js_val->gres_list) == 0) |
| error("%s: Failed to add generated GRES %s (via ntasks_per_tres) to gres_list", |
| __func__, gres); |
| else |
| requested_gpu = true; |
| xfree(gres); |
| } else { |
| error("%s: --ntasks-per-tres needs either a GRES GPU specification or a node/ntask specification", |
| __func__); |
| rc = ESLURM_INVALID_GRES; |
| } |
| |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| if (rc != SLURM_SUCCESS) |
| return rc; |
| size = list_count(*gres_js_val->gres_list); |
| if (size == 0) { |
| FREE_NULL_LIST(*gres_js_val->gres_list); |
| return rc; |
| } |
| |
| /* |
| * If someone requested [mem|cpus]_per_tres but didn't request any GPUs |
| * (even if --exclusive was used), then error. For now we only test for |
| * GPUs since --[mem|cpus]-per-gpu are the only allowed |
| * [mem|cpus]_per_gres options. Even though --exclusive means that you |
| * will be allocated all of the GRES on the node, we still require that |
| * GPUs are explicitly requested when --[mem|cpus]-per-gpu is used. |
| */ |
| if (mem_per_tres && (!requested_gpu)) { |
| error("Requested mem_per_tres=%s but did not request any GPU.", |
| mem_per_tres); |
| return ESLURM_INVALID_GRES; |
| } |
| if (cpus_per_tres && (!requested_gpu)) { |
| error("Requested cpus_per_tres=%s but did not request any GPU.", |
| cpus_per_tres); |
| return ESLURM_INVALID_GRES; |
| } |
| |
| /* |
| * Check for record overlap (e.g. "gpu:2,gpu:tesla:1") |
| * Ensure tres_per_job >= tres_per_node >= tres_per_socket |
| */ |
| job_validate.over_array = xcalloc(size, sizeof(overlap_check_t)); |
| |
| (void) list_for_each(*gres_js_val->gres_list, |
| _foreach_job_state_validate, |
| &job_validate); |
| |
| if (job_validate.tmp_min_cpus > *gres_js_val->min_cpus) |
| *gres_js_val->min_cpus = job_validate.tmp_min_cpus; |
| |
| if (((*gres_js_val->cpus_per_task) != NO_VAL16) && |
| ((*gres_js_val->num_tasks) != NO_VAL)) { |
| cnt = (*gres_js_val->cpus_per_task) * (*gres_js_val->num_tasks); |
| if (*gres_js_val->min_cpus < cnt) |
| *gres_js_val->min_cpus = cnt; |
| } |
| |
| if (job_validate.have_gres_shared && |
| (job_validate.rc == SLURM_SUCCESS) && |
| tres_freq && |
| strstr(tres_freq, "gpu")) { |
| job_validate.rc = ESLURM_INVALID_GRES; |
| } |
| |
| if (job_validate.overlap_merge) /* Merge generic data if possible */ |
| job_validate.rc = _merge_generic_data(*gres_js_val->gres_list, |
| &job_validate); |
| |
| xfree(job_validate.over_array); |
| |
| return job_validate.rc; |
| } |
| |
| static int _find_gres_per_jst(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| |
| if (gres_js->gres_per_job || |
| gres_js->gres_per_socket || |
| gres_js->gres_per_task) |
| return 1; |
| |
| return 0; |
| } |
| |
| /* |
| * Determine if a job's specified GRES can be supported. This is designed to |
| * prevent the running of a job using the GRES options only supported by the |
| * select/cons_tres plugin when switching (on slurmctld restart) from the |
| * cons_tres plugin to any other select plugin. |
| * |
| * IN gres_list - List of GRES records for this job to track usage |
| * RET SLURM_SUCCESS or ESLURM_INVALID_GRES |
| */ |
| extern int gres_job_revalidate(list_t *gres_list) |
| { |
| if (!gres_list || running_cons_tres()) |
| return SLURM_SUCCESS; |
| |
| if (list_find_first(gres_list, _find_gres_per_jst, NULL)) |
| return ESLURM_UNSUPPORTED_GRES; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Return TRUE if any of this job's GRES has a populated gres_bit_alloc element. |
| * This indicates the allocated GRES has a File configuration parameter and is |
| * tracking individual file assignments. |
| */ |
| static int _find_job_has_gres_bits(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js = gres_state_job->gres_data;; |
| |
| for (int i = 0; i < gres_js->node_cnt; i++) { |
| if (gres_js->gres_bit_alloc && gres_js->gres_bit_alloc[i]) |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| |
| static int _find_invalid_job_gres_on_node(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| validate_job_gres_cnt_t *validate_job_gres_cnt = arg; |
| gres_state_t *gres_state_node; |
| uint32_t plugin_id; |
| int job_gres_cnt, node_gres_cnt = 0; |
| |
| if (!gres_js || |
| !gres_js->gres_bit_alloc || |
| (gres_js->node_cnt <= validate_job_gres_cnt->node_inx) || |
| !gres_js->gres_bit_alloc[validate_job_gres_cnt->node_inx]) |
| return 0; |
| |
| job_gres_cnt = bit_size( |
| gres_js->gres_bit_alloc[validate_job_gres_cnt->node_inx]); |
| |
| if (gres_id_shared(gres_state_job->config_flags)) |
| plugin_id = gpu_plugin_id; |
| else |
| plugin_id = gres_state_job->plugin_id; |
| |
| if ((gres_state_node = list_find_first(validate_job_gres_cnt-> |
| node_gres_list, |
| gres_find_id, |
| &plugin_id))) { |
| gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| node_gres_cnt = (int) gres_ns->gres_cnt_config; |
| if (gres_js->type_id) { |
| bool found_type = false; |
| gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| |
| for (int i = 0; i < gres_ns->type_cnt; i++) { |
| if (gres_ns->type_id[i] == gres_js->type_id) { |
| found_type = true; |
| break; |
| } |
| } |
| if (!found_type) { |
| error("%s: Killing job %u: gres/%s type %s not found on node %s", |
| __func__, |
| validate_job_gres_cnt->job_id, |
| gres_state_job->gres_name, |
| gres_js->type_name, |
| validate_job_gres_cnt->node_name); |
| return 1; |
| } |
| } |
| } |
| |
| if (job_gres_cnt != node_gres_cnt) { |
| error("%s: Killing job %u: gres/%s count mismatch on node " |
| "%s (%d != %d)", |
| __func__, validate_job_gres_cnt->job_id, |
| gres_state_job->gres_name, |
| validate_job_gres_cnt-> node_name, |
| job_gres_cnt, node_gres_cnt); |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Determine if a job's specified GRES are currently valid. This is designed to |
| * manage jobs allocated GRES which are either no longer supported or a GRES |
| * configured with the "File" option in gres.conf where the count has changed, |
| * in which case we don't know how to map the job's old GRES bitmap onto the |
| * current GRES bitmaps. |
| * |
| * IN job_id - ID of job being validated (used for logging) |
| * IN job_gres_list - List of GRES records for this job to track usage |
| * RET SLURM_SUCCESS or ESLURM_INVALID_GRES |
| */ |
| extern int gres_job_revalidate2(uint32_t job_id, list_t *job_gres_list, |
| bitstr_t *node_bitmap) |
| { |
| node_record_t *node_ptr; |
| int rc = SLURM_SUCCESS; |
| validate_job_gres_cnt_t validate_job_gres_cnt = { |
| .job_id = job_id, |
| .node_inx = -1, |
| }; |
| |
| if (!job_gres_list || !node_bitmap || |
| !list_find_first(job_gres_list, _find_job_has_gres_bits, NULL)) |
| return SLURM_SUCCESS; |
| |
| for (int i = 0; (node_ptr = next_node_bitmap(node_bitmap, &i)); i++) { |
| /* If no node_ptr->gres_list we are invalid */ |
| if (!node_ptr->gres_list) |
| return ESLURM_INVALID_GRES; |
| |
| validate_job_gres_cnt.node_inx++; |
| validate_job_gres_cnt.node_gres_list = node_ptr->gres_list; |
| validate_job_gres_cnt.node_name = node_ptr->name; |
| |
| if (list_find_first(job_gres_list, |
| _find_invalid_job_gres_on_node, |
| &validate_job_gres_cnt)) |
| return ESLURM_INVALID_GRES; |
| } |
| |
| return rc; |
| } |
| |
| /* |
| * Find a sock_gres_t record in a list by matching the plugin_id and type_id |
| * from a gres_state_t job record |
| * IN x - a sock_gres_t record to test |
| * IN key - the gres_state_t record (from a job) we want to match |
| * RET 1 on match, otherwise 0 |
| */ |
| extern int gres_find_sock_by_job_state(void *x, void *key) |
| { |
| sock_gres_t *sock_data = (sock_gres_t *) x; |
| gres_state_t *job_gres_state = (gres_state_t *) key; |
| gres_job_state_t *sock_gres_js, *gres_js; |
| |
| gres_js = (gres_job_state_t *) job_gres_state->gres_data; |
| sock_gres_js = sock_data->gres_state_job->gres_data; |
| |
| if ((sock_data->gres_state_job->plugin_id == |
| job_gres_state->plugin_id) && |
| (sock_gres_js->type_id == gres_js->type_id)) |
| return 1; |
| return 0; |
| } |
| |
| /* |
| * Create a (partial) copy of a job's gres state for job binding |
| * IN gres_list - List of Gres records for this job to track usage |
| * RET The copy or NULL on failure |
| * NOTE: Only job details are copied, NOT the job step details |
| */ |
| extern list_t *gres_job_state_list_dup(list_t *gres_list) |
| { |
| return gres_job_state_extract(gres_list, -1); |
| } |
| |
| static gres_job_state_t *_job_state_dup_common(gres_job_state_t *gres_js) |
| { |
| gres_job_state_t *new_gres_js = xmalloc(sizeof(gres_job_state_t)); |
| |
| new_gres_js->cpus_per_gres = gres_js->cpus_per_gres; |
| new_gres_js->def_cpus_per_gres = gres_js->def_cpus_per_gres; |
| new_gres_js->def_mem_per_gres = gres_js->def_mem_per_gres; |
| new_gres_js->flags = gres_js->flags; |
| new_gres_js->gres_per_job = gres_js->gres_per_job; |
| new_gres_js->gres_per_node = gres_js->gres_per_node; |
| new_gres_js->gres_per_socket = gres_js->gres_per_socket; |
| new_gres_js->gres_per_task = gres_js->gres_per_task; |
| new_gres_js->mem_per_gres = gres_js->mem_per_gres; |
| new_gres_js->ntasks_per_gres = gres_js->ntasks_per_gres; |
| new_gres_js->node_cnt = gres_js->node_cnt; |
| new_gres_js->res_array_size = gres_js->res_array_size; |
| new_gres_js->total_gres = gres_js->total_gres; |
| new_gres_js->total_node_cnt = gres_js->total_node_cnt; |
| new_gres_js->type_id = gres_js->type_id; |
| new_gres_js->type_name = xstrdup(gres_js->type_name); |
| |
| return new_gres_js; |
| } |
| |
| /* Copy gres_job_state_t record for ALL nodes */ |
| extern void *gres_job_state_dup(gres_job_state_t *gres_js) |
| { |
| |
| int i; |
| gres_job_state_t *new_gres_js; |
| |
| if (gres_js == NULL) |
| return NULL; |
| |
| new_gres_js = _job_state_dup_common(gres_js); |
| |
| if (gres_js->gres_cnt_node_alloc) { |
| i = sizeof(uint64_t) * gres_js->node_cnt; |
| new_gres_js->gres_cnt_node_alloc = xmalloc(i); |
| memcpy(new_gres_js->gres_cnt_node_alloc, |
| gres_js->gres_cnt_node_alloc, i); |
| } |
| if (gres_js->gres_cnt_step_alloc) { |
| new_gres_js->gres_cnt_step_alloc = xcalloc( |
| gres_js->node_cnt, |
| sizeof(*new_gres_js->gres_cnt_step_alloc)); |
| memcpy(new_gres_js->gres_cnt_step_alloc, |
| gres_js->gres_cnt_step_alloc, |
| (sizeof(*new_gres_js->gres_cnt_step_alloc) * |
| gres_js->node_cnt)); |
| } |
| if (gres_js->gres_bit_alloc) { |
| new_gres_js->gres_bit_alloc = xcalloc(gres_js->node_cnt, |
| sizeof(bitstr_t *)); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| if (gres_js->gres_bit_alloc[i] == NULL) |
| continue; |
| new_gres_js->gres_bit_alloc[i] = |
| bit_copy(gres_js->gres_bit_alloc[i]); |
| } |
| } |
| if (gres_js->gres_per_bit_alloc && gres_js->gres_bit_alloc) { |
| new_gres_js->gres_per_bit_alloc = xcalloc(gres_js->node_cnt, |
| sizeof(uint64_t *)); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| int bit_cnt = bit_size(gres_js->gres_bit_alloc[i]); |
| new_gres_js->gres_per_bit_alloc[i] = xcalloc( |
| bit_cnt, sizeof(uint64_t)); |
| memcpy(new_gres_js->gres_per_bit_alloc[i], |
| gres_js->gres_per_bit_alloc[i], bit_cnt); |
| } |
| } |
| if (gres_js->gres_bit_step_alloc) { |
| new_gres_js->gres_bit_step_alloc = xcalloc(gres_js->node_cnt, |
| sizeof(bitstr_t *)); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| if (!gres_js->gres_bit_step_alloc[i]) |
| continue; |
| new_gres_js->gres_bit_step_alloc[i] = |
| bit_copy(gres_js->gres_bit_step_alloc[i]); |
| } |
| } |
| if (gres_js->gres_per_bit_step_alloc && gres_js->gres_bit_alloc) { |
| new_gres_js->gres_per_bit_step_alloc = xcalloc( |
| gres_js->node_cnt, sizeof(uint64_t *)); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| int bit_cnt = bit_size(gres_js->gres_bit_alloc[i]); |
| new_gres_js->gres_per_bit_step_alloc[i] = xcalloc( |
| bit_cnt, sizeof(uint64_t)); |
| memcpy(new_gres_js->gres_per_bit_step_alloc[i], |
| gres_js->gres_per_bit_step_alloc[i], |
| bit_cnt * sizeof(uint64_t)); |
| } |
| } |
| if (gres_js->gres_cnt_node_select) { |
| i = sizeof(uint64_t) * gres_js->total_node_cnt; |
| new_gres_js->gres_cnt_node_select = xmalloc(i); |
| memcpy(new_gres_js->gres_cnt_node_select, |
| gres_js->gres_cnt_node_select, i); |
| } |
| if (gres_js->gres_bit_select) { |
| new_gres_js->gres_bit_select = xcalloc(gres_js->total_node_cnt, |
| sizeof(bitstr_t *)); |
| for (i = 0; i < gres_js->total_node_cnt; i++) { |
| if (gres_js->gres_bit_select[i] == NULL) |
| continue; |
| new_gres_js->gres_bit_select[i] = |
| bit_copy(gres_js->gres_bit_select[i]); |
| } |
| } |
| if (gres_js->gres_per_bit_select && gres_js->gres_bit_select) { |
| new_gres_js->gres_per_bit_select = |
| xcalloc(gres_js->total_node_cnt, sizeof(uint64_t *)); |
| for (i = 0; i < gres_js->total_node_cnt; i++) { |
| int bit_cnt; |
| |
| if (!gres_js->gres_bit_select[i]) |
| continue; |
| |
| bit_cnt = bit_size(gres_js->gres_bit_select[i]); |
| new_gres_js->gres_per_bit_select[i] = xcalloc( |
| bit_cnt, sizeof(uint64_t)); |
| memcpy(new_gres_js->gres_per_bit_select[i], |
| gres_js->gres_per_bit_select[i], bit_cnt); |
| } |
| } |
| |
| if (gres_js->res_gpu_cores) { |
| new_gres_js->res_gpu_cores = xcalloc(gres_js->res_array_size, |
| sizeof(bitstr_t *)); |
| for (i = 0; i < gres_js->res_array_size; i++) { |
| if (gres_js->res_gpu_cores[i] == NULL) |
| continue; |
| new_gres_js->res_gpu_cores[i] = |
| bit_copy(gres_js->res_gpu_cores[i]); |
| } |
| } |
| |
| return new_gres_js; |
| } |
| |
| /* Copy gres_job_state_t record for one specific node (stepd) */ |
| static void *_job_state_dup2(gres_job_state_t *gres_js, int job_node_index) |
| { |
| gres_job_state_t *new_gres_js; |
| |
| if (gres_js == NULL) |
| return NULL; |
| |
| new_gres_js = _job_state_dup_common(gres_js); |
| new_gres_js->total_node_cnt = 1; |
| new_gres_js->node_cnt = 1; |
| |
| if (gres_js->gres_cnt_node_alloc) { |
| new_gres_js->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t)); |
| new_gres_js->gres_cnt_node_alloc[0] = |
| gres_js->gres_cnt_node_alloc[job_node_index]; |
| } |
| if (gres_js->gres_bit_alloc && |
| gres_js->gres_bit_alloc[job_node_index]) { |
| new_gres_js->gres_bit_alloc = xmalloc(sizeof(bitstr_t *)); |
| new_gres_js->gres_bit_alloc[0] = |
| bit_copy(gres_js->gres_bit_alloc[job_node_index]); |
| } |
| if (gres_js->gres_per_bit_alloc && |
| gres_js->gres_bit_alloc && |
| gres_js->gres_bit_alloc[job_node_index]) { |
| new_gres_js->gres_per_bit_alloc = xmalloc(sizeof(uint64_t *)); |
| new_gres_js->gres_per_bit_alloc[0] = xcalloc( |
| bit_size(gres_js->gres_bit_alloc[job_node_index]), |
| sizeof(uint64_t)); |
| memcpy(new_gres_js->gres_per_bit_alloc[0], |
| gres_js->gres_per_bit_alloc[job_node_index], |
| bit_size(gres_js->gres_bit_alloc[job_node_index]) * |
| sizeof(uint64_t)); |
| } |
| |
| /* |
| * No reason to do |
| * |
| * gres_js->gres_cnt_node_select |
| * gres_js->gres_bit_select |
| * |
| * they are based off the entire cluster this is not needed for the |
| * stepd. |
| */ |
| |
| return new_gres_js; |
| } |
| |
| static int _foreach_job_state_extract(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| job_state_extract_t *job_state_extract = arg; |
| gres_state_t *new_gres_state; |
| void *new_gres_data; |
| |
| if (job_state_extract->job_node_index == -1) |
| new_gres_data = gres_job_state_dup( |
| gres_state_job->gres_data); |
| else |
| new_gres_data = _job_state_dup2( |
| gres_state_job->gres_data, |
| job_state_extract->job_node_index); |
| |
| if (!new_gres_data) |
| return -1; |
| |
| if (!job_state_extract->new_list) |
| job_state_extract->new_list = list_create(gres_job_list_delete); |
| |
| new_gres_state = gres_create_state( |
| gres_state_job, GRES_STATE_SRC_STATE_PTR, |
| GRES_STATE_TYPE_JOB, new_gres_data); |
| list_append(job_state_extract->new_list, new_gres_state); |
| |
| return 0; |
| } |
| |
| /* |
| * Create a (partial) copy of a job's gres state for a particular node index |
| * IN gres_list - List of Gres records for this job to track usage |
| * IN job_node_index - zero-origin index to the node |
| * RET The copy or NULL on failure |
| */ |
| extern list_t *gres_job_state_extract(list_t *gres_list, int job_node_index) |
| { |
| job_state_extract_t job_state_extract = { |
| .job_node_index = job_node_index, |
| }; |
| |
| if (gres_list) |
| (void) list_for_each(gres_list, |
| _foreach_job_state_extract, |
| &job_state_extract); |
| |
| return job_state_extract.new_list; |
| } |
| |
| /* |
| * Pack a job's current gres status, called from slurmctld for save/restore |
| * IN gres_list - generated by gres_job_config_validate() |
| * IN/OUT buffer - location to write state to |
| * IN job_id - job's ID |
| * IN details - if set then pack job step allocation details (only needed to |
| * save/restore job state, not needed in job credential for |
| * slurmd task binding) |
| * |
| * NOTE: A job's allocation to steps is not recorded here, but recovered with |
| * the job step state information upon slurmctld restart. |
| */ |
| extern int gres_job_state_pack(list_t *gres_list, buf_t *buffer, |
| uint32_t job_id, bool details, |
| uint16_t protocol_version) |
| { |
| pack_state_t pack_state = { |
| .buffer = buffer, |
| .details = details, |
| .magic = GRES_MAGIC, |
| .protocol_version = protocol_version, |
| }; |
| |
| return _pack_state(gres_list, &pack_state, _foreach_job_state_pack); |
| } |
| |
| /* |
| * Unpack a job's current gres status, called from slurmctld for save/restore |
| * OUT gres_list - restored state stored by gres_job_state_pack() |
| * IN/OUT buffer - location to read state from |
| * IN job_id - job's ID |
| */ |
| extern int gres_job_state_unpack(list_t **gres_list, buf_t *buffer, |
| uint32_t job_id, |
| uint16_t protocol_version) |
| { |
| int i = 0, rc = SLURM_SUCCESS; |
| uint32_t magic = 0, plugin_id = 0, utmp32 = 0; |
| uint16_t rec_cnt = 0; |
| uint8_t has_more = 0; |
| gres_state_t *gres_state_job; |
| gres_job_state_t *gres_js = NULL; |
| bool locked = false; |
| |
| safe_unpack16(&rec_cnt, buffer); |
| if (rec_cnt == 0) |
| return SLURM_SUCCESS; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| locked = true; |
| if ((gres_context_cnt > 0) && (*gres_list == NULL)) { |
| *gres_list = list_create(gres_job_list_delete); |
| } |
| |
| while ((rc == SLURM_SUCCESS) && (rec_cnt)) { |
| slurm_gres_context_t *gres_ctx; |
| if ((buffer == NULL) || (remaining_buf(buffer) == 0)) |
| break; |
| rec_cnt--; |
| |
| if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| safe_unpack32(&magic, buffer); |
| if (magic != GRES_MAGIC) |
| goto unpack_error; |
| safe_unpack32(&plugin_id, buffer); |
| gres_js = xmalloc(sizeof(gres_job_state_t)); |
| safe_unpack16(&gres_js->cpus_per_gres, buffer); |
| safe_unpack16(&gres_js->flags, buffer); |
| safe_unpack64(&gres_js->gres_per_job, buffer); |
| safe_unpack64(&gres_js->gres_per_node, buffer); |
| safe_unpack64(&gres_js->gres_per_socket, buffer); |
| safe_unpack64(&gres_js->gres_per_task, buffer); |
| safe_unpack64(&gres_js->mem_per_gres, buffer); |
| safe_unpack16(&gres_js->ntasks_per_gres, buffer); |
| safe_unpack64(&gres_js->total_gres, buffer); |
| safe_unpackstr(&gres_js->type_name, buffer); |
| gres_js->type_id = |
| gres_build_id(gres_js->type_name); |
| safe_unpack32(&gres_js->node_cnt, buffer); |
| if (gres_js->node_cnt > NO_VAL) |
| goto unpack_error; |
| |
| safe_unpack8(&has_more, buffer); |
| if (has_more) { |
| safe_unpack64_array( |
| &gres_js->gres_cnt_node_alloc, |
| &utmp32, buffer); |
| } |
| |
| safe_unpack8(&has_more, buffer); |
| if (has_more) { |
| safe_xcalloc(gres_js->gres_bit_alloc, |
| gres_js->node_cnt, |
| sizeof(bitstr_t *)); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| unpack_bit_str_hex(&gres_js-> |
| gres_bit_alloc[i], |
| buffer); |
| } |
| } |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| safe_unpack8(&has_more, buffer); |
| if (!has_more) |
| continue; |
| if (!gres_js->gres_per_bit_alloc) |
| safe_xcalloc( |
| gres_js->gres_per_bit_alloc, |
| gres_js->node_cnt, |
| sizeof(uint64_t *)); |
| safe_unpack64_array( |
| &gres_js->gres_per_bit_alloc[i], |
| &utmp32, buffer); |
| } |
| safe_unpack8(&has_more, buffer); |
| if (has_more) { |
| safe_xcalloc(gres_js->gres_bit_step_alloc, |
| gres_js->node_cnt, |
| sizeof(bitstr_t *)); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| unpack_bit_str_hex(&gres_js-> |
| gres_bit_step_alloc[i], |
| buffer); |
| } |
| } |
| safe_unpack8(&has_more, buffer); |
| if (has_more) { |
| safe_xcalloc(gres_js->gres_cnt_step_alloc, |
| gres_js->node_cnt, |
| sizeof(uint64_t)); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| safe_unpack64(&gres_js-> |
| gres_cnt_step_alloc[i], |
| buffer); |
| } |
| } |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| safe_unpack8(&has_more, buffer); |
| if (!has_more) |
| continue; |
| if (!gres_js->gres_per_bit_step_alloc) |
| safe_xcalloc( |
| gres_js->gres_per_bit_step_alloc, |
| gres_js->node_cnt, |
| sizeof(uint64_t *)); |
| safe_unpack64_array( |
| &gres_js->gres_per_bit_step_alloc[i], |
| &utmp32, buffer); |
| } |
| } else { |
| error("%s: protocol_version %hu not supported", |
| __func__, protocol_version); |
| goto unpack_error; |
| } |
| |
| if (!(gres_ctx = _find_context_by_id(plugin_id))) { |
| /* |
| * A likely sign that GresPlugins has changed. |
| * Not a fatal error, skip over the data. |
| */ |
| error("%s: no plugin configured to unpack data type %u from job %u. This is likely due to a difference in the GresTypes configured in slurm.conf on different cluster nodes.", |
| __func__, plugin_id, job_id); |
| gres_job_state_delete(gres_js); |
| continue; |
| } |
| |
| gres_state_job = gres_create_state( |
| gres_ctx, GRES_STATE_SRC_CONTEXT_PTR, |
| GRES_STATE_TYPE_JOB, gres_js); |
| gres_js = NULL; /* nothing left to free on error */ |
| list_append(*gres_list, gres_state_job); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| return rc; |
| |
| unpack_error: |
| error("%s: unpack error from job %u", __func__, job_id); |
| if (gres_js) |
| gres_job_state_delete(gres_js); |
| if (locked) |
| slurm_mutex_unlock(&gres_context_lock); |
| return SLURM_ERROR; |
| } |
| |
| extern void gres_prep_pack(void *in, uint16_t protocol_version, buf_t *buffer) |
| { |
| uint32_t magic = GRES_MAGIC; |
| gres_prep_t *gres_prep = in; |
| |
| if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| pack32(magic, buffer); |
| pack32(gres_prep->plugin_id, buffer); |
| pack32(gres_prep->node_cnt, buffer); |
| if (gres_prep->gres_cnt_node_alloc) { |
| pack8((uint8_t) 1, buffer); |
| pack64_array(gres_prep->gres_cnt_node_alloc, |
| gres_prep->node_cnt, buffer); |
| } else { |
| pack8((uint8_t) 0, buffer); |
| } |
| if (gres_prep->gres_bit_alloc) { |
| pack8((uint8_t) 1, buffer); |
| for (int i = 0; i < gres_prep->node_cnt; i++) { |
| pack_bit_str_hex(gres_prep-> |
| gres_bit_alloc[i], |
| buffer); |
| } |
| } else { |
| pack8((uint8_t) 0, buffer); |
| } |
| } else { |
| error("%s: protocol_version %hu not supported", |
| __func__, protocol_version); |
| } |
| } |
| |
| /* |
| * Pack a job's allocated gres information for use by prolog/epilog |
| * IN gres_list - generated by gres_job_config_validate() |
| * IN/OUT buffer - location to write state to |
| * |
| * When 24.11 is no longer supported this can be removed. |
| */ |
| extern int gres_prep_pack_legacy(list_t *gres_list, buf_t *buffer, |
| uint16_t protocol_version) |
| { |
| int rc = SLURM_SUCCESS; |
| uint32_t top_offset, tail_offset; |
| uint16_t rec_cnt = 0; |
| list_itr_t *gres_iter; |
| gres_prep_t *gres_prep; |
| |
| top_offset = get_buf_offset(buffer); |
| pack16(rec_cnt, buffer); /* placeholder if data */ |
| |
| if (gres_list == NULL) |
| return rc; |
| |
| if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) { |
| error("%s: protocol_version %hu not supported", |
| __func__, protocol_version); |
| return rc; |
| } |
| |
| gres_iter = list_iterator_create(gres_list); |
| while ((gres_prep = list_next(gres_iter))) { |
| gres_prep_pack(gres_prep, protocol_version, buffer); |
| rec_cnt++; |
| } |
| list_iterator_destroy(gres_iter); |
| |
| tail_offset = get_buf_offset(buffer); |
| set_buf_offset(buffer, top_offset); |
| pack16(rec_cnt, buffer); |
| set_buf_offset(buffer, tail_offset); |
| |
| return rc; |
| } |
| |
| static void _prep_list_del(void *x) |
| { |
| gres_prep_t *gres_prep = (gres_prep_t *) x; |
| int i; |
| |
| if (!gres_prep) |
| return; |
| |
| if (gres_prep->gres_bit_alloc) { |
| for (i = 0; i < gres_prep->node_cnt; i++) |
| FREE_NULL_BITMAP(gres_prep->gres_bit_alloc[i]); |
| xfree(gres_prep->gres_bit_alloc); |
| } |
| xfree(gres_prep->gres_cnt_node_alloc); |
| xfree(gres_prep->node_list); |
| xfree(gres_prep); |
| } |
| |
| static int _gres_prep_unpack(void **object, uint16_t protocol_version, |
| buf_t *buffer) |
| { |
| uint32_t magic = 0, utmp32 = 0; |
| uint8_t filled = 0; |
| gres_prep_t *gres_prep = NULL; |
| |
| gres_prep = xmalloc(sizeof(gres_prep_t)); |
| |
| if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| safe_unpack32(&magic, buffer); |
| if (magic != GRES_MAGIC) |
| goto unpack_error; |
| safe_unpack32(&gres_prep->plugin_id, buffer); |
| safe_unpack32(&gres_prep->node_cnt, buffer); |
| if (gres_prep->node_cnt > NO_VAL) |
| goto unpack_error; |
| safe_unpack8(&filled, buffer); |
| if (filled) { |
| safe_unpack64_array( |
| &gres_prep->gres_cnt_node_alloc, |
| &utmp32, buffer); |
| } |
| safe_unpack8(&filled, buffer); |
| if (filled) { |
| safe_xcalloc(gres_prep->gres_bit_alloc, |
| gres_prep->node_cnt, |
| sizeof(bitstr_t *)); |
| for (int i = 0; i < gres_prep->node_cnt; i++) { |
| unpack_bit_str_hex(&gres_prep-> |
| gres_bit_alloc[i], |
| buffer); |
| } |
| } |
| } else { |
| error("%s: protocol_version %hu not supported", |
| __func__, protocol_version); |
| goto unpack_error; |
| } |
| |
| if (!_find_context_by_id(gres_prep->plugin_id)) { |
| /* |
| * A likely sign that GresPlugins has changed. |
| * Not a fatal error, skip over the data. |
| */ |
| error("%s: no plugin configured to unpack data type %u", |
| __func__, gres_prep->plugin_id); |
| _prep_list_del(gres_prep); |
| gres_prep = NULL; |
| /* Don't return SLURM_ERROR */ |
| } |
| |
| *object = gres_prep; |
| |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| error("%s: unpack error", __func__); |
| _prep_list_del(gres_prep); |
| |
| return SLURM_ERROR; |
| } |
| |
| extern int gres_prep_unpack_list(list_t **out, buf_t *buffer, |
| uint16_t protocol_version) |
| { |
| int rc = SLURM_SUCCESS; |
| |
| /* We have to have gres_context_lock locked to call the unpack */ |
| slurm_mutex_lock(&gres_context_lock); |
| if ((rc = slurm_unpack_list(out, _gres_prep_unpack, _prep_list_del, |
| buffer, protocol_version)) != SLURM_SUCCESS) |
| FREE_NULL_LIST(*out); |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return rc; |
| } |
| |
| /* |
| * Unpack a job's allocated gres information for use by prolog/epilog |
| * OUT gres_list - restored state stored by gres_prep_pack() |
| * IN/OUT buffer - location to read state from |
| * |
| * When 24.11 is no longer supported this can be removed. |
| */ |
| extern int gres_prep_unpack_legacy(list_t **gres_list, buf_t *buffer, |
| uint16_t protocol_version) |
| { |
| int rc = SLURM_SUCCESS; |
| uint16_t rec_cnt = 0; |
| gres_prep_t *gres_prep = NULL; |
| bool locked = false; |
| |
| safe_unpack16(&rec_cnt, buffer); |
| if (rec_cnt == 0) |
| return SLURM_SUCCESS; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| locked = true; |
| if ((gres_context_cnt > 0) && (*gres_list == NULL)) { |
| *gres_list = list_create(_prep_list_del); |
| } |
| |
| while ((rc == SLURM_SUCCESS) && (rec_cnt)) { |
| if ((buffer == NULL) || (remaining_buf(buffer) == 0)) |
| break; |
| rec_cnt--; |
| |
| if (_gres_prep_unpack((void **)&gres_prep, protocol_version, |
| buffer) != SLURM_SUCCESS) |
| goto unpack_error; |
| |
| if (gres_prep) { |
| list_append(*gres_list, gres_prep); |
| gres_prep = NULL; |
| } |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| return rc; |
| |
| unpack_error: |
| error("%s: unpack error", __func__); |
| if (gres_prep) |
| _prep_list_del(gres_prep); |
| if (locked) |
| slurm_mutex_unlock(&gres_context_lock); |
| return SLURM_ERROR; |
| } |
| |
| |
| static int _foreach_prep_build_env(void *x, void *arg) |
| { |
| gres_state_t *gres_ptr = x; |
| foreach_prep_build_env_t *foreach_prep_build_env = arg; |
| slurm_gres_context_t *gres_ctx; |
| gres_prep_t *gres_prep; |
| |
| if (!(gres_ctx = _find_context_by_id(gres_ptr->plugin_id))) { |
| error("%s: gres not found in context. This should never happen", |
| __func__); |
| return 0; |
| } |
| |
| if (!gres_ctx->ops.prep_build_env) /* No plugin to call */ |
| return 0; |
| |
| gres_prep = (*(gres_ctx->ops.prep_build_env))(gres_ptr->gres_data); |
| if (!gres_prep) /* No info to add for this plugin */ |
| return 0; |
| |
| if (!foreach_prep_build_env->prep_gres_list) |
| foreach_prep_build_env->prep_gres_list = |
| list_create(_prep_list_del); |
| |
| gres_prep->plugin_id = gres_ctx->plugin_id; |
| gres_prep->node_list = xstrdup(foreach_prep_build_env->node_list); |
| list_append(foreach_prep_build_env->prep_gres_list, gres_prep); |
| |
| return 0; |
| } |
| |
| /* |
| * Build List of information needed to set job's Prolog or Epilog environment |
| * variables |
| * |
| * IN job_gres_list - job's GRES allocation info |
| * IN hostlist - list of nodes associated with the job |
| * RET information about the job's GRES allocation needed by Prolog or Epilog |
| */ |
| extern list_t *gres_g_prep_build_env(list_t *job_gres_list, char *node_list) |
| { |
| foreach_prep_build_env_t foreach_prep_build_env = { |
| .node_list = node_list, |
| }; |
| |
| if (!job_gres_list) |
| return NULL; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| (void) list_for_each(job_gres_list, _foreach_prep_build_env, |
| &foreach_prep_build_env); |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return foreach_prep_build_env.prep_gres_list; |
| } |
| |
| static int _foreach_prep_set_env(void *x, void *arg) |
| { |
| gres_prep_t *gres_prep = x; |
| foreach_prep_set_env_t *foreach_prep_set_env = arg; |
| slurm_gres_context_t *gres_ctx; |
| |
| if (!(gres_ctx = _find_context_by_id(gres_prep->plugin_id))) { |
| error("%s: GRES ID %u not found in context", |
| __func__, gres_prep->plugin_id); |
| return 0; |
| } |
| |
| if (!gres_ctx->ops.prep_set_env) /* No plugin to call */ |
| return 0; |
| |
| (*(gres_ctx->ops.prep_set_env)) |
| (foreach_prep_set_env->prep_env_ptr, gres_prep, |
| foreach_prep_set_env->node_inx); |
| |
| return 0; |
| } |
| |
| /* |
| * Set environment variables as appropriate for a job's prolog or epilog based |
| * GRES allocated to the job. |
| * |
| * IN/OUT prep_env_ptr - environment variable array |
| * IN prep_gres_list - generated by TBD |
| * IN node_inx - zero origin node index |
| */ |
| extern void gres_g_prep_set_env(char ***prep_env_ptr, |
| list_t *prep_gres_list, int node_inx) |
| { |
| foreach_prep_set_env_t foreach_prep_set_env = { |
| .node_inx = node_inx, |
| .prep_env_ptr = prep_env_ptr, |
| }; |
| |
| *prep_env_ptr = NULL; |
| if (!prep_gres_list) |
| return; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| (void) list_for_each(prep_gres_list, _foreach_prep_set_env, |
| &foreach_prep_set_env); |
| slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| /* |
| * If core bitmap from slurmd differs in size from that in slurmctld, |
| * then modify bitmap from slurmd so we can use bit_and, bit_or, etc. |
| */ |
| static bitstr_t *_core_bitmap_rebuild(bitstr_t *old_core_bitmap, int new_size) |
| { |
| int i, j, old_size, ratio; |
| bitstr_t *new_core_bitmap; |
| |
| new_core_bitmap = bit_alloc(new_size); |
| old_size = bit_size(old_core_bitmap); |
| if (old_size > new_size) { |
| ratio = old_size / new_size; |
| for (i = 0; i < new_size; i++) { |
| for (j = 0; j < ratio; j++) { |
| if (bit_test(old_core_bitmap, i*ratio+j)) { |
| bit_set(new_core_bitmap, i); |
| break; |
| } |
| } |
| } |
| } else { |
| ratio = new_size / old_size; |
| for (i = 0; i < old_size; i++) { |
| if (!bit_test(old_core_bitmap, i)) |
| continue; |
| for (j = 0; j < ratio; j++) { |
| bit_set(new_core_bitmap, i*ratio+j); |
| } |
| } |
| } |
| |
| return new_core_bitmap; |
| } |
| |
| extern void gres_validate_node_cores(gres_node_state_t *gres_ns, |
| int cores_ctld, char *node_name) |
| { |
| int i, cores_slurmd; |
| bitstr_t *new_core_bitmap; |
| int log_mismatch = true; |
| |
| if (gres_ns->topo_cnt == 0) |
| return; |
| |
| if (gres_ns->topo_core_bitmap == NULL) { |
| error("Gres topo_core_bitmap is NULL on node %s", node_name); |
| return; |
| } |
| |
| |
| for (i = 0; i < gres_ns->topo_cnt; i++) { |
| if (!gres_ns->topo_core_bitmap[i]) |
| continue; |
| cores_slurmd = bit_size(gres_ns->topo_core_bitmap[i]); |
| if (cores_slurmd == cores_ctld) |
| continue; |
| if (log_mismatch) { |
| debug("Rebuilding node %s gres core bitmap (%d != %d)", |
| node_name, cores_slurmd, cores_ctld); |
| log_mismatch = false; |
| } |
| new_core_bitmap = _core_bitmap_rebuild( |
| gres_ns->topo_core_bitmap[i], |
| cores_ctld); |
| FREE_NULL_BITMAP(gres_ns->topo_core_bitmap[i]); |
| gres_ns->topo_core_bitmap[i] = new_core_bitmap; |
| } |
| } |
| |
| static uint32_t _job_test(gres_state_t *gres_state_job, |
| gres_state_t *gres_state_node, |
| bool use_total_gres, |
| int core_start_bit, int core_end_bit, |
| uint32_t job_id, char *node_name) |
| { |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| char *gres_name = gres_state_job->gres_name; |
| int i, j, core_size, core_ctld, top_inx = -1; |
| uint64_t gres_avail = 0, gres_total, gres_tmp; |
| uint64_t min_gres_node = 0; |
| uint32_t *cores_addnt = NULL; /* Additional cores avail from this GRES */ |
| uint32_t *cores_avail = NULL; /* cores initially avail from this GRES */ |
| uint32_t core_cnt = 0; |
| bitstr_t *alloc_core_bitmap = NULL; |
| bitstr_t *avail_core_bitmap = NULL; |
| bool use_single_dev = (gres_id_shared(gres_state_job->config_flags) && |
| !(slurm_conf.select_type_param & |
| SELECT_MULTIPLE_SHARING_GRES_PJ)); |
| bool use_busy_dev; |
| |
| if (gres_ns->no_consume) |
| use_total_gres = true; |
| |
| use_busy_dev = gres_use_busy_dev(gres_state_node, use_total_gres); |
| |
| /* Determine minimum GRES count needed on this node */ |
| if (gres_js->gres_per_job) |
| min_gres_node = 1; |
| min_gres_node = MAX(min_gres_node, gres_js->gres_per_node); |
| min_gres_node = MAX(min_gres_node, gres_js->gres_per_socket); |
| min_gres_node = MAX(min_gres_node, gres_js->gres_per_task); |
| |
| if (min_gres_node && gres_ns->topo_cnt) { |
| /* Need to determine which specific cores can be used */ |
| gres_avail = gres_ns->gres_cnt_avail; |
| if (!use_total_gres) |
| gres_avail -= gres_ns->gres_cnt_alloc; |
| if (min_gres_node > gres_avail) |
| return (uint32_t) 0; /* insufficient GRES avail */ |
| |
| core_ctld = core_end_bit - core_start_bit + 1; |
| for (i = 0; i < gres_ns->topo_cnt; i++) { |
| if (!gres_ns->topo_core_bitmap[i]) |
| continue; |
| core_ctld = bit_size(gres_ns-> |
| topo_core_bitmap[i]); |
| break; |
| } |
| |
| alloc_core_bitmap = bit_alloc(core_ctld); |
| bit_set_all(alloc_core_bitmap); |
| |
| |
| avail_core_bitmap = bit_copy(alloc_core_bitmap); |
| cores_addnt = xcalloc(gres_ns->topo_cnt, |
| sizeof(uint32_t)); |
| cores_avail = xcalloc(gres_ns->topo_cnt, |
| sizeof(uint32_t)); |
| for (i = 0; i < gres_ns->topo_cnt; i++) { |
| if (gres_ns->topo_gres_cnt_avail[i] == 0) |
| continue; |
| if (use_busy_dev && |
| (gres_ns->topo_gres_cnt_alloc[i] == 0)) |
| continue; |
| if (!use_total_gres && |
| (gres_ns->topo_gres_cnt_alloc[i] >= |
| gres_ns->topo_gres_cnt_avail[i])) |
| continue; |
| if (gres_js->type_name && |
| (!gres_ns->topo_type_name[i] || |
| (gres_ns->topo_type_id[i] != |
| gres_js->type_id))) |
| continue; |
| if (!gres_ns->topo_core_bitmap[i]) { |
| cores_avail[i] = core_end_bit - |
| core_start_bit + 1; |
| continue; |
| } |
| core_size = bit_size(gres_ns->topo_core_bitmap[i]); |
| for (j = 0; j < core_size; j++) { |
| if (bit_test(gres_ns-> |
| topo_core_bitmap[i], j)) { |
| cores_avail[i]++; |
| } |
| } |
| } |
| |
| /* Pick the topology entries with the most cores available */ |
| gres_avail = 0; |
| gres_total = 0; |
| while (gres_avail < min_gres_node) { |
| top_inx = -1; |
| for (j = 0; j < gres_ns->topo_cnt; j++) { |
| if ((gres_avail == 0) || |
| (cores_avail[j] == 0) || |
| !gres_ns->topo_core_bitmap[j]) { |
| cores_addnt[j] = cores_avail[j]; |
| } else { |
| cores_addnt[j] = cores_avail[j] - |
| bit_overlap(alloc_core_bitmap, |
| gres_ns-> |
| topo_core_bitmap[j]); |
| } |
| |
| if (top_inx == -1) { |
| if (cores_avail[j]) |
| top_inx = j; |
| } else if (cores_addnt[j] > cores_addnt[top_inx]) |
| top_inx = j; |
| } |
| if ((top_inx < 0) || (cores_avail[top_inx] == 0)) { |
| if (gres_total < min_gres_node) |
| core_cnt = 0; |
| break; |
| } |
| cores_avail[top_inx] = 0; /* Flag as used */ |
| gres_tmp = gres_ns->topo_gres_cnt_avail[top_inx]; |
| if (!use_total_gres && |
| (gres_tmp >= |
| gres_ns->topo_gres_cnt_alloc[top_inx])) { |
| gres_tmp -= gres_ns-> |
| topo_gres_cnt_alloc[top_inx]; |
| } else if (!use_total_gres) { |
| gres_tmp = 0; |
| } |
| if (gres_id_shared(gres_state_job->config_flags) && |
| gres_js->gres_per_task) { |
| /* |
| * Remove remaining shared gres_per_task |
| * Because we don't allocate shared |
| * gres_per_task across multiple sharing gres. |
| * See _set_shared_task_bits() in |
| * gres_select_filter.c |
| */ |
| gres_tmp -= (gres_tmp % gres_js->gres_per_task); |
| } |
| if (gres_tmp == 0) { |
| error("gres/%s: topology allocation error on node %s", |
| gres_name, node_name); |
| break; |
| } |
| /* update counts of allocated cores and GRES */ |
| if (use_single_dev) { |
| /* |
| * Process outside of loop after specific |
| * device selected |
| */ |
| } else if (!gres_ns->topo_core_bitmap[top_inx]) { |
| bit_set_all(alloc_core_bitmap); |
| } else if (gres_avail) { |
| bit_or(alloc_core_bitmap, |
| gres_ns-> |
| topo_core_bitmap[top_inx]); |
| } else { |
| bit_and(alloc_core_bitmap, |
| gres_ns-> |
| topo_core_bitmap[top_inx]); |
| } |
| if (use_single_dev) { |
| gres_total = MAX(gres_total, gres_tmp); |
| gres_avail = gres_total; |
| } else { |
| /* |
| * Available GRES count is up to gres_tmp, |
| * but take 1 per loop to maximize available |
| * core count |
| */ |
| gres_avail += 1; |
| gres_total += gres_tmp; |
| core_cnt = bit_set_count(alloc_core_bitmap); |
| } |
| } |
| if (use_single_dev && (top_inx >= 0) && |
| (gres_avail >= min_gres_node)) { |
| if (!gres_ns->topo_core_bitmap[top_inx]) { |
| bit_set_all(alloc_core_bitmap); |
| } else { |
| bit_or(alloc_core_bitmap, |
| gres_ns-> |
| topo_core_bitmap[top_inx]); |
| } |
| core_cnt = bit_set_count(alloc_core_bitmap); |
| } |
| FREE_NULL_BITMAP(alloc_core_bitmap); |
| FREE_NULL_BITMAP(avail_core_bitmap); |
| xfree(cores_addnt); |
| xfree(cores_avail); |
| return core_cnt; |
| } else if (gres_js->type_name) { |
| for (i = 0; i < gres_ns->type_cnt; i++) { |
| if (gres_ns->type_name[i] && |
| (gres_ns->type_id[i] == |
| gres_js->type_id)) |
| break; |
| } |
| if (i >= gres_ns->type_cnt) |
| return (uint32_t) 0; /* no such type */ |
| gres_avail = gres_ns->type_cnt_avail[i]; |
| if (!use_total_gres) |
| gres_avail -= gres_ns->type_cnt_alloc[i]; |
| gres_tmp = gres_ns->gres_cnt_avail; |
| if (!use_total_gres) |
| gres_tmp -= gres_ns->gres_cnt_alloc; |
| gres_avail = MIN(gres_avail, gres_tmp); |
| if (min_gres_node > gres_avail) |
| return (uint32_t) 0; /* insufficient GRES avail */ |
| return NO_VAL; |
| } else { |
| gres_avail = gres_ns->gres_cnt_avail; |
| if (!use_total_gres) |
| gres_avail -= gres_ns->gres_cnt_alloc; |
| if (min_gres_node > gres_avail) |
| return (uint32_t) 0; /* insufficient GRES avail */ |
| return NO_VAL; |
| } |
| } |
| |
| static int _foreach_job_test(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| foreach_job_test_t *foreach_job_test = arg; |
| uint32_t tmp_cnt; |
| gres_state_t *gres_state_node = |
| list_find_first(foreach_job_test->node_gres_list, |
| gres_find_id, |
| &gres_state_job->plugin_id); |
| if (!gres_state_node) { |
| /* node lack resources required by the job */ |
| foreach_job_test->core_cnt = 0; |
| return -1; |
| } |
| |
| tmp_cnt = _job_test(gres_state_job, gres_state_node, |
| foreach_job_test->use_total_gres, |
| foreach_job_test->core_start_bit, |
| foreach_job_test->core_end_bit, |
| foreach_job_test->job_id, |
| foreach_job_test->node_name); |
| if (tmp_cnt != NO_VAL) { |
| if (foreach_job_test->core_cnt == NO_VAL) |
| foreach_job_test->core_cnt = tmp_cnt; |
| else |
| foreach_job_test->core_cnt = |
| MIN(tmp_cnt, foreach_job_test->core_cnt); |
| } |
| |
| if (foreach_job_test->core_cnt == 0) |
| return -1; |
| |
| return 0; |
| } |
| |
| /* |
| * Determine how many cores on the node can be used by this job |
| * IN job_gres_list - job's gres_list built by gres_job_state_validate() |
| * IN node_gres_list - node's gres_list built by gres_node_config_validate() |
| * IN use_total_gres - if set then consider all gres resources as available, |
| * and none are committed to running jobs |
| * IN core_start_bit - index into core_bitmap for this node's first core |
| * IN core_end_bit - index into core_bitmap for this node's last core |
| * IN job_id - job's ID (for logging) |
| * IN node_name - name of the node (for logging) |
| * IN disable binding- --gres-flags=disable-binding |
| * RET: NO_VAL - All cores on node are available |
| * otherwise - Count of available cores |
| */ |
| extern uint32_t gres_job_test(list_t *job_gres_list, list_t *node_gres_list, |
| bool use_total_gres, |
| int core_start_bit, int core_end_bit, |
| uint32_t job_id, char *node_name) |
| { |
| foreach_job_test_t foreach_job_test = { |
| .core_cnt = NO_VAL, |
| .core_end_bit = core_end_bit, |
| .core_start_bit = core_start_bit, |
| .job_id = job_id, |
| .node_gres_list = node_gres_list, |
| .node_name = node_name, |
| .use_total_gres = use_total_gres, |
| }; |
| |
| if (job_gres_list == NULL) |
| return NO_VAL; |
| if (node_gres_list == NULL) |
| return 0; |
| |
| (void) list_for_each(job_gres_list, _foreach_job_test, |
| &foreach_job_test); |
| |
| return foreach_job_test.core_cnt; |
| } |
| |
| extern void gres_sock_delete(void *x) |
| { |
| sock_gres_t *sock_gres = (sock_gres_t *) x; |
| int s; |
| |
| if (sock_gres) { |
| FREE_NULL_BITMAP(sock_gres->bits_any_sock); |
| if (sock_gres->bits_by_sock) { |
| for (s = 0; s < sock_gres->sock_cnt; s++) |
| FREE_NULL_BITMAP(sock_gres->bits_by_sock[s]); |
| xfree(sock_gres->bits_by_sock); |
| } |
| xfree(sock_gres->cnt_by_sock); |
| xfree(sock_gres); |
| } |
| } |
| |
| static int _foreach_sock_str(void *x, void *arg) |
| { |
| sock_gres_t *sock_gres = x; |
| foreach_sock_str_t *foreach_sock_str = arg; |
| char *gres_name = sock_gres->gres_state_job->gres_name; |
| gres_job_state_t *gres_js = sock_gres->gres_state_job->gres_data; |
| char *type_name = gres_js->type_name; |
| |
| if (foreach_sock_str->sock_inx < 0) { |
| if (sock_gres->cnt_any_sock) { |
| if (type_name) { |
| xstrfmtcat(foreach_sock_str->gres_str, |
| "%s%s:%s:%"PRIu64, |
| foreach_sock_str->sep, |
| gres_name, |
| type_name, |
| sock_gres->cnt_any_sock); |
| } else { |
| xstrfmtcat(foreach_sock_str->gres_str, |
| "%s%s:%"PRIu64, |
| foreach_sock_str->sep, gres_name, |
| sock_gres->cnt_any_sock); |
| } |
| foreach_sock_str->sep = " "; |
| } |
| return 0; |
| } |
| if (!sock_gres->cnt_by_sock || |
| (sock_gres->cnt_by_sock[foreach_sock_str->sock_inx] == 0)) |
| return 0; |
| if (type_name) { |
| xstrfmtcat(foreach_sock_str->gres_str, "%s%s:%s:%"PRIu64, |
| foreach_sock_str->sep, |
| gres_name, type_name, |
| sock_gres->cnt_by_sock[foreach_sock_str->sock_inx]); |
| } else { |
| xstrfmtcat(foreach_sock_str->gres_str, "%s%s:%"PRIu64, |
| foreach_sock_str->sep, |
| gres_name, |
| sock_gres->cnt_by_sock[foreach_sock_str->sock_inx]); |
| } |
| foreach_sock_str->sep = " "; |
| return 0; |
| } |
| |
| /* |
| * Build a string containing the GRES details for a given node and socket |
| * sock_gres_list IN - List of sock_gres_t entries |
| * sock_inx IN - zero-origin socket for which information is to be returned |
| * if value < 0, then report GRES unconstrained by core |
| * RET string, must call xfree() to release memory |
| */ |
| extern char *gres_sock_str(list_t *sock_gres_list, int sock_inx) |
| { |
| foreach_sock_str_t foreach_sock_str = { |
| .gres_str = NULL, |
| .sep = "", |
| .sock_inx = sock_inx, |
| }; |
| |
| if (!sock_gres_list) |
| return NULL; |
| |
| (void) list_for_each(sock_gres_list, _foreach_sock_str, |
| &foreach_sock_str); |
| |
| return foreach_sock_str.gres_str; |
| } |
| |
| static void _accumulate_job_gres_alloc(gres_job_state_t *gres_js, |
| int node_inx, |
| bitstr_t **gres_bit_alloc, |
| uint64_t *gres_cnt) |
| { |
| if (gres_js->node_cnt <= node_inx) { |
| error("gres_job_state_t node count less than node_inx. This should never happen"); |
| return; |
| } |
| |
| if ((node_inx >= 0) && (node_inx < gres_js->node_cnt) && |
| gres_js->gres_bit_alloc && |
| gres_js->gres_bit_alloc[node_inx]) { |
| if (!*gres_bit_alloc) { |
| *gres_bit_alloc = bit_alloc( |
| bit_size(gres_js-> |
| gres_bit_alloc[node_inx])); |
| } |
| bit_or(*gres_bit_alloc, gres_js->gres_bit_alloc[node_inx]); |
| } |
| if (gres_cnt && gres_js->gres_cnt_node_alloc) |
| *gres_cnt += gres_js->gres_cnt_node_alloc[node_inx]; |
| } |
| |
| static int _accumulate_gres_device(void *x, void *arg) |
| { |
| gres_state_t *gres_ptr = x; |
| foreach_gres_accumulate_device_t *foreach_gres_accumulate_device = arg; |
| |
| if (gres_ptr->plugin_id != foreach_gres_accumulate_device->plugin_id) |
| return 0; |
| |
| if (foreach_gres_accumulate_device->is_job) { |
| _accumulate_job_gres_alloc( |
| gres_ptr->gres_data, |
| foreach_gres_accumulate_device->node_inx, |
| foreach_gres_accumulate_device->gres_bit_alloc, |
| &foreach_gres_accumulate_device->gres_cnt); |
| } else { |
| _accumulate_step_gres_alloc( |
| gres_ptr, |
| foreach_gres_accumulate_device->gres_bit_alloc, |
| &foreach_gres_accumulate_device->gres_cnt, |
| foreach_gres_accumulate_device->gres_per_bit); |
| } |
| |
| /* Does job have a sharing GRES (GPU)? */ |
| if (gres_id_sharing(foreach_gres_accumulate_device->plugin_id)) |
| foreach_gres_accumulate_device->sharing_gres_allocated = true; |
| |
| return 0; |
| } |
| |
| /* |
| * Set environment variables as required for a batch or interactive step |
| */ |
| extern void gres_g_job_set_env(stepd_step_rec_t *step, int node_inx) |
| { |
| int i; |
| gres_internal_flags_t flags = GRES_INTERNAL_FLAG_NONE; |
| bitstr_t *gres_bit_alloc = NULL; |
| foreach_gres_accumulate_device_t foreach_gres_accumulate_device = { |
| .gres_bit_alloc = &gres_bit_alloc, |
| .is_job = true, |
| .node_inx = node_inx, |
| }; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| for (i = 0; i < gres_context_cnt; i++) { |
| slurm_gres_context_t *gres_ctx = &gres_context[i]; |
| if (!gres_ctx->ops.job_set_env) |
| continue; /* No plugin to call */ |
| if (step->job_gres_list) { |
| foreach_gres_accumulate_device.plugin_id = |
| gres_ctx->plugin_id; |
| (void) list_for_each(step->job_gres_list, |
| _accumulate_gres_device, |
| &foreach_gres_accumulate_device); |
| } |
| |
| /* |
| * Do not let MPS or Shard (shared GRES) clear any envs set for |
| * a GPU (sharing GRES) when a GPU is allocated but an |
| * MPS/Shard is not. Sharing GRES plugins always run before |
| * shared GRES, so we don't need to protect MPS/Shard from GPU. |
| */ |
| if (gres_id_shared(gres_ctx->config_flags) && |
| foreach_gres_accumulate_device.sharing_gres_allocated) |
| flags |= GRES_INTERNAL_FLAG_PROTECT_ENV; |
| |
| if ((step->flags & LAUNCH_EXT_LAUNCHER)) { |
| /* |
| * We need the step environment variables, but still |
| * use all the job's gres. |
| */ |
| (*(gres_ctx->ops.step_set_env))( |
| &step->env, |
| gres_bit_alloc, |
| foreach_gres_accumulate_device.gres_cnt, |
| flags); |
| } else |
| (*(gres_ctx->ops.job_set_env))( |
| &step->env, |
| gres_bit_alloc, |
| foreach_gres_accumulate_device.gres_cnt, |
| flags); |
| foreach_gres_accumulate_device.gres_cnt = 0; |
| FREE_NULL_BITMAP(gres_bit_alloc); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| static int _job_state_log(void *x, void *arg) |
| { |
| gres_state_t *gres_state_job = x; |
| uint32_t job_id = *(uint32_t *)arg; |
| gres_job_state_t *gres_js = gres_state_job->gres_data; |
| char *sparse_msg = "", tmp_str[128]; |
| int i; |
| |
| xassert(gres_js); |
| info("gres_job_state gres:%s(%u) type:%s(%u) job:%u flags:%s", |
| gres_state_job->gres_name, gres_state_job->plugin_id, |
| gres_js->type_name, |
| gres_js->type_id, job_id, gres_flags2str(gres_js->flags)); |
| if (gres_js->cpus_per_gres) |
| info(" cpus_per_gres:%u", gres_js->cpus_per_gres); |
| else if (gres_js->def_cpus_per_gres) |
| info(" def_cpus_per_gres:%u", gres_js->def_cpus_per_gres); |
| if (gres_js->gres_per_job) |
| info(" gres_per_job:%"PRIu64, gres_js->gres_per_job); |
| if (gres_js->gres_per_node) { |
| info(" gres_per_node:%"PRIu64" node_cnt:%u", |
| gres_js->gres_per_node, gres_js->node_cnt); |
| } |
| if (gres_js->gres_per_socket) |
| info(" gres_per_socket:%"PRIu64, gres_js->gres_per_socket); |
| if (gres_js->gres_per_task) |
| info(" gres_per_task:%"PRIu64, gres_js->gres_per_task); |
| if (gres_js->mem_per_gres) |
| info(" mem_per_gres:%"PRIu64, gres_js->mem_per_gres); |
| else if (gres_js->def_mem_per_gres) |
| info(" def_mem_per_gres:%"PRIu64, gres_js->def_mem_per_gres); |
| if (gres_js->ntasks_per_gres) |
| info(" ntasks_per_gres:%u", gres_js->ntasks_per_gres); |
| |
| /* |
| * These arrays are only used for resource selection and may include |
| * data for many nodes not used in the resources eventually allocated |
| * to this job. |
| */ |
| if (gres_js->total_node_cnt) { |
| sparse_msg = " (sparsely populated for resource selection)"; |
| info(" total_node_cnt:%u%s", gres_js->total_node_cnt, |
| sparse_msg); |
| } |
| for (i = 0; i < gres_js->total_node_cnt; i++) { |
| if (gres_js->gres_cnt_node_select && |
| gres_js->gres_cnt_node_select[i]) { |
| info(" gres_cnt_node_select[%d]:%"PRIu64, |
| i, gres_js->gres_cnt_node_select[i]); |
| } |
| if (gres_js->gres_bit_select && |
| gres_js->gres_bit_select[i]) { |
| bit_fmt(tmp_str, sizeof(tmp_str), |
| gres_js->gres_bit_select[i]); |
| info(" gres_bit_select[%d]:%s of %d", i, tmp_str, |
| (int) bit_size(gres_js->gres_bit_select[i])); |
| } |
| if (gres_js->gres_bit_select && |
| gres_js->gres_bit_select[i] && |
| gres_js->gres_per_bit_select && |
| gres_js->gres_per_bit_select[i]) { |
| for (int j = 0; |
| (j = bit_ffs_from_bit(gres_js->gres_bit_select[i], |
| j)) >= 0; |
| j++) { |
| info(" gres_per_bit_select[%d][%d]:%"PRIu64, |
| i, j, gres_js->gres_per_bit_select[i][j]); |
| } |
| } |
| } |
| |
| if (gres_js->total_gres) |
| info(" total_gres:%"PRIu64, gres_js->total_gres); |
| if (gres_js->node_cnt) |
| info(" node_cnt:%u", gres_js->node_cnt); |
| for (i = 0; i < gres_js->node_cnt; i++) { |
| if (gres_js->gres_cnt_node_alloc && |
| gres_js->gres_cnt_node_alloc[i]) { |
| info(" gres_cnt_node_alloc[%d]:%"PRIu64, |
| i, gres_js->gres_cnt_node_alloc[i]); |
| } else if (gres_js->gres_cnt_node_alloc) |
| info(" gres_cnt_node_alloc[%d]:NULL", i); |
| |
| if (gres_js->gres_bit_alloc && gres_js->gres_bit_alloc[i]) { |
| bit_fmt(tmp_str, sizeof(tmp_str), |
| gres_js->gres_bit_alloc[i]); |
| info(" gres_bit_alloc[%d]:%s of %d", i, tmp_str, |
| (int) bit_size(gres_js->gres_bit_alloc[i])); |
| } else if (gres_js->gres_bit_alloc) |
| info(" gres_bit_alloc[%d]:NULL", i); |
| |
| if (gres_js->gres_bit_alloc && |
| gres_js->gres_bit_alloc[i] && |
| gres_js->gres_per_bit_alloc && |
| gres_js->gres_per_bit_alloc[i]) { |
| for (int j = 0; |
| (j = bit_ffs_from_bit(gres_js->gres_bit_alloc[i], |
| j)) >= 0; |
| j++) { |
| info(" gres_per_bit_alloc[%d][%d]:%"PRIu64, |
| i, j, gres_js->gres_per_bit_alloc[i][j]); |
| } |
| } |
| |
| if (gres_js->gres_bit_step_alloc && |
| gres_js->gres_bit_step_alloc[i]) { |
| bit_fmt(tmp_str, sizeof(tmp_str), |
| gres_js->gres_bit_step_alloc[i]); |
| info(" gres_bit_step_alloc[%d]:%s of %d", i, tmp_str, |
| (int) bit_size(gres_js->gres_bit_step_alloc[i])); |
| } else if (gres_js->gres_bit_step_alloc) |
| info(" gres_bit_step_alloc[%d]:NULL", i); |
| |
| if (gres_js->gres_bit_step_alloc && |
| gres_js->gres_bit_step_alloc[i] && |
| gres_js->gres_per_bit_step_alloc && |
| gres_js->gres_per_bit_step_alloc[i]) { |
| for (int j = 0; |
| (j = bit_ffs_from_bit( |
| gres_js->gres_bit_step_alloc[i], j)) >= 0; |
| j++) { |
| info(" gres_per_bit_step_alloc[%d][%d]:%"PRIu64, |
| i, j, |
| gres_js->gres_per_bit_step_alloc[i][j]); |
| } |
| } |
| |
| if (gres_js->gres_cnt_step_alloc) { |
| info(" gres_cnt_step_alloc[%d]:%"PRIu64"", i, |
| gres_js->gres_cnt_step_alloc[i]); |
| } |
| } |
| |
| return 0; |
| } |
| |
| static int _foreach_gres_list_cnt(void *x, void *arg) |
| { |
| gres_state_t *gres_state_ptr = x; |
| foreach_gres_list_cnt_t *foreach_gres_list_cnt = arg; |
| uint64_t total_gres; |
| void *type_name; |
| |
| if (gres_state_ptr->plugin_id != foreach_gres_list_cnt->plugin_id) |
| return 0; |
| |
| if (foreach_gres_list_cnt->is_job) { |
| gres_job_state_t *gres_js = gres_state_ptr->gres_data; |
| type_name = gres_js->type_name; |
| total_gres = gres_js->total_gres; |
| } else { |
| gres_step_state_t *gres_ss = gres_state_ptr->gres_data; |
| type_name = gres_ss->type_name; |
| total_gres = gres_ss->total_gres; |
| } |
| |
| /* If we are filtering on GRES type, ignore other types */ |
| if (foreach_gres_list_cnt->filter_type && |
| xstrcasecmp(foreach_gres_list_cnt->gres_type, type_name)) |
| return 0; |
| |
| if ((total_gres == NO_VAL64) || (total_gres == 0)) |
| return 0; |
| |
| if (foreach_gres_list_cnt->gres_cnt == NO_VAL64) |
| foreach_gres_list_cnt->gres_cnt = total_gres; |
| else |
| foreach_gres_list_cnt->gres_cnt += total_gres; |
| |
| return 0; |
| } |
| |
| /* |
| * Extract from the job/step gres_list the count of GRES of the specified name |
| * and (optionally) type. If no type is specified, then the count will include |
| * all GRES of that name, regardless of type. |
| * |
| * IN gres_list - job/step record's gres_list. |
| * IN gres_name - the name of the GRES to query. |
| * IN gres_type - (optional) the type of the GRES to query. |
| * IN is_job - True if the GRES list is for the job, false if for the step. |
| * RET The number of GRES in the job/step gres_list or NO_VAL64 if not found. |
| */ |
| static uint64_t _get_gres_list_cnt(list_t *gres_list, char *gres_name, |
| char *gres_type, bool is_job) |
| { |
| foreach_gres_list_cnt_t foreach_gres_list_cnt = { |
| .gres_cnt = NO_VAL64, |
| .gres_type = gres_type, |
| .is_job = is_job, |
| }; |
| |
| if ((gres_list == NULL) || (list_count(gres_list) == 0)) |
| return foreach_gres_list_cnt.gres_cnt; |
| |
| foreach_gres_list_cnt.plugin_id = gres_build_id(gres_name); |
| |
| if (gres_type && (gres_type[0] != '\0')) |
| foreach_gres_list_cnt.filter_type = true; |
| |
| (void) list_for_each(gres_list, _foreach_gres_list_cnt, |
| &foreach_gres_list_cnt); |
| |
| return foreach_gres_list_cnt.gres_cnt; |
| } |
| |
| static uint64_t _get_job_gres_list_cnt(list_t *gres_list, char *gres_name, |
| char *gres_type) |
| { |
| return _get_gres_list_cnt(gres_list, gres_name, gres_type, true); |
| } |
| |
| static uint64_t _get_step_gres_list_cnt(list_t *gres_list, char *gres_name, |
| char *gres_type) |
| { |
| return _get_gres_list_cnt(gres_list, gres_name, gres_type, false); |
| } |
| |
| /* |
| * Log a job's current gres state |
| * IN gres_list - generated by gres_job_state_validate() |
| * IN job_id - job's ID |
| */ |
| extern void gres_job_state_log(list_t *gres_list, uint32_t job_id) |
| { |
| if (!(slurm_conf.debug_flags & DEBUG_FLAG_GRES) || !gres_list) |
| return; |
| |
| (void) list_for_each(gres_list, _job_state_log, &job_id); |
| } |
| |
| static int _find_device(void *x, void *key) |
| { |
| gres_device_t *device_x = (gres_device_t *)x; |
| gres_device_t *device_key = (gres_device_t *)key; |
| |
| if (!xstrcmp(device_x->path, device_key->path)) |
| return 1; |
| |
| return 0; |
| } |
| |
| static int _foreach_init_device_list(void *x, void *arg) |
| { |
| gres_device_t *gres_device = x; |
| list_t **device_list = arg; |
| |
| if (!*device_list) |
| *device_list = list_create(NULL); |
| gres_device->alloc = 0; |
| /* |
| * Keep the list unique by not adding duplicates (in the |
| * case of MPS and GPU) |
| */ |
| if (!list_find_first(*device_list, _find_device, gres_device)) |
| list_append(*device_list, gres_device); |
| |
| return 0; |
| } |
| |
| static int _foreach_alloc_gres_device(void *x, void *arg) |
| { |
| gres_device_t *gres_device = x; |
| foreach_alloc_gres_device_t *foreach_alloc_gres_device = arg; |
| if (!bit_test(foreach_alloc_gres_device->gres_bit_alloc, |
| gres_device->index)) |
| return 0; |
| |
| if (!foreach_alloc_gres_device->usable_gres || |
| bit_test(foreach_alloc_gres_device->usable_gres, |
| gres_device->index)) { |
| /* |
| * Search for the device among the unique |
| * devices list (since two plugins could have |
| * device records that point to the same file, |
| * like with GPU and MPS) |
| */ |
| gres_device_t *gres_device2 = list_find_first( |
| foreach_alloc_gres_device->device_list, |
| _find_device, |
| gres_device); |
| /* |
| * Set both, in case they point to different records |
| */ |
| gres_device->alloc = 1; |
| if (gres_device2) |
| gres_device2->alloc = 1; |
| } |
| |
| return 0; |
| } |
| |
| extern list_t *gres_g_get_devices(list_t *gres_list, bool is_job, |
| uint16_t accel_bind_type, char *tres_bind_str, |
| int local_proc_id, stepd_step_rec_t *step) |
| { |
| int j; |
| bitstr_t *gres_bit_alloc = NULL; |
| uint64_t *gres_per_bit = NULL; |
| list_t *gres_devices; |
| list_t *device_list = NULL; |
| bitstr_t *usable_gres = NULL; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| /* |
| * Create a unique device list of all possible GRES device files. |
| * Initialize each device to deny. |
| */ |
| slurm_mutex_lock(&gres_context_lock); |
| for (j = 0; j < gres_context_cnt; j++) { |
| if (!gres_context[j].ops.get_devices){ |
| gres_devices = gres_context[j].np_gres_devices; |
| } else { |
| gres_devices = (*(gres_context[j].ops.get_devices))(); |
| } |
| if (!gres_devices || !list_count(gres_devices)) |
| continue; |
| |
| (void) list_for_each(gres_devices, _foreach_init_device_list, |
| &device_list); |
| } |
| |
| if (!gres_list) { |
| slurm_mutex_unlock(&gres_context_lock); |
| return device_list; |
| } |
| |
| if (accel_bind_type) |
| _parse_accel_bind_type(accel_bind_type, tres_bind_str); |
| |
| for (j = 0; j < gres_context_cnt; j++) { |
| /* We need to get a gres_bit_alloc with all the gres types |
| * merged (accumulated) together */ |
| foreach_gres_accumulate_device_t arg = { |
| .gres_bit_alloc = &gres_bit_alloc, |
| .gres_per_bit = &gres_per_bit, |
| .is_job = is_job, |
| .plugin_id = gres_context[j].plugin_id, |
| }; |
| foreach_alloc_gres_device_t foreach_alloc_gres_device = { |
| .device_list = device_list, |
| }; |
| |
| (void) list_for_each(gres_list, _accumulate_gres_device, &arg); |
| |
| if (!gres_bit_alloc) |
| continue; |
| if (!gres_context[j].ops.get_devices){ |
| gres_devices = gres_context[j].np_gres_devices; |
| } else { |
| gres_devices = (*(gres_context[j].ops.get_devices))(); |
| } |
| if (!gres_devices) { |
| error("We should had got gres_devices, but for some reason none were set in the plugin."); |
| continue; |
| } |
| |
| if (_get_usable_gres(j, local_proc_id, tres_bind_str, |
| &usable_gres, gres_bit_alloc, true, step, |
| gres_per_bit, NULL) == SLURM_ERROR) |
| continue; |
| |
| foreach_alloc_gres_device.gres_bit_alloc = gres_bit_alloc; |
| foreach_alloc_gres_device.usable_gres = usable_gres; |
| |
| (void) list_for_each(gres_devices, _foreach_alloc_gres_device, |
| &foreach_alloc_gres_device); |
| |
| FREE_NULL_BITMAP(gres_bit_alloc); |
| FREE_NULL_BITMAP(usable_gres); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return device_list; |
| } |
| |
| static void _step_state_delete(void *gres_data) |
| { |
| int i; |
| gres_step_state_t *gres_ss = (gres_step_state_t *) gres_data; |
| |
| if (gres_ss == NULL) |
| return; |
| |
| FREE_NULL_BITMAP(gres_ss->node_in_use); |
| if (gres_ss->gres_bit_alloc) { |
| for (i = 0; i < gres_ss->node_cnt; i++) |
| FREE_NULL_BITMAP(gres_ss->gres_bit_alloc[i]); |
| xfree(gres_ss->gres_bit_alloc); |
| } |
| if (gres_ss->gres_per_bit_alloc) { |
| for (i = 0; i < gres_ss->node_cnt; i++){ |
| xfree(gres_ss->gres_per_bit_alloc[i]); |
| } |
| xfree(gres_ss->gres_per_bit_alloc); |
| } |
| xfree(gres_ss->gres_cnt_node_alloc); |
| xfree(gres_ss->type_name); |
| xfree(gres_ss); |
| } |
| |
| extern void gres_step_list_delete(void *list_element) |
| { |
| gres_state_t *gres_state_step = (gres_state_t *) list_element; |
| |
| _step_state_delete(gres_state_step->gres_data); |
| gres_state_step->gres_data = NULL; |
| _gres_state_delete_members(gres_state_step); |
| } |
| |
| /* |
| * TRES specification parse logic |
| * in_val IN - initial input string |
| * cnt OUT - count of values |
| * gres_list IN/OUT - where to search for (or add) new step TRES record |
| * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call |
| * rc OUT - unchanged or an error code |
| * RET gres - step record to set value in, found or created by this function |
| */ |
| static gres_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt, |
| list_t *gres_list, char **save_ptr, |
| int *rc) |
| { |
| static char *prev_save_ptr = NULL; |
| int context_inx = NO_VAL, my_rc = SLURM_SUCCESS; |
| gres_step_state_t *gres_ss = NULL; |
| gres_state_t *gres_state_step = NULL; |
| gres_key_t step_search_key; |
| char *type = NULL, *name = NULL; |
| |
| xassert(save_ptr); |
| if (!in_val && (*save_ptr == NULL)) { |
| return NULL; |
| } |
| |
| if (*save_ptr == NULL) { |
| prev_save_ptr = in_val; |
| } else if (*save_ptr != prev_save_ptr) { |
| error("%s: parsing error", __func__); |
| my_rc = SLURM_ERROR; |
| goto fini; |
| } |
| |
| if (prev_save_ptr[0] == '\0') { /* Empty input token */ |
| *save_ptr = NULL; |
| return NULL; |
| } |
| |
| if ((my_rc = _get_next_gres(in_val, &type, &context_inx, |
| cnt, &prev_save_ptr)) || |
| (context_inx == NO_VAL)) { |
| prev_save_ptr = NULL; |
| goto fini; |
| } |
| |
| /* Find the step GRES record */ |
| step_search_key.config_flags = gres_context[context_inx].config_flags; |
| step_search_key.plugin_id = gres_context[context_inx].plugin_id; |
| step_search_key.type_id = gres_build_id(type); |
| gres_state_step = list_find_first(gres_list, gres_find_step_by_key, |
| &step_search_key); |
| |
| if (gres_state_step) { |
| gres_ss = gres_state_step->gres_data; |
| } else { |
| gres_ss = xmalloc(sizeof(gres_step_state_t)); |
| gres_ss->type_id = step_search_key.type_id; |
| gres_ss->type_name = type; |
| type = NULL; /* String moved above */ |
| gres_state_step = gres_create_state( |
| &gres_context[context_inx], GRES_STATE_SRC_CONTEXT_PTR, |
| GRES_STATE_TYPE_STEP, gres_ss); |
| list_append(gres_list, gres_state_step); |
| } |
| |
| fini: xfree(name); |
| xfree(type); |
| if (my_rc != SLURM_SUCCESS) { |
| prev_save_ptr = NULL; |
| if (my_rc == ESLURM_INVALID_GRES && running_in_slurmctld()) |
| info("Invalid GRES step specification %s", in_val); |
| *rc = my_rc; |
| } |
| *save_ptr = prev_save_ptr; |
| return gres_state_step; |
| } |
| |
| static int _handle_ntasks_per_tres_step(list_t *new_step_list, |
| uint16_t ntasks_per_tres, |
| uint32_t *num_tasks, |
| uint32_t *cpu_count) |
| { |
| gres_state_t *gres_state_step; |
| gres_step_state_t *gres_ss; |
| uint64_t cnt = 0; |
| int rc = SLURM_SUCCESS; |
| |
| uint64_t tmp = _get_step_gres_list_cnt(new_step_list, "gpu", NULL); |
| if ((tmp == NO_VAL64) && (*num_tasks != NO_VAL)) { |
| /* |
| * Generate GPUs from ntasks_per_tres when not specified |
| * and ntasks is specified |
| */ |
| uint32_t gpus = *num_tasks / ntasks_per_tres; |
| /* For now, do type-less GPUs */ |
| char *save_ptr = NULL, *gres = NULL, *in_val; |
| xstrfmtcat(gres, "gres/gpu:%u", gpus); |
| in_val = gres; |
| if (*num_tasks != ntasks_per_tres * gpus) { |
| log_flag(GRES, "%s: -n/--ntasks %u is not a multiple of --ntasks-per-gpu=%u", |
| __func__, *num_tasks, ntasks_per_tres); |
| return ESLURM_INVALID_GRES; |
| } |
| while ((gres_state_step = |
| _get_next_step_gres(in_val, &cnt, |
| new_step_list, |
| &save_ptr, &rc))) { |
| gres_ss = gres_state_step->gres_data; |
| /* Simulate a tres_per_job specification */ |
| gres_ss->gres_per_step = cnt; |
| gres_ss->ntasks_per_gres = ntasks_per_tres; |
| gres_ss->total_gres = |
| MAX(gres_ss->total_gres, cnt); |
| in_val = NULL; |
| } |
| xfree(gres); |
| xassert(list_count(new_step_list) != 0); |
| } else if (tmp != NO_VAL64) { |
| tmp = tmp * ntasks_per_tres; |
| if (*num_tasks < tmp) { |
| uint32_t cpus_per_task = *cpu_count / *num_tasks; |
| *num_tasks = tmp; |
| tmp = tmp * cpus_per_task; |
| if (*cpu_count && (*cpu_count < tmp)) { |
| /* step_spec->cpu_count == 0 means SSF_OVERSUBSCRIBE */ |
| *cpu_count = tmp; |
| } |
| } |
| } else { |
| error("%s: ntasks_per_tres was specified, but there was either no task count or no GPU specification to go along with it, or both were already specified.", |
| __func__); |
| rc = SLURM_ERROR; |
| } |
| |
| return rc; |
| } |
| |
| extern int gres_step_state_validate(char *cpus_per_tres, |
| char *tres_per_step, |
| char *tres_per_node, |
| char *tres_per_socket, |
| char *tres_per_task, |
| char *mem_per_tres, |
| uint16_t ntasks_per_tres, |
| uint32_t step_min_nodes, |
| list_t **step_gres_list, |
| uint32_t job_id, |
| uint32_t step_id, |
| uint32_t *num_tasks, |
| uint32_t *cpu_count, char **err_msg) |
| { |
| int rc = SLURM_SUCCESS; |
| gres_step_state_t *gres_ss; |
| gres_state_t *gres_state_step; |
| list_t *new_step_list; |
| uint64_t cnt = 0; |
| uint16_t cpus_per_gres = 0; |
| char *cpus_per_gres_name = NULL; |
| char *cpus_per_gres_type = NULL; |
| |
| *step_gres_list = NULL; |
| xassert(gres_context_cnt >= 0); |
| xassert(num_tasks); |
| xassert(cpu_count); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| new_step_list = list_create(gres_step_list_delete); |
| if (cpus_per_tres) { |
| char *in_val = cpus_per_tres, *save_ptr = NULL; |
| while ((gres_state_step = _get_next_step_gres( |
| in_val, &cnt, |
| new_step_list, |
| &save_ptr, &rc))) { |
| gres_ss = gres_state_step->gres_data; |
| gres_ss->cpus_per_gres = cnt; |
| in_val = NULL; |
| /* Only a single cpus_per_tres value is allowed. */ |
| if (cpus_per_gres) { |
| if (err_msg) |
| *err_msg = xstrdup("You may only request cpus_per_tres for one tres"); |
| else |
| error("You may only request cpus_per_tres for one tres"); |
| rc = ESLURM_INVALID_GRES; |
| FREE_NULL_LIST(new_step_list); |
| goto fini; |
| } else { |
| cpus_per_gres = cnt; |
| cpus_per_gres_name = gres_state_step->gres_name; |
| cpus_per_gres_type = gres_ss->type_name; |
| } |
| } |
| } |
| if (tres_per_step) { |
| char *in_val = tres_per_step, *save_ptr = NULL; |
| while ((gres_state_step = _get_next_step_gres( |
| in_val, &cnt, |
| new_step_list, |
| &save_ptr, &rc))) { |
| gres_ss = gres_state_step->gres_data; |
| gres_ss->gres_per_step = cnt; |
| in_val = NULL; |
| gres_ss->total_gres = |
| MAX(gres_ss->total_gres, cnt); |
| } |
| } |
| if (tres_per_node) { |
| char *in_val = tres_per_node, *save_ptr = NULL; |
| while ((gres_state_step = _get_next_step_gres( |
| in_val, &cnt, |
| new_step_list, |
| &save_ptr, &rc))) { |
| gres_ss = gres_state_step->gres_data; |
| gres_ss->gres_per_node = cnt; |
| in_val = NULL; |
| gres_ss->total_gres = |
| MAX(gres_ss->total_gres, step_min_nodes * cnt); |
| } |
| } |
| if (tres_per_socket) { |
| char *in_val = tres_per_socket, *save_ptr = NULL; |
| while ((gres_state_step = _get_next_step_gres( |
| in_val, &cnt, |
| new_step_list, |
| &save_ptr, &rc))) { |
| gres_ss = gres_state_step->gres_data; |
| gres_ss->gres_per_socket = cnt; |
| in_val = NULL; |
| // TODO: What is sockets_per_node and ntasks_per_socket? |
| // if (*sockets_per_node != NO_VAL16) { |
| // cnt *= *sockets_per_node; |
| // } else if ((*num_tasks != NO_VAL) && |
| // (*ntasks_per_socket != NO_VAL16)) { |
| // cnt *= ROUNDUP(*num_tasks, *ntasks_per_socket); |
| // } |
| // gres_ss->total_gres = |
| // MAX(gres_ss->total_gres, cnt); |
| } |
| } |
| if (tres_per_task) { |
| char *in_val = tres_per_task, *save_ptr = NULL; |
| while ((gres_state_step = _get_next_step_gres( |
| in_val, &cnt, |
| new_step_list, |
| &save_ptr, &rc))) { |
| gres_ss = gres_state_step->gres_data; |
| gres_ss->gres_per_task = cnt; |
| in_val = NULL; |
| if (*num_tasks != NO_VAL) |
| cnt *= *num_tasks; |
| gres_ss->total_gres = |
| MAX(gres_ss->total_gres, cnt); |
| } |
| } |
| if (mem_per_tres) { |
| char *in_val = mem_per_tres, *save_ptr = NULL; |
| while ((gres_state_step = _get_next_step_gres( |
| in_val, &cnt, |
| new_step_list, |
| &save_ptr, &rc))) { |
| gres_ss = gres_state_step->gres_data; |
| gres_ss->mem_per_gres = cnt; |
| in_val = NULL; |
| } |
| } |
| |
| if ((ntasks_per_tres != NO_VAL16)) { |
| rc = _handle_ntasks_per_tres_step(new_step_list, |
| ntasks_per_tres, |
| num_tasks, |
| cpu_count); |
| } |
| |
| if ((rc == SLURM_SUCCESS) && cpus_per_gres && *cpu_count && |
| running_in_slurmctld()) { |
| /* |
| * Update cpu_count = the total requested gres * cpus_per_gres |
| * |
| * If SSF_OVERCOMMIT (step_spec->cpu_count == 0), don't update. |
| * Only update if in slurmctld because the step can inherit |
| * gres from the job_gres_list_req, which only exists in |
| * slurmctld. |
| */ |
| uint64_t gpu_cnt = _get_step_gres_list_cnt(new_step_list, |
| cpus_per_gres_name, |
| cpus_per_gres_type); |
| |
| if (gpu_cnt == NO_VAL64) { |
| if (err_msg) |
| *err_msg = xstrdup("cpus_per_gres also requires specifying the same gres"); |
| else |
| error("cpus_per_gres also requires specifying the same gres"); |
| rc = ESLURM_INVALID_GRES; |
| FREE_NULL_LIST(new_step_list); |
| } else |
| *cpu_count = gpu_cnt * cpus_per_gres; |
| } |
| |
| if (list_count(new_step_list) == 0) { |
| FREE_NULL_LIST(new_step_list); |
| } else { |
| if (rc == SLURM_SUCCESS) { |
| job_validate_t job_validate = { |
| .over_array = xcalloc(list_count(new_step_list), |
| sizeof(overlap_check_t)), |
| }; |
| |
| (void) list_for_each(new_step_list, |
| _foreach_set_over_array, |
| &job_validate); |
| |
| if (job_validate.overlap_merge) |
| rc = _merge_generic_data(new_step_list, |
| &job_validate); |
| xfree(job_validate.over_array); |
| } |
| if (rc == SLURM_SUCCESS) |
| *step_gres_list = new_step_list; |
| else |
| FREE_NULL_LIST(new_step_list); |
| } |
| fini: |
| slurm_mutex_unlock(&gres_context_lock); |
| return rc; |
| } |
| |
| static void *_step_state_dup(gres_step_state_t *gres_ss) |
| { |
| |
| int i; |
| gres_step_state_t *new_gres_ss; |
| |
| xassert(gres_ss); |
| new_gres_ss = xmalloc(sizeof(gres_step_state_t)); |
| new_gres_ss->cpus_per_gres = gres_ss->cpus_per_gres; |
| new_gres_ss->gres_per_step = gres_ss->gres_per_step; |
| new_gres_ss->gres_per_node = gres_ss->gres_per_node; |
| new_gres_ss->gres_per_socket = gres_ss->gres_per_socket; |
| new_gres_ss->gres_per_task = gres_ss->gres_per_task; |
| new_gres_ss->mem_per_gres = gres_ss->mem_per_gres; |
| new_gres_ss->node_cnt = gres_ss->node_cnt; |
| new_gres_ss->total_gres = gres_ss->total_gres; |
| |
| if (gres_ss->node_in_use) |
| new_gres_ss->node_in_use = bit_copy(gres_ss->node_in_use); |
| |
| if (gres_ss->gres_cnt_node_alloc) { |
| i = sizeof(uint64_t) * gres_ss->node_cnt; |
| new_gres_ss->gres_cnt_node_alloc = xmalloc(i); |
| memcpy(new_gres_ss->gres_cnt_node_alloc, |
| gres_ss->gres_cnt_node_alloc, i); |
| } |
| if (gres_ss->gres_bit_alloc) { |
| new_gres_ss->gres_bit_alloc = xcalloc(gres_ss->node_cnt, |
| sizeof(bitstr_t *)); |
| for (i = 0; i < gres_ss->node_cnt; i++) { |
| if (gres_ss->gres_bit_alloc[i] == NULL) |
| continue; |
| new_gres_ss->gres_bit_alloc[i] = |
| bit_copy(gres_ss->gres_bit_alloc[i]); |
| } |
| } |
| if (new_gres_ss->gres_per_bit_alloc && gres_ss->gres_bit_alloc) { |
| new_gres_ss->gres_per_bit_alloc = xcalloc(gres_ss->node_cnt, |
| sizeof(uint64_t *)); |
| for (i = 0; i < gres_ss->node_cnt; i++) { |
| int bit_cnt = bit_size(gres_ss->gres_bit_alloc[i]); |
| new_gres_ss->gres_per_bit_alloc[i] = xcalloc( |
| bit_cnt, sizeof(uint64_t)); |
| memcpy(new_gres_ss->gres_per_bit_alloc[i], |
| gres_ss->gres_per_bit_alloc[i], |
| bit_cnt * sizeof(uint64_t)); |
| } |
| } |
| return new_gres_ss; |
| } |
| |
| static void *_step_state_dup2(gres_step_state_t *gres_ss, int job_node_index) |
| { |
| gres_step_state_t *new_gres_ss; |
| |
| xassert(gres_ss); |
| new_gres_ss = xmalloc(sizeof(gres_step_state_t)); |
| new_gres_ss->cpus_per_gres = gres_ss->cpus_per_gres; |
| new_gres_ss->gres_per_step = gres_ss->gres_per_step; |
| new_gres_ss->gres_per_node = gres_ss->gres_per_node; |
| new_gres_ss->gres_per_socket = gres_ss->gres_per_socket; |
| new_gres_ss->gres_per_task = gres_ss->gres_per_task; |
| new_gres_ss->mem_per_gres = gres_ss->mem_per_gres; |
| new_gres_ss->node_cnt = 1; |
| new_gres_ss->total_gres = gres_ss->total_gres; |
| |
| if (gres_ss->node_in_use) |
| new_gres_ss->node_in_use = bit_copy(gres_ss->node_in_use); |
| |
| if (gres_ss->gres_cnt_node_alloc) { |
| new_gres_ss->gres_cnt_node_alloc = xmalloc(sizeof(uint64_t)); |
| new_gres_ss->gres_cnt_node_alloc[0] = |
| gres_ss->gres_cnt_node_alloc[job_node_index]; |
| } |
| |
| if ((job_node_index < gres_ss->node_cnt) && gres_ss->gres_bit_alloc && |
| gres_ss->gres_bit_alloc[job_node_index]) { |
| new_gres_ss->gres_bit_alloc = xmalloc(sizeof(bitstr_t *)); |
| new_gres_ss->gres_bit_alloc[0] = |
| bit_copy(gres_ss->gres_bit_alloc[job_node_index]); |
| } |
| if (gres_ss->gres_per_bit_alloc && |
| (job_node_index < gres_ss->node_cnt) && gres_ss->gres_bit_alloc && |
| gres_ss->gres_bit_alloc[job_node_index]) { |
| int bit_cnt = bit_size(gres_ss->gres_bit_alloc[job_node_index]); |
| new_gres_ss->gres_per_bit_alloc = xmalloc(sizeof(uint64_t *)); |
| new_gres_ss->gres_per_bit_alloc[0] = xcalloc(bit_cnt, |
| sizeof(uint64_t)); |
| memcpy(new_gres_ss->gres_per_bit_alloc[0], |
| gres_ss->gres_per_bit_alloc[job_node_index], |
| bit_cnt * sizeof(uint64_t)); |
| } |
| return new_gres_ss; |
| } |
| |
| /* |
| * Create a copy of a step's gres state |
| * IN gres_list - List of Gres records for this step to track usage |
| * RET The copy or NULL on failure |
| */ |
| list_t *gres_step_state_list_dup(list_t *gres_list) |
| { |
| return gres_step_state_extract(gres_list, -1); |
| } |
| |
| static int _foreach_step_state_extract(void *x, void *arg) |
| { |
| gres_state_t *gres_state_step = x; |
| foreach_state_list_dup_t *foreach_state_list_dup = arg; |
| gres_state_t *new_gres_state_step; |
| void *new_gres_data; |
| |
| if (foreach_state_list_dup->job_node_index == -1) |
| new_gres_data = _step_state_dup(gres_state_step->gres_data); |
| else |
| new_gres_data = _step_state_dup2( |
| gres_state_step->gres_data, |
| foreach_state_list_dup->job_node_index); |
| |
| if (!foreach_state_list_dup->new_gres_list) |
| foreach_state_list_dup->new_gres_list = |
| list_create(gres_step_list_delete); |
| |
| new_gres_state_step = gres_create_state( |
| gres_state_step, GRES_STATE_SRC_STATE_PTR, |
| GRES_STATE_TYPE_STEP, new_gres_data); |
| list_append(foreach_state_list_dup->new_gres_list, new_gres_state_step); |
| |
| return 0; |
| } |
| |
| /* |
| * Create a copy of a step's gres state for a particular node index |
| * IN gres_list - List of Gres records for this step to track usage |
| * IN job_node_index - zero-origin index to the node |
| * RET The copy or NULL on failure |
| */ |
| list_t *gres_step_state_extract(list_t *gres_list, int job_node_index) |
| { |
| foreach_state_list_dup_t foreach_state_list_dup = { |
| .job_node_index = job_node_index, |
| }; |
| |
| if (gres_list) |
| (void) list_for_each(gres_list, _foreach_step_state_extract, |
| &foreach_state_list_dup); |
| |
| return foreach_state_list_dup.new_gres_list; |
| } |
| |
| /* |
| * Pack a step's current gres status, called from slurmctld for save/restore |
| * IN gres_list - generated by gres_stepmgr_step_alloc() |
| * IN/OUT buffer - location to write state to |
| * IN step_id - job and step ID for logging |
| */ |
| extern int gres_step_state_pack(list_t *gres_list, buf_t *buffer, |
| slurm_step_id_t *step_id, |
| uint16_t protocol_version) |
| { |
| pack_state_t pack_state = { |
| .buffer = buffer, |
| .magic = GRES_MAGIC, |
| .protocol_version = protocol_version, |
| }; |
| |
| return _pack_state(gres_list, &pack_state, _foreach_step_state_pack); |
| } |
| |
| /* |
| * Unpack a step's current gres status, called from slurmctld for save/restore |
| * OUT gres_list - restored state stored by gres_step_state_pack() |
| * IN/OUT buffer - location to read state from |
| * IN step_id - job and step ID for logging |
| */ |
| extern int gres_step_state_unpack(list_t **gres_list, buf_t *buffer, |
| slurm_step_id_t *step_id, |
| uint16_t protocol_version) |
| { |
| int i, rc = SLURM_SUCCESS; |
| uint32_t magic = 0, plugin_id = 0, uint32_tmp = 0; |
| uint16_t rec_cnt = 0; |
| uint8_t data_flag = 0; |
| gres_state_t *gres_state_step; |
| gres_step_state_t *gres_ss = NULL; |
| bool locked = false; |
| |
| safe_unpack16(&rec_cnt, buffer); |
| if (rec_cnt == 0) |
| return SLURM_SUCCESS; |
| |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| locked = true; |
| if ((gres_context_cnt > 0) && (*gres_list == NULL)) { |
| *gres_list = list_create(gres_step_list_delete); |
| } |
| |
| while ((rc == SLURM_SUCCESS) && (rec_cnt)) { |
| slurm_gres_context_t *gres_ctx; |
| if ((buffer == NULL) || (remaining_buf(buffer) == 0)) |
| break; |
| rec_cnt--; |
| if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { |
| safe_unpack32(&magic, buffer); |
| if (magic != GRES_MAGIC) |
| goto unpack_error; |
| safe_unpack32(&plugin_id, buffer); |
| gres_ss = xmalloc(sizeof(gres_step_state_t)); |
| safe_unpack16(&gres_ss->cpus_per_gres, buffer); |
| safe_unpack16(&gres_ss->flags, buffer); |
| safe_unpack64(&gres_ss->gres_per_step, buffer); |
| safe_unpack64(&gres_ss->gres_per_node, buffer); |
| safe_unpack64(&gres_ss->gres_per_socket, buffer); |
| safe_unpack64(&gres_ss->gres_per_task, buffer); |
| safe_unpack64(&gres_ss->mem_per_gres, buffer); |
| safe_unpack64(&gres_ss->total_gres, buffer); |
| safe_unpackstr(&gres_ss->type_name, buffer); |
| gres_ss->type_id = gres_build_id(gres_ss->type_name); |
| safe_unpack32(&gres_ss->node_cnt, buffer); |
| if (gres_ss->node_cnt > NO_VAL) |
| goto unpack_error; |
| unpack_bit_str_hex(&gres_ss->node_in_use, buffer); |
| safe_unpack8(&data_flag, buffer); |
| if (data_flag) { |
| safe_unpack64_array( |
| &gres_ss->gres_cnt_node_alloc, |
| &uint32_tmp, buffer); |
| } |
| safe_unpack8(&data_flag, buffer); |
| if (data_flag) { |
| gres_ss->gres_bit_alloc = |
| xcalloc(gres_ss->node_cnt, |
| sizeof(bitstr_t *)); |
| for (i = 0; i < gres_ss->node_cnt; i++) { |
| unpack_bit_str_hex(&gres_ss-> |
| gres_bit_alloc[i], |
| buffer); |
| } |
| } |
| for (i = 0; i < gres_ss->node_cnt; i++) { |
| safe_unpack8(&data_flag, buffer); |
| if (!data_flag) |
| continue; |
| if (!gres_ss->gres_per_bit_alloc) |
| safe_xcalloc( |
| gres_ss->gres_per_bit_alloc, |
| gres_ss->node_cnt, |
| sizeof(uint64_t *)); |
| safe_unpack64_array( |
| &gres_ss->gres_per_bit_alloc[i], |
| &uint32_tmp, buffer); |
| } |
| } else { |
| error("%s: protocol_version %hu not supported", |
| __func__, protocol_version); |
| goto unpack_error; |
| } |
| |
| if (!(gres_ctx = _find_context_by_id(plugin_id))) { |
| /* |
| * A likely sign that GresPlugins has changed. |
| * Not a fatal error, skip over the data. |
| */ |
| info("%s: no plugin configured to unpack data type %u from %ps", |
| __func__, plugin_id, step_id); |
| _step_state_delete(gres_ss); |
| gres_ss = NULL; |
| continue; |
| } |
| gres_state_step = gres_create_state( |
| gres_ctx, GRES_STATE_SRC_CONTEXT_PTR, |
| GRES_STATE_TYPE_STEP, gres_ss); |
| gres_ss = NULL; |
| list_append(*gres_list, gres_state_step); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| return rc; |
| |
| unpack_error: |
| error("%s: unpack error from %ps", __func__, step_id); |
| if (gres_ss) |
| _step_state_delete(gres_ss); |
| if (locked) |
| slurm_mutex_unlock(&gres_context_lock); |
| return SLURM_ERROR; |
| } |
| |
| static int _foreach_step_count(void *x, void *arg) |
| { |
| gres_state_t *gres_state_step = x; |
| foreach_gres_list_cnt_t *foreach_gres_list_cnt = arg; |
| gres_step_state_t *gres_ss = gres_state_step->gres_data; |
| |
| if (gres_state_step->plugin_id != foreach_gres_list_cnt->plugin_id) |
| return 0; |
| |
| /* gres_cnt_node_alloc has one element in slurmstepd */ |
| if (foreach_gres_list_cnt->gres_cnt == NO_VAL64) |
| foreach_gres_list_cnt->gres_cnt = |
| gres_ss->gres_cnt_node_alloc[0]; |
| else |
| foreach_gres_list_cnt->gres_cnt += |
| gres_ss->gres_cnt_node_alloc[0]; |
| return 0; |
| } |
| |
| /* Return the count of GRES of a specific name on this machine |
| * IN step_gres_list - generated by gres_stepmgr_step_alloc() |
| * IN gres_name - name of the GRES to match |
| * RET count of GRES of this specific name available to the job or NO_VAL64 |
| */ |
| extern uint64_t gres_step_count(list_t *step_gres_list, char *gres_name) |
| { |
| foreach_gres_list_cnt_t foreach_gres_list_cnt = { |
| .gres_cnt = NO_VAL64, |
| }; |
| |
| if (step_gres_list) { |
| slurm_mutex_lock(&gres_context_lock); |
| for (int i = 0; i < gres_context_cnt; i++) { |
| if (!xstrcmp(gres_context[i].gres_name, gres_name)) { |
| foreach_gres_list_cnt.plugin_id = |
| gres_context[i].plugin_id; |
| (void) list_for_each(step_gres_list, |
| _foreach_step_count, |
| &foreach_gres_list_cnt); |
| break; |
| } |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| return foreach_gres_list_cnt.gres_cnt; |
| } |
| |
| /* |
| * Here we convert usable_gres from a mask just for the gres in the allocation |
| * to one for the gres on the node. Essentially putting in a '0' for gres not |
| * in the allocation |
| * |
| * IN/OUT - usable_gres |
| * IN - gres_bit_alloc |
| */ |
| static void _translate_step_to_global_device_index(bitstr_t **usable_gres, |
| bitstr_t *gres_bit_alloc) |
| { |
| bitstr_t *tmp = bit_alloc(bit_size(gres_bit_alloc)); |
| int i_last, bit, bit2 = 0; |
| |
| i_last = bit_fls(gres_bit_alloc); |
| for (bit = 0; bit <= i_last; bit++) { |
| if (bit_test(gres_bit_alloc, bit)) { |
| if (bit_test(*usable_gres, bit2)) { |
| bit_set(tmp, bit); |
| } |
| bit2++; |
| } |
| } |
| FREE_NULL_BITMAP(*usable_gres); |
| *usable_gres = tmp; |
| } |
| |
| bitstr_t *cpu_set_to_bit_str(cpu_set_t *cpu_set, int cpu_count) |
| { |
| bitstr_t *cpu_bitstr = bit_alloc(cpu_count); |
| |
| if (cpu_set) { |
| for (int i = 0; i < cpu_count; i++) |
| if (CPU_ISSET(i, cpu_set)) |
| bit_set(cpu_bitstr, i); |
| } else { |
| bit_set_all(cpu_bitstr); |
| } |
| |
| return cpu_bitstr; |
| |
| } |
| |
| static int _foreach_closest_usable_gres(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| foreach_closest_usable_gres_t *foreach_closest_usable_gres = arg; |
| |
| if (gres_slurmd_conf->plugin_id != |
| foreach_closest_usable_gres->plugin_id) |
| return 0; |
| if ((foreach_closest_usable_gres->gres_inx + gres_slurmd_conf->count) > |
| foreach_closest_usable_gres->bitmap_size) { |
| error("GRES %s bitmap overflow ((%d + %"PRIu64") > %d)", |
| gres_slurmd_conf->name, |
| foreach_closest_usable_gres->gres_inx, |
| gres_slurmd_conf->count, |
| foreach_closest_usable_gres->bitmap_size); |
| return 0; |
| } |
| if (!gres_slurmd_conf->cpus_bitmap || |
| bit_overlap_any(gres_slurmd_conf->cpus_bitmap, |
| foreach_closest_usable_gres->task_cpus_bitmap)) { |
| bit_nset(foreach_closest_usable_gres->usable_gres, |
| foreach_closest_usable_gres->gres_inx, |
| foreach_closest_usable_gres->gres_inx + |
| gres_slurmd_conf->count - 1); |
| } |
| foreach_closest_usable_gres->gres_inx += gres_slurmd_conf->count; |
| |
| return 0; |
| } |
| |
| /* |
| * Given a GRES context index, return a bitmap representing those GRES |
| * which are available from the CPUs current allocated to this process. |
| * This function only works with task/cgroup and constrained devices or |
| * if the job step has access to the entire node's resources. |
| */ |
| static bitstr_t *_get_closest_usable_gres(uint32_t plugin_id, |
| bitstr_t *gres_bit_alloc, |
| cpu_set_t *task_cpu_set) |
| { |
| foreach_closest_usable_gres_t foreach_closest_usable_gres = { |
| .gres_inx = 0, |
| .plugin_id = plugin_id, |
| }; |
| |
| if (!gres_conf_list) { |
| error("gres_conf_list is null!"); |
| return NULL; |
| } |
| |
| foreach_closest_usable_gres.task_cpus_bitmap = cpu_set_to_bit_str( |
| task_cpu_set, |
| ((gres_slurmd_conf_t *)list_peek(gres_conf_list))->cpu_cnt); |
| foreach_closest_usable_gres.bitmap_size = bit_size(gres_bit_alloc); |
| foreach_closest_usable_gres.usable_gres = |
| bit_alloc(foreach_closest_usable_gres.bitmap_size); |
| |
| (void) list_for_each(gres_conf_list, _foreach_closest_usable_gres, |
| &foreach_closest_usable_gres); |
| |
| FREE_NULL_BITMAP(foreach_closest_usable_gres.task_cpus_bitmap); |
| |
| bit_and(foreach_closest_usable_gres.usable_gres, gres_bit_alloc); |
| |
| return foreach_closest_usable_gres.usable_gres; |
| } |
| |
| static int _foreach_gres_to_task(void *x, void *arg) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf = x; |
| foreach_gres_to_task_t *foreach_gres_to_task = arg; |
| int start, end; |
| |
| if (gres_slurmd_conf->plugin_id != foreach_gres_to_task->plugin_id) |
| return 0; |
| |
| start = foreach_gres_to_task->gres_inx * |
| foreach_gres_to_task->ntasks_per_gres; |
| foreach_gres_to_task->gres_inx += gres_slurmd_conf->count; |
| end = foreach_gres_to_task->gres_inx * |
| foreach_gres_to_task->ntasks_per_gres; |
| |
| if (!bit_set_count_range(foreach_gres_to_task->gres_slots, start, end)) |
| return 0; |
| |
| if (gres_slurmd_conf->cpus_bitmap) { |
| if (bit_super_set(foreach_gres_to_task->task_cpus_bitmap, |
| gres_slurmd_conf->cpus_bitmap)) { |
| foreach_gres_to_task->best_slot = bit_ffs_from_bit( |
| foreach_gres_to_task->gres_slots, start); |
| return -1; |
| } |
| |
| if (foreach_gres_to_task->overlap) |
| return 0; |
| |
| if (bit_overlap_any(foreach_gres_to_task->task_cpus_bitmap, |
| gres_slurmd_conf->cpus_bitmap)) { |
| foreach_gres_to_task->best_slot = bit_ffs_from_bit( |
| foreach_gres_to_task->gres_slots, start); |
| foreach_gres_to_task->overlap = true; |
| return 0; |
| } |
| } |
| |
| if (foreach_gres_to_task->best_slot == -1) |
| foreach_gres_to_task->best_slot = bit_ffs_from_bit( |
| foreach_gres_to_task->gres_slots, start); |
| |
| return 0; |
| } |
| |
| /* Select the best available gres from gres_slots */ |
| static int _assign_gres_to_task(cpu_set_t *task_cpu_set, int ntasks_per_gres, |
| bitstr_t *gres_slots, uint32_t plugin_id) |
| { |
| foreach_gres_to_task_t foreach_gres_to_task = { |
| .best_slot = -1, |
| .gres_inx = 0, |
| .gres_slots = gres_slots, |
| .ntasks_per_gres = ntasks_per_gres, |
| .overlap = false, |
| .plugin_id = plugin_id, |
| .task_cpus_bitmap = cpu_set_to_bit_str( |
| task_cpu_set, |
| ((gres_slurmd_conf_t *)list_peek(gres_conf_list))-> |
| cpu_cnt), |
| }; |
| |
| (void) list_for_each(gres_conf_list, _foreach_gres_to_task, |
| &foreach_gres_to_task); |
| FREE_NULL_BITMAP(foreach_gres_to_task.task_cpus_bitmap); |
| |
| if (foreach_gres_to_task.best_slot != -1) { |
| bit_clear(foreach_gres_to_task.gres_slots, |
| foreach_gres_to_task.best_slot); |
| return (foreach_gres_to_task.best_slot / |
| foreach_gres_to_task.ntasks_per_gres); |
| } else { |
| log_flag(GRES, "%s Can't find free slot", __func__); |
| return -1; |
| } |
| } |
| |
| /* |
| * Given the cpu affinity of all tasks, return a bitmap binding a single gres to |
| * this task. |
| */ |
| static bitstr_t *_get_single_usable_gres(int context_inx, |
| int ntasks_per_gres, |
| int local_proc_id, |
| stepd_step_rec_t *step, |
| bitstr_t *gres_bit_alloc) |
| { |
| int idx = 0; |
| bitstr_t *usable_gres = NULL; |
| bitstr_t *gres_slots = NULL; |
| int32_t gres_count = bit_set_count(gres_bit_alloc); |
| |
| |
| /* No need to select gres if there is only 1 to use */ |
| if (gres_count <= 1) { |
| log_flag(GRES, "%s: (task %d) No need to select single gres since count is 0 or 1", |
| __func__, local_proc_id); |
| return bit_copy(gres_bit_alloc); |
| } |
| |
| /* |
| * Create bitmap called gres_slots. This represents the available slots |
| * for tasks on that gres based off of ntasks_per_gres and if that gres |
| * is allocated to the step. |
| */ |
| if (ntasks_per_gres == 1) |
| gres_slots = bit_copy(gres_bit_alloc); |
| else { |
| gres_slots = bit_alloc(bit_size(gres_bit_alloc) * |
| ntasks_per_gres); |
| for (int i = -1; |
| (i = bit_ffs_from_bit(gres_bit_alloc, i + 1)) >= 0;) { |
| bit_nset(gres_slots, i * ntasks_per_gres, |
| (((i + 1) * ntasks_per_gres) - 1)); |
| } |
| } |
| |
| /* |
| * To ensure no task gets more than ntasks_per_gres, here we one by one, |
| * select an available gres_slot for each task and clear a gres_slot. |
| * Once we reach the current task we can take the gres assignment and |
| * quit the loop |
| */ |
| for (int i = 0; i <= local_proc_id; i++) { |
| idx = _assign_gres_to_task(step->task[i]->cpu_set, |
| ntasks_per_gres, gres_slots, |
| gres_context[context_inx].plugin_id); |
| } |
| FREE_NULL_BITMAP(gres_slots); |
| |
| /* Return a bitmap with this as the only usable GRES */ |
| usable_gres = bit_alloc(bit_size(gres_bit_alloc)); |
| if (idx < 0) { |
| int n; |
| error("%s Can't find free slot for local_proc_id = %d, continue using block distribution", |
| __func__, local_proc_id); |
| n = local_proc_id % gres_count; |
| idx = bit_get_bit_num(gres_bit_alloc, n); |
| } |
| |
| bit_set(usable_gres, idx); |
| |
| if (slurm_conf.debug_flags & DEBUG_FLAG_GRES){ |
| char *usable_gres_str = bit_fmt_hexmask_trim(usable_gres); |
| log_flag(GRES, "%s: local_proc_id = %d; usable_gres: %s", |
| __func__, local_proc_id, usable_gres_str); |
| xfree(usable_gres_str); |
| } |
| |
| return usable_gres; |
| } |
| |
| /* |
| * Configure the GRES hardware allocated to the current step while privileged |
| * |
| * IN step_gres_list - Step's GRES specification |
| * IN node_id - relative position of this node in step |
| * IN settings - string containing configuration settings for the hardware |
| */ |
| extern void gres_g_step_hardware_init(list_t *step_gres_list, |
| uint32_t node_id, char *settings) |
| { |
| int i; |
| gres_state_t *gres_state_step; |
| gres_step_state_t *gres_ss; |
| bitstr_t *devices; |
| |
| if (!step_gres_list) |
| return; |
| |
| xassert(gres_context_cnt >= 0); |
| slurm_mutex_lock(&gres_context_lock); |
| for (i = 0; i < gres_context_cnt; i++) { |
| if (gres_context[i].ops.step_hardware_init == NULL) |
| continue; |
| |
| gres_state_step = list_find_first(step_gres_list, gres_find_id, |
| &gres_context[i].plugin_id); |
| if (!gres_state_step || !gres_state_step->gres_data) |
| continue; |
| gres_ss = (gres_step_state_t *) gres_state_step->gres_data; |
| if ((gres_ss->node_cnt != 1) || |
| !gres_ss->gres_bit_alloc || |
| !gres_ss->gres_bit_alloc[0]) |
| continue; |
| |
| devices = gres_ss->gres_bit_alloc[0]; |
| if (settings) |
| debug2("settings: %s", settings); |
| (*(gres_context[i].ops.step_hardware_init))(devices, settings); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| /* |
| * Optionally undo GRES hardware configuration while privileged |
| */ |
| extern void gres_g_step_hardware_fini(void) |
| { |
| int i; |
| xassert(gres_context_cnt >= 0); |
| slurm_mutex_lock(&gres_context_lock); |
| for (i = 0; i < gres_context_cnt; i++) { |
| if (gres_context[i].ops.step_hardware_fini == NULL) { |
| continue; |
| } |
| (*(gres_context[i].ops.step_hardware_fini)) (); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| /* |
| * Given a set of GRES masks or maps and the local process ID, return the bitmap |
| * of GRES that should be available to this task. |
| * |
| * IN map_or_mask |
| * IN local_proc_id |
| * IN gres_bit_alloc |
| * IN is_map |
| * IN get_devices |
| * |
| * RET usable_gres |
| */ |
| static bitstr_t *_get_usable_gres_map_or_mask(char *map_or_mask, |
| int local_proc_id, |
| bitstr_t *gres_bit_alloc, |
| bool is_map, |
| bool get_devices) |
| { |
| bitstr_t *usable_gres = NULL; |
| char *tmp, *tok, *save_ptr = NULL, *mult; |
| int i, task_offset = 0, task_mult, bitmap_size; |
| int value, min, max; |
| |
| if (!map_or_mask || !map_or_mask[0]) |
| return NULL; |
| |
| bitmap_size = bit_size(gres_bit_alloc); |
| min = (is_map ? 0 : 1); |
| max = (is_map ? bitmap_size - 1 : ~(-1 << bitmap_size)); |
| while (usable_gres == NULL) { |
| tmp = xstrdup(map_or_mask); |
| strtok(tmp,"+"); |
| tok = strtok_r(tmp, ",", &save_ptr); |
| while (tok) { |
| if ((mult = strchr(tok, '*'))) |
| task_mult = atoi(mult + 1); |
| else |
| task_mult = 1; |
| if (task_mult == 0) { |
| error("Repetition count of 0 not allowed in gres binding mask, using 1 instead"); |
| task_mult = 1; |
| } |
| if ((local_proc_id >= task_offset) && |
| (local_proc_id <= (task_offset + task_mult - 1))) { |
| value = strtol(tok, NULL, 0); |
| usable_gres = bit_alloc(bitmap_size); |
| if ((value < min) || (value > max)) { |
| error("Invalid map or mask value specified."); |
| xfree(tmp); |
| goto end; /* Bad value */ |
| } |
| if (is_map) |
| bit_set(usable_gres, value); |
| else |
| for (i = 0; i < bitmap_size; i++) { |
| if ((value >> i) & 0x1) |
| bit_set(usable_gres, i); |
| } |
| break; /* All done */ |
| } else { |
| task_offset += task_mult; |
| } |
| tok = strtok_r(NULL, ",", &save_ptr); |
| } |
| xfree(tmp); |
| } |
| |
| end: |
| if (gres_use_local_device_index()) { |
| if (get_devices) |
| _translate_step_to_global_device_index( |
| &usable_gres, gres_bit_alloc); |
| else{ |
| bit_consolidate(usable_gres); |
| } |
| } else { |
| bit_and(usable_gres, gres_bit_alloc); |
| } |
| |
| return usable_gres; |
| } |
| |
| static void _accumulate_step_gres_alloc(gres_state_t *gres_state_step, |
| bitstr_t **gres_bit_alloc, |
| uint64_t *gres_cnt, |
| uint64_t **gres_per_bit) |
| { |
| gres_step_state_t *gres_ss = |
| (gres_step_state_t *)gres_state_step->gres_data; |
| |
| /* Since this should only run on the node node_cnt should always be 1 */ |
| if (gres_ss->node_cnt != 1) { |
| error("gres_step_state_t node count not 1 while on node. This should never happen"); |
| return; |
| } |
| |
| if (gres_ss->gres_bit_alloc && |
| gres_ss->gres_bit_alloc[0]) { |
| if (!*gres_bit_alloc) { |
| *gres_bit_alloc = bit_alloc( |
| bit_size(gres_ss->gres_bit_alloc[0])); |
| } |
| bit_or(*gres_bit_alloc, gres_ss->gres_bit_alloc[0]); |
| } |
| if (gres_cnt && gres_ss->gres_cnt_node_alloc) |
| *gres_cnt += gres_ss->gres_cnt_node_alloc[0]; |
| if (gres_per_bit && |
| gres_ss->gres_per_bit_alloc && |
| gres_ss->gres_per_bit_alloc[0] && |
| gres_ss->gres_bit_alloc && |
| gres_ss->gres_bit_alloc[0]) { |
| if (!*gres_per_bit) |
| *gres_per_bit = xcalloc( |
| bit_size(gres_ss->gres_bit_alloc[0]), |
| sizeof(uint64_t)); |
| for (int i = 0; i < bit_size(gres_ss->gres_bit_alloc[0]); i++) { |
| (*gres_per_bit)[i] += gres_ss->gres_per_bit_alloc[0][i]; |
| } |
| } |
| } |
| |
| static void _filter_gres_per_task(bitstr_t *test_gres, |
| bitstr_t *gres_bit_avail, |
| bitstr_t *usable_gres, |
| uint64_t *gres_needed, |
| bool set_usable_gres) |
| { |
| for (int bit = 0; |
| *gres_needed && (bit = bit_ffs_from_bit(test_gres, bit)) >= 0; |
| bit++) { |
| (*gres_needed)--; |
| bit_clear(gres_bit_avail, bit); |
| if (set_usable_gres) |
| bit_set(usable_gres, bit); |
| } |
| } |
| |
| /* |
| * Given a required gres_per_task count, determine which gres should be assigned |
| * to this task. Prefer gres with cpu affinity that match the task. |
| * |
| * RET usable_gres |
| */ |
| static bitstr_t *_get_gres_per_task(bitstr_t *gres_bit_alloc, |
| uint64_t gres_per_task, |
| stepd_step_rec_t *step, |
| uint32_t plugin_id, |
| int local_proc_id) |
| { |
| uint64_t gres_needed; |
| bitstr_t *usable_gres, *gres_bit_avail; |
| |
| usable_gres = bit_alloc(bit_size(gres_bit_alloc)); |
| gres_bit_avail = bit_copy(gres_bit_alloc); |
| |
| /* |
| * We must determine what the previous tasks are taking first to know |
| * which gres are available to be assigned to this task. |
| */ |
| for (int i = 0; i <= local_proc_id; i++) { |
| gres_needed = gres_per_task; |
| |
| /* First: Try to select device with with cpu affinity */ |
| if (gres_needed) { |
| bitstr_t *closest_gres = _get_closest_usable_gres( |
| plugin_id, gres_bit_avail, |
| step->task[i]->cpu_set); |
| _filter_gres_per_task(closest_gres, gres_bit_avail, |
| usable_gres, &gres_needed, |
| (i == local_proc_id)); |
| FREE_NULL_BITMAP(closest_gres); |
| } |
| |
| /* Second: Select any available device */ |
| if (gres_needed) |
| _filter_gres_per_task(gres_bit_avail, gres_bit_avail, |
| usable_gres, &gres_needed, |
| (i == local_proc_id)); |
| |
| if (gres_needed) { |
| error("Not enough gres to bind %"PRIu64" per task", |
| gres_per_task); |
| break; |
| } |
| } |
| FREE_NULL_BITMAP(gres_bit_avail); |
| return usable_gres; |
| } |
| |
| static void _filter_shared_gres_per_task(bitstr_t *test_gres, |
| bitstr_t *usable_gres, |
| uint64_t *gres_per_bit_avail, |
| uint64_t *gres_needed, |
| bool use_single_dev, |
| bool set_usable_gres) |
| { |
| for (int bit = 0; |
| *gres_needed && (bit = bit_ffs_from_bit(test_gres, bit)) >= 0; |
| bit++) { |
| uint64_t dec = MIN(gres_per_bit_avail[bit], *gres_needed); |
| |
| if (dec < (use_single_dev ? *gres_needed : 1)) |
| continue; |
| |
| gres_per_bit_avail[bit] -= dec; |
| *gres_needed -= dec; |
| |
| if (set_usable_gres) |
| bit_set(usable_gres, bit); |
| } |
| } |
| |
| /* |
| * Given a required gres_per_task count, determine which shared gres should be |
| * assigned to this task. Prefer gres with core affinity that match the task |
| * and prefer allocating shared gres belonging to a single device if possible. |
| */ |
| static bitstr_t *_get_shared_gres_per_task(bitstr_t *gres_bit_alloc, |
| uint64_t *gres_per_bit, |
| uint64_t gres_per_task, |
| stepd_step_rec_t *step, |
| uint32_t sharing_plugin_id, |
| int local_proc_id) |
| { |
| uint64_t gres_needed; |
| bitstr_t *usable_gres, *closest_gres; |
| uint64_t *gres_per_bit_avail; |
| |
| usable_gres = bit_alloc(bit_size(gres_bit_alloc)); |
| gres_per_bit_avail = xcalloc(bit_size(gres_bit_alloc), |
| sizeof(uint64_t)); |
| memcpy(gres_per_bit_avail, gres_per_bit, |
| bit_size(gres_bit_alloc) * sizeof(uint64_t)); |
| |
| /* |
| * We must determine what the previous tasks are taking first to know |
| * which gres are available to be assigned to this task. |
| */ |
| for (int i = 0; i <= local_proc_id; i++) { |
| closest_gres = _get_closest_usable_gres(sharing_plugin_id, |
| gres_bit_alloc, |
| step->task[i]->cpu_set); |
| |
| gres_needed = gres_per_task; |
| |
| /* |
| * Compare this selection priority with _set_shared_task_bits() |
| * in gres_select_filter.c |
| * |
| * First: Get a single device with core affinity with sufficient |
| * available shared gres. |
| * Second: Get a single device with sufficient available shared |
| * gres |
| * Third: Get devices with core affinity with any available |
| * shared gres |
| * Fourth: Get devices with any available shared gres |
| */ |
| if (gres_needed) |
| _filter_shared_gres_per_task(closest_gres, usable_gres, |
| gres_per_bit_avail, |
| &gres_needed, true, |
| (i == local_proc_id)); |
| if (gres_needed) |
| _filter_shared_gres_per_task(gres_bit_alloc, |
| usable_gres, |
| gres_per_bit_avail, |
| &gres_needed, true, |
| (i == local_proc_id)); |
| if (gres_needed) |
| _filter_shared_gres_per_task(closest_gres, usable_gres, |
| gres_per_bit_avail, |
| &gres_needed, false, |
| (i == local_proc_id)); |
| if (gres_needed) |
| _filter_shared_gres_per_task(gres_bit_alloc, |
| usable_gres, |
| gres_per_bit_avail, |
| &gres_needed, false, |
| (i == local_proc_id)); |
| FREE_NULL_BITMAP(closest_gres); |
| if (gres_needed) { |
| error("Not enough shared gres to bind %"PRIu64" per task", |
| gres_per_task); |
| break; |
| } |
| } |
| xfree(gres_per_bit_avail); |
| return usable_gres; |
| } |
| |
| /* Convert old binding options to current gres binding format |
| * |
| * IN accel_bind_type - GRES binding options (old format, a bitmap) |
| * IN/OUT tres_bind_str - TRES binding directives (new format, a string) |
| */ |
| static void _parse_accel_bind_type(uint16_t accel_bind_type, char *tres_bind_str) |
| { |
| if (accel_bind_type & ACCEL_BIND_CLOSEST_GPU) { |
| xstrfmtcat(tres_bind_str, "%sgres/gpu:closest", |
| tres_bind_str ? "+" : ""); |
| } |
| if (accel_bind_type & ACCEL_BIND_CLOSEST_NIC) { |
| xstrfmtcat(tres_bind_str, "%sgres/nic:closest", |
| tres_bind_str ? "+" : ""); |
| } |
| } |
| |
| static int _get_usable_gres(int context_inx, int proc_id, |
| char *tres_bind_str, bitstr_t **usable_gres_ptr, |
| bitstr_t *gres_bit_alloc, bool get_devices, |
| stepd_step_rec_t *step, uint64_t *gres_per_bit, |
| gres_internal_flags_t *flags) |
| { |
| char *tres_name = NULL, *sep; |
| bitstr_t *usable_gres = NULL; |
| uint32_t plugin_id = gres_context[context_inx].plugin_id; |
| *usable_gres_ptr = NULL; |
| |
| if (!gres_bit_alloc || !tres_bind_str) |
| return SLURM_SUCCESS; |
| |
| tres_name = xstrdup_printf("gres/%s:", |
| gres_context[context_inx].gres_name); |
| sep = xstrstr(tres_bind_str, tres_name); |
| if (!sep) { |
| xfree(tres_name); |
| return SLURM_SUCCESS; |
| } |
| sep += strlen(tres_name); |
| xfree(tres_name); |
| |
| if (!xstrncasecmp(sep, "verbose,", 8)){ |
| sep += 8; |
| if (flags) |
| *flags |= GRES_INTERNAL_FLAG_VERBOSE; |
| } |
| |
| if (step->flags & LAUNCH_GRES_ALLOW_TASK_SHARING) { |
| if (get_devices) |
| return SLURM_SUCCESS; |
| /* |
| * Overwrite device index setting to use the global node/job GRES |
| * index, rather than the index local to the task. This ensures |
| * that the GRES environment variable is set correctly on the |
| * task when multiple devices are constrained to the task, and |
| * only the environment variables are bound to specific GRES. |
| */ |
| use_local_index = false; |
| dev_index_mode_set = true; |
| |
| /* |
| * Consolidate allocated gres bitstring so that we get the GRES |
| * device index of the GRES within the context of the job, and |
| * not within the context of the whole node, unless specifically |
| * required with the GRES_CONF_GLOBAL_INDEX flag. |
| */ |
| if (!(gres_context[context_inx].config_flags & |
| GRES_CONF_GLOBAL_INDEX)) |
| bit_consolidate(gres_bit_alloc); |
| } |
| |
| if (gres_context[context_inx].config_flags & GRES_CONF_GLOBAL_INDEX) { |
| use_local_index = false; |
| dev_index_mode_set = true; |
| } |
| |
| if (!gres_id_shared(gres_context[context_inx].config_flags)) { |
| if (!xstrncasecmp(sep, "map_gpu:", 8)) { // Old Syntax |
| usable_gres = _get_usable_gres_map_or_mask( |
| (sep + 8), proc_id, gres_bit_alloc, |
| true, get_devices); |
| } else if (!xstrncasecmp(sep, "mask_gpu:", 9)) { // Old Syntax |
| usable_gres = _get_usable_gres_map_or_mask( |
| (sep + 9), proc_id, gres_bit_alloc, |
| false, get_devices); |
| } else if (!xstrncasecmp(sep, "map:", 4)) { |
| usable_gres = _get_usable_gres_map_or_mask( |
| (sep + 4), proc_id, gres_bit_alloc, |
| true, get_devices); |
| } else if (!xstrncasecmp(sep, "mask:", 5)) { |
| usable_gres = _get_usable_gres_map_or_mask( |
| (sep + 5), proc_id, gres_bit_alloc, |
| false, get_devices); |
| } else if (!xstrncasecmp(sep, "single:", 7)) { |
| if (!get_devices && gres_use_local_device_index()) { |
| usable_gres = bit_alloc( |
| bit_size(gres_bit_alloc)); |
| bit_set(usable_gres, 0); |
| } else { |
| usable_gres = _get_single_usable_gres( |
| context_inx, slurm_atoul(sep + 7), |
| proc_id, step, gres_bit_alloc); |
| } |
| } else if (!xstrncasecmp(sep, "closest", 7)) { |
| usable_gres = _get_closest_usable_gres( |
| plugin_id, gres_bit_alloc, |
| step->task[proc_id]->cpu_set); |
| if (!get_devices && gres_use_local_device_index()) |
| bit_consolidate(usable_gres); |
| } else if (!xstrncasecmp(sep, "per_task:", 9)) { |
| if (!get_devices && gres_use_local_device_index()) { |
| usable_gres = bit_alloc( |
| bit_size(gres_bit_alloc)); |
| bit_nset(usable_gres, 0, |
| slurm_atoul(sep + 9) - 1); |
| } else { |
| usable_gres = _get_gres_per_task( |
| gres_bit_alloc, slurm_atoul(sep + 9), |
| step, plugin_id, proc_id); |
| } |
| } else if (!xstrncasecmp(sep, "none", 4)) { |
| usable_gres = bit_copy(gres_bit_alloc); |
| } else |
| return SLURM_ERROR; |
| } else { // Shared gres only support per_task binding for now |
| if (!xstrncasecmp(sep, "per_task:", 9)) { |
| usable_gres = _get_shared_gres_per_task( |
| gres_bit_alloc, gres_per_bit, |
| slurm_atoul(sep + 9), |
| step, gpu_plugin_id, proc_id); |
| if (!get_devices && gres_use_local_device_index()) |
| bit_consolidate(usable_gres); |
| } else if (!xstrncasecmp(sep, "none", 4)) { |
| usable_gres = bit_copy(gres_bit_alloc); |
| } else |
| return SLURM_ERROR; |
| } |
| |
| if (usable_gres && !bit_set_count(usable_gres)) { |
| error("Bind request %s does not specify any devices within the allocation for task %d. Binding to the first device in the allocation instead.", |
| tres_bind_str, proc_id); |
| if (!get_devices && gres_use_local_device_index()) |
| bit_set(usable_gres, 0); |
| else |
| bit_set(usable_gres, bit_ffs(gres_bit_alloc)); |
| } |
| |
| *usable_gres_ptr = usable_gres; |
| |
| return SLURM_SUCCESS; |
| } |
| |
| /* |
| * Set environment as required for all tasks of a job step |
| */ |
| extern void gres_g_step_set_env(stepd_step_rec_t *step) |
| { |
| int i; |
| bitstr_t *gres_bit_alloc = NULL; |
| gres_internal_flags_t flags = GRES_INTERNAL_FLAG_NONE; |
| foreach_gres_accumulate_device_t foreach_gres_accumulate_device = { |
| .gres_bit_alloc = &gres_bit_alloc, |
| .is_job = false, |
| }; |
| |
| xassert(gres_context_cnt >= 0); |
| slurm_mutex_lock(&gres_context_lock); |
| for (i = 0; i < gres_context_cnt; i++) { |
| slurm_gres_context_t *gres_ctx = &gres_context[i]; |
| if (!gres_ctx->ops.step_set_env) |
| continue; /* No plugin to call */ |
| if (!step->step_gres_list) { |
| /* Clear GRES environment variables */ |
| (*(gres_ctx->ops.step_set_env))( |
| &step->env, NULL, 0, GRES_INTERNAL_FLAG_NONE); |
| continue; |
| } |
| foreach_gres_accumulate_device.plugin_id = gres_ctx->plugin_id; |
| (void) list_for_each(step->step_gres_list, |
| _accumulate_gres_device, |
| &foreach_gres_accumulate_device); |
| |
| /* |
| * Do not let MPS or Shard (shared GRES) clear any envs set for |
| * a GPU (sharing GRES) when a GPU is allocated but an |
| * MPS/Shard is not. Sharing GRES plugins always run before |
| * shared GRES, so we don't need to protect MPS/Shard from GPU. |
| */ |
| if (gres_id_shared(gres_ctx->config_flags) && |
| foreach_gres_accumulate_device.sharing_gres_allocated) |
| flags |= GRES_INTERNAL_FLAG_PROTECT_ENV; |
| |
| (*(gres_ctx->ops.step_set_env))( |
| &step->env, |
| gres_bit_alloc, |
| foreach_gres_accumulate_device.gres_cnt, |
| flags); |
| foreach_gres_accumulate_device.gres_cnt = 0; |
| FREE_NULL_BITMAP(gres_bit_alloc); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| /* |
| * Change the task's inherited environment (from the step, and set by |
| * gres_g_step_set_env()). Use this to implement GPU task binding. |
| */ |
| extern void gres_g_task_set_env(stepd_step_rec_t *step, int local_proc_id) |
| { |
| int i; |
| bitstr_t *usable_gres = NULL; |
| bitstr_t *gres_bit_alloc = NULL; |
| uint64_t *gres_per_bit = NULL; |
| foreach_gres_accumulate_device_t foreach_gres_accumulate_device = { |
| .gres_bit_alloc = &gres_bit_alloc, |
| .gres_per_bit = &gres_per_bit, |
| .is_job = false, |
| }; |
| |
| if (step->accel_bind_type) |
| _parse_accel_bind_type(step->accel_bind_type, step->tres_bind); |
| |
| xassert(gres_context_cnt >= 0); |
| slurm_mutex_lock(&gres_context_lock); |
| for (i = 0; i < gres_context_cnt; i++) { |
| gres_internal_flags_t flags = GRES_INTERNAL_FLAG_NONE; |
| slurm_gres_context_t *gres_ctx = &gres_context[i]; |
| if (!gres_ctx->ops.task_set_env) |
| continue; /* No plugin to call */ |
| if (!step->step_gres_list) { |
| /* Clear GRES environment variables */ |
| (*(gres_ctx->ops.task_set_env))( |
| &step->envtp->env, NULL, 0, NULL, |
| GRES_INTERNAL_FLAG_NONE); |
| continue; |
| } |
| foreach_gres_accumulate_device.plugin_id = gres_ctx->plugin_id; |
| (void) list_for_each(step->step_gres_list, |
| _accumulate_gres_device, |
| &foreach_gres_accumulate_device); |
| |
| if (_get_usable_gres(i, local_proc_id, step->tres_bind, |
| &usable_gres, gres_bit_alloc, false, step, |
| gres_per_bit, &flags) == SLURM_ERROR) { |
| goto next; |
| } |
| |
| /* |
| * Do not let MPS or Shard (shared GRES) clear any envs set for |
| * a GPU (sharing GRES) when a GPU is allocated but an |
| * MPS/Shard is not. Sharing GRES plugins always run before |
| * shared GRES, so we don't need to protect MPS/Shard from GPU. |
| */ |
| if (gres_id_shared(gres_ctx->config_flags) && |
| foreach_gres_accumulate_device.sharing_gres_allocated) |
| flags |= GRES_INTERNAL_FLAG_PROTECT_ENV; |
| |
| (*(gres_ctx->ops.task_set_env))( |
| &step->envtp->env, |
| gres_bit_alloc, |
| foreach_gres_accumulate_device.gres_cnt, |
| usable_gres, flags); |
| next: |
| foreach_gres_accumulate_device.gres_cnt = 0; |
| xfree(gres_per_bit); |
| FREE_NULL_BITMAP(gres_bit_alloc); |
| FREE_NULL_BITMAP(usable_gres); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| } |
| |
| static void _step_state_log_node(gres_step_state_t *gres_ss, int i) |
| { |
| char tmp_str[128]; |
| if (gres_ss->gres_bit_alloc[i]) { |
| bit_fmt(tmp_str, sizeof(tmp_str), gres_ss->gres_bit_alloc[i]); |
| info(" gres_bit_alloc[%d]:%s of %d", i, tmp_str, |
| (int)bit_size(gres_ss->gres_bit_alloc[i])); |
| } else |
| info(" gres_bit_alloc[%d]:NULL", i); |
| |
| if (gres_ss->gres_per_bit_alloc && gres_ss->gres_per_bit_alloc[i]) { |
| for (int j = 0; |
| (j = bit_ffs_from_bit(gres_ss->gres_bit_alloc[i], j)) >= 0; |
| j++) { |
| info(" gres_per_bit_alloc[%d][%d]:%" PRIu64, i, j, |
| gres_ss->gres_per_bit_alloc[i][j]); |
| } |
| } |
| } |
| |
| static int _step_state_log(void *x, void *arg) |
| { |
| gres_state_t *gres_state_step = x; |
| gres_step_state_t *gres_ss = gres_state_step->gres_data; |
| char *gres_name = gres_state_step->gres_name; |
| slurm_step_id_t *step_id = arg; |
| int i; |
| |
| xassert(gres_ss); |
| info("gres:%s type:%s(%u) %ps flags:%s state", gres_name, |
| gres_ss->type_name, gres_ss->type_id, step_id, |
| gres_flags2str(gres_ss->flags)); |
| if (gres_ss->cpus_per_gres) |
| info(" cpus_per_gres:%u", gres_ss->cpus_per_gres); |
| if (gres_ss->gres_per_step) |
| info(" gres_per_step:%"PRIu64, gres_ss->gres_per_step); |
| if (gres_ss->gres_per_node) { |
| info(" gres_per_node:%"PRIu64" node_cnt:%u", |
| gres_ss->gres_per_node, gres_ss->node_cnt); |
| } |
| if (gres_ss->gres_per_socket) |
| info(" gres_per_socket:%"PRIu64, gres_ss->gres_per_socket); |
| if (gres_ss->gres_per_task) |
| info(" gres_per_task:%"PRIu64, gres_ss->gres_per_task); |
| if (gres_ss->mem_per_gres) |
| info(" mem_per_gres:%"PRIu64, gres_ss->mem_per_gres); |
| |
| if (gres_ss->node_in_use == NULL) |
| info(" node_in_use:NULL"); |
| else if (gres_ss->gres_bit_alloc == NULL) |
| info(" gres_bit_alloc:NULL"); |
| else { |
| for (i = 0; i < gres_ss->node_cnt; i++) { |
| if (bit_test(gres_ss->node_in_use, i)) |
| _step_state_log_node(gres_ss, i); |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Log a step's current gres state |
| * IN gres_list - generated by gres_stepmgr_step_alloc() |
| * IN job_id - job's ID |
| * IN step_id - step's ID |
| */ |
| extern void gres_step_state_log(list_t *gres_list, uint32_t job_id, |
| uint32_t step_id) |
| { |
| slurm_step_id_t tmp_step_id = { |
| .job_id = job_id, |
| .step_het_comp = NO_VAL, |
| .step_id = step_id, |
| }; |
| |
| if (!(slurm_conf.debug_flags & DEBUG_FLAG_GRES) || !gres_list) |
| return; |
| |
| (void) list_for_each(gres_list, _step_state_log, &tmp_step_id); |
| } |
| |
| /* |
| * Return TRUE if this plugin ID consumes GRES count > 1 for a single device |
| * file (e.g. MPS) |
| */ |
| extern bool gres_id_shared(uint32_t config_flags) |
| { |
| if (config_flags & GRES_CONF_SHARED) |
| return true; |
| return false; |
| } |
| /* |
| * Return TRUE if this plugin ID shares resources with another GRES that |
| * consumes subsets of its resources (e.g. GPU) |
| */ |
| extern bool gres_id_sharing(uint32_t plugin_id) |
| { |
| if (plugin_id == gpu_plugin_id) |
| return true; |
| return false; |
| } |
| |
| static int _foreach_node_count(void *x, void *arg) |
| { |
| gres_state_t *gres_state_node = x; |
| foreach_node_count_t *foreach_node_count = arg; |
| gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| uint64_t val = 0; |
| |
| xassert(gres_ns); |
| |
| switch (foreach_node_count->val_type) { |
| case GRES_VAL_TYPE_FOUND: |
| val = gres_ns->gres_cnt_found; |
| break; |
| case GRES_VAL_TYPE_CONFIG: |
| val = gres_ns->gres_cnt_config; |
| break; |
| case GRES_VAL_TYPE_AVAIL: |
| val = gres_ns->gres_cnt_avail; |
| break; |
| case GRES_VAL_TYPE_ALLOC: |
| val = gres_ns->gres_cnt_alloc; |
| break; |
| } |
| |
| foreach_node_count->gres_count_ids[foreach_node_count->index] = |
| gres_state_node->plugin_id; |
| foreach_node_count->gres_count_vals[foreach_node_count->index] = val; |
| |
| if (++foreach_node_count->index >= foreach_node_count->array_len) |
| return -1; |
| return 0; |
| } |
| |
| /* |
| * Fill in an array of GRES type ids contained within the given node gres_list |
| * and an array of corresponding counts of those GRES types. |
| * IN gres_list - a List of GRES types found on a node. |
| * IN arrlen - Length of the arrays (the number of elements in the gres_list). |
| * IN gres_count_ids, gres_count_vals - the GRES type ID's and values found |
| * in the gres_list. |
| * IN val_type - Type of value desired, see GRES_VAL_TYPE_* |
| * RET SLURM_SUCCESS or error code |
| */ |
| extern int gres_node_count(list_t *gres_list, int arr_len, |
| uint32_t *gres_count_ids, |
| uint64_t *gres_count_vals, |
| int val_type) |
| { |
| foreach_node_count_t foreach_node_count = { |
| .array_len = arr_len, |
| .gres_count_ids = gres_count_ids, |
| .gres_count_vals = gres_count_vals, |
| .val_type = val_type, |
| }; |
| |
| if (arr_len <= 0) |
| return EINVAL; |
| |
| (void) list_for_each(gres_list, _foreach_node_count, |
| &foreach_node_count); |
| |
| return SLURM_SUCCESS; |
| } |
| static void _gres_device_pack( |
| void *in, uint16_t protocol_version, buf_t *buffer) |
| { |
| gres_device_t *gres_device = in; |
| |
| /* DON'T PACK gres_device->alloc */ |
| pack32(gres_device->index, buffer); |
| pack32(gres_device->dev_num, buffer); |
| pack32(gres_device->dev_desc.type, buffer); |
| pack32(gres_device->dev_desc.major, buffer); |
| pack32(gres_device->dev_desc.minor, buffer); |
| packstr(gres_device->path, buffer); |
| packstr(gres_device->unique_id, buffer); |
| } |
| |
| extern void gres_send_stepd(buf_t *buffer, list_t *gres_devices) |
| { |
| slurm_pack_list(gres_devices, _gres_device_pack, buffer, |
| SLURM_PROTOCOL_VERSION); |
| } |
| |
| static int _gres_device_unpack(void **object, uint16_t protocol_version, |
| buf_t *buffer) |
| { |
| uint32_t uint32_tmp = 0; |
| gres_device_t *gres_device = xmalloc(sizeof(gres_device_t)); |
| |
| safe_unpack32(&uint32_tmp, buffer); |
| gres_device->index = uint32_tmp; |
| safe_unpack32(&uint32_tmp, buffer); |
| gres_device->dev_num = uint32_tmp; |
| safe_unpack32(&uint32_tmp, buffer); |
| gres_device->dev_desc.type = uint32_tmp; |
| safe_unpack32(&uint32_tmp, buffer); |
| gres_device->dev_desc.major = uint32_tmp; |
| safe_unpack32(&uint32_tmp, buffer); |
| gres_device->dev_desc.minor = uint32_tmp; |
| safe_unpackstr(&gres_device->path, buffer); |
| safe_unpackstr(&gres_device->unique_id, buffer); |
| /* info("adding %d %s %s", gres_device->dev_num, */ |
| /* gres_device->major, gres_device->path); */ |
| |
| *object = gres_device; |
| |
| return SLURM_SUCCESS; |
| |
| unpack_error: |
| error("%s: failed", __func__); |
| destroy_gres_device(gres_device); |
| return SLURM_ERROR; |
| } |
| |
| extern void gres_recv_stepd(buf_t *buffer, list_t **gres_devices) |
| { |
| (void) slurm_unpack_list(gres_devices, _gres_device_unpack, |
| destroy_gres_device, |
| buffer, SLURM_PROTOCOL_VERSION); |
| } |
| |
| /* Send GRES information to slurmstepd on the specified file descriptor */ |
| extern void gres_g_send_stepd(int fd, slurm_msg_t *msg) |
| { |
| int len; |
| uint32_t step_id; |
| cred_data_enum_t check; |
| slurm_cred_t *cred = NULL; |
| |
| /* Setup the gres_device list and other plugin-specific data */ |
| xassert(gres_context_cnt >= 0); |
| |
| slurm_mutex_lock(&gres_context_lock); |
| xassert(gres_context_buf); |
| |
| len = get_buf_offset(gres_context_buf); |
| safe_write(fd, &len, sizeof(len)); |
| safe_write(fd, get_buf_data(gres_context_buf), len); |
| |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH) { |
| batch_job_launch_msg_t *job = msg->data; |
| step_id = SLURM_BATCH_SCRIPT; |
| cred = job->cred; |
| } else { |
| launch_tasks_request_msg_t *job = msg->data; |
| step_id = job->step_id.step_id; |
| cred = job->cred; |
| } |
| |
| /* If we are a special step we get the JOB_GRES_LIST */ |
| if (step_id >= SLURM_MAX_NORMAL_STEP_ID) |
| check = CRED_DATA_JOB_GRES_LIST; |
| else |
| check = CRED_DATA_STEP_GRES_LIST; |
| /* Send the merged slurm.conf/gres.conf and autodetect data */ |
| if (slurm_cred_get(cred, check)) { |
| len = get_buf_offset(gres_conf_buf); |
| safe_write(fd, &len, sizeof(len)); |
| safe_write(fd, get_buf_data(gres_conf_buf), len); |
| } |
| |
| return; |
| rwfail: |
| error("%s: failed", __func__); |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| return; |
| } |
| |
| /* Receive GRES information from slurmd on the specified file descriptor */ |
| extern int gres_g_recv_stepd(int fd, slurm_msg_t *msg) |
| { |
| int len, rc = SLURM_ERROR; |
| buf_t *buffer = NULL; |
| uint32_t step_id; |
| cred_data_enum_t check; |
| slurm_cred_t *cred = NULL; |
| |
| slurm_mutex_lock(&gres_context_lock); |
| |
| safe_read(fd, &len, sizeof(int)); |
| |
| buffer = init_buf(len); |
| safe_read(fd, buffer->head, len); |
| |
| rc = _unpack_context_buf(buffer); |
| |
| if (rc == SLURM_ERROR) |
| goto rwfail; |
| |
| FREE_NULL_BUFFER(buffer); |
| |
| if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH) { |
| batch_job_launch_msg_t *job = msg->data; |
| step_id = SLURM_BATCH_SCRIPT; |
| cred = job->cred; |
| } else { |
| launch_tasks_request_msg_t *job = msg->data; |
| step_id = job->step_id.step_id; |
| cred = job->cred; |
| } |
| |
| /* If we are a special step we get the JOB_GRES_LIST */ |
| if (step_id >= SLURM_MAX_NORMAL_STEP_ID) |
| check = CRED_DATA_JOB_GRES_LIST; |
| else |
| check = CRED_DATA_STEP_GRES_LIST; |
| /* Recv the merged slurm.conf/gres.conf and autodetect data */ |
| if (slurm_cred_get(cred, check)) { |
| safe_read(fd, &len, sizeof(int)); |
| |
| buffer = init_buf(len); |
| safe_read(fd, buffer->head, len); |
| |
| rc = _unpack_gres_conf(buffer); |
| |
| if (rc == SLURM_ERROR) |
| goto rwfail; |
| |
| FREE_NULL_BUFFER(buffer); |
| } |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| /* Set debug flags only */ |
| (void) gres_init(); |
| |
| rc = _load_specific_gres_plugins(); |
| |
| return rc; |
| rwfail: |
| FREE_NULL_BUFFER(buffer); |
| error("%s: failed", __func__); |
| slurm_mutex_unlock(&gres_context_lock); |
| |
| /* Set debug flags only */ |
| (void) gres_init(); |
| |
| rc = _load_specific_gres_plugins(); |
| |
| return rc; |
| } |
| |
| /* Get generic GRES data types here. Call the plugin for others */ |
| static int _get_step_info(gres_step_state_t *gres_ss, |
| uint32_t node_inx, enum gres_step_data_type data_type, |
| void *data) |
| { |
| uint64_t *u64_data = (uint64_t *) data; |
| bitstr_t **bit_data = (bitstr_t **) data; |
| int rc = SLURM_SUCCESS; |
| |
| if (!gres_ss || !data) |
| return EINVAL; |
| if (node_inx >= gres_ss->node_cnt) |
| return ESLURM_INVALID_NODE_COUNT; |
| |
| switch (data_type) { |
| case GRES_STEP_DATA_COUNT: |
| *u64_data += gres_ss->gres_cnt_node_alloc[node_inx]; |
| break; |
| case GRES_STEP_DATA_BITMAP: |
| if (gres_ss->gres_bit_alloc) { |
| if (!*bit_data) { |
| *bit_data = bit_copy( |
| gres_ss->gres_bit_alloc[node_inx]); |
| } else { |
| xassert(bit_size(*bit_data) == |
| bit_size(gres_ss->gres_bit_alloc[ |
| node_inx])); |
| bit_or(*bit_data, |
| gres_ss->gres_bit_alloc[node_inx]); |
| } |
| } |
| break; |
| default: |
| error("%s: unknown enum given %d", __func__, data_type); |
| rc = EINVAL; |
| break; |
| } |
| |
| return rc; |
| } |
| |
| static int _foreach_get_step_info(void *x, void *arg) |
| { |
| gres_state_t *gres_state_step = x; |
| foreach_step_info_t *foreach_step_info = arg; |
| |
| if (gres_state_step->plugin_id != foreach_step_info->plugin_id) |
| return 0; |
| |
| foreach_step_info->rc = _get_step_info(gres_state_step->gres_data, |
| foreach_step_info->node_inx, |
| foreach_step_info->data_type, |
| foreach_step_info->data); |
| if (foreach_step_info->rc != SLURM_SUCCESS) |
| return -1; |
| return 0; |
| } |
| |
| /* |
| * get data from a step's GRES data structure |
| * IN step_gres_list - step's GRES data structure |
| * IN gres_name - name of a GRES type |
| * IN node_inx - zero-origin index of the node within the job's allocation |
| * for which data is desired. Note this can differ from the step's |
| * node allocation index. |
| * IN data_type - type of data to get from the step's data |
| * OUT data - pointer to the data from step's GRES data structure |
| * DO NOT FREE: This is a pointer into the step's data structure |
| * RET - SLURM_SUCCESS or error code |
| */ |
| extern int gres_get_step_info(list_t *step_gres_list, char *gres_name, |
| uint32_t node_inx, |
| enum gres_step_data_type data_type, void *data) |
| { |
| foreach_step_info_t foreach_step_info = { |
| .data = data, |
| .data_type = data_type, |
| .node_inx = node_inx, |
| .rc = ESLURM_INVALID_GRES, |
| }; |
| if (data == NULL) |
| return EINVAL; |
| if (step_gres_list == NULL) /* No GRES allocated */ |
| return ESLURM_INVALID_GRES; |
| |
| xassert(gres_context_cnt >= 0); |
| foreach_step_info.plugin_id = gres_build_id(gres_name); |
| |
| (void) list_for_each(step_gres_list, _foreach_get_step_info, |
| &foreach_step_info); |
| |
| return foreach_step_info.rc; |
| } |
| |
| extern uint32_t gres_get_autodetect_flags(void) |
| { |
| return autodetect_flags; |
| } |
| |
| extern void gres_clear_tres_cnt(uint64_t *tres_cnt, bool locked) |
| { |
| assoc_mgr_lock_t locks = { .tres = READ_LOCK }; |
| |
| /* |
| * If gres_context_lock is ever locked/unlocked here, it should happen |
| * in between assoc_mgr_lock() and before assoc_mgr_unlock(). |
| */ |
| |
| if (!locked) |
| assoc_mgr_lock(&locks); |
| |
| /* Initialize all GRES counters to zero. Increment them later. */ |
| for (int i = 0; i < g_tres_count; ++i) { |
| /* Skip all non-GRES TRES */ |
| if (xstrcasecmp(assoc_mgr_tres_array[i]->type, "gres")) |
| continue; |
| tres_cnt[i] = 0; |
| } |
| |
| if (!locked) |
| assoc_mgr_unlock(&locks); |
| } |
| |
| extern char *gres_device_id2str(gres_device_id_t *gres_dev) |
| { |
| char *res = NULL; |
| |
| xstrfmtcat(res, "%c %u:%u rwm", |
| gres_dev->type == DEV_TYPE_BLOCK ? 'b' : 'c', |
| gres_dev->major, gres_dev->minor); |
| |
| return res; |
| } |
| |
| |
| /* Free memory for gres_device_t record */ |
| extern void destroy_gres_device(void *gres_device_ptr) |
| { |
| gres_device_t *gres_device = (gres_device_t *) gres_device_ptr; |
| |
| if (!gres_device) |
| return; |
| xfree(gres_device->path); |
| xfree(gres_device->unique_id); |
| xfree(gres_device); |
| } |
| |
| /* Destroy a gres_slurmd_conf_t record, free it's memory */ |
| extern void destroy_gres_slurmd_conf(void *x) |
| { |
| gres_slurmd_conf_t *p = (gres_slurmd_conf_t *) x; |
| |
| xassert(p); |
| xfree(p->cpus); |
| FREE_NULL_BITMAP(p->cpus_bitmap); |
| xfree(p->file); /* Only used by slurmd */ |
| xfree(p->links); |
| xfree(p->name); |
| xfree(p->type_name); |
| xfree(p->unique_id); |
| xfree(p); |
| } |
| |
| /* |
| * Convert GRES config_flags to a string. The pointer returned references local |
| * storage in this function, which is not re-entrant. |
| */ |
| extern char *gres_flags2str(uint32_t config_flags) |
| { |
| static char flag_str[128]; |
| char *sep = ""; |
| |
| flag_str[0] = '\0'; |
| if (config_flags & GRES_CONF_COUNT_ONLY) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "CountOnly"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_EXPLICIT) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "Explicit"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_HAS_FILE) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "HAS_FILE"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_LOADED) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "LOADED"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_HAS_TYPE) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "HAS_TYPE"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_ENV_NVML) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "ENV_NVML"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_ENV_RSMI) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "ENV_RSMI"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_ENV_ONEAPI) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "ENV_ONEAPI"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_ENV_OPENCL) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "ENV_OPENCL"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_ENV_DEF) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "ENV_DEFAULT"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_SHARED) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "SHARED"); |
| sep = ","; |
| } |
| |
| if (config_flags & GRES_CONF_ONE_SHARING) { |
| strcat(flag_str, sep); |
| strcat(flag_str, "ONE_SHARING"); |
| sep = ","; |
| } |
| |
| return flag_str; |
| } |
| |
| /* |
| * Creates a gres_slurmd_conf_t record to add to a list of gres_slurmd_conf_t |
| * records |
| */ |
| extern void add_gres_to_list(list_t *gres_list, |
| gres_slurmd_conf_t *gres_slurmd_conf_in) |
| { |
| gres_slurmd_conf_t *gres_slurmd_conf; |
| bool use_empty_first_record = false; |
| |
| /* |
| * If the first record already exists and has a count of 0 then |
| * overwrite it. |
| * This is a placeholder record created in _merge_config() |
| */ |
| gres_slurmd_conf = list_peek(gres_list); |
| if (gres_slurmd_conf && (gres_slurmd_conf->count == 0)) |
| use_empty_first_record = true; |
| else |
| gres_slurmd_conf = xmalloc(sizeof(gres_slurmd_conf_t)); |
| gres_slurmd_conf->cpu_cnt = gres_slurmd_conf_in->cpu_cnt; |
| if (gres_slurmd_conf_in->cpus_bitmap) { |
| bitstr_t *cpu_aff = bit_copy(gres_slurmd_conf_in->cpus_bitmap); |
| |
| /* |
| * Size down (or possibly up) cpus_bitmap, if necessary, so that |
| * the size of cpus_bitmap for system-detected devices matches |
| * the size of cpus_bitmap for configured devices. |
| */ |
| if (bit_size(cpu_aff) != gres_slurmd_conf_in->cpu_cnt) { |
| /* Calculate minimum size to hold CPU affinity */ |
| int64_t size = bit_fls(cpu_aff) + 1; |
| if (size > gres_slurmd_conf_in->cpu_cnt) { |
| char *cpu_str = bit_fmt_hexmask_trim(cpu_aff); |
| fatal("This CPU affinity bitmask (%s) does not fit within the CPUs configured for this node (%d). Make sure that the node's CPU count is configured correctly.", |
| cpu_str, gres_slurmd_conf_in->cpu_cnt); |
| xfree(cpu_str); |
| } |
| bit_realloc(cpu_aff, gres_slurmd_conf_in->cpu_cnt); |
| } |
| gres_slurmd_conf->cpus_bitmap = cpu_aff; |
| } |
| |
| /* Set default env flags, if necessary */ |
| if ((gres_slurmd_conf_in->config_flags & GRES_CONF_ENV_DEF) && |
| ((gres_slurmd_conf_in->config_flags & GRES_CONF_ENV_SET) != |
| GRES_CONF_ENV_SET)) |
| gres_slurmd_conf_in->config_flags |= GRES_CONF_ENV_SET; |
| |
| gres_slurmd_conf->config_flags = gres_slurmd_conf_in->config_flags; |
| |
| if (gres_slurmd_conf_in->file) { |
| hostlist_t *hl = hostlist_create(gres_slurmd_conf_in->file); |
| gres_slurmd_conf->config_flags |= GRES_CONF_HAS_FILE; |
| if (hostlist_count(hl) > 1) |
| gres_slurmd_conf->config_flags |= GRES_CONF_HAS_MULT; |
| hostlist_destroy(hl); |
| } |
| if (gres_slurmd_conf_in->type_name) |
| gres_slurmd_conf->config_flags |= GRES_CONF_HAS_TYPE; |
| gres_slurmd_conf->cpus = xstrdup(gres_slurmd_conf_in->cpus); |
| gres_slurmd_conf->type_name = xstrdup(gres_slurmd_conf_in->type_name); |
| gres_slurmd_conf->name = xstrdup(gres_slurmd_conf_in->name); |
| gres_slurmd_conf->file = xstrdup(gres_slurmd_conf_in->file); |
| gres_slurmd_conf->links = xstrdup(gres_slurmd_conf_in->links); |
| gres_slurmd_conf->unique_id = xstrdup(gres_slurmd_conf_in->unique_id); |
| gres_slurmd_conf->count = gres_slurmd_conf_in->count; |
| gres_slurmd_conf->plugin_id = gres_build_id(gres_slurmd_conf_in->name); |
| if (!use_empty_first_record) |
| list_append(gres_list, gres_slurmd_conf); |
| } |
| |
| extern char *gres_prepend_tres_type(const char *gres_str) |
| { |
| char *output = NULL; |
| |
| if (gres_str) { |
| output = xstrdup_printf("gres/%s", gres_str); |
| xstrsubstituteall(output, ",", ",gres/"); |
| xstrsubstituteall(output, "gres/gres/", "gres/"); |
| } |
| return output; |
| } |
| |
| extern bool gres_use_busy_dev(gres_state_t *gres_state_node, |
| bool use_total_gres) |
| { |
| gres_node_state_t *gres_ns = gres_state_node->gres_data; |
| |
| if (!use_total_gres && |
| gres_id_shared(gres_state_node->config_flags) && |
| (gres_state_node->config_flags & GRES_CONF_ONE_SHARING) && |
| (gres_ns->gres_cnt_alloc != 0)) { |
| /* We must use the ONE already active GRES of this type */ |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* Return the plugin id made from gres_build_id("gpu") */ |
| extern uint32_t gres_get_gpu_plugin_id(void) |
| { |
| return gpu_plugin_id; |
| } |
| |
| extern bool gres_valid_name(char *name) |
| { |
| if (!name || (name[0] == '\0')) |
| return false; |
| if (gres_get_system_cnt(name, false) != NO_VAL64) |
| return true; |
| |
| return false; |
| } |